{"id":"https://openalex.org/W4394839102","doi":"https://doi.org/10.48550/arxiv.2404.08555","title":"RLHF Deciphered: A Critical Analysis of Reinforcement Learning from\n Human Feedback for LLMs","display_name":"RLHF Deciphered: A Critical Analysis of Reinforcement Learning from\n Human Feedback for LLMs","publication_year":2024,"publication_date":"2024-04-12","ids":{"openalex":"https://openalex.org/W4394839102","doi":"https://doi.org/10.48550/arxiv.2404.08555"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2404.08555","pdf_url":"https://arxiv.org/pdf/2404.08555","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2404.08555","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5091446143","display_name":"Shreyas Chaudhari","orcid":"https://orcid.org/0000-0002-8826-2253"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chaudhari, Shreyas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045442167","display_name":"Pranjal Aggarwal","orcid":"https://orcid.org/0000-0002-2962-1535"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aggarwal, Pranjal","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049841785","display_name":"Vishvak Murahari","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Murahari, Vishvak","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019768140","display_name":"Tanmay Rajpurohit","orcid":"https://orcid.org/0000-0001-9302-4244"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rajpurohit, Tanmay","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114053294","display_name":"Ashwin Kalyan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kalyan, Ashwin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025205227","display_name":"Karthik Narasimhan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Narasimhan, Karthik","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041777363","display_name":"Ameet Deshpande","orcid":"https://orcid.org/0000-0001-9885-0385"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deshpande, Ameet","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5034701757","display_name":"Bruno Castro da Silva","orcid":"https://orcid.org/0000-0002-3708-5728"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"da Silva, Bruno Castro","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.999954,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9794,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9794,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/categorical-variable","display_name":"Categorical variable","score":0.5100346}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7736532},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.6129241},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.59617877},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.58167815},{"id":"https://openalex.org/C5274069","wikidata":"https://www.wikidata.org/wiki/Q2285707","display_name":"Categorical variable","level":2,"score":0.5100346},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.38013518},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36431187},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.35628086},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.3517428},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.28581074},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26541036},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.1243549},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.08778483},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C78458016","wikidata":"https://www.wikidata.org/wiki/Q840400","display_name":"Evolutionary biology","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2404.08555","pdf_url":"https://arxiv.org/pdf/2404.08555","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2404.08555","pdf_url":"https://arxiv.org/pdf/2404.08555","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W65104662","https://openalex.org/W4386799044","https://openalex.org/W4297454206","https://openalex.org/W2773208253","https://openalex.org/W2560646951","https://openalex.org/W2362286668","https://openalex.org/W2153339597","https://openalex.org/W2133382151","https://openalex.org/W1871748041","https://openalex.org/W1528412344"],"abstract_inverted_index":{"State-of-the-art":[0],"large":[1],"language":[2,192],"models":[3,156],"(LLMs)":[4],"have":[5],"become":[6],"indispensable":[7],"tools":[8],"for":[9,20,60,159,211],"various":[10],"tasks.":[11],"However,":[12],"training":[13,134],"LLMs":[14,61],"to":[15,39,100,110,215],"serve":[16],"as":[17,208],"effective":[18],"assistants":[19],"humans":[21],"requires":[22],"careful":[23],"consideration.":[24],"A":[25],"promising":[26],"approach":[27],"is":[28,62],"reinforcement":[29,97],"learning":[30,98],"from":[31],"human":[32,37,46],"feedback":[33,38],"(RLHF),":[34],"which":[35],"leverages":[36],"update":[40],"the":[41,71,85,94,111,117,137,142,149,152,166,179,188,217],"model":[42,176],"in":[43],"accordance":[44],"with":[45,65,184],"preferences":[47],"and":[48,53,73,129,157,178,196,213,221],"mitigate":[49],"issues":[50],"like":[51],"toxicity":[52],"hallucinations.":[54],"Yet,":[55],"an":[56,102],"understanding":[57,103,150],"of":[58,96,104,114,126,144,151,154,165,181,190,204,219],"RLHF":[59,92,115,133,220],"largely":[63],"entangled":[64],"initial":[66],"design":[67],"choices":[68,80],"that":[69],"popularized":[70],"method":[72],"current":[74,167,205],"research":[75],"focuses":[76],"on":[77,132,187],"augmenting":[78],"those":[79],"rather":[81],"than":[82],"fundamentally":[83],"improving":[84],"framework.":[86],"In":[87],"this":[88],"paper,":[89],"we":[90],"analyze":[91],"through":[93],"lens":[95],"principles":[99],"develop":[101],"its":[105],"fundamentals,":[106],"dedicating":[107],"substantial":[108],"focus":[109],"core":[112],"component":[113],"--":[116],"reward":[118,155],"model.":[119,193],"Our":[120,146],"study":[121],"investigates":[122],"modeling":[123],"choices,":[124],"caveats":[125],"function":[127],"approximation,":[128],"their":[130,160,185],"implications":[131],"algorithms,":[135],"highlighting":[136],"underlying":[138],"assumptions":[139],"made":[140],"about":[141],"expressivity":[143],"reward.":[145],"analysis":[147,197],"improves":[148],"role":[153],"methods":[158],"training,":[161],"concurrently":[162],"revealing":[163],"limitations":[164],"methodology.":[168],"We":[169],"characterize":[170],"these":[171],"limitations,":[172],"including":[173],"incorrect":[174],"generalization,":[175],"misspecification,":[177],"sparsity":[180],"feedback,":[182],"along":[183],"impact":[186],"performance":[189],"a":[191,201,209],"The":[194],"discussion":[195],"are":[198],"substantiated":[199],"by":[200],"categorical":[202],"review":[203],"literature,":[206],"serving":[207],"reference":[210],"researchers":[212],"practitioners":[214],"understand":[216],"challenges":[218],"build":[222],"upon":[223],"existing":[224],"efforts.":[225]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4394839102","counts_by_year":[{"year":2024,"cited_by_count":4}],"updated_date":"2025-01-04T11:44:00.417678","created_date":"2024-04-16"}