{"id":"https://openalex.org/W4395444023","doi":"https://doi.org/10.48550/arxiv.2404.14367","title":"Preference Fine-Tuning of LLMs Should Leverage Suboptimal, On-Policy\n Data","display_name":"Preference Fine-Tuning of LLMs Should Leverage Suboptimal, On-Policy\n Data","publication_year":2024,"publication_date":"2024-04-22","ids":{"openalex":"https://openalex.org/W4395444023","doi":"https://doi.org/10.48550/arxiv.2404.14367"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2404.14367","pdf_url":"https://arxiv.org/pdf/2404.14367","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2404.14367","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5046483574","display_name":"Fahim Tajwar","orcid":"https://orcid.org/0000-0001-9257-6282"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tajwar, Fahim","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088818925","display_name":"Anikait Singh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Singh, Anikait","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033189369","display_name":"Archit Sharma","orcid":"https://orcid.org/0000-0001-9487-1576"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sharma, Archit","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028522433","display_name":"Rafael Rafailov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rafailov, Rafael","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055199976","display_name":"Jeff Schneider","orcid":"https://orcid.org/0000-0002-5080-9073"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schneider, Jeff","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095886714","display_name":"Tengyang Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Tengyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091179481","display_name":"Stefano Ermon","orcid":"https://orcid.org/0000-0003-0039-2887"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ermon, Stefano","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005431772","display_name":"Chelsea Finn","orcid":"https://orcid.org/0000-0001-6298-0874"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Finn, Chelsea","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5102493293","display_name":"Aviral Kumar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kumar, Aviral","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":77},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10357","display_name":"Efficiency Analysis Using DEA","score":0.5179,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10357","display_name":"Efficiency Analysis Using DEA","score":0.5179,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10391","display_name":"Healthcare Policy and Management","score":0.491,"subfield":{"id":"https://openalex.org/subfields/2002","display_name":"Economics and Econometrics"},"field":{"id":"https://openalex.org/fields/20","display_name":"Economics, Econometrics and Finance"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11182","display_name":"Auction Theory and Applications","score":0.4812,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.8012973},{"id":"https://openalex.org/keywords/fine-tuning","display_name":"Fine-tuning","score":0.45515957}],"concepts":[{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.8012973},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.551474},{"id":"https://openalex.org/C157524613","wikidata":"https://www.wikidata.org/wiki/Q2828883","display_name":"Fine-tuning","level":2,"score":0.45515957},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.44566005},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.32061124},{"id":"https://openalex.org/C175444787","wikidata":"https://www.wikidata.org/wiki/Q39072","display_name":"Microeconomics","level":1,"score":0.25364357},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.25092822},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.083443135},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.06647542},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2404.14367","pdf_url":"https://arxiv.org/pdf/2404.14367","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2404.14367","pdf_url":"https://arxiv.org/pdf/2404.14367","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W4390306690","https://openalex.org/W4385572141","https://openalex.org/W4381280689","https://openalex.org/W2847365777","https://openalex.org/W2787993192","https://openalex.org/W2158269427","https://openalex.org/W2090624569","https://openalex.org/W1969209172","https://openalex.org/W1502198272"],"abstract_inverted_index":{"Learning":[0],"from":[1],"preference":[2,19,89,211],"labels":[3],"plays":[4],"a":[5,77,102,106,141,165,185,189],"crucial":[6],"role":[7],"in":[8,122],"fine-tuning":[9,62,87,109,212],"large":[10],"language":[11],"models.":[12],"There":[13],"are":[14,84,175],"several":[15],"distinct":[16],"approaches":[17,83,124],"for":[18,48,86,170,210,222],"fine-tuning,":[20],"including":[21],"supervised":[22,72],"learning,":[23],"on-policy":[24,127,159],"reinforcement":[25],"learning":[26],"(RL),":[27],"and":[28,38,41,91,113,146,154,215],"contrastive":[29,68],"learning.":[30],"Different":[31],"methods":[32,73,156],"come":[33],"with":[34,88],"different":[35,46],"implementation":[36],"tradeoffs":[37],"performance":[39],"differences,":[40],"existing":[42],"empirical":[43],"findings":[44],"present":[45],"conclusions,":[47],"instance,":[49],"some":[50],"results":[51],"show":[52],"that":[53,125,157],"online":[54],"RL":[55],"is":[56,120],"quite":[57],"important":[58,85],"to":[59,131,177,193,198],"attain":[60],"good":[61],"results,":[63],"while":[64],"others":[65],"find":[66],"(offline)":[67],"or":[69,129,161],"even":[70],"purely":[71],"sufficient.":[74],"This":[75],"raises":[76],"natural":[78],"question:":[79],"what":[80],"kind":[81],"of":[82,105,108,167,184,213],"data":[90,218],"why?":[92],"In":[93],"this":[94,98],"paper,":[95],"we":[96],"answer":[97],"question":[99],"by":[100],"performing":[101],"rigorous":[103],"analysis":[104,206],"number":[107],"techniques":[110],"on":[111,136,181],"didactic":[112],"full-scale":[114],"LLM":[115],"problems.":[116],"Our":[117,205],"main":[118],"finding":[119],"that,":[121],"general,":[123],"use":[126,158],"sampling":[128,160],"attempt":[130],"push":[132],"down":[133],"the":[134],"likelihood":[135,148],"certain":[137],"responses":[138],"(i.e.,":[139],"employ":[140],"\"negative":[142],"gradient\")":[143],"outperform":[144],"offline":[145],"maximum":[147,194],"objectives.":[149],"We":[150],"conceptualize":[151],"our":[152],"insights":[153,209],"unify":[155],"negative":[162],"gradient":[163],"under":[164],"notion":[166],"mode-seeking":[168],"objectives":[169,174],"categorical":[171,186],"distributions.":[172],"Mode-seeking":[173],"able":[176],"alter":[178],"probability":[179],"mass":[180],"specific":[182],"bins":[183,202],"distribution":[187],"at":[188],"fast":[190],"rate":[191],"compared":[192],"likelihood,":[195],"allowing":[196],"them":[197],"relocate":[199],"masses":[200],"across":[201],"more":[203],"effectively.":[204],"prescribes":[207],"actionable":[208],"LLMs":[214],"informs":[216],"how":[217],"should":[219],"be":[220],"collected":[221],"maximal":[223],"improvement.":[224]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4395444023","counts_by_year":[],"updated_date":"2025-04-18T17:41:37.427259","created_date":"2024-04-26"}