{"id":"https://openalex.org/W4394906666","doi":"https://doi.org/10.48550/arxiv.2404.08495","title":"Dataset Reset Policy Optimization for RLHF","display_name":"Dataset Reset Policy Optimization for RLHF","publication_year":2024,"publication_date":"2024-04-12","ids":{"openalex":"https://openalex.org/W4394906666","doi":"https://doi.org/10.48550/arxiv.2404.08495"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.08495","pdf_url":"http://arxiv.org/pdf/2404.08495","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2404.08495","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5110249072","display_name":"Jonathan Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chang, Jonathan D.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095767203","display_name":"Wenhao Zhan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhan, Wenhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095045747","display_name":"Owen Oertell","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Oertell, Owen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022968100","display_name":"Kiant\u00e9 Brantley","orcid":"https://orcid.org/0000-0002-8395-594X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brantley, Kiant\u00e9","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088218114","display_name":"Dipendra Misra","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Misra, Dipendra","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059740024","display_name":"Jason D. Lee","orcid":"https://orcid.org/0000-0003-0064-7800"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Jason D.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101338644","display_name":"Wen Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Wen","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":84},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.5376,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.5376,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4835,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reset","display_name":"Reset (finance)","score":0.865498}],"concepts":[{"id":"https://openalex.org/C2779795794","wikidata":"https://www.wikidata.org/wiki/Q7315343","display_name":"Reset (finance)","level":2,"score":0.865498},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4789277},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.2603116},{"id":"https://openalex.org/C10138342","wikidata":"https://www.wikidata.org/wiki/Q43015","display_name":"Finance","level":1,"score":0.076964915}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.08495","pdf_url":"http://arxiv.org/pdf/2404.08495","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.08495","pdf_url":"http://arxiv.org/pdf/2404.08495","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W96259911","https://openalex.org/W4391375266","https://openalex.org/W4385608460","https://openalex.org/W350273603","https://openalex.org/W2748952813","https://openalex.org/W2393495588","https://openalex.org/W2370772865","https://openalex.org/W2168225754","https://openalex.org/W2000034628","https://openalex.org/W1528611913"],"abstract_inverted_index":{"Reinforcement":[0],"Learning":[1],"(RL)":[2],"from":[3,37,129,184,190],"Human":[4],"Preference-based":[5],"feedback":[6],"is":[7,84,151,186],"a":[8,34,63],"popular":[9],"paradigm":[10],"for":[11,207],"fine-tuning":[12],"generative":[13],"models,":[14],"which":[15],"has":[16],"produced":[17],"impressive":[18],"models":[19],"such":[20],"as":[21,145,147],"GPT-4":[22],"and":[23,175,195],"Claude3":[24],"Opus.":[25],"This":[26],"framework":[27],"often":[28],"consists":[29],"of":[30,59,126,203],"two":[31],"steps:":[32],"learning":[33],"reward":[35,51],"model":[36],"an":[38],"offline":[39,75,100,123,155],"preference":[40,76,101],"dataset":[41,77,102,110,156],"followed":[42],"by":[43,71,86,153],"running":[44],"online":[45,105],"RL":[46],"to":[47,118,141],"optimize":[48],"the":[49,57,72,87,98,104,115,119,122,130,154,172,176,182,201],"learned":[50],"model.":[52],"In":[53,134,165],"this":[54,208],"work,":[55],"leveraging":[56],"idea":[58],"reset,":[60],"we":[61,136,167],"propose":[62],"new":[64,90],"RLHF":[65],"algorithm":[66],"with":[67,161],"provable":[68],"guarantees.":[69],"Motivated":[70],"fact":[73],"that":[74,83,138,150,169,189],"provides":[78],"informative":[79],"states":[80,120],"(i.e.,":[81],"data":[82],"preferred":[85],"labelers),":[88],"our":[89],"algorithm,":[91],"Dataset":[92],"Reset":[93],"Policy":[94,192],"Optimization":[95,193,198],"(DR-PO),":[96],"integrates":[97],"existing":[99],"into":[103],"policy":[106,116,149],"training":[107],"procedure":[108],"via":[109],"reset:":[111],"it":[112],"directly":[113],"resets":[114],"optimizer":[117],"in":[121],"dataset,":[124,181],"instead":[125],"always":[127],"starting":[128],"initial":[131],"state":[132],"distribution.":[133],"theory,":[135],"show":[137],"DR-PO":[139,185],"learns":[140],"perform":[142],"at":[143,213],"least":[144],"good":[146],"any":[148],"covered":[152],"under":[157,200],"general":[158],"function":[159],"approximation":[160],"finite":[162],"sample":[163],"complexity.":[164],"experiments,":[166],"demonstrate":[168],"on":[170],"both":[171],"TL;DR":[173],"summarization":[174],"Anthropic":[177],"Helpful":[178],"Harmful":[179],"(HH)":[180],"generation":[183],"better":[187],"than":[188],"Proximal":[191],"(PPO)":[194],"Direction":[196],"Preference":[197],"(DPO),":[199],"metric":[202],"GPT4":[204],"win-rate.":[205],"Code":[206],"work":[209],"can":[210],"be":[211],"found":[212],"https://github.com/Cornell-RL/drpo.":[214]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4394906666","counts_by_year":[],"updated_date":"2024-12-15T12:30:55.486243","created_date":"2024-04-18"}