{"id":"https://openalex.org/W4399400217","doi":"https://doi.org/10.48550/arxiv.2406.01013","title":"Scalable Ensembling For Mitigating Reward Overoptimisation","display_name":"Scalable Ensembling For Mitigating Reward Overoptimisation","publication_year":2024,"publication_date":"2024-06-03","ids":{"openalex":"https://openalex.org/W4399400217","doi":"https://doi.org/10.48550/arxiv.2406.01013"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.01013","pdf_url":"https://arxiv.org/pdf/2406.01013","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2406.01013","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5099038456","display_name":"Ahmed M. Ahmed","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ahmed, Ahmed M.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028522433","display_name":"Rafael Rafailov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rafailov, Rafael","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099038457","display_name":"Stepan Sharkov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sharkov, Stepan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027490170","display_name":"Xuechen Li","orcid":"https://orcid.org/0000-0002-4177-2408"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xuechen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5091266570","display_name":"Sanmi Koyejo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koyejo, Sanmi","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":{"min":0,"max":77},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":null,"topics":[],"keywords":[],"concepts":[{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.59715164},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5429551},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.43520683},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.095250756}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.01013","pdf_url":"https://arxiv.org/pdf/2406.01013","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.01013","pdf_url":"https://arxiv.org/pdf/2406.01013","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4396701345","https://openalex.org/W4396696052","https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2389214306","https://openalex.org/W2382290278","https://openalex.org/W2376932109","https://openalex.org/W2358668433","https://openalex.org/W2001405890"],"abstract_inverted_index":{"Reinforcement":[0,83],"Learning":[1,84],"from":[2],"Human":[3],"Feedback":[4],"(RLHF)":[5],"has":[6,63],"enabled":[7],"significant":[8],"advancements":[9],"within":[10],"language":[11,89],"modeling":[12],"for":[13,88,99,136,138],"powerful,":[14],"instruction-following":[15],"models.":[16,102],"However,":[17],"the":[18,28,33,124],"alignment":[19],"of":[20,42,75,140],"these":[21],"models":[22,90,139],"remains":[23],"a":[24,47,56,69,109],"pressing":[25],"challenge":[26],"as":[27,44,59,123],"policy":[29],"tends":[30],"to":[31,120],"overfit":[32],"learned":[34],"``proxy\"":[35],"reward":[36,49,76],"model":[37,50],"past":[38],"an":[39,73],"inflection":[40],"point":[41],"utility":[43],"measured":[45],"by":[46,67],"``gold\"":[48],"that":[51],"is":[52,79],"more":[53],"performant":[54],"--":[55],"phenomenon":[57],"known":[58],"\\textit{over-optimization}.":[60],"Prior":[61],"work":[62],"mitigated":[64],"this":[65,104,118],"issue":[66],"computing":[68],"pessimistic":[70],"statistic":[71],"over":[72],"ensemble":[74,126],"models,":[77],"which":[78],"common":[80],"in":[81,131],"Offline":[82],"but":[85,112],"incredibly":[86],"costly":[87],"with":[91],"high":[92],"memory":[93,132],"requirements,":[94],"making":[95],"such":[96],"approaches":[97],"infeasible":[98],"sufficiently":[100],"large":[101],"To":[103],"end,":[105],"we":[106],"propose":[107],"using":[108],"shared":[110],"encoder":[111],"separate":[113],"linear":[114],"heads.":[115],"We":[116],"find":[117],"leads":[119],"similar":[121,141],"performance":[122],"full":[125],"while":[127],"allowing":[128],"tremendous":[129],"savings":[130],"and":[133],"time":[134],"required":[135],"training":[137],"size.":[142],"\\end{abstract}":[143]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4399400217","counts_by_year":[],"updated_date":"2025-04-19T04:50:54.227439","created_date":"2024-06-07"}