{"id":"https://openalex.org/W4392011670","doi":"https://doi.org/10.48550/arxiv.2402.11411","title":"Aligning Modalities in Vision Large Language Models via Preference\n Fine-tuning","display_name":"Aligning Modalities in Vision Large Language Models via Preference\n Fine-tuning","publication_year":2024,"publication_date":"2024-02-17","ids":{"openalex":"https://openalex.org/W4392011670","doi":"https://doi.org/10.48550/arxiv.2402.11411"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.11411","pdf_url":"http://arxiv.org/pdf/2402.11411","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2402.11411","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001050580","display_name":"Yiyang Zhou","orcid":"https://orcid.org/0000-0002-1534-8005"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Yiyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102563465","display_name":"Chenhang Cui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cui, Chenhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028522433","display_name":"Rafael Rafailov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rafailov, Rafael","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005431772","display_name":"Chelsea Finn","orcid":"https://orcid.org/0000-0001-6298-0874"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Finn, Chelsea","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5051534896","display_name":"Huaxiu Yao","orcid":"https://orcid.org/0000-0002-8691-9629"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Huaxiu","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.915412,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":77,"max":88},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9968,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9968,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9929,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9785,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.7113794}],"concepts":[{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.7113794},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.6506069},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.51464117},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.39953318},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35756993},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.35588858},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.34916162},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.16572979},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.15127388},{"id":"https://openalex.org/C175444787","wikidata":"https://www.wikidata.org/wiki/Q39072","display_name":"Microeconomics","level":1,"score":0.07827607},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.11411","pdf_url":"http://arxiv.org/pdf/2402.11411","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.11411","pdf_url":"http://arxiv.org/pdf/2402.11411","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3116076068","https://openalex.org/W2775347418","https://openalex.org/W2772917594","https://openalex.org/W2755342338","https://openalex.org/W2229312674","https://openalex.org/W2166024367","https://openalex.org/W2079911747","https://openalex.org/W2058170566","https://openalex.org/W2036807459","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Instruction-following":[0],"Vision":[1],"Large":[2],"Language":[3],"Models":[4],"(VLLMs)":[5],"have":[6],"achieved":[7],"significant":[8],"progress":[9],"recently":[10],"on":[11,44,165],"a":[12,123,171],"variety":[13],"of":[14,153,181],"tasks.":[15],"These":[16],"approaches":[17],"merge":[18],"strong":[19],"pre-trained":[20],"vision":[21,80],"models":[22,26],"and":[23,53,78,122,221],"large":[24],"language":[25],"(LLMs).":[27],"Since":[28],"these":[29,182],"components":[30],"are":[31,185,223],"trained":[32],"separately,":[33],"the":[34,56,68,72,79,91,119,139,145,149,154],"learned":[35],"representations":[36],"need":[37],"to":[38,58,107,126,134,147],"be":[39],"aligned":[40],"with":[41,100,111],"joint":[42],"training":[43],"additional":[45],"image-language":[46],"pairs.":[47],"This":[48,156],"procedure":[49],"is":[50,75,157],"not":[51,65,163,205],"perfect":[52,172],"can":[54,204],"cause":[55],"model":[57,211],"hallucinate":[59],"-":[60],"provide":[61],"answers":[62],"that":[63,202],"do":[64],"accurately":[66],"reflect":[67],"image,":[69],"even":[70],"when":[71],"core":[73],"LLM":[74],"highly":[76],"factual":[77],"backbone":[81],"has":[82],"sufficiently":[83],"complete":[84],"representations.":[85],"In":[86,195],"this":[87],"work,":[88],"we":[89,104,131,143,200,203],"frame":[90],"hallucination":[92,151],"problem":[93],"as":[94,118],"an":[95,158,188],"alignment":[96],"issue,":[97],"tackle":[98],"it":[99,176],"preference":[101],"tuning.":[102],"Specifically,":[103],"propose":[105],"POVID":[106],"generate":[108,127],"feedback":[109],"data":[110,167,220],"AI":[112],"models.":[113],"We":[114],"use":[115],"ground-truth":[116],"instructions":[117],"preferred":[120],"response":[121],"two-stage":[124],"approach":[125],"dispreferred":[128],"data.":[129],"First,":[130],"prompt":[132],"GPT-4V":[133],"inject":[135],"plausible":[136],"hallucinations":[137],"into":[138,187],"correct":[140],"answer.":[141],"Second,":[142],"distort":[144],"image":[146],"trigger":[148],"inherent":[150],"behavior":[152],"VLLM.":[155],"automated":[159],"approach,":[160],"which":[161,174],"does":[162],"rely":[164],"human":[166],"generation":[168,183],"or":[169],"require":[170],"expert,":[173],"makes":[175],"easily":[177],"scalable.":[178],"Finally,":[179],"both":[180],"strategies":[184],"integrated":[186],"RLHF":[189],"pipeline":[190],"via":[191],"Direct":[192],"Preference":[193],"Optimization.":[194],"experiments":[196],"across":[197,213],"broad":[198],"benchmarks,":[199,215],"show":[201],"only":[206],"reduce":[207],"hallucinations,":[208],"but":[209],"improve":[210],"performance":[212],"standard":[214],"outperforming":[216],"prior":[217],"approaches.":[218],"Our":[219],"code":[222],"available":[224],"at":[225],"https://github.com/YiyangZhou/POVID.":[226]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4392011670","counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-04-23T00:27:16.383243","created_date":"2024-02-22"}