{"id":"https://openalex.org/W4366850747","doi":"https://doi.org/10.48550/arxiv.2304.10592","title":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models","display_name":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4366850747","doi":"https://doi.org/10.48550/arxiv.2304.10592"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2304.10592","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2304.10592","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5082616743","display_name":"Deyao Zhu","orcid":"https://orcid.org/0000-0001-8014-7309"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Deyao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100450148","display_name":"Jun Chen","orcid":"https://orcid.org/0000-0001-8883-0970"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020511969","display_name":"Xiaoqian Shen","orcid":"https://orcid.org/0000-0001-6284-520X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Xiaoqian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100331094","display_name":"Xiang Li","orcid":"https://orcid.org/0000-0002-9851-6376"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5085089542","display_name":"Mohamed Elhoseiny","orcid":"https://orcid.org/0000-0001-9659-1551"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Elhoseiny, Mohamed","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":332,"citation_normalized_percentile":{"value":0.999887,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9995,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9995,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9902,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9866,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/repetition","display_name":"Repetition (rhetorical device)","score":0.5933074},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.47299036}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.74146855},{"id":"https://openalex.org/C170130773","wikidata":"https://www.wikidata.org/wiki/Q216378","display_name":"Usability","level":2,"score":0.6475895},{"id":"https://openalex.org/C2776141515","wikidata":"https://www.wikidata.org/wiki/Q1274479","display_name":"Repetition (rhetorical device)","level":2,"score":0.5933074},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.577503},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.47299036},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.46114594},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.43347695},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41084814},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.39044023},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.24763545},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.19336194},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.19171253},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2304.10592","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2304.10592","pdf_url":"http://arxiv.org/pdf/2304.10592","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2304.10592","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2304.10592","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"display_name":"Quality education","score":0.62,"id":"https://metadata.un.org/sdg/4"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4389670110","https://openalex.org/W4315621326","https://openalex.org/W2899790217","https://openalex.org/W2611942503","https://openalex.org/W2598865957","https://openalex.org/W2524154428","https://openalex.org/W2429057255","https://openalex.org/W2187546663","https://openalex.org/W1576092969","https://openalex.org/W148745890"],"abstract_inverted_index":{"The":[0],"recent":[1],"GPT-4":[2,36,50],"has":[3],"demonstrated":[4,109],"extraordinary":[5],"multi-modal":[6,46,107],"abilities,":[7],"such":[8,112],"as":[9,113],"directly":[10],"generating":[11],"websites":[12],"from":[13,52,121],"handwritten":[14],"text":[15],"and":[16,118,136,151,175,203,210],"identifying":[17],"humorous":[18],"elements":[19],"within":[20],"images.":[21],"These":[22],"features":[23,96],"are":[24,213],"rarely":[25],"observed":[26],"in":[27,131,188],"previous":[28],"vision-language":[29],"models.":[30],"However,":[31],"the":[32,44,53,87,94,160,189,194,199],"technical":[33],"details":[34],"behind":[35],"continue":[37],"to":[38,145,192],"remain":[39],"undisclosed.":[40],"We":[41],"believe":[42],"that":[43,91,159],"enhanced":[45],"generation":[47,117,201],"capabilities":[48,130],"of":[49,55],"stem":[51],"utilization":[54],"sophisticated":[56],"large":[57,100],"language":[58,101,171],"models":[59],"(LLM).":[60],"To":[61,177],"examine":[62],"this":[63,179],"phenomenon,":[64],"we":[65,125,157,181],"present":[66],"MiniGPT-4,":[67,132],"which":[68,196],"aligns":[69],"a":[70,75,183],"frozen":[71,76],"visual":[72,95],"encoder":[73],"with":[74,97],"advanced":[77,99,106],"LLM,":[78],"Vicuna,":[79],"using":[80],"one":[81],"projection":[82],"layer.":[83],"Our":[84,206],"work,":[85],"for":[86],"first":[88],"time,":[89],"uncovers":[90],"properly":[92],"aligning":[93],"an":[98],"model":[102,161],"can":[103],"possess":[104],"numerous":[105],"abilities":[108],"by":[110,139],"GPT-4,":[111],"detailed":[114,184],"image":[115,165,185],"description":[116,186],"website":[119],"creation":[120],"hand-drawn":[122],"drafts.":[123],"Furthermore,":[124],"also":[126],"observe":[127],"other":[128],"emerging":[129],"including":[133],"writing":[134],"stories":[135],"poems":[137],"inspired":[138],"given":[140],"images,":[141],"teaching":[142],"users":[143],"how":[144],"cook":[146],"based":[147],"on":[148,163],"food":[149],"photos,":[150],"so":[152],"on.":[153],"In":[154],"our":[155],"experiment,":[156],"found":[158],"trained":[162],"short":[164],"caption":[166],"pairs":[167],"could":[168],"produce":[169],"unnatural":[170],"outputs":[172],"(e.g.,":[173],"repetition":[174],"fragmentation).":[176],"address":[178],"problem,":[180],"curate":[182],"dataset":[187,212],"second":[190],"stage":[191],"finetune":[193],"model,":[195,209],"consequently":[197],"improves":[198],"model's":[200],"reliability":[202],"overall":[204],"usability.":[205],"code,":[207],"pre-trained":[208],"collected":[211],"available":[214],"at":[215],"https://minigpt-4.github.io/.":[216]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4366850747","counts_by_year":[{"year":2025,"cited_by_count":24},{"year":2024,"cited_by_count":234},{"year":2023,"cited_by_count":70},{"year":2022,"cited_by_count":1}],"updated_date":"2025-04-16T16:17:08.157412","created_date":"2023-04-25"}