{"id":"https://openalex.org/W4403853618","doi":"https://doi.org/10.48550/arxiv.2409.12191","title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at\n Any Resolution","display_name":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at\n Any Resolution","publication_year":2024,"publication_date":"2024-09-18","ids":{"openalex":"https://openalex.org/W4403853618","doi":"https://doi.org/10.48550/arxiv.2409.12191"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.12191","pdf_url":"http://arxiv.org/pdf/2409.12191","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2409.12191","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058176560","display_name":"Peng Wang","orcid":"https://orcid.org/0000-0001-8782-857X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101014470","display_name":"Shuai Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Shuai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008661936","display_name":"Sinan Tan","orcid":"https://orcid.org/0000-0003-2035-2479"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Sinan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106406632","display_name":"Shijie Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shijie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100313851","display_name":"Zhihao Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Zhihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063334231","display_name":"Jinze Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Jinze","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030987813","display_name":"Keqin Chen","orcid":"https://orcid.org/0000-0002-9091-8258"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Keqin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101944061","display_name":"Xuejing Liu","orcid":"https://orcid.org/0000-0001-9612-3707"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Xuejing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100430890","display_name":"Jialin Wang","orcid":"https://orcid.org/0000-0001-5985-9061"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jialin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005725445","display_name":"Ge Wen-bin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ge, Wenbin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101520404","display_name":"Yang Fan","orcid":"https://orcid.org/0000-0001-8875-6686"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014400759","display_name":"Kai Dang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dang, Kai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019333214","display_name":"Mengfei Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Mengfei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049239373","display_name":"Xuancheng Ren","orcid":"https://orcid.org/0000-0002-6994-2114"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ren, Xuancheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004626105","display_name":"Rui Men","orcid":"https://orcid.org/0000-0002-4429-3461"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Men, Rui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062188134","display_name":"Dayiheng Liu","orcid":"https://orcid.org/0000-0002-8755-8941"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Dayiheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091103295","display_name":"Chang Zhou","orcid":"https://orcid.org/0000-0002-3744-2940"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Chang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113621558","display_name":"Jingren Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Jingren","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100612233","display_name":"Junyang Lin","orcid":"https://orcid.org/0000-0001-9931-383X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Junyang","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.999366,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T12694","display_name":"Categorization, perception, and language","score":0.4868,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12694","display_name":"Categorization, perception, and language","score":0.4868,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5822055},{"id":"https://openalex.org/C138268822","wikidata":"https://www.wikidata.org/wiki/Q1051925","display_name":"Resolution (logic)","level":2,"score":0.5793058},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4538712},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4362287},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.36828944},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.26947916},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.050893724}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.12191","pdf_url":"http://arxiv.org/pdf/2409.12191","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.12191","pdf_url":"http://arxiv.org/pdf/2409.12191","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3116076068","https://openalex.org/W2951359407","https://openalex.org/W2772917594","https://openalex.org/W2755342338","https://openalex.org/W2229312674","https://openalex.org/W2166024367","https://openalex.org/W2079911747","https://openalex.org/W2058170566","https://openalex.org/W2036807459","https://openalex.org/W1969923398"],"abstract_inverted_index":{"We":[0,86],"present":[1],"the":[2,9,15,24,31,49,75,98,105,113,124,134,139,147],"Qwen2-VL":[3,22,111,140],"Series,":[4],"an":[5],"advanced":[6],"upgrade":[7],"of":[8,37,43,78,107,136],"previous":[10],"Qwen-VL":[11],"models":[12,119,155],"that":[13],"redefines":[14],"conventional":[16],"predetermined-resolution":[17],"approach":[18,47],"in":[19],"visual":[20,44,57,100],"processing.":[21],"introduces":[23],"Naive":[25],"Dynamic":[26],"Resolution":[27],"mechanism,":[28],"which":[29],"enables":[30],"model":[32,50,66,125,149],"to":[33,51,153],"dynamically":[34],"process":[35],"images":[36,94],"varying":[38],"resolutions":[39],"into":[40],"different":[41],"numbers":[42],"tokens.":[45],"This":[46],"allows":[48],"generate":[52],"more":[53],"efficient":[54],"and":[55,84,95,131,159],"accurate":[56],"representations,":[58],"closely":[59],"aligning":[60],"with":[61],"human":[62],"perceptual":[63],"processes.":[64],"The":[65],"also":[67],"integrates":[68],"Multimodal":[69],"Rotary":[70],"Position":[71],"Embedding":[72],"(M-RoPE),":[73],"facilitating":[74],"effective":[76],"fusion":[77],"positional":[79],"information":[80],"across":[81,161],"text,":[82],"images,":[83],"videos.":[85],"employ":[87],"a":[88],"unified":[89],"paradigm":[90],"for":[91,116],"processing":[92],"both":[93,123],"videos,":[96],"enhancing":[97],"model's":[99],"perception":[101],"capabilities.":[102],"To":[103],"explore":[104],"potential":[106],"large":[108,117],"multimodal":[109,163],"models,":[110],"investigates":[112],"scaling":[114,122],"laws":[115],"vision-language":[118],"(LVLMs).":[120],"By":[121],"size-with":[126],"versions":[127],"at":[128,172],"2B,":[129],"8B,":[130],"72B":[132],"parameters-and":[133],"amount":[135],"training":[137],"data,":[138],"Series":[141],"achieves":[142,150],"highly":[143],"competitive":[144],"performance.":[145],"Notably,":[146],"Qwen2-VL-72B":[148],"results":[151],"comparable":[152],"leading":[154],"such":[156],"as":[157],"GPT-4o":[158],"Claude3.5-Sonnet":[160],"various":[162],"benchmarks,":[164],"outperforming":[165],"other":[166],"generalist":[167],"models.":[168],"Code":[169],"is":[170],"available":[171],"https://github.com/QwenLM/Qwen2-VL":[173],".":[174]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4403853618","counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1}],"updated_date":"2025-04-22T19:13:10.137366","created_date":"2024-10-29"}