{"id":"https://openalex.org/W4392538992","doi":"https://doi.org/10.48550/arxiv.2403.01777","title":"NPHardEval4V: A Dynamic Reasoning Benchmark of Multimodal Large Language\n Models","display_name":"NPHardEval4V: A Dynamic Reasoning Benchmark of Multimodal Large Language\n Models","publication_year":2024,"publication_date":"2024-03-04","ids":{"openalex":"https://openalex.org/W4392538992","doi":"https://doi.org/10.48550/arxiv.2403.01777"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2403.01777","pdf_url":"http://arxiv.org/pdf/2403.01777","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2403.01777","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033265331","display_name":"Lizhou Fan","orcid":"https://orcid.org/0000-0002-9398-4875"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Lizhou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042013742","display_name":"Wenyue Hua","orcid":"https://orcid.org/0009-0008-2043-2704"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hua, Wenyue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100394772","display_name":"Xiang Li","orcid":"https://orcid.org/0000-0002-4962-002X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103056110","display_name":"Kaijie Zhu","orcid":"https://orcid.org/0009-0002-6220-1476"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Kaijie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075202019","display_name":"Mingyu Jin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Mingyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031522503","display_name":"Lingyao Li","orcid":"https://orcid.org/0000-0001-5888-8311"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Lingyao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101313941","display_name":"Haoyang Ling","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ling, Haoyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071907952","display_name":"Jinkui Chi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chi, Jinkui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100700956","display_name":"Jindong Wang","orcid":"https://orcid.org/0000-0002-4833-0880"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jindong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070296642","display_name":"Xin Ma","orcid":"https://orcid.org/0000-0001-7847-911X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100329828","display_name":"Yongfeng Zhang","orcid":"https://orcid.org/0000-0003-2633-8555"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yongfeng","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":83},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9976,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9976,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9924,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9633,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7918247}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7918247},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.62195516},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.419249},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.40574372},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.09791112},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.07142049}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2403.01777","pdf_url":"http://arxiv.org/pdf/2403.01777","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2403.01777","pdf_url":"http://arxiv.org/pdf/2403.01777","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W972276598","https://openalex.org/W4321353415","https://openalex.org/W3204019825","https://openalex.org/W2748952813","https://openalex.org/W2745001401","https://openalex.org/W2378211422","https://openalex.org/W2130974462","https://openalex.org/W2087343574","https://openalex.org/W2086519370","https://openalex.org/W2028665553"],"abstract_inverted_index":{"Understanding":[0],"the":[1,28,33,48,61,65,104,120,136,142,178,191],"reasoning":[2,35,75,97,137,195],"capabilities":[3],"of":[4,14,37,50,64,84,108,115,122,139,145,177,194],"Multimodal":[5],"Large":[6],"Language":[7],"Models":[8],"(MLLMs)":[9],"is":[10,78],"an":[11],"important":[12],"area":[13],"research.":[15],"In":[16],"this":[17,183],"study,":[18],"we":[19],"introduce":[20],"a":[21,44,171],"dynamic":[22],"benchmark,":[23],"NPHardEval4V,":[24],"aimed":[25],"at":[26,206],"addressing":[27],"existing":[29],"gaps":[30],"in":[31,96,113,148,187,197],"evaluating":[32,73],"pure":[34],"abilities":[36,98,138,196],"MLLMs.":[38,198],"Our":[39,91],"benchmark":[40,161,184,200],"aims":[41],"to":[42,46,69,88,111,166],"provide":[43],"venue":[45],"disentangle":[47],"effect":[49],"various":[51],"factors":[52],"such":[53],"as":[54],"image":[55,89],"recognition":[56],"and":[57,102,129,132,169,174,189,202],"instruction":[58],"following,":[59],"from":[60,86],"overall":[62],"performance":[63,107],"models,":[66],"allowing":[67],"us":[68],"focus":[70,155],"solely":[71],"on":[72,135,157],"their":[74],"abilities.":[76],"It":[77],"built":[79],"by":[80],"converting":[81],"textual":[82],"description":[83],"questions":[85],"NPHardEval":[87],"representations.":[90],"findings":[92],"reveal":[93],"significant":[94],"discrepancies":[95],"across":[99],"different":[100,123,143],"models":[101],"highlight":[103],"relatively":[105],"weak":[106],"MLLMs":[109],"compared":[110],"LLMs":[112],"terms":[114],"reasoning.":[116],"We":[117,180],"also":[118],"investigate":[119],"impact":[121],"prompting":[124],"styles,":[125],"including":[126],"visual,":[127],"text,":[128],"combined":[130],"visual":[131],"text":[133],"prompts,":[134],"MLLMs,":[140],"demonstrating":[141],"impacts":[144],"multimodal":[146],"inputs":[147],"model":[149],"performance.":[150],"Unlike":[151],"traditional":[152],"benchmarks,":[153],"which":[154],"primarily":[156],"static":[158],"evaluations,":[159],"our":[160],"will":[162],"be":[163],"updated":[164],"monthly":[165],"prevent":[167],"overfitting":[168],"ensure":[170],"more":[172],"authentic":[173],"fine-grained":[175],"evaluation":[176],"models.":[179],"believe":[181],"that":[182],"can":[185],"aid":[186],"understanding":[188],"guide":[190],"further":[192],"development":[193],"The":[199],"dataset":[201],"code":[203],"are":[204],"available":[205],"https://github.com/lizhouf/NPHardEval4V":[207]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4392538992","counts_by_year":[],"updated_date":"2025-01-13T05:41:02.562809","created_date":"2024-03-07"}