{"id":"https://openalex.org/W4385681588","doi":"https://doi.org/10.48550/arxiv.2308.03151","title":"Food-500 Cap: A Fine-Grained Food Caption Benchmark for Evaluating Vision-Language Models","display_name":"Food-500 Cap: A Fine-Grained Food Caption Benchmark for Evaluating Vision-Language Models","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4385681588","doi":"https://doi.org/10.48550/arxiv.2308.03151"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2308.03151","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2308.03151","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5064437214","display_name":"Zheng Ma","orcid":"https://orcid.org/0000-0002-0251-1483"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Zheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041468227","display_name":"Mianzhi Pan","orcid":"https://orcid.org/0009-0008-5971-5285"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Mianzhi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100511911","display_name":"Wenhan Wu","orcid":"https://orcid.org/0009-0008-7090-3666"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Wenhan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001863177","display_name":"Kanzhi Cheng","orcid":"https://orcid.org/0009-0004-4532-1446"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Kanzhi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049207163","display_name":"Jianbing Zhang","orcid":"https://orcid.org/0000-0003-0642-3939"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jianbing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102865824","display_name":"Shujian Huang","orcid":"https://orcid.org/0000-0003-4869-0832"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Shujian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100427087","display_name":"Jiajun Chen","orcid":"https://orcid.org/0000-0003-0477-7442"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jiajun","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":65},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9922,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9922,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9409,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8386643},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6103031}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8386643},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.75563806},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6103031},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.5003927},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.47392187},{"id":"https://openalex.org/C549605437","wikidata":"https://www.wikidata.org/wiki/Q1229911","display_name":"Food security","level":3,"score":0.4723091},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.46535784},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.43230212},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.41027787},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.37091988},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3249325},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.12035537},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.090197146},{"id":"https://openalex.org/C118518473","wikidata":"https://www.wikidata.org/wiki/Q11451","display_name":"Agriculture","level":2,"score":0.08351767},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.08065131},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2308.03151","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2308.03151","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2308.03151","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"display_name":"Zero hunger","id":"https://metadata.un.org/sdg/2","score":0.54}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4390569940","https://openalex.org/W4388422664","https://openalex.org/W4361193272","https://openalex.org/W4312407344","https://openalex.org/W4310278675","https://openalex.org/W2963326959","https://openalex.org/W2905433371","https://openalex.org/W2894289927","https://openalex.org/W2888392564","https://openalex.org/W2806259446"],"abstract_inverted_index":{"Vision-language":[0],"models":[1],"(VLMs)":[2],"have":[3,38],"shown":[4],"impressive":[5],"performance":[6,17,176,200],"in":[7,88,107,170,180,193,201,211],"substantial":[8],"downstream":[9,19],"multi-modal":[10],"tasks.":[11],"However,":[12],"only":[13],"comparing":[14],"the":[15,23,84,102,111,148,175,194,202,237,256],"fine-tuned":[16],"on":[18,166,184],"tasks":[20],"leads":[21],"to":[22,31,51,74,172,214,232,235,249,255,267],"poor":[24],"interpretability":[25],"of":[26,64,86,104,144,178,258],"VLMs,":[27],"which":[28,125],"is":[29,135],"adverse":[30],"their":[32,199],"future":[33],"improvement.":[34],"Several":[35],"prior":[36],"works":[37],"identified":[39],"this":[40,97,115,269],"issue":[41],"and":[42,80,151,227,263],"used":[43],"various":[44],"probing":[45,225],"methods":[46,226],"under":[47],"a":[48,108,119,138,156],"zero-shot":[49],"setting":[50],"detect":[52],"VLMs'":[53,212],"limitations,":[54],"but":[55],"they":[56],"all":[57],"examine":[58],"VLMs":[59,70,87,106,191,230],"using":[60],"general":[61,203],"datasets":[62,187],"instead":[63],"specialized":[65],"ones.":[66],"In":[67,96],"practical":[68],"applications,":[69],"are":[71],"usually":[72],"applied":[73],"specific":[75,89,109],"scenarios,":[76],"such":[77,146],"as":[78,147],"e-commerce":[79],"news":[81],"fields,":[82],"so":[83],"generalization":[85],"domains":[90],"should":[91],"be":[92],"given":[93],"more":[94],"attention.":[95],"paper,":[98],"we":[99,117],"comprehensively":[100],"investigate":[101],"capabilities":[103],"popular":[105,190],"field,":[110],"food":[112,120,128,163,195,216,259],"domain.":[113,204],"To":[114],"end,":[116],"build":[118],"caption":[121],"dataset,":[122],"Food-500":[123],"Cap,":[124],"contains":[126],"24,700":[127],"images":[129],"with":[130,198],"494":[131],"categories.":[132],"Each":[133],"image":[134],"accompanied":[136],"by":[137],"detailed":[139],"caption,":[140],"including":[141],"fine-grained":[142],"attributes":[143],"food,":[145],"ingredient,":[149],"shape,":[150],"color.":[152],"We":[153,222,240],"also":[154],"provide":[155],"culinary":[157,261],"culture":[158],"taxonomy":[159],"that":[160,189,242],"classifies":[161],"each":[162],"category":[164],"based":[165],"its":[167],"geographic":[168,220],"origin":[169],"order":[171],"better":[173],"analyze":[174],"differences":[177],"VLM":[179],"different":[181,219,233],"regions.":[182,221],"Experiments":[183],"our":[185,206,243],"proposed":[186],"demonstrate":[188],"underperform":[192],"domain":[196,257],"compared":[197],"Furthermore,":[205],"research":[207],"reveals":[208],"severe":[209],"bias":[210],"ability":[213],"handle":[215],"items":[217],"from":[218],"adopt":[223],"diverse":[224],"evaluate":[228],"nine":[229],"belonging":[231],"architectures":[234],"verify":[236],"aforementioned":[238],"observations.":[239],"hope":[241],"study":[244],"will":[245],"bring":[246],"researchers'":[247],"attention":[248],"VLM's":[250],"limitations":[251],"when":[252],"applying":[253],"them":[254],"or":[260],"cultures,":[262],"spur":[264],"further":[265],"investigations":[266],"address":[268],"issue.":[270]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4385681588","counts_by_year":[],"updated_date":"2025-04-23T07:48:40.007839","created_date":"2023-08-09"}