{"id":"https://openalex.org/W4404649840","doi":"https://doi.org/10.48550/arxiv.2411.13543","title":"BALROG: Benchmarking Agentic LLM and VLM Reasoning On Games","display_name":"BALROG: Benchmarking Agentic LLM and VLM Reasoning On Games","publication_year":2024,"publication_date":"2024-11-20","ids":{"openalex":"https://openalex.org/W4404649840","doi":"https://doi.org/10.48550/arxiv.2411.13543"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.13543","pdf_url":"http://arxiv.org/pdf/2411.13543","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2411.13543","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5093796358","display_name":"Davide Paglieri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Paglieri, Davide","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093227523","display_name":"Bart\u0142omiej Cupia\u0142","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cupia\u0142, Bart\u0142omiej","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018626596","display_name":"Samuel Coward","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Coward, Samuel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003598569","display_name":"Ulyana Piterbarg","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Piterbarg, Ulyana","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025503042","display_name":"Maciej Wo\u0142czyk","orcid":"https://orcid.org/0000-0002-3933-9971"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wolczyk, Maciej","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084386381","display_name":"Akbir Khan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Khan, Akbir","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047676934","display_name":"Eduardo Pignatelli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pignatelli, Eduardo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055195176","display_name":"\u0141ukasz Kuci\u0144ski","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kuci\u0144ski, \u0141ukasz","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103895643","display_name":"Lerrel Pinto","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pinto, Lerrel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089960673","display_name":"Rob Fergus","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fergus, Rob","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059094093","display_name":"Jakob Foerster","orcid":"https://orcid.org/0000-0001-9688-2498"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Foerster, Jakob Nicolaus","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083828420","display_name":"Jack Parker-Holder","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Parker-Holder, Jack","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5079315903","display_name":"Tim Rockt\u00e4schel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rockt\u00e4schel, Tim","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":84},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.9717,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.9717,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.917,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.7658951}],"concepts":[{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.7658951},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.47968268},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.39178768},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.36265594},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.16080117},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.07230142}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.13543","pdf_url":"http://arxiv.org/pdf/2411.13543","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.13543","pdf_url":"http://arxiv.org/pdf/2411.13543","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W435179959","https://openalex.org/W4238897586","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2619091065","https://openalex.org/W2291782699","https://openalex.org/W2284465472","https://openalex.org/W2059640416","https://openalex.org/W1490753184"],"abstract_inverted_index":{"Large":[0],"Language":[1,6],"Models":[2,7],"(LLMs)":[3],"and":[4,12,39,74,130,139,142,191,198],"Vision":[5],"(VLMs)":[8],"possess":[9],"extensive":[10,133],"knowledge":[11],"exhibit":[13],"promising":[14],"reasoning":[15],"abilities;":[16],"however,":[17],"they":[18,158],"still":[19],"struggle":[20,159],"to":[21,67,108,116,127,194],"perform":[22,175],"well":[23],"in":[24,45,106,154,170,200],"complex,":[25],"dynamic":[26],"environments.":[27],"Real-world":[28],"tasks":[29,99],"require":[30],"handling":[31],"intricate":[32],"interactions,":[33],"advanced":[34],"spatial":[35],"reasoning,":[36],"long-term":[37],"planning,":[38],"continuous":[40],"exploration":[41],"of":[42,72,80,88,96,135,180],"new":[43],"strategies-areas":[44],"which":[46],"we":[47,60,166],"lack":[48],"effective":[49],"methodologies":[50],"for":[51],"comprehensively":[52],"evaluating":[53],"these":[54],"capabilities.":[55],"To":[56],"address":[57],"this":[58],"gap,":[59],"introduce":[61],"BALROG,":[62],"a":[63,77,86],"novel":[64],"benchmark":[65,84,193],"designed":[66],"assess":[68],"the":[69,119,155,181,201],"agentic":[70,202],"capabilities":[71],"LLMs":[73,141],"VLMs":[75],"through":[76],"diverse":[78],"set":[79],"challenging":[81,110,163],"games.":[82],"Our":[83,144],"incorporates":[85],"range":[87],"existing":[89],"reinforcement":[90],"learning":[91],"environments":[92,182],"with":[93,161],"varying":[94],"levels":[95],"difficulty,":[97],"including":[98],"that":[100,112,147],"are":[101,183],"solvable":[102],"by":[103],"non-expert":[104],"humans":[105],"seconds":[107],"extremely":[109],"ones":[111],"may":[113],"take":[114],"years":[115],"master":[117],"(e.g.,":[118],"NetHack":[120],"Learning":[121],"Environment).":[122],"We":[123,185],"devise":[124],"fine-grained":[125],"metrics":[126],"measure":[128],"performance":[129],"conduct":[131],"an":[132,189],"evaluation":[134],"several":[136],"popular":[137],"open-source":[138],"closed-source":[140],"VLMs.":[143],"findings":[145],"indicate":[146],"while":[148],"current":[149],"models":[150,174],"achieve":[151],"partial":[152],"success":[153],"easier":[156],"games,":[157],"significantly":[160],"more":[162],"tasks.":[164],"Notably,":[165],"observe":[167],"severe":[168],"deficiencies":[169],"vision-based":[171],"decision-making,":[172],"as":[173,188],"worse":[176],"when":[177],"visual":[178],"representations":[179],"provided.":[184],"release":[186],"BALROG":[187],"open":[190],"user-friendly":[192],"facilitate":[195],"future":[196],"research":[197],"development":[199],"community.":[203]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4404649840","counts_by_year":[],"updated_date":"2024-12-14T15:30:20.912137","created_date":"2024-11-24"}