{"id":"https://openalex.org/W4379474733","doi":"https://doi.org/10.48550/arxiv.2306.01694","title":"Evaluating Language Models for Mathematics through Interactions","display_name":"Evaluating Language Models for Mathematics through Interactions","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4379474733","doi":"https://doi.org/10.48550/arxiv.2306.01694"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2306.01694","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2306.01694","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024828997","display_name":"Katherine M. Collins","orcid":"https://orcid.org/0000-0002-7032-716X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Collins, Katherine M.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083358660","display_name":"Albert Q. Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Albert Q.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001165299","display_name":"Simon Frieder","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Frieder, Simon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111624973","display_name":"Lionel Wong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wong, Lionel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080823518","display_name":"Miri Zilka","orcid":"https://orcid.org/0000-0001-9640-8139"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zilka, Miri","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016469734","display_name":"Umang Bhatt","orcid":"https://orcid.org/0000-0002-4611-1668"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bhatt, Umang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091549352","display_name":"Thomas Lukasiewicz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lukasiewicz, Thomas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024901763","display_name":"Yuhuai Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yuhuai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071093940","display_name":"Joshua B. Tenenbaum","orcid":"https://orcid.org/0000-0002-1925-2035"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tenenbaum, Joshua B.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102842741","display_name":"William E. Hart","orcid":"https://orcid.org/0000-0002-6849-2780"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hart, William","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046740352","display_name":"W. T. Gowers","orcid":"https://orcid.org/0000-0002-5168-0785"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gowers, Timothy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100761698","display_name":"Wenda Li","orcid":"https://orcid.org/0000-0001-6617-9136"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Wenda","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042278493","display_name":"Adrian Weller","orcid":"https://orcid.org/0000-0003-1915-7158"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Weller, Adrian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5036018012","display_name":"Mateja Jamnik","orcid":"https://orcid.org/0000-0003-2772-2532"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jamnik, Mateja","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.710701,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":65,"max":76},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9545,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9545,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.9338,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/helpfulness","display_name":"Helpfulness","score":0.90408915},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.44618458}],"concepts":[{"id":"https://openalex.org/C2781265381","wikidata":"https://www.wikidata.org/wiki/Q5710255","display_name":"Helpfulness","level":2,"score":0.90408915},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.6992676},{"id":"https://openalex.org/C2778143727","wikidata":"https://www.wikidata.org/wiki/Q1820650","display_name":"Readability","level":2,"score":0.54882085},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5299177},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.44618458},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.36603963},{"id":"https://openalex.org/C145420912","wikidata":"https://www.wikidata.org/wiki/Q853077","display_name":"Mathematics education","level":1,"score":0.35009217},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.31379834},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.20145324},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.15482184},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.15083703},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.14277327},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2306.01694","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2306.01694","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2306.01694","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality education","score":0.61}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4285360723","https://openalex.org/W4281847990","https://openalex.org/W3037056935","https://openalex.org/W2934621214","https://openalex.org/W2613921548","https://openalex.org/W2611407113","https://openalex.org/W2488228222","https://openalex.org/W2092282862","https://openalex.org/W2002563848","https://openalex.org/W1498449133"],"abstract_inverted_index":{"There":[0],"is":[1,36,223],"much":[2],"excitement":[3],"about":[4,43],"the":[5,9,21,62,129,229],"opportunity":[6],"to":[7,59,87,99,123,208,227,249],"harness":[8],"power":[10],"of":[11,24,31,118,125,143,158,178,185,231,238],"large":[12],"language":[13,75,102,239],"models":[14,103,202],"(LLMs)":[15],"when":[16],"building":[17],"problem-solving":[18,181],"assistants.":[19,220],"However,":[20],"standard":[22],"methodology":[23],"evaluating":[25],"LLMs":[26,45],"relies":[27],"on":[28],"static":[29],"pairs":[30],"inputs":[32],"and":[33,35,46,69,90,106,132,146,162,200,211,215,243],"outputs,":[34],"insufficient":[37],"for":[38,61,85,197],"making":[39],"an":[40,81],"informed":[41],"decision":[42],"which":[44,48],"under":[47],"assistive":[49],"settings":[50],"can":[51],"they":[52,246],"be":[53,236],"sensibly":[54],"used.":[55],"Static":[56],"assessment":[57],"fails":[58],"account":[60],"essential":[63],"interactive":[64],"element":[65],"in":[66,110,165],"LLM":[67,166],"deployment,":[68],"therefore":[70],"limits":[71],"how":[72],"we":[73,139,172],"understand":[74],"model":[76],"capabilities.":[77],"We":[78,93,127,192],"introduce":[79],"CheckMate,":[80],"adaptable":[82],"prototype":[83],"platform":[84],"humans":[86,234],"interact":[88],"with":[89,97,114,194],"evaluate":[91,100],"LLMs.":[92],"conduct":[94],"a":[95,115,141,150,174,183,224],"study":[96],"CheckMate":[98],"three":[101],"(InstructGPT,":[104],"ChatGPT,":[105],"GPT-4)":[107],"as":[108],"assistants":[109],"proving":[111],"undergraduate-level":[112],"mathematics,":[113],"mixed":[116],"cohort":[117],"participants":[119],"from":[120],"undergraduate":[121],"students":[122],"professors":[124],"mathematics.":[126],"release":[128],"resulting":[130],"interaction":[131],"rating":[133],"dataset,":[134],"MathConverse.":[135],"By":[136],"analysing":[137],"MathConverse,":[138],"derive":[140],"taxonomy":[142],"human":[144],"behaviours":[145],"uncover":[147],"that":[148,203],"despite":[149],"generally":[151],"positive":[152],"correlation,":[153],"there":[154],"are":[155,212,247],"notable":[156],"instances":[157],"divergence":[159],"between":[160],"correctness":[161],"perceived":[163],"helpfulness":[164],"generations,":[167],"amongst":[168],"other":[169],"findings.":[170],"Further,":[171],"garner":[173],"more":[175,213],"granular":[176],"understanding":[177],"GPT-4":[179],"mathematical":[180],"through":[182],"series":[184],"case":[186],"studies,":[187],"contributed":[188],"by":[189],"expert":[190],"mathematicians.":[191],"conclude":[193],"actionable":[195],"takeaways":[196],"ML":[198],"practitioners":[199],"mathematicians:":[201],"communicate":[204],"uncertainty":[205],"respond":[206],"well":[207],"user":[209],"corrections,":[210],"interpretable":[214],"concise":[216],"may":[217],"constitute":[218],"better":[219],"Interactive":[221],"evaluation":[222],"promising":[225],"way":[226],"navigate":[228],"capability":[230],"these":[232],"models;":[233],"should":[235],"aware":[237],"models'":[240],"algebraic":[241],"fallibility":[242],"discern":[244],"where":[245],"appropriate":[248],"use.":[250]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4379474733","counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-04-22T21:36:15.291731","created_date":"2023-06-07"}