{"id":"https://openalex.org/W4388747663","doi":"https://doi.org/10.48550/arxiv.2311.07878","title":"Evaluating LLMs on Document-Based QA: Exact Answer Selection and Numerical Extraction using Cogtale dataset","display_name":"Evaluating LLMs on Document-Based QA: Exact Answer Selection and Numerical Extraction using Cogtale dataset","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4388747663","doi":"https://doi.org/10.48550/arxiv.2311.07878"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2311.07878","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2311.07878","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5088904375","display_name":"Zafaryab Rasool","orcid":"https://orcid.org/0000-0002-3603-3125"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rasool, Zafaryab","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012088137","display_name":"Scott Barnett","orcid":"https://orcid.org/0000-0002-3187-4937"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Barnett, Scott","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080975739","display_name":"Stefanus Kurniawan","orcid":"https://orcid.org/0009-0001-4469-1056"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kurniawan, Stefanus","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093278278","display_name":"Sherwin Balugo","orcid":"https://orcid.org/0009-0001-5619-011X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Balugo, Sherwin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030486012","display_name":"Rajesh Vasa","orcid":"https://orcid.org/0000-0003-4805-1467"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vasa, Rajesh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093278279","display_name":"Courtney Chesser","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chesser, Courtney","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5053166971","display_name":"Alex Bahar\u2010Fuchs","orcid":"https://orcid.org/0000-0002-9248-6057"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bahar-Fuchs, Alex","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.778623,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":67,"max":78},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9947,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9284,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7610605}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7610605},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.74354863},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.7093849},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6190805},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5984937},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.59013903},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.48175526},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.4757038},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.4691963},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.41321236},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.23957148},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2311.07878","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2311.07878","pdf_url":"http://arxiv.org/pdf/2311.07878","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2311.07878","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2311.07878","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"display_name":"Quality education","score":0.58,"id":"https://metadata.un.org/sdg/4"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W972276598","https://openalex.org/W4321353415","https://openalex.org/W2745001401","https://openalex.org/W2387743295","https://openalex.org/W2384605597","https://openalex.org/W2378211422","https://openalex.org/W2130974462","https://openalex.org/W2086519370","https://openalex.org/W2028665553","https://openalex.org/W1984061923"],"abstract_inverted_index":{"Document-based":[0],"Question-Answering":[1],"(QA)":[2],"tasks":[3],"are":[4],"crucial":[5],"for":[6,92,103,164,198,204,212,220],"precise":[7,176],"information":[8,129,177,221],"retrieval.":[9],"While":[10],"some":[11],"existing":[12],"work":[13,208],"focus":[14,57],"on":[15,21,32,58,71,151,173,188],"evaluating":[16],"large":[17],"language":[18],"models":[19,157],"performance":[20,31,134,147],"retrieving":[22],"and":[23,43,62,69,78,105,119,140,223],"answering":[24],"questions":[25,81,121],"from":[26,40,82,179],"documents,":[27,180],"assessing":[28],"the":[29,89,145,149,165,169,189,192,202],"LLMs":[30,67,172],"QA":[33],"types":[34],"that":[35,110,155,191,217],"require":[36],"exact":[37],"answer":[38,116],"selection":[39],"predefined":[41],"options":[42],"numerical":[44],"extraction":[45,80,142,178],"is":[46],"yet":[47,160],"to":[48,227],"be":[49,161],"fully":[50],"assessed.":[51],"In":[52],"this":[53,59,152],"paper,":[54],"we":[55],"specifically":[56],"underexplored":[60],"context":[61,196],"conduct":[63],"empirical":[64],"analysis":[65,225],"of":[66,148,171],"(GPT-4":[68],"GPT-3.5)":[70],"question":[72],"types,":[73],"including":[74],"single-choice,":[75],"yes-no,":[76],"multiple-choice,":[77],"number":[79,141],"documents":[83],"in":[84,128],"zero-shot":[85],"setting.":[86],"We":[87,108],"use":[88],"CogTale":[90],"dataset":[91,214],"evaluation,":[93,215],"which":[94],"provide":[95],"human":[96],"expert-tagged":[97],"responses,":[98,200],"offering":[99],"a":[100,210],"robust":[101],"benchmark":[102],"precision":[104],"factual":[106],"grounding.":[107],"found":[109],"LLMs,":[111],"particularly":[112],"GPT-4,":[113],"can":[114],"precisely":[115],"many":[117],"single-choice":[118],"yes-no":[120],"given":[122],"relevant":[123],"context,":[124],"demonstrating":[125],"their":[126,133],"efficacy":[127],"retrieval":[130,222],"tasks.":[131,184],"However,":[132],"diminishes":[135],"when":[136],"confronted":[137],"with":[138],"multiple-choice":[139],"formats,":[143],"lowering":[144],"overall":[146],"model":[150],"task,":[153],"indicating":[154],"these":[156],"may":[158],"not":[159],"sufficiently":[162],"reliable":[163],"task.":[166],"This":[167],"limits":[168],"applications":[170,174,219],"demanding":[175],"such":[181],"as":[182],"meta-analysis":[183],"These":[185],"findings":[186],"hinge":[187],"assumption":[190],"retrievers":[193],"furnish":[194],"pertinent":[195],"necessary":[197],"accurate":[199],"emphasizing":[201],"need":[203],"further":[205],"research.":[206],"Our":[207],"offers":[209],"framework":[211],"ongoing":[213],"ensuring":[216],"LLM":[218],"document":[224],"continue":[226],"meet":[228],"evolving":[229],"standards.":[230]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4388747663","counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-01-04T09:41:58.628607","created_date":"2023-11-17"}