{"id":"https://openalex.org/W4402502966","doi":"https://doi.org/10.48550/arxiv.2408.08978","title":"See What LLMs Cannot Answer: A Self-Challenge Framework for Uncovering\n LLM Weaknesses","display_name":"See What LLMs Cannot Answer: A Self-Challenge Framework for Uncovering\n LLM Weaknesses","publication_year":2024,"publication_date":"2024-08-16","ids":{"openalex":"https://openalex.org/W4402502966","doi":"https://doi.org/10.48550/arxiv.2408.08978"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.08978","pdf_url":"http://arxiv.org/pdf/2408.08978","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2408.08978","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100777438","display_name":"Yulong Chen","orcid":"https://orcid.org/0000-0002-4960-5241"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yulong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115602248","display_name":"Yang Liu","orcid":"https://orcid.org/0000-0002-7240-1546"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101123756","display_name":"Jianhao Yan","orcid":"https://orcid.org/0000-0002-5670-1207"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Jianhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022924513","display_name":"Xuefeng Bai","orcid":"https://orcid.org/0000-0002-8757-7034"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Xuefeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100633973","display_name":"Zhong Ming","orcid":"https://orcid.org/0000-0001-9310-3460"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Ming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102516123","display_name":"Yinghao Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yinghao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018104523","display_name":"Ziyi Yang","orcid":"https://orcid.org/0000-0002-4138-5598"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Ziyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054819831","display_name":"Chenguang Zhu","orcid":"https://orcid.org/0000-0002-7343-8279"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Chenguang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100333729","display_name":"Yue Zhang","orcid":"https://orcid.org/0000-0002-5214-2268"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yue","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":84},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T13851","display_name":"Law, AI, and Intellectual Property","score":0.8962,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13851","display_name":"Law, AI, and Intellectual Property","score":0.8962,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13643","display_name":"Artificial Intelligence in Law","score":0.8273,"subfield":{"id":"https://openalex.org/subfields/3320","display_name":"Political Science and International Relations"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/strengths-and-weaknesses","display_name":"Strengths and weaknesses","score":0.555791}],"concepts":[{"id":"https://openalex.org/C63882131","wikidata":"https://www.wikidata.org/wiki/Q17122954","display_name":"Strengths and weaknesses","level":2,"score":0.555791},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.4425137},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.32072014},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.17403036}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.08978","pdf_url":"http://arxiv.org/pdf/2408.08978","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.08978","pdf_url":"http://arxiv.org/pdf/2408.08978","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W93605524","https://openalex.org/W821271700","https://openalex.org/W4391375266","https://openalex.org/W4295769391","https://openalex.org/W3112960490","https://openalex.org/W2972220648","https://openalex.org/W2748952813","https://openalex.org/W2332667808","https://openalex.org/W2021145421","https://openalex.org/W1997921863"],"abstract_inverted_index":{"The":[0,140],"impressive":[1],"performance":[2],"of":[3,21,40,127,153,162],"Large":[4],"Language":[5],"Models":[6],"(LLMs)":[7],"has":[8],"consistently":[9],"surpassed":[10],"numerous":[11],"human-designed":[12],"benchmarks,":[13],"presenting":[14],"new":[15,87],"challenges":[16],"in":[17,164],"assessing":[18],"the":[19,38,50,200],"shortcomings":[20],"LLMs.":[22],"Designing":[23],"tasks":[24],"and":[25,89,116,188,190,213,219],"finding":[26],"LLMs'":[27,154],"limitations":[28,48],"are":[29],"becoming":[30],"increasingly":[31],"important.":[32],"In":[33],"this":[34,55],"paper,":[35],"we":[36,57,74],"investigate":[37],"question":[39],"whether":[41],"an":[42],"LLM":[43],"can":[44,82,166,207],"discover":[45],"its":[46],"own":[47],"from":[49,66],"errors":[51],"it":[52],"makes.":[53],"To":[54],"end,":[56],"propose":[58],"a":[59,123,144,150],"Self-Challenge":[60],"evaluation":[61],"framework":[62],"with":[63,108,118,136],"human-in-the-loop.":[64],"Starting":[65],"seed":[67],"instances":[68,88,129,163],"that":[69,81,147,159,177,205],"GPT-4":[70,76,132],"fails":[71],"to":[72,77,85,95,203],"answer,":[73],"prompt":[75],"summarize":[78],"error":[79,179],"patterns":[80,98,180],"be":[83,167,192],"used":[84],"generate":[86],"incorporate":[90],"human":[91],"feedback":[92],"on":[93],"them":[94],"refine":[96],"these":[97,134,178],"for":[99,149,216],"generating":[100],"more":[101],"challenging":[102,145],"data,":[103],"iteratively.":[104],"We":[105,120],"end":[106],"up":[107],"8":[109],"diverse":[110],"patterns,":[111,135],"such":[112,185],"as":[113,143,186],"text":[114],"manipulation":[115],"questions":[117],"assumptions.":[119],"then":[121],"build":[122],"benchmark,":[124],"SC-G4,":[125],"consisting":[126],"1,835":[128],"generated":[130],"by":[131,170],"using":[133],"human-annotated":[137],"gold":[138],"responses.":[139],"SC-G4":[141,165],"serves":[142],"benchmark":[146],"allows":[148],"detailed":[151],"assessment":[152],"abilities.":[155],"Our":[156,197],"results":[157],"show":[158],"only":[160],"44.96\\%":[161],"answered":[168],"correctly":[169],"GPT-4.":[171],"Interestingly,":[172],"our":[173],"pilot":[174],"study":[175],"indicates":[176],"also":[181],"challenge":[182],"other":[183],"LLMs,":[184],"Claude-3":[187],"Llama-3,":[189],"cannot":[191],"fully":[193],"resolved":[194],"through":[195],"fine-tuning.":[196],"work":[198],"takes":[199],"first":[201],"step":[202],"demonstrate":[204],"LLMs":[206],"autonomously":[208],"identify":[209],"their":[210],"inherent":[211],"flaws":[212],"provide":[214],"insights":[215],"future":[217],"dynamic":[218],"automatic":[220],"evaluation.":[221]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4402502966","counts_by_year":[],"updated_date":"2024-12-24T02:03:00.788567","created_date":"2024-09-14"}