{"id":"https://openalex.org/W4393905718","doi":"https://doi.org/10.48550/arxiv.2404.00599","title":"EvoCodeBench: An Evolving Code Generation Benchmark Aligned with\n Real-World Code Repositories","display_name":"EvoCodeBench: An Evolving Code Generation Benchmark Aligned with\n Real-World Code Repositories","publication_year":2024,"publication_date":"2024-03-31","ids":{"openalex":"https://openalex.org/W4393905718","doi":"https://doi.org/10.48550/arxiv.2404.00599"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2404.00599","pdf_url":"https://arxiv.org/pdf/2404.00599","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2404.00599","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102859755","display_name":"Jia Li","orcid":"https://orcid.org/0000-0003-4411-6614"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Jia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100447691","display_name":"Ge Li","orcid":"https://orcid.org/0000-0003-0140-0949"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Ge","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101829922","display_name":"Xuanming Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xuanming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069159185","display_name":"Yihong Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Yihong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5049100391","display_name":"Zhi Jin","orcid":"https://orcid.org/0000-0003-1087-226X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Zhi","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.996894,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":93,"max":96},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11450","display_name":"Model-Driven Software Engineering Techniques","score":0.9959,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11450","display_name":"Model-Driven Software Engineering Techniques","score":0.9959,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9921,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.9794,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.74958664},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.72837806}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.74958664},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.72837806},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6078041},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.47357497},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.45016676},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.25762427},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.18485084},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.075356185},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.041053325},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2404.00599","pdf_url":"https://arxiv.org/pdf/2404.00599","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2404.00599","pdf_url":"https://arxiv.org/pdf/2404.00599","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4321353415","https://openalex.org/W2748952813","https://openalex.org/W2745001401","https://openalex.org/W2378211422","https://openalex.org/W2258184894","https://openalex.org/W2163672025","https://openalex.org/W2130974462","https://openalex.org/W2048831961","https://openalex.org/W2028665553","https://openalex.org/W1606349578"],"abstract_inverted_index":{"How":[0],"to":[1,26,41,92,101],"evaluate":[2,27,131],"Large":[3],"Language":[4],"Models":[5],"(LLMs)":[6],"in":[7,57,156,169,184],"code":[8,21,61,128],"generation":[9,129],"is":[10,88,167],"an":[11,89,98],"open":[12],"question.":[13],"Existing":[14],"benchmarks":[15],"demonstrate":[16],"poor":[17],"alignment":[18],"with":[19,54],"real-world":[20,55,120,157],"repositories":[22,56],"and":[23,63,75,78,84,130,144,177,191],"are":[24],"insufficient":[25],"the":[28,43,105,110,150,161,179],"coding":[29,151],"abilities":[30,152],"of":[31,153,164,181],"LLMs.":[32],"This":[33],"paper":[34],"proposes":[35],"a":[36],"new":[37],"benchmark":[38,91],"-":[39,113],"EvoCodeBench":[40,52,67,87,103],"address":[42],"preceding":[44],"problems,":[45],"which":[46],"has":[47],"three":[48],"primary":[49],"advances.":[50],"(1)":[51],"aligns":[53],"multiple":[58],"dimensions,":[59],"e.g.,":[60],"distributions":[62],"dependency":[64],"distributions.":[65],"(2)":[66],"offers":[68],"comprehensive":[69],"annotations":[70],"(e.g.,":[71,82,135],"requirements,":[72],"reference":[73,76],"code,":[74],"dependencies),":[77],"robust":[79],"evaluation":[80],"metrics":[81],"Pass@k":[83],"Recall@k).":[85],"(3)":[86],"evolving":[90],"avoid":[93],"data":[94],"leakage.":[95],"We":[96,108,172,186],"build":[97],"automatic":[99],"pipeline":[100],"update":[102],"from":[104,118],"latest":[106],"repositories.":[107,121,158],"release":[109,187],"first":[111],"version":[112],"EvoCodeBench-2403,":[114],"containing":[115],"275":[116],"samples":[117],"25":[119],"Based":[122],"on":[123],"EvoCodeBench,":[124,188],"we":[125],"propose":[126],"repository-level":[127],"10":[132],"popular":[133],"LLMs":[134,155,183],"gpt-4,":[136],"gpt-3.5,":[137],"DeepSeek":[138],"Coder,":[139],"StarCoder":[140],"2,":[141],"CodeLLaMa,":[142],"Gemma,":[143],"Qwen":[145],"1.5).":[146],"Our":[147],"experiments":[148],"reveal":[149],"these":[154],"For":[159],"example,":[160],"highest":[162],"Pass@1":[163],"gpt-4":[165],"only":[166],"20.73%":[168],"our":[170],"experiments.":[171],"also":[173],"analyze":[174],"failed":[175],"cases":[176],"summarize":[178],"shortcomings":[180],"existing":[182],"EvoCodeBench.":[185],"all":[189],"prompts,":[190],"LLMs'":[192],"completions":[193],"for":[194],"further":[195],"community":[196],"analysis.":[197]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4393905718","counts_by_year":[{"year":2024,"cited_by_count":3}],"updated_date":"2024-12-05T00:28:30.238705","created_date":"2024-04-04"}