{"id":"https://openalex.org/W4385576721","doi":"https://doi.org/10.1016/j.patter.2023.100804","title":"Leakage and the reproducibility crisis in machine-learning-based science","display_name":"Leakage and the reproducibility crisis in machine-learning-based science","publication_year":2023,"publication_date":"2023-08-04","ids":{"openalex":"https://openalex.org/W4385576721","doi":"https://doi.org/10.1016/j.patter.2023.100804","pmid":"https://pubmed.ncbi.nlm.nih.gov/37720327"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.patter.2023.100804","pdf_url":"https://www.cell.com/article/S2666389923001599/pdf","source":{"id":"https://openalex.org/S4210198061","display_name":"Patterns","issn_l":"2666-3899","issn":["2666-3899"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true},"type":"article","type_crossref":"journal-article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.cell.com/article/S2666389923001599/pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5063345981","display_name":"Sayash Kapoor","orcid":"https://orcid.org/0000-0001-5695-280X"},"institutions":[{"id":"https://openalex.org/I29955533","display_name":"Center for Information Technology","ror":"https://ror.org/03jh5a977","country_code":"US","type":"facility","lineage":["https://openalex.org/I1299022934","https://openalex.org/I1299303238","https://openalex.org/I29955533"]},{"id":"https://openalex.org/I20089843","display_name":"Princeton University","ror":"https://ror.org/00hx57361","country_code":"US","type":"education","lineage":["https://openalex.org/I20089843"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Sayash Kapoor","raw_affiliation_strings":["Department of Computer Science and Center for Information Technology Policy, Princeton University, Princeton, NJ 08540, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Center for Information Technology Policy, Princeton University, Princeton, NJ 08540, USA","institution_ids":["https://openalex.org/I29955533","https://openalex.org/I20089843"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5058102069","display_name":"Arvind Narayanan","orcid":"https://orcid.org/0000-0001-7176-4479"},"institutions":[{"id":"https://openalex.org/I29955533","display_name":"Center for Information Technology","ror":"https://ror.org/03jh5a977","country_code":"US","type":"facility","lineage":["https://openalex.org/I1299022934","https://openalex.org/I1299303238","https://openalex.org/I29955533"]},{"id":"https://openalex.org/I20089843","display_name":"Princeton University","ror":"https://ror.org/00hx57361","country_code":"US","type":"education","lineage":["https://openalex.org/I20089843"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Arvind Narayanan","raw_affiliation_strings":["Department of Computer Science and Center for Information Technology Policy, Princeton University, Princeton, NJ 08540, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Center for Information Technology Policy, Princeton University, Princeton, NJ 08540, USA","institution_ids":["https://openalex.org/I29955533","https://openalex.org/I20089843"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5063345981"],"corresponding_institution_ids":["https://openalex.org/I29955533","https://openalex.org/I20089843"],"apc_list":{"value":5200,"currency":"USD","value_usd":5200,"provenance":"doaj"},"apc_paid":{"value":5200,"currency":"USD","value_usd":5200,"provenance":"doaj"},"fwci":61.303,"has_fulltext":false,"cited_by_count":183,"citation_normalized_percentile":{"value":0.999976,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"4","issue":"9","first_page":"100804","last_page":"100804"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.9956,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.9956,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9925,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.9675,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leakage","display_name":"Leakage (economics)","score":0.63091767}],"concepts":[{"id":"https://openalex.org/C9893847","wikidata":"https://www.wikidata.org/wiki/Q1425625","display_name":"Reproducibility","level":2,"score":0.7695129},{"id":"https://openalex.org/C2777042071","wikidata":"https://www.wikidata.org/wiki/Q6509304","display_name":"Leakage (economics)","level":2,"score":0.63091767},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.55872464},{"id":"https://openalex.org/C151956035","wikidata":"https://www.wikidata.org/wiki/Q1132755","display_name":"Logistic regression","level":2,"score":0.4895607},{"id":"https://openalex.org/C83546350","wikidata":"https://www.wikidata.org/wiki/Q1139051","display_name":"Regression","level":2,"score":0.45459747},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43747377},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.43214834},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.38359103},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.2751484},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1725499},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C139719470","wikidata":"https://www.wikidata.org/wiki/Q39680","display_name":"Macroeconomics","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.patter.2023.100804","pdf_url":"https://www.cell.com/article/S2666389923001599/pdf","source":{"id":"https://openalex.org/S4210198061","display_name":"Patterns","issn_l":"2666-3899","issn":["2666-3899"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true},{"is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10499856","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":["National Institutes of Health"],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true},{"is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/37720327","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":["National Institutes of Health"],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.patter.2023.100804","pdf_url":"https://www.cell.com/article/S2666389923001599/pdf","source":{"id":"https://openalex.org/S4210198061","display_name":"Patterns","issn_l":"2666-3899","issn":["2666-3899"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true},"sustainable_development_goals":[],"grants":[{"funder":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation","award_id":"IIS-1763642"}],"datasets":[],"versions":[],"referenced_works_count":88,"referenced_works":["https://openalex.org/W1622128722","https://openalex.org/W1673923490","https://openalex.org/W1677182931","https://openalex.org/W1897139626","https://openalex.org/W1957051317","https://openalex.org/W1979450493","https://openalex.org/W1998392635","https://openalex.org/W2006617902","https://openalex.org/W2019694480","https://openalex.org/W2076602957","https://openalex.org/W2084341220","https://openalex.org/W2092444470","https://openalex.org/W2117539524","https://openalex.org/W2125186487","https://openalex.org/W2137195266","https://openalex.org/W2278972578","https://openalex.org/W2284729062","https://openalex.org/W2472069995","https://openalex.org/W2557007068","https://openalex.org/W2557088923","https://openalex.org/W2560136348","https://openalex.org/W2581974582","https://openalex.org/W2584419678","https://openalex.org/W2591090211","https://openalex.org/W2599674900","https://openalex.org/W2736287575","https://openalex.org/W2747402019","https://openalex.org/W2755142264","https://openalex.org/W2783880833","https://openalex.org/W2785011159","https://openalex.org/W2791681873","https://openalex.org/W2793993424","https://openalex.org/W2811374795","https://openalex.org/W2888592790","https://openalex.org/W2900246877","https://openalex.org/W2903488876","https://openalex.org/W2906730591","https://openalex.org/W2911926566","https://openalex.org/W2913668833","https://openalex.org/W2913997948","https://openalex.org/W2922705140","https://openalex.org/W2951243568","https://openalex.org/W2963905884","https://openalex.org/W2966686526","https://openalex.org/W2984916588","https://openalex.org/W2990091959","https://openalex.org/W3006027555","https://openalex.org/W3007502937","https://openalex.org/W3013182397","https://openalex.org/W3013294478","https://openalex.org/W3017867703","https://openalex.org/W3019226033","https://openalex.org/W3022995359","https://openalex.org/W3029827716","https://openalex.org/W3099348293","https://openalex.org/W3103934428","https://openalex.org/W3104239639","https://openalex.org/W3106983564","https://openalex.org/W3128540648","https://openalex.org/W3136933888","https://openalex.org/W3138114698","https://openalex.org/W3142490982","https://openalex.org/W3154124171","https://openalex.org/W3155717344","https://openalex.org/W3160678577","https://openalex.org/W3168318937","https://openalex.org/W3168499785","https://openalex.org/W3170561227","https://openalex.org/W3174174150","https://openalex.org/W3175126868","https://openalex.org/W3184231327","https://openalex.org/W3205127938","https://openalex.org/W3207830467","https://openalex.org/W3212368439","https://openalex.org/W3212464620","https://openalex.org/W3217314753","https://openalex.org/W4221165557","https://openalex.org/W4241857777","https://openalex.org/W4244185376","https://openalex.org/W4248754047","https://openalex.org/W4253752119","https://openalex.org/W4286850188","https://openalex.org/W4287207825","https://openalex.org/W4287637491","https://openalex.org/W4287825389","https://openalex.org/W429766147","https://openalex.org/W4389879592","https://openalex.org/W46790137"],"related_works":["https://openalex.org/W638612878","https://openalex.org/W4324138256","https://openalex.org/W3211735916","https://openalex.org/W3106281778","https://openalex.org/W2417696084","https://openalex.org/W2413717610","https://openalex.org/W2368782778","https://openalex.org/W2140154150","https://openalex.org/W2087830269","https://openalex.org/W1973270181"],"abstract_inverted_index":{"Machine-learning":[0],"(ML)":[1],"methods":[2],"have":[3,39],"gained":[4],"prominence":[5],"in":[6,20,28,36,57],"the":[7,134],"quantitative":[8],"sciences.":[9],"However,":[10],"there":[11],"are":[12,120,136],"many":[13],"known":[14],"methodological":[15],"pitfalls,":[16],"including":[17],"data":[18],"leakage,":[19,78],"ML-based":[21,29],"science.":[22,30],"We":[23,87],"systematically":[24],"investigate":[25],"reproducibility":[26,110],"issues":[27],"Through":[31],"a":[32,71,109],"survey":[33],"of":[34,74,77,95,112],"literature":[35],"fields":[37,46],"that":[38,89],"adopted":[40],"ML":[41,118,139],"methods,":[42],"we":[43,69,104,107],"find":[44],"17":[45],"where":[47,116],"leakage":[48,96],"has":[49],"been":[50],"found,":[51],"collectively":[52],"affecting":[53],"294":[54],"papers":[55],"and,":[56],"some":[58],"cases,":[59],"leading":[60],"to":[61,83,122],"wildly":[62],"overoptimistic":[63],"conclusions.":[64],"Based":[65],"on":[66],"our":[67],"survey,":[68],"introduce":[70],"detailed":[72],"taxonomy":[73],"eight":[75],"types":[76],"ranging":[79],"from":[80],"textbook":[81],"errors":[82,135],"open":[84],"research":[85],"problems.":[86],"propose":[88],"researchers":[90],"test":[91],"for":[92],"each":[93],"type":[94],"by":[97],"filling":[98],"out":[99],"model":[100],"info":[101],"sheets,":[102],"which":[103],"introduce.":[105],"Finally,":[106],"conduct":[108],"study":[111],"civil":[113],"war":[114],"prediction,":[115],"complex":[117,138],"models":[119,127,140],"believed":[121],"vastly":[123],"outperform":[124],"traditional":[125],"statistical":[126],"such":[128],"as":[129],"logistic":[130],"regression":[131],"(LR).":[132],"When":[133],"corrected,":[137],"do":[141],"not":[142],"perform":[143],"substantively":[144],"better":[145],"than":[146],"decades-old":[147],"LR":[148],"models.":[149]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4385576721","counts_by_year":[{"year":2024,"cited_by_count":134},{"year":2023,"cited_by_count":34},{"year":2022,"cited_by_count":6}],"updated_date":"2024-12-14T01:49:03.343378","created_date":"2023-08-05"}