{"id":"https://openalex.org/W4298181859","doi":"https://doi.org/10.48550/arxiv.2209.14389","title":"Downstream Datasets Make Surprisingly Good Pretraining Corpora","display_name":"Downstream Datasets Make Surprisingly Good Pretraining Corpora","publication_year":2022,"publication_date":"2022-01-01","ids":{"openalex":"https://openalex.org/W4298181859","doi":"https://doi.org/10.48550/arxiv.2209.14389"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2209.14389","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2209.14389","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102781494","display_name":"K. Siva Krishna","orcid":"https://orcid.org/0000-0003-3840-0950"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Krishna, Kundan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013136179","display_name":"Saurabh Garg","orcid":"https://orcid.org/0000-0001-8719-284X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Garg, Saurabh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082603621","display_name":"Jeffrey P. Bigham","orcid":"https://orcid.org/0000-0002-2072-0625"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bigham, Jeffrey P.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5029448258","display_name":"Zachary C. Lipton","orcid":"https://orcid.org/0000-0002-3824-4241"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lipton, Zachary C.","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.865382,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":80,"max":83},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9915,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9915,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9895,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.9659,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/labeled-data","display_name":"Labeled data","score":0.45696372},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.41161084}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.66945577},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.66128725},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6386874},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.62646925},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5704143},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.52382785},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5006335},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.45742974},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.45696372},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.41161084},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3910925},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2209.14389","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2209.14389","pdf_url":"http://arxiv.org/pdf/2209.14389","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2209.14389","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2209.14389","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"score":0.54,"id":"https://metadata.un.org/sdg/4","display_name":"Quality education"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4389518428","https://openalex.org/W4381058564","https://openalex.org/W4287635093","https://openalex.org/W4206178588","https://openalex.org/W3214715529","https://openalex.org/W3094491777","https://openalex.org/W2964413124","https://openalex.org/W2387743295","https://openalex.org/W2384605597","https://openalex.org/W1583765404"],"abstract_inverted_index":{"For":[0],"most":[1],"natural":[2],"language":[3],"processing":[4],"tasks,":[5,123,130],"the":[6,23,39,48,61,95,105,125,155,162,181,191],"dominant":[7],"practice":[8],"is":[9,66],"to":[10,31,38,47,175,190],"finetune":[11],"large":[12],"pretrained":[13,116],"transformer":[14],"models":[15,80,117],"(e.g.,":[16],"BERT)":[17],"using":[18,99],"smaller":[19],"downstream":[20,84],"datasets.":[21],"Despite":[22],"success":[24],"of":[25,58,154,193,207],"this":[26],"approach,":[27],"it":[28],"remains":[29],"unclear":[30],"what":[32],"extent":[33],"these":[34,114],"gains":[35,173],"are":[36,177,186,202],"attributable":[37,174,189],"massive":[40,198],"background":[41],"corpora":[42],"employed":[43],"for":[44,68],"pretraining":[45,49,70,93,160,176,182,195,217],"versus":[46],"objectives":[50],"themselves.":[51],"This":[52],"paper":[53],"introduces":[54],"a":[55],"large-scale":[56],"study":[57],"self-pretraining,":[59],"where":[60],"same":[62],"(downstream)":[63],"training":[64],"data":[65,196],"used":[67],"both":[69,76],"and":[71,78,81,109,146,185,212],"finetuning.":[72],"In":[73],"experiments":[74],"addressing":[75],"ELECTRA":[77],"RoBERTa":[79],"10":[82],"distinct":[83],"classification":[85,129],"datasets,":[86,111],"we":[87],"observe":[88],"that":[89,168],"self-pretraining":[90,131],"rivals":[91],"standard":[92],"on":[94,107,121,135,161],"BookWiki":[96,163],"corpus":[97],"(despite":[98],"around":[100],"$10\\times$--$500\\times$":[101],"less":[102],"data),":[103],"outperforming":[104],"latter":[106],"$7$":[108],"$5$":[110],"respectively.":[112],"Surprisingly,":[113],"task-specific":[115],"often":[118,149],"perform":[119],"well":[120],"other":[122],"including":[124],"GLUE":[126],"benchmark.":[127],"Besides":[128],"also":[132],"provides":[133],"benefits":[134],"structured":[136],"output":[137],"prediction":[138],"tasks":[139],"such":[140],"as":[141],"span":[142],"based":[143],"question":[144],"answering":[145],"commonsense":[147],"inference,":[148],"providing":[150],"more":[151],"than":[152],"$50\\%$":[153],"performance":[156,172],"boosts":[157],"provided":[158],"by":[159,180],"corpus.":[164],"Our":[165],"results":[166],"hint":[167],"in":[169,197,205,215],"many":[170],"scenarios,":[171],"driven":[178],"primarily":[179],"objective":[183],"itself":[184],"not":[187],"always":[188],"use":[192],"external":[194],"amounts.":[199],"These":[200],"findings":[201],"especially":[203],"relevant":[204],"light":[206],"concerns":[208],"about":[209],"intellectual":[210],"property":[211],"offensive":[213],"content":[214],"web-scale":[216],"data.":[218]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4298181859","counts_by_year":[{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":1}],"updated_date":"2024-12-09T21:03:28.932682","created_date":"2022-10-01"}