{"id":"https://openalex.org/W4404089414","doi":"https://doi.org/10.48550/arxiv.2410.16215","title":"Pre-training Distillation for Large Language Models: A Design Space\n Exploration","display_name":"Pre-training Distillation for Large Language Models: A Design Space\n Exploration","publication_year":2024,"publication_date":"2024-10-21","ids":{"openalex":"https://openalex.org/W4404089414","doi":"https://doi.org/10.48550/arxiv.2410.16215"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.16215","pdf_url":"http://arxiv.org/pdf/2410.16215","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2410.16215","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100740622","display_name":"Hao Peng","orcid":"https://orcid.org/0000-0003-0458-5977"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011372729","display_name":"Xin Lv","orcid":"https://orcid.org/0009-0009-8062-0675"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lv, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058973687","display_name":"Yushi Bai","orcid":"https://orcid.org/0000-0003-1295-1615"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Yushi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046687207","display_name":"Zijun Yao","orcid":"https://orcid.org/0000-0002-0288-9283"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Zijun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100626090","display_name":"Jiajie Zhang","orcid":"https://orcid.org/0000-0003-3928-8171"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jiajie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060498828","display_name":"Lei Hou","orcid":"https://orcid.org/0000-0002-8907-3526"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hou, Lei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5003324011","display_name":"Juanzi Li","orcid":"https://orcid.org/0000-0002-6244-0664"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Juanzi","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":77},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9715,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9715,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9552,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.6725176},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.64147323},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.5454335},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5336877},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38246822},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.32032365},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.14620233},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.10657182},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.09697342},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.16215","pdf_url":"http://arxiv.org/pdf/2410.16215","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.16215","pdf_url":"http://arxiv.org/pdf/2410.16215","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4394896187","https://openalex.org/W4386462264","https://openalex.org/W4364306694","https://openalex.org/W4312192474","https://openalex.org/W4306674287","https://openalex.org/W4283697347","https://openalex.org/W3170094116","https://openalex.org/W3107602296","https://openalex.org/W3046775127","https://openalex.org/W2961085424"],"abstract_inverted_index":{"Knowledge":[0],"distillation":[1,65,106,132],"(KD)":[2],"aims":[3],"to":[4,12,57,79,125],"transfer":[5],"knowledge":[6],"from":[7,41,148],"a":[8,13,70,81,152],"large":[9,25],"teacher":[10,49,77,154],"model":[11],"smaller":[14],"student":[15,37,84,143],"model.":[16,50],"Previous":[17],"work":[18],"applying":[19],"KD":[20,56],"in":[21,174],"the":[22,32,36,48,58,76,87,92,101,127,167],"field":[23],"of":[24,61,89,96,104,130,166],"language":[26],"models":[27],"(LLMs)":[28],"typically":[29],"focused":[30],"on":[31],"post-training":[33],"phase,":[34],"where":[35],"LLM":[38,78,155],"learns":[39],"directly":[40],"instructions":[42],"and":[43,116,133,137],"corresponding":[44],"responses":[45],"generated":[46],"by":[47],"In":[51],"this":[52],"paper,":[53],"we":[54,98],"extend":[55],"pre-training":[59,64,105,131,149,175],"phase":[60],"LLMs,":[62],"named":[63],"(PD).":[66],"We":[67,121,162],"first":[68],"conduct":[69,122],"preliminary":[71],"experiment":[72],"using":[73],"GLM-4-9B":[74],"as":[75,141],"distill":[80],"1.9B":[82],"parameter":[83],"LLM,":[85],"validating":[86],"effectiveness":[88],"PD.":[90],"Considering":[91],"key":[93],"impact":[94],"factors":[95],"distillation,":[97,150],"systematically":[99],"explore":[100,126],"design":[102,128,168],"space":[103,129,169],"across":[107],"four":[108],"aspects:":[109],"logits":[110],"processing,":[111],"loss":[112],"selection,":[113],"scaling":[114],"law,":[115],"offline":[117],"or":[118],"online":[119],"logits.":[120],"extensive":[123],"experiments":[124],"find":[134],"better":[135,160],"configurations":[136],"interesting":[138],"conclusions,":[139],"such":[140],"larger":[142,153],"LLMs":[144],"generally":[145],"benefiting":[146],"more":[147],"while":[151],"does":[156],"not":[157],"necessarily":[158],"guarantee":[159],"results.":[161],"hope":[163],"our":[164],"exploration":[165],"will":[170],"inform":[171],"future":[172],"practices":[173],"distillation.":[176]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4404089414","counts_by_year":[],"updated_date":"2025-04-16T03:33:52.166246","created_date":"2024-11-06"}