{"id":"https://openalex.org/W4403853742","doi":"https://doi.org/10.48550/arxiv.2410.01610","title":"Upcycling Instruction Tuning from Dense to Mixture-of-Experts via\n Parameter Merging","display_name":"Upcycling Instruction Tuning from Dense to Mixture-of-Experts via\n Parameter Merging","publication_year":2024,"publication_date":"2024-10-02","ids":{"openalex":"https://openalex.org/W4403853742","doi":"https://doi.org/10.48550/arxiv.2410.01610"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.01610","pdf_url":"http://arxiv.org/pdf/2410.01610","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2410.01610","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113204589","display_name":"Tingfeng Hui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hui, Tingfeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114439635","display_name":"Zhenyu Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zhenyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101351951","display_name":"Shuohuan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shuohuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077454145","display_name":"Yuzhou Sun","orcid":"https://orcid.org/0000-0001-6105-1002"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100677198","display_name":"Hua Wu","orcid":"https://orcid.org/0000-0002-5687-7800"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Hua","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5036865453","display_name":"Sen Su","orcid":"https://orcid.org/0000-0003-4266-7527"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Sen","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":84},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.947,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.947,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.9037,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.47077224},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.40208027}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.01610","pdf_url":"http://arxiv.org/pdf/2410.01610","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.01610","pdf_url":"http://arxiv.org/pdf/2410.01610","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W4390401159","https://openalex.org/W4388998267","https://openalex.org/W4246450666","https://openalex.org/W2899084033","https://openalex.org/W2898370298","https://openalex.org/W2805339068","https://openalex.org/W2748952813","https://openalex.org/W2137437058","https://openalex.org/W2016187641"],"abstract_inverted_index":{"Mixture-of-Experts":[0],"(MoE)":[1],"shines":[2],"brightly":[3],"in":[4,13,120,166,180],"large":[5],"language":[6,16],"models":[7,92],"(LLMs)":[8],"and":[9,32,82,101,149,156],"demonstrates":[10],"outstanding":[11,154],"performance":[12,155],"plentiful":[14],"natural":[15],"processing":[17],"tasks.":[18],"However,":[19],"existing":[20],"methods":[21],"transforming":[22],"LLMs":[23],"from":[24],"dense":[25,53,74],"to":[26,89,106,139],"MoE":[27,58,122],"face":[28],"significant":[29],"data":[30,134,147,157,169],"requirements":[31],"typically":[33],"rely":[34],"on":[35],"large-scale":[36],"post-training.":[37],"In":[38],"this":[39],"paper,":[40],"we":[41,62,127],"propose":[42,84],"Upcycling":[43],"Instruction":[44],"Tuning":[45],"(UpIT),":[46],"a":[47,52,57,129],"data-efficient":[48],"approach":[49],"for":[50,79],"tuning":[51,71],"pre-trained":[54],"model":[55,75,123],"into":[56],"instruction":[59,70],"model.":[60],"Specifically,":[61],"first":[63],"point":[64],"out":[65],"that":[66,116,135],"intermediate":[67],"checkpoints":[68],"during":[69],"of":[72,96,110,132,159,176],"the":[73,121,141,153,174],"are":[76,104],"naturally":[77],"suitable":[78],"specialized":[80,118],"experts,":[81,97],"then":[83],"an":[85],"expert":[86,119,137,167,178],"expansion":[87],"stage":[88],"flexibly":[90],"achieve":[91],"with":[93,145],"flexible":[94],"numbers":[95],"where":[98],"genetic":[99],"algorithm":[100],"parameter":[102],"merging":[103],"introduced":[105],"ensure":[107,115],"sufficient":[108],"diversity":[109,179],"new":[111],"extended":[112],"experts.":[113],"To":[114],"each":[117,136],"works":[124],"as":[125,161,163],"expected,":[126],"select":[128],"small":[130],"amount":[131],"seed":[133],"excels":[138],"pre-optimize":[140],"router.":[142],"Extensive":[143],"experiments":[144],"various":[146],"scales":[148],"upcycling":[150],"settings":[151],"demonstrate":[152],"efficiency":[158],"UpIT,":[160],"well":[162],"stable":[164],"improvement":[165],"or":[168],"scaling.":[170],"Further":[171],"analysis":[172],"reveals":[173],"importance":[175],"ensuring":[177],"upcycling.":[181]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4403853742","counts_by_year":[],"updated_date":"2024-12-11T17:42:08.296127","created_date":"2024-10-29"}