{"id":"https://openalex.org/W4402502835","doi":"https://doi.org/10.48550/arxiv.2408.08274","title":"BAM! Just Like That: Simple and Efficient Parameter Upcycling for\n Mixture of Experts","display_name":"BAM! Just Like That: Simple and Efficient Parameter Upcycling for\n Mixture of Experts","publication_year":2024,"publication_date":"2024-08-15","ids":{"openalex":"https://openalex.org/W4402502835","doi":"https://doi.org/10.48550/arxiv.2408.08274"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.08274","pdf_url":"http://arxiv.org/pdf/2408.08274","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2408.08274","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5039131712","display_name":"Qizhen Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Qizhen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107179580","display_name":"Nikolas Gritsch","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gritsch, Nikolas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107179581","display_name":"Dwaraknath Gnaneshwar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gnaneshwar, Dwaraknath","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111325457","display_name":"Simon Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Simon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5098901825","display_name":"David Cairuz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cairuz, David","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033303445","display_name":"Bharat Venkitesh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Venkitesh, Bharat","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059094093","display_name":"Jakob Foerster","orcid":"https://orcid.org/0000-0001-9688-2498"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Foerster, Jakob","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014309771","display_name":"Phil Blunsom","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Blunsom, Phil","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037310413","display_name":"Sebastian Ruder","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ruder, Sebastian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063061204","display_name":"Ahmet \u00dcst\u00fcn","orcid":"https://orcid.org/0000-0002-1640-4291"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ustun, Ahmet","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5003520377","display_name":"Acyr Locatelli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Locatelli, Acyr","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":84},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T12205","display_name":"Time Series Analysis and Forecasting","score":0.8429,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12205","display_name":"Time Series Analysis and Forecasting","score":0.8429,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12384","display_name":"Customer churn and segmentation","score":0.826,"subfield":{"id":"https://openalex.org/subfields/1406","display_name":"Marketing"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12761","display_name":"Data Stream Mining Techniques","score":0.8255,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.85773015},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.38900286},{"id":"https://openalex.org/C183696295","wikidata":"https://www.wikidata.org/wiki/Q2487696","display_name":"Biochemical engineering","level":1,"score":0.34953707},{"id":"https://openalex.org/C186060115","wikidata":"https://www.wikidata.org/wiki/Q30336093","display_name":"Biological system","level":1,"score":0.34217387},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.33384693},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.3277154},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.22320324},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.18419445},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.17788479},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.14201456}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.08274","pdf_url":"http://arxiv.org/pdf/2408.08274","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.08274","pdf_url":"http://arxiv.org/pdf/2408.08274","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W4387497383","https://openalex.org/W4306842392","https://openalex.org/W2948807893","https://openalex.org/W2778153218","https://openalex.org/W2748952813","https://openalex.org/W2527526854","https://openalex.org/W2078814861","https://openalex.org/W1976181487","https://openalex.org/W1531601525"],"abstract_inverted_index":{"The":[0],"Mixture":[1,142],"of":[2,77,113,141,143],"Experts":[3],"(MoE)":[4],"framework":[5],"has":[6],"become":[7],"a":[8,29,100,139,194],"popular":[9],"architecture":[10,198],"for":[11,151,167,184],"large":[12],"language":[13],"models":[14,44,93,116,162,217],"due":[15],"to":[16,49,62,81,123,182,199,209,222],"its":[17],"superior":[18],"performance":[19],"over":[20],"dense":[21,42,78,115,161],"models.":[22],"However,":[23,71],"training":[24],"MoEs":[25],"from":[26,160,219],"scratch":[27],"in":[28,231],"large-scale":[30],"regime":[31],"is":[32,54],"prohibitively":[33],"expensive.":[34],"Existing":[35],"methods":[36,150],"mitigate":[37],"this":[38,72,107],"by":[39,56,117,135],"pre-training":[40],"multiple":[41],"expert":[43],"independently":[45],"and":[46,172,176,206,234,242],"using":[47,57,120],"them":[48,137],"initialize":[50,63,124],"an":[51],"MoE.":[52],"This":[53],"done":[55],"experts'":[58,131],"feed-forward":[59],"network":[60],"(FFN)":[61],"the":[64,75,83,88,125,168,203,239],"MoE's":[65],"experts":[66,159,181,205,208],"while":[67],"merging":[68],"other":[69],"parameters.":[70],"method":[73,104],"limits":[74],"reuse":[76],"model":[79,170],"parameters":[80,133,166,178,225],"only":[82,119],"FFN":[84,122,207],"layers,":[85],"thereby":[86],"constraining":[87],"advantages":[89],"when":[90],"\"upcycling\"":[91],"these":[92],"into":[94,138],"MoEs.":[95],"We":[96,147],"propose":[97],"BAM":[98,109,228],"(Branch-Attend-Mix),":[99],"simple":[101],"yet":[102],"effective":[103],"that":[105,227],"addresses":[106],"shortcoming.":[108],"makes":[110],"full":[111],"use":[112],"specialized":[114],"not":[118],"their":[121],"MoE":[126],"layers":[127],"but":[128],"also":[129],"leveraging":[130],"attention":[132,153,158,165,196,204],"fully":[134],"initializing":[136,156],"soft-variant":[140],"Attention":[144],"(MoA)":[145],"layers.":[146],"explore":[148],"two":[149],"upcycling":[152],"parameters:":[154],"1)":[155],"separate":[157],"including":[163],"all":[164,180],"best":[169],"performance;":[171],"2)":[173],"sharing":[174],"key":[175],"value":[177],"across":[179],"facilitate":[183],"better":[185],"inference":[186],"efficiency.":[187],"To":[188],"further":[189],"improve":[190],"efficiency,":[191],"we":[192],"adopt":[193],"parallel":[195],"transformer":[197],"MoEs,":[200],"which":[201],"allows":[202],"be":[210],"computed":[211],"concurrently.":[212],"Our":[213],"experiments":[214],"on":[215],"seed":[216],"ranging":[218],"590":[220],"million":[221],"2":[223],"billion":[224],"demonstrate":[226],"surpasses":[229],"baselines":[230],"both":[232],"perplexity":[233],"downstream":[235],"task":[236],"performance,":[237],"within":[238],"same":[240],"computational":[241],"data":[243],"constraints.":[244]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4402502835","counts_by_year":[],"updated_date":"2024-12-15T07:53:45.332116","created_date":"2024-09-13"}