{"id":"https://openalex.org/W4226426852","doi":"https://doi.org/10.48550/arxiv.2204.00595","title":"Monarch: Expressive Structured Matrices for Efficient and Accurate Training","display_name":"Monarch: Expressive Structured Matrices for Efficient and Accurate Training","publication_year":2022,"publication_date":"2022-01-01","ids":{"openalex":"https://openalex.org/W4226426852","doi":"https://doi.org/10.48550/arxiv.2204.00595"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2204.00595","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2204.00595","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5091734792","display_name":"Tri Dao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dao, Tri","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031842648","display_name":"Beidi Chen","orcid":"https://orcid.org/0000-0002-7586-1855"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Beidi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070318397","display_name":"Nimit S. Sohoni","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sohoni, Nimit","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062564649","display_name":"Arjun Desai","orcid":"https://orcid.org/0000-0003-0645-3257"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Desai, Arjun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078213488","display_name":"Michael Poli","orcid":"https://orcid.org/0000-0001-5384-9372"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Poli, Michael","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039570590","display_name":"Jessica Grogan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Grogan, Jessica","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001277622","display_name":"Alexander Liu","orcid":"https://orcid.org/0000-0002-3985-982X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Alexander","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112286697","display_name":"A. Rajeshwar Rao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rao, Aniruddh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001041485","display_name":"Atri Rudra","orcid":"https://orcid.org/0000-0003-4136-4719"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rudra, Atri","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5103852640","display_name":"Christopher R\u00e9","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"R\u00e9, Christopher","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.633892,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":81,"max":84},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9925,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9925,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9914,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.9691,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.5758552},{"id":"https://openalex.org/keywords/representation","display_name":"Representation","score":0.50362915},{"id":"https://openalex.org/keywords/rank","display_name":"Rank (graph theory)","score":0.4444887}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.74397266},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.5758552},{"id":"https://openalex.org/C124066611","wikidata":"https://www.wikidata.org/wiki/Q28684319","display_name":"Sparse approximation","level":2,"score":0.5125499},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.50362915},{"id":"https://openalex.org/C130367717","wikidata":"https://www.wikidata.org/wiki/Q189791","display_name":"Diagonal","level":2,"score":0.4770024},{"id":"https://openalex.org/C165464430","wikidata":"https://www.wikidata.org/wiki/Q1570441","display_name":"Parameterized complexity","level":2,"score":0.4751023},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.46672383},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.4537114},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.4444887},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.41884732},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.41657367},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34205517},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3315345},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.1529507},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.14232534},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2204.00595","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2204.00595","pdf_url":"http://arxiv.org/pdf/2204.00595","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2204.00595","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2204.00595","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4318256793","https://openalex.org/W3173235360","https://openalex.org/W2594370889","https://openalex.org/W2390720471","https://openalex.org/W2174948646","https://openalex.org/W2169356733","https://openalex.org/W2091883426","https://openalex.org/W2051410394","https://openalex.org/W2024017047","https://openalex.org/W1600789676"],"abstract_inverted_index":{"Large":[0],"neural":[1],"networks":[2],"excel":[3],"in":[4,47,57,155],"many":[5,105],"domains,":[6],"but":[7],"they":[8],"are":[9,88],"expensive":[10],"to":[11,18,26,51,61,66,137,211],"train":[12,138],"and":[13,55,100,139,142,164,170,180,187],"fine-tune.":[14],"A":[15],"popular":[16],"approach":[17],"reduce":[19],"their":[20],"compute":[21],"or":[22],"memory":[23],"requirements":[24],"is":[25,85],"replace":[27],"dense":[28,70,115,143],"weight":[29,71,116],"matrices":[30,82,95,133,204],"with":[31,118,176,196,263],"structured":[32],"ones":[33],"(e.g.,":[34],"sparse,":[35],"low-rank,":[36],"Fourier":[37],"transform).":[38],"These":[39,129],"methods":[40],"have":[41],"not":[42],"seen":[43],"widespread":[44],"adoption":[45],"(1)":[46],"end-to-end":[48,157],"training":[49,159,166],"due":[50,60],"unfavorable":[52],"efficiency--quality":[53],"tradeoffs,":[54],"(2)":[56],"dense-to-sparse":[58,246],"fine-tuning":[59,258],"lack":[62],"of":[63,81,92,112,131],"tractable":[64],"algorithms":[65],"approximate":[67],"a":[68,79,114,119,197,207,249],"given":[69],"matrix.":[72],"To":[73],"address":[74],"these":[75],"issues,":[76],"we":[77],"propose":[78],"class":[80],"(Monarch)":[83],"that":[84,148,239],"hardware-efficient":[86],"(they":[87,102],"parameterized":[89],"as":[90,206,248],"products":[91],"two":[93],"block-diagonal":[94],"for":[96],"better":[97],"hardware":[98],"utilization)":[99],"expressive":[101],"can":[103,150],"represent":[104],"commonly":[106],"used":[107],"transforms).":[108],"Surprisingly,":[109],"the":[110,182,233,241],"problem":[111],"approximating":[113],"matrix":[117],"Monarch":[120,132,149,203,252],"matrix,":[121],"though":[122],"nonconvex,":[123],"has":[124],"an":[125],"analytical":[126],"optimal":[127],"solution.":[128],"properties":[130],"unlock":[134],"new":[135],"ways":[136],"fine-tune":[140],"sparse":[141,158],"models.":[144],"We":[145],"empirically":[146],"validate":[147],"achieve":[151],"favorable":[152],"accuracy-efficiency":[153],"tradeoffs":[154],"several":[156],"applications:":[160],"speeding":[161],"up":[162,213,256],"ViT":[163],"GPT-2":[165,214],"on":[167,184,216,259],"ImageNet":[168],"classification":[169],"Wikitext-103":[171],"language":[172],"modeling":[173],"by":[174,191,218,261],"2x":[175,219],"comparable":[177,264],"model":[178],"quality,":[179],"reducing":[181],"error":[183],"PDE":[185],"solving":[186],"MRI":[188],"reconstruction":[189],"tasks":[190],"40%.":[192],"In":[193,245],"sparse-to-dense":[194],"training,":[195],"simple":[198],"technique":[199,225],"called":[200],"\"reverse":[201],"sparsification,\"":[202],"serve":[205],"useful":[208],"intermediate":[209],"representation":[210],"speed":[212],"pretraining":[215,230],"OpenWebText":[217],"without":[220],"quality":[221],"drop.":[222],"The":[223],"same":[224],"brings":[226],"23%":[227],"faster":[228],"BERT":[229,257],"than":[231],"even":[232],"very":[234],"optimized":[235],"implementation":[236],"from":[237],"Nvidia":[238],"set":[240],"MLPerf":[242],"1.1":[243],"record.":[244],"fine-tuning,":[247],"proof-of-concept,":[250],"our":[251],"approximation":[253],"algorithm":[254],"speeds":[255],"GLUE":[260],"1.7x":[262],"accuracy.":[265]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4226426852","counts_by_year":[{"year":2023,"cited_by_count":5}],"updated_date":"2025-04-24T11:48:58.374567","created_date":"2022-05-05"}