{"id":"https://openalex.org/W4389814425","doi":"https://doi.org/10.48550/arxiv.2312.08553","title":"USM-Lite: Quantization and Sparsity Aware Fine-tuning for Speech Recognition with Universal Speech Models","display_name":"USM-Lite: Quantization and Sparsity Aware Fine-tuning for Speech Recognition with Universal Speech Models","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4389814425","doi":"https://doi.org/10.48550/arxiv.2312.08553"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2312.08553","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2312.08553","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058886181","display_name":"Shaojin Ding","orcid":"https://orcid.org/0000-0002-2108-3111"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Shaojin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5105940239","display_name":"Qiu David","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"David, Qiu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092046744","display_name":"David Rim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rim, David","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101319167","display_name":"Yanzhang He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Yanzhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070398306","display_name":"Oleg Rybakov\u200e","orcid":"https://orcid.org/0000-0003-4805-3083"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rybakov, Oleg","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100374360","display_name":"Bo Li","orcid":"https://orcid.org/0000-0001-6709-0942"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Bo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032640894","display_name":"Rohit Prabhavalkar","orcid":"https://orcid.org/0000-0001-5331-6058"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Prabhavalkar, Rohit","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101432591","display_name":"Weiran Wang","orcid":"https://orcid.org/0009-0000-0843-707X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Weiran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070513394","display_name":"Tara N. Sainath","orcid":"https://orcid.org/0000-0002-4126-6556"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sainath, Tara N.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033596864","display_name":"Shivani Agrawal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Agrawal, Shivani","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043621779","display_name":"Zhonglin Han","orcid":"https://orcid.org/0000-0001-8795-7953"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Zhonglin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100402391","display_name":"Jian Li","orcid":"https://orcid.org/0000-0001-5335-9832"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Jian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5070172290","display_name":"Amir Yazdanbakhsh","orcid":"https://orcid.org/0000-0001-8199-7671"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yazdanbakhsh, Amir","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":67},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9988,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9957,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.42404187}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7394237},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.7057565},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5088887},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.42404187},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4123367}],"mesh":[],"locations_count":3,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2312.08553","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.08553","pdf_url":"http://arxiv.org/pdf/2312.08553","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2312.08553","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2312.08553","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"score":0.64,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, justice, and strong institutions"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2547793174","https://openalex.org/W2544241817","https://openalex.org/W2309273277","https://openalex.org/W2132885390","https://openalex.org/W2132658536","https://openalex.org/W2070212102","https://openalex.org/W2061937230","https://openalex.org/W1769849273","https://openalex.org/W1574295218","https://openalex.org/W113247760"],"abstract_inverted_index":{"End-to-end":[0],"automatic":[1],"speech":[2,18],"recognition":[3],"(ASR)":[4],"models":[5,19],"have":[6,157],"seen":[7],"revolutionary":[8],"quality":[9],"gains":[10],"with":[11,66,95],"the":[12,31,77,81,117,135,154,160,163,180,185],"recent":[13],"development":[14],"of":[15,113,119,159,165],"large-scale":[16,102],"universal":[17],"(USM).":[20],"However,":[21,127],"deploying":[22],"these":[23],"massive":[24],"USMs":[25],"is":[26,41],"extremely":[27],"expensive":[28],"due":[29],"to":[30,46,106,121,133,156],"enormous":[32],"memory":[33],"usage":[34],"and":[35,70,87,124,144,182,187],"computational":[36],"cost.":[37],"Therefore,":[38],"model":[39,78,82,155],"compression":[40,130],"an":[42],"important":[43],"research":[44],"topic":[45],"fit":[47],"USM-based":[48],"ASR":[49],"under":[50,138],"budget":[51],"in":[52],"real-world":[53],"scenarios.":[54],"In":[55],"this":[56],"study,":[57],"we":[58],"propose":[59],"a":[60,67,96,101,128],"USM":[61,99],"fine-tuning":[62],"approach":[63],"for":[64,194],"ASR,":[65],"low-bit":[68],"quantization":[69,123,143],"N:M":[71],"structured":[72],"sparsity":[73],"aware":[74],"paradigm":[75],"on":[76,100,179,184],"weights,":[79],"reducing":[80],"complexity":[83],"from":[84],"parameter":[85,98],"precision":[86],"matrix":[88],"topology":[89],"perspectives.":[90],"We":[91,174],"conducted":[92],"extensive":[93],"experiments":[94],"2-billion":[97],"voice":[103],"search":[104],"dataset":[105],"evaluate":[107],"our":[108,149],"proposed":[109,150],"method.":[110],"A":[111],"series":[112],"ablation":[114],"studies":[115],"validate":[116],"effectiveness":[118],"up":[120],"int4":[122],"2:4":[125],"sparsity.":[126,146],"single":[129],"technique":[131],"fails":[132],"recover":[134],"performance":[136],"well":[137],"extreme":[139],"setups":[140],"including":[141],"int2":[142],"1:4":[145],"By":[147],"contrast,":[148],"method":[151],"can":[152],"compress":[153],"9.4%":[158],"size,":[161],"at":[162],"cost":[164],"only":[166],"7.3%":[167],"relative":[168],"word":[169],"error":[170],"rate":[171],"(WER)":[172],"regressions.":[173],"also":[175],"provided":[176],"in-depth":[177],"analyses":[178],"results":[181],"discussions":[183],"limitations":[186],"potential":[188],"solutions,":[189],"which":[190],"would":[191],"be":[192],"valuable":[193],"future":[195],"studies.":[196]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4389814425","counts_by_year":[],"updated_date":"2025-01-02T04:03:00.190302","created_date":"2023-12-16"}