{"id":"https://openalex.org/W4320471493","doi":"https://doi.org/10.48550/arxiv.2209.06058","title":"Streaming End-to-End Multilingual Speech Recognition with Joint Language Identification","display_name":"Streaming End-to-End Multilingual Speech Recognition with Joint Language Identification","publication_year":2022,"publication_date":"2022-01-01","ids":{"openalex":"https://openalex.org/W4320471493","doi":"https://doi.org/10.48550/arxiv.2209.06058"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2209.06058","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2209.06058","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100460193","display_name":"Chao Zhang","orcid":"https://orcid.org/0000-0002-6519-3371"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Chao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100374448","display_name":"Bo Li","orcid":"https://orcid.org/0000-0002-6711-3603"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Bo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070513394","display_name":"Tara N. Sainath","orcid":"https://orcid.org/0000-0002-4126-6556"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sainath, Tara","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032931723","display_name":"Trevor Strohman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Strohman, Trevor","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060420622","display_name":"Sepand Mavandadi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mavandadi, Sepand","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001306222","display_name":"Shuo-Yiin Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chang, Shuo-yiin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5037705635","display_name":"Parisa Haghani","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Haghani, Parisa","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":60},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9993,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9993,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9989,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9944,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.5839431},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.52782565},{"id":"https://openalex.org/keywords/language-identification","display_name":"Language identification","score":0.48207957},{"id":"https://openalex.org/keywords/identification","display_name":"Identification","score":0.4253474}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.85406744},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.68547285},{"id":"https://openalex.org/C147168706","wikidata":"https://www.wikidata.org/wiki/Q1457734","display_name":"Recurrent neural network","level":3,"score":0.6140037},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.6099238},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.60633004},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5874539},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.58411247},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.5839431},{"id":"https://openalex.org/C154504017","wikidata":"https://www.wikidata.org/wiki/Q853614","display_name":"Identifier","level":2,"score":0.5344673},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.52782565},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.51718986},{"id":"https://openalex.org/C55166926","wikidata":"https://www.wikidata.org/wiki/Q2892946","display_name":"Oracle","level":2,"score":0.50726146},{"id":"https://openalex.org/C129792486","wikidata":"https://www.wikidata.org/wiki/Q1050419","display_name":"Language identification","level":3,"score":0.48207957},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.44683155},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.44354957},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.4253474},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36672705},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3014645},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.19857424},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.13205382},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.11574188},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.082755715},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C59822182","wikidata":"https://www.wikidata.org/wiki/Q441","display_name":"Botany","level":1,"score":0.0},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2209.06058","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2209.06058","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2209.06058","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W851785710","https://openalex.org/W2949174760","https://openalex.org/W2916997151","https://openalex.org/W2624072012","https://openalex.org/W2262900283","https://openalex.org/W2098508228","https://openalex.org/W2064542902","https://openalex.org/W1998541766","https://openalex.org/W1985349217","https://openalex.org/W1508853749"],"abstract_inverted_index":{"Language":[0],"identification":[1],"is":[2,14],"critical":[3],"for":[4],"many":[5],"downstream":[6],"tasks":[7],"in":[8,86,146],"automatic":[9],"speech":[10],"recognition":[11],"(ASR),":[12],"and":[13,69,89,134],"beneficial":[15],"to":[16,31],"integrate":[17],"into":[18],"multilingual":[19],"end-to-end":[20],"ASR":[21,59],"as":[22,139],"an":[23,127],"additional":[24],"task.":[25],"In":[26],"this":[27],"paper,":[28],"we":[29],"propose":[30],"modify":[32],"the":[33,36,87,96,123,135,147],"structure":[34],"of":[35,93,129],"cascaded-encoder-based":[37],"recurrent":[38],"neural":[39],"network":[40],"transducer":[41],"(RNN-T)":[42],"model":[43],"by":[44,142],"integrating":[45],"a":[46,90,113],"per-frame":[47],"language":[48,119],"identifier":[49],"(LID)":[50],"predictor.":[51],"RNN-T":[52],"with":[53,60,66,79,105,117],"cascaded":[54],"encoders":[55],"can":[56,99],"achieve":[57,70,100],"streaming":[58,91,102],"low":[61],"latency":[62],"using":[63,76],"first-pass":[64],"decoding":[65,78],"no":[67],"right-context,":[68],"lower":[71],"word":[72],"error":[73],"rates":[74],"(WERs)":[75],"second-pass":[77,137],"longer":[80],"right-context.":[81],"By":[82],"leveraging":[83],"such":[84],"differences":[85],"right-contexts":[88],"implementation":[92],"statistics":[94],"pooling,":[95],"proposed":[97,124],"method":[98,125],"accurate":[101],"LID":[103,131,145],"prediction":[104,132],"little":[106],"extra":[107],"test-time":[108],"cost.":[109],"Experimental":[110],"results":[111],"on":[112],"voice":[114],"search":[115],"dataset":[116],"9":[118],"locales":[120],"shows":[121],"that":[122,140],"achieves":[126],"average":[128],"96.2%":[130],"accuracy":[133],"same":[136],"WER":[138],"obtained":[141],"including":[143],"oracle":[144],"input.":[148]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4320471493","counts_by_year":[],"updated_date":"2025-01-19T02:12:11.472407","created_date":"2023-02-14"}