{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:06:08Z","timestamp":1730297168547,"version":"3.28.0"},"reference-count":71,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,1,9]]},"DOI":"10.1109\/slt54892.2023.10022774","type":"proceedings-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T18:54:03Z","timestamp":1674845643000},"page":"52-59","source":"Crossref","is-referenced-by-count":9,"title":["JOIST: A Joint Speech and Text Streaming Model for ASR"],"prefix":"10.1109","author":[{"given":"Tara N.","family":"Sainath","sequence":"first","affiliation":[{"name":"Google, Inc."}]},{"given":"Rohit","family":"Prabhavalkar","sequence":"additional","affiliation":[{"name":"Google, Inc."}]},{"given":"Ankur","family":"Bapna","sequence":"additional","affiliation":[{"name":"Google, Inc."}]},{"given":"Yu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Google, Inc."}]},{"given":"Zhouyuan","family":"Huo","sequence":"additional","affiliation":[{"name":"Google, Inc."}]},{"given":"Zhehuai","family":"Chen","sequence":"additional","affiliation":[{"name":"Google, Inc."}]},{"given":"Bo","family":"Li","sequence":"additional","affiliation":[{"name":"Google, Inc."}]},{"given":"Weiran","family":"Wang","sequence":"additional","affiliation":[{"name":"Google, Inc."}]},{"given":"Trevor","family":"Strohman","sequence":"additional","affiliation":[{"name":"Google, Inc."}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2846"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682336"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462105"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/asru46091.2019.9003906"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1855"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-206"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-343"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.2018.8462682"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688018"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683307"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268950"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746554"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053510"},{"key":"ref15","article-title":"SLAM: A Unified Encoder for Speech and Language Modeling via Speech-Text Joint Pre-Training","author":"Bapna","year":"2021","journal-title":"arXiv preprint"},{"key":"ref16","article-title":"mSLAM: Massively Multilingual Joint Pre-Training for Speech and Text","volume":"abs\/2202.01374","author":"Bapna","year":"2022","journal-title":"CoRR"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.105"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747555"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10937"},{"article-title":"SPLAT: Speech-Language Joint Pre-Training for Spoken Language Under-standing","volume-title":"Proc. of NAACL-HLT","author":"Chung","key":"ref20"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.393"},{"article-title":"Cascaded en-coders for unifying streaming and non-streaming ASR","volume-title":"Proc. ICASSP","author":"Narayanan","key":"ref22"},{"key":"ref23","doi-asserted-by":"crossref","DOI":"10.1109\/ICASSP43922.2022.9747879","article-title":"Improving the Latency and Quality of Cascaded Encoder","volume-title":"Proc. ICASSP","author":"Sainath"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/icassp43922.2022.9746038"},{"article-title":"Phone Recognition using Re-stricted Boltzmann Machines","volume-title":"Proc. ICASSP","author":"Mohamed","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2011.6163900"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2011-169"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-80"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461809"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-233"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2277"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003913"},{"key":"ref33","article-title":"Representation Learning with Contrastive Predictive Coding","author":"Van Den Oord","year":"2018","journal-title":"arXiv preprint"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054438"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054458"},{"article-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","volume-title":"Proc. NAACL-HLT","author":"Devlin","key":"ref37"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"article-title":"vq-wav2vec: Self-Supervised Learning of Discrete Speech Representations","volume-title":"Proc. ICLR","author":"Baevski","key":"ref39"},{"article-title":"Self-Supervised Learning with Random-Projection Quantizer for Speech Recognition","volume-title":"Proc. ICML","author":"Chiu","key":"ref40"},{"article-title":"wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations","volume-title":"Proc. Neurips","author":"Baevski","key":"ref41"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414227"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.2015.7178964"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746801"},{"key":"ref46","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2020.acl-main.703","article-title":"BART: Denoising Sequence-to-Sequence Pre-Training for Natural Language Generation, Translation, and Comprehension","volume-title":"arXiv preprint","author":"Lewis"},{"key":"ref47","doi-asserted-by":"crossref","DOI":"10.1109\/ICASSP.2013.6638947","article-title":"Speech Recognition with Deep Neural Networks","volume-title":"Proc. ICASSP","author":"Graves"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053600"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2012.6289079"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-275"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.1998-603"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1557"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1221"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413899"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683109"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746475"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2013.6707758"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11034"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1510"},{"article-title":"Feature learning in deep neural networks-studies on speech recognition tasks","volume-title":"Proc. ICLR","author":"Yu","key":"ref60"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/slt.2012.6424210"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2011-249"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2014-566"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1465"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-264"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00300"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-212"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413803"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682441"}],"event":{"name":"2022 IEEE Spoken Language Technology Workshop (SLT)","start":{"date-parts":[[2023,1,9]]},"location":"Doha, Qatar","end":{"date-parts":[[2023,1,12]]}},"container-title":["2022 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10022052\/10022330\/10022774.pdf?arnumber=10022774","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,13]],"date-time":"2024-04-13T04:52:32Z","timestamp":1712983952000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10022774\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,9]]},"references-count":71,"URL":"https:\/\/doi.org\/10.1109\/slt54892.2023.10022774","relation":{},"subject":[],"published":{"date-parts":[[2023,1,9]]}}}