{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T10:55:17Z","timestamp":1730199317150,"version":"3.28.0"},"reference-count":36,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,12,13]]},"DOI":"10.1109\/asru51503.2021.9687895","type":"proceedings-article","created":{"date-parts":[[2022,2,3]],"date-time":"2022-02-03T20:31:00Z","timestamp":1643920260000},"page":"503-510","source":"Crossref","is-referenced-by-count":31,"title":["Context-Aware Transformer Transducer for Speech Recognition"],"prefix":"10.1109","author":[{"given":"Feng-Ju","family":"Chang","sequence":"first","affiliation":[{"name":"Amazon Alexa"}]},{"given":"Jing","family":"Liu","sequence":"additional","affiliation":[{"name":"Amazon Alexa"}]},{"given":"Martin","family":"Radfar","sequence":"additional","affiliation":[{"name":"Amazon Alexa"}]},{"given":"Athanasios","family":"Mouchtaris","sequence":"additional","affiliation":[{"name":"Amazon Alexa"}]},{"given":"Maurizio","family":"Omologo","sequence":"additional","affiliation":[{"name":"Amazon Alexa"}]},{"given":"Ariya","family":"Rastrow","sequence":"additional","affiliation":[{"name":"Amazon Alexa"}]},{"given":"Siegfried","family":"Kunzmann","sequence":"additional","affiliation":[{"name":"Amazon Alexa"}]}],"member":"263","reference":[{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/78.650093"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053896"},{"key":"ref30","article-title":"Transformer-transducer: End-to-end speech recognition with self-attention","author":"yeh","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref36","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014","journal-title":"ArXiv Preprint"},{"key":"ref35","article-title":"Well-read students learn better: On the importance of pre-training compact models","author":"turc","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref34","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref10","article-title":"Joint grapheme and phoneme embeddings for contextual end-to-end asr","author":"zhehuai","year":"0","journal-title":"InterSpeech"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682441"},{"key":"ref12","doi-asserted-by":"crossref","first-page":"69","DOI":"10.1006\/csla.2001.0184","article-title":"Weighted finite-state transducers in speech recognition","volume":"16","author":"mehryar","year":"2002","journal-title":"Comput Speech Lang"},{"key":"ref13","article-title":"Contextual speech recognition in end-to-end neural network systems using beam search","author":"ian","year":"0","journal-title":"InterSpeech"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414800"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2986"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1344"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413962"},{"key":"ref18","article-title":"Shallow-fusion end-to-end contextual biasing","author":"ding","year":"0","journal-title":"InterSpeech"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-177"},{"key":"ref28","article-title":"Attention is all you need","author":"ashish","year":"0","journal-title":"NeurIPS"},{"key":"ref4","article-title":"Speech-transformer: A no-recurrence sequence-to-sequence model for speech recognition","author":"linhao","year":"0","journal-title":"ICASSP"},{"key":"ref27","article-title":"Joint grapheme and phoneme embeddings for contextual end-to-end asr","author":"zhehuai","year":"0","journal-title":"InterSpeech"},{"key":"ref3","article-title":"Sequence transduction with recurrent neural networks","author":"graves","year":"2012","journal-title":"ArXiv Preprint"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1162"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2203"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462105"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682336"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1007"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref9","article-title":"Deep context: end-to-end contextual speech recognition","author":"golan","year":"0","journal-title":"SLT"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref20","article-title":"On using monolingual corpora in neural machine translation","author":"gulcehre","year":"2015","journal-title":"ArXiv Preprint"},{"key":"ref22","article-title":"End-to-end contextual speech recognition using class language models and a token passing decoder","author":"zhehuai","year":"0","journal-title":"ICASSP"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1787"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462682"},{"key":"ref23","article-title":"A comparison of techniques for language model integration in encoder-decoder speech recognition","author":"shubham","year":"0","journal-title":"SLT"},{"key":"ref26","article-title":"Attention-based models for speech recognition","author":"chorowski","year":"2015","journal-title":"ArXiv Preprint"},{"key":"ref25","article-title":"Cold fusion: Training seq2seq models together with language models","author":"sriram","year":"2017","journal-title":"ArXiv Preprint"}],"event":{"name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","start":{"date-parts":[[2021,12,13]]},"location":"Cartagena, Colombia","end":{"date-parts":[[2021,12,17]]}},"container-title":["2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9687821\/9687855\/09687895.pdf?arnumber=9687895","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,16]],"date-time":"2022-05-16T20:41:17Z","timestamp":1652733677000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9687895\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,13]]},"references-count":36,"URL":"https:\/\/doi.org\/10.1109\/asru51503.2021.9687895","relation":{},"subject":[],"published":{"date-parts":[[2021,12,13]]}}}