{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T00:22:08Z","timestamp":1729642928938,"version":"3.28.0"},"reference-count":51,"publisher":"IEEE","license":[{"start":{"date-parts":[[2019,12,1]],"date-time":"2019-12-01T00:00:00Z","timestamp":1575158400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2019,12,1]],"date-time":"2019-12-01T00:00:00Z","timestamp":1575158400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2019,12,1]],"date-time":"2019-12-01T00:00:00Z","timestamp":1575158400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019,12]]},"DOI":"10.1109\/asru46091.2019.9003750","type":"proceedings-article","created":{"date-parts":[[2020,2,21]],"date-time":"2020-02-21T07:01:33Z","timestamp":1582268493000},"page":"449-456","source":"Crossref","is-referenced-by-count":376,"title":["A Comparative Study on Transformer vs RNN in Speech Applications"],"prefix":"10.1109","author":[{"given":"Shigeki","family":"Karita","sequence":"first","affiliation":[]},{"given":"Nanxin","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Tomoki","family":"Hayashi","sequence":"additional","affiliation":[]},{"given":"Takaaki","family":"Hori","sequence":"additional","affiliation":[]},{"given":"Hirofumi","family":"Inaguma","sequence":"additional","affiliation":[]},{"given":"Ziyan","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Masao","family":"Someki","sequence":"additional","affiliation":[]},{"given":"Nelson Enrique Yalta","family":"Soplin","sequence":"additional","affiliation":[]},{"given":"Ryuichi","family":"Yamamoto","sequence":"additional","affiliation":[]},{"given":"Xiaofei","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[]},{"given":"Takenori","family":"Yoshimura","sequence":"additional","affiliation":[]},{"given":"Wangyou","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"journal-title":"VoxForge","year":"0","key":"ref39"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-99579-3_21"},{"key":"ref33","doi-asserted-by":"crossref","first-page":"724","DOI":"10.1007\/11939993_73","article-title":"HKUST\/MTS: A very large scale mandarin telephone speech corpus","author":"liu","year":"2006","journal-title":"Chinese Spoken Language Processing"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1768"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2016.10.005"},{"key":"ref30","article-title":"Sponta-neous speech corpus of Japanese","author":"maekawa","year":"2000","journal-title":"Proc Second International Conference on Language Resources and Evaluation (LREC)"},{"key":"ref37","article-title":"TED-LIUM: An automatic speech recognition dedicated corpus","author":"rousseau","year":"2012","journal-title":"Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12)"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1992.225858"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-64680-0_15"},{"key":"ref34","first-page":"5206","article-title":"Lib-riSpeech: An ASR corpus based on public domain audio books","author":"panayotov","year":"2015","journal-title":"ICASSP"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"ref27","article-title":"Audio augmentation for speech recognition","author":"ko","year":"2015","journal-title":"Sixteenth Annual Conference of the International Speech Communication Association"},{"key":"ref29","article-title":"Aurora working group: Dsr front end lvcsr evaluation au\/384\/02","author":"pearce","year":"2002","journal-title":"Inst Signal Inform Process Mississippi State Univ"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"ref1","first-page":"5998","article-title":"Attention is all you need","volume":"30","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854049"},{"key":"ref22","doi-asserted-by":"crossref","first-page":"1045","DOI":"10.21437\/Interspeech.2010-343","article-title":"Recurrent neural network based language model","author":"mikolov","year":"2010","journal-title":"Proc INTERSPEECH"},{"key":"ref21","first-page":"949","article-title":"Advances in joint CTC-attention based end-to-end speech recognition with a deep CNN encoder and RNN-LM","author":"hori","year":"2017","journal-title":"Proc Inter-speech"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461829"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-503"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref25","article-title":"JSUT corpus: Free large-scale japanese speech corpus for end-to-end speech synthesis","volume":"abs 1711 354","author":"sonobe","year":"2017","journal-title":"CoRR"},{"key":"ref50","article-title":"FastSpeech: Fast, robust and controllable text to speech","author":"ren","year":"2019","journal-title":"ArXiv e-prints"},{"key":"ref51","first-page":"4006","article-title":"Tacotron: Towards End-to-End Speech Synthesis","author":"wang","year":"2017","journal-title":"Proc Inter-speech"},{"key":"ref10","first-page":"641","article-title":"A comparison of transformer and recurrent neural networks on multilingual neural machine translation","author":"lakew","year":"2018","journal-title":"Proceedings of the 27th International Conference on Computational Linguistics"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1107"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.3115\/1075527.1075614"},{"key":"ref12","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","volume":"27","author":"sutskever","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref13","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2015","journal-title":"International Conference on Learning Representations"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref17","doi-asserted-by":"crossref","first-page":"389","DOI":"10.1109\/SLT.2018.8639693","article-title":"End-to-end speech recognition with word-based rnn language models","author":"hori","year":"2018","journal-title":"2018 IEEE Spoken Language Technology Workshop SLT 2018"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1166"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-2012"},{"key":"ref4","article-title":"Neu-ral speech synthesis with transformer network","author":"li","year":"2019","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/IberSPEECH.2018-13"},{"key":"ref6","first-page":"193","article-title":"Tensor2Tensor for neural machine translation","author":"vaswani","year":"2018","journal-title":"Proceedings of the 13th Conference of the Association for Machine Translation in the Americas"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-6301"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref7","article-title":"The Kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"2011 IEEE Workshop on Automatic Speech Recognition & Understanding"},{"journal-title":"The LJ speech dataset","year":"2017","author":"ito","key":"ref49"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.2478\/pralin-2018-0002"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268945"},{"journal-title":"RWTH ASR systems for Lib-riSpeech hybrid vs attention-w\/o data augmentation","year":"0","author":"luscher","key":"ref45"},{"journal-title":"The M-AILABS speech dataset","year":"2019","author":"solak","key":"ref48"},{"key":"ref47","article-title":"Improved speech-to-text translation with the Fisher and Callhome Spanish-English speech translation corpus","author":"post","year":"2013","journal-title":"Proceedings of the International Workshop on Spoken Language Translation (IWSLT)"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1616"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1938"},{"journal-title":"Language modeling with deep transformers","year":"2019","author":"irie","key":"ref44"},{"key":"ref43","article-title":"Adadelta: An adaptive learning rate method","volume":"abs 1212 5701","author":"zeiler","year":"2012","journal-title":"CoRR"}],"event":{"name":"2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","start":{"date-parts":[[2019,12,14]]},"location":"SG, Singapore","end":{"date-parts":[[2019,12,18]]}},"container-title":["2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8985378\/9003727\/09003750.pdf?arnumber=9003750","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,27]],"date-time":"2023-09-27T08:49:31Z","timestamp":1695804571000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9003750\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,12]]},"references-count":51,"URL":"https:\/\/doi.org\/10.1109\/asru46091.2019.9003750","relation":{},"subject":[],"published":{"date-parts":[[2019,12]]}}}