{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T13:27:54Z","timestamp":1743341274574},"reference-count":62,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2017,5,1]],"date-time":"2017-05-01T00:00:00Z","timestamp":1493596800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Speech Communication"],"published-print":{"date-parts":[[2017,5]]},"DOI":"10.1016\/j.specom.2017.02.009","type":"journal-article","created":{"date-parts":[[2017,3,11]],"date-time":"2017-03-11T12:00:25Z","timestamp":1489233625000},"page":"70-83","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":64,"special_numbering":"C","title":["Error detection and accuracy estimation in automatic speech recognition using deep bidirectional recurrent neural networks"],"prefix":"10.1016","volume":"89","author":[{"given":"Atsunori","family":"Ogawa","sequence":"first","affiliation":[]},{"given":"Takaaki","family":"Hori","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.specom.2017.02.009_bib0001","series-title":"Proc. Interspeech","first-page":"725","article-title":"Open vocabulary speech recognition with flat hybrid models","author":"Bisani","year":"2005"},{"key":"10.1016\/j.specom.2017.02.009_bib0002","series-title":"Proc. ICASSP","first-page":"4856","article-title":"Social signal classification using deep BLSTM recurrent neural networks","author":"Brueckner","year":"2014"},{"key":"10.1016\/j.specom.2017.02.009_bib0003","series-title":"Proc. ICASSP","first-page":"4081","article-title":"Combination of strongly and weakly constrained recognizers for reliable detection of OOVs","author":"Burget","year":"2008"},{"key":"10.1016\/j.specom.2017.02.009_bib0004","series-title":"Proc. ICASSP","first-page":"4481","article-title":"Discriminative training of hierarchical acoustic models for large vocabulary continuous speech recognition","author":"Chang","year":"2009"},{"key":"10.1016\/j.specom.2017.02.009_bib0005","series-title":"Proc. ICASSP","first-page":"1655","article-title":"Large vocabulary decoding and confidence estimation using word posterior probabilities","author":"Evermann","year":"2000"},{"key":"10.1016\/j.specom.2017.02.009_bib0006","series-title":"Proc. Interspeech","first-page":"1942","article-title":"CRF-based combination of contextual features to improve a posteriori word-level confidence measures","author":"Fayolle","year":"2010"},{"key":"10.1016\/j.specom.2017.02.009_bib0007","first-page":"115","article-title":"Learning precise timing with LSTM recurrent networks","volume":"3","author":"Gers","year":"2002","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.specom.2017.02.009_bib0008","series-title":"Proc. Interspeech","first-page":"2553","article-title":"Recent progress in the MIT spoken lecture processing project","author":"Glass","year":"2007"},{"issue":"2","key":"10.1016\/j.specom.2017.02.009_bib0009","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1006\/csla.2000.0138","article-title":"Minimum Bayes-risk automatic speech recognition","volume":"14","author":"Goel","year":"2000","journal-title":"Comput. Speech Lang."},{"key":"10.1016\/j.specom.2017.02.009_bib0010","series-title":"Supervised Sequence Labelling with Recurrent Neural Networks","author":"Graves","year":"2012"},{"key":"10.1016\/j.specom.2017.02.009_bib0011","unstructured":"Graves, A., 2013. RNNLIB. http:\/\/sourceforge.net\/projects\/rnnl\/."},{"issue":"9","key":"10.1016\/j.specom.2017.02.009_bib0012","doi-asserted-by":"publisher","first-page":"1263","DOI":"10.1109\/TKDE.2008.239","article-title":"Learning from imbalanced data","volume":"21","author":"He","year":"2009","journal-title":"IEEE Trans. Knowl. Data Eng."},{"issue":"6","key":"10.1016\/j.specom.2017.02.009_bib0013","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","article-title":"Deep neural networks for acoustic modeling in speech recognition: the shared views of four research groups","volume":"29","author":"Hinton","year":"2012","journal-title":"IEEE Signal Process. Mag."},{"key":"10.1016\/j.specom.2017.02.009_bib0014","series-title":"A Field Guide to Dynamical Recurrent Networks","first-page":"237","article-title":"Gradient flow in recurrent nets: the difficulty of learning long-term dependencies","author":"Hochreiter","year":"2001"},{"issue":"8","key":"10.1016\/j.specom.2017.02.009_bib0015","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","article-title":"Long short-term memory","volume":"9","author":"Hochreiter","year":"1997","journal-title":"Neural Comput."},{"key":"10.1016\/j.specom.2017.02.009_bib0016","series-title":"Proc. ICASSP","first-page":"IV","article-title":"Open-vocabulary spoken utterance retrieval using confusion networks","author":"Hori","year":"2007"},{"issue":"4","key":"10.1016\/j.specom.2017.02.009_bib0017","doi-asserted-by":"publisher","first-page":"1352","DOI":"10.1109\/TASL.2006.889790","article-title":"Efficient WFST-based one-pass decoding with on-the-fly hypothesis rescoring in extremely large vocabulary continuous speech recognition","volume":"15","author":"Hori","year":"2007","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2017.02.009_bib0018","series-title":"Proc. ICASSP","first-page":"6364","article-title":"Real-time one-pass decoding with recurrent neural network language model for speech recognition","author":"Hori","year":"2014"},{"issue":"4","key":"10.1016\/j.specom.2017.02.009_bib0019","doi-asserted-by":"publisher","first-page":"455","DOI":"10.1016\/j.specom.2004.12.004","article-title":"Confidence measures for speech recognition: a survey","volume":"45","author":"Jiang","year":"2005","journal-title":"Speech Commun."},{"key":"10.1016\/j.specom.2017.02.009_bib0020","series-title":"Proc. ICASSP","first-page":"4999","article-title":"Estimating confidence scores on ASR results using recurrent neural networks","author":"Kalgaonkar","year":"2015"},{"key":"10.1016\/j.specom.2017.02.009_bib0021","series-title":"Proc. Workshop on Spontaneous Speech Processing and Recognition (SSPR)","first-page":"135","article-title":"Benchmark test for speech recognition using the corpus of spontaneous Japanese","author":"Kawahara","year":"2003"},{"key":"10.1016\/j.specom.2017.02.009_bib0022","series-title":"Proc. ICASSP","first-page":"530","article-title":"Modelling unknown words in spontaneous speech","author":"Kemp","year":"1996"},{"key":"10.1016\/j.specom.2017.02.009_bib0023","series-title":"Proc. Eurospeech","first-page":"827","article-title":"Estimating confidence using word lattices","author":"Kemp","year":"1997"},{"key":"10.1016\/j.specom.2017.02.009_bib0024","series-title":"Proc. Interspeech","first-page":"80","article-title":"Posterior-based out of vocabulary word detection in telephone speech","author":"Kombrink","year":"2009"},{"key":"10.1016\/j.specom.2017.02.009_bib0025","unstructured":"Kudo, T., 2013. CRF++ 0.58. https:\/\/taku910.github.io\/crfpp\/."},{"key":"10.1016\/j.specom.2017.02.009_bib0026","series-title":"Proc. International Conference on Machine Learning (ICML)","first-page":"282","article-title":"Conditional random fields: probabilistic models for segmenting and labeling sequence data","author":"Lafferty","year":"2001"},{"key":"10.1016\/j.specom.2017.02.009_bib0027","series-title":"Proc. Workshop on Spontaneous Speech Processing and Recognition (SSPR)","first-page":"7","article-title":"Corpus of spontaneous Japanese: its design and evaluation","author":"Maekawa","year":"2003"},{"key":"10.1016\/j.specom.2017.02.009_bib0028","series-title":"Proc. Eurospeech","first-page":"495","article-title":"Finding consensus among words: lattice-based word error minimization","author":"Mangu","year":"1999"},{"issue":"4","key":"10.1016\/j.specom.2017.02.009_bib0029","doi-asserted-by":"publisher","first-page":"373","DOI":"10.1006\/csla.2000.0152","article-title":"Finding consensus in speech recognition: word error minimization and other applications of confusion networks","volume":"14","author":"Mangu","year":"2000","journal-title":"Comput. Speech Lang."},{"key":"10.1016\/j.specom.2017.02.009_sbref0028","series-title":"Proc. IEEE Workshop on Spoken Language Technology (SLT)","article-title":"Using syntactic and confusion network structure for out-of-vocabulary word detection","author":"Marin","year":"2012"},{"issue":"1","key":"10.1016\/j.specom.2017.02.009_bib0031","first-page":"22","article-title":"VoiceRex - Spontaneous speech recognition technology for contact-center conversations","volume":"5","author":"Masataki","year":"2007","journal-title":"NTT Tech. Rev."},{"key":"10.1016\/j.specom.2017.02.009_bib0032","series-title":"Proc. ICASSP","first-page":"4894","article-title":"Discriminative training based on an integrated view of MPE and MMI in margin and error space","author":"McDermott","year":"2010"},{"key":"10.1016\/j.specom.2017.02.009_bib0033","series-title":"Proc. Interspeech","first-page":"3771","article-title":"Investigation of recurrent neural network architectures and learning methods for spoken language understanding","author":"Mesnil","year":"2013"},{"key":"10.1016\/j.specom.2017.02.009_bib0034","series-title":"Proc. Interspeech","first-page":"1045","article-title":"Recurrent neural network based language model","author":"Mikolov","year":"2010"},{"key":"10.1016\/j.specom.2017.02.009_bib0035","series-title":"Proc. ICASSP","first-page":"5528","article-title":"Extensions of recurrent neural network language model","author":"Mikolov","year":"2011"},{"issue":"10","key":"10.1016\/j.specom.2017.02.009_bib0036","first-page":"849","article-title":"Statistical methods for comparing pattern recognition algorithms and comments on evaluating speech recognition performance","volume":"50","author":"Nakagawa","year":"1994","journal-title":"J. Acoust. Soc. Japan"},{"key":"10.1016\/j.specom.2017.02.009_bib0037","unstructured":"NIST, 2009. Speech Recognition Scoring Toolkit (SCTK) version 2.4.0. http:\/\/www.itl.nist.gov\/iad\/mig\/tools\/."},{"key":"10.1016\/j.specom.2017.02.009_bib0038","series-title":"Proc. Interspeech","first-page":"1223","article-title":"Unsupervised discriminative language modeling using error rate estimator","author":"Oba","year":"2013"},{"key":"10.1016\/j.specom.2017.02.009_bib0039","series-title":"Proc. ICASSP","first-page":"4370","article-title":"ASR error detection and recognition rate estimation using deep bidirectional recurrent neural networks","author":"Ogawa","year":"2015"},{"key":"10.1016\/j.specom.2017.02.009_bib0040","series-title":"Proc. ICASSP","first-page":"4925","article-title":"Error type classification and word accuracy estimation using alignment features from word confusion network","author":"Ogawa","year":"2012"},{"key":"10.1016\/j.specom.2017.02.009_bib0041","series-title":"Proc. IEEE Workshop on Spoken Language Technology (SLT)","first-page":"113","article-title":"Recognition rate estimation based on word alignment network and discriminative error type classification","author":"Ogawa","year":"2012"},{"key":"10.1016\/j.specom.2017.02.009_bib0042","series-title":"Proc. ICASSP","first-page":"6832","article-title":"Discriminative recognition rate estimation for n-best list and its application to n-best rescoring","author":"Ogawa","year":"2013"},{"issue":"12","key":"10.1016\/j.specom.2017.02.009_bib0043","doi-asserted-by":"publisher","first-page":"2400","DOI":"10.1109\/TASLP.2016.2603599","article-title":"Estimating speech recognition accuracy based on error type classification","volume":"24","author":"Ogawa","year":"2016","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2017.02.009_bib0044","series-title":"Proc. HLT-NAACL","first-page":"216","article-title":"Contextual information improves OOV detection in speech","author":"Parada","year":"2010"},{"key":"10.1016\/j.specom.2017.02.009_bib0045","series-title":"Proc. Interspeech","first-page":"338","article-title":"Long short-term memory recurrent neural network architectures for large scale acoustic modeling","author":"Sak","year":"2014"},{"key":"10.1016\/j.specom.2017.02.009_bib0046","series-title":"Proc. Interspeech","first-page":"2581","article-title":"Detection of OOV words using generalized word models and a semantic class language model","author":"Schaaf","year":"2001"},{"key":"10.1016\/j.specom.2017.02.009_bib0047","series-title":"Proc. ICASSP","first-page":"875","article-title":"Confidence measures for spontaneous speech recognition","author":"Schaaf","year":"1997"},{"issue":"5","key":"10.1016\/j.specom.2017.02.009_bib0048","doi-asserted-by":"publisher","first-page":"1103","DOI":"10.1109\/TASL.2010.2091635","article-title":"On the relationship between Bayes risk and word error rate in ASR","volume":"19","author":"Schl\u00fcter","year":"2011","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"11","key":"10.1016\/j.specom.2017.02.009_bib0049","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","article-title":"Bidirectional recurrent neural networks","volume":"45","author":"Schuster","year":"1997","journal-title":"IEEE Trans. Signal Process."},{"issue":"3","key":"10.1016\/j.specom.2017.02.009_bib0050","doi-asserted-by":"publisher","first-page":"492","DOI":"10.1016\/j.csl.2006.09.003","article-title":"Continuous space language model","volume":"21","author":"Schwenk","year":"2007","journal-title":"Comput. Speech Lang."},{"key":"10.1016\/j.specom.2017.02.009_bib0051","series-title":"Proc. Interspeech","first-page":"905","article-title":"Combining information sources for confidence estimation with CRF models","author":"Seigel","year":"2011"},{"key":"10.1016\/j.specom.2017.02.009_bib0052","series-title":"Proc. ICASSP","first-page":"2321","article-title":"Detecting deletions in ASR output","author":"Seigel","year":"2014"},{"key":"10.1016\/j.specom.2017.02.009_bib0053","series-title":"Proc. Interspeech","article-title":"LSTM neural networks for language modeling","author":"Sundermeyer","year":"2012"},{"key":"10.1016\/j.specom.2017.02.009_bib0054","series-title":"Proc. ICASSP","first-page":"2331","article-title":"ASR error detection using recurrent neural network language model and complementary ASR","author":"Tam","year":"2014"},{"key":"10.1016\/j.specom.2017.02.009_bib0055","series-title":"Proc. Interspeech","first-page":"130","article-title":"Is it time to switch to word embedding and recurrent neural networks for spoken language understanding?","author":"Vukotic","year":"2015"},{"issue":"3","key":"10.1016\/j.specom.2017.02.009_bib0056","doi-asserted-by":"publisher","first-page":"855","DOI":"10.1109\/TSA.2005.857791","article-title":"Automatic determination of acoustic model topology using variational Bayesian estimation and clustering for large vocabulary continuous speech recognition","volume":"14","author":"Watanabe","year":"2006","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2017.02.009_bib0057","series-title":"Proc. ICASSP","first-page":"887","article-title":"Neural-network based measures of confidence for word recognition","author":"Weintraub","year":"1997"},{"key":"10.1016\/j.specom.2017.02.009_bib0058","series-title":"Proc. ICASSP","first-page":"225","article-title":"Using word probabilities as confidence measures","author":"Wessel","year":"1998"},{"issue":"3","key":"10.1016\/j.specom.2017.02.009_bib0059","doi-asserted-by":"publisher","first-page":"288","DOI":"10.1109\/89.906002","article-title":"Confidence measures for large vocabulary continuous speech recognition","volume":"9","author":"Wessel","year":"2001","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2017.02.009_bib0060","series-title":"Proc. ICASSP","first-page":"4105","article-title":"Recurrent conditional random field for language understanding","author":"Yao","year":"2014"},{"key":"10.1016\/j.specom.2017.02.009_bib0061","series-title":"Proc. Interspeech","first-page":"2524","article-title":"Recurrent neural networks for language understanding","author":"Yao","year":"2013"},{"issue":"8","key":"10.1016\/j.specom.2017.02.009_bib0062","doi-asserted-by":"publisher","first-page":"2461","DOI":"10.1109\/TASL.2011.2141988","article-title":"Calibration of confidence measures in speech recognition","volume":"19","author":"Yu","year":"2011","journal-title":"IEEE Trans. Audio Speech Lang. Process."}],"container-title":["Speech Communication"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639316301042?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639316301042?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2018,9,3]],"date-time":"2018-09-03T12:22:13Z","timestamp":1535977333000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167639316301042"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,5]]},"references-count":62,"alternative-id":["S0167639316301042"],"URL":"https:\/\/doi.org\/10.1016\/j.specom.2017.02.009","relation":{},"ISSN":["0167-6393"],"issn-type":[{"value":"0167-6393","type":"print"}],"subject":[],"published":{"date-parts":[[2017,5]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Error detection and accuracy estimation in automatic speech recognition using deep bidirectional recurrent neural networks","name":"articletitle","label":"Article Title"},{"value":"Speech Communication","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.specom.2017.02.009","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2017 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}]}}