{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,12]],"date-time":"2024-09-12T13:12:44Z","timestamp":1726146764104},"reference-count":44,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Speech & Language"],"published-print":{"date-parts":[[2022,1]]},"DOI":"10.1016\/j.csl.2021.101272","type":"journal-article","created":{"date-parts":[[2021,8,8]],"date-time":"2021-08-08T05:31:49Z","timestamp":1628400709000},"page":"101272","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":25,"special_numbering":"C","title":["Arabic speech recognition by end-to-end, modular systems and human"],"prefix":"10.1016","volume":"71","author":[{"ORCID":"http:\/\/orcid.org\/0000-0002-0820-4062","authenticated-orcid":false,"given":"Amir","family":"Hussein","sequence":"first","affiliation":[]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[]},{"given":"Ahmed","family":"Ali","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.csl.2021.101272_b1","series-title":"Computational Linguistics, Speech and Image Processing for Arabic Language, Vol. 
4","first-page":"231","article-title":"End-to-end lexicon free arabic speech recognition using recurrent neural networks","author":"Ahmed","year":"2018"},{"key":"10.1016\/j.csl.2021.101272_b2","series-title":"IEEE Spoken Language Technology Workshop (SLT)","first-page":"279","article-title":"The MGB-2 challenge: Arabic multi-dialect broadcast media recognition","author":"Ali","year":"2016"},{"key":"10.1016\/j.csl.2021.101272_b3","series-title":"IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","first-page":"576","article-title":"Multi-reference WER for evaluating ASR for languages with no orthographic rules","author":"Ali","year":"2015"},{"key":"10.1016\/j.csl.2021.101272_b4","series-title":"IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","first-page":"1026","article-title":"The MGB-5 challenge: Recognition and dialect identification of dialectal Arabic speech","author":"Ali","year":"2019"},{"key":"10.1016\/j.csl.2021.101272_b5","series-title":"IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","first-page":"316","article-title":"Speech recognition challenge in the wild: Arabic MGB-3","author":"Ali","year":"2017"},{"key":"10.1016\/j.csl.2021.101272_b6","series-title":"IEEE Spoken Language Technology Workshop (SLT)","first-page":"525","article-title":"A complete KALDI recipe for building arabic speech recognition systems","author":"Ali","year":"2014"},{"key":"10.1016\/j.csl.2021.101272_b7","series-title":"International Conference on Machine Learning","first-page":"173","article-title":"Deep speech 2: End-to-end speech recognition in English and Mandarin","author":"Amodei","year":"2016"},{"key":"10.1016\/j.csl.2021.101272_b8","series-title":"Layer normalization","author":"Ba","year":"2016"},{"key":"10.1016\/j.csl.2021.101272_b9","doi-asserted-by":"crossref","unstructured":"Belinkov12, Y., Ali, A., Glass, J., 2019. Analyzing phonetic and graphemic representations in end-to-end automatic speech recognition. In: Proc. Interspeech.","DOI":"10.21437\/Interspeech.2019-2599"},{"key":"10.1016\/j.csl.2021.101272_b10","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"4960","article-title":"Listen, attend and spell: A neural network for large vocabulary conversational speech recognition","author":"Chan","year":"2016"},{"key":"10.1016\/j.csl.2021.101272_b11","first-page":"577","article-title":"Attention-based models for speech recognition","volume":"28","author":"Chorowski","year":"2015","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"1","key":"10.1016\/j.csl.2021.101272_b12","doi-asserted-by":"crossref","first-page":"30","DOI":"10.1109\/TASL.2011.2134090","article-title":"Context-dependent pre-trained deep neural networks for large-vocabulary speech recognition","volume":"20","author":"Dahl","year":"2011","journal-title":"IEEE Trans. Audio Speech Lang. 
Process."},{"key":"10.1016\/j.csl.2021.101272_b13","series-title":"Sixteenth Annual Conference of the International Speech Communication Association","article-title":"Cross-lingual transfer learning during supervised training in low resource scenarios","author":"Das","year":"2015"},{"key":"10.1016\/j.csl.2021.101272_b14","series-title":"Music Information Retrieval Evaluation EXchange (MIREX 2018)","article-title":"INA\u2019s mirex 2018 music and speech detection system","author":"Doukhan","year":"2018"},{"key":"10.1016\/j.csl.2021.101272_b15","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"2494","article-title":"A pitch extraction algorithm tuned for automatic speech recognition","author":"Ghahremani","year":"2014"},{"issue":"12","key":"10.1016\/j.csl.2021.101272_b16","doi-asserted-by":"crossref","DOI":"10.1371\/journal.pone.0144610","article-title":"Pyaudioanalysis: An open-source python library for audio signal analysis","volume":"10","author":"Giannakopoulos","year":"2015","journal-title":"PLoS One"},{"key":"10.1016\/j.csl.2021.101272_b17","series-title":"International Conference on Machine Learning","first-page":"1764","article-title":"Towards end-to-end speech recognition with recurrent neural networks","author":"Graves","year":"2014"},{"key":"10.1016\/j.csl.2021.101272_b18","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"6645","article-title":"Speech recognition with deep recurrent neural networks","author":"Graves","year":"2013"},{"key":"10.1016\/j.csl.2021.101272_b19","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J., 2016. Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"issue":"2","key":"10.1016\/j.csl.2021.101272_b20","first-page":"65","article-title":"Unified acoustic modeling using deep conditional random fields","volume":"3","author":"Hifny","year":"2015","journal-title":"Trans. Mach. Learn. Artif. 
Intell."},{"key":"10.1016\/j.csl.2021.101272_b21","series-title":"IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","first-page":"449","article-title":"A comparative study on transformer vs RNN in speech applications","author":"Karita","year":"2019"},{"key":"10.1016\/j.csl.2021.101272_b22","series-title":"IEEE Spoken Language Technology Workshop (SLT)","first-page":"292","article-title":"QCRI advanced transcription system (QATS) for the arabic multi-dialect broadcast media recognition: MGB-2 challenge","author":"Khurana","year":"2016"},{"key":"10.1016\/j.csl.2021.101272_b23","series-title":"DARTS: Dialectal Arabic transcription system","author":"Khurana","year":"2019"},{"key":"10.1016\/j.csl.2021.101272_b24","series-title":"Sixteenth Annual Conference of the International Speech Communication Association","article-title":"Audio augmentation for speech recognition","author":"Ko","year":"2015"},{"key":"10.1016\/j.csl.2021.101272_b25","series-title":"Subword regularization: Improving neural network translation models with multiple subword candidates","author":"Kudo","year":"2018"},{"key":"10.1016\/j.csl.2021.101272_b26","series-title":"Margins of Writing, Origins of Cultures","first-page":"159","article-title":"The lives of the Sumerian language","author":"Michalowski","year":"2006"},{"key":"10.1016\/j.csl.2021.101272_b27","doi-asserted-by":"crossref","unstructured":"Mubarak, H., Abdelali, A., Sajjad, H., Samih, Y., Darwish, K., 2019. Highly effective arabic diacritization using sequence to sequence modeling. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). pp. 2390\u20132395.","DOI":"10.18653\/v1\/N19-1248"},{"key":"10.1016\/j.csl.2021.101272_b28","series-title":"IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","first-page":"838","article-title":"Hierarchical transformers for long document classification","author":"Pappagari","year":"2019"},{"key":"10.1016\/j.csl.2021.101272_b29","doi-asserted-by":"crossref","unstructured":"Park, D.S., Chan, W., Zhang, Y., Chiu, C.-C., Zoph, B., Cubuk, E.D., Le, Q.V., 2019. SpecAugment: A simple data augmentation method for automatic speech recognition. In: Proc. Interspeech. pp. 2613\u20132617.","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"10.1016\/j.csl.2021.101272_b30","series-title":"Sixteenth Annual Conference of the International Speech Communication Association","article-title":"A time delay neural network architecture for efficient modeling of long temporal contexts","author":"Peddinti","year":"2015"},{"key":"10.1016\/j.csl.2021.101272_b31","series-title":"IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), no. CONF","article-title":"The Kaldi speech recognition toolkit","author":"Povey","year":"2011"},{"key":"10.1016\/j.csl.2021.101272_b32","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"5874","article-title":"A time-restricted self-attention layer for ASR","author":"Povey","year":"2018"},{"key":"10.1016\/j.csl.2021.101272_b33","doi-asserted-by":"crossref","unstructured":"Povey, D., Peddinti, V., Galvez, D., Ghahremani, P., Manohar, V., Na, X., Wang, Y., Khudanpur, S., 2016. Purely sequence-trained neural networks for ASR based on lattice-free MMI. In: Proc. Interspeech. pp. 
2751\u20132755.","DOI":"10.21437\/Interspeech.2016-595"},{"key":"10.1016\/j.csl.2021.101272_b34","doi-asserted-by":"crossref","unstructured":"Saon, G., Kurata, G., Sercu, T., Audhkhasi, K., Thomas, S., Dimitriadis, D., Cui, X., Ramabhadran, B., Picheny, M., Lim, L.-L., et al., 2017. English conversational telephone speech recognition by humans and machines. In: Proc. Interspeech. pp. 132\u2013136.","DOI":"10.21437\/Interspeech.2017-405"},{"key":"10.1016\/j.csl.2021.101272_b35","series-title":"IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","first-page":"338","article-title":"Aalto system for the 2017 Arabic multi-genre broadcast challenge","author":"Smit","year":"2017"},{"key":"10.1016\/j.csl.2021.101272_b36","series-title":"Comparing human and machine errors in conversational speech transcription","author":"Stolcke","year":"2017"},{"key":"10.1016\/j.csl.2021.101272_b37","series-title":"End-to-end ASR: from supervised to semi-supervised learning with modern architectures","author":"Synnaeve","year":"2020"},{"key":"10.1016\/j.csl.2021.101272_b38","series-title":"Advances in Neural Information Processing Systems","first-page":"5998","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.csl.2021.101272_b39","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"6874","article-title":"Transformer-based acoustic modeling for hybrid speech recognition","author":"Wang","year":"2020"},{"issue":"8","key":"10.1016\/j.csl.2021.101272_b40","doi-asserted-by":"crossref","first-page":"1018","DOI":"10.3390\/sym11081018","article-title":"An overview of end-to-end automatic speech recognition","volume":"11","author":"Wang","year":"2019","journal-title":"Symmetry"},{"key":"10.1016\/j.csl.2021.101272_b41","doi-asserted-by":"crossref","unstructured":"Watanabe, S., Hori, T., Karita, S., Hayashi, T., Nishitoba, J., Unno, Y., Soplin, N.-E.Y., Heymann, J., Wiesner, M., Chen, N., et al., 2018. ESPnet: End-to-end speech processing toolkit. In: Proc. Interspeech. pp. 2207\u20132211.","DOI":"10.21437\/Interspeech.2018-1456"},{"issue":"8","key":"10.1016\/j.csl.2021.101272_b42","doi-asserted-by":"crossref","first-page":"1240","DOI":"10.1109\/JSTSP.2017.2763455","article-title":"Hybrid CTC\/attention architecture for end-to-end speech recognition","volume":"11","author":"Watanabe","year":"2017","journal-title":"IEEE J. Sel. Top. Sign. 
Proces."},{"key":"10.1016\/j.csl.2021.101272_b43","series-title":"Achieving human parity in conversational speech recognition","author":"Xiong","year":"2016"},{"key":"10.1016\/j.csl.2021.101272_b44","series-title":"Improved training of end-to-end attention models for speech recognition","author":"Zeyer","year":"2018"}],"container-title":["Computer Speech & Language"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0885230821000760?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0885230821000760?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2023,3,4]],"date-time":"2023-03-04T22:24:59Z","timestamp":1677968699000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0885230821000760"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1]]},"references-count":44,"alternative-id":["S0885230821000760"],"URL":"https:\/\/doi.org\/10.1016\/j.csl.2021.101272","relation":{},"ISSN":["0885-2308"],"issn-type":[{"value":"0885-2308","type":"print"}],"subject":[],"published":{"date-parts":[[2022,1]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Arabic speech recognition by end-to-end, modular systems and human","name":"articletitle","label":"Article Title"},{"value":"Computer Speech & Language","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.csl.2021.101272","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2021 Elsevier Ltd. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"101272"}}