{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,11]],"date-time":"2024-09-11T06:33:15Z","timestamp":1726036395486},"reference-count":97,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2023,7,7]],"date-time":"2023-07-07T00:00:00Z","timestamp":1688688000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100019779","name":"Qatar National Library","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100019779","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Speech & Language"],"published-print":{"date-parts":[[2024,1]]},"DOI":"10.1016\/j.csl.2023.101539","type":"journal-article","created":{"date-parts":[[2023,7,11]],"date-time":"2023-07-11T12:26:28Z","timestamp":1689078388000},"page":"101539","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":2,"special_numbering":"C","title":["What do end-to-end speech models learn about speaker, language and channel information? A layer-wise and neuron-level analysis"],"prefix":"10.1016","volume":"83","author":[{"ORCID":"http:\/\/orcid.org\/0000-0002-1331-2543","authenticated-orcid":false,"given":"Shammur Absar","family":"Chowdhury","sequence":"first","affiliation":[]},{"given":"Nadir","family":"Durrani","sequence":"additional","affiliation":[]},{"given":"Ahmed","family":"Ali","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"4","key":"10.1016\/j.csl.2023.101539_b1","doi-asserted-by":"crossref","first-page":"124","DOI":"10.1145\/3451150","article-title":"Connecting Arabs: Bridging the gap in dialectal speech recognition","volume":"64","author":"Ali","year":"2021","journal-title":"Commun. ACM"},{"key":"10.1016\/j.csl.2023.101539_b2","doi-asserted-by":"crossref","unstructured":"Ali, A., Chowdhury, S.A., Hussein, A., Hifny, Y., 2021b. Arabic Code-Switching Speech Recognition using Monolingual Data. In: Proc. Interspeech 2021.","DOI":"10.21437\/Interspeech.2021-2231"},{"key":"10.1016\/j.csl.2023.101539_b3","series-title":"2019 IEEE Automatic Speech Recognition and Understanding Workshop","first-page":"1026","article-title":"The MGB-5 challenge: Recognition and dialect identification of dialectal Arabic speech","author":"Ali","year":"2019"},{"key":"10.1016\/j.csl.2023.101539_b4","series-title":"2017 IEEE Automatic Speech Recognition and Understanding Workshop","first-page":"316","article-title":"Speech recognition challenge in the wild: Arabic MGB-3","author":"Ali","year":"2017"},{"key":"10.1016\/j.csl.2023.101539_b5","unstructured":"Amodei, D., Ananthanarayanan, S., Anubhai, R., Bai, J., Battenberg, E., Case, C., Casper, J., Catanzaro, B., Cheng, Q., Chen, G., et al., 2016. Deep speech 2: End-to-end speech recognition in English and Mandarin. In: International Conference on Machine Learning. pp. 173\u2013182."},{"key":"10.1016\/j.csl.2023.101539_b6","series-title":"Common voice: A massively-multilingual speech corpus","author":"Ardila","year":"2019"},{"key":"10.1016\/j.csl.2023.101539_b7","series-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020"},{"key":"10.1016\/j.csl.2023.101539_b8","unstructured":"Bau, D.A., Belinkov, Y., Sajjad, H., Durrani, N., Dalvi, F., Glass, J., 2019. Identifying and Controlling Important Neurons in Neural Machine Translation. In: International Conference on Learning Representations. ICLR."},{"key":"10.1016\/j.csl.2023.101539_b9","doi-asserted-by":"crossref","unstructured":"Bau, D., Zhou, B., Khosla, A., Oliva, A., Torralba, A., 2017. Network dissection: Quantifying interpretability of deep visual representations. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 6541\u20136549.","DOI":"10.1109\/CVPR.2017.354"},{"key":"10.1016\/j.csl.2023.101539_b10","series-title":"Gan dissection: Visualizing and understanding generative adversarial networks","author":"Bau","year":"2018"},{"key":"10.1016\/j.csl.2023.101539_b11","series-title":"Interpreting and explaining deep neural networks for classification of audio signals","author":"Becker","year":"2018"},{"key":"10.1016\/j.csl.2023.101539_b12","series-title":"Interpreting intermediate convolutional layers of CNNs trained on raw speech","author":"Begu\u0161","year":"2021"},{"key":"10.1016\/j.csl.2023.101539_b13","doi-asserted-by":"crossref","unstructured":"Belinkov, Y., Durrani, N., Dalvi, F., Sajjad, H., Glass, J.R., 2017. What do Neural Machine Translation Models Learn about Morphology?.","DOI":"10.18653\/v1\/P17-1080"},{"key":"10.1016\/j.csl.2023.101539_b14","doi-asserted-by":"crossref","first-page":"49","DOI":"10.1162\/tacl_a_00254","article-title":"Analysis methods in neural language processing: A survey","volume":"7","author":"Belinkov","year":"2019","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"10.1016\/j.csl.2023.101539_b15","series-title":"Fifth European Conference on Speech Communication and Technology","article-title":"Multilingual speech recognition: The 1996 byblos callhome system","author":"Billa","year":"1997"},{"key":"10.1016\/j.csl.2023.101539_b16","doi-asserted-by":"crossref","unstructured":"Blevins, T., Levy, O., Zettlemoyer, L., 2018. Deep RNNs Encode Soft Hierarchical Syntax. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers). pp. 14\u201319.","DOI":"10.18653\/v1\/P18-2003"},{"key":"10.1016\/j.csl.2023.101539_b17","doi-asserted-by":"crossref","unstructured":"Chaabouni, R., Dunbar, E., Zeghidour, N., Dupoux, E., 2017. Learning Weakly Supervised Multimodal Phoneme Embeddings. In: Proc. Interspeech 2017. pp. 2218\u20132222.","DOI":"10.21437\/Interspeech.2017-1689"},{"key":"10.1016\/j.csl.2023.101539_b18","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"4960","article-title":"Listen, attend and spell: A neural network for large vocabulary conversational speech recognition","author":"Chan","year":"2016"},{"key":"10.1016\/j.csl.2023.101539_b19","series-title":"Audio ALBERT: A lite BERT for self-supervised learning of audio representation","author":"Chi","year":"2020"},{"key":"10.1016\/j.csl.2023.101539_b20","doi-asserted-by":"crossref","unstructured":"Chowdhury, S.A., Ali, A., Shon, S., Glass, J., 2020. What does an End-to-End Dialect Identification Model Learn about Non-dialectal Information?. In: Proc. Interspeech 2020. pp. 462\u2013466.","DOI":"10.21437\/Interspeech.2020-2235"},{"key":"10.1016\/j.csl.2023.101539_b21","doi-asserted-by":"crossref","unstructured":"Chowdhury, S.A., Hussein, A., Abdelali, A., Ali, A., 2021. Towards One Model to Rule All: Multilingual Strategy for Dialectal Code-Switching Arabic ASR. In: Proc. Interspeech 2021.","DOI":"10.21437\/Interspeech.2021-1809"},{"key":"10.1016\/j.csl.2023.101539_b22","unstructured":"Chowdhury, S.A., Zamparelli, R., 2018. RNN simulations of grammaticality judgments on long-distance dependencies. In: Proceedings of the 27th International Conference on Computational Linguistics. pp. 133\u2013144."},{"key":"10.1016\/j.csl.2023.101539_b23","series-title":"Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","first-page":"613","article-title":"Representations of language in a model of visually grounded speech signal","author":"Chrupa\u0142a","year":"2017"},{"key":"10.1016\/j.csl.2023.101539_b24","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"3040","article-title":"Similarity analysis of self-supervised speech representations","author":"Chung","year":"2021"},{"key":"10.1016\/j.csl.2023.101539_b25","series-title":"Voxceleb2: Deep speaker recognition","author":"Chung","year":"2018"},{"key":"10.1016\/j.csl.2023.101539_b26","doi-asserted-by":"crossref","unstructured":"Conneau, A., Kruszewski, G., Lample, G., Barrault, L., Baroni, M., 2018. What you can cram into a single vector: Probing sentence embeddings for linguistic properties. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics. ACL.","DOI":"10.18653\/v1\/P18-1198"},{"key":"10.1016\/j.csl.2023.101539_b27","doi-asserted-by":"crossref","unstructured":"Dalvi, F., Durrani, N., Sajjad, H., Belinkov, Y., Bau, A., Glass, J., 2019a. What is one grain of sand in the desert? analyzing individual neurons in deep nlp models. In: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 33. pp. 6309\u20136317.","DOI":"10.1609\/aaai.v33i01.33016309"},{"key":"10.1016\/j.csl.2023.101539_b28","unstructured":"Dalvi, F., Durrani, N., Sajjad, H., Belinkov, Y., Vogel, S., 2017. Understanding and improving morphological learning in the neural machine translation decoder. In: Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers). pp. 142\u2013151."},{"key":"10.1016\/j.csl.2023.101539_b29","series-title":"International Conference on Learning Representations","article-title":"Discovering latent concepts learned in BERT","author":"Dalvi","year":"2022"},{"key":"10.1016\/j.csl.2023.101539_b30","doi-asserted-by":"crossref","unstructured":"Dalvi, F., Nortonsmith, A., Bau, A., Belinkov, Y., Sajjad, H., Durrani, N., Glass, J., 2019b. NeuroX: A toolkit for analyzing individual neurons in neural networks. In: Proceedings of the AAAI Conference on Artificial Intelligence. 33, pp. 9851\u20139852.","DOI":"10.1609\/aaai.v33i01.33019851"},{"key":"10.1016\/j.csl.2023.101539_b31","doi-asserted-by":"crossref","unstructured":"Dalvi, F., Sajjad, H., Durrani, N., Belinkov, Y., 2020. Analyzing redundancy in pretrained transformer models. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing. EMNLP, pp. 4908\u20134926.","DOI":"10.18653\/v1\/2020.emnlp-main.398"},{"key":"10.1016\/j.csl.2023.101539_b32","series-title":"Deep Learning in Natural Language Processing","author":"Deng","year":"2018"},{"key":"10.1016\/j.csl.2023.101539_b33","series-title":"Towards a rigorous science of interpretable machine learning","author":"Doshi-Velez","year":"2017"},{"key":"10.1016\/j.csl.2023.101539_b34","series-title":"Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing","first-page":"1495","article-title":"On the transformation of latent space in fine-tuned NLP models","author":"Durrani","year":"2022"},{"key":"10.1016\/j.csl.2023.101539_b35","doi-asserted-by":"crossref","unstructured":"Durrani, N., Sajjad, H., Dalvi, F., Belinkov, Y., 2020. Analyzing Individual Neurons in Pre-trained Language Models. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing. EMNLP, pp. 4865\u20134880.","DOI":"10.18653\/v1\/2020.emnlp-main.395"},{"key":"10.1016\/j.csl.2023.101539_b36","doi-asserted-by":"crossref","unstructured":"Elloumi, Z., Besacier, L., Galibert, O., Lecouteux, B., 2018. Analyzing Learned Representations of a Deep ASR Performance Prediction Model. In: Blackbox NLP Workshop and EMLP 2018.","DOI":"10.18653\/v1\/W18-5402"},{"key":"10.1016\/j.csl.2023.101539_b37","doi-asserted-by":"crossref","unstructured":"Ettinger, A., Elgohary, A., Resnik, P., 2016. Probing for semantic evidence of composition by means of simple classification tasks. In: Proceedings of the 1st Workshop on Evaluating Vector-Space Representations for NLP. pp. 134\u2013139.","DOI":"10.18653\/v1\/W16-2524"},{"key":"10.1016\/j.csl.2023.101539_b38","unstructured":"Frankle, J., Carbin, M., 2018. The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks. In: International Conference on Learning Representations."},{"key":"10.1016\/j.csl.2023.101539_b39","unstructured":"Ghader, H., Monz, C., 2017. What does Attention in Neural Machine Translation Pay Attention to?. In: Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers). pp. 30\u201339."},{"issue":"1","key":"10.1016\/j.csl.2023.101539_b40","doi-asserted-by":"crossref","first-page":"142","DOI":"10.1109\/TPAMI.2015.2437384","article-title":"Region-based convolutional networks for accurate object detection and segmentation","volume":"38","author":"Girshick","year":"2016","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.csl.2023.101539_b41","doi-asserted-by":"crossref","unstructured":"Gulordava, K., Bojanowski, P., Grave, \u00c9., Linzen, T., Baroni, M., 2018. Colorless Green Recurrent Networks Dream Hierarchically. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers). pp. 1195\u20131205.","DOI":"10.18653\/v1\/N18-1108"},{"key":"10.1016\/j.csl.2023.101539_b42","first-page":"90","article-title":"Filter-wrapper combination and embedded feature selection for gene expression data","volume":"10","author":"Hameed","year":"2018","journal-title":"Int. J. Adv. Soft Comput. Appl."},{"key":"10.1016\/j.csl.2023.101539_b43","doi-asserted-by":"crossref","unstructured":"Harwath, D., Glass, J., 2017. Learning Word-Like Units from Joint Audio-Visual Analysis. In: Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics. pp. 506\u2013517.","DOI":"10.18653\/v1\/P17-1047"},{"key":"10.1016\/j.csl.2023.101539_b44","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"5115","article-title":"End-to-end text-dependent speaker verification","author":"Heigold","year":"2016"},{"key":"10.1016\/j.csl.2023.101539_b45","doi-asserted-by":"crossref","unstructured":"Hewitt, J., Liang, P., 2019. Designing and Interpreting Probes with Control Tasks. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). pp. 2733\u20132743.","DOI":"10.18653\/v1\/D19-1275"},{"key":"10.1016\/j.csl.2023.101539_b46","series-title":"Visualisation and \u2018diagnostic classifiers\u2019 reveal how recurrent and recursive neural networks process hierarchical structure","author":"Hupkes","year":"2018"},{"key":"10.1016\/j.csl.2023.101539_b47","article-title":"End-to-end language identification using high-order utterance representation with bilinear pooling","author":"Jin","year":"2017","journal-title":"Int. Speech Commun. Soc."},{"key":"10.1016\/j.csl.2023.101539_b48","series-title":"2013 International Conference on Biometrics","first-page":"1","article-title":"The 2013 speaker recognition evaluation in mobile environment","author":"Khoury","year":"2013"},{"key":"10.1016\/j.csl.2023.101539_b49","series-title":"Proceedings of International Workshop on Spoken Language Translation","article-title":"Translations of the CALLHOME Egyptian Arabic corpus for conversational speech translation","author":"Kumar","year":"2014"},{"key":"10.1016\/j.csl.2023.101539_b50","series-title":"Proceeding of Speech Prosody","first-page":"84","article-title":"Automatic identification of gender from speech","author":"Levitan","year":"2016"},{"key":"10.1016\/j.csl.2023.101539_b51","doi-asserted-by":"crossref","first-page":"521","DOI":"10.1162\/tacl_a_00115","article-title":"Assessing the ability of LSTMs to learn syntax-sensitive dependencies","volume":"4","author":"Linzen","year":"2016","journal-title":"Trans. Assoc. Comput. Linguist."},{"issue":"3","key":"10.1016\/j.csl.2023.101539_b52","doi-asserted-by":"crossref","first-page":"31","DOI":"10.1145\/3236386.3241340","article-title":"The mythos of model interpretability","volume":"16","author":"Lipton","year":"2018","journal-title":"Queue"},{"key":"10.1016\/j.csl.2023.101539_b53","doi-asserted-by":"crossref","unstructured":"Liu, N.F., Levy, O., Schwartz, R., Tan, C., Smith, N.A., 2018. LSTMs Exploit Linguistic Attributes of Data. In: Proceedings of the Third Workshop on Representation Learning for NLP. pp. 180\u2013186.","DOI":"10.18653\/v1\/W18-3024"},{"key":"10.1016\/j.csl.2023.101539_b54","series-title":"Tera: Self-supervised learning of transformer encoder representation or speech","author":"Liu","year":"2020"},{"key":"10.1016\/j.csl.2023.101539_b55","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6419","article-title":"Mockingjay: Unsupervised speech representation learning with deep bidirectional transformer encoders","author":"Liu","year":"2020"},{"issue":"6","key":"10.1016\/j.csl.2023.101539_b56","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3457607","article-title":"A survey on bias and fairness in machine learning","volume":"54","author":"Mehrabi","year":"2021","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.csl.2023.101539_b57","doi-asserted-by":"crossref","unstructured":"Merlo, P., 2019. Probing word and sentence embeddings for long-distance dependencies effects in French and English. In: Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP. pp. 158\u2013172.","DOI":"10.18653\/v1\/W19-4817"},{"key":"10.1016\/j.csl.2023.101539_b58","series-title":"2015 IEEE Workshop on Automatic Speech Recognition and Understanding","first-page":"167","article-title":"EESEN: End-to-end speech recognition using deep RNN models and WFST-based decoding","author":"Miao","year":"2015"},{"key":"10.1016\/j.csl.2023.101539_b59","doi-asserted-by":"crossref","unstructured":"Nagamine, T., Seltzer, M.L., Mesgarani, N., 2015. Exploring how deep neural networks form phonemic categories. In: Sixteenth Annual Conference of the International Speech Communication Association.","DOI":"10.21437\/Interspeech.2015-422"},{"key":"10.1016\/j.csl.2023.101539_b60","doi-asserted-by":"crossref","unstructured":"Nagamine, T., Seltzer, M.L., Mesgarani, N., 2016. On the Role of Nonlinear Transformations in Deep Neural Network Acoustic Models. In: Interspeech. pp. 803\u2013807.","DOI":"10.21437\/Interspeech.2016-1406"},{"key":"10.1016\/j.csl.2023.101539_b61","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Chung, J.S., Zisserman, A., 2017. VoxCeleb: A Large-Scale Speaker Identification Dataset. In: Proc. Interspeech 2017. pp. 2616\u20132620.","DOI":"10.21437\/Interspeech.2017-950"},{"key":"10.1016\/j.csl.2023.101539_b62","first-page":"3387","article-title":"Synthesizing the preferred inputs for neurons in neural networks via deep generator networks","volume":"29","author":"Nguyen","year":"2016","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.csl.2023.101539_b63","series-title":"Comparative layer-wise analysis of self-supervised speech models","author":"Pasad","year":"2022"},{"key":"10.1016\/j.csl.2023.101539_b64","series-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","first-page":"1499","article-title":"Dissecting contextual word embeddings: Architecture and representation","author":"Peters","year":"2018"},{"key":"10.1016\/j.csl.2023.101539_b65","series-title":"Proceedings of the 4th Workshop on Representation Learning for NLP","first-page":"7","article-title":"To tune or not to tune? Adapting pretrained representations to diverse tasks","author":"Peters","year":"2019"},{"key":"10.1016\/j.csl.2023.101539_b66","series-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","first-page":"4609","article-title":"Information-theoretic probing for linguistic structure","author":"Pimentel","year":"2020"},{"key":"10.1016\/j.csl.2023.101539_b67","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6460","article-title":"Wav2letter++: A fast open-source speech recognition system","author":"Pratap","year":"2019"},{"key":"10.1016\/j.csl.2023.101539_b68","doi-asserted-by":"crossref","unstructured":"Qian, P., Qiu, X., Huang, X.-J., 2016. Analyzing linguistic knowledge in sequential model of sentence. In: Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing. pp. 826\u2013835.","DOI":"10.18653\/v1\/D16-1079"},{"key":"10.1016\/j.csl.2023.101539_b69","series-title":"Proceedings of the Thirty-Sixth Conference on Uncertainty in Artificial Intelligence","first-page":"197","article-title":"TX-Ray: Quantifying and explaining model-knowledge transfer in (un-)supervised NLP","author":"Rethmeier","year":"2020"},{"issue":"C","key":"10.1016\/j.csl.2023.101539_b70","article-title":"On the effect of dropping layers of pre-trained transformer models","volume":"77","author":"Sajjad","year":"2023","journal-title":"Comput. Speech Lang.","ISSN":"http:\/\/id.crossref.org\/issn\/0885-2308","issn-type":"print"},{"key":"10.1016\/j.csl.2023.101539_b71","doi-asserted-by":"crossref","first-page":"1285","DOI":"10.1162\/tacl_a_00519","article-title":"Neuron-level interpretation of deep NLP models: A survey","volume":"10","author":"Sajjad","year":"2022","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"10.1016\/j.csl.2023.101539_b72","series-title":"What all do audio transformer models hear? Probing acoustic representations for language delivery and its structure","author":"Shah","year":"2021"},{"key":"10.1016\/j.csl.2023.101539_b73","doi-asserted-by":"crossref","unstructured":"Sheikholeslami, S., Meister, M., Wang, T., Payberah, A.H., Vlassov, V., Dowling, J., 2021. AutoAblation: Automated Parallel Ablation Studies for Deep Learning. In: Proceedings of the 1st Workshop on Machine Learning and Systems. pp. 55\u201361.","DOI":"10.1145\/3437984.3458834"},{"key":"10.1016\/j.csl.2023.101539_b74","doi-asserted-by":"crossref","unstructured":"Shi, X., Knight, K., Yuret, D., 2016a. Why neural translations are the right length. In: Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing. pp. 2278\u20132282.","DOI":"10.18653\/v1\/D16-1248"},{"key":"10.1016\/j.csl.2023.101539_b75","doi-asserted-by":"crossref","unstructured":"Shi, X., Padhi, I., Knight, K., 2016b. Does string-based neural MT learn source syntax?. In: Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing. pp. 1526\u20131534.","DOI":"10.18653\/v1\/D16-1159"},{"key":"10.1016\/j.csl.2023.101539_b76","doi-asserted-by":"crossref","unstructured":"Shon, S., Ali, A., Glass, J., 2018a. Convolutional Neural Network and Language Embeddings for End-to-End Dialect Recognition. In: Proc. Odyssey 2018 the Speaker and Language Recognition Workshop. pp. 98\u2013104.","DOI":"10.21437\/Odyssey.2018-14"},{"key":"10.1016\/j.csl.2023.101539_b77","doi-asserted-by":"crossref","unstructured":"Shon, S., Ali, A., Samih, Y., Mubarak, H., Glass, J., 2020. ADI17: A Fine-Grained Arabic Dialect Identification Dataset. In: IEEE International Conference on Acoustics, Speech and Signal Processing. ICASSP, pp. 8244\u20138248.","DOI":"10.1109\/ICASSP40776.2020.9052982"},{"key":"10.1016\/j.csl.2023.101539_b78","series-title":"2018 IEEE Spoken Language Technology Workshop","first-page":"1007","article-title":"Frame-level speaker embeddings for text-independent speaker recognition and analysis of end-to-end model","author":"Shon","year":"2018"},{"key":"10.1016\/j.csl.2023.101539_b79","series-title":"Do RNN states encode abstract phonological processes?","author":"Silfverberg","year":"2021"},{"key":"10.1016\/j.csl.2023.101539_b80","doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., Povey, D., Khudanpur, S., 2017. Deep Neural Network Embeddings for Text-Independent Speaker Verification. In: Interspeech. pp. 999\u20131003.","DOI":"10.21437\/Interspeech.2017-620"},{"key":"10.1016\/j.csl.2023.101539_b81","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"5329","article-title":"X-vectors: Robust dnn embeddings for speaker recognition","author":"Snyder","year":"2018"},{"key":"10.1016\/j.csl.2023.101539_b82","series-title":"Are pre-trained convolutions better than pre-trained transformers?","author":"Tay","year":"2021"},{"key":"10.1016\/j.csl.2023.101539_b83","series-title":"Probing speech emotion recognition transformers for linguistic knowledge","author":"Triantafyllopoulos","year":"2022"},{"key":"10.1016\/j.csl.2023.101539_b84","doi-asserted-by":"crossref","unstructured":"Trong, T.N., Hautam\u00e4ki, V., Lee, K.-A., 2016. Deep Language: a comprehensive deep learning approach to end-to-end language recognition. In: Odyssey. pp. 109\u2013116.","DOI":"10.21437\/Odyssey.2016-16"},{"key":"10.1016\/j.csl.2023.101539_b85","doi-asserted-by":"crossref","unstructured":"Voita, E., Serdyukov, P., Sennrich, R., Titov, I., 2018. Context-Aware Neural Machine Translation Learns Anaphora Resolution. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). pp. 1264\u20131274.","DOI":"10.18653\/v1\/P18-1117"},{"key":"10.1016\/j.csl.2023.101539_b86","series-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing","article-title":"Information-theoretic probing with minimum description length","author":"Voita","year":"2020"},{"key":"10.1016\/j.csl.2023.101539_b87","doi-asserted-by":"crossref","DOI":"10.1155\/2018\/7068349","article-title":"Deep learning for computer vision: A brief review","author":"Voulodimos","year":"2018","journal-title":"Comput. Intell. Neurosci."},{"key":"10.1016\/j.csl.2023.101539_b88","series-title":"Gate activation signal analysis for gated recurrent neural networks and its correlation with phoneme boundaries","author":"Wang","year":"2017"},{"key":"10.1016\/j.csl.2023.101539_b89","doi-asserted-by":"crossref","unstructured":"Wang, S., Qian, Y., Yu, K., 2017b. What does the speaker embedding encode?. In: Interspeech. pp. 1497\u20131501.","DOI":"10.21437\/Interspeech.2017-1125"},{"key":"10.1016\/j.csl.2023.101539_b90","series-title":"Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP","first-page":"353","article-title":"GLUE: A multi-task benchmark and analysis platform for natural language understanding","author":"Wang","year":"2018"},{"key":"10.1016\/j.csl.2023.101539_b91","series-title":"Unsupervised pre-training of bidirectional speech encoders via masked reconstruction","author":"Wang","year":"2020"},{"key":"10.1016\/j.csl.2023.101539_b92","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"5140","article-title":"Investigating gated recurrent networks for speech synthesis","author":"Wu","year":"2016"},{"issue":"6","key":"10.1016\/j.csl.2023.101539_b93","doi-asserted-by":"crossref","first-page":"1380","DOI":"10.1109\/JSTSP.2022.3203608","article-title":"Autoregressive predictive coding: A comprehensive study","volume":"16","author":"Yang","year":"2022","journal-title":"IEEE J. Sel. Top. Sign. Proces."},{"key":"10.1016\/j.csl.2023.101539_b94","series-title":"European Conference on Computer Vision","first-page":"818","article-title":"Visualizing and understanding convolutional networks","author":"Zeiler","year":"2014"},{"issue":"1","key":"10.1016\/j.csl.2023.101539_b95","doi-asserted-by":"crossref","first-page":"27","DOI":"10.1631\/FITEE.1700808","article-title":"Visual interpretability for deep learning: a survey","volume":"19","author":"Zhang","year":"2018","journal-title":"Front. Inf. Technol. Electron. Eng."},{"key":"10.1016\/j.csl.2023.101539_b96","series-title":"Object detectors emerge in deep scene cnns","author":"Zhou","year":"2014"},{"issue":"2","key":"10.1016\/j.csl.2023.101539_b97","doi-asserted-by":"crossref","first-page":"301","DOI":"10.1111\/j.1467-9868.2005.00503.x","article-title":"Regularization and variable selection via the elastic net","volume":"67","author":"Zou","year":"2005","journal-title":"J. R. Stat. Soc. Ser. B Stat. Methodol."}],"container-title":["Computer Speech & Language"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S088523082300058X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S088523082300058X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2023,9,15]],"date-time":"2023-09-15T16:49:59Z","timestamp":1694796599000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S088523082300058X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1]]},"references-count":97,"alternative-id":["S088523082300058X"],"URL":"https:\/\/doi.org\/10.1016\/j.csl.2023.101539","relation":{},"ISSN":["0885-2308"],"issn-type":[{"value":"0885-2308","type":"print"}],"subject":[],"published":{"date-parts":[[2024,1]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"What do end-to-end speech models learn about speaker, language and channel information? A layer-wise and neuron-level analysis","name":"articletitle","label":"Article Title"},{"value":"Computer Speech & Language","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.csl.2023.101539","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2023 The Author(s). Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"101539"}}