{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,7,2]],"date-time":"2024-07-02T18:07:42Z","timestamp":1719943662865},"reference-count":53,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2017,11,1]],"date-time":"2017-11-01T00:00:00Z","timestamp":1509494400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Speech & Language"],"published-print":{"date-parts":[[2017,11]]},"DOI":"10.1016\/j.csl.2017.06.001","type":"journal-article","created":{"date-parts":[[2017,6,14]],"date-time":"2017-06-14T05:30:16Z","timestamp":1497418216000},"page":"233-248","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["Scalable algorithms for unsupervised clustering of acoustic data for speech recognition"],"prefix":"10.1016","volume":"46","author":[{"given":"Shakti P.","family":"Rath","sequence":"first","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.csl.2017.06.001_bib0001","series-title":"Proceedings of International Conference on Spoken Language Processing (ICSLP)","article-title":"HMM adaptation using Vector Taylor series for noisy speech recognition","author":"Acero","year":"2000"},{"key":"10.1016\/j.csl.2017.06.001_bib0002","series-title":"Proceedings of the Fourth International Conference on Spoken Language (ICSLP 96) Philadelphia","first-page":"1137","article-title":"A compact model for speaker-adaptive training","author":"Anastasakos","year":"1996"},{"key":"10.1016\/j.csl.2017.06.001_bib0003","series-title":"Proceedings of Rich Transcription Workshop","article-title":"Improving speaker diarization","author":"Barras","year":"2004"},{"key":"10.1016\/j.csl.2017.06.001_bib0004","series-title":"Proceedings of InterSpeech","article-title":"Unsupervised discovery and training of maximally dissimilar cluster models","author":"Beaufays","year":"2010"},{"issue":"4","key":"10.1016\/j.csl.2017.06.001_bib0005","doi-asserted-by":"crossref","first-page":"357","DOI":"10.1109\/TASSP.1980.1163420","article-title":"Comparison of parametric representations for monosyllabic word recognition in continuous spoken sentences","volume":"ASSP-28","author":"Davis","year":"1980","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"issue":"4","key":"10.1016\/j.csl.2017.06.001_bib0006","doi-asserted-by":"crossref","first-page":"788","DOI":"10.1109\/TASL.2010.2064307","article-title":"Front-end factor analysis for speaker verification","volume":"19","author":"Dehak","year":"2011","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.csl.2017.06.001_bib0007","series-title":"Pattern Classification (2nd Edition)","author":"Duda","year":"2000"},{"key":"10.1016\/j.csl.2017.06.001_bib0008","series-title":"Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP-94)","article-title":"A model distance measure for talker clustering and identification","author":"Foote","year":"1994"},{"issue":"2","key":"10.1016\/j.csl.2017.06.001_bib0009","doi-asserted-by":"crossref","first-page":"75","DOI":"10.1006\/csla.1998.0043","article-title":"Maximum likelihood linear transformations for HMM-based speech recognition","volume":"12","author":"Gales","year":"1998","journal-title":"Comput. Speech Lang."},{"issue":"3","key":"10.1016\/j.csl.2017.06.001_bib0010","doi-asserted-by":"crossref","first-page":"272","DOI":"10.1109\/89.759034","article-title":"Semi-tied covariance matrices for hidden Markov models","volume":"7","author":"Gales","year":"1999","journal-title":"IEEE Trans. Speech Audio Proc."},{"key":"10.1016\/j.csl.2017.06.001_bib0011","doi-asserted-by":"crossref","first-page":"417","DOI":"10.1109\/89.848223","article-title":"Cluster adaptive training of hidden Markov models","volume":"8","author":"Gales","year":"2000","journal-title":"IEEE Trans. Speech Audio Process"},{"key":"10.1016\/j.csl.2017.06.001_bib0012","series-title":"Proceedings of Automatic Speech Recognition & and Understanding (ASRU)","article-title":"Acoustic factorisation","author":"Gales","year":"2001"},{"issue":"5","key":"10.1016\/j.csl.2017.06.001_bib0013","doi-asserted-by":"crossref","first-page":"3520359","DOI":"10.1109\/89.536929","article-title":"Robust continuous speech recognition using parallel model combination","volume":"4","author":"Gales","year":"1996","journal-title":"IEEE Trans. Speech Audio Proc."},{"key":"10.1016\/j.csl.2017.06.001_bib0014","series-title":"Proceedings of Interspeech","article-title":"Robust i-vector based adaptation of DNN acoustic model for speech recognition","author":"Garimella","year":"2015"},{"key":"10.1016\/j.csl.2017.06.001_bib0015","series-title":"Proceedings of Interspeech","article-title":"Partitioning and transcription of broadcast news data","author":"Gauvain","year":"1998"},{"key":"10.1016\/j.csl.2017.06.001_bib0016","doi-asserted-by":"crossref","first-page":"291","DOI":"10.1109\/89.279278","article-title":"Maximum-a-posteriori estimation for multivariate Gaussian observations of Markov chains","volume":"2","author":"Gauvain","year":"1994","journal-title":"IEEE Trans. Speech Audio Proc."},{"key":"10.1016\/j.csl.2017.06.001_bib0017","series-title":"Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","article-title":"Simplification and optimization of i-vector extraction","author":"Glembek","year":"2011"},{"key":"10.1016\/j.csl.2017.06.001_bib0018","series-title":"Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) 1998. Sydney","article-title":"Maximum likelihood modeling with Gaussian distribution for classification","author":"Gopinath","year":"1998"},{"key":"10.1016\/j.csl.2017.06.001_bib0019","series-title":"2013 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","article-title":"Speech recognition with deep recurrent neural networks","author":"Graves","year":"2013"},{"key":"10.1016\/j.csl.2017.06.001_bib0020","series-title":"Proceedings of Eurospeech","article-title":"Acoustic clustering and adaptation for robust speech recognition","author":"Heck","year":"1997"},{"key":"10.1016\/j.csl.2017.06.001_bib0021","doi-asserted-by":"crossref","first-page":"738","DOI":"10.1121\/1.399423","article-title":"Perceptual linear predictive (PLP) analysis for speech","volume":"87","author":"Hermansky","year":"1990","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.csl.2017.06.001_bib0022","unstructured":"Hinton, G., 2010. A practical guide to training restricted Boltzmann machines. Available at: https:\/\/www.cs.toronto.edu\/~hinton\/absps\/guideTR.pdf."},{"issue":"6","key":"10.1016\/j.csl.2017.06.001_bib0023","doi-asserted-by":"crossref","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","article-title":"Deep neural networks for acoustic modeling in speech recognition: the shared views of four research groups","volume":"29","author":"Hinton","year":"2012","journal-title":"Signal Process. Mag. IEEE"},{"issue":"8","key":"10.1016\/j.csl.2017.06.001_bib0024","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","article-title":"Long short-term memory","volume":"9","author":"Hochreiter","year":"1997","journal-title":"Neural Comput."},{"key":"10.1016\/j.csl.2017.06.001_bib0025","series-title":"Proceedings of Interspeech 2012","article-title":"Application of pretrained deep neural networks to large vocabulary speech recognition","author":"Jaitly","year":"2012"},{"key":"10.1016\/j.csl.2017.06.001_bib0026","series-title":"Proceedings of Automatic Speech Recognition & and Understanding (ASRU)","article-title":"i-Vector-based discriminative adaptation for automatic speech recognition","author":"Karafiat","year":"2011"},{"key":"10.1016\/j.csl.2017.06.001_bib0027","series-title":"Proceedings of Interspeech","article-title":"I-vector estimation using informative priors for adaptation of deep neural networks","author":"Karanasou","year":"2015"},{"key":"10.1016\/j.csl.2017.06.001_bib0028","unstructured":"Kenny, P., 2013. Joint factor analysis of speaker and session variability: theory and algorithms. Available at: http:\/\/www.crim.ca\/perso\/patrick.kenny\/FAtheory.pdf."},{"key":"10.1016\/j.csl.2017.06.001_bib0029","doi-asserted-by":"crossref","unstructured":"Kenny, P., Boulianne, G., Ouellet, P., Dumouchel, P., 2007. Speaker and session variability in GMM-based speaker verification. IEEE Trans. Audio Speech Lang. Process. 15 (4), 1448\u20131460.","DOI":"10.1109\/TASL.2007.894527"},{"key":"10.1016\/j.csl.2017.06.001_bib0030","series-title":"Proceedings of Automatic Speech Recognition & and Understanding (ASRU)","article-title":"Large scale deep neural network acoustic modeling with semi-supervised training data for youtube video transcription","author":"Liao","year":"2013"},{"issue":"1","key":"10.1016\/j.csl.2017.06.001_bib0031","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1109\/TCOM.1980.1094577","article-title":"An algorithm for vector quantizer design","volume":"COM-28","author":"Linde","year":"1980","journal-title":"IEEE Trans. Commun."},{"key":"10.1016\/j.csl.2017.06.001_bib0032","series-title":"Proceedings of IEEE Spoken Language Technology Workshop (SLT)","article-title":"Improvements to speaker adaptive training of deep neural networks","author":"Miao","year":"2014"},{"key":"10.1016\/j.csl.2017.06.001_bib0033","series-title":"IEEE Transactions on Speech and Audio Processing","article-title":"Speaker clustering and transformation for speaker adaptation in speech recognition systems","volume":"Vol.\u00a06","author":"Padmanabhan","year":"1998"},{"key":"10.1016\/j.csl.2017.06.001_bib0034","series-title":"Proceedings of Interspeech","article-title":"A time delay neural network architecture for efficient modeling of long temporal contexts","author":"Peddinti","year":"2015"},{"key":"10.1016\/j.csl.2017.06.001_bib0035","doi-asserted-by":"crossref","first-page":"404","DOI":"10.1016\/j.csl.2010.06.003","article-title":"The subspace Gaussian mixture model-a structured model for speech recognition","volume":"25","author":"Povey","year":"2011","journal-title":"Comput. Speech Lang."},{"key":"10.1016\/j.csl.2017.06.001_bib0036","series-title":"Proceedings of IEEE Automatic Speech Recognition & and Understanding (ASRU)","article-title":"The Kaldi speech recognition toolkit","author":"Povey","year":"2011"},{"key":"10.1016\/j.csl.2017.06.001_bib0037","series-title":"Proceedings of Interspeech","article-title":"Improved feature processing for deep neural networks","author":"Rath","year":"2013"},{"key":"10.1016\/j.csl.2017.06.001_bib0038","series-title":"Proceedings of International Conference on Acoustics, Speech and Signal Processing (ICASSP)","article-title":"Deep convolutional neural networks for LVCSR","author":"Sainath","year":"2013"},{"key":"10.1016\/j.csl.2017.06.001_bib0039","series-title":"Proceedings of International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"4580","article-title":"Convolutional, long short-term memory, fully connected deep neural networks","author":"Sainath","year":"2015"},{"key":"10.1016\/j.csl.2017.06.001_bib0040","series-title":"Proceedings of Interspeech","article-title":"Long short-term memory recurrent neural network architectures for large scale acoustic modeling","author":"Sak","year":"2014"},{"key":"10.1016\/j.csl.2017.06.001_bib0041","series-title":"Proceedings of Eurospeech","article-title":"Training data clustering for improved speech recognition","author":"Sankar","year":"1995"},{"key":"10.1016\/j.csl.2017.06.001_bib0042","series-title":"Proceedings of Interspeech","article-title":"Unfolded recurrent neural networks for speech recognition","author":"Saon","year":"2014"},{"key":"10.1016\/j.csl.2017.06.001_bib0043","series-title":"Proceedings of Automatic Speech Recognition & and Understanding ASRU","article-title":"Speaker adaptation of neural network acoustic models using I-vectors","author":"Saon","year":"2013"},{"key":"10.1016\/j.csl.2017.06.001_bib0044","series-title":"Proceedings of IEEE Automatic Speech Recognition & and Understanding ASRU","article-title":"Feature engineering in context-dependent deep neural networks for conversational speech transcription","author":"Seide","year":"2011"},{"key":"10.1016\/j.csl.2017.06.001_bib0045","series-title":"Proceedings of InterSpeech","article-title":"Factored adaptation using a combination of feature-space and model-space transforms","author":"Seltzer","year":"2012"},{"key":"10.1016\/j.csl.2017.06.001_bib0046","series-title":"Proceedings of International Conference on Acoustics, Speech and Signal Processing (ICASSP)","article-title":"Improving DNN speaker independence with I-Vector inputs","author":"Senior","year":"2014"},{"key":"10.1016\/j.csl.2017.06.001_bib0047","series-title":"Proceedings of International Conference on Acoustics, Speech and Signal Processing (ICASSP)","article-title":"Very deep multilingual convolutional neural networks for LVCSR","author":"Sercu","year":"2016"},{"key":"10.1016\/j.csl.2017.06.001_bib0048","series-title":"Proceedings of Interspeech","article-title":"The Cambridge University March 2005 speaker diarisation system","author":"Sinha","year":"2005"},{"key":"10.1016\/j.csl.2017.06.001_bib0049","series-title":"Proceedings of Workshop on Fall 2004 Rich Transcription (RT-04f)","article-title":"The Development of the Cambridge University RT-04 diarisation System","author":"Tranter","year":"2004"},{"key":"10.1016\/j.csl.2017.06.001_bib0050","series-title":"Proceedings of Interspeech 2013","article-title":"Sequence-discriminative training of deep neural networks","author":"Vesel\u00fd","year":"2013"},{"key":"10.1016\/j.csl.2017.06.001_bib0051","series-title":"Proceedings of IEEE Automatic Speech Recognition & and Understanding (ASRU)","article-title":"Convolutive bottleneck network features for LVCSR","author":"Vesely","year":"2011"},{"key":"10.1016\/j.csl.2017.06.001_bib0052","doi-asserted-by":"crossref","first-page":"2149","DOI":"10.1109\/TASL.2012.2198059","article-title":"Speaker and noise factorization for robust speech recognition","volume":"20","author":"Wang","year":"2012","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.csl.2017.06.001_bib0053","series-title":"Proceedings of InterSpeech","article-title":"An i-Vector based approach to training data clustering for improved speech recognition","author":"Zhang","year":"2011"}],"container-title":["Computer Speech & Language"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0885230816303412?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0885230816303412?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2018,8,31]],"date-time":"2018-08-31T22:27:28Z","timestamp":1535754448000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0885230816303412"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,11]]},"references-count":53,"alternative-id":["S0885230816303412"],"URL":"https:\/\/doi.org\/10.1016\/j.csl.2017.06.001","relation":{},"ISSN":["0885-2308"],"issn-type":[{"value":"0885-2308","type":"print"}],"subject":[],"published":{"date-parts":[[2017,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Scalable algorithms for unsupervised clustering of acoustic data for speech recognition","name":"articletitle","label":"Article Title"},{"value":"Computer Speech & Language","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.csl.2017.06.001","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2017 Elsevier Ltd. All rights reserved.","name":"copyright","label":"Copyright"}]}}