{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T13:56:23Z","timestamp":1726494983574},"reference-count":67,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T00:00:00Z","timestamp":1669852800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T00:00:00Z","timestamp":1669852800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T00:00:00Z","timestamp":1669852800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T00:00:00Z","timestamp":1669852800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T00:00:00Z","timestamp":1669852800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T00:00:00Z","timestamp":1669852800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Expert Systems with Applications"],"published-print":{"date-parts":[[2022,12]]},"DOI":"10.1016\/j.eswa.2022.118041","type":"journal-article","created":{"date-parts":[[2022,7,4]],"date-time":"2022-07-04T15:51:48Z","timestamp":1656949908000},"page":"118041","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":9,"special_numbering":"C","title":["Stacked auto-encoders based visual features for speech\/music classification"],"prefix":"10.1016","volume":"208","author":[{"ORCID":"http:\/\/orcid.org\/0000-0003-3947-7757","authenticated-orcid":false,"given":"Arvind","family":"Kumar","sequence":"first","affiliation":[]},{"given":"Sandeep Singh","family":"Solanki","sequence":"additional","affiliation":[]},{"given":"Mahesh","family":"Chandra","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.eswa.2022.118041_b1","doi-asserted-by":"crossref","first-page":"557","DOI":"10.1016\/j.eswa.2018.08.050","article-title":"Classification and diagnosis of cervical cancer with stacked autoencoder and softmax classification","volume":"115","author":"Adem","year":"2019","journal-title":"Expert Systems with Applications"},{"issue":"2","key":"10.1016\/j.eswa.2022.118041_b2","doi-asserted-by":"crossref","first-page":"98","DOI":"10.1049\/sil2.12015","article-title":"An efficient supervised framework for music mood recognition using autoencoder-based optimised support vector regression model","volume":"15","author":"Agarwal","year":"2021","journal-title":"IET Signal Processing"},{"issue":"2","key":"10.1016\/j.eswa.2022.118041_b3","first-page":"285","article-title":"Speech emotion recognition using scalogram based deep structure","volume":"33","author":"Aghajani","year":"2020","journal-title":"International Journal of Engineering"},{"key":"10.1016\/j.eswa.2022.118041_b4","series-title":"A Python implementation of Deep Belief Networks built upon NumPy and TensorFlow with scikit-learn compatibility","author":"Albertbup","year":"2017"},{"issue":"2","key":"10.1016\/j.eswa.2022.118041_b5","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s10916-019-1483-2","article-title":"Brain tumor detection by using stacked autoencoders in deep learning","volume":"44","author":"Amin","year":"2020","journal-title":"Journal of Medical Systems"},{"key":"10.1016\/j.eswa.2022.118041_b6","series-title":"2016 national conference on electrical, electronics and biomedical engineering","first-page":"499","article-title":"Classification and diagnosis of the parkinson disease by stacked autoencoder","author":"Badem","year":"2016"},{"key":"10.1016\/j.eswa.2022.118041_b7","series-title":"2017 25th signal processing and communications applications conference","first-page":"1","article-title":"Deep neural network based diagnosis system for melanoma skin cancer","author":"Ba\u015ft\u00fcrk","year":"2017"},{"issue":"4","key":"10.1016\/j.eswa.2022.118041_b8","doi-asserted-by":"crossref","first-page":"27","DOI":"10.1109\/45.329294","article-title":"Feed-forward neural networks","volume":"13","author":"Bebis","year":"1994","journal-title":"IEEE Potentials"},{"key":"10.1016\/j.eswa.2022.118041_b9","series-title":"Learning deep architectures for AI","author":"Bengio","year":"2009"},{"key":"10.1016\/j.eswa.2022.118041_b10","doi-asserted-by":"crossref","first-page":"1549","DOI":"10.1109\/TASLP.2020.2993152","article-title":"Speech\/music classification using features from spectral peaks","volume":"28","author":"Bhattacharjee","year":"2020","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"issue":"11","key":"10.1016\/j.eswa.2022.118041_b11","doi-asserted-by":"crossref","first-page":"15141","DOI":"10.1007\/s11042-018-6899-z","article-title":"Speech and music classification using spectrogram based statistical descriptors and extreme learning machine","volume":"78","author":"Birajdar","year":"2019","journal-title":"Multimedia Tools and Applications"},{"issue":"1","key":"10.1016\/j.eswa.2022.118041_b12","doi-asserted-by":"crossref","first-page":"329","DOI":"10.1007\/s12652-019-01303-4","article-title":"Speech\/music classification using visual and spectral chromagram features","volume":"11","author":"Birajdar","year":"2020","journal-title":"Journal of Ambient Intelligence and Humanized Computing"},{"issue":"4","key":"10.1016\/j.eswa.2022.118041_b13","doi-asserted-by":"crossref","first-page":"981","DOI":"10.1007\/s00521-018-3813-6","article-title":"Deep learning for music generation: challenges and directions","volume":"32","author":"Briot","year":"2020","journal-title":"Neural Computing and Applications"},{"key":"10.1016\/j.eswa.2022.118041_b14","series-title":"2011 IEEE international conference on acoustics, speech and signal processing","first-page":"5724","article-title":"Automatic language identification in music videos with low level audio and visual features","author":"Chandrasekhar","year":"2011"},{"key":"10.1016\/j.eswa.2022.118041_b15","first-page":"230","article-title":"Model selection for support vector machines","volume":"12","author":"Chapelle","year":"1999","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"1","key":"10.1016\/j.eswa.2022.118041_b16","doi-asserted-by":"crossref","first-page":"110","DOI":"10.3390\/rs10010110","article-title":"Classification of polsar images using multilayer autoencoders and a self-paced learning approach","volume":"10","author":"Chen","year":"2018","journal-title":"Remote Sensing"},{"issue":"4","key":"10.1016\/j.eswa.2022.118041_b17","doi-asserted-by":"crossref","first-page":"325","DOI":"10.1109\/TSMC.1976.5408784","article-title":"The distance-weighted k-nearest-neighbor rule","author":"Dudani","year":"1976","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics"},{"key":"10.1016\/j.eswa.2022.118041_b18","series-title":"2000 IEEE international conference on acoustics, speech, and signal processing. Proceedings (Cat. No. 00CH37100) (Vol. 4)","first-page":"2445","article-title":"Speech\/music discrimination for multimedia applications","author":"El-Maleh","year":"2000"},{"issue":"2","key":"10.1016\/j.eswa.2022.118041_b19","doi-asserted-by":"crossref","first-page":"1622","DOI":"10.2991\/ijcis.d.191216.001","article-title":"Music emotion recognition by using chroma spectrogram and deep visual features","volume":"12","author":"Er","year":"2019","journal-title":"International Journal of Computational Intelligence Systems"},{"issue":"7","key":"10.1016\/j.eswa.2022.118041_b20","doi-asserted-by":"crossref","first-page":"659","DOI":"10.1109\/LSP.2010.2049877","article-title":"Histogram equalization-based features for speech, music, and song discrimination","volume":"17","author":"Gallardo-Antol\u00edn","year":"2010","journal-title":"IEEE Signal Processing Letters"},{"key":"10.1016\/j.eswa.2022.118041_b21","series-title":"2014 IEEE international conference on computational intelligence and computing research","first-page":"1","article-title":"Data mining for meteorological applications: Decision trees for modeling rainfall prediction","author":"Geetha","year":"2014"},{"key":"10.1016\/j.eswa.2022.118041_b22","series-title":"2013 IEEE international conference on acoustics, speech and signal processing","first-page":"3377","article-title":"Extracting deep bottleneck features using stacked auto-encoders","author":"Gehring","year":"2013"},{"key":"10.1016\/j.eswa.2022.118041_b23","series-title":"2009 third international symposium on intelligent information technology application","first-page":"435","article-title":"Speech\/music classification using occurrence pattern of zcr and ste","volume":"3","author":"Ghosal","year":"2009"},{"key":"10.1016\/j.eswa.2022.118041_b24","series-title":"2011 second international conference on emerging applications of information technology","first-page":"49","article-title":"Speech\/music classification using empirical mode decomposition","author":"Ghosal","year":"2011"},{"key":"10.1016\/j.eswa.2022.118041_b25","series-title":"Interspeech","first-page":"3603","article-title":"Representation learning for speech emotion recognition","author":"Ghosh","year":"2016"},{"issue":"21","key":"10.1016\/j.eswa.2022.118041_b26","article-title":"Indian classical musical instrument classification using timbral features","volume":"33","author":"Gulhane","year":"2021","journal-title":"Concurrency Computations: Practice and Experience"},{"issue":"6","key":"10.1016\/j.eswa.2022.118041_b27","first-page":"31","article-title":"Representational learning with extreme learning machine for big data","volume":"28","author":"Huang","year":"2013","journal-title":"IEEE Intelligent Systems"},{"key":"10.1016\/j.eswa.2022.118041_b28","doi-asserted-by":"crossref","DOI":"10.1016\/j.artmed.2020.101809","article-title":"Scalogram based prediction model for respiratory disorders using optimized convolutional neural networks","volume":"103","author":"Jayalakshmy","year":"2020","journal-title":"Artificial Intelligence in Medicine"},{"issue":"1","key":"10.1016\/j.eswa.2022.118041_b29","doi-asserted-by":"crossref","first-page":"55","DOI":"10.1007\/s00530-006-0034-0","article-title":"Machine-learning based classification of speech and music","volume":"12","author":"Khan","year":"2006","journal-title":"Multimedia Systems"},{"key":"10.1016\/j.eswa.2022.118041_b30","series-title":"2015 annual IEEE India conference","first-page":"1","article-title":"Speech\/music classification using vocal tract constriction aspect of speech","author":"Khonglah","year":"2015"},{"key":"10.1016\/j.eswa.2022.118041_b31","series-title":"2016 IEEE region 10 conference","first-page":"2593","article-title":"Low frequency region of vocal tract information for speech\/music classification","author":"Khonglah","year":"2016"},{"key":"10.1016\/j.eswa.2022.118041_b32","doi-asserted-by":"crossref","first-page":"71","DOI":"10.1016\/j.dsp.2015.09.005","article-title":"Speech\/music classification using speech-specific features","volume":"48","author":"Khonglah","year":"2016","journal-title":"Digital Signal Processing"},{"issue":"4","key":"10.1016\/j.eswa.2022.118041_b33","doi-asserted-by":"crossref","first-page":"1023","DOI":"10.1007\/s10772-017-9464-7","article-title":"Clean speech\/speech with background music classification using HNGD spectrum","volume":"20","author":"Khonglah","year":"2017","journal-title":"International Journal of Speech Technology"},{"key":"10.1016\/j.eswa.2022.118041_b34","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1155\/2009\/628570","article-title":"Online speech\/music segmentation based on the variance mean of filter bank energy","volume":"2009","author":"Kos","year":"2009","journal-title":"EURASIP Journal on Advances in Signal Processing"},{"issue":"2","key":"10.1016\/j.eswa.2022.118041_b35","doi-asserted-by":"crossref","first-page":"659","DOI":"10.1016\/j.dsp.2012.10.008","article-title":"Acoustic classification and segmentation using modified spectral roll-off and variance-based features","volume":"23","author":"Kos","year":"2013","journal-title":"Digital Signal Processing"},{"key":"10.1016\/j.eswa.2022.118041_b36","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1155\/2009\/239892","article-title":"A decision-tree-based algorithm for speech\/music classification and segmentation","volume":"2009","author":"Lavner","year":"2009","journal-title":"EURASIP Journal on Audio, Speech, and Music Processing"},{"issue":"6","key":"10.1016\/j.eswa.2022.118041_b37","doi-asserted-by":"crossref","first-page":"1601","DOI":"10.1109\/TFUZZ.2020.2982618","article-title":"Event-triggered fuzzy bipartite tracking control for network systems based on distributed reduced-order observers","volume":"29","author":"Liang","year":"2020","journal-title":"IEEE Transactions on Fuzzy Systems"},{"key":"10.1016\/j.eswa.2022.118041_b38","doi-asserted-by":"crossref","DOI":"10.1109\/TNNLS.2021.3126320","article-title":"Adaptive neural network control for a class of nonlinear systems with function constraints on states","author":"Liu","year":"2021","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"10.1016\/j.eswa.2022.118041_b39","series-title":"Interspeech (Vol. 2013)","first-page":"436","article-title":"Speech enhancement based on deep denoising autoencoder","author":"Lu","year":"2013"},{"key":"10.1016\/j.eswa.2022.118041_b40","doi-asserted-by":"crossref","first-page":"108","DOI":"10.1016\/j.eswa.2015.09.018","article-title":"Combining visual and acoustic features for music genre classification","volume":"45","author":"Nanni","year":"2016","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2022.118041_b41","series-title":"2012 IEEE international conference on acoustics, speech and signal processing","first-page":"501","article-title":"Spectrogram based features selection using multiple kernel learning for speech\/music discrimination","author":"Nilufar","year":"2012"},{"key":"10.1016\/j.eswa.2022.118041_b42","doi-asserted-by":"crossref","first-page":"334","DOI":"10.1016\/j.eswa.2018.05.016","article-title":"Speech-music discrimination using deep visual feature extractors","volume":"114","author":"Papakostas","year":"2018","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2022.118041_b43","doi-asserted-by":"crossref","unstructured":"Pinquier,\u00a0J., Rouas,\u00a0J.-L., & Andr\u00e9-Obrecht,\u00a0R. (2002). Robust speech\/music classification in audio documents. In Seventh international conference on spoken language processing.","DOI":"10.1109\/ICASSP.2002.1004854"},{"issue":"3","key":"10.1016\/j.eswa.2022.118041_b44","doi-asserted-by":"crossref","first-page":"662","DOI":"10.1109\/JAS.2018.7511066","article-title":"Deep scalogram representations for acoustic scene classification","volume":"5","author":"Ren","year":"2018","journal-title":"IEEE\/CAA Journal of Automatica Sinica"},{"issue":"2","key":"10.1016\/j.eswa.2022.118041_b45","doi-asserted-by":"crossref","first-page":"253","DOI":"10.1007\/s11042-008-0228-x","article-title":"New speech\/music discrimination approach based on fundamental frequency estimation","volume":"41","author":"Ruiz-Reyes","year":"2009","journal-title":"Multimedia Tools and Applications"},{"key":"10.1016\/j.eswa.2022.118041_b46","series-title":"1996 IEEE international conference on acoustics, speech, and signal processing conference proceedings (Vol. 2)","first-page":"993","article-title":"Real-time discrimination of broadcast speech\/music","volume":"2","author":"Saunders","year":"1996"},{"key":"10.1016\/j.eswa.2022.118041_b47","series-title":"Neural approaches to dynamics of signal exchanges","first-page":"11","article-title":"Music genre classification using stacked auto-encoders","author":"Scarpiniti","year":"2020"},{"key":"10.1016\/j.eswa.2022.118041_b48","series-title":"1997 IEEE international conference on acoustics, speech, and signal processing (Vol. 2)","first-page":"1331","article-title":"Construction and evaluation of a robust multifeature speech\/music discriminator","author":"Scheirer","year":"1997"},{"key":"10.1016\/j.eswa.2022.118041_b49","series-title":"2014 IEEE international conference on acoustics, speech and signal processing","first-page":"2489","article-title":"Music tonality features for speech\/music discrimination","author":"Sell","year":"2014"},{"key":"10.1016\/j.eswa.2022.118041_b50","series-title":"Practical convolutional neural networks: implement advanced deep learning models using Python","author":"Sewak","year":"2018"},{"key":"10.1016\/j.eswa.2022.118041_b51","series-title":"Proc. of the 10th international conference on digital audio effects","article-title":"Automatic music detection in television productions","author":"Seyerlehner","year":"2007"},{"key":"10.1016\/j.eswa.2022.118041_b52","series-title":"International symposium on visual computing","first-page":"45","article-title":"Stacked autoencoders for medical image search","author":"Sharma","year":"2016"},{"issue":"2","key":"10.1016\/j.eswa.2022.118041_b53","doi-asserted-by":"crossref","first-page":"415","DOI":"10.1007\/s11042-009-0416-3","article-title":"Improvement to speech-music discrimination using sinusoidal model based features","volume":"50","author":"Shirazi","year":"2010","journal-title":"Multimedia Tools and Applications"},{"key":"10.1016\/j.eswa.2022.118041_b54","series-title":"The music-speech corpus","author":"Slaney","year":"1997"},{"key":"10.1016\/j.eswa.2022.118041_b55","series-title":"2014 36th annual international conference of the IEEE engineering in medicine and biology society","first-page":"4184","article-title":"Feature extraction with stacked autoencoders for epileptic seizure detection","author":"Supratak","year":"2014"},{"key":"10.1016\/j.eswa.2022.118041_b56","series-title":"2008 Tenth IEEE international symposium on multimedia","first-page":"53","article-title":"On the discrimination of speech\/music using a time series regularity","author":"Swe","year":"2008"},{"key":"10.1016\/j.eswa.2022.118041_b57","series-title":"2006 IEEE international conference on acoustics speech and signal processing proceedings (Vol. 5)","first-page":"V","article-title":"A speech-music discriminator using HILN model based features","author":"Thoshkahna","year":"2006"},{"key":"10.1016\/j.eswa.2022.118041_b58","doi-asserted-by":"crossref","unstructured":"Tieleman,\u00a0T. (2008). Training restricted Boltzmann machines using approximations to the likelihood gradient. In Proceedings of the 25th international conference on machine learning (pp. 1064\u20131071).","DOI":"10.1145\/1390156.1390290"},{"key":"10.1016\/j.eswa.2022.118041_b59","article-title":"GTZAN Music-speech corpus","author":"Tzanetakis","year":"2002","journal-title":"IEEE Transactions on Audio and Speech Processing"},{"key":"10.1016\/j.eswa.2022.118041_b60","doi-asserted-by":"crossref","first-page":"302","DOI":"10.3389\/fnins.2017.00302","article-title":"Stacked autoencoders for the P300 component detection","volume":"11","author":"Va\u0159eka","year":"2017","journal-title":"Frontiers in Neuroscience"},{"key":"10.1016\/j.eswa.2022.118041_b61","series-title":"Fourth international conference on information, communications and signal processing, 2003 and the fourth Pacific rim conference on multimedia. Proceedings of the 2003 joint (Vol. 3)","first-page":"1325","article-title":"A fast and robust speech\/music discrimination approach","author":"Wang","year":"2003"},{"key":"10.1016\/j.eswa.2022.118041_b62","series-title":"On the origin of deep learning","author":"Wang","year":"2017"},{"key":"10.1016\/j.eswa.2022.118041_b63","series-title":"2008 IEEE international conference on acoustics, speech and signal processing","first-page":"2033","article-title":"Real-time speech\/music classification with a hierarchical oblique decision tree","author":"Wang","year":"2008"},{"issue":"1","key":"10.1016\/j.eswa.2022.118041_b64","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/2801127","article-title":"Combining acoustic and multilevel visual features for music genre classification","volume":"12","author":"Wu","year":"2015","journal-title":"ACM Transactions on Multimedia Computing, Communications, and Applications (TOMM)"},{"key":"10.1016\/j.eswa.2022.118041_b65","series-title":"2018 international conference on machine learning and cybernetics (Vol. 1)","first-page":"294","article-title":"Stacked autoencoder networks based speaker recognition","author":"Zeng","year":"2018"},{"key":"10.1016\/j.eswa.2022.118041_b66","series-title":"Advanced signal processing algorithms, architectures, and implementations VIII (Vol. 3461)","first-page":"432","article-title":"Content-based classification and retrieval of audio","author":"Zhang","year":"1998"},{"key":"10.1016\/j.eswa.2022.118041_b67","series-title":"2016 intl IEEE conferences on ubiquitous intelligence & computing, advanced and trusted computing, scalable computing and communications, cloud and big data computing, internet of people, and smart world congress (UIC\/ATC\/ScalCom\/CBDCom\/IoP\/SmartWorld)","first-page":"841","article-title":"Deep learning based affective model for speech emotion recognition","author":"Zhou","year":"2016"}],"container-title":["Expert Systems with Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417422012532?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417422012532?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2023,3,12]],"date-time":"2023-03-12T10:20:03Z","timestamp":1678616403000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0957417422012532"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,12]]},"references-count":67,"alternative-id":["S0957417422012532"],"URL":"https:\/\/doi.org\/10.1016\/j.eswa.2022.118041","relation":{},"ISSN":["0957-4174"],"issn-type":[{"value":"0957-4174","type":"print"}],"subject":[],"published":{"date-parts":[[2022,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Stacked auto-encoders based visual features for speech\/music classification","name":"articletitle","label":"Article Title"},{"value":"Expert Systems with Applications","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.eswa.2022.118041","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2022 Elsevier Ltd. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"118041"}}