{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T03:00:37Z","timestamp":1740106837539,"version":"3.37.3"},"reference-count":40,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2019,11,1]],"date-time":"2019-11-01T00:00:00Z","timestamp":1572566400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2019,5,31]],"date-time":"2019-05-31T00:00:00Z","timestamp":1559260800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100004298","name":"SECOM","doi-asserted-by":"publisher","award":["18J22090"],"id":[{"id":"10.13039\/501100004298","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Speech & Language"],"published-print":{"date-parts":[[2019,11]]},"DOI":"10.1016\/j.csl.2019.05.008","type":"journal-article","created":{"date-parts":[[2019,6,1]],"date-time":"2019-06-01T10:30:17Z","timestamp":1559385017000},"page":"347-363","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":8,"special_numbering":"C","title":["Vocoder-free text-to-speech synthesis incorporating generative adversarial networks using low-\/multi-frequency STFT amplitude spectra"],"prefix":"10.1016","volume":"58","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7967-2613","authenticated-orcid":false,"given":"Yuki","family":"Saito","sequence":"first","affiliation":[]},{"given":"Shinnosuke","family":"Takamichi","sequence":"additional","affiliation":[]},{"given":"Hiroshi","family":"Saruwatari","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.csl.2019.05.008_bib0001","series-title":"Proceedings of the INTERSPEECH, Dresden, Germany","first-page":"2097","article-title":"Robust deep feature for spoofing detection the SJTU system for ASVspoof 2015 challenge","author":"Chen","year":"2015"},{"key":"10.1016\/j.csl.2019.05.008_bib0002","series-title":"Proceedings of the ICML, Sydney, Australia","first-page":"933","article-title":"Language modeling with gated convolutional networks","author":"Dauphin","year":"2017"},{"key":"10.1016\/j.csl.2019.05.008_bib0003","first-page":"2121","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"Duchi","year":"2011","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.csl.2019.05.008_bib0004","series-title":"Proceedings of the AISTATS, Lauderdale, USA","first-page":"315","article-title":"Deep sparse rectifier neural networks","author":"Glorot","year":"2011"},{"key":"10.1016\/j.csl.2019.05.008_bib0005","series-title":"Proceedings of the NIPS","first-page":"2672","article-title":"Generative adversarial nets","author":"Goodfellow","year":"2014"},{"issue":"2","key":"10.1016\/j.csl.2019.05.008_bib0006","doi-asserted-by":"crossref","first-page":"236","DOI":"10.1109\/TASSP.1984.1164317","article-title":"Signal estimation from modified short-time fourier transform","volume":"32","author":"Griffin","year":"1984","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.csl.2019.05.008_bib0007","series-title":"Proceedings of the CVPR, Las Vegas, U.S.A.","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"issue":"8","key":"10.1016\/j.csl.2019.05.008_bib0008","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","article-title":"Long short-term memory","volume":"9","author":"Hochreiter","year":"1997","journal-title":"Neural Comput."},{"key":"10.1016\/j.csl.2019.05.008_bib0009","series-title":"Proceedings of the ICASSP, Calgary, Canada","first-page":"5679","article-title":"Speech waveform synthesis from MFCC sequences with generative adversarial networks","author":"Juvela","year":"2018"},{"key":"10.1016\/j.csl.2019.05.008_bib0010","series-title":"Proceedings of the ICASSP, Brighton, U.K.","first-page":"96","article-title":"Joint separation and dereverberation of reverberant mixtures with multichannel variational autoencoder","author":"Inoue","year":"2018"},{"key":"10.1016\/j.csl.2019.05.008_bib0011","series-title":"Proceedings of the EUSIPCO, Rome, Italy","first-page":"2114","article-title":"CycleGAN-VC: non-parallel voice conversion using cycle-consistent adversarial networks","author":"Kaneko","year":"2018"},{"key":"10.1016\/j.csl.2019.05.008_bib0012","series-title":"Proceedings of the ICASSP, New Orleans, USA","first-page":"4910","article-title":"Generative adversarial network-based postfilter for statistical parametric speech synthesis","author":"Kaneko","year":"2017"},{"key":"10.1016\/j.csl.2019.05.008_bib0013","series-title":"Proceedings of the INTERSPEECH, Stockholm, Sweden","first-page":"3389","article-title":"Generative adversarial network-based postfilter for STFT spectrograms","author":"Kaneko","year":"2017"},{"issue":"3\u20134","key":"10.1016\/j.csl.2019.05.008_bib0014","doi-asserted-by":"crossref","first-page":"187","DOI":"10.1016\/S0167-6393(98)00085-5","article-title":"Restructuring speech representations using a pitch-adaptive time-frequency smoothing and an instantaneous-frequency-based F0 extraction: Possible role of a repetitive structure in sounds","volume":"27","author":"Kawahara","year":"1999","journal-title":"Speech Commun."},{"key":"10.1016\/j.csl.2019.05.008_bib0015","series-title":"Proc. ICASSP, Shanghai, China","first-page":"5595","article-title":"Modulation spectrum compensation for HMM-based speech synthesis using line spectral pairs","author":"Ling","year":"2016"},{"key":"10.1016\/j.csl.2019.05.008_bib0016","series-title":"Proceedings of the ICLR Workshop, Vancouver, Canada","article-title":"Synthesizing audio with GANs","author":"McAuley","year":"2018"},{"key":"10.1016\/j.csl.2019.05.008_bib0017","series-title":"Proceedings of the ICLR, Toulon, France","article-title":"SampleRNN: an unconditional end-to-end neural audio generation model","author":"Mehri","year":"2017"},{"key":"10.1016\/j.csl.2019.05.008_bib0018","unstructured":"Mirza, M., Osindero, S., 2014. Conditional generative adversarial networks. arXiv:1411.1784."},{"issue":"7","key":"10.1016\/j.csl.2019.05.008_bib0019","doi-asserted-by":"crossref","first-page":"1877","DOI":"10.1587\/transinf.2015EDP7457","article-title":"WORLD: a vocoder-based high-quality speech synthesis system for real-time applications","volume":"E99-D","author":"Morise","year":"2016","journal-title":"IEICE Transactions on Information and Systems"},{"key":"10.1016\/j.csl.2019.05.008_bib0020","unstructured":"Oord, A., Dieleman, S., Zen, H., Simonyan, K., Vinyals, O., Graves, A., Kalchbrenner, N., Senior, A., Kavukcuoglu, K., 2016. WaveNet: a generative model for raw audio. arXiv:1609.03499."},{"key":"10.1016\/j.csl.2019.05.008_bib0021","unstructured":"Oord, A., Li, Y., Babuschkin, I., Simonyan, K., Vinyals, O., Kavukcuoglu, K., Driessche, G., Lockhart, E., Cobo, L., Stimberg, F., Casagrande, N., Grewe, D., Noury, S., Dieleman, S., Elsen, E., Kalchbrenner, N., Zen, H., Graves, A., King, H., Walters, T., Belov, D., Hassabis, D., 2017. Parallel WaveNet: fast high-fidelity speech synthesis. arXiv:1711.10433."},{"key":"10.1016\/j.csl.2019.05.008_bib0022","series-title":"Proceedings of the ICASSP, New York, USA","first-page":"679","article-title":"Speech synthesis by rule using an optimal selection of non-uniform synthesis units","author":"Sagisaka","year":"1988"},{"key":"10.1016\/j.csl.2019.05.008_bib0023","series-title":"Proceedings of the INTERSPEECH, Dresden, Germany","first-page":"2087","article-title":"A comparison of features for synthetic speech detection","author":"Sahidullah","year":"2015"},{"key":"10.1016\/j.csl.2019.05.008_bib0024","series-title":"Proceedings of the ICASSP, New Orleans, USA","first-page":"4900","article-title":"Training algorithm to deceive anti-spoofing verification for DNN-based speech synthesis","author":"Saito","year":"2017"},{"issue":"1","key":"10.1016\/j.csl.2019.05.008_bib0025","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1109\/TASLP.2017.2761547","article-title":"Statistical parametric speech synthesis incorporating generative adversarial networks","volume":"26","author":"Saito","year":"2018","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.csl.2019.05.008_bib0026","series-title":"Proceedings of the ICASSP, Calgary, Canada","first-page":"5299","article-title":"Text-to-speech synthesis using STFT spectra based on low-\/multi-resolution generative adversarial networks","author":"Saito","year":"2018"},{"key":"10.1016\/j.csl.2019.05.008_bib0027","series-title":"Proceedings of the ICLR, Toulon, France","article-title":"Char2Wav: end-to-end speech synthesis","author":"Sotelo","year":"2017"},{"key":"10.1016\/j.csl.2019.05.008_bib0028","series-title":"Proceedings of the INTERSPEECH, Stockholm, Sweden","first-page":"1128","article-title":"Direct modeling of frequency spectra and waveform generation based on phase recovery for DNN-based speech synthesis","author":"Takaki","year":"2017"},{"issue":"4","key":"10.1016\/j.csl.2019.05.008_bib0029","doi-asserted-by":"crossref","first-page":"755","DOI":"10.1109\/TASLP.2016.2522655","article-title":"Postfilters to modify the modulation spectrum for statistical parametric speech synthesis","volume":"24","author":"Takamichi","year":"2016","journal-title":"IEEE\/ACM Trans. Audio, Speech, and Lang. Process."},{"issue":"8","key":"10.1016\/j.csl.2019.05.008_bib0030","doi-asserted-by":"crossref","first-page":"2222","DOI":"10.1109\/TASL.2007.907344","article-title":"Voice conversion based on maximum likelihood estimation of spectral parameter trajectory","volume":"15","author":"Toda","year":"2007","journal-title":"IEEE Trans. Audio, Speech, and Lang. Process."},{"key":"10.1016\/j.csl.2019.05.008_bib0031","series-title":"Proceedings of the INTERSPEECH, California, USA","first-page":"1632","article-title":"The voice conversion challenge 2016","author":"Toda","year":"2016"},{"issue":"5","key":"10.1016\/j.csl.2019.05.008_bib0032","doi-asserted-by":"crossref","first-page":"1234","DOI":"10.1109\/JPROC.2013.2251852","article-title":"Speech synthesis based on hidden Markov models","volume":"101","author":"Tokuda","year":"2013","journal-title":"Proc. IEEE"},{"issue":"10","key":"10.1016\/j.csl.2019.05.008_bib0033","doi-asserted-by":"crossref","first-page":"1702","DOI":"10.1109\/TASLP.2018.2842159","article-title":"Supervised speech separation based on deep learning: an overview","volume":"26","author":"Wang","year":"2018","journal-title":"IEEE\/ACM Trans. Audio, Speech, and Lang. Process."},{"issue":"7","key":"10.1016\/j.csl.2019.05.008_bib0034","doi-asserted-by":"crossref","first-page":"1255","DOI":"10.1109\/TASLP.2016.2551865","article-title":"Improving trajectory modeling for DNN-based speech synthesis by using stacked bottleneck features and minimum trajectory error training","volume":"24","author":"Wu","year":"2016","journal-title":"IEEE\/ACM Trans. Audio, Speech, and Lang. Process."},{"issue":"4","key":"10.1016\/j.csl.2019.05.008_bib0035","doi-asserted-by":"crossref","first-page":"768","DOI":"10.1109\/TASLP.2016.2526653","article-title":"Anti-spoofing for text-independent speaker verification: An initial database, comparison of countermeasures, and human performance","volume":"24","author":"Wu","year":"2016","journal-title":"IEEE\/ACM Trans. Audio, Speech, and Lang. Process."},{"issue":"1","key":"10.1016\/j.csl.2019.05.008_bib0036","doi-asserted-by":"crossref","first-page":"7","DOI":"10.1109\/TASLP.2014.2364452","article-title":"A regression approach to speech enhancement based on deep neural networks","volume":"23","author":"Xu","year":"2015","journal-title":"IEEE\/ACM Trans. Audio, Speech, and Lang. Process."},{"key":"10.1016\/j.csl.2019.05.008_bib0037","series-title":"Proceedings of the ICASSP, Brisbane, Australia","first-page":"4470","article-title":"Unidirectional long short-term memory recurrent neural network with recurrent output layer for low-latency speech synthesis","author":"Zen","year":"2015"},{"key":"10.1016\/j.csl.2019.05.008_bib0038","series-title":"Proceedings of the ICASSP, Vancouver, Canada","first-page":"7962","article-title":"Statistical parametric speech synthesis using deep neural networks","author":"Zen","year":"2013"},{"key":"10.1016\/j.csl.2019.05.008_bib0039","series-title":"Proceedings of the INTERSPEECH, Lisbon, Portugal","first-page":"93","article-title":"An overview of nitech HMM-based speech synthesis system for blizzard challenge 2005","author":"Zen","year":"2005"},{"issue":"11","key":"10.1016\/j.csl.2019.05.008_bib0040","doi-asserted-by":"crossref","first-page":"1039","DOI":"10.1016\/j.specom.2009.04.004","article-title":"Statistical parametric speech synthesis","volume":"51","author":"Zen","year":"2009","journal-title":"Speech Commun."}],"container-title":["Computer Speech & Language"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0885230818303413?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0885230818303413?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2019,9,24]],"date-time":"2019-09-24T15:25:39Z","timestamp":1569338739000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0885230818303413"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,11]]},"references-count":40,"alternative-id":["S0885230818303413"],"URL":"https:\/\/doi.org\/10.1016\/j.csl.2019.05.008","relation":{},"ISSN":["0885-2308"],"issn-type":[{"type":"print","value":"0885-2308"}],"subject":[],"published":{"date-parts":[[2019,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Vocoder-free text-to-speech synthesis incorporating generative adversarial networks using low-\/multi-frequency STFT amplitude spectra","name":"articletitle","label":"Article Title"},{"value":"Computer Speech & Language","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.csl.2019.05.008","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2019 The Authors. Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}]}}