{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,7,2]],"date-time":"2024-07-02T18:04:43Z","timestamp":1719943483434},"reference-count":77,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Speech & Language"],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1016\/j.csl.2023.101520","type":"journal-article","created":{"date-parts":[[2023,4,3]],"date-time":"2023-04-03T21:11:46Z","timestamp":1680556306000},"page":"101520","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["Deep learning-based speaker-adaptive postfiltering with limited adaptation data for embedded text-to-speech synthesis systems"],"prefix":"10.1016","volume":"81","author":[{"ORCID":"http:\/\/orcid.org\/0000-0001-7553-7003","authenticated-orcid":false,"given":"Eray","family":"Eren","sequence":"first","affiliation":[]},{"given":"Cenk","family":"Demiroglu","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.csl.2023.101520_b1","series-title":"SPECOM","first-page":"282","article-title":"Deep recurrent neural networks in speech synthesis using a continuous vocoder","volume":"vol.10458","author":"Al-Radhi","year":"2017"},{"key":"10.1016\/j.csl.2023.101520_b2","series-title":"Interspeech","first-page":"434","article-title":"Time-domain envelope modulating the noise component of excitation in a continuous residual-based vocoder for statistical parametric speech synthesis","author":"Al-Radhi","year":"2017"},{"key":"10.1016\/j.csl.2023.101520_b3","series-title":"Proc. Int. Conf. Mach. Learn.","first-page":"195","article-title":"Deep voice: Real-time neural text-to-speech","volume":"vol.70","author":"Arik","year":"2017"},{"key":"10.1016\/j.csl.2023.101520_b4","series-title":"Proc. Int. Conf. Learn. Repr.","article-title":"High fidelity speech synthesis with adversarial networks","author":"Binkowski","year":"2020"},{"issue":"11","key":"10.1016\/j.csl.2023.101520_b5","doi-asserted-by":"crossref","first-page":"2003","DOI":"10.1109\/TASLP.2015.2461448","article-title":"A deep generative architecture for postfiltering in statistical parametric speech synthesis","volume":"23","author":"Chen","year":"2015","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.csl.2023.101520_b6","series-title":"ICLR","article-title":"AdaSpeech: Adaptive text to speech for custom voice","author":"Chen","year":"2021"},{"key":"10.1016\/j.csl.2023.101520_b7","series-title":"ICASSP","first-page":"8588","article-title":"Investigating on incorporating pretrained and learnable speaker representations for multi-speaker multi-style text-to-speech","author":"Chien","year":"2021"},{"key":"10.1016\/j.csl.2023.101520_b8","series-title":"Interspeech","first-page":"2007","article-title":"Attentron: Few-shot text-to-speech utilizing attention-based variable-length embedding","author":"Choi","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b9","series-title":"ICASSP","first-page":"6184","article-title":"Zero-shot multi-speaker text-to-speech with state-of-the-art neural speaker embeddings","author":"Cooper","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b10","series-title":"Interspeech","first-page":"3979","article-title":"Can speaker augmentation improve multi-speaker end-to-end TTS?","author":"Cooper","year":"2020"},{"issue":"1","key":"10.1016\/j.csl.2023.101520_b11","doi-asserted-by":"crossref","first-page":"12","DOI":"10.3390\/biomimetics6010012","article-title":"Discriminative multi-stream postfilters based on deep learning for enhancing statistical parametric speech synthesis","volume":"6","author":"Coto-Jim\u00e9nez","year":"2021","journal-title":"Biomimetics"},{"issue":"4","key":"10.1016\/j.csl.2023.101520_b12","doi-asserted-by":"crossref","first-page":"788","DOI":"10.1109\/TASL.2010.2064307","article-title":"Front-end factor analysis for speaker verification","volume":"19","author":"Dehak","year":"2011","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.csl.2023.101520_b13","series-title":"INTERSPEECH","first-page":"1964","article-title":"TTS synthesis with bidirectional LSTM based recurrent neural networks","author":"Fan","year":"2014"},{"key":"10.1016\/j.csl.2023.101520_b14","series-title":"NeurIPS","first-page":"2962","article-title":"Deep voice 2: Multi-speaker neural text-to-speech","author":"Gibiansky","year":"2017"},{"key":"10.1016\/j.csl.2023.101520_b15","series-title":"NeurIPS","first-page":"2672","article-title":"Generative adversarial nets","author":"Goodfellow","year":"2014"},{"key":"10.1016\/j.csl.2023.101520_b16","series-title":"Long short-term memory","author":"Hochreiter","year":"1995"},{"key":"10.1016\/j.csl.2023.101520_b17","series-title":"Interspeech","first-page":"210","article-title":"WG-WaveNet: Real-time high-fidelity speech synthesis without GPU","author":"Hsu","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b18","series-title":"ICLR","article-title":"Hierarchical generative modeling for controllable speech synthesis","author":"Hsu","year":"2019"},{"key":"10.1016\/j.csl.2023.101520_b19","series-title":"NeurIPS","first-page":"4485","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","author":"Jia","year":"2018"},{"key":"10.1016\/j.csl.2023.101520_b20","series-title":"Proc. Int. Conf. Mach. Learn.","first-page":"2415","article-title":"Efficient neural audio synthesis","volume":"vol.80","author":"Kalchbrenner","year":"2018"},{"key":"10.1016\/j.csl.2023.101520_b21","series-title":"ICASSP","first-page":"4910","article-title":"Generative adversarial network-based postfilter for statistical parametric speech synthesis","author":"Kaneko","year":"2017"},{"key":"10.1016\/j.csl.2023.101520_b22","series-title":"Interspeech","first-page":"3389","article-title":"Generative adversarial network-based postfilter for STFT spectrograms","author":"Kaneko","year":"2017"},{"issue":"6","key":"10.1016\/j.csl.2023.101520_b23","doi-asserted-by":"crossref","first-page":"349","DOI":"10.1250\/ast.27.349","article-title":"STRAIGHT, exploitation of the other aspect of VOCODER: Perceptually isomorphic decomposition of speech sounds","volume":"27","author":"Kawahara","year":"2006","journal-title":"Acoust. Sci. Technol."},{"key":"10.1016\/j.csl.2023.101520_b24","series-title":"NeurIPS","article-title":"Glow-TTS: A generative flow for text-to-speech via monotonic alignment search","author":"Kim","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b25","series-title":"ICLR","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2015"},{"key":"10.1016\/j.csl.2023.101520_b26","series-title":"NeurIPS","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","author":"Kong","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b27","series-title":"NeurIPS","first-page":"14881","article-title":"MelGAN: Generative adversarial networks for conditional waveform synthesis","author":"Kumar","year":"2019"},{"issue":"4","key":"10.1016\/j.csl.2023.101520_b28","doi-asserted-by":"crossref","first-page":"541","DOI":"10.1162\/neco.1989.1.4.541","article-title":"Backpropagation applied to handwritten zip code recognition","volume":"1","author":"LeCun","year":"1989","journal-title":"Neural Comput."},{"key":"10.1016\/j.csl.2023.101520_b29","series-title":"Proc. AAAI","first-page":"6706","article-title":"Neural speech synthesis with transformer network","author":"Li","year":"2019"},{"key":"10.1016\/j.csl.2023.101520_b30","series-title":"IEEE International Conference on Intelligent Systems","first-page":"437","article-title":"An evaluation of postfiltering for deep learning based speech synthesis with limited data","author":"Lorincz","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b31","series-title":"ICASSP","first-page":"7209","article-title":"Flow-TTS: A non-autoregressive network for text to speech based on flow","author":"Miao","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b32","series-title":"ICML","first-page":"7748","article-title":"Meta-StyleSpeech : Multi-speaker adaptive text-to-speech generation","volume":"vol.139","author":"Min","year":"2021"},{"key":"10.1016\/j.csl.2023.101520_b33","series-title":"Conditional generative adversarial nets","author":"Mirza","year":"2014"},{"issue":"7","key":"10.1016\/j.csl.2023.101520_b34","doi-asserted-by":"crossref","first-page":"1877","DOI":"10.1587\/transinf.2015EDP7457","article-title":"WORLD: A vocoder-based high-quality speech synthesis system for real-time applications","volume":"99-D","author":"Morise","year":"2016","journal-title":"IEICE Trans. Inf. Syst."},{"key":"10.1016\/j.csl.2023.101520_b35","series-title":"ICASSP","first-page":"7639","article-title":"BOFFIN TTS: Few-shot speaker adaptation by Bayesian optimization","author":"Moss","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b36","series-title":"Interspeech","first-page":"235","article-title":"Speaker conditional WaveRNN: Towards universal neural vocoder for unseen speaker and recording conditions","author":"Paul","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b37","series-title":"ICLR","article-title":"Deep voice 3: Scaling text-to-speech with convolutional sequence learning","author":"Ping","year":"2018"},{"key":"10.1016\/j.csl.2023.101520_b38","series-title":"ICASSP","first-page":"3829","article-title":"On the training aspects of deep neural network (DNN) for parametric TTS synthesis","author":"Qian","year":"2014"},{"key":"10.1016\/j.csl.2023.101520_b39","series-title":"ICLR","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","author":"Ren","year":"2021"},{"key":"10.1016\/j.csl.2023.101520_b40","series-title":"NeurIPS","first-page":"3165","article-title":"FastSpeech: Fast, robust and controllable text to speech","author":"Ren","year":"2019"},{"issue":"1","key":"10.1016\/j.csl.2023.101520_b41","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1109\/TASLP.2017.2761547","article-title":"Statistical parametric speech synthesis incorporating generative adversarial networks","volume":"26","author":"Saito","year":"2018","journal-title":"IEEE ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.csl.2023.101520_b42","series-title":"ICASSP","first-page":"4779","article-title":"Natural TTS synthesis by conditioning wavenet on MEL spectrogram predictions","author":"Shen","year":"2018"},{"key":"10.1016\/j.csl.2023.101520_b43","series-title":"INTERSPEECH","first-page":"1436","article-title":"Ways to implement global variance in statistical speech synthesis","author":"Sil\u00e9n","year":"2012"},{"key":"10.1016\/j.csl.2023.101520_b44","series-title":"ICLR","article-title":"Char2Wav: End-to-end speech synthesis","author":"Sotelo","year":"2017"},{"key":"10.1016\/j.csl.2023.101520_b45","series-title":"ICASSP","first-page":"4784","article-title":"Efficiently trainable text-to-speech system based on deep convolutional networks with guided attention","author":"Tachibana","year":"2018"},{"key":"10.1016\/j.csl.2023.101520_b46","series-title":"The 9th ISCA Speech Synthesis Workshop","first-page":"153","article-title":"Speaker adaptation of various components in deep neural network based speech synthesis","author":"Takaki","year":"2016"},{"key":"10.1016\/j.csl.2023.101520_b47","series-title":"ICASSP","first-page":"290","article-title":"A postfilter to modify the modulation spectrum in HMM-based speech synthesis","author":"Takamichi","year":"2014"},{"issue":"188","key":"10.1016\/j.csl.2023.101520_b48","first-page":"1","article-title":"WaveCycleGAN2: Neural waveform post-filter for high-quality speech generation","volume":"119","author":"Tanaka","year":"2019","journal-title":"IEICE Tech. Rep. IEICE Tech. Rep."},{"key":"10.1016\/j.csl.2023.101520_b49","series-title":"Spoken Language Technology Workshop","first-page":"632","article-title":"Synthetic-to-natural speech waveform conversion using cycle-consistent adversarial networks","author":"Tanaka","year":"2018"},{"issue":"5","key":"10.1016\/j.csl.2023.101520_b50","doi-asserted-by":"crossref","first-page":"816","DOI":"10.1093\/ietisy\/e90-d.5.816","article-title":"A speech parameter generation algorithm considering global variance for HMM-based speech synthesis","volume":"90-D","author":"Toda","year":"2007","journal-title":"IEICE Trans. Inf. Syst."},{"key":"10.1016\/j.csl.2023.101520_b51","series-title":"ICASSP","first-page":"4025","article-title":"Trajectory training considering global variance for HMM-based speech synthesis","author":"Toda","year":"2009"},{"key":"10.1016\/j.csl.2023.101520_b52","series-title":"ICASSP","first-page":"1315","article-title":"Speech parameter generation algorithms for HMM-based speech synthesis","author":"Tokuda","year":"2000"},{"issue":"5","key":"10.1016\/j.csl.2023.101520_b53","doi-asserted-by":"crossref","first-page":"1234","DOI":"10.1109\/JPROC.2013.2251852","article-title":"Speech synthesis based on hidden Markov models","volume":"101","author":"Tokuda","year":"2013","journal-title":"Proc. IEEE"},{"key":"10.1016\/j.csl.2023.101520_b54","series-title":"ICASSP, Vol. 3","first-page":"1315","article-title":"Speech parameter generation algorithms for HMM-based speech synthesis","author":"Tokuda","year":"2000"},{"key":"10.1016\/j.csl.2023.101520_b55","series-title":"ICASSP","first-page":"5891","article-title":"LPCNET: Improving neural speech synthesis through linear prediction","author":"Valin","year":"2019"},{"key":"10.1016\/j.csl.2023.101520_b56","series-title":"ICLR","article-title":"Flowtron: An autoregressive flow-based generative network for text-to-speech synthesis","author":"Valle","year":"2021"},{"key":"10.1016\/j.csl.2023.101520_b57","unstructured":"van den Oord, A., Dieleman, S., Zen, H., Simonyan, K., Vinyals, O., Graves, A., Kalchbrenner, N., Senior, A.W., Kavukcuoglu, K., 2016. WaveNet: A Generative Model for Raw Audio. In: The 9th ISCA Speech Synthesis Workshop. p. 125."},{"key":"10.1016\/j.csl.2023.101520_b58","series-title":"NeurIPS","first-page":"5998","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.csl.2023.101520_b59","doi-asserted-by":"crossref","first-page":"4006","DOI":"10.21437\/Interspeech.2017-1452","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"Wang","year":"2017","journal-title":"Interspeech"},{"key":"10.1016\/j.csl.2023.101520_b60","series-title":"Interspeech","first-page":"3989","article-title":"Bi-level speaker supervision for one-shot speech synthesis","author":"Wang","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b61","series-title":"Interspeech","first-page":"3540","article-title":"A cyclical post-filtering approach to mismatch refinement of neural vocoder for text-to-speech systems","author":"Wu","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b62","series-title":"ISCA Speech Synthesis Workshop","first-page":"202","article-title":"Merlin: An open source neural network speech synthesis system","author":"Wu","year":"2016"},{"key":"10.1016\/j.csl.2023.101520_b63","series-title":"ICASSP","first-page":"6199","article-title":"Parallel wavegan: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram","author":"Yamamoto","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b64","series-title":"ICASSP","first-page":"6613","article-title":"Adaspeech 2: Adaptive text to speech with untranscribed data","author":"Yan","year":"2021"},{"key":"10.1016\/j.csl.2023.101520_b65","series-title":"Interspeech","first-page":"3171","article-title":"Towards universal text-to-speech","author":"Yang","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b66","series-title":"Simultaneous modeling of phonetic and prosodic parameters, and characteristic conversion for HMM-based text-to-speech systems","author":"Yoshimura","year":"2002"},{"key":"10.1016\/j.csl.2023.101520_b67","doi-asserted-by":"crossref","unstructured":"Yoshimura, T., Tokuda, K., Masuko, T., Kobayashi, T., Kitamura, T., 1999. Simultaneous modeling of spectrum, pitch and duration in HMM-based speech synthesis. In: Sixth European Conference on Speech Communication and Technology.","DOI":"10.21437\/Eurospeech.1999-513"},{"key":"10.1016\/j.csl.2023.101520_b68","series-title":"Interspeech","article-title":"Phase-space representation of speech","author":"Yu","year":"2004"},{"key":"10.1016\/j.csl.2023.101520_b69","unstructured":"Zen, H., 2015. Acoustic modeling in statistical parametric speech synthesis-from HMM to LSTM-RNN. In: Proc. MLSLP."},{"key":"10.1016\/j.csl.2023.101520_b70","series-title":"Sixth ISCA Workshop on Speech Synthesis","first-page":"294","article-title":"The HMM-based speech synthesis system (HTS) version 2.0","author":"Zen","year":"2007"},{"key":"10.1016\/j.csl.2023.101520_b71","series-title":"ICASSP","first-page":"4470","article-title":"Unidirectional long short-term memory recurrent neural network with recurrent output layer for low-latency speech synthesis","author":"Zen","year":"2015"},{"key":"10.1016\/j.csl.2023.101520_b72","series-title":"ICASSP","first-page":"7962","article-title":"Statistical parametric speech synthesis using deep neural networks","author":"Zen","year":"2013"},{"issue":"11","key":"10.1016\/j.csl.2023.101520_b73","doi-asserted-by":"crossref","first-page":"1039","DOI":"10.1016\/j.specom.2009.04.004","article-title":"Statistical parametric speech synthesis","volume":"51","author":"Zen","year":"2009","journal-title":"Speech Commun."},{"key":"10.1016\/j.csl.2023.101520_b74","series-title":"AdaDurIAN: Few-shot adaptation for neural text-to-speech with DurIAN","author":"Zhang","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b75","series-title":"ICASSP","first-page":"7614","article-title":"End-to-end code-switching TTS with cross-lingual language model","author":"Zhou","year":"2020"},{"key":"10.1016\/j.csl.2023.101520_b76","series-title":"ICCV","first-page":"2242","article-title":"Unpaired image-to-image translation using cycle-consistent adversarial networks","author":"Zhu","year":"2017"},{"issue":"2","key":"10.1016\/j.csl.2023.101520_b77","doi-asserted-by":"crossref","first-page":"248","DOI":"10.1121\/1.1908630","article-title":"Subdivision of the audible frequency range into critical bands (frequenzgruppen)","volume":"33","author":"Zwicker","year":"1961","journal-title":"J. Acoust. Soc. Am."}],"container-title":["Computer Speech & Language"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0885230823000396?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0885230823000396?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2023,5,18]],"date-time":"2023-05-18T15:12:13Z","timestamp":1684422733000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0885230823000396"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6]]},"references-count":77,"alternative-id":["S0885230823000396"],"URL":"https:\/\/doi.org\/10.1016\/j.csl.2023.101520","relation":{},"ISSN":["0885-2308"],"issn-type":[{"value":"0885-2308","type":"print"}],"subject":[],"published":{"date-parts":[[2023,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Deep learning-based speaker-adaptive postfiltering with limited adaptation data for embedded text-to-speech synthesis systems","name":"articletitle","label":"Article Title"},{"value":"Computer Speech & Language","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.csl.2023.101520","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2023 Elsevier Ltd. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"101520"}}