{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:06:52Z","timestamp":1730297212009,"version":"3.28.0"},"reference-count":24,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,1,9]]},"DOI":"10.1109\/slt54892.2023.10023432","type":"proceedings-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T13:54:03Z","timestamp":1674827643000},"page":"1022-1028","source":"Crossref","is-referenced-by-count":1,"title":["Distilling Sequence-to-Sequence Voice Conversion Models for Streaming Conversion Applications"],"prefix":"10.1109","author":[{"given":"Kou","family":"Tanaka","sequence":"first","affiliation":[{"name":"NTT Corporation,NTT Communication Science Laboratories,Japan"}]},{"given":"Hirokazu","family":"Kameoka","sequence":"additional","affiliation":[{"name":"NTT Corporation,NTT Communication Science Laboratories,Japan"}]},{"given":"Takuhiro","family":"Kaneko","sequence":"additional","affiliation":[{"name":"NTT Corporation,NTT Communication Science Laboratories,Japan"}]},{"given":"Shogo","family":"Seki","sequence":"additional","affiliation":[{"name":"NTT Corporation,NTT Communication Science Laboratories,Japan"}]}],"member":"263","reference":[{"key":"ref13","first-page":"5904","article-title":"Developing real-time streaming transformer transducer for speech recognition on large-scale dataset","author":"chen","year":"2021","journal-title":"ICASSP"},{"key":"ref24","first-page":"5210","article-title":"AutoVC: Zero-shot voice style transfer with only autoencoder loss","author":"qian","year":"2019","journal-title":"ICML"},{"key":"ref12","article-title":"FastS2S-VC: Streaming non-autoregressive sequence-to-sequence voice conversion","author":"tanaka","year":"2021","journal-title":"ArXiv Preprint"},{"doi-asserted-by":"publisher","key":"ref23","DOI":"10.21437\/VCC_BC.2020-15"},{"key":"ref15","first-page":"4879","article-title":"Generalized end-to-end loss for speaker verification","author":"wan","year":"2018","journal-title":"ICASSP"},{"doi-asserted-by":"publisher","key":"ref14","DOI":"10.21437\/Interspeech.2020-2442"},{"key":"ref20","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2015","journal-title":"ICLR"},{"doi-asserted-by":"publisher","key":"ref11","DOI":"10.1037\/h0044155"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.21437\/Interspeech.2018-1456"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1109\/TASLP.2020.3047262"},{"key":"ref21","article-title":"Corpus of spontaneous japanese: Its design and evaluation","author":"maekawa","year":"2003","journal-title":"Workshop on Spontaneous Speech Processing and Recognition"},{"doi-asserted-by":"publisher","key":"ref2","DOI":"10.1109\/TASLP.2019.2892235"},{"key":"ref1","article-title":"Neural discrete representation learning","volume":"30","author":"van den oord","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref16","first-page":"17022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"33","author":"kong","year":"2020","journal-title":"Advances in neural information processing systems"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.1016\/0167-6393(90)90011-W"},{"key":"ref18","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"ICML"},{"key":"ref8","first-page":"6805","article-title":"AttS2S-VC: Sequence-to-sequence voice conversion with attention and context preservation mechanisms","author":"kaneko","year":"2019","journal-title":"ICASSP"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1016\/j.specom.2008.11.004"},{"key":"ref9","doi-asserted-by":"crossref","first-page":"1849","DOI":"10.1109\/TASLP.2020.3001456","article-title":"ConvS2S-VC: Fully convolutional sequence-to-sequence voice conversion","volume":"28","author":"kwasny","year":"2020","journal-title":"IEEE\/ACM Transactions on Audio Speech and Language Processing"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.1109\/ICASSP40776.2020.9054556"},{"key":"ref3","first-page":"2514","article-title":"Statistical singing voice conversion with direct waveform modification based on the spectrum differential","author":"kobayashi","year":"2014","journal-title":"INTERSPEECH"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.21437\/Interspeech.2019-1789"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.1587\/transinf.E97.D.1429"}],"event":{"name":"2022 IEEE Spoken Language Technology Workshop (SLT)","start":{"date-parts":[[2023,1,9]]},"location":"Doha, Qatar","end":{"date-parts":[[2023,1,12]]}},"container-title":["2022 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10022052\/10022330\/10023432.pdf?arnumber=10023432","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,2,20]],"date-time":"2023-02-20T17:08:24Z","timestamp":1676912904000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10023432\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,9]]},"references-count":24,"URL":"https:\/\/doi.org\/10.1109\/slt54892.2023.10023432","relation":{},"subject":[],"published":{"date-parts":[[2023,1,9]]}}}