{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:06:28Z","timestamp":1730297188933,"version":"3.28.0"},"reference-count":46,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,1,9]]},"DOI":"10.1109\/slt54892.2023.10023072","type":"proceedings-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T18:54:03Z","timestamp":1674845643000},"page":"928-935","source":"Crossref","is-referenced-by-count":1,"title":["Learning Accent Representation with Multi-Level VAE Towards Controllable Speech Synthesis"],"prefix":"10.1109","author":[{"given":"Jan","family":"Melechovsky","sequence":"first","affiliation":[{"name":"Singapore University of Technology and Design"}]},{"given":"Ambuj","family":"Mehrish","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design"}]},{"given":"Dorien","family":"Herremans","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design"}]},{"given":"Berrak","family":"Sisman","sequence":"additional","affiliation":[{"name":"University of Texas at Dallas"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"crossref","DOI":"10.1017\/CBO9780511611759","volume-title":"Accents of En-glish:","volume":"1","author":"Wells","year":"1982"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.4324\/9780203348802"},{"volume-title":"Social Class Differences in Britain: A Source-book","year":"1977","author":"Reid","key":"ref3"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1177\/0033688220977406"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3060813"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2021.101302"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362120"},{"volume-title":"Official rosetta stoneR","year":"2022","key":"ref8"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413386"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1186"},{"key":"ref11","article-title":"Layer-wise fast adaptation for end-to-end multi-accent speech recognition","author":"Gong","year":"2022","journal-title":"arXiv"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1495"},{"key":"ref13","article-title":"A survey on neural speech synthesis","volume-title":"arXiv preprint","author":"Tan","year":"2021"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1609.03499"},{"key":"ref15","first-page":"195","article-title":"Deep voice: Real-time neural text-to-speech","volume-title":"International Conference on Machine Learning","author":"Arik"},{"key":"ref16","article-title":"Neural machine translation by jointly learning to align and translate","author":"Bahdanau","year":"2014","journal-title":"arXiv"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref19","article-title":"Fastspeech: Fast, robust and controllable text to speech","volume":"32","author":"Ren","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref20","article-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech","author":"Ren","year":"2020","journal-title":"ar Xiv preprint"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383562"},{"key":"ref22","first-page":"5180","article-title":"Style tokens: Unsuper-vised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"International Conference on Ma-chine Learning. PMLR","author":"Wang"},{"key":"ref23","article-title":"Hierarchical generative modeling for controllable speech synthesis","volume-title":"arXiv preprint","author":"Hsu","year":"2018"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11867"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413907"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1236"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683623"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.312"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2013-671"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2009.2038818"},{"journal-title":"Spoken language conversion with accent morphing","year":"2007","author":"Huckvale","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2016.7552917"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.907344"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6855134"},{"key":"ref35","first-page":"4693","article-title":"Towards end-to-end prosody trans-fer for expressive speech synthesis with tacotron","volume-title":"international conference on machine learning","author":"Skerry-Ryan"},{"journal-title":"Cstr vctk corpus: English multi-speaker cor-pus for cstr voice cloning toolkit (version 0.92)","year":"2019","author":"Yamagishi","key":"ref36"},{"key":"ref37","article-title":"Generating sentences from a continuous space","author":"Bowman","year":"2015","journal-title":"arXiv preprint"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/PACRIM.1993.407206"},{"key":"ref39","first-page":"63","article-title":"Synthesizer voice quality of new languages calibrated with mean mel cepstral distortion","author":"Kominek","year":"2008","journal-title":"SLTU"},{"volume-title":"Silero models:pre-trained enterprise-grade stt\/tts models and benchmarks","year":"2022","key":"ref40"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027"},{"key":"ref42","first-page":"173","article-title":"Deep speech 2: End-to-end speech recognition in english and mandarin","volume-title":"Inter-national conference on machine learning","author":"Amodei"},{"key":"ref43","article-title":"P. 800: Methods for subjective determination of transmission quality","volume":"22","author":"Rec","year":"1996","journal-title":"International Telecommu-nication Union"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781107337855"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2018-1110"},{"key":"ref46","article-title":"The cmu arctic speech databases","volume-title":"Fifth ISCA workshop on speech synthesis","author":"Kominek","year":"2004"}],"event":{"name":"2022 IEEE Spoken Language Technology Workshop (SLT)","start":{"date-parts":[[2023,1,9]]},"location":"Doha, Qatar","end":{"date-parts":[[2023,1,12]]}},"container-title":["2022 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10022052\/10022330\/10023072.pdf?arnumber=10023072","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,13]],"date-time":"2024-02-13T06:36:42Z","timestamp":1707806202000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10023072\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,9]]},"references-count":46,"URL":"https:\/\/doi.org\/10.1109\/slt54892.2023.10023072","relation":{},"subject":[],"published":{"date-parts":[[2023,1,9]]}}}