{"id":"https://openalex.org/W4391640577","doi":"https://doi.org/10.1109/taslp.2024.3364085","title":"Transfer Learning for Low-Resource, Multi-Lingual, and Zero-Shot Multi-Speaker Text-to-Speech","display_name":"Transfer Learning for Low-Resource, Multi-Lingual, and Zero-Shot Multi-Speaker Text-to-Speech","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4391640577","doi":"https://doi.org/10.1109/taslp.2024.3364085"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3364085","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"journal-article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103069095","display_name":"Myeonghun Jeong","orcid":"https://orcid.org/0000-0002-4689-3110"},"institutions":[{"id":"https://openalex.org/I4210164379","display_name":"Seoul Media Institute of Technology","ror":"https://ror.org/04ywg4h07","country_code":"KR","type":"education","lineage":["https://openalex.org/I4210164379"]},{"id":"https://openalex.org/I139264467","display_name":"Seoul National University","ror":"https://ror.org/04h9pn542","country_code":"KR","type":"funder","lineage":["https://openalex.org/I139264467"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Myeonghun Jeong","raw_affiliation_strings":["Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I4210164379","https://openalex.org/I139264467"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027616639","display_name":"Minchan Kim","orcid":"https://orcid.org/0000-0002-8150-765X"},"institutions":[{"id":"https://openalex.org/I4210164379","display_name":"Seoul Media Institute of Technology","ror":"https://ror.org/04ywg4h07","country_code":"KR","type":"education","lineage":["https://openalex.org/I4210164379"]},{"id":"https://openalex.org/I139264467","display_name":"Seoul National University","ror":"https://ror.org/04h9pn542","country_code":"KR","type":"funder","lineage":["https://openalex.org/I139264467"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Minchan Kim","raw_affiliation_strings":["Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I4210164379","https://openalex.org/I139264467"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101424423","display_name":"Byoung Jin Choi","orcid":null},"institutions":[{"id":"https://openalex.org/I4210164379","display_name":"Seoul Media Institute of Technology","ror":"https://ror.org/04ywg4h07","country_code":"KR","type":"education","lineage":["https://openalex.org/I4210164379"]},{"id":"https://openalex.org/I139264467","display_name":"Seoul National University","ror":"https://ror.org/04h9pn542","country_code":"KR","type":"funder","lineage":["https://openalex.org/I139264467"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Byoung Jin Choi","raw_affiliation_strings":["Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I4210164379","https://openalex.org/I139264467"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050154985","display_name":"Jaesam Yoon","orcid":"https://orcid.org/0000-0002-9978-0582"},"institutions":[],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jaesam Yoon","raw_affiliation_strings":["Kakao Enterprise, Seongnam, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Kakao Enterprise, Seongnam, Republic of Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103192158","display_name":"Won Jang","orcid":"https://orcid.org/0000-0002-4711-780X"},"institutions":[],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Won Jang","raw_affiliation_strings":["Kakao Enterprise, Seongnam, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Kakao Enterprise, Seongnam, Republic of Korea","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5051356511","display_name":"Nam Soo Kim","orcid":"https://orcid.org/0000-0002-0568-4902"},"institutions":[{"id":"https://openalex.org/I139264467","display_name":"Seoul National University","ror":"https://ror.org/04h9pn542","country_code":"KR","type":"funder","lineage":["https://openalex.org/I139264467"]},{"id":"https://openalex.org/I4210164379","display_name":"Seoul Media Institute of Technology","ror":"https://ror.org/04ywg4h07","country_code":"KR","type":"education","lineage":["https://openalex.org/I4210164379"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Nam Soo Kim","raw_affiliation_strings":["Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I139264467","https://openalex.org/I4210164379"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.019,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.785868,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":77,"max":88},"biblio":{"volume":"32","issue":null,"first_page":"1519","last_page":"1530"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9999,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9999,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9937,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9937,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.54244906},{"id":"https://openalex.org/keywords/intelligibility","display_name":"Intelligibility (philosophy)","score":0.4986241},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.41866094}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7852179},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.755983},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.56556773},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.54244906},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5407221},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.4986241},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.46942478},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.45111728},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.41866094},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.08946374},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3364085","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.61,"display_name":"Quality education","id":"https://metadata.un.org/sdg/4"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":37,"referenced_works":["https://openalex.org/W2060277733","https://openalex.org/W2187089797","https://openalex.org/W2884225676","https://openalex.org/W2936832667","https://openalex.org/W2962770929","https://openalex.org/W2964002616","https://openalex.org/W2964243274","https://openalex.org/W2972473628","https://openalex.org/W2973049979","https://openalex.org/W2973084242","https://openalex.org/W2990124956","https://openalex.org/W2998572311","https://openalex.org/W3015826515","https://openalex.org/W3024869864","https://openalex.org/W3081416955","https://openalex.org/W3090254849","https://openalex.org/W3095012670","https://openalex.org/W3095410713","https://openalex.org/W3097290232","https://openalex.org/W3140429000","https://openalex.org/W3167533889","https://openalex.org/W3181257032","https://openalex.org/W3196584150","https://openalex.org/W3198213150","https://openalex.org/W3198429080","https://openalex.org/W3198575134","https://openalex.org/W3209059054","https://openalex.org/W4206596421","https://openalex.org/W4221166168","https://openalex.org/W4225746985","https://openalex.org/W4283640572","https://openalex.org/W4296070453","https://openalex.org/W4297808394","https://openalex.org/W4297841354","https://openalex.org/W4297841714","https://openalex.org/W4312095868","https://openalex.org/W4313148337"],"related_works":["https://openalex.org/W4403759994","https://openalex.org/W4400309480","https://openalex.org/W4252942110","https://openalex.org/W2549308614","https://openalex.org/W2081919107","https://openalex.org/W2079655441","https://openalex.org/W2075706796","https://openalex.org/W2032941915","https://openalex.org/W2023694213","https://openalex.org/W1604114751"],"abstract_inverted_index":{"Though":[0],"neural":[1],"text-to-speech":[2],"(TTS)":[3],"models":[4,35],"show":[5],"remarkable":[6],"performance,":[7],"they":[8],"still":[9],"require":[10],"a":[11,38,51,72,95,104,132],"large":[12],"amount":[13,106,134],"of":[14,41,107,135,142,167],"<":[15],"speech,":[16],"text>":[17],"paired":[18,30,108],"dataset,":[19],"which":[20,124],"is":[21],"expensive":[22],"to":[23,99],"collect.":[24],"The":[25,110,154],"heavy":[26],"demand":[27],"for":[28,55,128],"collecting":[29],"datasets":[31],"makes":[32],"the":[33,79,100,116,140,163],"TTS":[34,101,152],"support":[36],"only":[37],"small":[39,105],"number":[40],"speakers":[42],"and":[43,59,149,170],"languages.":[44],"To":[45],"address":[46],"this":[47,88],"problem,":[48],"we":[49,63,92],"introduce":[50],"transfer":[52],"learning":[53,97,130],"framework":[54],"multi-lingual,":[56,148],"zero-shot":[57,150],"multi-speaker,":[58],"low-resource":[60],"TTS.":[61],"Firstly,":[62],"pretrain":[64],"our":[65,143,159],"model":[66,102,145],"in":[67,146,165],"an":[68],"unsupervised":[69],"manner":[70],"with":[71,103,131],"multi-lingual":[73],"multi-speaker":[74,151],"speech-only":[75,118],"dataset":[76,119],"by":[77],"leveraging":[78],"self-supervised":[80],"speech":[81],"representations":[82,113],"as":[83],"intermediate":[84],"linguistic":[85,90,112],"representations.":[86],"Given":[87],"pretrained":[89,111],"information,":[91],"then":[93],"apply":[94],"supervised":[96,129],"technique":[98],"dataset.":[109],"extracted":[114],"from":[115],"large-scale":[117],"facilitate":[120],"phoneme-to-linguistic":[121],"feature":[122],"matching,":[123],"provides":[125],"good":[126],"guidance":[127],"limited":[133],"labeled":[136],"data.":[137],"We":[138],"evaluate":[139],"performance":[141],"proposed":[144,160],"low-resource,":[147],"tasks.":[153],"experimental":[155],"results":[156],"demonstrate":[157],"that":[158],"method":[161],"outperforms":[162],"baseline":[164],"terms":[166],"naturalness,":[168],"intelligibility,":[169],"speaker":[171],"similarity.":[172]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4391640577","counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-04-22T20:35:53.967196","created_date":"2024-02-09"}