{"id":"https://openalex.org/W4405033054","doi":"https://doi.org/10.48550/arxiv.2411.18918","title":"CoDiff-VC: A Codec-Assisted Diffusion Model for Zero-shot Voice\n Conversion","display_name":"CoDiff-VC: A Codec-Assisted Diffusion Model for Zero-shot Voice\n Conversion","publication_year":2024,"publication_date":"2024-11-28","ids":{"openalex":"https://openalex.org/W4405033054","doi":"https://doi.org/10.48550/arxiv.2411.18918"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.18918","pdf_url":"http://arxiv.org/pdf/2411.18918","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2411.18918","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100697204","display_name":"Yuke Li","orcid":"https://orcid.org/0009-0000-7282-8964"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yuke","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101883211","display_name":"Xinfa Zhu","orcid":"https://orcid.org/0000-0001-9275-523X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Xinfa","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016567570","display_name":"Hanzhao Li","orcid":"https://orcid.org/0009-0005-3215-7517"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Hanzhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015560758","display_name":"Jixun Yao","orcid":"https://orcid.org/0000-0002-5324-7360"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, JiXun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057172831","display_name":"Wen\u2010de Tian","orcid":"https://orcid.org/0000-0002-9192-9901"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, WenJie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039657406","display_name":"Xiao Yang","orcid":"https://orcid.org/0000-0003-2401-5197"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, XiPeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074963249","display_name":"Y.C. Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, YunLin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100782898","display_name":"Zhifei Li","orcid":"https://orcid.org/0000-0003-1246-741X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Zhifei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100668966","display_name":"Lei Xie","orcid":"https://orcid.org/0000-0001-8234-0823"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Lei","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":78},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9986,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9986,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9636,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9232,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.65322256},{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.6340872}],"concepts":[{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.65322256},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.6340872},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.54035354},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.5211739},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.51185125},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4656421},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.19356343},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.14589366},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.09366125},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.042979896},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C191897082","wikidata":"https://www.wikidata.org/wiki/Q11467","display_name":"Metallurgy","level":1,"score":0.0},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.18918","pdf_url":"http://arxiv.org/pdf/2411.18918","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.18918","pdf_url":"http://arxiv.org/pdf/2411.18918","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4214877189","https://openalex.org/W3161919736","https://openalex.org/W2980279061","https://openalex.org/W2964213236","https://openalex.org/W2773965352","https://openalex.org/W2387018512","https://openalex.org/W2381179799","https://openalex.org/W2334685461","https://openalex.org/W2163719598","https://openalex.org/W2074502265"],"abstract_inverted_index":{"Zero-shot":[0],"voice":[1,23,66,126],"conversion":[2,24,67],"(VC)":[3],"aims":[4],"to":[5,11,31,77,88,106,120],"convert":[6],"the":[7,17,45,93,108,148],"original":[8,109],"speaker's":[9],"timbre":[10,42,117,122,145],"any":[12],"target":[13],"speaker":[14,36,52,116,134,160],"while":[15],"keeping":[16],"linguistic":[18,33,47,90],"content.":[19],"Current":[20],"mainstream":[21],"zero-shot":[22,65],"approaches":[25],"depend":[26],"on":[27],"pre-trained":[28],"recognition":[29],"models":[30],"disentangle":[32],"content":[34,48,91,98,143],"and":[35,49,73,124,133,144,152,164],"representation.":[37],"This":[38],"results":[39],"in":[40,51],"a":[41,70,74,85,114],"residue":[43],"within":[44],"decoupled":[46],"inadequacies":[50],"representation":[53],"modeling.":[54],"In":[55],"this":[56],"study,":[57],"we":[58,100,112,136],"propose":[59],"CoDiff-VC,":[60],"an":[61],"end-to-end":[62],"framework":[63],"for":[64],"that":[68,156],"integrates":[69],"speech":[71,131],"codec":[72,87],"diffusion":[75],"model":[76],"produce":[78],"high-fidelity":[79],"waveforms.":[80],"Our":[81],"approach":[82,119],"involves":[83],"employing":[84],"single-codebook":[86],"separate":[89],"from":[92],"source":[94],"speech.":[95,166],"To":[96,129],"enhance":[97],"disentanglement,":[99],"introduce":[101,137],"Mix-Style":[102],"layer":[103],"normalization":[104],"(MSLN)":[105],"perturb":[107],"timbre.":[110],"Additionally,":[111],"incorporate":[113],"multi-scale":[115],"modeling":[118],"ensure":[121],"consistency":[123],"improve":[125,130],"detail":[127],"similarity.":[128],"quality":[132],"similarity,":[135,161],"dual":[138],"classifier-free":[139],"guidance,":[140],"providing":[141],"both":[142],"guidance":[146],"during":[147],"generation":[149],"process.":[150],"Objective":[151],"subjective":[153],"experiments":[154],"affirm":[155],"CoDiff-VC":[157],"significantly":[158],"improves":[159],"generating":[162],"natural":[163],"higher-quality":[165]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4405033054","counts_by_year":[],"updated_date":"2025-04-05T05:31:56.345701","created_date":"2024-12-05"}