{"id":"https://openalex.org/W4403663629","doi":"https://doi.org/10.48550/arxiv.2409.08039","title":"Zero-Shot Sing Voice Conversion: built upon clustering-based phoneme\n representations","display_name":"Zero-Shot Sing Voice Conversion: built upon clustering-based phoneme\n representations","publication_year":2024,"publication_date":"2024-09-12","ids":{"openalex":"https://openalex.org/W4403663629","doi":"https://doi.org/10.48550/arxiv.2409.08039"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.08039","pdf_url":"http://arxiv.org/pdf/2409.08039","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2409.08039","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5032740716","display_name":"Wangjin Zhou","orcid":"https://orcid.org/0009-0007-0693-5316"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Wangjin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027555657","display_name":"Fengrun Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Fengrun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104026346","display_name":"Yiming Liu","orcid":"https://orcid.org/0009-0001-5272-6132"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yiming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111310824","display_name":"Wenhao Guan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guan, Wenhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106669650","display_name":"Yi Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5038044080","display_name":"Tatsuya Kawahara","orcid":"https://orcid.org/0000-0002-2686-2296"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kawahara, Tatsuya","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":77},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9972,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9972,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9715,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9389,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.8175149},{"id":"https://openalex.org/keywords/ground-zero","display_name":"Ground zero","score":0.49005258}],"concepts":[{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.8175149},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.61125934},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.61020195},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.55462927},{"id":"https://openalex.org/C2780518707","wikidata":"https://www.wikidata.org/wiki/Q685332","display_name":"Ground zero","level":2,"score":0.49005258},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4477831},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.4159181},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.38274646},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.28156358},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.27792692},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.1914958},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.060168564},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185544564","wikidata":"https://www.wikidata.org/wiki/Q81197","display_name":"Nuclear physics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.08039","pdf_url":"http://arxiv.org/pdf/2409.08039","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.08039","pdf_url":"http://arxiv.org/pdf/2409.08039","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4402779703","https://openalex.org/W4313594480","https://openalex.org/W4300821880","https://openalex.org/W4293365998","https://openalex.org/W4256031153","https://openalex.org/W3279617","https://openalex.org/W2768862283","https://openalex.org/W2327903337","https://openalex.org/W2285651113","https://openalex.org/W2168002573"],"abstract_inverted_index":{"This":[0,26],"study":[1],"presents":[2],"an":[3],"innovative":[4],"Zero-Shot":[5],"any-to-any":[6],"Singing":[7],"Voice":[8],"Conversion":[9],"(SVC)":[10],"method,":[11],"leveraging":[12],"a":[13],"novel":[14],"clustering-based":[15],"phoneme":[16],"representation":[17],"to":[18,45],"effectively":[19],"separate":[20],"content,":[21],"timbre,":[22],"and":[23,56,66,73,84],"singing":[24,55],"style.":[25],"approach":[27],"enables":[28],"precise":[29],"voice":[30,75],"characteristic":[31],"manipulation.":[32],"We":[33],"discovered":[34],"that":[35],"datasets":[36],"with":[37,70],"fewer":[38],"recordings":[39],"per":[40],"artist":[41],"are":[42],"more":[43],"susceptible":[44],"timbre":[46,67],"leakage.":[47],"Extensive":[48],"testing":[49],"on":[50,91],"over":[51],"10,000":[52],"hours":[53],"of":[54,98],"user":[57],"feedback":[58],"revealed":[59],"our":[60,71],"model":[61],"significantly":[62],"improves":[63],"sound":[64],"quality":[65],"accuracy,":[68],"aligning":[69],"objectives":[72],"advancing":[74],"conversion":[76],"technology.":[77],"Furthermore,":[78],"this":[79],"research":[80],"advances":[81],"zero-shot":[82],"SVC":[83],"sets":[85],"the":[86,96],"stage":[87],"for":[88],"future":[89],"work":[90],"discrete":[92],"speech":[93],"representation,":[94],"emphasizing":[95],"preservation":[97],"rhyme.":[99]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4403663629","counts_by_year":[],"updated_date":"2025-04-22T21:07:10.663247","created_date":"2024-10-23"}