{"id":"https://openalex.org/W4390075359","doi":"https://doi.org/10.1162/tacl_a_00618","title":"Speak, Read and Prompt: High-Fidelity Text-to-Speech with Minimal Supervision","display_name":"Speak, Read and Prompt: High-Fidelity Text-to-Speech with Minimal Supervision","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4390075359","doi":"https://doi.org/10.1162/tacl_a_00618"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00618","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00618/2200655/tacl_a_00618.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true},"type":"article","type_crossref":"journal-article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00618/2200655/tacl_a_00618.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102924622","display_name":"Eugene Kharitonov","orcid":"https://orcid.org/0009-0000-8653-721X"},"institutions":[],"countries":["FR"],"is_corresponding":true,"raw_author_name":"Eugene Kharitonov","raw_affiliation_strings":["Google, France kharitonov@google.com"],"affiliations":[{"raw_affiliation_string":"Google, France kharitonov@google.com","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058616625","display_name":"Damien Vincent","orcid":null},"institutions":[{"id":"https://openalex.org/I4210100430","display_name":"Google (Switzerland)","ror":"https://ror.org/014f9c269","country_code":"CH","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210100430","https://openalex.org/I4210128969"]}],"countries":["CH"],"is_corresponding":true,"raw_author_name":"Damien Vincent","raw_affiliation_strings":["Google, Switzerland. damienv@google.com"],"affiliations":[{"raw_affiliation_string":"Google, Switzerland. damienv@google.com","institution_ids":["https://openalex.org/I4210100430"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026110001","display_name":"Zal\u00e1n Borsos","orcid":"https://orcid.org/0000-0003-0007-829X"},"institutions":[{"id":"https://openalex.org/I4210100430","display_name":"Google (Switzerland)","ror":"https://ror.org/014f9c269","country_code":"CH","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210100430","https://openalex.org/I4210128969"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Zal\u00e1n Borsos","raw_affiliation_strings":["Google, Switzerland"],"affiliations":[{"raw_affiliation_string":"Google, Switzerland","institution_ids":["https://openalex.org/I4210100430"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040382220","display_name":"Rapha\u00ebl Marinier","orcid":null},"institutions":[],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Rapha\u00ebl Marinier","raw_affiliation_strings":["Google, France"],"affiliations":[{"raw_affiliation_string":"Google, France","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103445258","display_name":"Sertan Girgin","orcid":null},"institutions":[],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Sertan Girgin","raw_affiliation_strings":["Google, France"],"affiliations":[{"raw_affiliation_string":"Google, France","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065100569","display_name":"Olivier Pietquin","orcid":"https://orcid.org/0000-0002-5386-465X"},"institutions":[],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Olivier Pietquin","raw_affiliation_strings":["Google, France"],"affiliations":[{"raw_affiliation_string":"Google, France","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081709307","display_name":"Matt Sharifi","orcid":null},"institutions":[{"id":"https://openalex.org/I4210100430","display_name":"Google (Switzerland)","ror":"https://ror.org/014f9c269","country_code":"CH","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210100430","https://openalex.org/I4210128969"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Matt Sharifi","raw_affiliation_strings":["Google, Switzerland"],"affiliations":[{"raw_affiliation_string":"Google, Switzerland","institution_ids":["https://openalex.org/I4210100430"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033341878","display_name":"Marco Tagliasacchi","orcid":"https://orcid.org/0000-0002-7682-6795"},"institutions":[{"id":"https://openalex.org/I4210100430","display_name":"Google (Switzerland)","ror":"https://ror.org/014f9c269","country_code":"CH","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210100430","https://openalex.org/I4210128969"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Marco Tagliasacchi","raw_affiliation_strings":["Google, Switzerland"],"affiliations":[{"raw_affiliation_string":"Google, Switzerland","institution_ids":["https://openalex.org/I4210100430"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5047639590","display_name":"Neil Zeghidour","orcid":"https://orcid.org/0000-0001-6896-3987"},"institutions":[],"countries":["FR"],"is_corresponding":true,"raw_author_name":"Neil Zeghidour","raw_affiliation_strings":["Google, France neilz@google.com"],"affiliations":[{"raw_affiliation_string":"Google, France neilz@google.com","institution_ids":[]}]}],"institution_assertions":[],"countries_distinct_count":2,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5102924622","https://openalex.org/A5058616625","https://openalex.org/A5047639590"],"corresponding_institution_ids":["https://openalex.org/I4210100430"],"apc_list":{"value":0,"currency":"USD","value_usd":0,"provenance":"doaj"},"apc_paid":null,"fwci":10.922,"has_fulltext":true,"fulltext_origin":"pdf","cited_by_count":31,"citation_normalized_percentile":{"value":0.999976,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":"11","issue":null,"first_page":"1703","last_page":"1718"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9999,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9999,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8662131},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.7668024},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.59737164},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5112505},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.43221575},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.42630178},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37962475},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00618","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00618/2200655/tacl_a_00618.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true},{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2302.03540","pdf_url":"https://arxiv.org/pdf/2302.03540","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00618","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00618/2200655/tacl_a_00618.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true},"sustainable_development_goals":[{"display_name":"Quality education","id":"https://metadata.un.org/sdg/4","score":0.84}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":34,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2889326796","https://openalex.org/W2963216553","https://openalex.org/W2964243274","https://openalex.org/W2972359262","https://openalex.org/W2995181338","https://openalex.org/W3015419784","https://openalex.org/W3016181583","https://openalex.org/W3034999214","https://openalex.org/W3092028330","https://openalex.org/W3140429000","https://openalex.org/W3161480375","https://openalex.org/W3198123200","https://openalex.org/W3198217962","https://openalex.org/W3205644108","https://openalex.org/W3209984917","https://openalex.org/W3215615641","https://openalex.org/W3215895588","https://openalex.org/W4226033575","https://openalex.org/W4285182272","https://openalex.org/W4288089799","https://openalex.org/W4292779060","https://openalex.org/W4296068981","https://openalex.org/W4296070453","https://openalex.org/W4307680525","https://openalex.org/W4313679638","https://openalex.org/W4318351475","https://openalex.org/W4320459320","https://openalex.org/W4380874786","https://openalex.org/W4381786045","https://openalex.org/W4381827575","https://openalex.org/W4385245566","https://openalex.org/W4385574033","https://openalex.org/W4394671563"],"related_works":["https://openalex.org/W2946856121","https://openalex.org/W290673751","https://openalex.org/W2535215250","https://openalex.org/W2433276473","https://openalex.org/W2108985546","https://openalex.org/W2077992636","https://openalex.org/W2024201202","https://openalex.org/W2017702615","https://openalex.org/W1914543332","https://openalex.org/W1537411440"],"abstract_inverted_index":{"Abstract":[0],"We":[1],"introduce":[2],"SPEAR-TTS,":[3],"a":[4,28,107,127],"multi-speaker":[5],"text-to-speech":[6],"(TTS)":[7],"system":[8],"that":[9,124,131],"can":[10],"be":[11],"trained":[12],"with":[13,134],"minimal":[14],"supervision.":[15],"By":[16],"combining":[17],"two":[18,31,54],"types":[19],"of":[20,30,58,72,110,141],"discrete":[21],"speech":[22,147],"representations,":[23],"we":[24,93],"cast":[25],"TTS":[26],"as":[27],"composition":[29],"sequence-to-sequence":[32],"tasks:":[33],"from":[34,44],"text":[35],"to":[36,41,47,76,100,102],"high-level":[37],"semantic":[38,45],"tokens":[39,46,50],"(akin":[40],"\u201creading\u201d)":[42],"and":[43,66,74,150],"low-level":[48],"acoustic":[49,151],"(\u201cspeaking\u201d).":[51],"Decoupling":[52],"these":[53],"tasks":[55],"enables":[56],"training":[57,84],"the":[59,68,78,85,90],"\u201cspeaking\u201d":[60],"module":[61],"using":[62,105,137],"abundant":[63],"audio-only":[64],"data,":[65,143],"unlocks":[67],"highly":[69],"efficient":[70],"combination":[71],"pretraining":[73],"backtranslation":[75],"reduce":[77],"need":[79],"for":[80],"parallel":[81,142],"data":[82],"when":[83],"\u201creading\u201d":[86],"component.":[87],"To":[88],"control":[89],"speaker":[91,116,119],"identity,":[92],"adopt":[94],"example":[95],"prompting,":[96],"which":[97],"allows":[98],"SPEAR-TTS":[99,125],"generalize":[101],"unseen":[103],"speakers":[104],"only":[106,138],"short":[108],"sample":[109],"3":[111],"seconds,":[112],"without":[113],"any":[114],"explicit":[115],"representation":[117],"or":[118],"labels.":[120],"Our":[121],"experiments":[122],"demonstrate":[123],"achieves":[126],"character":[128],"error":[129],"rate":[130],"is":[132],"competitive":[133],"state-of-the-art":[135],"methods":[136],"15":[139],"minutes":[140],"while":[144],"matching":[145],"ground-truth":[146],"in":[148],"naturalness":[149],"quality.":[152]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4390075359","counts_by_year":[{"year":2024,"cited_by_count":24},{"year":2023,"cited_by_count":7}],"updated_date":"2025-01-03T04:37:26.829194","created_date":"2023-12-22"}