{"id":"https://openalex.org/W4319862675","doi":"https://doi.org/10.1109/slt54892.2023.10023153","title":"GAN You Hear Me? Reclaiming Unconditional Speech Synthesis from Diffusion Models","display_name":"GAN You Hear Me? Reclaiming Unconditional Speech Synthesis from Diffusion Models","publication_year":2023,"publication_date":"2023-01-09","ids":{"openalex":"https://openalex.org/W4319862675","doi":"https://doi.org/10.1109/slt54892.2023.10023153"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt54892.2023.10023153","pdf_url":null,"source":{"id":"https://openalex.org/S4363605953","display_name":"2022 IEEE Spoken Language Technology Workshop (SLT)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2210.05271","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5085677838","display_name":"Matthew Baas","orcid":"https://orcid.org/0000-0003-3001-6292"},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"Matthew Baas","raw_affiliation_strings":["Stellenbosch University,MediaLab, Electrical & Electronic Engineering,South Africa"],"affiliations":[{"raw_affiliation_string":"Stellenbosch University,MediaLab, Electrical & Electronic Engineering,South Africa","institution_ids":["https://openalex.org/I26092322"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040305929","display_name":"Herman Kamper","orcid":"https://orcid.org/0000-0003-2980-3475"},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"Herman Kamper","raw_affiliation_strings":["Stellenbosch University,MediaLab, Electrical & Electronic Engineering,South Africa"],"affiliations":[{"raw_affiliation_string":"Stellenbosch University,MediaLab, Electrical & Electronic Engineering,South Africa","institution_ids":["https://openalex.org/I26092322"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.061,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.999754,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":92},"biblio":{"volume":null,"issue":null,"first_page":"906","last_page":"911"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9996,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9996,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9963,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9936,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminator","display_name":"Discriminator","score":0.93128496},{"id":"https://openalex.org/keywords/aliasing","display_name":"Aliasing","score":0.56144565},{"id":"https://openalex.org/keywords/signal","display_name":"SIGNAL (programming language)","score":0.463352},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.46301696},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.42232782}],"concepts":[{"id":"https://openalex.org/C2779803651","wikidata":"https://www.wikidata.org/wiki/Q5282088","display_name":"Discriminator","level":3,"score":0.93128496},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7839912},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.657564},{"id":"https://openalex.org/C4069607","wikidata":"https://www.wikidata.org/wiki/Q868732","display_name":"Aliasing","level":3,"score":0.56144565},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.5350818},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.48485434},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.4752143},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.463352},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.46301696},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.42232782},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33946615},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.24930105},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.21359524},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.12676123},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.09064537},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.08940643},{"id":"https://openalex.org/C94915269","wikidata":"https://www.wikidata.org/wiki/Q1834857","display_name":"Detector","level":2,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt54892.2023.10023153","pdf_url":null,"source":{"id":"https://openalex.org/S4363605953","display_name":"2022 IEEE Spoken Language Technology Workshop (SLT)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2210.05271","pdf_url":"https://arxiv.org/pdf/2210.05271","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2210.05271","pdf_url":"https://arxiv.org/pdf/2210.05271","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"score":0.71,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":35,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1522301498","https://openalex.org/W2129069237","https://openalex.org/W2519091744","https://openalex.org/W2549139847","https://openalex.org/W2623550831","https://openalex.org/W2749617271","https://openalex.org/W2792263949","https://openalex.org/W2797583228","https://openalex.org/W2962770929","https://openalex.org/W2963373786","https://openalex.org/W3034720584","https://openalex.org/W3035187956","https://openalex.org/W3035574324","https://openalex.org/W3036843665","https://openalex.org/W3092028330","https://openalex.org/W3106255532","https://openalex.org/W3129576130","https://openalex.org/W3129651364","https://openalex.org/W3140429000","https://openalex.org/W3162850270","https://openalex.org/W3174807077","https://openalex.org/W3198217962","https://openalex.org/W3209059054","https://openalex.org/W348213426","https://openalex.org/W4221152471","https://openalex.org/W4224035735","https://openalex.org/W4281485151","https://openalex.org/W4283686991","https://openalex.org/W4294643831","https://openalex.org/W4296068974","https://openalex.org/W4297817572","https://openalex.org/W4301206121","https://openalex.org/W4320013936","https://openalex.org/W4394671563"],"related_works":["https://openalex.org/W4380714744","https://openalex.org/W4319453655","https://openalex.org/W4293202849","https://openalex.org/W2387995142","https://openalex.org/W2089959425","https://openalex.org/W2074460168","https://openalex.org/W2053531689","https://openalex.org/W1980965563","https://openalex.org/W1842536210","https://openalex.org/W1489300767"],"abstract_inverted_index":{"We":[0],"propose":[1],"AudioStyleGAN":[2],"(ASGAN),":[3],"a":[4,28,37,57,63,99],"new":[5,60],"generative":[6],"adversarial":[7],"network":[8],"(GAN)":[9],"for":[10],"unconditional":[11,79],"speech":[12,80,112],"synthesis.":[13],"As":[14],"in":[15,78],"the":[16,83,94],"StyleGAN":[17],"family":[18],"of":[19,39,59],"image":[20],"synthesis":[21,81],"models,":[22,133],"ASGAN":[23,74,104,121],"maps":[24],"sampled":[25],"noise":[26],"to":[27,36,65,69,107,118],"disentangled":[29],"latent":[30],"vector":[31],"which":[32],"is":[33,46,89,105],"then":[34],"mapped":[35],"sequence":[38],"audio":[40],"features":[41],"so":[42],"that":[43,101,123],"signal":[44],"aliasing":[45],"suppressed":[47],"at":[48],"every":[49],"layer.":[50],"To":[51],"successfully":[52],"train":[53],"ASGAN,":[54],"we":[55],"introduce":[56],"number":[58],"techniques,":[61],"including":[62],"modification":[64],"adaptive":[66],"discriminator":[67,72],"augmentation":[68],"probabilistically":[70],"skip":[71],"updates.":[73],"achieves":[75],"state-of-the-art":[76],"results":[77],"on":[82],"Google":[84],"Speech":[85],"Commands":[86],"dataset.":[87],"It":[88],"also":[90],"substantially":[91],"faster":[92],"than":[93],"top-performing":[95],"diffusion":[96,130],"models.":[97,131],"Through":[98],"design":[100],"encourages":[102],"disentanglement,":[103],"able":[106],"perform":[108],"voice":[109],"conversion":[110],"and":[111],"editing":[113],"without":[114],"being":[115],"explicitly":[116],"trained":[117],"do":[119],"so.":[120],"demonstrates":[122],"GANs":[124],"are":[125],"still":[126],"highly":[127],"competitive":[128],"with":[129],"Code,":[132],"samples:":[134],"https://github.com/RF5/simple-asgan/.":[135]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4319862675","counts_by_year":[{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":1}],"updated_date":"2024-12-22T17:19:02.735177","created_date":"2023-02-11"}