{"id":"https://openalex.org/W4225272718","doi":"https://doi.org/10.1109/icassp43922.2022.9746475","title":"Tts4pretrain 2.0: Advancing the use of Text and Speech in ASR Pretraining with Consistency and Contrastive Losses","display_name":"Tts4pretrain 2.0: Advancing the use of Text and Speech in ASR Pretraining with Consistency and Contrastive Losses","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W4225272718","doi":"https://doi.org/10.1109/icassp43922.2022.9746475"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9746475","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5002433660","display_name":"Zhehuai Chen","orcid":"https://orcid.org/0000-0003-4400-5340"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"funder","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhehuai Chen","raw_affiliation_strings":["Google, Inc."],"affiliations":[{"raw_affiliation_string":"Google, Inc.","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100433723","display_name":"Yu Zhang","orcid":"https://orcid.org/0000-0003-3090-7431"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"funder","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yu Zhang","raw_affiliation_strings":["Google, Inc."],"affiliations":[{"raw_affiliation_string":"Google, Inc.","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102902866","display_name":"Andrew Rosenberg","orcid":"https://orcid.org/0000-0003-1780-4390"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"funder","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Andrew Rosenberg","raw_affiliation_strings":["Google, Inc."],"affiliations":[{"raw_affiliation_string":"Google, Inc.","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071715737","display_name":"Bhuvana Ramabhadran","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"funder","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bhuvana Ramabhadran","raw_affiliation_strings":["Google, Inc."],"affiliations":[{"raw_affiliation_string":"Google, Inc.","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103874391","display_name":"Pedro J. Moreno","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"funder","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Pedro Moreno","raw_affiliation_strings":["Google, Inc."],"affiliations":[{"raw_affiliation_string":"Google, Inc.","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5025935520","display_name":"Gary Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"funder","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Gary Wang","raw_affiliation_strings":["Google, Inc."],"affiliations":[{"raw_affiliation_string":"Google, Inc.","institution_ids":["https://openalex.org/I1291425158"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.17,"has_fulltext":false,"cited_by_count":15,"citation_normalized_percentile":{"value":0.812687,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":93,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"7677","last_page":"7681"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9999,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9999,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization","score":0.55886126},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.5393716},{"id":"https://openalex.org/keywords/speech-error","display_name":"Speech error","score":0.5391998}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.759763},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6911846},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.64884865},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5718644},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.55886126},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.5393716},{"id":"https://openalex.org/C541956065","wikidata":"https://www.wikidata.org/wiki/Q2250680","display_name":"Speech error","level":3,"score":0.5391998},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.45121458},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43888783},{"id":"https://openalex.org/C43617652","wikidata":"https://www.wikidata.org/wiki/Q7575399","display_name":"Speech production","level":2,"score":0.14660537},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9746475","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality education","score":0.45}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":36,"referenced_works":["https://openalex.org/W11314411","https://openalex.org/W1494198834","https://openalex.org/W1524333225","https://openalex.org/W1810943226","https://openalex.org/W2125336414","https://openalex.org/W2187089797","https://openalex.org/W2250357346","https://openalex.org/W2962369866","https://openalex.org/W2963250244","https://openalex.org/W2972359262","https://openalex.org/W2973049979","https://openalex.org/W2995181338","https://openalex.org/W3005680577","https://openalex.org/W3015280134","https://openalex.org/W3016011332","https://openalex.org/W3016234571","https://openalex.org/W3020336359","https://openalex.org/W3026041220","https://openalex.org/W3030437843","https://openalex.org/W3036601975","https://openalex.org/W3093579165","https://openalex.org/W3095727342","https://openalex.org/W3097777922","https://openalex.org/W3139918052","https://openalex.org/W3148001440","https://openalex.org/W3160235762","https://openalex.org/W3170072081","https://openalex.org/W3171007011","https://openalex.org/W3198608154","https://openalex.org/W3198771897","https://openalex.org/W3198840231","https://openalex.org/W3202037040","https://openalex.org/W3212799896","https://openalex.org/W4210690962","https://openalex.org/W4226033575","https://openalex.org/W4289383906"],"related_works":["https://openalex.org/W4287266619","https://openalex.org/W4224916606","https://openalex.org/W3201953150","https://openalex.org/W3163300396","https://openalex.org/W3136989387","https://openalex.org/W3107474891","https://openalex.org/W3008370744","https://openalex.org/W2982095018","https://openalex.org/W2892009249","https://openalex.org/W1556750318"],"abstract_inverted_index":{"An":[0],"effective":[1],"way":[2],"to":[3,35,115,175],"learn":[4],"representations":[5,14,26,118],"from":[6,16,28,42,52,100,120],"untranscribed":[7],"speech":[8,18,32,54,76,98,121],"and":[9,30,66,77,88,91,96,122,159,202],"unspoken":[10,44],"text":[11,78,123],"with":[12,61,86,157],"linguistic/lexical":[13],"derived":[15,99,119],"synthesized":[17,29,67,97],"was":[19],"introduced":[20],"in":[21,57],"tts4pretrain":[22],"[1].":[23],"However,":[24],"the":[25,40,58,101,105,111,162,170,181],"learned":[27],"real":[31,65,95],"are":[33],"likely":[34],"be":[36],"different,":[37],"potentially":[38],"limiting":[39],"improvements":[41],"incorporating":[43],"text.":[45],"In":[46],"this":[47],"paper,":[48],"we":[49,81,178],"introduce":[50,82],"learning":[51,73],"supervised":[53,171],"earlier":[55],"on":[56,141,188,194],"training":[59],"process":[60],"consistency-based":[62],"regularization":[63,93],"between":[64,94],"speech.":[68],"This":[69],"allows":[70],"for":[71],"better":[72],"of":[74,138],"shared":[75],"representations.":[79],"Thus,":[80],"a":[83,153,195],"new":[84,112],"objective,":[85],"encoder":[87],"decoder":[89],"consistency":[90],"contrastive":[92],"labeled":[102],"corpora":[103],"during":[104],"pretraining":[106,130],"stage.":[107],"We":[108],"show":[109,179],"that":[110,124,180],"objective":[113],"leads":[114],"more":[116],"similar":[117],"help":[125],"downstream":[126],"ASR.":[127,204],"The":[128,166],"proposed":[129,164,167,182],"method":[131,168,183],"yields":[132,185],"Word":[133],"Error":[134],"Rate":[135],"(WER)":[136],"reductions":[137,187],"7-21%":[139],"relative":[140],"six":[142],"public":[143],"corpora,":[144],"Librispeech,":[145],"AMI,":[146],"TEDLIUM,":[147],"Common":[148],"Voice,":[149],"Switchboard,":[150],"CHiME-6,":[151],"over":[152,161],"state-of-the-art":[154],"baseline":[155],"pretrained":[156],"wav2vec2.0":[158],"2-17%":[160],"previously":[163],"tts4pretrain.":[165],"outperforms":[169],"SpeechStew":[172],"by":[173,192],"up":[174],"17%.":[176],"Moreover,":[177],"also":[184],"WER":[186],"larger":[189],"data":[190],"sets":[191],"evaluating":[193],"large":[196],"resource,":[197],"in-house":[198],"Voice":[199],"Search":[200],"task":[201],"streaming":[203]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4225272718","counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":10},{"year":2022,"cited_by_count":3}],"updated_date":"2025-04-17T10:31:07.397842","created_date":"2022-05-04"}