{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:06:12Z","timestamp":1730297172176,"version":"3.28.0"},"reference-count":41,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,1,9]]},"DOI":"10.1109\/slt54892.2023.10022791","type":"proceedings-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T18:54:03Z","timestamp":1674845643000},"page":"68-75","source":"Crossref","is-referenced-by-count":5,"title":["Maestro-U: Leveraging Joint Speech-Text Representation Learning for Zero Supervised Speech ASR"],"prefix":"10.1109","author":[{"given":"Zhehuai","family":"Chen","sequence":"first","affiliation":[{"name":"Google, Inc."}]},{"given":"Ankur","family":"Bapna","sequence":"additional","affiliation":[{"name":"Google, Inc."}]},{"given":"Andrew","family":"Rosenberg","sequence":"additional","affiliation":[{"name":"Google, Inc."}]},{"given":"Yu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Google, Inc."}]},{"given":"Bhuvana","family":"Ramabhadran","sequence":"additional","affiliation":[{"name":"Google, Inc."}]},{"given":"Pedro","family":"Moreno","sequence":"additional","affiliation":[{"name":"Google, Inc."}]},{"given":"Nanxin","family":"Chen","sequence":"additional","affiliation":[{"name":"Google, Inc."}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-677"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639348"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639655"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746594"},{"key":"ref6","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv preprint"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2341"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref9","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"arXiv preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472618"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462682"},{"key":"ref12","article-title":"SLAM: A unified encoder for speech and language modeling via speech-text joint pre-training","author":"Bapna","year":"2021","journal-title":"arXiv preprint"},{"key":"ref13","article-title":"mSLAM: Massively multilingual joint pre-training for speech and text","author":"Bapna","year":"2022","journal-title":"arXiv preprint"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10937"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1800"},{"key":"ref16","article-title":"Unsupervised speech recognition via segmental empirical output distribution matching","author":"Yeh","year":"2018","journal-title":"arXiv preprint"},{"key":"ref17","article-title":"Completely unsupervised speech recognition by a generative adversarial network harmonized with iteratively refined hidden markov models","author":"Chen","year":"2019","journal-title":"arXiv preprint"},{"key":"ref18","first-page":"27826","article-title":"Unsupervised speech recognition","volume":"34","author":"Baevski","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref19","article-title":"Towards end-to-end unsupervised speech recognition","author":"Liu","year":"2022","journal-title":"arXiv preprint"},{"key":"ref20","article-title":"The zero resource speech benchmark 2021: Metrics and baselines for unsupervised spoken language modeling","volume-title":"Self-Supervised Learning for Speech and Audio Processing Workshop @ NeurIPS","author":"Anh Nguyen","year":"2020"},{"key":"ref21","article-title":"Fleurs: Few-shot learning evaluation of universal representations of speech","author":"Conneau","year":"2022","journal-title":"arXiv preprint"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"ref23","article-title":"Common Voice: A massively-multilingual speech corpus","author":"Ardila","year":"2019","journal-title":"arXiv preprint"},{"key":"ref24","first-page":"16","article-title":"Speech recognition and keyword spotting for low-resource languages: Babel project research at cued","volume-title":"Fourth International workshop on spoken language technologies for under-resourced languages (SLTU-2014)","author":"Gales","year":"2014"},{"key":"ref25","article-title":"mT5: A massively multilingual pre-trained text-to-text transformer","author":"Xue","year":"2020","journal-title":"arXiv preprint"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00447"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-main.579"},{"key":"ref28","article-title":"Cross-lingual transfer learning for question answering","author":"Lee","year":"2019","journal-title":"arXiv preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414718"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053443"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2858"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639699"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688018"},{"key":"ref36","article-title":"Pay less attention with lightweight and dynamic convolutions","author":"Wu","year":"2019","journal-title":"arXiv preprint"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682674"},{"key":"ref38","article-title":"Pushing the limits of semi-supervised learning for automatic speech recognition","author":"Zhang","year":"2020","journal-title":"arXiv preprint"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746475"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/icassp43922.2022.9746719"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/icassp43922.2022.9746038"}],"event":{"name":"2022 IEEE Spoken Language Technology Workshop (SLT)","start":{"date-parts":[[2023,1,9]]},"location":"Doha, Qatar","end":{"date-parts":[[2023,1,12]]}},"container-title":["2022 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10022052\/10022330\/10022791.pdf?arnumber=10022791","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,13]],"date-time":"2024-02-13T08:07:36Z","timestamp":1707811656000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10022791\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,9]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/slt54892.2023.10022791","relation":{},"subject":[],"published":{"date-parts":[[2023,1,9]]}}}