{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,1,31]],"date-time":"2025-01-31T19:10:30Z","timestamp":1738350630583,"version":"3.35.0"},"reference-count":60,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,1,9]]},"DOI":"10.1109\/slt54892.2023.10022770","type":"proceedings-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T18:54:03Z","timestamp":1674845643000},"page":"1096-1103","source":"Crossref","is-referenced-by-count":12,"title":["Superb @ SLT 2022: Challenge on Generalization and Efficiency of Self-Supervised Speech Representation Learning"],"prefix":"10.1109","author":[{"given":"Tzu-hsun","family":"Feng","sequence":"first","affiliation":[{"name":"National Taiwan University,Taiwan"}]},{"given":"Annie","family":"Dong","sequence":"additional","affiliation":[{"name":"Meta,USA"}]},{"given":"Ching-Feng","family":"Yeh","sequence":"additional","affiliation":[{"name":"Meta,USA"}]},{"given":"Shu-wen","family":"Yang","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}]},{"given":"Tzu-Quan","family":"Lin","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}]},{"given":"Jiatong","family":"Shi","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Kai-Wei","family":"Chang","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}]},{"given":"Zili","family":"Huang","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,USA"}]},{"given":"Haibin","family":"Wu","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}]},{"given":"Xuankai","family":"Chang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Abdelrahman","family":"Mohamed","sequence":"additional","affiliation":[{"name":"Meta,USA"}]},{"given":"Shang-Wen","family":"Li","sequence":"additional","affiliation":[{"name":"Meta,USA"}]},{"given":"Hung-yi","family":"Lee","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3207050"},{"key":"ref2","article-title":"BERT: Pre-training of deep bidirectional trans-formers for language understanding","author":"Devlin","year":"2019","journal-title":"NAACL-HLT"},{"key":"ref3","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"NeurIPS"},{"key":"ref4","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","author":"Chen","year":"2020","journal-title":"ICML. PMLR"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3497510"},{"article-title":"Pushing the limits of semi-supervised learning for automatic speech recognition","volume-title":"Workshop on Self-Supervised Learning for Speech and Audio Processing, NeurIPS","author":"Zhang","key":"ref6"},{"key":"ref7","article-title":"A fine-tuned wav2vec 2.0\/HuBERT benchmark for speech emotion recognition, speaker verification and spoken language understanding","author":"Wang","year":"2021","journal-title":"arXiv preprint"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747077"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref10","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"NeurIPS"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1775"},{"key":"ref12","article-title":"SUPERB-SG: Enhanced Speech processing Universal PER-formance Benchmark for Semantic and Generative Capabili-ties","author":"Tsai","year":"2022","journal-title":"ACL"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-tutorials.2"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-1473"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054438"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.213"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1228"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053176"},{"key":"ref19","article-title":"DeCoAR 2.0: Deep contextu-alized acoustic representations with vector quantization","author":"Ling","year":"2020","journal-title":"ar Xiv preprint"},{"key":"ref20","article-title":"Mockingjay: Unsupervised speech representation learning with deep bidirectional transformer encoders","author":"Andy","year":"2020","journal-title":"ICASSP"},{"key":"ref21","article-title":"Au-dio ALBERT: A lite BERT for self-supervised learning of au-dio representation","author":"Chi","year":"2020","journal-title":"SLT"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3095662"},{"key":"ref23","article-title":"Improving transformer-based speech recognition using unsupervised pre-training","author":"Jiang","year":"2019","journal-title":"ar Xiv preprint"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/icassp39728.2021.9414539"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-905"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1511"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-349"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-2605"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053569"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.29007\/1mjd"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref32","article-title":"vq-wav2vec: Self-supervised learning of discrete speech represen-tations","author":"Baevski","year":"2020","journal-title":"ICLR"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref34","article-title":"GLUE: A multi-task benchmark and analysis platform for natural language under-standing","author":"Wang","year":"2018","journal-title":"ICLR"},{"key":"ref35","article-title":"A large-scale study of representation learning with the visual task adaptation benchmark","author":"Zhai","year":"2019","journal-title":"arXiv preprint"},{"key":"ref36","article-title":"Ch-marl: A multimodal benchmark for cooperative, heterogeneous multi-agent rein-forcement learning","author":"Sharma","year":"2022","journal-title":"arXiv preprint"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-556"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2743"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3033"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1693"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"journal-title":"arXiv preprint","article-title":"data2vec: A general framework for self-supervised learning in speech, vision and language","author":"Baevski","key":"ref42"},{"article-title":"Self-supervised representation learning for speech using visual grounding and masked language modeling","volume-title":"AAAI 2022 workshop on Self-supervised Learning for Audio and Speech Processing","author":"Peng","key":"ref43"},{"key":"ref44","article-title":"DistilHu-BERT: Speech Representation Learning by Layer-wise Distil-lation of Hidden-unit BERT","author":"Chang","year":"2022","journal-title":"ICASSP"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10269"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/slt54892.2023.10022954"},{"key":"ref47","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021","journal-title":"ICML"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-519"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/slt54892.2023.10022474"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-236"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/slt54892.2023.10022552"},{"key":"ref52","article-title":"Silence is sweeter than speech: Self-supervised model using silence to store speaker information","author":"Feng","year":"2022","journal-title":"arXiv preprint"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/slt54892.2023.10022991"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054250"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747357"},{"key":"ref56","article-title":"Towards end-to-end unsupervised speech recognition","author":"Liu","year":"2022","journal-title":"arXiv preprint"},{"key":"ref57","first-page":"27826","article-title":"Unsupervised speech recognition","volume":"34","author":"Baevski","year":"2021","journal-title":"Advances in Neu-ral Information Processing Systems"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/slt54892.2023.10023274"},{"key":"ref59","first-page":"2790","article-title":"Parameter-efficient transfer learning for NLP","author":"Houlsby","year":"2019","journal-title":"ICML. PMLR"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/slt54892.2023.10023345"}],"event":{"name":"2022 IEEE Spoken Language Technology Workshop (SLT)","start":{"date-parts":[[2023,1,9]]},"location":"Doha, Qatar","end":{"date-parts":[[2023,1,12]]}},"container-title":["2022 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10022052\/10022330\/10022770.pdf?arnumber=10022770","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,31]],"date-time":"2025-01-31T18:28:41Z","timestamp":1738348121000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10022770\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,9]]},"references-count":60,"URL":"https:\/\/doi.org\/10.1109\/slt54892.2023.10022770","relation":{},"subject":[],"published":{"date-parts":[[2023,1,9]]}}}