{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,5]],"date-time":"2024-11-05T05:04:14Z","timestamp":1730783054991,"version":"3.28.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681665","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"915-924","update-policy":"http:\/\/dx.doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CIEASR:Contextual Image-Enhanced Automatic Speech Recognition for Improved Homophone Discrimination"],"prefix":"10.1145","author":[{"ORCID":"http:\/\/orcid.org\/0009-0006-5863-303X","authenticated-orcid":false,"given":"Ziyi","family":"Wang","sequence":"first","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences & School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0003-4661-5601","authenticated-orcid":false,"given":"Yiming","family":"Rong","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences & School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"http:\/\/orcid.org\/0009-0000-4471-6965","authenticated-orcid":false,"given":"Deyang","family":"Jiang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences & School of Future Technology, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0003-0406-8881","authenticated-orcid":false,"given":"Haoran","family":"Wu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, 
China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-6889-0316","authenticated-orcid":false,"given":"Shiyu","family":"Zhou","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-1111-1529","authenticated-orcid":false,"given":"Bo","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Andrew Senior, Oriol Vinyals, and Andrew Zisserman.","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, Andrew Senior, Oriol Vinyals, and Andrew Zisserman. 2018. Deep audio-visual speech recognition. IEEE transactions on pattern analysis and machine intelligence, Vol. 44, 12 (2018), 8717--8727."},{"key":"e_1_3_2_1_3_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716--23736."},{"key":"e_1_3_2_1_4_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. 
wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, Vol. 33 (2020), 12449--12460."},{"key":"e_1_3_2_1_5_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"X-llm: Bootstrapping advanced large language models by treating multi-modalities as foreign languages. arXiv preprint arXiv:2305.04160","author":"Chen Feilong","year":"2023","unstructured":"Feilong Chen, Minglun Han, Haozhi Zhao, Qingyang Zhang, Jing Shi, Shuang Xu, and Bo Xu. 2023. X-llm: Bootstrapping advanced large language models by treating multi-modalities as foreign languages. arXiv preprint arXiv:2305.04160 (2023)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1369-5"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/5.664274"},{"key":"e_1_3_2_1_9_1","volume-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models. arXiv preprint arXiv:2311.07919","author":"Chu Yunfei","year":"2023","unstructured":"Yunfei Chu, Jin Xu, Xiaohuan Zhou, Qian Yang, Shiliang Zhang, Zhijie Yan, Chang Zhou, and Jingren Zhou. 2023. Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models. arXiv preprint arXiv:2311.07919 (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Computer Vision-ACCV 2016: 13th Asian Conference on Computer Vision","author":"Chung Joon Son","year":"2016","unstructured":"Joon Son Chung and Andrew Zisserman. 2017. Lip reading in the wild. 
In Computer Vision-ACCV 2016: 13th Asian Conference on Computer Vision, Taipei, Taiwan, November 20-24, 2016, Revised Selected Papers, Part II 13. Springer, 87--103."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2018.07.002"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383466"},{"key":"e_1_3_2_1_14_1","volume-title":"Improving Domain Generalization in Speech Emotion Recognition with Whisper. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 11631--11635","author":"Goron Erik","year":"2024","unstructured":"Erik Goron, Lena Asai, Elias Rut, and Martin Dinov. 2024. Improving Domain Generalization in Speech Emotion Recognition with Whisper. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 11631--11635."},{"key":"e_1_3_2_1_15_1","volume-title":"Knowledge transfer from pre-trained language models to cif-based speech recognizers via hierarchical distillation. arXiv preprint arXiv:2301.13003","author":"Han Minglun","year":"2023","unstructured":"Minglun Han, Feilong Chen, Jing Shi, Shuang Xu, and Bo Xu. 2023. Knowledge transfer from pre-trained language models to cif-based speech recognizers via hierarchical distillation. arXiv preprint arXiv:2301.13003 (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9415054"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"e_1_3_2_1_19_1","volume-title":"The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691","author":"Lester Brian","year":"2021","unstructured":"Brian Lester, Rami Al-Rfou, and Noah Constant. 2021. 
The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691 (2021)."},{"key":"e_1_3_2_1_20_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730--19742."},{"key":"e_1_3_2_1_21_1","volume-title":"Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190","author":"Li Xiang Lisa","year":"2021","unstructured":"Xiang Lisa Li and Percy Liang. 2021. Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190 (2021)."},{"key":"e_1_3_2_1_22_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Bin Zhu, Yang Ye, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_24_1","first-page":"1","article-title":"Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing","volume":"55","author":"Liu Pengfei","year":"2023","unstructured":"Pengfei Liu, Weizhe Yuan, Jinlan Fu, Zhengbao Jiang, Hiroaki Hayashi, and Graham Neubig. 2023. Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing. Comput. Surveys, Vol. 
55, 9 (2023), 1--35.","journal-title":"Comput. Surveys"},{"key":"e_1_3_2_1_25_1","volume-title":"GPT understands, too. AI Open","author":"Liu Xiao","year":"2023","unstructured":"Xiao Liu, Yanan Zheng, Zhengxiao Du, Ming Ding, Yujie Qian, Zhilin Yang, and Jie Tang. 2023. GPT understands, too. AI Open (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414567"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-022-00550-z"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053841"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.3682038"},{"key":"e_1_3_2_1_30_1","volume-title":"Harlan M Krumholz, Jure Leskovec, Eric J Topol, and Pranav Rajpurkar.","author":"Moor Michael","year":"2023","unstructured":"Michael Moor, Oishi Banerjee, Zahra Shakeri Hossein Abad, Harlan M Krumholz, Jure Leskovec, Eric J Topol, and Pranav Rajpurkar. 2023. Foundation models for generalist medical artificial intelligence. Nature, Vol. 616, 7956 (2023), 259--265."},{"key":"e_1_3_2_1_31_1","volume-title":"VILAS: Exploring the Effects of Vision and Language Context in Automatic Speech Recognition. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Ni Ziyi","year":"2024","unstructured":"Ziyi Ni, Minglun Han, Feilong Chen, Linghui Meng, Jing Shi, Pin Lv, and Bo Xu. 2024. VILAS: Exploring the Effects of Vision and Language Context in Automatic Speech Recognition. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 11366--11370."},{"key":"e_1_3_2_1_32_1","volume-title":"Audio-visual speech recognition using deep learning. Applied intelligence","author":"Noda Kuniaki","year":"2015","unstructured":"Kuniaki Noda, Yuki Yamaguchi, Kazuhiro Nakadai, Hiroshi G Okuno, and Tetsuya Ogata. 2015. Audio-visual speech recognition using deep learning. 
Applied intelligence, Vol. 42 (2015), 722--737."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 4579--4588","author":"Oneață Dan","year":"2022","unstructured":"Dan Oneață and Horia Cucu. 2022. Improving multimodal speech recognition by data augmentation and speech representations. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 4579--4588."},{"key":"e_1_3_2_1_34_1","volume-title":"Leveraging unimodal self-supervised learning for multimodal audio-visual speech recognition. arXiv preprint arXiv:2203.07996","author":"Pan Xichen","year":"2022","unstructured":"Xichen Pan, Peiyu Chen, Yichen Gong, Helong Zhou, Xinbing Wang, and Zhouhan Lin. 2022. Leveraging unimodal self-supervised learning for multimodal audio-visual speech recognition. arXiv preprint arXiv:2203.07996 (2022)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_1_36_1","volume-title":"Can Visual Context Improve Automatic Speech Recognition for an Embodied Agent? arXiv preprint arXiv:2210.13189","author":"Pramanick Pradip","year":"2022","unstructured":"Pradip Pramanick and Chayan Sarkar. 2022. Can Visual Context Improve Automatic Speech Recognition for an Embodied Agent? arXiv preprint arXiv:2210.13189 (2022)."},{"key":"e_1_3_2_1_37_1","volume-title":"International Conference on Machine Learning. PMLR, 28492--28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International Conference on Machine Learning. PMLR, 28492--28518."},{"key":"e_1_3_2_1_38_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. 
Advances in neural information processing systems","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems, Vol. 28 (2015)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"e_1_3_2_1_40_1","volume-title":"Learning audio-visual speech representation by masked multimodal cluster prediction. arXiv preprint arXiv:2201.02184","author":"Shi Bowen","year":"2022","unstructured":"Bowen Shi, Wei-Ning Hsu, Kushal Lakhotia, and Abdelrahman Mohamed. 2022. Learning audio-visual speech representation by masked multimodal cluster prediction. arXiv preprint arXiv:2201.02184 (2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"Robust self-supervised audio-visual speech recognition. arXiv preprint arXiv:2201.01763","author":"Shi Bowen","year":"2022","unstructured":"Bowen Shi, Wei-Ning Hsu, and Abdelrahman Mohamed. 2022. Robust self-supervised audio-visual speech recognition. arXiv preprint arXiv:2201.01763 (2022)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053397"},{"key":"e_1_3_2_1_43_1","volume-title":"Fine-grained grounding for multimodal speech recognition. arXiv preprint arXiv:2010.02384","author":"Srinivasan Tejas","year":"2020","unstructured":"Tejas Srinivasan, Ramon Sanabria, Florian Metze, and Desmond Elliott. 2020. Fine-grained grounding for multimodal speech recognition. arXiv preprint arXiv:2010.02384 (2020)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2016.7846320"},{"key":"e_1_3_2_1_45_1","unstructured":"Quan Sun Yuxin Fang Ledell Wu Xinlong Wang and Yue Cao. 2023. EVA-CLIP: Improved Training Techniques for CLIP at Scale. 
arxiv: 2303.15389 [cs.CV]"},{"key":"e_1_3_2_1_46_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Yonghui Wu Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25081"},{"key":"e_1_3_2_1_48_1","volume-title":"A prompt pattern catalog to enhance prompt engineering with chatgpt. arXiv preprint arXiv:2302.11382","author":"White Jules","year":"2023","unstructured":"Jules White, Quchen Fu, Sam Hays, Michael Sandborn, Carlos Olea, Henry Gilbert, Ashraf Elnashar, Jesse Spencer-Smith, and Douglas C Schmidt. 2023. A prompt pattern catalog to enhance prompt engineering with chatgpt. arXiv preprint arXiv:2302.11382 (2023)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01444"},{"volume-title":"Automatic speech recognition","author":"Yu Dong","key":"e_1_3_2_1_50_1","unstructured":"Dong Yu and Lin Deng. 2016. Automatic speech recognition. Vol. 1. Springer."},{"key":"e_1_3_2_1_51_1","volume-title":"Speechgpt: Empowering large language models with intrinsic cross-modal conversational abilities. arXiv preprint arXiv:2305.11000","author":"Zhang Dong","year":"2023","unstructured":"Dong Zhang, Shimin Li, Xin Zhang, Jun Zhan, Pengyu Wang, Yaqian Zhou, and Xipeng Qiu. 2023. Speechgpt: Empowering large language models with intrinsic cross-modal conversational abilities. arXiv preprint arXiv:2305.11000 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601","author":"Zhang Duzhen","year":"2024","unstructured":"Duzhen Zhang, Yahan Yu, Chenxing Li, Jiahua Dong, Dan Su, Chenhui Chu, and Dong Yu. 2024. Mm-llms: Recent advances in multimodal large language models. 
arXiv preprint arXiv:2401.13601 (2024)."},{"volume-title":"ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Zhang Weitai","key":"e_1_3_2_1_53_1","unstructured":"Weitai Zhang, Hanyi Zhang, Chenxuan Liu, Zhongyi Ye, Xinyuan Zhou, Chao Lin, and Lirong Dai. 2024. Pre-Trained Acoustic-and-Textual Modeling for End-To-End Speech-To-Text Translation. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 11451--11455."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2969791"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681665","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T05:21:23Z","timestamp":1730697683000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681665"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":54,"alternative-id":["10.1145\/3664647.3681665","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681665","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}