{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T01:35:45Z","timestamp":1740101745313,"version":"3.37.3"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072462"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key R&D Program of China","award":["2020AAA0108600"]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,4,30]]},"DOI":"10.1145\/3543507.3583232","type":"proceedings-article","created":{"date-parts":[[2023,4,26]],"date-time":"2023-04-26T23:30:51Z","timestamp":1682551851000},"page":"2392-2401","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["CapEnrich: Enriching Caption Semantics for Web Images via Cross-modal Pre-trained Knowledge"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9809-8864","authenticated-orcid":false,"given":"Linli","family":"Yao","sequence":"first","affiliation":[{"name":"School of Information, Renmin University of China, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9371-5256","authenticated-orcid":false,"given":"Weijing","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Information, Renmin University of China, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6486-6020","authenticated-orcid":false,"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"School of Information, Renmin University of China, China"}]}],"member":"320","published-online":{"date-parts":[[2023,4,30]]},"reference":[{"key":"e_1_3_2_4_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"e_1_3_2_4_2_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems 33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020), 1877\u20131901."},{"key":"e_1_3_2_4_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"e_1_3_2_4_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00146"},{"key":"e_1_3_2_4_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_4_6_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Cho Jaemin","year":"2021","unstructured":"Jaemin Cho, Jie Lei, Hao Tan, and Mohit Bansal. 2021. Unifying vision-and-language tasks via text generation. In International Conference on Machine Learning. PMLR, 1931\u20131942."},{"key":"e_1_3_2_4_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/3294771.3294857"},{"key":"e_1_3_2_4_8_1","unstructured":"Fartash Faghri David\u00a0J. Fleet Jamie\u00a0Ryan Kiros and Sanja Fidler. 2018. VSE++: Improving Visual-Semantic Embeddings with Hard Negatives. In BMVC."},{"key":"e_1_3_2_4_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401430"},{"key":"e_1_3_2_4_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12266"},{"key":"e_1_3_2_4_11_1","volume-title":"CLIPScore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan\u00a0Le Bras, and Yejin Choi. 2021. CLIPScore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)."},{"key":"e_1_3_2_4_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295748"},{"key":"e_1_3_2_4_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"e_1_3_2_4_14_1","volume-title":"A Good Prompt Is Worth Millions of Parameters\u00bf Low-resource Prompt-based Learning for Vision-Language Models. arXiv preprint arXiv:2110.08484","author":"Jin Woojeong","year":"2021","unstructured":"Woojeong Jin, Yu Cheng, Yelong Shen, Weizhu Chen, and Xiang Ren. 2021. A Good Prompt Is Worth Millions of Parameters\u00bf Low-resource Prompt-based Learning for Vision-Language Models. arXiv preprint arXiv:2110.08484 (2021)."},{"key":"e_1_3_2_4_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_4_16_1","volume-title":"International Conference on Machine Learning.","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision. In International Conference on Machine Learning."},{"key":"e_1_3_2_4_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"e_1_3_2_4_18_1","volume-title":"AAAI Conference on Artificial Intelligence.","author":"Li Gen","year":"2019","unstructured":"Gen Li, Nan Duan, Yuejian Fang, Daxin Jiang, and Ming Zhou. 2019. Unicoder-VL: A Universal Encoder for Vision and Language by Cross-modal Pre-training. In AAAI Conference on Artificial Intelligence."},{"key":"e_1_3_2_4_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"e_1_3_2_4_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"e_1_3_2_4_21_1","unstructured":"Tsung-Yi Lin Michael Maire Serge\u00a0J. Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll\u00e1r and C.\u00a0Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In ECCV."},{"key":"e_1_3_2_4_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00434"},{"key":"e_1_3_2_4_23_1","volume-title":"prompt, and predict: A systematic survey of prompting methods in natural language processing. arXiv preprint arXiv:2107.13586","author":"Liu Pengfei","year":"2021","unstructured":"Pengfei Liu, Weizhe Yuan, Jinlan Fu, Zhengbao Jiang, Hiroaki Hayashi, and Graham Neubig. 2021. Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing. arXiv preprint arXiv:2107.13586 (2021)."},{"key":"e_1_3_2_4_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_21"},{"key":"e_1_3_2_4_25_1","unstructured":"Jiasen Lu Dhruv Batra Devi Parikh and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Neural Information Processing Systems."},{"key":"e_1_3_2_4_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00728"},{"key":"e_1_3_2_4_27_1","unstructured":"Vicente Ordonez Girish Kulkarni and Tamara\u00a0L. Berg. 2011. Im2Text: Describing Images Using 1 Million Captioned Photographs. In NIPS."},{"key":"e_1_3_2_4_28_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318."},{"key":"e_1_3_2_4_29_1","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark Gretchen Krueger and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML."},{"key":"e_1_3_2_4_30_1","volume-title":"Language models are unsupervised multitask learners. OpenAI blog 1, 8","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, 2019. Language models are unsupervised multitask learners. OpenAI blog 1, 8 (2019), 9."},{"key":"e_1_3_2_4_31_1","first-page":"1","article-title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter\u00a0J Liu. 2020. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. Journal of Machine Learning Research 21 (2020), 1\u201367.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_4_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"e_1_3_2_4_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"e_1_3_2_4_34_1","volume-title":"Exploiting cloze questions for few shot text classification and natural language inference. arXiv preprint arXiv:2001.07676","author":"Schick Timo","year":"2020","unstructured":"Timo Schick and Hinrich Sch\u00fctze. 2020. Exploiting cloze questions for few shot text classification and natural language inference. arXiv preprint arXiv:2001.07676 (2020)."},{"key":"e_1_3_2_4_35_1","volume-title":"Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning. In ACL.","author":"Sharma Piyush","year":"2018","unstructured":"Piyush Sharma, Nan Ding, Sebastian Goodman, and Radu Soricut. 2018. Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning. In ACL."},{"key":"e_1_3_2_4_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.445"},{"key":"e_1_3_2_4_37_1","doi-asserted-by":"crossref","unstructured":"Zhan Shi Hui Liu and Xiao-Dan Zhu. 2021. Enhancing Descriptive Image Captioning with Natural Language Inference. In ACL.","DOI":"10.18653\/v1\/2021.acl-short.36"},{"key":"e_1_3_2_4_38_1","volume-title":"LXMERT: Learning Cross-Modality Encoder Representations from Transformers. ArXiv abs\/1908.07490","author":"Tan Hao\u00a0Hao","year":"2019","unstructured":"Hao\u00a0Hao Tan and Mohit Bansal. 2019. LXMERT: Learning Cross-Modality Encoder Representations from Transformers. ArXiv abs\/1908.07490 (2019)."},{"key":"e_1_3_2_4_39_1","volume-title":"Multimodal few-shot learning with frozen language models. Advances in Neural Information Processing Systems 34","author":"Tsimpoukelli Maria","year":"2021","unstructured":"Maria Tsimpoukelli, Jacob Menick, Serkan Cabi, SM Eslami, Oriol Vinyals, and Felix Hill. 2021. Multimodal few-shot learning with frozen language models. Advances in Neural Information Processing Systems 34 (2021)."},{"key":"e_1_3_2_4_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_4_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00899"},{"key":"e_1_3_2_4_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_4_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_22"},{"key":"e_1_3_2_4_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475215"},{"key":"e_1_3_2_4_45_1","volume-title":"Describing Like Humans: On Diversity in Image Captioning. In 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4190\u20134198","author":"Wang Qingzhong","year":"2019","unstructured":"Qingzhong Wang and Antoni\u00a0B. Chan. 2019. Describing Like Humans: On Diversity in Image Captioning. In 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4190\u20134198."},{"volume-title":"VD-BERT: A Unified Vision and Dialog Transformer with BERT. In Conference on Empirical Methods in Natural Language Processing.","author":"Wang Yue","key":"e_1_3_2_4_46_1","unstructured":"Yue Wang, Shafiq\u00a0R. Joty, Michael\u00a0R. Lyu, Irwin King, Caiming Xiong, and Steven C.\u00a0H. Hoi. 2020. VD-BERT: A Unified Vision and Dialog Transformer with BERT. In Conference on Empirical Methods in Natural Language Processing."},{"key":"e_1_3_2_4_47_1","volume-title":"SimVLM: Simple Visual Language Model Pretraining with Weak Supervision. ArXiv abs\/2108.10904","author":"Wang Zirui","year":"2021","unstructured":"Zirui Wang, Jiahui Yu, Adams\u00a0Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2021. SimVLM: Simple Visual Language Model Pretraining with Weak Supervision. ArXiv abs\/2108.10904 (2021)."},{"key":"e_1_3_2_4_48_1","volume-title":"Proceedings of the 32nd International Conference on International Conference on Machine Learning (ICML). 2048\u20132057","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu, Jimmy\u00a0Lei Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhutdinov, Richard\u00a0S. Zemel, and Yoshua Bengio. 2015. Show, Attend and Tell: Neural Image Caption Generation with Visual Attention. In Proceedings of the 32nd International Conference on International Conference on Machine Learning (ICML). 2048\u20132057."},{"key":"e_1_3_2_4_49_1","volume-title":"Cpt: Colorful prompt tuning for pre-trained vision-language models. arXiv preprint arXiv:2109.11797","author":"Yao Yuan","year":"2021","unstructured":"Yuan Yao, Ao Zhang, Zhengyan Zhang, Zhiyuan Liu, Tat-Seng Chua, and Maosong Sun. 2021. Cpt: Colorful prompt tuning for pre-trained vision-language models. arXiv preprint arXiv:2109.11797 (2021)."},{"key":"e_1_3_2_4_50_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_4_51_1","volume-title":"Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts. arXiv preprint arXiv:2111.08276","author":"Zeng Yan","year":"2021","unstructured":"Yan Zeng, Xinsong Zhang, and Hang Li. 2021. Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts. arXiv preprint arXiv:2111.08276 (2021)."},{"key":"e_1_3_2_4_52_1","volume-title":"VinVL: Revisiting Visual Representations in Vision-Language Models. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Zhang Pengchuan","year":"2021","unstructured":"Pengchuan Zhang, Xiujun Li, Xiaowei Hu, Jianwei Yang, Lei Zhang, Lijuan Wang, Yejin Choi, and Jianfeng Gao. 2021. VinVL: Revisiting Visual Representations in Vision-Language Models. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 5575\u20135584."},{"key":"e_1_3_2_4_53_1","volume-title":"Learning to prompt for vision-language models. arXiv preprint arXiv:2109.01134","author":"Zhou Kaiyang","year":"2021","unstructured":"Kaiyang Zhou, Jingkang Yang, Chen\u00a0Change Loy, and Ziwei Liu. 2021. Learning to prompt for vision-language models. arXiv preprint arXiv:2109.01134 (2021)."},{"key":"e_1_3_2_4_54_1","volume-title":"Unified Vision-Language Pre-Training for Image Captioning and VQA. ArXiv abs\/1909.11059","author":"Zhou Luowei","year":"2019","unstructured":"Luowei Zhou, Hamid Palangi, Lei Zhang, Houdong Hu, Jason\u00a0J. Corso, and Jianfeng Gao. 2019. Unified Vision-Language Pre-Training for Image Captioning and VQA. ArXiv abs\/1909.11059 (2019)."}],"event":{"name":"WWW '23: The ACM Web Conference 2023","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Austin TX USA","acronym":"WWW '23"},"container-title":["Proceedings of the ACM Web Conference 2023"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3543507.3583232","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,29]],"date-time":"2024-02-29T19:41:18Z","timestamp":1709235678000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3543507.3583232"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,4,30]]},"references-count":54,"alternative-id":["10.1145\/3543507.3583232","10.1145\/3543507"],"URL":"https:\/\/doi.org\/10.1145\/3543507.3583232","relation":{},"subject":[],"published":{"date-parts":[[2023,4,30]]},"assertion":[{"value":"2023-04-30","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}