{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,5]],"date-time":"2024-11-05T05:04:23Z","timestamp":1730783063558,"version":"3.28.0"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372014, 61925201, 62132001, U22B2048"],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681660","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"1187-1196","update-policy":"http:\/\/dx.doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ResVG: Enhancing Relation and Semantic Understanding in Multiple Instances for Visual Grounding"],"prefix":"10.1145","author":[{"ORCID":"http:\/\/orcid.org\/0000-0003-1612-975X","authenticated-orcid":false,"given":"Minghang","family":"Zheng","sequence":"first","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}]},{"ORCID":"http:\/\/orcid.org\/0009-0009-0312-8484","authenticated-orcid":false,"given":"Jiahua","family":"Zhang","sequence":"additional","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-1216-5609","authenticated-orcid":false,"given":"Qingchao","family":"Chen","sequence":"additional","affiliation":[{"name":"National Institute of Health Data Science, Peking University, Beijing, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-7658-3845","authenticated-orcid":false,"given":"Yuxin","family":"Peng","sequence":"additional","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-4259-3882","authenticated-orcid":false,"given":"Yang","family":"Liu","sequence":"additional","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University & State Key Laboratory of General Artificial Intelligence, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"volume-title":"Segdiff: Image segmentation with diffusion probabilistic models. arXiv preprint arXiv:2112.00390","year":"2021","author":"Amit Tomer","key":"e_1_3_2_2_1_1","unstructured":"Tomer Amit, Tal Shaharbany, Eliya Nachmani, and Lior Wolf. 2021. Segdiff: Image segmentation with diffusion probabilistic models. arXiv preprint arXiv:2112.00390 (2021)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"volume-title":"Robust classification via a single diffusion model. arXiv preprint arXiv:2305.15241","year":"2023","author":"Chen Huanran","key":"e_1_3_2_2_5_1","unstructured":"Huanran Chen, Yinpeng Dong, Zhengyi Wang, Xiao Yang, Chengqi Duan, Hang Su, and Jun Zhu. 2023. Robust classification via a single diffusion model. arXiv preprint arXiv:2305.15241 (2023)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16188"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01816"},{"volume-title":"Real-time referring expression comprehension by single-stage grounding network. arXiv preprint arXiv:1812.03426","year":"2018","author":"Chen Xinpeng","key":"e_1_3_2_2_8_1","unstructured":"Xinpeng Chen, Lin Ma, Jingyuan Chen, Zequn Jie, Wei Liu, and Jiebo Luo. 2018. Real-time referring expression comprehension by single-stage grounding network. arXiv preprint arXiv:1812.03426 (2018)."},{"key":"e_1_3_2_2_9_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Clark Kevin","year":"2024","unstructured":"Kevin Clark and Priyank Jaini. 2024. Text-to-Image Diffusion Models are Zero Shot Classifiers. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","year":"2018","author":"Devlin Jacob","key":"e_1_3_2_2_11_1","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"volume-title":"The segmented and annotated IAPR TC-12 benchmark. Computer vision and image understanding","year":"2010","author":"Escalante Hugo Jair","key":"e_1_3_2_2_12_1","unstructured":"Hugo Jair Escalante, Carlos A Hern\u00e1ndez, Jesus A Gonzalez, Aurelio L\u00f3pez-L\u00f3pez, Manuel Montes, Eduardo F Morales, L Enrique Sucar, Luis Villasenor, and Michael Grubinger. 2010. The segmented and annotated IAPR TC-12 benchmark. Computer vision and image understanding, Vol. 114, 4 (2010), 419--428."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447191"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.23919\/cje.2022.00.227"},{"volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","year":"2020","author":"Ho Jonathan","key":"e_1_3_2_2_17_1","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840--6851."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2911066"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.470"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01661"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00525"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01507"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","year":"2013","author":"Kingma Diederik P","key":"e_1_3_2_2_24_1","unstructured":"Diederik P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00477"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00265"},{"volume-title":"LGR-NET: Language Guided Reasoning Network for Referring Expression Comprehension","year":"2024","author":"Lu Mingcong","key":"e_1_3_2_2_28_1","unstructured":"Mingcong Lu, Ruifan Li, Fangxiang Feng, Zhanyu Ma, and Xiaojie Wang. 2024. LGR-NET: Language Guided Reasoning Network for Referring Expression Comprehension. IEEE Transactions on Circuits and Systems for Video Technology (2024)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28222"},{"volume-title":"Diffusion models beat gans on image classification. arXiv preprint arXiv:2307.08702","year":"2023","author":"Mukhopadhyay Soumik","key":"e_1_3_2_2_31_1","unstructured":"Soumik Mukhopadhyay, Matthew Gwilliam, Vatsal Agarwal, Namitha Padmanabhan, Archana Swaminathan, Srinidhi Hegde, Tianyi Zhou, and Abhinav Shrivastava. 2023. Diffusion models beat gans on image classification. arXiv preprint arXiv:2307.08702 (2023)."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01258-8_16"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0965-7"},{"volume-title":"SiRi: A Simple Selective Retraining Mechanism for Transformer-based Visual Grounding","year":"2022","author":"Qu Mengxue","key":"e_1_3_2_2_34_1","unstructured":"Mengxue Qu, Yu Wu, Wu Liu, Qiqi Gong, Xiaodan Liang, Olga Russakovsky, Yao Zhao, and Yunchao Wei. 2022. SiRi: A Simple Selective Retraining Mechanism for Transformer-based Visual Grounding. Springer, Cham (2022)."},{"volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","year":"2021","author":"Radford Alec","key":"e_1_3_2_2_35_1","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"volume-title":"Michael Ying Yang, and Bodo Rosenhahn","year":"2019","author":"Reinders Christoph","key":"e_1_3_2_2_36_1","unstructured":"Christoph Reinders, Hanno Ackermann, Michael Ying Yang, and Bodo Rosenhahn. 2019. Learning convolutional neural networks for object detection with very little training data. In Multimodal scene understanding. Elsevier, 65--100."},{"volume-title":"Data augmentation in defect detection of sanitary ceramics in small and non-iid datasets","year":"2022","author":"Ren Xinyang","key":"e_1_3_2_2_37_1","unstructured":"Xinyang Ren, Weiyang Lin, Xianqiang Yang, Xinghu Yu, and Huijun Gao. 2022. Data augmentation in defect detection of sanitary ceramics in small and non-iid datasets. IEEE Transactions on Neural Networks and Learning Systems (2022)."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00479"},{"volume-title":"Cross-Graph Transformer Network for Temporal Sentence Grounding. In International Conference on Artificial Neural Networks. Springer, 345--356","year":"2023","author":"Shang Jiahui","key":"e_1_3_2_2_41_1","unstructured":"Jiahui Shang, Ping Wei, and Nanning Zheng. 2023. Cross-Graph Transformer Network for Temporal Sentence Grounding. In International Conference on Artificial Neural Networks. Springer, 345--356."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-019-0197-0"},{"volume-title":"Context disentangling and prototype inheriting for robust visual grounding","year":"2023","author":"Tang Wei","key":"e_1_3_2_2_43_1","unstructured":"Wei Tang, Liang Li, Xuejing Liu, Lu Jin, Jinhui Tang, and Zechao Li. 2023. Context disentangling and prototype inheriting for robust visual grounding. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"volume-title":"Detection of power line insulator defects using aerial images analyzed with convolutional neural networks","year":"2018","author":"Tao Xian","key":"e_1_3_2_2_44_1","unstructured":"Xian Tao, Dapeng Zhang, Zihao Wang, Xilong Liu, Hongyan Zhang, and De Xu. 2018. Detection of power line insulator defects using aerial images analyzed with convolutional neural networks. IEEE transactions on systems, man, and cybernetics: systems, Vol. 50, 4 (2018), 1486--1498."},{"volume-title":"attend, and segment: Unsupervised zero-shot segmentation using stable diffusion. arXiv preprint arXiv:2308.12469","year":"2023","author":"Tian Junjiao","key":"e_1_3_2_2_45_1","unstructured":"Junjiao Tian, Lavisha Aggarwal, Andrea Colaco, Zsolt Kira, and Mar Gonzalez-Franco. 2023. Diffuse, attend, and segment: Unsupervised zero-shot segmentation using stable diffusion. arXiv preprint arXiv:2308.12469 (2023)."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3604557","article-title":"Language-guided Residual Graph Attention Network and Data Augmentation for Visual Grounding","volume":"20","author":"Wang Jia","year":"2023","unstructured":"Jia Wang, Hong-Han Shuai, Yung-Hui Li, and Wen-Huang Cheng. 2023. Language-guided Residual Graph Attention Network and Data Augmentation for Visual Grounding. ACM Transactions on Multimedia Computing, Communications and Applications, Vol. 20, 1 (2023), 1--23.","journal-title":"ACM Transactions on Multimedia Computing, Communications and Applications"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2797921"},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1960--1968","author":"Wang Peng","key":"e_1_3_2_2_48_1","unstructured":"Peng Wang, Qi Wu, Jiewei Cao, Chunhua Shen, Lianli Gao, and Anton van den Hengel. 2019. Neighbourhood watch: Referring expression comprehension via language-guided graph attention networks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1960--1968."},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00117"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00289"},{"volume-title":"Semantic-Aware Human Object Interaction Image Generation. In Forty-first International Conference on Machine Learning.","year":"2024","author":"Xu Zhu","key":"e_1_3_2_2_51_1","unstructured":"Zhu Xu, Qingchao Chen, Yuxin Peng, and Yang Liu. 2024. Semantic-Aware Human Object Interaction Image Generation. In Forty-first International Conference on Machine Learning."},{"volume-title":"3D Vision and Language Pretraining with Large-Scale Synthetic Data. IJCAI","year":"2024","author":"Yang Dejie","key":"e_1_3_2_2_52_1","unstructured":"Dejie Yang, Zhu Xu, Wentao Mo, Qingchao Chen, Siyuan Huang, and Yang Liu. 2024. 3D Vision and Language Pretraining with Large-Scale Synthetic Data. IJCAI (2024)."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00928"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00474"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_23"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00478"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.23919\/cje.2021.00.084"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"volume-title":"Rethinking diversified and discriminative proposal generation for visual grounding. arXiv preprint arXiv:1805.03508","year":"2018","author":"Yu Zhou","key":"e_1_3_2_2_60_1","unstructured":"Zhou Yu, Jun Yu, Chenchao Xiang, Zhou Zhao, Qi Tian, and Dacheng Tao. 2018. Rethinking diversified and discriminative proposal generation for visual grounding. arXiv preprint arXiv:1805.03508 (2018)."},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00437"},{"volume-title":"DiffusionEngine: Diffusion model is scalable data engine for object detection. arXiv preprint arXiv:2309.03893","year":"2023","author":"Zhang Manlin","key":"e_1_3_2_2_62_1","unstructured":"Manlin Zhang, Jie Wu, Yuxi Ren, Ming Li, Jie Qin, Xuefeng Xiao, Wei Liu, Rui Wang, Min Zheng, and Andy J Ma. 2023. DiffusionEngine: Diffusion model is scalable data engine for object detection. arXiv preprint arXiv:2309.03893 (2023)."},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_2_2_64_1","first-page":"18123","article-title":"Counterfactual contrastive learning for weakly-supervised vision-language grounding","volume":"33","author":"Zhang Zhu","year":"2020","unstructured":"Zhu Zhang, Zhou Zhao, Zhijie Lin, Xiuqiang He, et al. 2020. Counterfactual contrastive learning for weakly-supervised vision-language grounding. Advances in Neural Information Processing Systems, Vol. 33 (2020), 18123--18134.","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"Proceedings of the European Conference on Computer Vision (ECCV).","year":"2024","author":"Zheng Minghang","key":"e_1_3_2_2_65_1","unstructured":"Minghang Zheng, Xinhao Cai, Qingchao Chen, Yuxin Peng, and Yang Liu. 2024. Training Free Video Temporal Grounding using Large-scale Pre-trained Models. In Proceedings of the European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.794"},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20263"},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01511"},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25478"},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7000"},{"volume-title":"SeqTR: A Simple yet Universal Network for Visual Grounding. arXiv e-prints","year":"2022","author":"Zhu Chaoyang","key":"e_1_3_2_2_71_1","unstructured":"Chaoyang Zhu, Yiyi Zhou, Yunhang Shen, Gen Luo, Xingjia Pan, Mingbao Lin, Chao Chen, Liujuan Cao, Xiaoshuai Sun, and Rongrong Ji. 2022. SeqTR: A Simple yet Universal Network for Visual Grounding. arXiv e-prints (2022)."},{"key":"e_1_3_2_2_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00447"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681660","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T05:24:12Z","timestamp":1730697852000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681660"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":72,"alternative-id":["10.1145\/3664647.3681660","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681660","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}