{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T07:27:09Z","timestamp":1743751629083,"version":"3.37.3"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,10,15]],"date-time":"2019-10-15T00:00:00Z","timestamp":1571097600000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100012659","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61602089, 61572108, 61632007, 61772116"],"id":[{"id":"10.13039\/501100012659","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Sichuan Province Science and Technology Program","award":["2018GZDZX0032"]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,10,15]]},"DOI":"10.1145\/3343031.3350875","type":"proceedings-article","created":{"date-parts":[[2019,10,21]],"date-time":"2019-10-21T16:32:26Z","timestamp":1571675546000},"page":"12-20","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":116,"title":["Matching Images and Text with Multi-modal Tensor Fusion and Re-ranking"],"prefix":"10.1145","author":[{"given":"Tan","family":"Wang","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Xing","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Yang","family":"Yang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Alan","family":"Hanjalic","sequence":"additional","affiliation":[{"name":"Delft University of Technology, Delft, Netherlands"}]},{"given":"Heng Tao","family":"Shen","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Jingkuan","family":"Song","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]}],"member":"320","published-online":{"date-parts":[[2019,10,15]]},"reference":[{"volume-title":"Bottom-Up and Top-Down Attention for Image Captioning and VQA. CoRR","year":"2017","author":"Anderson Peter","key":"e_1_3_2_1_1_1"},{"volume-title":"VQA: Visual Question Answering. In 2015 IEEE International Conference on Computer Vision. 2425--2433","year":"2015","author":"Antol Stanislaw","key":"e_1_3_2_1_2_1"},{"volume-title":"MUTAN: Multimodal Tucker Fusion for Visual Question Answering. In IEEE International Conference on Computer Vision. 2631--2639","year":"2017","author":"Hedi","key":"e_1_3_2_1_3_1"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4012"},{"key":"e_1_3_2_1_5_1","unstructured":"Fartash Faghri David J. Fleet Jamie Kiros and Sanja Fidler. 2018. VSE Fartash Faghri David J. Fleet Jamie Kiros and Sanja Fidler. 2018. VSE"},{"volume-title":"British Machine Vision Conference","year":"2018","key":"e_1_3_2_1_6_1"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1044"},{"volume-title":"2015 IEEE International Conference on Computer Vision. 1305--1313","year":"2015","author":"Jorge Garc'i","key":"e_1_3_2_1_8_1"},{"volume-title":"Imagine and Match: Improving Textual-Visual Cross-Modal Retrieval with Generative Models. CoRR","year":"2017","author":"Gu Jiuxiang","key":"e_1_3_2_1_9_1"},{"volume-title":"Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition. 770--778","year":"2016","author":"He Kaiming","key":"e_1_3_2_1_10_1"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1180639.1180654"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2890144"},{"volume-title":"Instance-Aware Image and Sentence Matching with Selective Multimodal LSTM. In 2017 IEEE Conference on Computer Vision and Pattern Recognition . 7254--7262","year":"2017","author":"Huang Yan","key":"e_1_3_2_1_13_1"},{"volume-title":"Learning Semantic Concepts and Order for Image and Sentence Matching. CoRR","year":"2036","author":"Huang Yan","key":"e_1_3_2_1_14_1"},{"volume-title":"Computer Vision - ECCV 2016 - 14th European Conference. 727--739.","author":"Jabri Allan","key":"e_1_3_2_1_15_1"},{"volume-title":"Proc. IEEE Conf. Computer Vision and Pattern Recognition. 3128--3137","author":"Karpathy A.","key":"e_1_3_2_1_16_1"},{"volume-title":"Woosang Lim, Jeonghee Kim, JungWoo Ha, and Byoung-Tak Zhang.","year":"2016","author":"Kim Jin-Hwa","key":"e_1_3_2_1_17_1"},{"volume-title":"Kingma and Jimmy Ba","year":"2014","author":"Diederik","key":"e_1_3_2_1_18_1"},{"volume-title":"Skip-Thought Vectors. In Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015 . 3294--3302","year":"2015","author":"Kiros Ryan","key":"e_1_3_2_1_19_1"},{"volume-title":"Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations. CoRR","year":"2016","author":"Krishna Ranjay","key":"e_1_3_2_1_20_1"},{"volume-title":"Stacked Cross Attention for Image-Text Matching. CoRR","year":"2018","author":"Lee Kuang-Huei","key":"e_1_3_2_1_21_1"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-014-1949-7"},{"volume-title":"Identity-Aware Textual-Visual Matching with Latent Co-attention. In IEEE International Conference on Computer Vision. 1908--1917","year":"2017","author":"Li Shuang","key":"e_1_3_2_1_23_1"},{"volume-title":"Piotr Doll\u00e1 r, and C. Lawrence Zitnick","year":"2014","author":"Lin Tsung-Yi","key":"e_1_3_2_1_24_1"},{"volume-title":"IEEE International Conference on Computer Vision. 4127--4136","author":"Liu Yu","key":"e_1_3_2_1_25_1"},{"volume-title":"Dual Attention Networks for Multimodal Reasoning and Matching. In 2017 IEEE Conference on Computer Vision and Pattern Recognition. 2156--2164","year":"2017","author":"Nam Hyeonseob","key":"e_1_3_2_1_26_1"},{"volume-title":"IEEE International Conference on Computer Vision. 1899--1907","year":"2017","author":"Niu Zhenxing","key":"e_1_3_2_1_27_1"},{"volume-title":"The 24th IEEE Conference on Computer Vision and Pattern Recognition . 777--784","author":"Qin Danfeng","key":"e_1_3_2_1_28_1"},{"volume-title":"Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems","year":"2015","author":"Ren Shaoqing","key":"e_1_3_2_1_29_1"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248031"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/2463676.2465274"},{"volume-title":"Learning Deep Structure-Preserving Image-Text Embeddings. In 2016 IEEE Conference on Computer Vision and Pattern Recognition. 5005--5013","year":"2016","author":"Wang Liwei","key":"e_1_3_2_1_32_1"},{"volume-title":"Learning Two-Branch Neural Networks for Image-Text Matching Tasks. CoRR","year":"2017","author":"Wang Liwei","key":"e_1_3_2_1_33_1"},{"volume-title":"Joint Global and Co-Attentive Representation Learning for Image-Sentence Retrieval. In 2018 ACM Multimedia Conference on Multimedia Conference. ACM, 1398--1406","year":"2018","author":"Wang Shuhui","key":"e_1_3_2_1_34_1"},{"volume-title":"Proceedings of the 32nd International Conference on Machine Learning . 2048--2057","year":"2015","author":"Xu Kelvin","key":"e_1_3_2_1_35_1"},{"volume-title":"Deep Adversarial Metric Learning for Cross-Modal Retrieval. World Wide Web","year":"2018","author":"Xu Xing","key":"e_1_3_2_1_36_1"},{"volume-title":"Heng Tao Shen, and Xuelong Li","year":"2019","author":"Xu Xing","key":"e_1_3_2_1_37_1"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1873977"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2012.2187778"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-14445-0_10"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2016.2605058"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"volume-title":"Deep Cross-Modal Projection Learning for Image-Text Matching. In The European Conference on Computer Vision (ECCV) .","year":"2018","author":"Zhang Ying","key":"e_1_3_2_1_43_1"},{"volume-title":"Dual-path convolutional image-text embedding. arXiv preprint arXiv:1711.05535","year":"2017","author":"Zheng Zhedong","key":"e_1_3_2_1_44_1"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.389"},{"volume-title":"Simple Baseline for Visual Question Answering. CoRR","year":"2015","author":"Zhou Bolei","key":"e_1_3_2_1_46_1"}],"event":{"name":"MM '19: The 27th ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Nice France","acronym":"MM '19"},"container-title":["Proceedings of the 27th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3343031.3350875","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,10]],"date-time":"2023-01-10T16:57:15Z","timestamp":1673369835000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3350875"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,10,15]]},"references-count":46,"alternative-id":["10.1145\/3343031.3350875","10.1145\/3343031"],"URL":"https:\/\/doi.org\/10.1145\/3343031.3350875","relation":{},"subject":[],"published":{"date-parts":[[2019,10,15]]},"assertion":[{"value":"2019-10-15","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}