{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,19]],"date-time":"2024-11-19T18:46:39Z","timestamp":1732041999755},"reference-count":51,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/100012542","name":"Sichuan Province Science and Technology Support Program","doi-asserted-by":"publisher","award":["2019ZDZX0008"],"id":[{"id":"10.13039\/100012542","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61976049","62172079","62222203"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004806","name":"Fok Ying Tong Education Foundation","doi-asserted-by":"publisher","award":["171106"],"id":[{"id":"10.13039\/501100004806","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Information Fusion"],"published-print":{"date-parts":[[2023,3]]},"DOI":"10.1016\/j.inffus.2022.10.013","type":"journal-article","created":{"date-parts":[[2022,10,20]],"date-time":"2022-10-20T05:20:06Z","timestamp":1666243206000},"page":"327-337","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":11,"special_numbering":"C","title":["TFUN: Trilinear Fusion Network for Ternary Image-Text Retrieval"],"prefix":"10.1016","volume":"91","author":[{"ORCID":"http:\/\/orcid.org\/0000-0001-5685-3123","authenticated-orcid":false,"given":"Xing","family":"Xu","sequence":"first","affiliation":[]},{"given":"Jialiang","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Zuo","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Yin","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Xiaofeng","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Heng Tao","family":"Shen","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.inffus.2022.10.013_b1","series-title":"International Conference on Computer Vision and Image Processing","first-page":"75","article-title":"Visual question answering using deep learning: A survey and performance analysis","author":"Srivastava","year":"2020"},{"key":"10.1016\/j.inffus.2022.10.013_b2","doi-asserted-by":"crossref","first-page":"268","DOI":"10.1016\/j.inffus.2019.03.005","article-title":"Information fusion in visual question answering: A survey","volume":"52","author":"Zhang","year":"2019","journal-title":"Inf. Fusion"},{"issue":"6","key":"10.1016\/j.inffus.2022.10.013_b3","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3295748","article-title":"A comprehensive survey of deep learning for image captioning","volume":"51","author":"Hossain","year":"2019","journal-title":"ACM Comput. Surv. (CsUR)"},{"key":"10.1016\/j.inffus.2022.10.013_b4","doi-asserted-by":"crossref","first-page":"233","DOI":"10.1016\/j.inffus.2021.07.008","article-title":"Explain and improve: LRP-inference fine-tuning for image captioning models","volume":"77","author":"Sun","year":"2022","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.inffus.2022.10.013_b5","doi-asserted-by":"crossref","first-page":"149","DOI":"10.1016\/j.inffus.2021.07.009","article-title":"Multimodal research in vision and language: A review of current and emerging trends","volume":"77","author":"Uppal","year":"2022","journal-title":"Inf. Fusion"},{"issue":"1","key":"10.1016\/j.inffus.2022.10.013_b6","doi-asserted-by":"crossref","first-page":"23","DOI":"10.1007\/s11263-020-01359-2","article-title":"Image matching from handcrafted to deep features: A survey","volume":"129","author":"Ma","year":"2021","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.inffus.2022.10.013_b7","first-page":"1","article-title":"Joint feature synthesis and embedding: Adversarial cross-modal retrieval revisited","author":"Xu","year":"2020","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.inffus.2022.10.013_b8","series-title":"2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR","first-page":"3068","article-title":"Learning cross-modal embeddings for cooking recipes and food images","author":"Salvador","year":"2017"},{"key":"10.1016\/j.inffus.2022.10.013_b9","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1520","article-title":"Memory-augmented attribute manipulation networks for interactive fashion search","author":"Zhao","year":"2017"},{"year":"2020","series-title":"Cross-modal food retrieval: Learning a joint embedding of food images and recipes with semantic consistency and attention mechanism","author":"Wang","key":"10.1016\/j.inffus.2022.10.013_b10"},{"key":"10.1016\/j.inffus.2022.10.013_b11","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"6439","article-title":"Composing text and image for image retrieval-an empirical odyssey","author":"Vo","year":"2019"},{"key":"10.1016\/j.inffus.2022.10.013_b12","series-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","first-page":"1140","article-title":"Compositional learning of image-text query for image retrieval","author":"Anwaar","year":"2021"},{"key":"10.1016\/j.inffus.2022.10.013_b13","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3001","article-title":"Image search with text feedback by visiolinguistic attention learning","author":"Chen","year":"2020"},{"year":"2015","series-title":"Adaptive Control Processes: A Guided Tour","author":"Bellman","key":"10.1016\/j.inffus.2022.10.013_b14"},{"key":"10.1016\/j.inffus.2022.10.013_b15","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"392","article-title":"Compact trilinear interaction for visual question answering","author":"Do","year":"2019"},{"key":"10.1016\/j.inffus.2022.10.013_b16","series-title":"European Conference on Computer Vision","first-page":"126","article-title":"Hard negative examples are hard, but useful","author":"Xuan","year":"2020"},{"key":"10.1016\/j.inffus.2022.10.013_b17","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2020.107795","article-title":"Fast hard negative mining for deep metric learning","volume":"112","author":"Gaji\u0107","year":"2021","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.inffus.2022.10.013_b18","series-title":"IEEE International Conference on Computer Vision, ICCV 2017","first-page":"1472","article-title":"Automatic spatially-aware fashion concept discovery","author":"Han","year":"2017"},{"year":"2019","series-title":"The fashion IQ dataset: Retrieving images by combining side information and relative natural language feedback","author":"Guo","key":"10.1016\/j.inffus.2022.10.013_b19"},{"key":"10.1016\/j.inffus.2022.10.013_b20","series-title":"Proceedings of the European Conference on Computer Vision","first-page":"169","article-title":"Attributes as operators: factorizing unseen attribute-object compositions","author":"Nagarajan","year":"2018"},{"issue":"12","key":"10.1016\/j.inffus.2022.10.013_b21","doi-asserted-by":"crossref","first-page":"5412","DOI":"10.1109\/TNNLS.2020.2967597","article-title":"Cross-modal attention with semantic consistence for image\u2013text matching","volume":"31","author":"Xu","year":"2020","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.inffus.2022.10.013_b22","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10921","article-title":"Graph structured network for image-text matching","author":"Liu","year":"2020"},{"key":"10.1016\/j.inffus.2022.10.013_b23","series-title":"European Conference on Computer Vision","first-page":"104","article-title":"Uniter: Universal image-text representation learning","author":"Chen","year":"2020"},{"key":"10.1016\/j.inffus.2022.10.013_b24","series-title":"2018 ACM Multimedia Conference on Multimedia Conference, MM","first-page":"1020","article-title":"Deep understanding of cooking procedure for cross-modal recipe retrieval","author":"Chen","year":"2018"},{"key":"10.1016\/j.inffus.2022.10.013_b25","series-title":"2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR","first-page":"14558","article-title":"MCEN: Bridging cross-modal gap between cooking recipes and dish images with latent variable model","author":"Fu","year":"2020"},{"year":"2015","series-title":"Improved semantic representations from tree-structured long short-term memory networks","author":"Tai","key":"10.1016\/j.inffus.2022.10.013_b26"},{"year":"2021","series-title":"CHEF: Cross-modal hierarchical embeddings for food domain retrieval","author":"Pham","key":"10.1016\/j.inffus.2022.10.013_b27"},{"key":"10.1016\/j.inffus.2022.10.013_b28","series-title":"IEEE Conference on Computer Vision and Pattern Recognition, CVPR","first-page":"11477","article-title":"R2GAN: Cross-modal recipe retrieval with generative adversarial network","author":"Zhu","year":"2019"},{"key":"10.1016\/j.inffus.2022.10.013_b29","series-title":"IEEE Conference on Computer Vision and Pattern Recognition, CVPR","first-page":"11572","article-title":"Learning cross-modal embeddings with adversarial networks for cooking recipes and food images","author":"Wang","year":"2019"},{"key":"10.1016\/j.inffus.2022.10.013_b30","series-title":"Proceedings of the 30th ACM International Conference on Information & Knowledge Management","first-page":"2221","article-title":"Learning joint embedding with modality alignments for cross-modal retrieval of recipes and food images","author":"Xie","year":"2021"},{"year":"2021","series-title":"Revamping cross-modal recipe retrieval with hierarchical transformers and self-supervised learning","author":"Salvador","key":"10.1016\/j.inffus.2022.10.013_b31"},{"year":"2020","series-title":"CurlingNet: Compositional learning between images and text for fashion IQ data","author":"Yu","key":"10.1016\/j.inffus.2022.10.013_b32"},{"key":"10.1016\/j.inffus.2022.10.013_b33","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXII 16","first-page":"136","article-title":"Learning joint visual semantic matching embeddings for language-guided retrieval","author":"Chen","year":"2020"},{"key":"10.1016\/j.inffus.2022.10.013_b34","series-title":"2021 IEEE 8th International Conference on Data Science and Advanced Analytics","first-page":"1","article-title":"Learning fashion similarity based on hierarchical attribute embedding","author":"Yan","year":"2021"},{"issue":"3","key":"10.1016\/j.inffus.2022.10.013_b35","doi-asserted-by":"crossref","first-page":"279","DOI":"10.1007\/BF02289464","article-title":"Some mathematical notes on three-mode factor analysis","volume":"31","author":"Tucker","year":"1966","journal-title":"Psychometrika"},{"year":"1970","series-title":"Foundations of the PARAFAC procedure: Models and conditions for an\u201d explanatory\u201d multimodal factor analysis","author":"Harshman","key":"10.1016\/j.inffus.2022.10.013_b36"},{"issue":"7\u20138","key":"10.1016\/j.inffus.2022.10.013_b37","doi-asserted-by":"crossref","first-page":"324","DOI":"10.1002\/cem.1206","article-title":"Modeling multi-way data with linearly dependent loadings","volume":"23","author":"Bro","year":"2009","journal-title":"J. Chemom.: J. Chemom. Soc."},{"key":"10.1016\/j.inffus.2022.10.013_b38","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"21","article-title":"Stacked attention networks for image question answering","author":"Yang","year":"2016"},{"issue":"3","key":"10.1016\/j.inffus.2022.10.013_b39","doi-asserted-by":"crossref","first-page":"455","DOI":"10.1137\/07070111X","article-title":"Tensor decompositions and applications","volume":"51","author":"Kolda","year":"2009","journal-title":"SIAM Rev."},{"year":"2019","series-title":"Dividing and conquering cross-modal recipe retrieval: from nearest neighbours baselines to sota","author":"Fain","key":"10.1016\/j.inffus.2022.10.013_b40"},{"key":"10.1016\/j.inffus.2022.10.013_b41","series-title":"Proceedings of the 27th ACM International Conference on Multimedia","first-page":"12","article-title":"Matching images and text with multi-modal tensor fusion and re-ranking","author":"Wang","year":"2019"},{"key":"10.1016\/j.inffus.2022.10.013_b42","series-title":"European Conference on Computer Vision","first-page":"549","article-title":"Adaptive offline quintuplet loss for image-text matching","author":"Chen","year":"2020"},{"key":"10.1016\/j.inffus.2022.10.013_b43","series-title":"Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining","first-page":"2553","article-title":"Embedding-based retrieval in facebook search","author":"Huang","year":"2020"},{"key":"10.1016\/j.inffus.2022.10.013_b44","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"815","article-title":"Facenet: A unified embedding for face recognition and clustering","author":"Schroff","year":"2015"},{"key":"10.1016\/j.inffus.2022.10.013_b45","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"2840","article-title":"Sampling matters in deep embedding learning","author":"Wu","year":"2017"},{"year":"2017","series-title":"In defense of the triplet loss for person re-identification","author":"Hermans","key":"10.1016\/j.inffus.2022.10.013_b46"},{"key":"10.1016\/j.inffus.2022.10.013_b47","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"issue":"8","key":"10.1016\/j.inffus.2022.10.013_b48","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","article-title":"Long short-term memory","volume":"9","author":"Hochreiter","year":"1997","journal-title":"Neural Comput."},{"year":"2014","series-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling","author":"Chung","key":"10.1016\/j.inffus.2022.10.013_b49"},{"key":"10.1016\/j.inffus.2022.10.013_b50","series-title":"International ACM SIGIR Conference on Research & Development in Information Retrieval, SIGIR","first-page":"35","article-title":"Cross-modal retrieval in the cooking context: Learning semantic text-image embeddings","author":"Carvalho","year":"2018"},{"key":"10.1016\/j.inffus.2022.10.013_b51","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"3156","article-title":"Show and tell: A neural image caption generator","author":"Vinyals","year":"2015"}],"container-title":["Information Fusion"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1566253522001828?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1566253522001828?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,5,24]],"date-time":"2024-05-24T16:32:21Z","timestamp":1716568341000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1566253522001828"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,3]]},"references-count":51,"alternative-id":["S1566253522001828"],"URL":"https:\/\/doi.org\/10.1016\/j.inffus.2022.10.013","relation":{},"ISSN":["1566-2535"],"issn-type":[{"type":"print","value":"1566-2535"}],"subject":[],"published":{"date-parts":[[2023,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"TFUN: Trilinear Fusion Network for Ternary Image-Text Retrieval","name":"articletitle","label":"Article Title"},{"value":"Information Fusion","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.inffus.2022.10.013","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2022 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}]}}