{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,14]],"date-time":"2024-09-14T07:11:48Z","timestamp":1726297908177},"reference-count":39,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2021,9,1]],"date-time":"2021-09-01T00:00:00Z","timestamp":1630454400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2021,4,15]],"date-time":"2021-04-15T00:00:00Z","timestamp":1618444800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100004543","name":"China Scholarship Council","doi-asserted-by":"publisher","award":["201703170183"],"id":[{"id":"10.13039\/501100004543","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001717","name":"Universiteit Leiden","doi-asserted-by":"publisher","award":["201703170183"],"id":[{"id":"10.13039\/501100001717","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004543","name":"China Scholarship Council","doi-asserted-by":"publisher","award":["201703170183"],"id":[{"id":"10.13039\/501100004543","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2021,9]]},"DOI":"10.1016\/j.patcog.2021.107983","type":"journal-article","created":{"date-parts":[[2021,4,8]],"date-time":"2021-04-08T01:26:42Z","timestamp":1617845202000},"page":"107983","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":19,"special_numbering":"C","title":["Integrating information theory and adversarial learning for cross-modal retrieval"],"prefix":"10.1016","volume":"117","author":[{"given":"Wei","family":"Chen","sequence":"first","affiliation":[]},{"given":"Yu","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Erwin M.","family":"Bakker","sequence":"additional","affiliation":[]},{"given":"Michael S.","family":"Lew","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2021.107983_bib0001","article-title":"A comprehensive survey on cross-modal retrieval","author":"Wang","year":"2016","journal-title":"arXiv preprint arXiv:1607.06215"},{"issue":"1","key":"10.1016\/j.patcog.2021.107983_bib0002","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/2906152","article-title":"Socializing the semantic gap: a comparative survey on image tag assignment, refinement, and retrieval","volume":"49","author":"Li","year":"2016","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.patcog.2021.107983_bib0003","doi-asserted-by":"crossref","first-page":"296","DOI":"10.1016\/j.patcog.2019.06.013","article-title":"Graph-based multimodal fusion with metric learning for multimodal classification","volume":"95","author":"Angelou","year":"2019","journal-title":"Pattern Recognit."},{"issue":"2","key":"10.1016\/j.patcog.2021.107983_bib0004","doi-asserted-by":"crossref","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","article-title":"Multimodal machine learning: a survey and taxonomy","volume":"41","author":"Baltru\u0161aitis","year":"2018","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2021.107983_bib0005","series-title":"Proc. BMVC","first-page":"1","article-title":"Vse++: Improving visual-semantic embeddings with hard negatives","author":"Faghri","year":"2018"},{"key":"10.1016\/j.patcog.2021.107983_bib0006","series-title":"Proc. ACM MM","first-page":"154","article-title":"Adversarial cross-modal retrieval","author":"Wang","year":"2017"},{"key":"10.1016\/j.patcog.2021.107983_bib0007","doi-asserted-by":"crossref","first-page":"365","DOI":"10.1016\/j.patcog.2019.05.008","article-title":"Cyclematch: a cycle-consistent embedding network for image-text matching","volume":"93","author":"Liu","year":"2019","journal-title":"Pattern Recognit."},{"issue":"3","key":"10.1016\/j.patcog.2021.107983_bib0008","doi-asserted-by":"crossref","first-page":"379","DOI":"10.1002\/j.1538-7305.1948.tb01338.x","article-title":"A mathematical theory of communication","volume":"27","author":"Shannon","year":"1948","journal-title":"Bell system technical journal"},{"key":"10.1016\/j.patcog.2021.107983_bib0009","doi-asserted-by":"crossref","first-page":"312","DOI":"10.1016\/j.patcog.2018.11.017","article-title":"Word spotting and recognition via a joint deep embedding of image and text","volume":"88","author":"Mhiri","year":"2019","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2021.107983_bib0010","doi-asserted-by":"crossref","first-page":"107479","DOI":"10.1016\/j.patcog.2020.107479","article-title":"Joint and individual matrix factorization hashing for large-scale cross-modal retrieval","author":"Wang","year":"2020","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2021.107983_bib0011","doi-asserted-by":"crossref","first-page":"107335","DOI":"10.1016\/j.patcog.2020.107335","article-title":"Modality-specific and shared generative adversarial network for cross-modal retrieval","author":"Wu","year":"2020","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2021.107983_bib0012","series-title":"Proc. ECCV","first-page":"686","article-title":"Deep cross-modal projection learning for image-text matching","author":"Zhang","year":"2018"},{"key":"10.1016\/j.patcog.2021.107983_bib0013","series-title":"Proc. IEEE CVPR","first-page":"2517","article-title":"Advent: Adversarial entropy minimization for domain adaptation in semantic segmentation","author":"Vu","year":"2019"},{"key":"10.1016\/j.patcog.2021.107983_bib0014","series-title":"Proc. IEEE ICME","first-page":"43","article-title":"Domain uncertainty based on information theory for cross-modal hash retrieval","author":"Chen","year":"2019"},{"key":"10.1016\/j.patcog.2021.107983_bib0015","series-title":"Proc. IEEE CVPR","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.patcog.2021.107983_bib0016","article-title":"Mobilenets: efficient convolutional neural networks for mobile vision applications","author":"Howard","year":"2017","journal-title":"arXiv preprint arXiv:1704.04861"},{"key":"10.1016\/j.patcog.2021.107983_bib0017","series-title":"Proc. ICANN","first-page":"799","article-title":"Bidirectional lstm networks for improved phoneme classification and recognition","author":"Graves","year":"2005"},{"key":"10.1016\/j.patcog.2021.107983_bib0018","doi-asserted-by":"crossref","first-page":"107523","DOI":"10.1016\/j.patcog.2020.107523","article-title":"A novel strategy to balance the results of cross-modal hashing","volume":"107","author":"Zhong","year":"2020","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2021.107983_bib0019","series-title":"Proc. ICML","first-page":"1321","article-title":"On calibration of modern neural networks","author":"Guo","year":"2017"},{"key":"10.1016\/j.patcog.2021.107983_bib0020","doi-asserted-by":"crossref","first-page":"853","DOI":"10.1613\/jair.3994","article-title":"Framing image description as a ranking task: data, models and evaluation metrics","volume":"47","author":"Hodosh","year":"2013","journal-title":"Journal of Artificial Intelligence Research"},{"key":"10.1016\/j.patcog.2021.107983_bib0021","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1162\/tacl_a_00166","article-title":"From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions","volume":"2","author":"Young","year":"2014","journal-title":"Trans. Association for Computational Linguistics"},{"key":"10.1016\/j.patcog.2021.107983_bib0022","series-title":"Proc. ECCV","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.patcog.2021.107983_bib0023","series-title":"Proc. IEEE CVPR","first-page":"1970","article-title":"Person search with natural language description","author":"Li","year":"2017"},{"key":"10.1016\/j.patcog.2021.107983_bib0024","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014","journal-title":"arXiv preprint arXiv:1409.1556"},{"key":"10.1016\/j.patcog.2021.107983_bib0025","series-title":"Proc. IEEE CVPR","first-page":"5005","article-title":"Learning deep structure-preserving image-text embeddings","author":"Wang","year":"2016"},{"key":"10.1016\/j.patcog.2021.107983_bib0026","series-title":"Proc. ICLR","article-title":"Deep captioning with multimodal recurrent neural networks (m-rnn)","author":"Mao","year":"2015"},{"key":"10.1016\/j.patcog.2021.107983_bib0027","series-title":"Proc. ECCV","first-page":"833","article-title":"Rnn fisher vectors for action recognition and image annotation","author":"Lev","year":"2016"},{"issue":"12","key":"10.1016\/j.patcog.2021.107983_bib0028","doi-asserted-by":"crossref","first-page":"3377","DOI":"10.1109\/TMM.2018.2832602","article-title":"Predicting visual features from text for image and video caption retrieval","volume":"20","author":"Dong","year":"2018","journal-title":"IEEE Trans. Multimedia"},{"key":"10.1016\/j.patcog.2021.107983_bib0029","series-title":"Proc. IEEE CVPR","first-page":"2310","article-title":"Instance-aware image and sentence matching with selective multimodal lstm","author":"Huang","year":"2017"},{"key":"10.1016\/j.patcog.2021.107983_bib0030","series-title":"Proc. IEEE ICCV","first-page":"4107","article-title":"Learning a recurrent residual fusion network for multimodal matching","author":"Liu","year":"2017"},{"issue":"2s","key":"10.1016\/j.patcog.2021.107983_bib0031","doi-asserted-by":"crossref","first-page":"56","DOI":"10.1145\/3314577","article-title":"Cross-modality retrieval by joint correlation learning","volume":"15","author":"Wang","year":"2019","journal-title":"ACM Trans. Multimedia Comput. Commun. Appl."},{"key":"10.1016\/j.patcog.2021.107983_bib0032","series-title":"Proc. IEEE ICCV","first-page":"5814","article-title":"Adversarial representation learning for text-to-image matching","author":"Sarafianos","year":"2019"},{"key":"10.1016\/j.patcog.2021.107983_bib0033","series-title":"Proc. IEEE CVPR","first-page":"299","article-title":"Dual attention networks for multimodal reasoning and matching","author":"Nam","year":"2017"},{"issue":"2","key":"10.1016\/j.patcog.2021.107983_bib0034","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3383184","article-title":"Dual-path convolutional image-text embeddings with instance loss","volume":"16","author":"Zheng","year":"2020","journal-title":"ACM Trans. Multimedia Comput. Commun. Appl."},{"key":"10.1016\/j.patcog.2021.107983_bib0035","series-title":"Proc. IEEE ICCV","first-page":"1890","article-title":"Identity-aware textual-visual matching with latent co-attention","author":"Li","year":"2017"},{"key":"10.1016\/j.patcog.2021.107983_bib0036","series-title":"Proc. ECCV","first-page":"54","article-title":"Improving deep visual representation for person re-identification by global and local image-language association","author":"Chen","year":"2018"},{"key":"10.1016\/j.patcog.2021.107983_bib0037","doi-asserted-by":"crossref","first-page":"5542","DOI":"10.1109\/TIP.2020.2984883","article-title":"Improving description-based person re-identification by multi-granularity image-text alignments","volume":"29","author":"Niu","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.patcog.2021.107983_bib0038","series-title":"Proc. IEEE CVPR","first-page":"4437","article-title":"Associating neural word embeddings with deep image representations using fisher vectors","author":"Klein","year":"2015"},{"key":"10.1016\/j.patcog.2021.107983_bib0039","series-title":"Proc. ICML","first-page":"1180","article-title":"Unsupervised domain adaptation by backpropagation","author":"Ganin","year":"2015"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320321001709?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320321001709?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2021,6,5]],"date-time":"2021-06-05T12:51:49Z","timestamp":1622897509000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320321001709"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,9]]},"references-count":39,"alternative-id":["S0031320321001709"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2021.107983","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2021,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Integrating information theory and adversarial learning for cross-modal retrieval","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2021.107983","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2021 The Authors. Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"107983"}}