{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T06:03:37Z","timestamp":1740117817810,"version":"3.37.3"},"reference-count":25,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2020,2,1]],"date-time":"2020-02-01T00:00:00Z","timestamp":1580515200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61472087"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003399","name":"Science and Technology Commission of Shanghai Municipality","doi-asserted-by":"publisher","award":["17511104203"],"id":[{"id":"10.13039\/501100003399","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2020,2]]},"DOI":"10.1016\/j.neucom.2019.10.103","type":"journal-article","created":{"date-parts":[[2019,11,5]],"date-time":"2019-11-05T12:11:58Z","timestamp":1572955918000},"page":"262-272","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":6,"special_numbering":"C","title":["Dynamic interaction networks for image-text multimodal learning"],"prefix":"10.1016","volume":"379","author":[{"given":"Wenshan","family":"Wang","sequence":"first","affiliation":[]},{"given":"Pengfei","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7550-3057","authenticated-orcid":false,"given":"Su","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Weishan","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"2","key":"10.1016\/j.neucom.2019.10.103_bib0001","doi-asserted-by":"crossref","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","article-title":"Multimodal machine learning: a survey and taxonomy","volume":"41","author":"Baltruaitis","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2019.10.103_sbref0002","series-title":"Proceedings of the Thirty-Second International Conference on Machine Learning","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","volume":"37","author":"Xu","year":"2015"},{"key":"10.1016\/j.neucom.2019.10.103_bib0003","series-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","first-page":"2425","article-title":"Vqa: visual question answering","author":"Antol","year":"2015"},{"key":"10.1016\/j.neucom.2019.10.103_bib0004","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"49","article-title":"Learning deep representations of fine-grained visual descriptions","author":"Reed","year":"2016"},{"key":"10.1016\/j.neucom.2019.10.103_bib0005","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"21","article-title":"Stacked attention networks for image question answering","author":"Yang","year":"2016"},{"key":"10.1016\/j.neucom.2019.10.103_sbref0006","series-title":"Proceedings of the Thirtieth International Conference on Neural Information Processing Systems","first-page":"289","article-title":"Hierarchical question-image co-attention for visual question answering","author":"Lu","year":"2016"},{"key":"10.1016\/j.neucom.2019.10.103_bib0007","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"2156","article-title":"Dual attention networks for multimodal reasoning and matching","author":"Nam","year":"2017"},{"key":"10.1016\/j.neucom.2019.10.103_bib0008","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"30","article-title":"Image question answering using convolutional neural network with dynamic parameter prediction","author":"Noh","year":"2016"},{"key":"10.1016\/j.neucom.2019.10.103_bib0009","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"151","article-title":"Question type guided attention in visual question answering","author":"Shi","year":"2018"},{"key":"10.1016\/j.neucom.2019.10.103_sbref0010","series-title":"Proceedings of the Advances in Neural Information Processing Systems","first-page":"5998","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.neucom.2019.10.103_bib0011","unstructured":"D. Ha, A. Dai, Q.V. Le, Hypernetworks, 2017. https:\/\/openreview.net\/pdf?id=rkpACe1lx."},{"key":"10.1016\/j.neucom.2019.10.103_sbref0011","series-title":"Proceedings of the Advances in Neural Information Processing Systems","first-page":"523","article-title":"Learning feed-forward one-shot learners","volume":"29","author":"Bertinetto","year":"2016"},{"key":"10.1016\/j.neucom.2019.10.103_sbref0012","doi-asserted-by":"crossref","DOI":"10.1049\/iet-cvi.2019.0361","article-title":"Neural aesthetic image reviewer","author":"Wang","year":"2019","journal-title":"IET Computer Vision"},{"key":"10.1016\/j.neucom.2019.10.103_bib0014","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.neucom.2019.10.103_bib0015","series-title":"Proceedings of the EMNLP","first-page":"1532","article-title":"Glove: Global vectors for word representation.","volume":"14","author":"Pennington","year":"2014"},{"key":"10.1016\/j.neucom.2019.10.103_bib0016","first-page":"1","article-title":"Stat: spatial-temporal attention mechanism for video captioning","author":"Yan","year":"2019","journal-title":"IEEE Trans. Multimed."},{"issue":"10","key":"10.1016\/j.neucom.2019.10.103_bib0017","doi-asserted-by":"crossref","first-page":"2675","DOI":"10.1109\/TMM.2019.2903448","article-title":"Cross-modality bridging and knowledge transferring for image understanding","volume":"21","author":"Yan","year":"2019","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.neucom.2019.10.103_sbref0017","series-title":"Proceedings of the Advances in Neural Information Processing Systems","first-page":"2204","article-title":"Recurrent models of visual attention","volume":"27","author":"Mnih","year":"2014"},{"key":"10.1016\/j.neucom.2019.10.103_sbref0018","series-title":"Proceedings of the Thirtieth International Conference on Neural Information Processing Systems","first-page":"667","article-title":"Dynamic filter networks","author":"De Brabandere","year":"2016"},{"key":"10.1016\/j.neucom.2019.10.103_bib0020","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1575","article-title":"Meta-SR: a magnification-arbitrary network for super-resolution","author":"Hu","year":"2019"},{"key":"10.1016\/j.neucom.2019.10.103_bib0021","series-title":"The IEEE International Conference on Computer Vision (ICCV)","article-title":"Metapruning: meta learning for automatic neural network channel pruning","author":"Liu","year":"2019"},{"key":"10.1016\/j.neucom.2019.10.103_bib0022","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","article-title":"Dynamic conditional networks for few-shot learning","author":"Zhao","year":"2018"},{"key":"10.1016\/j.neucom.2019.10.103_bib0023","series-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"30","article-title":"Image question answering using convolutional neural network with dynamic parameter prediction","author":"Noh","year":"2016"},{"key":"10.1016\/j.neucom.2019.10.103_bib0024","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"4840","article-title":"A dynamic convolutional layer for short rangeweather prediction","author":"Klein","year":"2015"},{"key":"10.1016\/j.neucom.2019.10.103_bib0025","series-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, (Long and Short Papers)","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","volume":"Vol. 1","author":"Devlin","year":"2019"}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231219315462?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231219315462?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2020,1,13]],"date-time":"2020-01-13T18:31:56Z","timestamp":1578940316000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231219315462"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,2]]},"references-count":25,"alternative-id":["S0925231219315462"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2019.10.103","relation":{},"ISSN":["0925-2312"],"issn-type":[{"type":"print","value":"0925-2312"}],"subject":[],"published":{"date-parts":[[2020,2]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Dynamic interaction networks for image-text multimodal learning","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2019.10.103","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2019 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}]}}