{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,14]],"date-time":"2024-09-14T02:40:34Z","timestamp":1726281634595},"reference-count":58,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2021,9,1]],"date-time":"2021-09-01T00:00:00Z","timestamp":1630454400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2021,9,1]],"date-time":"2021-09-01T00:00:00Z","timestamp":1630454400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2021,9,1]],"date-time":"2021-09-01T00:00:00Z","timestamp":1630454400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2021,9,1]],"date-time":"2021-09-01T00:00:00Z","timestamp":1630454400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2021,9,1]],"date-time":"2021-09-01T00:00:00Z","timestamp":1630454400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,9,1]],"date-time":"2021-09-01T00:00:00Z","timestamp":1630454400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100004826","name":"Beijing Natural Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004826","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2021,9]]},"DOI":"10.1016\/j.patcog.2021.107956","type":"journal-article","created":{"date-parts":[[2021,4,8]],"date-time":"2021-04-08T20:41:46Z","timestamp":1617914506000},"page":"107956","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":41,"special_numbering":"C","title":["Dual self-attention with co-attention networks for visual question answering"],"prefix":"10.1016","volume":"117","author":[{"given":"Yun","family":"Liu","sequence":"first","affiliation":[]},{"given":"Xiaoming","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0002-2147-4059","authenticated-orcid":false,"given":"Qianyun","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Chaozhuo","family":"Li","sequence":"additional","affiliation":[]},{"given":"Feiran","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Xianghong","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Zhoujun","family":"Li","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2021.107956_bib0001","series-title":"Proceedings of IEEE International Conference on Computer Vision","first-page":"2425","article-title":"VQA: visual question answering","author":"Antol","year":"2015"},{"key":"10.1016\/j.patcog.2021.107956_bib0002","series-title":"Proceedings of the Neural Information Processing Systems Conference","first-page":"1682","article-title":"A multi-world approach to question answering about real-world scenes based on uncertain input","author":"Malinowski","year":"2014"},{"key":"10.1016\/j.patcog.2021.107956_bib0003","series-title":"Proceedings of the Neural Information Processing Systems Conference","first-page":"2953","article-title":"Exploring models and data for image question answering","author":"Ren","year":"2015"},{"key":"10.1016\/j.patcog.2021.107956_bib0004","series-title":"The Thirty-Third AAAI Conference on Artificial Intelligence","first-page":"9324","article-title":"Dynamic capsule attention for visual question answering","author":"Zhou","year":"2019"},{"key":"10.1016\/j.patcog.2021.107956_bib0005","series-title":"Proceedings of the 2017 ACM on Conference on Information and Knowledge Management","first-page":"2439","article-title":"Knowledge-based question answering by jointly generating, copying and paraphrasing","author":"Zhu","year":"2015"},{"key":"10.1016\/j.patcog.2021.107956_bib0006","doi-asserted-by":"crossref","unstructured":"A. Bordes, J. Weston, N. Usunier, Open question answering with weakly supervised embedding models, in: Proceedings of the Joint European Conference on Machine Learning and Knowledge Discovery in Databases, pp. 165\u2013180.","DOI":"10.1007\/978-3-662-44848-9_11"},{"key":"10.1016\/j.patcog.2021.107956_bib0007","series-title":"Proceedings of the Thirtieth AAAI Conference on Artificial Intelligence","first-page":"294","article-title":"Online cross-modal hashing for web image retrieval","author":"Xie","year":"2016"},{"key":"10.1016\/j.patcog.2021.107956_bib0008","series-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","first-page":"5187","article-title":"Person search with natural language description","author":"Li","year":"2017"},{"issue":"2","key":"10.1016\/j.patcog.2021.107956_bib0009","doi-asserted-by":"crossref","first-page":"739","DOI":"10.1109\/TIP.2018.2860898","article-title":"Semi-supervised metric learning-based anchor graph hashing for large-scale image retrieval","volume":"28","author":"Hu","year":"2019","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.patcog.2021.107956_bib0010","series-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","first-page":"3128","article-title":"Deep visual-semantic alignments for generating image descriptions","author":"Karpathy","year":"2015"},{"key":"10.1016\/j.patcog.2021.107956_bib0011","series-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","first-page":"3156","article-title":"Show and tell: a neural image caption generator","author":"Vinyals","year":"2015"},{"issue":"1","key":"10.1016\/j.patcog.2021.107956_bib0012","doi-asserted-by":"crossref","DOI":"10.1609\/aaai.v32i1.12283","article-title":"Learning to guide decoding for image captioning","volume":"32","author":"Jiang","year":"2018","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"10.1016\/j.patcog.2021.107956_bib0013","series-title":"2018 4th International Conference on Electrical Engineering and Information & Communication Technology (iCEEiCT)","first-page":"118","article-title":"Study and observation of the variations of accuracies for handwritten digits recognition with various hidden layers and epochs using neural network algorithm","author":"Siddique","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0014","series-title":"2018 4th International Conference on Electrical Engineering and Information & Communication Technology (iCEEiCT)","first-page":"112","article-title":"Study and observation of the variations of accuracies for handwritten digits recognition with various hidden layers and epochs using convolutional neural network","author":"Arif","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0015","series-title":"Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining","first-page":"1880","article-title":"R-VQA: learning visual relation facts with semantic attention for visual question answering","author":"Lu","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0016","series-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","first-page":"21","article-title":"Stacked attention networks for image question answering","author":"Yang","year":"2016"},{"key":"10.1016\/j.patcog.2021.107956_bib0017","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"7680","article-title":"Differential attention for visual question answering","author":"Patro","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0018","series-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","first-page":"2156","article-title":"Dual attention networks for multimodal reasoning and matching","author":"Nam","year":"2017"},{"key":"10.1016\/j.patcog.2021.107956_bib0019","first-page":"1","article-title":"ALSA: adversarial learning of supervised attentions for visual question answering.","author":"Liu","year":"2020","journal-title":"IEEE Trans. Cybern."},{"key":"10.1016\/j.patcog.2021.107956_bib0020","series-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","first-page":"4622","article-title":"Ask me anything: free-form visual question answering based on knowledge from external sources","author":"Wu","year":"2016"},{"key":"10.1016\/j.patcog.2021.107956_bib0021","unstructured":"Q. Wu, C. Shen, A. van den Hengel, P. Wang, A.R. Dick, Image captioning and visual question answering based on attributes and their related external knowledge, CoRR abs\/1603.02814(2016b)."},{"key":"10.1016\/j.patcog.2021.107956_bib0022","series-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6146","article-title":"Knowledge acquisition for visual question answering via iterative querying","author":"Zhu","year":"2017"},{"key":"10.1016\/j.patcog.2021.107956_bib0023","series-title":"The Thirty-Third AAAI Conference on Artificial Intelligence","first-page":"8876","article-title":"KVQA: knowledge-aware visual question answering","author":"Shah","year":"2019"},{"key":"10.1016\/j.patcog.2021.107956_bib0024","series-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing","first-page":"457","article-title":"Multimodal compact bilinear pooling for visual question answering and visual grounding","author":"Fukui","year":"2016"},{"key":"10.1016\/j.patcog.2021.107956_bib0025","series-title":"Proceedings of 5th International Conference on Learning Representations","article-title":"Hadamard product for low-rank bilinear pooling","author":"Kim","year":"2017"},{"key":"10.1016\/j.patcog.2021.107956_bib0026","series-title":"Proceedings of IEEE International Conference on Computer Vision","first-page":"2631","article-title":"Mutan: multimodal tucker fusion for visual question answering","author":"Ben-younes","year":"2017"},{"issue":"02","key":"10.1016\/j.patcog.2021.107956_bib0027","doi-asserted-by":"crossref","first-page":"107","DOI":"10.1142\/S0218488598000094","article-title":"The vanishing gradient problem during learning recurrent neural nets and problem solutions","volume":"6","author":"Hochreiter","year":"1998","journal-title":"Int. J. Uncertain. Fuzziness Knowledge-Based Syst."},{"issue":"8","key":"10.1016\/j.patcog.2021.107956_bib0028","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","article-title":"Long short-term memory","volume":"9","author":"Hochreiter","year":"1997","journal-title":"Neural Comput."},{"key":"10.1016\/j.patcog.2021.107956_bib0029","unstructured":"J. Chung, C. Gulcehre, K. Cho, Y. Bengio, Empirical evaluation of gated recurrent neural networks on sequence modeling, arXiv preprint arXiv:1412.3555(2014)."},{"key":"10.1016\/j.patcog.2021.107956_bib0030","series-title":"Proceedings of Advances in Neural Information Processing Systems","first-page":"1097","article-title":"Imagenet classification with deep convolutional neural networks","author":"Krizhevsky","year":"2012"},{"key":"10.1016\/j.patcog.2021.107956_bib0031","series-title":"Proceedings of Advances in Neural Information Processing Systems","first-page":"5998","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.patcog.2021.107956_bib0032","series-title":"Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies,(Short Papers)","first-page":"464","article-title":"Self-attention with relative position representations","author":"Shaw","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0033","series-title":"Proceedings of 6th International Conference on Learning Representations","article-title":"Bi-directional block self-attention for fast and memory-efficient sequence modeling","author":"Shen","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0034","series-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","first-page":"30","article-title":"Image question answering using convolutional neural network with dynamic parameter prediction","author":"Noh","year":"2016"},{"key":"10.1016\/j.patcog.2021.107956_bib0035","series-title":"Proceedings of the 27th ACM International Conference on Information and Knowledge Management","first-page":"1013","article-title":"Adversarial learning of answer-related representation for visual question answering","author":"Liu","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0036","series-title":"Proceedings of the Thirtieth AAAI Conference on Artificial Intelligence","first-page":"3567","article-title":"Learning to answer questions from image using convolutional neural network","author":"Ma","year":"2016"},{"key":"10.1016\/j.patcog.2021.107956_bib0037","first-page":"1","article-title":"Adversarial learning with multi-modal attention for visual question answering","author":"Liu","year":"2020","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.patcog.2021.107956_bib0038","series-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","first-page":"21","article-title":"Multi-level attention networks for visual question answering","author":"Yu","year":"2017"},{"key":"10.1016\/j.patcog.2021.107956_bib0039","series-title":"Proceedings of the International Joint Conference on Artificial Intelligence","first-page":"906","article-title":"From pixels to objects: Cubic visual attention for visual question answering","author":"Song","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0040","series-title":"Proceedings of the ACM Multimedia Conference","first-page":"519","article-title":"Object-difference attention: a simple relational attention for visual question answering","author":"Wu","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0041","unstructured":"P. Wang, Q. Wu, C. Shen, A.v. d. Hengel, A. Dick, Explicit knowledge-based reasoning for visual question answering, arXiv preprint arXiv:1511.02570(2015)."},{"key":"10.1016\/j.patcog.2021.107956_bib0042","series-title":"Proceedings of International Joint Conference on Artificial Intelligence","first-page":"4216","article-title":"Feature enhancement in attention for visual question answering","author":"Lin","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0043","series-title":"Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence","article-title":"Disan: directional self-attention network for RNN\/CNN-free language understanding","author":"Shen","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0044","series-title":"Prodeedings of the ACM Multimedia Conference","first-page":"447","article-title":"CSAN: contextual self-attention network for user sequential recommendation","author":"Huang","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0045","doi-asserted-by":"crossref","unstructured":"J. Fu, J. Liu, H. Tian, Z. Fang, H. Lu, Dual attention network for scene segmentation, arXiv preprint arXiv:1809.02983(2018).","DOI":"10.1109\/CVPR.2019.00326"},{"key":"10.1016\/j.patcog.2021.107956_bib0046","series-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing","first-page":"1532","article-title":"Glove: global vectors for word representation","author":"Pennington","year":"2014"},{"key":"10.1016\/j.patcog.2021.107956_bib0047","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.patcog.2021.107956_bib0048","series-title":"Proceedings of Advances in Neural Information Processing Systems","first-page":"289","article-title":"Hierarchical question-image co-attention for visual question answering","volume":"29","author":"Lu","year":"2016"},{"key":"10.1016\/j.patcog.2021.107956_bib0049","series-title":"Medical Imaging 2020: Image-Guided Procedures, Robotic Interventions, and Modeling","first-page":"113151R","article-title":"Preoperative angular insertion depth prediction in case of lateral wall cochlear implant electrode arrays","volume":"11315","author":"Khan","year":"2020"},{"key":"10.1016\/j.patcog.2021.107956_bib0050","series-title":"Medical Imaging 2020: Image-Guided Procedures, Robotic Interventions, and Modeling","first-page":"113152U","article-title":"Preoperative prediction of insertion depth of lateral wall cochlear implant electrode arrays","volume":"11315","author":"Khan","year":"2020"},{"key":"10.1016\/j.patcog.2021.107956_bib0051","doi-asserted-by":"crossref","unstructured":"Nguyen D K, Okatani T. Improved fusion of visual and language representations by dense symmetric co-attention for visual question answering. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2018: 6087-6096.","DOI":"10.1109\/CVPR.2018.00637"},{"key":"10.1016\/j.patcog.2021.107956_bib0052","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"1821","article-title":"Multi-modal factorized bilinear pooling with co-attention learning for visual question answering","author":"Yu","year":"2017"},{"key":"10.1016\/j.patcog.2021.107956_bib0053","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6325","article-title":"Making the V in VQA matter: elevating the role of image understanding in visual question answering","author":"Goyal","year":"2017"},{"key":"10.1016\/j.patcog.2021.107956_bib0054","series-title":"Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence","article-title":"Co-attending free-form regions and detections with multi-modal multiplicative feature embedding for visual question answering","author":"Lu","year":"2018"},{"key":"10.1016\/j.patcog.2021.107956_bib0055","series-title":"Proceedings of the 32nd Annual Meeting on Association for Computational Linguistics","first-page":"133","article-title":"Verb semantics and lexical selection","author":"Wu","year":"1994"},{"key":"10.1016\/j.patcog.2021.107956_bib0056","series-title":"Proceedings of the 33nd International Conference on Machine Learning","first-page":"2397","article-title":"Dynamic memory networks for visual and textual question answering","author":"Xiong","year":"2016"},{"key":"10.1016\/j.patcog.2021.107956_bib0057","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"7736","article-title":"Learning visual knowledge memory networks for visual question answering","author":"Su","year":"2018"},{"issue":"1","key":"10.1016\/j.patcog.2021.107956_bib0058","doi-asserted-by":"crossref","first-page":"86","DOI":"10.1214\/aoms\/1177731944","article-title":"A comparison of alternative tests of significance for the problem of m rankings,","volume":"11","author":"Friedman","year":"1940","journal-title":"Ann. Math. Stat."}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320321001436?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320321001436?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2022,12,30]],"date-time":"2022-12-30T05:22:02Z","timestamp":1672377722000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320321001436"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,9]]},"references-count":58,"alternative-id":["S0031320321001436"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2021.107956","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2021,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Dual self-attention with co-attention networks for visual question answering","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2021.107956","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2021 Elsevier Ltd. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"107956"}}