{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T05:52:27Z","timestamp":1740117147148,"version":"3.37.3"},"reference-count":75,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2019YFE0105400","MC-201920-X01"],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["3072022JC0402","3072022JC0403"],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013804","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100013804","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62173103","62303129"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100005046","name":"Natural Science Foundation of Heilongjiang Province","doi-asserted-by":"publisher","award":["LH2023F022"],"id":[{"id":"10.13039\/501100005046","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Image and Vision Computing"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1016\/j.imavis.2023.104840","type":"journal-article","created":{"date-parts":[[2023,10,16]],"date-time":"2023-10-16T16:12:30Z","timestamp":1697472750000},"page":"104840","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":7,"special_numbering":"C","title":["Multi-modal spatial relational attention networks for visual question 
answering"],"prefix":"10.1016","volume":"140","author":[{"given":"Haibo","family":"Yao","sequence":"first","affiliation":[]},{"given":"Lipeng","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Chengtao","family":"Cai","sequence":"additional","affiliation":[]},{"given":"Yuxin","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Zhi","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Yongkang","family":"Luo","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.imavis.2023.104840_bb0005","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"17948","article-title":"Beyond a pre-trained object detector: Cross-modal textual and visual context for image captioning","author":"Kuo","year":"2022"},{"key":"10.1016\/j.imavis.2023.104840_bb0010","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"2286","article-title":"Dual-level collaborative transformer for image captioning","author":"Luo","year":"2021"},{"key":"10.1016\/j.imavis.2023.104840_bb0015","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"15640","article-title":"Negative-aware attention framework for image-text matching","author":"Zhang","year":"2022"},{"key":"10.1016\/j.imavis.2023.104840_bb0020","series-title":"2021 IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"1793","article-title":"Wasserstein coupled graph learning for cross-modal retrieval","author":"Wang","year":"2021"},{"key":"10.1016\/j.imavis.2023.104840_bb0025","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"9489","article-title":"Improving visual grounding with visual-linguistic verification and iterative reasoning","author":"Yang","year":"2022"},{"key":"10.1016\/j.imavis.2023.104840_bb0030","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"4682","article-title":"A fast and accurate one-stage approach to visual grounding","author":"Yang","year":"2019"},{"key":"10.1016\/j.imavis.2023.104840_bb0035","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2021.104316","article-title":"Aligning vision-language for graph inference in visual dialog","volume":"116","author":"Jiang","year":"2021","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2023.104840_bb0040","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"2425","article-title":"Vqa: Visual question answering","author":"Antol","year":"2015"},{"key":"10.1016\/j.imavis.2023.104840_bb0045","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2021.104328","article-title":"Vqa as a factoid question answering problem: a novel approach for knowledge-aware and explainable visual question answering","volume":"116","author":"Narayanan","year":"2021","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2023.104840_bb0050","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2020.103985","article-title":"From known to the unknown: transferring knowledge to answer questions about novel visual and semantic concepts","volume":"103","author":"Farazi","year":"2020","journal-title":"Image Vis. 
Comput."},{"key":"10.1016\/j.imavis.2023.104840_bb0055","series-title":"Proceedings of the IEEE Winter Conference on Applications of Computer Vision","first-page":"1114","article-title":"Dense but efficient videoqa for intricate compositional reasoning","author":"Lee","year":"2023"},{"key":"10.1016\/j.imavis.2023.104840_bb0060","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"4223","article-title":"Tips and tricks for visual question answering: Learnings from the 2017 challenge","author":"Teney","year":"2018"},{"key":"10.1016\/j.imavis.2023.104840_bb0065","series-title":"Advances in Neural Information Processing Systems","first-page":"275","article-title":"Chain of reasoning for visual question answering","author":"Wu","year":"2018"},{"key":"10.1016\/j.imavis.2023.104840_bb0070","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"5089","article-title":"Maintaining reasoning consistency in compositional visual question answering","author":"Jing","year":"2022"},{"key":"10.1016\/j.imavis.2023.104840_bb0075","series-title":"Advances in Neural Information Processing Systems","first-page":"1564","article-title":"Bilinear attention networks","author":"Kim","year":"2018"},{"key":"10.1016\/j.imavis.2023.104840_bb0080","series-title":"Advances in Neural Information Processing Systems","article-title":"Self-critical reasoning for robust visual question answering","author":"Wu","year":"2019"},{"key":"10.1016\/j.imavis.2023.104840_bb0085","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"2054","article-title":"Trar: Routing the attention spans in transformer for visual question answering","author":"Zhou","year":"2021"},{"key":"10.1016\/j.imavis.2023.104840_bb0090","series-title":"Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)","first-page":"1532","article-title":"Glove: Global vectors for word representation","author":"Pennington","year":"2014"},{"key":"10.1016\/j.imavis.2023.104840_bb0095","article-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling","author":"Chung","year":"2014","journal-title":"arXiv"},{"issue":"8","key":"10.1016\/j.imavis.2023.104840_bb0100","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","article-title":"Long short-term memory","volume":"9","author":"Hochreiter","year":"1997","journal-title":"Neural Comput."},{"key":"10.1016\/j.imavis.2023.104840_bb0105","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6077","article-title":"Bottom-up and top-down attention for image captioning and visual question answering","author":"Anderson","year":"2018"},{"key":"10.1016\/j.imavis.2023.104840_bb0110","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"91","article-title":"Re-attention for visual question answering","author":"Guo","year":"2020"},{"key":"10.1016\/j.imavis.2023.104840_bb0115","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2021.104291","article-title":"Multi-tier attention network using term-weighted question features for visual question answering","volume":"115","author":"Manmadhan","year":"2021","journal-title":"Image Vis. 
Comput."},{"key":"10.1016\/j.imavis.2023.104840_bb0120","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2020.103968","article-title":"Explaining vqa predictions using visual grounding and a knowledge base","volume":"101","author":"Riquelme","year":"2020","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2023.104840_bb0125","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"21","article-title":"Stacked attention networks for image question answering","author":"Yang","year":"2016"},{"key":"10.1016\/j.imavis.2023.104840_bb0130","series-title":"Advances in Neural Information Processing Systems","first-page":"289","article-title":"Hierarchical question-image co-attention for visual question answering","author":"Lu","year":"2016"},{"key":"10.1016\/j.imavis.2023.104840_bb0135","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"1839","article-title":"Multi-modal factorized bilinear pooling with co-attention learning for visual question answering","author":"Yu","year":"2017"},{"key":"10.1016\/j.imavis.2023.104840_bb0140","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6274","article-title":"Deep modular co-attention networks for visual question answering","author":"Yu","year":"2019"},{"issue":"1","key":"10.1016\/j.imavis.2023.104840_bb0145","doi-asserted-by":"crossref","first-page":"586","DOI":"10.1007\/s10489-022-03559-4","article-title":"Sparse co-attention visual question answering networks based on thresholds","volume":"53","author":"Guo","year":"2023","journal-title":"Appl. Intell."},{"issue":"6","key":"10.1016\/j.imavis.2023.104840_bb0150","doi-asserted-by":"crossref","DOI":"10.1371\/journal.pone.0287557","article-title":"Multi-modal adaptive gated mechanism for visual question answering","volume":"18","author":"Xu","year":"2023","journal-title":"PLoS One"},{"key":"10.1016\/j.imavis.2023.104840_bb0155","series-title":"Advances in Neural Information Processing Systems","first-page":"5999","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.imavis.2023.104840_bb0160","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6087","article-title":"Improved fusion of visual and language representations by dense symmetric co-attention for visual question answering","author":"Nguyen","year":"2018"},{"key":"10.1016\/j.imavis.2023.104840_bb0165","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6632","article-title":"Dynamic fusion with intra-and inter-modality attention flow for visual question answering","author":"Gao","year":"2019"},{"key":"10.1016\/j.imavis.2023.104840_bb0170","series-title":"European Conference on Computer Vision","first-page":"852","article-title":"Visual relationship detection with language priors","author":"Lu","year":"2016"},{"issue":"8\u20139","key":"10.1016\/j.imavis.2023.104840_bb0175","doi-asserted-by":"crossref","first-page":"2146","DOI":"10.1007\/s11263-020-01353-8","article-title":"Multi-task compositional network for visual relationship detection","volume":"128","author":"Zhan","year":"2020","journal-title":"Int. J. Comput. 
Vis."},{"key":"10.1016\/j.imavis.2023.104840_bb0180","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"3588","article-title":"Relation networks for object detection","author":"Hu","year":"2018"},{"key":"10.1016\/j.imavis.2023.104840_bb0185","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"3668","article-title":"Image retrieval using scene graphs","author":"Johnson","year":"2015"},{"key":"10.1016\/j.imavis.2023.104840_bb0190","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1307","article-title":"Mattnet: modular attention network for referring expression comprehension","author":"Yu","year":"2018"},{"key":"10.1016\/j.imavis.2023.104840_bb0195","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"8368","article-title":"Explainable and explicit visual reasoning over scene graphs","author":"Shi","year":"2019"},{"key":"10.1016\/j.imavis.2023.104840_bb0200","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2021.104281","article-title":"Multi-type decision fusion network for visual q&a","volume":"115","author":"Liu","year":"2021","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2023.104840_bb0205","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2021.104165","article-title":"Visual question answering model based on graph neural network and contextual attention","volume":"110","author":"Sharma","year":"2021","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2023.104840_bb0210","series-title":"Advances in Neural Information Processing Systems","first-page":"91","article-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","author":"Ren","year":"2015"},{"key":"10.1016\/j.imavis.2023.104840_bb0215","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.imavis.2023.104840_bb0220","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2015","journal-title":"3rd International Conference on Learning Representations"},{"key":"10.1016\/j.imavis.2023.104840_bb0225","series-title":"18th China National Conference on Computational Linguistics","first-page":"194","article-title":"How to fine-tune bert for text classification?","author":"Sun","year":"2019"},{"key":"10.1016\/j.imavis.2023.104840_bb0230","series-title":"Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)","first-page":"4171","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.imavis.2023.104840_bb0235","series-title":"Advances in Neural Information Processing Systems","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"Lu","year":"2019"},{"key":"10.1016\/j.imavis.2023.104840_bb0240","series-title":"Advances in Neural Information Processing Systems","article-title":"Vlmo: Unified vision-language pre-training with 
mixture-of-modality-experts","author":"Bao","year":"2022"},{"issue":"1","key":"10.1016\/j.imavis.2023.104840_bb0245","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","article-title":"Visual genome: connecting language and vision using crowdsourced dense image annotations","volume":"123","author":"Krishna","year":"2017","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.imavis.2023.104840_bb0250","series-title":"Proceedings of the 14th International Conference on Artificial Intelligence and Statistics","first-page":"315","article-title":"Deep sparse rectifier neural networks","author":"Glorot","year":"2011"},{"key":"10.1016\/j.imavis.2023.104840_bb0255","first-page":"1929","article-title":"Dropout: a simple way to prevent neural networks from overfitting","volume":"15","author":"Srivastava","year":"2014","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.imavis.2023.104840_bb0260","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6325","article-title":"Making the v in vqa matter: Elevating the role of image understanding in visual question answering","author":"Goyal","year":"2017"},{"key":"10.1016\/j.imavis.2023.104840_bb0265","series-title":"European Conference on Computer Vision","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.imavis.2023.104840_bb0270","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6693","article-title":"Gqa: A new dataset for real-world visual reasoning and compositional question answering","author":"Hudson","year":"2019"},{"key":"10.1016\/j.imavis.2023.104840_bb0275","series-title":"3rd International Conference on Learning Representations","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2015"},{"key":"10.1016\/j.imavis.2023.104840_bb0280","article-title":"Accurate, large minibatch sgd: Training imagenet in 1 hour","author":"Goyal","year":"2017","journal-title":"arXiv"},{"key":"10.1016\/j.imavis.2023.104840_bb0285","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1989","article-title":"Murel: Multimodal relational reasoning for visual question answering","author":"Cadene","year":"2019"},{"issue":"5","key":"10.1016\/j.imavis.2023.104840_bb0290","doi-asserted-by":"crossref","first-page":"2527","DOI":"10.1007\/s00530-023-01125-7","article-title":"Co-attention graph convolutional network for visual question answering","volume":"29","author":"Liu","year":"2023","journal-title":"Multimedia Systems."},{"key":"10.1016\/j.imavis.2023.104840_bb0295","series-title":"International Conference on Learning Representations","article-title":"Learning to count objects in natural images for visual question answering","author":"Zhang","year":"2018"},{"issue":"4","key":"10.1016\/j.imavis.2023.104840_bb0300","first-page":"1644","article-title":"Answer again: improving vqa with cascaded-answering model","volume":"34","author":"Peng","year":"2022","journal-title":"IEEE Trans. Knowledge Data Eng."},{"issue":"1","key":"10.1016\/j.imavis.2023.104840_bb0305","doi-asserted-by":"crossref","first-page":"318","DOI":"10.1109\/TPAMI.2020.3004830","article-title":"Mra-net: improving vqa via multi-modal relation attention network","volume":"44","author":"Peng","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"key":"10.1016\/j.imavis.2023.104840_bb0310","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"10312","article-title":"Relation-aware graph attention network for visual question answering","author":"Li","year":"2019"},{"key":"10.1016\/j.imavis.2023.104840_bb0315","doi-asserted-by":"crossref","DOI":"10.1016\/j.displa.2022.102329","article-title":"Lrb-net: improving vqa via division of labor strategy and multimodal classifiers","volume":"75","author":"Feng","year":"2022","journal-title":"Displays"},{"key":"10.1016\/j.imavis.2023.104840_bb0320","doi-asserted-by":"crossref","DOI":"10.1109\/TNNLS.2021.3135655","article-title":"Bilateral cross-modality graph matching attention for feature fusion in visual question answering","author":"Cao","year":"2022","journal-title":"IEEE Trans. Neural Netw. Syst."},{"key":"10.1016\/j.imavis.2023.104840_bb0325","doi-asserted-by":"crossref","DOI":"10.1007\/s11042-023-15418-6","article-title":"Oeca-net: a co-attention network for visual question answering based on ocr scene text feature enhancement","author":"Yan","year":"2023","journal-title":"Multimed. Tools Appl."},{"key":"10.1016\/j.imavis.2023.104840_bb0330","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"5824","article-title":"Multi-modality latent interaction network for visual question answering","author":"Gao","year":"2019"},{"key":"10.1016\/j.imavis.2023.104840_bb0335","series-title":"International Conference on Learning Representations","article-title":"Compositional attention networks for machine reasoning","author":"Hudson","year":"2018"},{"key":"10.1016\/j.imavis.2023.104840_bb0340","series-title":"International Conference on Image Processing","first-page":"1411","article-title":"Prior visual relationship reasoning for visual question answering","author":"Yang","year":"2020"},{"key":"10.1016\/j.imavis.2023.104840_bb0345","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"10293","article-title":"Language-conditioned graph networks for relational reasoning","author":"Hu","year":"2019"},{"key":"10.1016\/j.imavis.2023.104840_bb0350","series-title":"International Conference on Learning Representations","article-title":"Electra: Pre-training text encoders as discriminators rather than generators","author":"Clark","year":"2020"},{"key":"10.1016\/j.imavis.2023.104840_bb0355","series-title":"Advances in Neural Information Processing Systems","article-title":"Cross-lingual language model pretraining","author":"Conneau","year":"2019"},{"key":"10.1016\/j.imavis.2023.104840_bb0360","series-title":"Advances in Neural Information Processing Systems","article-title":"Mpnet: Masked and permuted pre-training for language understanding","author":"Song","year":"2020"},{"key":"10.1016\/j.imavis.2023.104840_bb0365","article-title":"Squeezebert: What can computer vision teach nlp about efficient neural networks?","author":"Iandola","year":"2020","journal-title":"arXiv"},{"key":"10.1016\/j.imavis.2023.104840_bb0370","series-title":"International Conference on Learning Representations","article-title":"Albert: A lite bert for self-supervised learning of language representations","author":"Lan","year":"2020"},{"author":"Radford","key":"10.1016\/j.imavis.2023.104840_bb0375"}],"container-title":["Image and Vision 
Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885623002147?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885623002147?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2023,12,5]],"date-time":"2023-12-05T21:20:56Z","timestamp":1701811256000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0262885623002147"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12]]},"references-count":75,"alternative-id":["S0262885623002147"],"URL":"https:\/\/doi.org\/10.1016\/j.imavis.2023.104840","relation":{},"ISSN":["0262-8856"],"issn-type":[{"type":"print","value":"0262-8856"}],"subject":[],"published":{"date-parts":[[2023,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Multi-modal spatial relational attention networks for visual question answering","name":"articletitle","label":"Article Title"},{"value":"Image and Vision Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.imavis.2023.104840","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2023 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"104840"}}