{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T10:01:39Z","timestamp":1740132099681,"version":"3.37.3"},"reference-count":52,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001381","name":"National Research Foundation Singapore","doi-asserted-by":"publisher","award":["AISG-RP-2018-003"],"id":[{"id":"10.13039\/501100001381","id-type":"DOI","asserted-by":"publisher"}]},{"name":"MOE Tier-1","award":["RG28\/18 (S)","RG22\/19 (S)","RG95\/20"]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102182","61976116","61905114"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004608","name":"Natural Science Foundation of Jiangsu Province","doi-asserted-by":"publisher","award":["BK20210327"],"id":[{"id":"10.13039\/501100004608","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["30920021135"],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key R&D Program of China","award":["2021YFF0602101"]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2023,6,1]]},"DOI":"10.1109\/tpami.2021.3139957","type":"journal-article","created":{"date-parts":[[2022,1,4]],"date-time":"2022-01-04T20:33:23Z","timestamp":1641328403000},"page":"6807-6819","source":"Crossref","is-referenced-by-count":9,"title":["Depth and Video Segmentation Based Visual Attention for Embodied Question Answering"],"prefix":"10.1109","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9121-2687","authenticated-orcid":false,"given":"Haonan","family":"Luo","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0329-7458","authenticated-orcid":false,"given":"Guosheng","family":"Lin","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Nanyang Technological University, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0337-9410","authenticated-orcid":false,"given":"Yazhou","family":"Yao","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6649-7660","authenticated-orcid":false,"given":"Fayao","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research A*STAR, Singapore"}]},{"given":"Zichuan","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Nanyang Technological University, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6708-2205","authenticated-orcid":false,"given":"Zhenmin","family":"Tang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, Jiangsu, China"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2017.00081"},{"article-title":"Building generalizable agents with a realistic and rich 3D environment","year":"2018","author":"wu","key":"ref12"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00976"},{"key":"ref52","article-title":"Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer","author":"ranftl","year":"2020","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref11","first-page":"1","article-title":"Embodied question answering","author":"das","year":"2018","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10584-0_23"},{"key":"ref17","first-page":"53","article-title":"Neural modular control for embodied question answering","author":"das","year":"2018","journal-title":"Conference on Robot Learning"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00430"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.9"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref50","first-page":"289","article-title":"Hierarchical question-image co-attention for visual question answering","author":"lu","year":"2016","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.119"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00295"},{"key":"ref47","first-page":"746","article-title":"Indoor segmentation and support inference from RGBD images","author":"silberman","year":"2012","journal-title":"Proc Eur Conf Comput Vis"},{"article-title":"Adaptive computation time for recurrent neural networks","year":"2016","author":"graves","key":"ref42"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.12"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992696"},{"key":"ref43","first-page":"278","article-title":"Policy invariance under reward transformations: Theory and application to reward shaping","author":"ng","year":"1999","journal-title":"Proc 16th Int Conf Mach Learn"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.28"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2754246"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00679"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.549"},{"key":"ref3","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","author":"ren","year":"2015","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.330"},{"key":"ref5","first-page":"1112","article-title":"Hierarchical LSTMS with adaptive attention for visual captioning","volume":"42","author":"gao","year":"2020","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"ref35","first-page":"213","article-title":"FuseNet: Incorporating depth into semantic segmentation via fusion-based CNN architecture","author":"hazirbas","year":"2016","journal-title":"Proc Asian Conf Comput Vis"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_28"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.540"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.179"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.441"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.79"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2950923"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"article-title":"Compositional memory for visual question answering","year":"2015","author":"jiang","key":"ref39"},{"article-title":"ABC-CNN: An attention based convolutional neural network for visual question answering","year":"2015","author":"chen","key":"ref38"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/1015330.1015430"},{"article-title":"Imitation learning with recurrent neural networks","year":"2016","author":"nguyen","key":"ref23"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00063"},{"article-title":"Simultaneous mapping and target driven navigation","year":"2019","author":"georgakis","key":"ref25"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00680"},{"key":"ref22","first-page":"8334","article-title":"Learning conditioned graph structures for interpretable visual question answering","author":"norcliffe-brown","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.344"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00925"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00682"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00686"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/10120646\/09669060.pdf?arnumber=9669060","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,5,29]],"date-time":"2023-05-29T17:34:29Z","timestamp":1685381669000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9669060\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,1]]},"references-count":52,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2021.3139957","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"type":"print","value":"0162-8828"},{"type":"electronic","value":"2160-9292"},{"type":"electronic","value":"1939-3539"}],"subject":[],"published":{"date-parts":[[2023,6,1]]}}}