{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T14:38:41Z","timestamp":1730212721313,"version":"3.28.0"},"reference-count":50,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1109\/cvpr52729.2023.00646","type":"proceedings-article","created":{"date-parts":[[2023,8,22]],"date-time":"2023-08-22T17:30:52Z","timestamp":1692725452000},"page":"6683-6693","source":"Crossref","is-referenced-by-count":5,"title":["Meta-Explore: Exploratory Hierarchical Vision-and-Language Navigation Using Scene Object Spectrum Grounding"],"prefix":"10.1109","author":[{"given":"Minyoung","family":"Hwang","sequence":"first","affiliation":[{"name":"Seoul National University,Electrical and Computer Engineering and ASRI"}]},{"given":"Jaeyeon","family":"Jeong","sequence":"additional","affiliation":[{"name":"Seoul National University,Electrical and Computer Engineering and ASRI"}]},{"given":"Minsoo","family":"Kim","sequence":"additional","affiliation":[{"name":"Seoul National University,Interdisciplinary Major in Artificial Intelligence"}]},{"given":"Yoonseon","family":"Oh","sequence":"additional","affiliation":[{"name":"Hanyang University,Department of Electronic Engineering"}]},{"given":"Songhwai","family":"Oh","sequence":"additional","affiliation":[{"name":"Seoul National University,Electrical and Computer Engineering and ASRI"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"crossref","first-page":"2459","DOI":"10.1609\/aaai.v34i03.5627","article-title":"Just ask: An interactive learning frame-work for vision and language navigation","volume":"34","author":"chi","year":"2020","journal-title":"inProceedings of the AAAI Conference on Artificial Intelligence"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1287"},{"key":"ref15","first-page":"1","article-title":"Embodied question answering","author":"das","year":"2018","journal-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition"},{"journal-title":"Building generalizable agents with a realistic and rich 3d environment","year":"2018","author":"wu","key":"ref14"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01374-3"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6849"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01000"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01250"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01281"},{"key":"ref18","article-title":"Help, anna! visual nav-igation with natural multimodal assistance via retrospective curiosity-encouraging imitation learning","author":"nguyen","year":"2019","journal-title":"Proceedings of the Conference on Empirical Methods for Natural Language Processing"},{"key":"ref50","article-title":"Habitat 2.0: Training home assistants to rearrange their habitat","author":"szot","year":"2021","journal-title":"Proc of International Conference on Neural Information Processing"},{"journal-title":"No-regret reductions for imitation learning and structured prediction","year":"2010","author":"ross","key":"ref46"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"journal-title":"On evaluation of embodied navigation agents","year":"2018","author":"anderson","key":"ref48"},{"key":"ref47","first-page":"1928","article-title":"Asynchronous methods for deep rein-forcement learning","author":"mnih","year":"2016","journal-title":"Proceedings of the International Con-ference on Machine Learning"},{"key":"ref42","article-title":"An image is worth 16×16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2021","journal-title":"Proceedings of the International Conference on Learning Representations"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref44","first-page":"1","author":"serra","year":"2020","journal-title":"Mathematical Morphology"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1016\/j.robot.2005.12.001"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00679"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_7"},{"key":"ref7","first-page":"652","article-title":"Landmark-rxr: Solving vision-and-language navigation with fine-grained alignment supervision","volume":"34","author":"he","year":"2021","journal-title":"InProceedings of the International Conference on Neu-ral Information Processing Systems"},{"journal-title":"The streetlearn environment and dataset","year":"2019","author":"mirowski","key":"ref9"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.356"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01282"},{"journal-title":"Cross-lingual vision-language navigation","year":"2019","author":"yan","key":"ref6"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1181"},{"key":"ref40","first-page":"26661","article-title":"No rl, no simulation: Learning to navigate without navigating","volume":"34","author":"hahn","year":"2021","journal-title":"Proc of International Conference on Neural Information Processing"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00690"},{"key":"ref34","first-page":"5834","article-title":"History aware multimodal transformer for vision-and-language navigation","author":"chen","year":"2021","journal-title":"Proc of International Conference on Neural Information Processing"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-020-03157-9"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1287\/orsc.2.1.71"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00166"},{"key":"ref30","article-title":"Vil-bert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"lu","year":"2019","journal-title":"Proceedings of the Inter-national Conference on Neural Information Processing Systems"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01564"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00169"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"journal-title":"Deep learning for embodied vision navigation A survey","year":"2021","author":"zhu","key":"ref1"},{"key":"ref39","article-title":"Learning to explore using active neural slam","author":"chaplot","year":"2020","journal-title":"Proceedings of the International Conference on Learning Representations"},{"key":"ref38","article-title":"When should agents explore","author":"pislar","year":"2022","journal-title":"Proceedings of the International Conference on Learning Representations"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01604"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01003"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00835"},{"key":"ref25","first-page":"20660","article-title":"Evolving graphical planner: Contextual global planning for vision-and-language navigation","volume":"33","author":"deng","year":"2020","journal-title":"Proceedings of the Inter-national Conference on Neural Information Processing Systems"},{"key":"ref20","first-page":"2119","article-title":"Ex-ecuting instructions in situated collaborative interactions","author":"suhr","year":"2019","journal-title":"Proceedings of the Conference on Empirical Methods for Natural Language Processing"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00689"},{"key":"ref21","article-title":"Self-monitoring navigation agent via auxiliary progress estimation","author":"ma","year":"2019","journal-title":"Proceedings of the International Conference on Learning Representations"},{"key":"ref28","article-title":"Speaker-follower models for vision-and-language navigation","author":"fried","year":"2018","journal-title":"Proceedings of the International Conference on Neural In-formation Processing Systems"},{"key":"ref27","first-page":"394","article-title":"Vision-and-dialog navigation","author":"thomason","year":"2020","journal-title":"Proceedings of the Conference on Robot Learning"},{"key":"ref29","first-page":"1494","article-title":"Robust navigation with language pretraining and stochastic sam-pling","author":"li","year":"2019","journal-title":"Proceedings of the Conference on Empirical Methods for Natural Language Processing"}],"event":{"name":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","start":{"date-parts":[[2023,6,17]]},"location":"Vancouver, BC, Canada","end":{"date-parts":[[2023,6,24]]}},"container-title":["2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10203037\/10203050\/10205004.pdf?arnumber=10205004","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,11]],"date-time":"2023-09-11T18:06:56Z","timestamp":1694455616000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10205004\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6]]},"references-count":50,"URL":"https:\/\/doi.org\/10.1109\/cvpr52729.2023.00646","relation":{},"subject":[],"published":{"date-parts":[[2023,6]]}}}