{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T19:14:09Z","timestamp":1730229249986,"version":"3.28.0"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,2,22]],"date-time":"2024-02-22T00:00:00Z","timestamp":1708560000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,2,22]],"date-time":"2024-02-22T00:00:00Z","timestamp":1708560000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,2,22]]},"DOI":"10.1109\/icara60736.2024.10553173","type":"proceedings-article","created":{"date-parts":[[2024,6,18]],"date-time":"2024-06-18T17:29:21Z","timestamp":1718731761000},"page":"501-505","source":"Crossref","is-referenced-by-count":0,"title":["Zero-shot Object Navigation with Vision-Language Foundation Models Reasoning"],"prefix":"10.1109","author":[{"given":"Shuaihang","family":"Yuan","sequence":"first","affiliation":[{"name":"NYUAD Center for Artificial Intelligence and Robotics (CAIR),Abu Dhabi,UAE"}]},{"given":"Muhammad","family":"Shafique","sequence":"additional","affiliation":[{"name":"NYUAD Center for Artificial Intelligence and Robotics (CAIR),Abu Dhabi,UAE"}]},{"given":"Mohamed Riyadh","family":"Baghdadi","sequence":"additional","affiliation":[{"name":"NYUAD Center for Artificial Intelligence and Robotics (CAIR),Abu Dhabi,UAE"}]},{"given":"Farshad","family":"Khorrami","sequence":"additional","affiliation":[{"name":"NYUAD Center for Artificial Intelligence and Robotics (CAIR),Abu Dhabi,UAE"}]},{"given":"Anthony","family":"Tzes","sequence":"additional","affiliation":[{"name":"NYUAD Center for Artificial Intelligence and Robotics (CAIR),Abu Dhabi,UAE"}]},{"given":"Yi","family":"Fang","sequence":"additional","affiliation":[{"name":"NYUAD Center for Artificial Intelligence and Robotics (CAIR),Abu Dhabi,UAE"}]}],"member":"263","reference":[{"year":"2018","author":"Anderson","journal-title":"arXiv preprint","article-title":"On evaluation of embodied navigation agents","key":"ref1"},{"doi-asserted-by":"publisher","key":"ref2","DOI":"10.7763\/IJCTE.2013.V5.735"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1007\/978-3-030-66823-5_24"},{"year":"2020","author":"Chaplot","journal-title":"arXiv preprint","article-title":"Learning to explore using active neural slam","key":"ref4"},{"key":"ref5","first-page":"4247","article-title":"Object goal navigation using goal-oriented semantic exploration","volume":"33","author":"Chaplot","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.15607\/RSS.2023.XIX.075"},{"key":"ref7","first-page":"38149","article-title":"Weakly-supervised multi-granularity map learning for vision-and-language navigation","volume":"35","author":"Chen","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"See","year":"2023","author":"Chiang","article-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","key":"ref8"},{"doi-asserted-by":"publisher","key":"ref9","DOI":"10.1109\/CVPR42600.2020.00323"},{"year":"2018","author":"Devlin","journal-title":"arXiv preprint","article-title":"Bert: Pretraining of deep bidirectional transformers for language understanding","key":"ref10"},{"doi-asserted-by":"publisher","key":"ref11","DOI":"10.1109\/lra.2023.3346800"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.1007\/978-3-030-58571-6_2"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1109\/CVPR52729.2023.02219"},{"year":"2021","author":"He","journal-title":"arXiv preprint","article-title":"Debertav3: Improving deberta using electra-style pretraining with gradient-disentangled embedding sharing","key":"ref14"},{"year":"2017","author":"Kolve","journal-title":"arXiv preprint","article-title":"Ai2-thor: An interactive 3d environment for visual ai","key":"ref15"},{"year":"2023","author":"Li","journal-title":"arXiv preprint","article-title":"Blip-2: Bootstrapping language-image pretraining with frozen image encoders and large language models","key":"ref16"},{"volume-title":"International Conference on Machine Learning","author":"Li","first-page":"12888","article-title":"Blip: Bootstrapping language-image pretraining for unified vision-language understanding and generation","key":"ref17"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.1109\/cvpr52688.2022.01069"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.1109\/ICRA48506.2021.9560925"},{"year":"2023","author":"Liu","journal-title":"arXiv preprint","article-title":"Visual instruction tuning","key":"ref20"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.1109\/LRA.2023.3295255"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.1007\/978-3-030-58539-6_16"},{"doi-asserted-by":"publisher","key":"ref23","DOI":"10.1109\/ICCV48922.2021.01509"},{"volume-title":"International conference on machine learning","author":"Radford","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","key":"ref24"},{"doi-asserted-by":"publisher","key":"ref25","DOI":"10.1109\/CVPR52688.2022.01832"},{"doi-asserted-by":"publisher","key":"ref26","DOI":"10.1109\/ICCV.2019.00943"},{"volume-title":"Conference on Robot Learning","author":"Shah","first-page":"492","article-title":"Lm-nav: Robotic navigation with large pretrained models of language, vision, and action","key":"ref27"},{"year":"2023","author":"Touvron","journal-title":"arXiv preprint","article-title":"Llama: Open and efficient foundation language models","key":"ref28"},{"doi-asserted-by":"publisher","key":"ref29","DOI":"10.1109\/CVPR.2018.00945"},{"year":"2023","author":"Xie","journal-title":"arXiv preprint","article-title":"Reasoning about the unseen for efficient outdoor object navigation","key":"ref30"},{"doi-asserted-by":"publisher","key":"ref31","DOI":"10.1109\/CIRA.1997.613851"},{"year":"2022","author":"Yao","journal-title":"arXiv preprint","article-title":"React: Synergizing reasoning and acting in language models","key":"ref32"},{"doi-asserted-by":"publisher","key":"ref33","DOI":"10.1109\/ICCV48922.2021.01581"},{"doi-asserted-by":"publisher","key":"ref34","DOI":"10.7763\/IJCTE.2012.V4.428"},{"year":"2022","author":"Zheng","journal-title":"arXiv preprint","article-title":"Jarvis: a neuro-symbolic commonsense reasoning framework for conversational embodied agents","key":"ref35"},{"doi-asserted-by":"publisher","key":"ref36","DOI":"10.1609\/aaai.v38i7.28597"},{"year":"2023","author":"Zhou","journal-title":"arXiv preprint","article-title":"Esc: Exploration with soft commonsense constraints for zero-shot object navigation","key":"ref37"}],"event":{"name":"2024 10th International Conference on Automation, Robotics and Applications (ICARA)","start":{"date-parts":[[2024,2,22]]},"location":"Athens, Greece","end":{"date-parts":[[2024,2,24]]}},"container-title":["2024 10th International Conference on Automation, Robotics and Applications (ICARA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10552867\/10552915\/10553173.pdf?arnumber=10553173","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,6,25]],"date-time":"2024-06-25T19:12:38Z","timestamp":1719342758000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10553173\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,22]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/icara60736.2024.10553173","relation":{},"subject":[],"published":{"date-parts":[[2024,2,22]]}}}