{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T01:17:59Z","timestamp":1728177479439},"reference-count":50,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,6,16]]},"DOI":"10.1109\/cvpr52733.2024.01236","type":"proceedings-article","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T17:34:53Z","timestamp":1726508093000},"page":"13009-13018","source":"Crossref","is-referenced-by-count":6,"title":["GLaMM: Pixel Grounding Large Multimodal Model"],"prefix":"10.1109","author":[{"given":"Hanoona","family":"Rasheed","sequence":"first","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]},{"given":"Muhammad","family":"Maaz","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]},{"given":"Sahal","family":"Shaji","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]},{"given":"Abdelrahman","family":"Shaker","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]},{"given":"Salman","family":"Khan","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]},{"given":"Hisham","family":"Cholakkal","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]},{"given":"Rao M.","family":"Anwer","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of 
AI"}]},{"given":"Eric","family":"Xing","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]},{"given":"Ming-Hsuan","family":"Yang","sequence":"additional","affiliation":[{"name":"University of California - Merced"}]},{"given":"Fahad S.","family":"Khan","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00904"},{"key":"ref2","article-title":"Foundational models defining a new era in vision: A survey and outlook","author":"Awais","year":"2023","journal-title":"arXiv:2307.13721"},{"key":"ref3","article-title":"Bigdetection: A large-scale benchmark for improved object detector pretraining","author":"Cai","year":"2022","journal-title":"CVPR"},{"key":"ref4","article-title":"Shikra: Unleashing multimodal llms referential dialogue magic","author":"Chen","year":"2023","journal-title":"arXiv:2306.15195"},{"key":"ref5","author":"Dai","year":"2023","journal-title":"Instructblip: Towards general-purpose vision-language models with instruction tuning"},{"key":"ref6","author":"Gao","year":"2023","journal-title":"Llama-adapter v2: Parameter-efficient visual instruction model"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01745"},{"key":"ref10","volume-title":"Language is not all you need: Aligning perception with language models","author":"Huang","year":"2023"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/d14-1086"},{"key":"ref12","volume-title":"Segment anything in high 
quality","author":"Ke","year":"2023"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01316-z"},{"key":"ref16","author":"Lai","year":"2023","journal-title":"Lisa: Reasoning segmentation via large language model"},{"key":"ref17","author":"Li","year":"2023","journal-title":"Otter: A multimodal model with in-context instruction tuning"},{"key":"ref18","article-title":"Blip: Bootstrapping language-image pretraining for unified vision-language understanding and generation","author":"Li","year":"2022","journal-title":"ICML"},{"key":"ref19","article-title":"BLIP-2: bootstrapping language-image pretraining with frozen image encoders and large language models","author":"Li","year":"2023","journal-title":"ICML"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref21","article-title":"Gres: Generalized referring expression segmentation","author":"Liu","year":"2023","journal-title":"CVPR"},{"key":"ref22","article-title":"Visual instruction tuning","author":"Liu","year":"2023","journal-title":"NeurIPS"},{"key":"ref23","author":"Liu","year":"2023","journal-title":"Grounding dino: Marrying dino with grounded pretraining for open-set object detection"},{"key":"ref24","author":"Liu","year":"2023","journal-title":"Interngpt: Solving vision-centric tasks by interacting with chatgpt beyond language"},{"key":"ref25","author":"Maaz","year":"2023","journal-title":"Video-chatgpt: Towards detailed video understanding via large vision and language models"},{"key":"ref26","article-title":"Gpt-4 technical report","year":"2023","journal-title":"OpenAI"},{"key":"ref27","author":"Peng","year":"2023","journal-title":"Kosmos-2: Grounding multimodal large language models to the 
world"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.876"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"ref30","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021","journal-title":"ICML"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00852"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01817"},{"key":"ref33","author":"Wang","year":"2023","journal-title":"Visionllm: Large language model is also an open-ended decoder for vision-centric tasks"},{"key":"ref34","author":"Wang","year":"2023","journal-title":"The all-seeing project: Towards panoptic visual recognition and understanding of the open world"},{"key":"ref35","author":"Wang","year":"2021","journal-title":"Simvlm: Simple visual language model pretraining with weak supervision"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"ref37","author":"Wu","year":"2022","journal-title":"Grit: A generative region-to-text transformer for object understanding"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_11"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"ref40","article-title":"Mm-react: Prompting chat-gpt for multimodal reasoning and action","author":"Yang","year":"2023","journal-title":"arXiv: 2303.11381"},{"key":"ref41","article-title":"mplug-owl: Modularization empowers large language models with multimodality","author":"Ye","year":"2023","journal-title":"arXiv:2305.03726"},{"key":"ref42","author":"You","year":"2023","journal-title":"Ferret: Refer and ground anything anywhere at any granularity"},{"key":"ref43","author":"Yu","year":"2022","journal-title":"Coca: Contrastive captioners are image-text foundation models"},{"key":"ref44","author":"Zhang","year":"2021","journal-title":"Vinvl: Making visual 
representations matter in vision-language models"},{"key":"ref45","author":"Zhang","year":"2023","journal-title":"Llama-adapter: Efficient fine-tuning of language models with zero-init attention"},{"key":"ref46","author":"Zhang","year":"2023","journal-title":"Gpt4roi: Instruction tuning large language model on region-of-interest"},{"key":"ref47","author":"Zhao","year":"2023","journal-title":"Bubogpt: Enabling visual grounding in multimodal llms"},{"key":"ref48","author":"Zhu","year":"2023","journal-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01451"},{"key":"ref50","article-title":"Segment everything everywhere all at once","author":"Zou","year":"2023","journal-title":"NeurIPS"}],"event":{"name":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Seattle, WA, USA","start":{"date-parts":[[2024,6,16]]},"end":{"date-parts":[[2024,6,22]]}},"container-title":["2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10654794\/10654797\/10655326.pdf?arnumber=10655326","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,20]],"date-time":"2024-09-20T06:25:09Z","timestamp":1726813509000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10655326\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,16]]},"references-count":50,"URL":"https:\/\/doi.org\/10.1109\/cvpr52733.2024.01236","relation":{},"subject":[],"published":{"date-parts":[[2024,6,16]]}}}