{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T14:40:10Z","timestamp":1732977610516,"version":"3.30.0"},"publisher-location":"Cham","reference-count":88,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031730382"},{"type":"electronic","value":"9783031730399"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73039-9_4","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:57:07Z","timestamp":1730300227000},"page":"52-70","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["MEERKAT: Audio-Visual Large Language Model for\u00a0Grounding in\u00a0Space and\u00a0Time"],"prefix":"10.1007","author":[{"given":"Sanjoy","family":"Chowdhury","sequence":"first","affiliation":[]},{"given":"Sayan","family":"Nag","sequence":"additional","affiliation":[]},{"given":"Subhrajyoti","family":"Dasgupta","sequence":"additional","affiliation":[]},{"given":"Jun","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Mohamed","family":"Elhoseiny","sequence":"additional","affiliation":[]},{"given":"Ruohan","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Dinesh","family":"Manocha","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"4_CR1","unstructured":"Achiam, J., et\u00a0al.: Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"4_CR2","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"4_CR3","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)"},{"key":"4_CR4","unstructured":"Chen, F., et al.: X-llm: Bootstrapping advanced large language models by treating multi-modalities as foreign languages. arXiv preprint arXiv:2305.04160 (2023)"},{"key":"4_CR5","unstructured":"Chen, G., et\u00a0al: Plot: prompt learning with optimal transport for vision-language models. ICLR (2023)"},{"key":"4_CR6","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Afouras, T., Nagrani, A., Vedaldi, A., Zisserman, A.: Localizing visual sounds the hard way. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16867\u201316876 (2021)","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"4_CR7","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Vedaldi, A., Zisserman, A.: Vggsound: a large-scale audio-visual dataset. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 721\u2013725. 
IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"4_CR8","unstructured":"Chen, J., et al.: Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"4_CR9","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: Unleashing multimodal llm\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"4_CR10","unstructured":"Chen, L., Gan, Z., Cheng, Y., Li, L., Carin, L., Liu, J.: Graph optimal transport for cross-domain alignment. In: International Conference on Machine Learning, pp. 1542\u20131553. PMLR (2020)"},{"key":"4_CR11","unstructured":"Chen, S., et al.: Valor: Vision-audio-language omni-perception pretraining model and dataset. arXiv preprint arXiv:2304.08345 (2023)"},{"key":"4_CR12","doi-asserted-by":"crossref","unstructured":"Chen, S., et al.: Mm21 pre-training for video understanding challenge: video captioning with pretraining techniques. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 4853\u20134857 (2021)","DOI":"10.1145\/3474085.3479216"},{"key":"4_CR13","doi-asserted-by":"publisher","unstructured":"Chen, Y.C., et al.: Uniter: universal image-text representation learning. In: European conference on computer vision, pp. 104\u2013120. Springer (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"4_CR14","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* chatgpt quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"4_CR15","unstructured":"Chowdhery, A., et al.: Palm: scaling language modeling with pathways. J. Mach. Learn. Res. 24(240), 1\u2013113 (2023)"},{"key":"4_CR16","doi-asserted-by":"crossref","unstructured":"Chowdhury, S., Nag, S., Manocha, D.: Apollo: unified adapter and prompt learning for vision language models. In: The 2023 Conference on Empirical Methods in Natural Language Processing (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.629"},{"key":"4_CR17","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"4_CR18","unstructured":"Dou, Z.Y., et al.: Coarse-to-fine vision-language pre-training with fusion in the backbone. Adv. Neural. Inf. Process. Syst. 35, 32942\u201332956 (2022)"},{"key":"4_CR19","doi-asserted-by":"crossref","unstructured":"Elizalde, B., Deshmukh, S., Al\u00a0Ismail, M., Wang, H.: Clap learning audio concepts from natural language supervision. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"4_CR20","doi-asserted-by":"crossref","unstructured":"Everingham, M., Eslami, S.A., Van Gool, L., Williams, C.K., Winn, J., Zisserman, A.: The pascal visual object classes challenge: a retrospective. Int. J. Comput. Vision 111, 98\u2013136 (2015)","DOI":"10.1007\/s11263-014-0733-5"},{"key":"4_CR21","doi-asserted-by":"crossref","unstructured":"Fedorishin, D., Mohan, D.D., Jawade, B., Setlur, S., Govindaraju, V.: Hear the flow: optical flow-based self-supervised visual sound source localization. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 
2278\u20132287 (2023)","DOI":"10.1109\/WACV56688.2023.00231"},{"key":"4_CR22","doi-asserted-by":"crossref","unstructured":"Gemmeke, J.F., et al.: Audio set: an ontology and human-labeled dataset for audio events. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 776\u2013780. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Georgescu, M.I., Fonseca, E., Ionescu, R.T., Lucic, M., Schmid, C., Arnab, A.: Audiovisual masked autoencoders. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16144\u201316154 (2023)","DOI":"10.1109\/ICCV51070.2023.01479"},{"key":"4_CR24","unstructured":"Gong, Y., Luo, H., Liu, A.H., Karlinsky, L., Glass, J.: Listen, think, and understand. arXiv preprint arXiv:2305.10790 (2023)"},{"key":"4_CR25","unstructured":"Gutmann, M.U., Hyv\u00e4rinen, A.: Noise-contrastive estimation of unnormalized statistical models, with applications to natural image statistics. J. Mach. Learn. Res. 13(2) (2012)"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Honovich, O., Scialom, T., Levy, O., Schick, T.: Unnatural instructions: Tuning language models with (almost) no human labor. arXiv preprint arXiv:2212.09689 (2022)","DOI":"10.18653\/v1\/2023.acl-long.806"},{"key":"4_CR27","unstructured":"Hu, E.J., et al.: Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"4_CR28","doi-asserted-by":"crossref","unstructured":"Huang, S., Qin, L., Wang, B., Tu, G., Xu, R.: Sdif-da: A shallow-to-deep interaction framework with data augmentation for multi-modal intent detection. arXiv preprint arXiv:2401.00424 (2023)","DOI":"10.1109\/ICASSP48485.2024.10446922"},{"key":"4_CR29","doi-asserted-by":"crossref","unstructured":"Kuznetsova, A., et al.: The open images dataset v4: unified image classification, object detection, and visual relationship detection at scale. Int. J. Comput. Vision 128(7), 1956\u20131981 (2020)","DOI":"10.1007\/s11263-020-01316-z"},{"key":"4_CR30","doi-asserted-by":"crossref","unstructured":"Lai, X., et al.: Lisa: Reasoning segmentation via large language model. arXiv preprint arXiv:2308.00692 (2023)","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"4_CR31","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: A multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)"},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"Li, G., Wei, Y., Tian, Y., Xu, C., Wen, J.R., Hu, D.: Learning to answer questions in dynamic audio-visual scenarios. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19108\u201319118 (2022)","DOI":"10.1109\/CVPR52688.2022.01852"},{"key":"4_CR33","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)"},{"key":"4_CR34","unstructured":"Li, K., et al.: Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)"},{"key":"4_CR35","doi-asserted-by":"crossref","unstructured":"Li, L.H., et\u00a0al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"4_CR36","doi-asserted-by":"crossref","unstructured":"Lin, Y.B., Li, Y.J., Wang, Y.C.F.: Dual-modality seq2seq network for audio-visual event localization. In: ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2002\u20132006. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8683226"},{"key":"4_CR37","doi-asserted-by":"crossref","unstructured":"Lin, Y.B., Sung, Y.L., Lei, J., Bansal, M., Bertasius, G.: Vision transformers are parameter-efficient audio-visual learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2299\u20132309 (2023)","DOI":"10.1109\/CVPR52729.2023.00228"},{"key":"4_CR38","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Adv. Neural Inf. Proce. Syst. 36 (2024)"},{"key":"4_CR39","doi-asserted-by":"crossref","unstructured":"Liu, J., Ju, C., Xie, W., Zhang, Y.: Exploiting transformation invariance and equivariance for self-supervised sound localisation. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 3742\u20133753 (2022)","DOI":"10.1145\/3503161.3548317"},{"key":"4_CR40","doi-asserted-by":"crossref","unstructured":"Liu, X., Dong, Z., Zhang, P.: Tackling data bias in music-avqa: crafting a balanced dataset for unbiased question-answering. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 4478\u20134487 (2024)","DOI":"10.1109\/WACV57701.2024.00442"},{"key":"4_CR41","unstructured":"Lu, P., et al.: Learn to explain: Multimodal reasoning via thought chains for science question answering. Adv. Neural. Inf. Process. Syst. 35, 2507\u20132521 (2022)"},{"key":"4_CR42","unstructured":"Luo, R., et al.: Valley: Video assistant with large language model enhanced ability. arXiv preprint arXiv:2306.07207 (2023)"},{"key":"4_CR43","unstructured":"Lyu, C., et al.: Macaw-llm: Multi-modal language modeling with image, audio, video, and text integration. arXiv preprint arXiv:2306.09093 (2023)"},{"key":"4_CR44","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., Khan, F.S.: Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424 (2023)","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"4_CR45","first-page":"37524","volume":"35","author":"S Mo","year":"2022","unstructured":"Mo, S., Morgado, P.: A closer look at weakly-supervised audio-visual source localization. Adv. Neural. Inf. Process. Syst. 35, 37524\u201337536 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"4_CR46","doi-asserted-by":"publisher","unstructured":"Mo, S., Morgado, P.: Localizing visual sounds the easy way. In: European Conference on Computer Vision, pp. 218\u2013234. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-19836-6_13","DOI":"10.1007\/978-3-031-19836-6_13"},{"key":"4_CR47","doi-asserted-by":"crossref","unstructured":"Nadeem, A., Hilton, A., Dawes, R., Thomas, G., Mustafa, A.: Cad-contextual multi-modal alignment for dynamic avqa. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 7251\u20137263 (2024)","DOI":"10.1109\/WACV57701.2024.00709"},{"key":"4_CR48","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. 
arXiv preprint arXiv:1807.03748 (2018)"},{"key":"4_CR49","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. Adv. Neural. Inf. Process. Syst. 35, 27730\u201327744 (2022)"},{"key":"4_CR50","doi-asserted-by":"crossref","unstructured":"Panagopoulou, A., et al.: X-instructblip: A framework for aligning x-modal instruction-aware representations to llms and emergent cross-modal reasoning. arXiv preprint arXiv:2311.18799 (2023)","DOI":"10.1007\/978-3-031-72995-9_11"},{"key":"4_CR51","unstructured":"Park, J., Lee, J., Sohn, K.: Bridging vision and language spaces with assignment prediction. arXiv preprint arXiv:2404.09632 (2024)"},{"key":"4_CR52","doi-asserted-by":"crossref","unstructured":"Park, S., Senocak, A., Chung, J.S.: Marginnce: robust sound localization with a negative margin. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10097234"},{"key":"4_CR53","unstructured":"Peng, B., Li, C., He, P., Galley, M., Gao, J.: Instruction tuning with gpt-4. arXiv preprint arXiv:2304.03277 (2023)"},{"key":"4_CR54","unstructured":"Peng, Z., et al.: Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)"},{"key":"4_CR55","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"4_CR56","doi-asserted-by":"crossref","unstructured":"Pramanick, S., et al.: Jack of all tasks, master of many: Designing general-purpose coarse-to-fine vision-language model. arXiv preprint arXiv:2312.12423 (2023)","DOI":"10.1109\/CVPR52733.2024.01335"},{"key":"4_CR57","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International conference on machine learning. pp. 8748\u20138763. PMLR (2021)"},{"key":"4_CR58","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)"},{"key":"4_CR59","doi-asserted-by":"crossref","unstructured":"Ren, S., Yao, L., Li, S., Sun, X., Hou, L.: Timechat: A time-sensitive multimodal large language model for long video understanding. arXiv preprint arXiv:2312.02051 (2023)","DOI":"10.1109\/CVPR52733.2024.01357"},{"key":"4_CR60","doi-asserted-by":"crossref","unstructured":"Schwartz, I., Schwing, A.G., Hazan, T.: A simple baseline for audio-visual scene-aware dialog. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12548\u201312558 (2019)","DOI":"10.1109\/CVPR.2019.01283"},{"key":"4_CR61","doi-asserted-by":"crossref","unstructured":"Senocak, A., Oh, T.H., Kim, J., Yang, M.H., Kweon, I.S.: Learning to localize sound source in visual scenes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4358\u20134366 (2018)","DOI":"10.1109\/CVPR.2018.00458"},{"key":"4_CR62","doi-asserted-by":"crossref","unstructured":"Senocak, A., Ryu, H., Kim, J., Oh, T.H., Pfister, H., Chung, J.S.: Sound source localization is all about cross-modal alignment. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
7777\u20137787 (2023)","DOI":"10.1109\/ICCV51070.2023.00715"},{"key":"4_CR63","unstructured":"Shu, F., Zhang, L., Jiang, H., Xie, C.: Audio-visual llm for video understanding. arXiv preprint arXiv:2312.06720 (2023)"},{"key":"4_CR64","unstructured":"Song, Z., Wang, Y., Fan, J., Tan, T., Zhang, Z.: Self-supervised predictive learning: a negative-free method for sound source localization in visual scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3222\u20133231 (2022)"},{"key":"4_CR65","unstructured":"Su, Y., Lan, T., Li, H., Xu, J., Wang, Y., Cai, D.: Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355 (2023)"},{"key":"4_CR66","doi-asserted-by":"crossref","unstructured":"Sun, W., et al.: Learning audio-visual source localization via false negative aware contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6420\u20136429 (2023)","DOI":"10.1109\/CVPR52729.2023.00621"},{"key":"4_CR67","unstructured":"Taori, R., et al.: Stanford alpaca: an instruction-following llama model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca (2023)"},{"key":"4_CR68","unstructured":"Taylor, R., et al.: Galactica: A large language model for science. arXiv preprint arXiv:2211.09085 (2022)"},{"key":"4_CR69","doi-asserted-by":"publisher","unstructured":"Tian, Y., Li, D., Xu, C.: Unified multisensory perception: Weakly-supervised audio-visual video parsing. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part III 16, pp. 436\u2013454. Springer (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_26","DOI":"10.1007\/978-3-030-58580-8_26"},{"key":"4_CR70","doi-asserted-by":"crossref","unstructured":"Tian, Y., Shi, J., Li, B., Duan, Z., Xu, C.: Audio-visual event localization in unconstrained videos. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 247\u2013263 (2018)","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"4_CR71","unstructured":"Touvron, H., et\u00a0al.: Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"4_CR72","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models, 2023. URL https:\/\/arxivorg\/abs\/2307.09288 (2023)"},{"key":"4_CR73","unstructured":"Wang, W., et\u00a0al.: Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)"},{"key":"4_CR74","unstructured":"Wang, W., et\u00a0al.: Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. Adv. Neural Inf. Proce. Syst. 36 (2024)"},{"key":"4_CR75","unstructured":"Wei, J., et al.: Finetuned language models are zero-shot learners. arXiv preprint arXiv:2109.01652 (2021)"},{"key":"4_CR76","unstructured":"Workshop, B., et\u00a0al.: Bloom: A 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100 (2022)"},{"key":"4_CR77","doi-asserted-by":"crossref","unstructured":"Yang, P., et al.: Avqa: a dataset for audio-visual question answering on videos. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 3480\u20133491 (2022)","DOI":"10.1145\/3503161.3548291"},{"key":"4_CR78","unstructured":"Ye, Q., et\u00a0al.: mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)"},{"key":"4_CR79","unstructured":"You, H., et al.: Ferret: Refer and ground anything anywhere at any granularity. 
arXiv preprint arXiv:2310.07704 (2023)"},{"key":"4_CR80","doi-asserted-by":"crossref","unstructured":"Yun, H., Yu, Y., Yang, W., Lee, K., Kim, G.: Pano-avqa: grounded audio-visual question answering on 360deg videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2031\u20132041 (2021)","DOI":"10.1109\/ICCV48922.2021.00204"},{"key":"4_CR81","doi-asserted-by":"crossref","unstructured":"Zhang, C., Cai, Y., Lin, G., Shen, C.: Deepemd: few-shot image classification with differentiable earth mover\u2019s distance and structured classifiers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12203\u201312213 (2020)","DOI":"10.1109\/CVPR42600.2020.01222"},{"key":"4_CR82","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"4_CR83","unstructured":"Zhang, R., et al.: Llama-adapter: Efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199 (2023)"},{"key":"4_CR84","unstructured":"Zhang, S., et al.: Gpt4roi: Instruction tuning large language model on region-of-interest. arXiv preprint arXiv:2307.03601 (2023)"},{"key":"4_CR85","unstructured":"Zhang, S., et\u00a0al.: Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"4_CR86","unstructured":"Zhao, Y., Lin, Z., Zhou, D., Huang, Z., Feng, J., Kang, B.: Bubogpt: Enabling visual grounding in multi-modal llms. arXiv preprint arXiv:2307.08581 (2023)"},{"key":"4_CR87","doi-asserted-by":"publisher","unstructured":"Zhou, J., et al.: Audio\u2013visual segmentation. In: European Conference on Computer Vision, pp. 386\u2013403. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-19836-6_22","DOI":"10.1007\/978-3-031-19836-6_22"},{"key":"4_CR88","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: Enhancing vision-language understanding with advanced large language models. 
arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73039-9_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T14:13:53Z","timestamp":1732976033000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73039-9_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031730382","9783031730399"],"references-count":88,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73039-9_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
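The record above is a standard Crossref REST API work response: the top-level object carries `status` and `message-type`, and the work itself (title, DOI, authors, container title, deposited references, conference assertions) sits under `message`. As a minimal sketch, and not part of the record itself, the following Python snippet shows how such a record can be fetched from the public Crossref endpoint and how the fields visible above can be read back out. The `mailto` address in the User-Agent is a placeholder following Crossref's "polite pool" convention and should be replaced with a real contact address.

```python
import json
import urllib.request

# DOI taken from the record above.
DOI = "10.1007/978-3-031-73039-9_4"
url = f"https://api.crossref.org/works/{DOI}"

# Identify the script politely; the mailto below is a placeholder.
req = urllib.request.Request(
    url,
    headers={"User-Agent": "example-metadata-script/0.1 (mailto:you@example.org)"},
)

with urllib.request.urlopen(req) as resp:
    record = json.load(resp)

# The work metadata lives under "message", exactly as in the record above.
work = record["message"]
print(work["title"][0])          # chapter title
print(work["DOI"])               # 10.1007/978-3-031-73039-9_4
print(work["container-title"])   # ["Lecture Notes in Computer Science", "Computer Vision – ECCV 2024"]
print(work["references-count"])  # 88

# Authors are a list of {"given", "family", "sequence", "affiliation"} objects.
authors = [f'{a["given"]} {a["family"]}' for a in work.get("author", [])]
print(", ".join(authors))

# Each deposited reference has a "key" plus whatever fields the publisher
# supplied ("unstructured", "DOI", "doi-asserted-by", ...).
for ref in work.get("reference", [])[:3]:
    print(ref["key"], ref.get("DOI", "no DOI"), "-", ref.get("unstructured", "")[:60])
```

Reading the JSON from a saved file instead of the network works the same way: load it with `json.load`, then index into `["message"]` as shown.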