@inproceedings{pantazopoulos-etal-2024-lost,
title = "Lost in Space: Probing Fine-grained Spatial Understanding in Vision and Language Resamplers",
author = "Pantazopoulos, Georgios and
Suglia, Alessandro and
Lemon, Oliver and
Eshghi, Arash",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-short.45",
doi = "10.18653/v1/2024.naacl-short.45",
pages = "540--549",
abstract = "An effective method for combining frozen large language models (LLM) and visual encoders involves a resampler module that creates a {`}visual prompt{'} which is provided to the LLM, along with the textual prompt. While this approach has enabled impressive performance across many coarse-grained tasks like image captioning and visual question answering, more fine-grained tasks that require spatial understanding have not been thoroughly examined. In this paper, we use diagnostic classifiers to measure the extent to which the visual prompt produced by the resampler encodes spatial information. Our results show that this information is largely absent from the resampler output when kept frozen during training of the classifiers. However, when the resampler and classifier are trained jointly, we observe a significant performance boost. This shows that the compression achieved by the resamplers can in principle encode the requisite spatial information, but that more object-aware objectives are needed at the pretraining stage to facilitate this capability.",
}