@inproceedings{sravanthi-etal-2024-pub,
title = "{PUB}: A Pragmatics Understanding Benchmark for Assessing {LLM}s' Pragmatics Capabilities",
author = "Sravanthi, Settaluri and
Doshi, Meet and
Tankala, Pavan and
Murthy, Rudra and
Dabre, Raj and
Bhattacharyya, Pushpak",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.719/",
doi = "10.18653/v1/2024.findings-acl.719",
pages = "12075--12097",
abstract = "LLMs have demonstrated remarkable capability for understanding semantics, but their understanding of pragmatics is not well studied. To this end, we release a Pragmatics Understanding Benchmark (PUB) dataset consisting of fourteen tasks in four pragmatics phenomena, namely; Implicature, Presupposition, Reference, and Deixis. We curate high-quality test sets for each task, consisting of Multiple Choice Question Answers (MCQA). PUB includes a total of 28k data points, 6.1k are newly annotated. We evaluate nine models varying in the number of parameters and type of training. Our study reveals several key observations about the pragmatic capabilities of LLMs: 1. chat-fine-tuning strongly benefits smaller models, 2. large base models are competitive with their chat-fine-tuned counterparts, 3. there is a huge variance in performance across different pragmatics phenomena, and 4. a noticeable performance gap between human capabilities and model capabilities. We hope that PUB will enable comprehensive evaluation of LLM`s pragmatic reasoning capabilities."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sravanthi-etal-2024-pub">
<titleInfo>
<title>PUB: A Pragmatics Understanding Benchmark for Assessing LLMs’ Pragmatics Capabilities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Settaluri</namePart>
<namePart type="family">Sravanthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meet</namePart>
<namePart type="family">Doshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pavan</namePart>
<namePart type="family">Tankala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rudra</namePart>
<namePart type="family">Murthy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raj</namePart>
<namePart type="family">Dabre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>LLMs have demonstrated remarkable capability for understanding semantics, but their understanding of pragmatics is not well studied. To this end, we release a Pragmatics Understanding Benchmark (PUB) dataset consisting of fourteen tasks covering four pragmatics phenomena, namely Implicature, Presupposition, Reference, and Deixis. We curate high-quality test sets for each task, consisting of Multiple Choice Question Answering (MCQA) instances. PUB includes a total of 28k data points, of which 6.1k are newly annotated. We evaluate nine models varying in the number of parameters and type of training. Our study reveals several key observations about the pragmatic capabilities of LLMs: (1) chat fine-tuning strongly benefits smaller models, (2) large base models are competitive with their chat-fine-tuned counterparts, (3) there is huge variance in performance across different pragmatics phenomena, and (4) there is a noticeable performance gap between human capabilities and model capabilities. We hope that PUB will enable comprehensive evaluation of LLMs’ pragmatic reasoning capabilities.</abstract>
<identifier type="citekey">sravanthi-etal-2024-pub</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.719</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.719/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>12075</start>
<end>12097</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PUB: A Pragmatics Understanding Benchmark for Assessing LLMs’ Pragmatics Capabilities
%A Sravanthi, Settaluri
%A Doshi, Meet
%A Tankala, Pavan
%A Murthy, Rudra
%A Dabre, Raj
%A Bhattacharyya, Pushpak
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F sravanthi-etal-2024-pub
%X LLMs have demonstrated remarkable capability for understanding semantics, but their understanding of pragmatics is not well studied. To this end, we release a Pragmatics Understanding Benchmark (PUB) dataset consisting of fourteen tasks covering four pragmatics phenomena, namely Implicature, Presupposition, Reference, and Deixis. We curate high-quality test sets for each task, consisting of Multiple Choice Question Answering (MCQA) instances. PUB includes a total of 28k data points, of which 6.1k are newly annotated. We evaluate nine models varying in the number of parameters and type of training. Our study reveals several key observations about the pragmatic capabilities of LLMs: (1) chat fine-tuning strongly benefits smaller models, (2) large base models are competitive with their chat-fine-tuned counterparts, (3) there is huge variance in performance across different pragmatics phenomena, and (4) there is a noticeable performance gap between human capabilities and model capabilities. We hope that PUB will enable comprehensive evaluation of LLMs’ pragmatic reasoning capabilities.
%R 10.18653/v1/2024.findings-acl.719
%U https://aclanthology.org/2024.findings-acl.719/
%U https://doi.org/10.18653/v1/2024.findings-acl.719
%P 12075-12097
Markdown (Informal)
[PUB: A Pragmatics Understanding Benchmark for Assessing LLMs’ Pragmatics Capabilities](https://aclanthology.org/2024.findings-acl.719/) (Sravanthi et al., Findings 2024)