@inproceedings{morabito-etal-2023-debiasing,
title = "Debiasing should be Good and Bad: Measuring the Consistency of Debiasing Techniques in Language Models",
author = "Morabito, Robert and
Kabbara, Jad and
Emami, Ali",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.280",
doi = "10.18653/v1/2023.findings-acl.280",
pages = "4581--4597",
abstract = "Debiasing methods that seek to mitigate the tendency of Language Models (LMs) to occasionally output toxic or inappropriate text have recently gained traction. In this paper, we propose a standardized protocol which distinguishes methods that yield not only desirable results, but are also consistent with their mechanisms and specifications. For example, we ask, given a debiasing method that is developed to reduce toxicity in LMs, if the definition of toxicity used by the debiasing method is reversed, would the debiasing results also be reversed? We used such considerations to devise three criteria for our new protocol: Specification Polarity, Specification Importance, and Domain Transferability. As a case study, we apply our protocol to a popular debiasing method, Self-Debiasing, and compare it to one we propose, called Instructive Debiasing, and demonstrate that consistency is as important an aspect to debiasing viability as is simply a desirable result. We show that our protocol provides essential insights into the generalizability and interpretability of debiasing methods that may otherwise go overlooked.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="morabito-etal-2023-debiasing">
<titleInfo>
<title>Debiasing should be Good and Bad: Measuring the Consistency of Debiasing Techniques in Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="family">Morabito</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jad</namePart>
<namePart type="family">Kabbara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="family">Emami</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Debiasing methods that seek to mitigate the tendency of Language Models (LMs) to occasionally output toxic or inappropriate text have recently gained traction. In this paper, we propose a standardized protocol which distinguishes methods that yield not only desirable results, but are also consistent with their mechanisms and specifications. For example, we ask, given a debiasing method that is developed to reduce toxicity in LMs, if the definition of toxicity used by the debiasing method is reversed, would the debiasing results also be reversed? We used such considerations to devise three criteria for our new protocol: Specification Polarity, Specification Importance, and Domain Transferability. As a case study, we apply our protocol to a popular debiasing method, Self-Debiasing, and compare it to one we propose, called Instructive Debiasing, and demonstrate that consistency is as important an aspect to debiasing viability as is simply a desirable result. We show that our protocol provides essential insights into the generalizability and interpretability of debiasing methods that may otherwise go overlooked.</abstract>
<identifier type="citekey">morabito-etal-2023-debiasing</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.280</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.280</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>4581</start>
<end>4597</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Debiasing should be Good and Bad: Measuring the Consistency of Debiasing Techniques in Language Models
%A Morabito, Robert
%A Kabbara, Jad
%A Emami, Ali
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F morabito-etal-2023-debiasing
%X Debiasing methods that seek to mitigate the tendency of Language Models (LMs) to occasionally output toxic or inappropriate text have recently gained traction. In this paper, we propose a standardized protocol which distinguishes methods that yield not only desirable results, but are also consistent with their mechanisms and specifications. For example, we ask, given a debiasing method that is developed to reduce toxicity in LMs, if the definition of toxicity used by the debiasing method is reversed, would the debiasing results also be reversed? We used such considerations to devise three criteria for our new protocol: Specification Polarity, Specification Importance, and Domain Transferability. As a case study, we apply our protocol to a popular debiasing method, Self-Debiasing, and compare it to one we propose, called Instructive Debiasing, and demonstrate that consistency is as important an aspect to debiasing viability as is simply a desirable result. We show that our protocol provides essential insights into the generalizability and interpretability of debiasing methods that may otherwise go overlooked.
%R 10.18653/v1/2023.findings-acl.280
%U https://aclanthology.org/2023.findings-acl.280
%U https://doi.org/10.18653/v1/2023.findings-acl.280
%P 4581-4597
Markdown (Informal)
[Debiasing should be Good and Bad: Measuring the Consistency of Debiasing Techniques in Language Models](https://aclanthology.org/2023.findings-acl.280) (Morabito et al., Findings 2023)
ACL