@comment{EMNLP 2024 main-conference paper. Apostrophes in the title and abstract are plain ASCII apostrophes, since a grave accent typesets as an opening quote in LaTeX. The brand name in the title is braced so sentence-casing styles keep its capitals.}
@inproceedings{halevy-etal-2024-flex,
    title = "{\textquotedblleft}{Flex Tape} Can't Fix That{\textquotedblright}: Bias and Misinformation in Edited Language Models",
    author = "Halevy, Karina H. and
      Sotnikova, Anna and
      AlKhamissi, Badr and
      Montariol, Syrielle and
      Bosselut, Antoine",
    editor = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.emnlp-main.494/",
    doi = "10.18653/v1/2024.emnlp-main.494",
    pages = "8690--8707",
    abstract = "Weight-based model editing methods update the parametric knowledge of language models post-training. However, these methods can unintentionally alter unrelated parametric knowledge representations, potentially increasing the risk of harm. In this work, we investigate how weight editing methods unexpectedly amplify model biases after edits. We introduce a novel benchmark dataset, Seesaw-CF, for measuring bias amplification of model editing methods for demographic traits such as race, geographic origin, and gender. We use Seesaw-CF to examine the impact of model editing on bias in five large language models. Our results demonstrate that edited models exhibit, to various degrees, more biased behavior for certain demographic groups than before they were edited, specifically becoming less confident in properties for Asian and African subjects. Additionally, editing facts about place of birth, country of citizenship, or gender has particularly negative effects on the model's knowledge about unrelated properties, such as field of work, a pattern observed across multiple models."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="halevy-etal-2024-flex">
<titleInfo>
<title>“Flex Tape Can’t Fix That”: Bias and Misinformation in Edited Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Karina</namePart>
<namePart type="given">H</namePart>
<namePart type="family">Halevy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Sotnikova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Badr</namePart>
<namePart type="family">AlKhamissi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Syrielle</namePart>
<namePart type="family">Montariol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antoine</namePart>
<namePart type="family">Bosselut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Weight-based model editing methods update the parametric knowledge of language models post-training. However, these methods can unintentionally alter unrelated parametric knowledge representations, potentially increasing the risk of harm. In this work, we investigate how weight editing methods unexpectedly amplify model biases after edits. We introduce a novel benchmark dataset, Seesaw-CF, for measuring bias amplification of model editing methods for demographic traits such as race, geographic origin, and gender. We use Seesaw-CF to examine the impact of model editing on bias in five large language models. Our results demonstrate that edited models exhibit, to various degrees, more biased behavior for certain demographic groups than before they were edited, specifically becoming less confident in properties for Asian and African subjects. Additionally, editing facts about place of birth, country of citizenship, or gender has particularly negative effects on the model’s knowledge about unrelated properties, such as field of work, a pattern observed across multiple models.</abstract>
<identifier type="citekey">halevy-etal-2024-flex</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.494</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.494/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>8690</start>
<end>8707</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T “Flex Tape Can’t Fix That”: Bias and Misinformation in Edited Language Models
%A Halevy, Karina H.
%A Sotnikova, Anna
%A AlKhamissi, Badr
%A Montariol, Syrielle
%A Bosselut, Antoine
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F halevy-etal-2024-flex
%X Weight-based model editing methods update the parametric knowledge of language models post-training. However, these methods can unintentionally alter unrelated parametric knowledge representations, potentially increasing the risk of harm. In this work, we investigate how weight editing methods unexpectedly amplify model biases after edits. We introduce a novel benchmark dataset, Seesaw-CF, for measuring bias amplification of model editing methods for demographic traits such as race, geographic origin, and gender. We use Seesaw-CF to examine the impact of model editing on bias in five large language models. Our results demonstrate that edited models exhibit, to various degrees, more biased behavior for certain demographic groups than before they were edited, specifically becoming less confident in properties for Asian and African subjects. Additionally, editing facts about place of birth, country of citizenship, or gender has particularly negative effects on the model’s knowledge about unrelated properties, such as field of work, a pattern observed across multiple models.
%R 10.18653/v1/2024.emnlp-main.494
%U https://aclanthology.org/2024.emnlp-main.494/
%U https://doi.org/10.18653/v1/2024.emnlp-main.494
%P 8690-8707
Markdown (Informal)
[“Flex Tape Can’t Fix That”: Bias and Misinformation in Edited Language Models](https://aclanthology.org/2024.emnlp-main.494/) (Halevy et al., EMNLP 2024)
ACL