@inproceedings{enomoto-etal-2024-investigating,
title = "Investigating Web Corpus Filtering Methods for Language Model Development in {J}apanese",
author = "Enomoto, Rintaro and
Tolmachev, Arseny and
Niitsuma, Takuro and
Kurita, Shuhei and
Kawahara, Daisuke",
editor = "Cao, Yang (Trista) and
Papadimitriou, Isabel and
Ovalle, Anaelia and
Zampieri, Marcos and
Ferraro, Francis and
Swayamdipta, Swabha",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-srw.18",
doi = "10.18653/v1/2024.naacl-srw.18",
pages = "154--160",
abstract = "The development of large language models (LLMs) is becoming increasingly significant, and there is a demand for high-quality, large-scale corpora for their pretraining.The quality of a web corpus is especially essential to improve the performance of LLMs because it accounts for a large proportion of the whole corpus. However, filtering methods for Web corpora have yet to be established.In this paper, we present empirical studies to reveal which filtering methods are indeed effective and analyze why they are.We build classifiers and language models in Japanese that can process large amounts of corpora rapidly enough for pretraining LLMs in limited computational resources. By evaluating these filtering methods based on a Web corpus quality evaluation benchmark, we reveal that the most accurate method is the N-gram language model. Indeed, we empirically present that strong filtering methods can rather lead to lesser performance in downstream tasks.We also report that the proportion of some specific topics in the processed documents decreases significantly during the filtering process.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="enomoto-etal-2024-investigating">
<titleInfo>
<title>Investigating Web Corpus Filtering Methods for Language Model Development in Japanese</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rintaro</namePart>
<namePart type="family">Enomoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arseny</namePart>
<namePart type="family">Tolmachev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Takuro</namePart>
<namePart type="family">Niitsuma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuhei</namePart>
<namePart type="family">Kurita</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daisuke</namePart>
<namePart type="family">Kawahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="given">(Trista)</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isabel</namePart>
<namePart type="family">Papadimitriou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anaelia</namePart>
<namePart type="family">Ovalle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Ferraro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Swabha</namePart>
<namePart type="family">Swayamdipta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The development of large language models (LLMs) is becoming increasingly significant, and there is a demand for high-quality, large-scale corpora for their pretraining. The quality of a web corpus is especially essential to improve the performance of LLMs because it accounts for a large proportion of the whole corpus. However, filtering methods for web corpora have yet to be established. In this paper, we present empirical studies to reveal which filtering methods are indeed effective and analyze why they are. We build classifiers and language models in Japanese that can process large amounts of corpora rapidly enough for pretraining LLMs with limited computational resources. By evaluating these filtering methods on a web corpus quality evaluation benchmark, we reveal that the most accurate method is the N-gram language model. Indeed, we empirically show that strong filtering methods can instead lead to worse performance in downstream tasks. We also report that the proportion of some specific topics in the processed documents decreases significantly during the filtering process.</abstract>
<identifier type="citekey">enomoto-etal-2024-investigating</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-srw.18</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-srw.18</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>154</start>
<end>160</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Investigating Web Corpus Filtering Methods for Language Model Development in Japanese
%A Enomoto, Rintaro
%A Tolmachev, Arseny
%A Niitsuma, Takuro
%A Kurita, Shuhei
%A Kawahara, Daisuke
%Y Cao, Yang (Trista)
%Y Papadimitriou, Isabel
%Y Ovalle, Anaelia
%Y Zampieri, Marcos
%Y Ferraro, Francis
%Y Swayamdipta, Swabha
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F enomoto-etal-2024-investigating
%X The development of large language models (LLMs) is becoming increasingly significant, and there is a demand for high-quality, large-scale corpora for their pretraining. The quality of a web corpus is especially essential to improve the performance of LLMs because it accounts for a large proportion of the whole corpus. However, filtering methods for web corpora have yet to be established. In this paper, we present empirical studies to reveal which filtering methods are indeed effective and analyze why they are. We build classifiers and language models in Japanese that can process large amounts of corpora rapidly enough for pretraining LLMs with limited computational resources. By evaluating these filtering methods on a web corpus quality evaluation benchmark, we reveal that the most accurate method is the N-gram language model. Indeed, we empirically show that strong filtering methods can instead lead to worse performance in downstream tasks. We also report that the proportion of some specific topics in the processed documents decreases significantly during the filtering process.
%R 10.18653/v1/2024.naacl-srw.18
%U https://aclanthology.org/2024.naacl-srw.18
%U https://doi.org/10.18653/v1/2024.naacl-srw.18
%P 154-160
Markdown (Informal)
[Investigating Web Corpus Filtering Methods for Language Model Development in Japanese](https://aclanthology.org/2024.naacl-srw.18) (Enomoto et al., NAACL 2024)

ACL
Rintaro Enomoto, Arseny Tolmachev, Takuro Niitsuma, Shuhei Kurita, and Daisuke Kawahara. 2024. Investigating Web Corpus Filtering Methods for Language Model Development in Japanese. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop), pages 154–160, Mexico City, Mexico. Association for Computational Linguistics.