@inproceedings{kim-etal-2023-chemical,
title = "Chemical Language Understanding Benchmark",
author = "Kim, Yunsoo and
Ko, Hyuk and
Lee, Jane and
Heo, Hyun Young and
Yang, Jinyoung and
Lee, Sungsoo and
Lee, Kyu-hwang",
editor = "Sitaram, Sunayana and
Beigman Klebanov, Beata and
Williams, Jason D",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-industry.39/",
doi = "10.18653/v1/2023.acl-industry.39",
pages = "404--411",
abstract = "In this paper, we introduce the benchmark datasets named CLUB (Chemical Language Understanding Benchmark) to facilitate NLP research in the chemical industry. We have 4 datasets consisted of text and token classification tasks. As far as we have recognized, it is one of the first examples of chemical language understanding benchmark datasets consisted of tasks for both patent and literature articles provided by industrial organization. All the datasets are internally made by chemists from scratch. Finally, we evaluate the datasets on the various language models based on BERT and RoBERTa, and demonstrate the model performs better when the domain of the pretrained models are closer to chemistry domain. We provide baselines for our benchmark as 0.8054 in average, and we hope this benchmark is used by many researchers in both industry and academia."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kim-etal-2023-chemical">
<titleInfo>
<title>Chemical Language Understanding Benchmark</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yunsoo</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyuk</namePart>
<namePart type="family">Ko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jane</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyun</namePart>
<namePart type="given">Young</namePart>
<namePart type="family">Heo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinyoung</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sungsoo</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyu-hwang</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sunayana</namePart>
<namePart type="family">Sitaram</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Beata</namePart>
<namePart type="family">Beigman Klebanov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jason</namePart>
<namePart type="given">D</namePart>
<namePart type="family">Williams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we introduce the benchmark datasets named CLUB (Chemical Language Understanding Benchmark) to facilitate NLP research in the chemical industry. We have 4 datasets consisted of text and token classification tasks. As far as we have recognized, it is one of the first examples of chemical language understanding benchmark datasets consisted of tasks for both patent and literature articles provided by industrial organization. All the datasets are internally made by chemists from scratch. Finally, we evaluate the datasets on the various language models based on BERT and RoBERTa, and demonstrate the model performs better when the domain of the pretrained models are closer to chemistry domain. We provide baselines for our benchmark as 0.8054 in average, and we hope this benchmark is used by many researchers in both industry and academia.</abstract>
<identifier type="citekey">kim-etal-2023-chemical</identifier>
<identifier type="doi">10.18653/v1/2023.acl-industry.39</identifier>
<location>
<url>https://aclanthology.org/2023.acl-industry.39/</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>404</start>
<end>411</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Chemical Language Understanding Benchmark
%A Kim, Yunsoo
%A Ko, Hyuk
%A Lee, Jane
%A Heo, Hyun Young
%A Yang, Jinyoung
%A Lee, Sungsoo
%A Lee, Kyu-hwang
%Y Sitaram, Sunayana
%Y Beigman Klebanov, Beata
%Y Williams, Jason D.
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F kim-etal-2023-chemical
%X In this paper, we introduce the benchmark datasets named CLUB (Chemical Language Understanding Benchmark) to facilitate NLP research in the chemical industry. We have 4 datasets consisted of text and token classification tasks. As far as we have recognized, it is one of the first examples of chemical language understanding benchmark datasets consisted of tasks for both patent and literature articles provided by industrial organization. All the datasets are internally made by chemists from scratch. Finally, we evaluate the datasets on the various language models based on BERT and RoBERTa, and demonstrate the model performs better when the domain of the pretrained models are closer to chemistry domain. We provide baselines for our benchmark as 0.8054 in average, and we hope this benchmark is used by many researchers in both industry and academia.
%R 10.18653/v1/2023.acl-industry.39
%U https://aclanthology.org/2023.acl-industry.39/
%U https://doi.org/10.18653/v1/2023.acl-industry.39
%P 404-411
Markdown (Informal)
[Chemical Language Understanding Benchmark](https://aclanthology.org/2023.acl-industry.39/) (Kim et al., ACL 2023)
ACL
- Yunsoo Kim, Hyuk Ko, Jane Lee, Hyun Young Heo, Jinyoung Yang, Sungsoo Lee, and Kyu-hwang Lee. 2023. Chemical Language Understanding Benchmark. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track), pages 404–411, Toronto, Canada. Association for Computational Linguistics.