@inproceedings{shmidman-etal-2024-mrl,
title = "{MRL} Parsing Without Tears: The Case of {H}ebrew",
author = "Shmidman, Shaltiel and
Shmidman, Avi and
Koppel, Moshe and
Tsarfaty, Reut",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.269",
doi = "10.18653/v1/2024.findings-acl.269",
pages = "4537--4550",
abstract = "Syntactic parsing remains a critical tool for relation extraction and information extraction, especially in resource-scarce languages where LLMs are lacking. Yet in morphologically rich languages (MRLs), where parsers need to identify multiple lexical units in each token, existing systems suffer in latency and setup complexity. Some use a pipeline to peel away the layers: first segmentation, then morphology tagging, and then syntax parsing; however, errors in earlier layers are then propagated forward. Others use a joint architecture to evaluate all permutations at once; while this improves accuracy, it is notoriously slow. In contrast, and taking Hebrew as a test case, we present a new {``}flipped pipeline{''}: decisions are made directly on the whole-token units by expert classifiers, each one dedicated to one specific task. The classifier predictions are independent of one another, and only at the end do we synthesize their predictions. This blazingly fast approach requires only a single huggingface call, without the need for recourse to lexicons or linguistic resources. When trained on the same training set used in previous studies, our model achieves near-SOTA performance on a wide array of Hebrew NLP tasks. Furthermore, when trained on a newly enlarged training corpus, our model achieves a new SOTA for Hebrew POS tagging and dependency parsing. We release this new SOTA model to the community. Because our architecture does not rely on any language-specific resources, it can serve as a model to develop similar parsers for other MRLs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shmidman-etal-2024-mrl">
<titleInfo>
<title>MRL Parsing Without Tears: The Case of Hebrew</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shaltiel</namePart>
<namePart type="family">Shmidman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avi</namePart>
<namePart type="family">Shmidman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Moshe</namePart>
<namePart type="family">Koppel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reut</namePart>
<namePart type="family">Tsarfaty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Syntactic parsing remains a critical tool for relation extraction and information extraction, especially in resource-scarce languages where LLMs are lacking. Yet in morphologically rich languages (MRLs), where parsers need to identify multiple lexical units in each token, existing systems suffer in latency and setup complexity. Some use a pipeline to peel away the layers: first segmentation, then morphology tagging, and then syntax parsing; however, errors in earlier layers are then propagated forward. Others use a joint architecture to evaluate all permutations at once; while this improves accuracy, it is notoriously slow. In contrast, and taking Hebrew as a test case, we present a new “flipped pipeline”: decisions are made directly on the whole-token units by expert classifiers, each one dedicated to one specific task. The classifier predictions are independent of one another, and only at the end do we synthesize their predictions. This blazingly fast approach requires only a single huggingface call, without the need for recourse to lexicons or linguistic resources. When trained on the same training set used in previous studies, our model achieves near-SOTA performance on a wide array of Hebrew NLP tasks. Furthermore, when trained on a newly enlarged training corpus, our model achieves a new SOTA for Hebrew POS tagging and dependency parsing. We release this new SOTA model to the community. Because our architecture does not rely on any language-specific resources, it can serve as a model to develop similar parsers for other MRLs.</abstract>
<identifier type="citekey">shmidman-etal-2024-mrl</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.269</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.269</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>4537</start>
<end>4550</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MRL Parsing Without Tears: The Case of Hebrew
%A Shmidman, Shaltiel
%A Shmidman, Avi
%A Koppel, Moshe
%A Tsarfaty, Reut
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F shmidman-etal-2024-mrl
%X Syntactic parsing remains a critical tool for relation extraction and information extraction, especially in resource-scarce languages where LLMs are lacking. Yet in morphologically rich languages (MRLs), where parsers need to identify multiple lexical units in each token, existing systems suffer in latency and setup complexity. Some use a pipeline to peel away the layers: first segmentation, then morphology tagging, and then syntax parsing; however, errors in earlier layers are then propagated forward. Others use a joint architecture to evaluate all permutations at once; while this improves accuracy, it is notoriously slow. In contrast, and taking Hebrew as a test case, we present a new “flipped pipeline”: decisions are made directly on the whole-token units by expert classifiers, each one dedicated to one specific task. The classifier predictions are independent of one another, and only at the end do we synthesize their predictions. This blazingly fast approach requires only a single huggingface call, without the need for recourse to lexicons or linguistic resources. When trained on the same training set used in previous studies, our model achieves near-SOTA performance on a wide array of Hebrew NLP tasks. Furthermore, when trained on a newly enlarged training corpus, our model achieves a new SOTA for Hebrew POS tagging and dependency parsing. We release this new SOTA model to the community. Because our architecture does not rely on any language-specific resources, it can serve as a model to develop similar parsers for other MRLs.
%R 10.18653/v1/2024.findings-acl.269
%U https://aclanthology.org/2024.findings-acl.269
%U https://doi.org/10.18653/v1/2024.findings-acl.269
%P 4537-4550
Markdown (Informal)
[MRL Parsing Without Tears: The Case of Hebrew](https://aclanthology.org/2024.findings-acl.269) (Shmidman et al., Findings 2024)
ACL
- Shaltiel Shmidman, Avi Shmidman, Moshe Koppel, and Reut Tsarfaty. 2024. MRL Parsing Without Tears: The Case of Hebrew. In Findings of the Association for Computational Linguistics: ACL 2024, pages 4537–4550, Bangkok, Thailand. Association for Computational Linguistics.