@inproceedings{bhatia-etal-2024-qalam,
title = "Qalam: A Multimodal {LLM} for {A}rabic Optical Character and Handwriting Recognition",
author = "Bhatia, Gagan and
Nagoudi, El Moatez Billah and
Alwajih, Fakhraddin and
Abdul-Mageed, Muhammad",
editor = "Habash, Nizar and
Bouamor, Houda and
Eskander, Ramy and
Tomeh, Nadi and
Abu Farha, Ibrahim and
Abdelali, Ahmed and
Touileb, Samia and
Hamed, Injy and
Onaizan, Yaser and
Alhafni, Bashar and
Antoun, Wissam and
Khalifa, Salam and
Haddad, Hatem and
Zitouni, Imed and
AlKhamissi, Badr and
Almatham, Rawan and
Mrini, Khalil",
booktitle = "Proceedings of The Second Arabic Natural Language Processing Conference",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.arabicnlp-1.19",
doi = "10.18653/v1/2024.arabicnlp-1.19",
pages = "210--224",
abstract = "Arabic Optical Character Recognition (OCR) and Handwriting Recognition (HWR) pose unique challenges due to the cursive and context-sensitive nature of the Arabic script. This study introduces ***Qalam***, a novel foundation model designed for Arabic OCR and HWR, built on a SwinV2 encoder and RoBERTa decoder architecture. Our model significantly outperforms existing methods, achieving a Word Error Rate (WER) of just 0.80{\%} in HWR tasks and 1.18{\%} in OCR tasks. We train ***Qalam*** on a diverse dataset, including over 4.5 million images from Arabic manuscripts and a synthetic dataset comprising 60k image-text pairs. Notably, ***Qalam*** demonstrates exceptional handling of Arabic diacritics, a critical feature in Arabic scripts. Furthermore, it shows a remarkable ability to process high-resolution inputs, addressing a common limitation in current OCR systems. These advancements underscore ***Qalam***{'}s potential as a leading solution for Arabic script recognition, offering a significant leap in accuracy and efficiency.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bhatia-etal-2024-qalam">
<titleInfo>
<title>Qalam: A Multimodal LLM for Arabic Optical Character and Handwriting Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gagan</namePart>
<namePart type="family">Bhatia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">El</namePart>
<namePart type="given">Moatez</namePart>
<namePart type="given">Billah</namePart>
<namePart type="family">Nagoudi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fakhraddin</namePart>
<namePart type="family">Alwajih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="family">Abdul-Mageed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The Second Arabic Natural Language Processing Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nizar</namePart>
<namePart type="family">Habash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramy</namePart>
<namePart type="family">Eskander</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadi</namePart>
<namePart type="family">Tomeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibrahim</namePart>
<namePart type="family">Abu Farha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Abdelali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samia</namePart>
<namePart type="family">Touileb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Injy</namePart>
<namePart type="family">Hamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bashar</namePart>
<namePart type="family">Alhafni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wissam</namePart>
<namePart type="family">Antoun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salam</namePart>
<namePart type="family">Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hatem</namePart>
<namePart type="family">Haddad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Badr</namePart>
<namePart type="family">AlKhamissi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rawan</namePart>
<namePart type="family">Almatham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalil</namePart>
<namePart type="family">Mrini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Arabic Optical Character Recognition (OCR) and Handwriting Recognition (HWR) pose unique challenges due to the cursive and context-sensitive nature of the Arabic script. This study introduces ***Qalam***, a novel foundation model designed for Arabic OCR and HWR, built on a SwinV2 encoder and RoBERTa decoder architecture. Our model significantly outperforms existing methods, achieving a Word Error Rate (WER) of just 0.80% in HWR tasks and 1.18% in OCR tasks. We train ***Qalam*** on a diverse dataset, including over 4.5 million images from Arabic manuscripts and a synthetic dataset comprising 60k image-text pairs. Notably, ***Qalam*** demonstrates exceptional handling of Arabic diacritics, a critical feature in Arabic scripts. Furthermore, it shows a remarkable ability to process high-resolution inputs, addressing a common limitation in current OCR systems. These advancements underscore ***Qalam***’s potential as a leading solution for Arabic script recognition, offering a significant leap in accuracy and efficiency.</abstract>
<identifier type="citekey">bhatia-etal-2024-qalam</identifier>
<identifier type="doi">10.18653/v1/2024.arabicnlp-1.19</identifier>
<location>
<url>https://aclanthology.org/2024.arabicnlp-1.19</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>210</start>
<end>224</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Qalam: A Multimodal LLM for Arabic Optical Character and Handwriting Recognition
%A Bhatia, Gagan
%A Nagoudi, El Moatez Billah
%A Alwajih, Fakhraddin
%A Abdul-Mageed, Muhammad
%Y Habash, Nizar
%Y Bouamor, Houda
%Y Eskander, Ramy
%Y Tomeh, Nadi
%Y Abu Farha, Ibrahim
%Y Abdelali, Ahmed
%Y Touileb, Samia
%Y Hamed, Injy
%Y Onaizan, Yaser
%Y Alhafni, Bashar
%Y Antoun, Wissam
%Y Khalifa, Salam
%Y Haddad, Hatem
%Y Zitouni, Imed
%Y AlKhamissi, Badr
%Y Almatham, Rawan
%Y Mrini, Khalil
%S Proceedings of The Second Arabic Natural Language Processing Conference
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F bhatia-etal-2024-qalam
%X Arabic Optical Character Recognition (OCR) and Handwriting Recognition (HWR) pose unique challenges due to the cursive and context-sensitive nature of the Arabic script. This study introduces ***Qalam***, a novel foundation model designed for Arabic OCR and HWR, built on a SwinV2 encoder and RoBERTa decoder architecture. Our model significantly outperforms existing methods, achieving a Word Error Rate (WER) of just 0.80% in HWR tasks and 1.18% in OCR tasks. We train ***Qalam*** on a diverse dataset, including over 4.5 million images from Arabic manuscripts and a synthetic dataset comprising 60k image-text pairs. Notably, ***Qalam*** demonstrates exceptional handling of Arabic diacritics, a critical feature in Arabic scripts. Furthermore, it shows a remarkable ability to process high-resolution inputs, addressing a common limitation in current OCR systems. These advancements underscore ***Qalam***’s potential as a leading solution for Arabic script recognition, offering a significant leap in accuracy and efficiency.
%R 10.18653/v1/2024.arabicnlp-1.19
%U https://aclanthology.org/2024.arabicnlp-1.19
%U https://doi.org/10.18653/v1/2024.arabicnlp-1.19
%P 210-224
Markdown (Informal)
[Qalam: A Multimodal LLM for Arabic Optical Character and Handwriting Recognition](https://aclanthology.org/2024.arabicnlp-1.19) (Bhatia et al., ArabicNLP-WS 2024)
ACL