% Workshop paper from the BLP-2023 proceedings (ACL Anthology record 2023.banglalp-1.3).
% The proper noun in the title is brace-protected as a whole word so that
% sentence-casing styles keep its capital; the abstract is informational only
% and is ignored by standard bibliography styles.
@inproceedings{bijoy-etal-2023-advancing,
    title = "Advancing {Bangla} Punctuation Restoration by a Monolingual Transformer-Based Method and a Large-Scale Corpus",
    author = "Bijoy, Mehedi Hasan and
      Faria, Mir Fatema Afroz and
      E Sobhani, Mahbub and
      Ferdoush, Tanzid and
      Shatabda, Swakkhar",
    editor = "Alam, Firoj and
      Kar, Sudipta and
      Chowdhury, Shammur Absar and
      Sadeque, Farig and
      Amin, Ruhul",
    booktitle = "Proceedings of the First Workshop on Bangla Language Processing (BLP-2023)",
    month = dec,
    year = "2023",
    address = "Singapore",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.banglalp-1.3",
    doi = "10.18653/v1/2023.banglalp-1.3",
    pages = "18--25",
    abstract = "Punctuation restoration is the endeavor of reinstating and rectifying missing or improper punctuation marks within a text, thereby eradicating ambiguity in written discourse. The Bangla punctuation restoration task has received little attention and exploration, despite the rising popularity of textual communication in the language. The primary hindrances in the advancement of the task revolve around the utilization of transformer-based methods and an openly accessible extensive corpus, challenges that we discovered remained unresolved in earlier efforts. In this study, we propose a baseline by introducing a mono-lingual transformer-based method named Jatikarok, where the effectiveness of transfer learning has been meticulously scrutinized, and a large-scale corpus containing 1.48M source-target pairs to resolve the previous issues. The Jatikarok attains accuracy rates of 95.2{\%}, 85.13{\%}, and 91.36{\%} on the BanglaPRCorpus, Prothom-Alo Balanced, and BanglaOPUS corpora, thereby establishing itself as the state-of-the-art method through its superior performance compared to BanglaT5 and T5-Small. Jatikarok and BanglaPRCorpus are publicly available at: https://github.com/mehedihasanbijoy/Jatikarok-and-BanglaPRCorpus",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bijoy-etal-2023-advancing">
<titleInfo>
<title>Advancing Bangla Punctuation Restoration by a Monolingual Transformer-Based Method and a Large-Scale Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mehedi</namePart>
<namePart type="given">Hasan</namePart>
<namePart type="family">Bijoy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mir</namePart>
<namePart type="given">Fatema</namePart>
<namePart type="given">Afroz</namePart>
<namePart type="family">Faria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mahbub</namePart>
<namePart type="family">E Sobhani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanzid</namePart>
<namePart type="family">Ferdoush</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Swakkhar</namePart>
<namePart type="family">Shatabda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Bangla Language Processing (BLP-2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Firoj</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sudipta</namePart>
<namePart type="family">Kar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shammur</namePart>
<namePart type="given">Absar</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Farig</namePart>
<namePart type="family">Sadeque</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruhul</namePart>
<namePart type="family">Amin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Punctuation restoration is the endeavor of reinstating and rectifying missing or improper punctuation marks within a text, thereby eradicating ambiguity in written discourse. The Bangla punctuation restoration task has received little attention and exploration, despite the rising popularity of textual communication in the language. The primary hindrances in the advancement of the task revolve around the utilization of transformer-based methods and an openly accessible extensive corpus, challenges that we discovered remained unresolved in earlier efforts. In this study, we propose a baseline by introducing a mono-lingual transformer-based method named Jatikarok, where the effectiveness of transfer learning has been meticulously scrutinized, and a large-scale corpus containing 1.48M source-target pairs to resolve the previous issues. The Jatikarok attains accuracy rates of 95.2%, 85.13%, and 91.36% on the BanglaPRCorpus, Prothom-Alo Balanced, and BanglaOPUS corpora, thereby establishing itself as the state-of-the-art method through its superior performance compared to BanglaT5 and T5-Small. Jatikarok and BanglaPRCorpus are publicly available at: https://github.com/mehedihasanbijoy/Jatikarok-and-BanglaPRCorpus</abstract>
<identifier type="citekey">bijoy-etal-2023-advancing</identifier>
<identifier type="doi">10.18653/v1/2023.banglalp-1.3</identifier>
<location>
<url>https://aclanthology.org/2023.banglalp-1.3</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>18</start>
<end>25</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Advancing Bangla Punctuation Restoration by a Monolingual Transformer-Based Method and a Large-Scale Corpus
%A Bijoy, Mehedi Hasan
%A Faria, Mir Fatema Afroz
%A E Sobhani, Mahbub
%A Ferdoush, Tanzid
%A Shatabda, Swakkhar
%Y Alam, Firoj
%Y Kar, Sudipta
%Y Chowdhury, Shammur Absar
%Y Sadeque, Farig
%Y Amin, Ruhul
%S Proceedings of the First Workshop on Bangla Language Processing (BLP-2023)
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F bijoy-etal-2023-advancing
%X Punctuation restoration is the endeavor of reinstating and rectifying missing or improper punctuation marks within a text, thereby eradicating ambiguity in written discourse. The Bangla punctuation restoration task has received little attention and exploration, despite the rising popularity of textual communication in the language. The primary hindrances in the advancement of the task revolve around the utilization of transformer-based methods and an openly accessible extensive corpus, challenges that we discovered remained unresolved in earlier efforts. In this study, we propose a baseline by introducing a mono-lingual transformer-based method named Jatikarok, where the effectiveness of transfer learning has been meticulously scrutinized, and a large-scale corpus containing 1.48M source-target pairs to resolve the previous issues. The Jatikarok attains accuracy rates of 95.2%, 85.13%, and 91.36% on the BanglaPRCorpus, Prothom-Alo Balanced, and BanglaOPUS corpora, thereby establishing itself as the state-of-the-art method through its superior performance compared to BanglaT5 and T5-Small. Jatikarok and BanglaPRCorpus are publicly available at: https://github.com/mehedihasanbijoy/Jatikarok-and-BanglaPRCorpus
%R 10.18653/v1/2023.banglalp-1.3
%U https://aclanthology.org/2023.banglalp-1.3
%U https://doi.org/10.18653/v1/2023.banglalp-1.3
%P 18-25
Markdown (Informal)
[Advancing Bangla Punctuation Restoration by a Monolingual Transformer-Based Method and a Large-Scale Corpus](https://aclanthology.org/2023.banglalp-1.3) (Bijoy et al., BanglaLP 2023)
ACL