@inproceedings{segarra-soriano-etal-2022-dacsa,
title = "{DACSA}: A large-scale Dataset for Automatic summarization of {C}atalan and {S}panish newspaper Articles",
author = "Segarra Soriano, Encarnaci{\'o}n and
Ahuir, Vicent and
Hurtado, Llu{\'\i}s-F. and
Gonz{\'a}lez, Jos{\'e}",
editor = "Carpuat, Marine and
de Marneffe, Marie-Catherine and
Meza Ruiz, Ivan Vladimir",
booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
month = jul,
year = "2022",
address = "Seattle, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.naacl-main.434",
doi = "10.18653/v1/2022.naacl-main.434",
pages = "5931--5943",
abstract = "The application of supervised methods to automatic summarization requires the availability of adequate corpora consisting of a set of document-summary pairs. As in most Natural Language Processing tasks, the great majority of available datasets for summarization are in English, making it difficult to develop automatic summarization models for other languages. Although Spanish is gradually forming part of some recent summarization corpora, it is not the same for minority languages such as Catalan. In this work, we describe the construction of a corpus of Catalan and Spanish newspapers, the Dataset for Automatic summarization of Catalan and Spanish newspaper Articles (DACSA) corpus. It is a high-quality large-scale corpus that can be used to train summarization models for Catalan and Spanish.We have carried out an analysis of the corpus, both in terms of the style of the summaries and the difficulty of the summarization task. In particular, we have used a set of well-known metrics in the summarization field in order to characterize the corpus. Additionally, for benchmarking purposes, we have evaluated the performances of some extractive and abstractive summarization systems on the DACSA corpus.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="segarra-soriano-etal-2022-dacsa">
<titleInfo>
<title>DACSA: A large-scale Dataset for Automatic summarization of Catalan and Spanish newspaper Articles</title>
</titleInfo>
<name type="personal">
<namePart type="given">Encarnación</namePart>
<namePart type="family">Segarra Soriano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vicent</namePart>
<namePart type="family">Ahuir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís-F.</namePart>
<namePart type="family">Hurtado</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">José</namePart>
<namePart type="family">González</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marine</namePart>
<namePart type="family">Carpuat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Catherine</namePart>
<namePart type="family">de Marneffe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="given">Vladimir</namePart>
<namePart type="family">Meza Ruiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The application of supervised methods to automatic summarization requires the availability of adequate corpora consisting of a set of document-summary pairs. As in most Natural Language Processing tasks, the great majority of available datasets for summarization are in English, making it difficult to develop automatic summarization models for other languages. Although Spanish is gradually forming part of some recent summarization corpora, it is not the same for minority languages such as Catalan. In this work, we describe the construction of a corpus of Catalan and Spanish newspapers, the Dataset for Automatic summarization of Catalan and Spanish newspaper Articles (DACSA) corpus. It is a high-quality large-scale corpus that can be used to train summarization models for Catalan and Spanish.We have carried out an analysis of the corpus, both in terms of the style of the summaries and the difficulty of the summarization task. In particular, we have used a set of well-known metrics in the summarization field in order to characterize the corpus. Additionally, for benchmarking purposes, we have evaluated the performances of some extractive and abstractive summarization systems on the DACSA corpus.</abstract>
<identifier type="citekey">segarra-soriano-etal-2022-dacsa</identifier>
<identifier type="doi">10.18653/v1/2022.naacl-main.434</identifier>
<location>
<url>https://aclanthology.org/2022.naacl-main.434</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>5931</start>
<end>5943</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DACSA: A large-scale Dataset for Automatic summarization of Catalan and Spanish newspaper Articles
%A Segarra Soriano, Encarnación
%A Ahuir, Vicent
%A Hurtado, Lluís-F.
%A González, José
%Y Carpuat, Marine
%Y de Marneffe, Marie-Catherine
%Y Meza Ruiz, Ivan Vladimir
%S Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, United States
%F segarra-soriano-etal-2022-dacsa
%X The application of supervised methods to automatic summarization requires the availability of adequate corpora consisting of a set of document-summary pairs. As in most Natural Language Processing tasks, the great majority of available datasets for summarization are in English, making it difficult to develop automatic summarization models for other languages. Although Spanish is gradually forming part of some recent summarization corpora, it is not the same for minority languages such as Catalan. In this work, we describe the construction of a corpus of Catalan and Spanish newspapers, the Dataset for Automatic summarization of Catalan and Spanish newspaper Articles (DACSA) corpus. It is a high-quality large-scale corpus that can be used to train summarization models for Catalan and Spanish.We have carried out an analysis of the corpus, both in terms of the style of the summaries and the difficulty of the summarization task. In particular, we have used a set of well-known metrics in the summarization field in order to characterize the corpus. Additionally, for benchmarking purposes, we have evaluated the performances of some extractive and abstractive summarization systems on the DACSA corpus.
%R 10.18653/v1/2022.naacl-main.434
%U https://aclanthology.org/2022.naacl-main.434
%U https://doi.org/10.18653/v1/2022.naacl-main.434
%P 5931-5943
Markdown (Informal)
[DACSA: A large-scale Dataset for Automatic summarization of Catalan and Spanish newspaper Articles](https://aclanthology.org/2022.naacl-main.434) (Segarra Soriano et al., NAACL 2022)
ACL