@inproceedings{zagatti-etal-2022-mwetoolkit,
title = "mwetoolkit-lib: Adaptation of the mwetoolkit as a Python Library and an Application to {MWE}-based Document Clustering",
author = "Zagatti, Fernando and
Medeiros, Paulo Augusto de Lima and
Soares, Esther da Cunha and
Silva, Lucas Nildaimon dos Santos and
Ramisch, Carlos and
Real, Livy",
editor = "Bhatia, Archna and
Cook, Paul and
Taslimipoor, Shiva and
Garcia, Marcos and
Ramisch, Carlos",
booktitle = "Proceedings of the 18th Workshop on Multiword Expressions @LREC2022",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.mwe-1.16",
pages = "112--117",
abstract = "This paper introduces the mwetoolkit-lib, an adaptation of the mwetoolkit as a python library. The original toolkit performs the extraction and identification of multiword expressions (MWEs) in large text bases through the command line. One of the contributions of our work is the adaptation of the MWE extraction pipeline from the mwetoolkit, allowing its usage in python development environments and integration in larger pipelines. The other contribution is the execution of a pilot experiment aiming to show the impact of MWE discovery in data professionals{'} work. This experiment found that the addition of MWE knowledge to the Term Frequency-Inverse Document Frequency (TF-IDF) vectorization altered the word relevance order, improving the linguistic quality of the clusters returned by k-means method.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zagatti-etal-2022-mwetoolkit">
<titleInfo>
<title>mwetoolkit-lib: Adaptation of the mwetoolkit as a Python Library and an Application to MWE-based Document Clustering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fernando</namePart>
<namePart type="family">Zagatti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paulo</namePart>
<namePart type="given">Augusto</namePart>
<namePart type="given">de</namePart>
<namePart type="given">Lima</namePart>
<namePart type="family">Medeiros</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Esther</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cunha</namePart>
<namePart type="family">Soares</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucas</namePart>
<namePart type="given">Nildaimon</namePart>
<namePart type="given">dos</namePart>
<namePart type="given">Santos</namePart>
<namePart type="family">Silva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carlos</namePart>
<namePart type="family">Ramisch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Livy</namePart>
<namePart type="family">Real</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th Workshop on Multiword Expressions @LREC2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Archna</namePart>
<namePart type="family">Bhatia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Cook</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shiva</namePart>
<namePart type="family">Taslimipoor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Garcia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carlos</namePart>
<namePart type="family">Ramisch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper introduces the mwetoolkit-lib, an adaptation of the mwetoolkit as a python library. The original toolkit performs the extraction and identification of multiword expressions (MWEs) in large text bases through the command line. One of the contributions of our work is the adaptation of the MWE extraction pipeline from the mwetoolkit, allowing its usage in python development environments and integration in larger pipelines. The other contribution is the execution of a pilot experiment aiming to show the impact of MWE discovery in data professionals’ work. This experiment found that the addition of MWE knowledge to the Term Frequency-Inverse Document Frequency (TF-IDF) vectorization altered the word relevance order, improving the linguistic quality of the clusters returned by k-means method.</abstract>
<identifier type="citekey">zagatti-etal-2022-mwetoolkit</identifier>
<location>
<url>https://aclanthology.org/2022.mwe-1.16</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>112</start>
<end>117</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T mwetoolkit-lib: Adaptation of the mwetoolkit as a Python Library and an Application to MWE-based Document Clustering
%A Zagatti, Fernando
%A Medeiros, Paulo Augusto de Lima
%A Soares, Esther da Cunha
%A Silva, Lucas Nildaimon dos Santos
%A Ramisch, Carlos
%A Real, Livy
%Y Bhatia, Archna
%Y Cook, Paul
%Y Taslimipoor, Shiva
%Y Garcia, Marcos
%Y Ramisch, Carlos
%S Proceedings of the 18th Workshop on Multiword Expressions @LREC2022
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F zagatti-etal-2022-mwetoolkit
%X This paper introduces the mwetoolkit-lib, an adaptation of the mwetoolkit as a python library. The original toolkit performs the extraction and identification of multiword expressions (MWEs) in large text bases through the command line. One of the contributions of our work is the adaptation of the MWE extraction pipeline from the mwetoolkit, allowing its usage in python development environments and integration in larger pipelines. The other contribution is the execution of a pilot experiment aiming to show the impact of MWE discovery in data professionals’ work. This experiment found that the addition of MWE knowledge to the Term Frequency-Inverse Document Frequency (TF-IDF) vectorization altered the word relevance order, improving the linguistic quality of the clusters returned by k-means method.
%U https://aclanthology.org/2022.mwe-1.16
%P 112-117
Markdown (Informal)
[mwetoolkit-lib: Adaptation of the mwetoolkit as a Python Library and an Application to MWE-based Document Clustering](https://aclanthology.org/2022.mwe-1.16) (Zagatti et al., MWE 2022)
ACL