362 lines
12 KiB
Python
362 lines
12 KiB
Python
|
#
|
||
|
# Natural Language Toolkit: ARLSTem Stemmer
|
||
|
#
|
||
|
# Copyright (C) 2001-2023 NLTK Project
|
||
|
#
|
||
|
# Author: Kheireddine Abainia (x-programer) <k.abainia@gmail.com>
|
||
|
# Algorithms: Kheireddine Abainia <k.abainia@gmail.com>
|
||
|
# Siham Ouamour
|
||
|
# Halim Sayoud
|
||
|
# URL: <https://www.nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
|
||
|
"""
|
||
|
ARLSTem Arabic Stemmer
|
||
|
The details about the implementation of this algorithm are described in:
|
||
|
K. Abainia, S. Ouamour and H. Sayoud, A Novel Robust Arabic Light Stemmer ,
|
||
|
Journal of Experimental & Theoretical Artificial Intelligence (JETAI'17),
|
||
|
Vol. 29, No. 3, 2017, pp. 557-573.
|
||
|
The ARLSTem is a light Arabic stemmer that is based on removing the affixes
|
||
|
from the word (i.e. prefixes, suffixes and infixes). It was evaluated and
|
||
|
compared to several other stemmers using Paice's parameters (under-stemming
|
||
|
index, over-stemming index and stemming weight), and the results showed that
|
||
|
ARLSTem is promising and producing high performances. This stemmer is not
|
||
|
based on any dictionary and can be used on-line effectively.
|
||
|
"""
|
||
|
import re
|
||
|
|
||
|
from nltk.stem.api import StemmerI
|
||
|
|
||
|
|
||
|
class ARLSTem(StemmerI):
|
||
|
"""
|
||
|
ARLSTem stemmer : a light Arabic Stemming algorithm without any dictionary.
|
||
|
Department of Telecommunication & Information Processing. USTHB University,
|
||
|
Algiers, Algeria.
|
||
|
ARLSTem.stem(token) returns the Arabic stem for the input token.
|
||
|
The ARLSTem Stemmer requires that all tokens are encoded using Unicode
|
||
|
encoding.
|
||
|
"""
|
||
|
|
||
|
def __init__(self):
|
||
|
# different Alif with hamza
|
||
|
self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]")
|
||
|
self.re_alifMaqsura = re.compile(r"[\u0649]")
|
||
|
self.re_diacritics = re.compile(r"[\u064B-\u065F]")
|
||
|
|
||
|
# Alif Laam, Laam Laam, Fa Laam, Fa Ba
|
||
|
self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"]
|
||
|
# Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
|
||
|
self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"]
|
||
|
# Fa Laam Laam, Waaw Laam Laam
|
||
|
self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"]
|
||
|
# Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
|
||
|
self.pr4 = [
|
||
|
"\u0641\u0628\u0627\u0644",
|
||
|
"\u0648\u0628\u0627\u0644",
|
||
|
"\u0641\u0643\u0627\u0644",
|
||
|
]
|
||
|
|
||
|
# Kaf Yaa, Kaf Miim
|
||
|
self.su2 = ["\u0643\u064A", "\u0643\u0645"]
|
||
|
# Ha Alif, Ha Miim
|
||
|
self.su22 = ["\u0647\u0627", "\u0647\u0645"]
|
||
|
# Kaf Miim Alif, Kaf Noon Shadda
|
||
|
self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"]
|
||
|
# Ha Miim Alif, Ha Noon Shadda
|
||
|
self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"]
|
||
|
|
||
|
# Alif Noon, Ya Noon, Waaw Noon
|
||
|
self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"]
|
||
|
# Taa Alif Noon, Taa Ya Noon
|
||
|
self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"]
|
||
|
|
||
|
# Alif Noon, Waaw Noon
|
||
|
self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"]
|
||
|
# Siin Taa, Siin Yaa
|
||
|
self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"]
|
||
|
# Siin Alif, Siin Noon
|
||
|
self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"]
|
||
|
# Lam Noon, Lam Taa, Lam Yaa, Lam Hamza
|
||
|
self.verb_pr33 = [
|
||
|
"\u0644\u0646",
|
||
|
"\u0644\u062A",
|
||
|
"\u0644\u064A",
|
||
|
"\u0644\u0623",
|
||
|
]
|
||
|
# Taa Miim Alif, Taa Noon Shadda
|
||
|
self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"]
|
||
|
# Noon Alif, Taa Miim, Taa Alif, Waaw Alif
|
||
|
self.verb_suf2 = [
|
||
|
"\u0646\u0627",
|
||
|
"\u062A\u0645",
|
||
|
"\u062A\u0627",
|
||
|
"\u0648\u0627",
|
||
|
]
|
||
|
# Taa, Alif, Noon
|
||
|
self.verb_suf1 = ["\u062A", "\u0627", "\u0646"]
|
||
|
|
||
|
def stem(self, token):
|
||
|
"""
|
||
|
call this function to get the word's stem based on ARLSTem .
|
||
|
"""
|
||
|
try:
|
||
|
if token is None:
|
||
|
raise ValueError(
|
||
|
"The word could not be stemmed, because \
|
||
|
it is empty !"
|
||
|
)
|
||
|
# remove Arabic diacritics and replace some letters with others
|
||
|
token = self.norm(token)
|
||
|
# strip common prefixes of the nouns
|
||
|
pre = self.pref(token)
|
||
|
if pre is not None:
|
||
|
token = pre
|
||
|
# strip the suffixes which are common to nouns and verbs
|
||
|
token = self.suff(token)
|
||
|
# transform a plural noun to a singular noun
|
||
|
ps = self.plur2sing(token)
|
||
|
if ps is None:
|
||
|
# transform from the feminine form to the masculine form
|
||
|
fm = self.fem2masc(token)
|
||
|
if fm is not None:
|
||
|
return fm
|
||
|
else:
|
||
|
if pre is None: # if the prefixes are not stripped
|
||
|
# strip the verb prefixes and suffixes
|
||
|
return self.verb(token)
|
||
|
else:
|
||
|
return ps
|
||
|
return token
|
||
|
except ValueError as e:
|
||
|
print(e)
|
||
|
|
||
|
def norm(self, token):
|
||
|
"""
|
||
|
normalize the word by removing diacritics, replacing hamzated Alif
|
||
|
with Alif replacing AlifMaqsura with Yaa and removing Waaw at the
|
||
|
beginning.
|
||
|
"""
|
||
|
# strip Arabic diacritics
|
||
|
token = self.re_diacritics.sub("", token)
|
||
|
# replace Hamzated Alif with Alif bare
|
||
|
token = self.re_hamzated_alif.sub("\u0627", token)
|
||
|
# replace alifMaqsura with Yaa
|
||
|
token = self.re_alifMaqsura.sub("\u064A", token)
|
||
|
# strip the Waaw from the word beginning if the remaining is 3 letters
|
||
|
# at least
|
||
|
if token.startswith("\u0648") and len(token) > 3:
|
||
|
token = token[1:]
|
||
|
return token
|
||
|
|
||
|
def pref(self, token):
|
||
|
"""
|
||
|
remove prefixes from the words' beginning.
|
||
|
"""
|
||
|
if len(token) > 5:
|
||
|
for p3 in self.pr3:
|
||
|
if token.startswith(p3):
|
||
|
return token[3:]
|
||
|
if len(token) > 6:
|
||
|
for p4 in self.pr4:
|
||
|
if token.startswith(p4):
|
||
|
return token[4:]
|
||
|
if len(token) > 5:
|
||
|
for p3 in self.pr32:
|
||
|
if token.startswith(p3):
|
||
|
return token[3:]
|
||
|
if len(token) > 4:
|
||
|
for p2 in self.pr2:
|
||
|
if token.startswith(p2):
|
||
|
return token[2:]
|
||
|
|
||
|
def suff(self, token):
|
||
|
"""
|
||
|
remove suffixes from the word's end.
|
||
|
"""
|
||
|
if token.endswith("\u0643") and len(token) > 3:
|
||
|
return token[:-1]
|
||
|
if len(token) > 4:
|
||
|
for s2 in self.su2:
|
||
|
if token.endswith(s2):
|
||
|
return token[:-2]
|
||
|
if len(token) > 5:
|
||
|
for s3 in self.su3:
|
||
|
if token.endswith(s3):
|
||
|
return token[:-3]
|
||
|
if token.endswith("\u0647") and len(token) > 3:
|
||
|
token = token[:-1]
|
||
|
return token
|
||
|
if len(token) > 4:
|
||
|
for s2 in self.su22:
|
||
|
if token.endswith(s2):
|
||
|
return token[:-2]
|
||
|
if len(token) > 5:
|
||
|
for s3 in self.su32:
|
||
|
if token.endswith(s3):
|
||
|
return token[:-3]
|
||
|
if token.endswith("\u0646\u0627") and len(token) > 4:
|
||
|
return token[:-2]
|
||
|
return token
|
||
|
|
||
|
def fem2masc(self, token):
|
||
|
"""
|
||
|
transform the word from the feminine form to the masculine form.
|
||
|
"""
|
||
|
if token.endswith("\u0629") and len(token) > 3:
|
||
|
return token[:-1]
|
||
|
|
||
|
def plur2sing(self, token):
|
||
|
"""
|
||
|
transform the word from the plural form to the singular form.
|
||
|
"""
|
||
|
if len(token) > 4:
|
||
|
for ps2 in self.pl_si2:
|
||
|
if token.endswith(ps2):
|
||
|
return token[:-2]
|
||
|
if len(token) > 5:
|
||
|
for ps3 in self.pl_si3:
|
||
|
if token.endswith(ps3):
|
||
|
return token[:-3]
|
||
|
if len(token) > 3 and token.endswith("\u0627\u062A"):
|
||
|
return token[:-2]
|
||
|
if len(token) > 3 and token.startswith("\u0627") and token[2] == "\u0627":
|
||
|
return token[:2] + token[3:]
|
||
|
if len(token) > 4 and token.startswith("\u0627") and token[-2] == "\u0627":
|
||
|
return token[1:-2] + token[-1]
|
||
|
|
||
|
def verb(self, token):
|
||
|
"""
|
||
|
stem the verb prefixes and suffixes or both
|
||
|
"""
|
||
|
vb = self.verb_t1(token)
|
||
|
if vb is not None:
|
||
|
return vb
|
||
|
vb = self.verb_t2(token)
|
||
|
if vb is not None:
|
||
|
return vb
|
||
|
vb = self.verb_t3(token)
|
||
|
if vb is not None:
|
||
|
return vb
|
||
|
vb = self.verb_t4(token)
|
||
|
if vb is not None:
|
||
|
return vb
|
||
|
vb = self.verb_t5(token)
|
||
|
if vb is not None:
|
||
|
return vb
|
||
|
return self.verb_t6(token)
|
||
|
|
||
|
def verb_t1(self, token):
|
||
|
"""
|
||
|
stem the present prefixes and suffixes
|
||
|
"""
|
||
|
if len(token) > 5 and token.startswith("\u062A"): # Taa
|
||
|
for s2 in self.pl_si2:
|
||
|
if token.endswith(s2):
|
||
|
return token[1:-2]
|
||
|
if len(token) > 5 and token.startswith("\u064A"): # Yaa
|
||
|
for s2 in self.verb_su2:
|
||
|
if token.endswith(s2):
|
||
|
return token[1:-2]
|
||
|
if len(token) > 4 and token.startswith("\u0627"): # Alif
|
||
|
# Waaw Alif
|
||
|
if len(token) > 5 and token.endswith("\u0648\u0627"):
|
||
|
return token[1:-2]
|
||
|
# Yaa
|
||
|
if token.endswith("\u064A"):
|
||
|
return token[1:-1]
|
||
|
# Alif
|
||
|
if token.endswith("\u0627"):
|
||
|
return token[1:-1]
|
||
|
# Noon
|
||
|
if token.endswith("\u0646"):
|
||
|
return token[1:-1]
|
||
|
# ^Yaa, Noon$
|
||
|
if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"):
|
||
|
return token[1:-1]
|
||
|
# ^Taa, Noon$
|
||
|
if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"):
|
||
|
return token[1:-1]
|
||
|
|
||
|
def verb_t2(self, token):
|
||
|
"""
|
||
|
stem the future prefixes and suffixes
|
||
|
"""
|
||
|
if len(token) > 6:
|
||
|
for s2 in self.pl_si2:
|
||
|
# ^Siin Taa
|
||
|
if token.startswith(self.verb_pr2[0]) and token.endswith(s2):
|
||
|
return token[2:-2]
|
||
|
# ^Siin Yaa, Alif Noon$
|
||
|
if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[0]):
|
||
|
return token[2:-2]
|
||
|
# ^Siin Yaa, Waaw Noon$
|
||
|
if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[2]):
|
||
|
return token[2:-2]
|
||
|
# ^Siin Taa, Noon$
|
||
|
if (
|
||
|
len(token) > 5
|
||
|
and token.startswith(self.verb_pr2[0])
|
||
|
and token.endswith("\u0646")
|
||
|
):
|
||
|
return token[2:-1]
|
||
|
# ^Siin Yaa, Noon$
|
||
|
if (
|
||
|
len(token) > 5
|
||
|
and token.startswith(self.verb_pr2[1])
|
||
|
and token.endswith("\u0646")
|
||
|
):
|
||
|
return token[2:-1]
|
||
|
|
||
|
def verb_t3(self, token):
|
||
|
"""
|
||
|
stem the present suffixes
|
||
|
"""
|
||
|
if len(token) > 5:
|
||
|
for su3 in self.verb_suf3:
|
||
|
if token.endswith(su3):
|
||
|
return token[:-3]
|
||
|
if len(token) > 4:
|
||
|
for su2 in self.verb_suf2:
|
||
|
if token.endswith(su2):
|
||
|
return token[:-2]
|
||
|
if len(token) > 3:
|
||
|
for su1 in self.verb_suf1:
|
||
|
if token.endswith(su1):
|
||
|
return token[:-1]
|
||
|
|
||
|
def verb_t4(self, token):
|
||
|
"""
|
||
|
stem the present prefixes
|
||
|
"""
|
||
|
if len(token) > 3:
|
||
|
for pr1 in self.verb_suf1:
|
||
|
if token.startswith(pr1):
|
||
|
return token[1:]
|
||
|
if token.startswith("\u064A"):
|
||
|
return token[1:]
|
||
|
|
||
|
def verb_t5(self, token):
|
||
|
"""
|
||
|
stem the future prefixes
|
||
|
"""
|
||
|
if len(token) > 4:
|
||
|
for pr2 in self.verb_pr22:
|
||
|
if token.startswith(pr2):
|
||
|
return token[2:]
|
||
|
for pr2 in self.verb_pr2:
|
||
|
if token.startswith(pr2):
|
||
|
return token[2:]
|
||
|
return token
|
||
|
|
||
|
def verb_t6(self, token):
|
||
|
"""
|
||
|
stem the order prefixes
|
||
|
"""
|
||
|
if len(token) > 4:
|
||
|
for pr3 in self.verb_pr33:
|
||
|
if token.startswith(pr3):
|
||
|
return token[2:]
|
||
|
return token
|