Compare commits
3 Commits
Author | SHA1 | Date | |
---|---|---|---|
93142f70e9 | |||
d6f3f95c37 | |||
7b9537dd55 |
123
normalizer.py
123
normalizer.py
|
@ -250,18 +250,54 @@ class Normalizer():
|
|||
L = wrds.__len__()
|
||||
if L < 2:
|
||||
return doc_string
|
||||
cnt = 1
|
||||
for i in range(0, L - 1):
|
||||
w = wrds[i] + wrds[i + 1]
|
||||
|
||||
# NOTE: مشکل کد زیر :
|
||||
# NOTE: یک کلمه می تواند هم با کلمه قبل و هم با کلمه بعد ترکیب موفق درست کند و این باعث تکرار یک تک کلمه در دو ترکیب می شود.
|
||||
|
||||
# cnt = 1
|
||||
# for i in range(0, L - 1):
|
||||
# w = wrds[i] + wrds[i + 1]
|
||||
# try:
|
||||
# out_sentences = out_sentences + ' ' + self.dic2[w]
|
||||
# cnt = 0
|
||||
# except KeyError:
|
||||
# if cnt == 1:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i]
|
||||
# cnt = 1
|
||||
# if cnt == 1:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i + 1]
|
||||
|
||||
# NOTE: کد جایگزین
|
||||
|
||||
# cnt = 1
|
||||
# for i in range(L-1):
|
||||
# if cnt == 0:
|
||||
# cnt = 1
|
||||
# continue
|
||||
# w = wrds[i] + wrds[i+1]
|
||||
# try:
|
||||
# out_sentences = out_sentences + ' ' + self.dic2[w]
|
||||
# cnt = 0
|
||||
# except KeyError:
|
||||
# if cnt == 1:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i]
|
||||
# cnt = 1
|
||||
# if cnt == 1:
|
||||
# out_sentences = out_sentences + ' ' + wrds[L-1]
|
||||
|
||||
# NOTE: کد جایگزین دیگر
|
||||
|
||||
i = 0
|
||||
while i > L -1:
|
||||
w = wrds[i] + wrds[i+1]
|
||||
try:
|
||||
out_sentences = out_sentences + ' ' + self.dic2[w]
|
||||
cnt = 0
|
||||
i = i + 2
|
||||
except KeyError:
|
||||
if cnt == 1:
|
||||
out_sentences = out_sentences + ' ' + wrds[i]
|
||||
cnt = 1
|
||||
if cnt == 1:
|
||||
out_sentences = out_sentences + ' ' + wrds[i + 1]
|
||||
i = i + 1
|
||||
if i == L - 1:
|
||||
out_sentences = out_sentences + ' ' + wrds[L-1]
|
||||
return out_sentences
|
||||
|
||||
def space_correction_plus3(self, doc_string):
|
||||
|
@ -271,24 +307,63 @@ class Normalizer():
|
|||
L = wrds.__len__()
|
||||
if L < 3:
|
||||
return doc_string
|
||||
cnt = 1
|
||||
cnt2 = 0
|
||||
for i in range(0, L - 2):
|
||||
|
||||
# NOTE: مشکل کد زیر :
|
||||
# NOTE: این کد نیز مشکل ترکیبات هم پوشان را دارد به طوری که یک کلمه می تواند در سه ترکیب (یک بار انتها و یک بار وسط و یک بار ابتدای ترکیب) تکرار شود.
|
||||
|
||||
# cnt = 1
|
||||
# cnt2 = 0
|
||||
# for i in range(0, L - 2):
|
||||
# w = wrds[i] + wrds[i + 1] + wrds[i + 2]
|
||||
# try:
|
||||
# out_sentences = out_sentences + ' ' + self.dic3[w]
|
||||
# cnt = 0
|
||||
# cnt2 = 2
|
||||
# except KeyError:
|
||||
# if cnt == 1 and cnt2 == 0:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i]
|
||||
# else:
|
||||
# cnt2 -= 1
|
||||
# cnt = 1
|
||||
# if cnt == 1 and cnt2 == 0:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
|
||||
# elif cnt == 1 and cnt2 == 1:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i + 2]
|
||||
|
||||
# NOTE: کد جایگزین
|
||||
|
||||
# cnt = 0
|
||||
# for i in range(0, L - 2):
|
||||
# if cnt > 0:
|
||||
# cnt -= 1
|
||||
# continue
|
||||
# w = wrds[i] + wrds[i + 1] + wrds[i + 2]
|
||||
# try:
|
||||
# out_sentences = out_sentences + ' ' + self.dic3[w]
|
||||
# cnt = 2
|
||||
# except KeyError:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i]
|
||||
# if cnt == 2:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
|
||||
# elif cnt == 1:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i + 2]
|
||||
|
||||
# return out_sentences
|
||||
|
||||
# NOTE: کد جایگزین دیگر
|
||||
|
||||
i = 0
|
||||
while i < L - 2:
|
||||
w = wrds[i] + wrds[i + 1] + wrds[i + 2]
|
||||
try:
|
||||
out_sentences = out_sentences + ' ' + self.dic3[w]
|
||||
cnt = 0
|
||||
cnt2 = 2
|
||||
except KeyError:
|
||||
if cnt == 1 and cnt2 == 0:
|
||||
out_sentences = out_sentences + ' ' + wrds[i]
|
||||
if w in self.dic3:
|
||||
out_sentences += ' ' + self.dic3[w]
|
||||
i += 3
|
||||
else:
|
||||
cnt2 -= 1
|
||||
cnt = 1
|
||||
if cnt == 1 and cnt2 == 0:
|
||||
out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
|
||||
elif cnt == 1 and cnt2 == 1:
|
||||
out_sentences = out_sentences + ' ' + wrds[i + 2]
|
||||
out_sentences += ' ' + wrds[i]
|
||||
i += 1
|
||||
while i < L:
|
||||
out_sentences += ' ' + wrds[i]
|
||||
i += 1
|
||||
return out_sentences
|
||||
|
||||
def normalize(self, doc_string, new_line_elimination=False, return_dates = False):
|
||||
|
|
|
@ -16,7 +16,7 @@ class Tokenizer():
|
|||
nums_list = re.findall(pattern, doc_string)
|
||||
doc_string = re.sub(pattern, 'floatingpointnumber', doc_string)
|
||||
|
||||
pattern = r'([!\.\?؟]+)[\n]*'
|
||||
pattern = r'([!\.\?؟]+)\n*'
|
||||
tmp = re.findall(pattern, doc_string)
|
||||
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||
|
||||
|
@ -32,7 +32,7 @@ class Tokenizer():
|
|||
tmp = re.findall(pattern, doc_string)
|
||||
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||
|
||||
pattern = r'[\n]+'
|
||||
pattern = r'\n+'
|
||||
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||
|
||||
for number in nums_list:
|
||||
|
|
Loading…
Reference in New Issue
Block a user