Prevent space_correction_plus3() from repeating one word in three combinations
This commit is contained in:
parent
d6f3f95c37
commit
93142f70e9
|
@@ -307,24 +307,63 @@ class Normalizer():
|
||||||
L = wrds.__len__()
|
L = wrds.__len__()
|
||||||
if L < 3:
|
if L < 3:
|
||||||
return doc_string
|
return doc_string
|
||||||
cnt = 1
|
|
||||||
cnt2 = 0
|
# NOTE: مشکل کد زیر :
|
||||||
for i in range(0, L - 2):
|
# NOTE: این کد نیز مشکل ترکیبات هم پوشان را دارد به طوری که یک کلمه می تواند در سه ترکیب (یک بار انتها و یک بار وسط و یک بار ابتدای ترکیب) تکرار شود.
|
||||||
|
|
||||||
|
# cnt = 1
|
||||||
|
# cnt2 = 0
|
||||||
|
# for i in range(0, L - 2):
|
||||||
|
# w = wrds[i] + wrds[i + 1] + wrds[i + 2]
|
||||||
|
# try:
|
||||||
|
# out_sentences = out_sentences + ' ' + self.dic3[w]
|
||||||
|
# cnt = 0
|
||||||
|
# cnt2 = 2
|
||||||
|
# except KeyError:
|
||||||
|
# if cnt == 1 and cnt2 == 0:
|
||||||
|
# out_sentences = out_sentences + ' ' + wrds[i]
|
||||||
|
# else:
|
||||||
|
# cnt2 -= 1
|
||||||
|
# cnt = 1
|
||||||
|
# if cnt == 1 and cnt2 == 0:
|
||||||
|
# out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
|
||||||
|
# elif cnt == 1 and cnt2 == 1:
|
||||||
|
# out_sentences = out_sentences + ' ' + wrds[i + 2]
|
||||||
|
|
||||||
|
# NOTE: کد جایگزین
|
||||||
|
|
||||||
|
# cnt = 0
|
||||||
|
# for i in range(0, L - 2):
|
||||||
|
# if cnt > 0:
|
||||||
|
# cnt -= 1
|
||||||
|
# continue
|
||||||
|
# w = wrds[i] + wrds[i + 1] + wrds[i + 2]
|
||||||
|
# try:
|
||||||
|
# out_sentences = out_sentences + ' ' + self.dic3[w]
|
||||||
|
# cnt = 2
|
||||||
|
# except KeyError:
|
||||||
|
# out_sentences = out_sentences + ' ' + wrds[i]
|
||||||
|
# if cnt == 2:
|
||||||
|
# out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
|
||||||
|
# elif cnt == 1:
|
||||||
|
# out_sentences = out_sentences + ' ' + wrds[i + 2]
|
||||||
|
|
||||||
|
# return out_sentences
|
||||||
|
|
||||||
|
# NOTE: کد جایگزین دیگر
|
||||||
|
|
||||||
|
i = 0
|
||||||
|
while i < L - 2:
|
||||||
w = wrds[i] + wrds[i + 1] + wrds[i + 2]
|
w = wrds[i] + wrds[i + 1] + wrds[i + 2]
|
||||||
try:
|
if w in self.dic3:
|
||||||
out_sentences = out_sentences + ' ' + self.dic3[w]
|
out_sentences += ' ' + self.dic3[w]
|
||||||
cnt = 0
|
i += 3
|
||||||
cnt2 = 2
|
else:
|
||||||
except KeyError:
|
out_sentences += ' ' + wrds[i]
|
||||||
if cnt == 1 and cnt2 == 0:
|
i += 1
|
||||||
out_sentences = out_sentences + ' ' + wrds[i]
|
while i < L:
|
||||||
else:
|
out_sentences += ' ' + wrds[i]
|
||||||
cnt2 -= 1
|
i += 1
|
||||||
cnt = 1
|
|
||||||
if cnt == 1 and cnt2 == 0:
|
|
||||||
out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
|
|
||||||
elif cnt == 1 and cnt2 == 1:
|
|
||||||
out_sentences = out_sentences + ' ' + wrds[i + 2]
|
|
||||||
return out_sentences
|
return out_sentences
|
||||||
|
|
||||||
def normalize(self, doc_string, new_line_elimination=False, return_dates = False):
|
def normalize(self, doc_string, new_line_elimination=False, return_dates = False):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user