Prevent space_correction_plus3() from repeating one word in three combinations
This commit is contained in:
parent
d6f3f95c37
commit
93142f70e9
|
@ -307,24 +307,63 @@ class Normalizer():
|
|||
L = wrds.__len__()
|
||||
if L < 3:
|
||||
return doc_string
|
||||
cnt = 1
|
||||
cnt2 = 0
|
||||
for i in range(0, L - 2):
|
||||
|
||||
# NOTE: Problem with the code below:
|
||||
# NOTE: This code also suffers from the overlapping-combinations problem: a single word can be repeated in three combinations (once at the end, once in the middle, and once at the start of a combination).
|
||||
|
||||
# cnt = 1
|
||||
# cnt2 = 0
|
||||
# for i in range(0, L - 2):
|
||||
# w = wrds[i] + wrds[i + 1] + wrds[i + 2]
|
||||
# try:
|
||||
# out_sentences = out_sentences + ' ' + self.dic3[w]
|
||||
# cnt = 0
|
||||
# cnt2 = 2
|
||||
# except KeyError:
|
||||
# if cnt == 1 and cnt2 == 0:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i]
|
||||
# else:
|
||||
# cnt2 -= 1
|
||||
# cnt = 1
|
||||
# if cnt == 1 and cnt2 == 0:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
|
||||
# elif cnt == 1 and cnt2 == 1:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i + 2]
|
||||
|
||||
# NOTE: Alternative code
|
||||
|
||||
# cnt = 0
|
||||
# for i in range(0, L - 2):
|
||||
# if cnt > 0:
|
||||
# cnt -= 1
|
||||
# continue
|
||||
# w = wrds[i] + wrds[i + 1] + wrds[i + 2]
|
||||
# try:
|
||||
# out_sentences = out_sentences + ' ' + self.dic3[w]
|
||||
# cnt = 2
|
||||
# except KeyError:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i]
|
||||
# if cnt == 2:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
|
||||
# elif cnt == 1:
|
||||
# out_sentences = out_sentences + ' ' + wrds[i + 2]
|
||||
|
||||
# return out_sentences
|
||||
|
||||
# NOTE: Another alternative code
|
||||
|
||||
i = 0
|
||||
while i < L - 2:
|
||||
w = wrds[i] + wrds[i + 1] + wrds[i + 2]
|
||||
try:
|
||||
out_sentences = out_sentences + ' ' + self.dic3[w]
|
||||
cnt = 0
|
||||
cnt2 = 2
|
||||
except KeyError:
|
||||
if cnt == 1 and cnt2 == 0:
|
||||
out_sentences = out_sentences + ' ' + wrds[i]
|
||||
if w in self.dic3:
|
||||
out_sentences += ' ' + self.dic3[w]
|
||||
i += 3
|
||||
else:
|
||||
cnt2 -= 1
|
||||
cnt = 1
|
||||
if cnt == 1 and cnt2 == 0:
|
||||
out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
|
||||
elif cnt == 1 and cnt2 == 1:
|
||||
out_sentences = out_sentences + ' ' + wrds[i + 2]
|
||||
out_sentences += ' ' + wrds[i]
|
||||
i += 1
|
||||
while i < L:
|
||||
out_sentences += ' ' + wrds[i]
|
||||
i += 1
|
||||
return out_sentences
|
||||
|
||||
def normalize(self, doc_string, new_line_elimination=False, return_dates = False):
|
||||
|
|
Loading…
Reference in New Issue
Block a user