Prevent space_correction_plus3() from reusing the same word in up to three overlapping three-word combinations

This commit is contained in:
m_ghazizadeh 2025-07-01 14:59:56 +03:30
parent d6f3f95c37
commit 93142f70e9

View File

@ -307,24 +307,63 @@ class Normalizer():
L = wrds.__len__()
if L < 3:
return doc_string
cnt = 1
cnt2 = 0
for i in range(0, L - 2):
# NOTE: Problem with the code below:
# NOTE: This code also suffers from the overlapping-combinations problem: a single word can be reused in three combinations (once as the last word, once as the middle word, and once as the first word of a combination).
# cnt = 1
# cnt2 = 0
# for i in range(0, L - 2):
# w = wrds[i] + wrds[i + 1] + wrds[i + 2]
# try:
# out_sentences = out_sentences + ' ' + self.dic3[w]
# cnt = 0
# cnt2 = 2
# except KeyError:
# if cnt == 1 and cnt2 == 0:
# out_sentences = out_sentences + ' ' + wrds[i]
# else:
# cnt2 -= 1
# cnt = 1
# if cnt == 1 and cnt2 == 0:
# out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
# elif cnt == 1 and cnt2 == 1:
# out_sentences = out_sentences + ' ' + wrds[i + 2]
# NOTE: Replacement code
# cnt = 0
# for i in range(0, L - 2):
# if cnt > 0:
# cnt -= 1
# continue
# w = wrds[i] + wrds[i + 1] + wrds[i + 2]
# try:
# out_sentences = out_sentences + ' ' + self.dic3[w]
# cnt = 2
# except KeyError:
# out_sentences = out_sentences + ' ' + wrds[i]
# if cnt == 2:
# out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
# elif cnt == 1:
# out_sentences = out_sentences + ' ' + wrds[i + 2]
# return out_sentences
# NOTE: Another replacement implementation
i = 0
while i < L - 2:
w = wrds[i] + wrds[i + 1] + wrds[i + 2]
try:
out_sentences = out_sentences + ' ' + self.dic3[w]
cnt = 0
cnt2 = 2
except KeyError:
if cnt == 1 and cnt2 == 0:
out_sentences = out_sentences + ' ' + wrds[i]
if w in self.dic3:
out_sentences += ' ' + self.dic3[w]
i += 3
else:
cnt2 -= 1
cnt = 1
if cnt == 1 and cnt2 == 0:
out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
elif cnt == 1 and cnt2 == 1:
out_sentences = out_sentences + ' ' + wrds[i + 2]
out_sentences += ' ' + wrds[i]
i += 1
while i < L:
out_sentences += ' ' + wrds[i]
i += 1
return out_sentences
def normalize(self, doc_string, new_line_elimination=False, return_dates = False):