From 93142f70e92b22e7820490d69abadd66604b5692 Mon Sep 17 00:00:00 2001 From: m_ghazizadeh Date: Tue, 1 Jul 2025 14:59:56 +0330 Subject: [PATCH] Prevent space_correction_plus3() from repeating one word in three combinations --- normalizer.py | 73 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/normalizer.py b/normalizer.py index fbd7d89..0e3e1e5 100644 --- a/normalizer.py +++ b/normalizer.py @@ -307,24 +307,63 @@ class Normalizer(): L = wrds.__len__() if L < 3: return doc_string - cnt = 1 - cnt2 = 0 - for i in range(0, L - 2): + + # NOTE: مشکل کد زیر : + # NOTE: این کد نیز مشکل ترکیبات هم پوشان را دارد به طوری که یک کلمه می تواند در سه ترکیب (یک بار انتها و یک بار وسط و یک بار ابتدای ترکیب) تکرار شود. + + # cnt = 1 + # cnt2 = 0 + # for i in range(0, L - 2): + # w = wrds[i] + wrds[i + 1] + wrds[i + 2] + # try: + # out_sentences = out_sentences + ' ' + self.dic3[w] + # cnt = 0 + # cnt2 = 2 + # except KeyError: + # if cnt == 1 and cnt2 == 0: + # out_sentences = out_sentences + ' ' + wrds[i] + # else: + # cnt2 -= 1 + # cnt = 1 + # if cnt == 1 and cnt2 == 0: + # out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] + # elif cnt == 1 and cnt2 == 1: + # out_sentences = out_sentences + ' ' + wrds[i + 2] + + # NOTE: کد جایگزین + + # cnt = 0 + # for i in range(0, L - 2): + # if cnt > 0: + # cnt -= 1 + # continue + # w = wrds[i] + wrds[i + 1] + wrds[i + 2] + # try: + # out_sentences = out_sentences + ' ' + self.dic3[w] + # cnt = 2 + # except KeyError: + # out_sentences = out_sentences + ' ' + wrds[i] + # if cnt == 2: + # out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] + # elif cnt == 1: + # out_sentences = out_sentences + ' ' + wrds[i + 2] + + # return out_sentences + + # NOTE: کد جایگزین دیگر + + i = 0 + while i < L - 2: w = wrds[i] + wrds[i + 1] + wrds[i + 2] - try: - out_sentences = out_sentences + ' ' + self.dic3[w] - cnt = 0 - cnt2 = 2 - except KeyError: - if cnt == 1 and cnt2 == 0: - out_sentences = out_sentences + ' ' + wrds[i] - else: - cnt2 -= 1 - cnt = 1 - if cnt == 1 and cnt2 == 0: - out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] - elif cnt == 1 and cnt2 == 1: - out_sentences = out_sentences + ' ' + wrds[i + 2] + if w in self.dic3: + out_sentences += ' ' + self.dic3[w] + i += 3 + else: + out_sentences += ' ' + wrds[i] + i += 1 + while i < L: + out_sentences += ' ' + wrds[i] + i += 1 return out_sentences def normalize(self, doc_string, new_line_elimination=False, return_dates = False):