diff --git a/normalizer.py b/normalizer.py
index 64890b3..fbd7d89 100644
--- a/normalizer.py
+++ b/normalizer.py
@@ -250,18 +250,54 @@ class Normalizer():
         L = wrds.__len__()
         if L < 2:
             return doc_string
-        cnt = 1
-        for i in range(0, L - 1):
-            w = wrds[i] + wrds[i + 1]
+
+        # NOTE: Problem with the code below:
+        # NOTE: A word can successfully combine with both the previous and the next word, so a single word ends up repeated in two combinations.
+
+        # cnt = 1
+        # for i in range(0, L - 1):
+        #     w = wrds[i] + wrds[i + 1]
+        #     try:
+        #         out_sentences = out_sentences + ' ' + self.dic2[w]
+        #         cnt = 0
+        #     except KeyError:
+        #         if cnt == 1:
+        #             out_sentences = out_sentences + ' ' + wrds[i]
+        #         cnt = 1
+        # if cnt == 1:
+        #     out_sentences = out_sentences + ' ' + wrds[i + 1]
+
+        # NOTE: Replacement code
+
+        # cnt = 1
+        # for i in range(L-1):
+        #     if cnt == 0:
+        #         cnt = 1
+        #         continue
+        #     w = wrds[i] + wrds[i+1]
+        #     try:
+        #         out_sentences = out_sentences + ' ' + self.dic2[w]
+        #         cnt = 0
+        #     except KeyError:
+        #         if cnt == 1:
+        #             out_sentences = out_sentences + ' ' + wrds[i]
+        #         cnt = 1
+        # if cnt == 1:
+        #     out_sentences = out_sentences + ' ' + wrds[L-1]
+
+        # NOTE: Another replacement: scan left to right; a merged pair consumes both words (i advances by 2), so no word is used in two combinations.
+
+        i = 0
+        while i < L - 1:
+            w = wrds[i] + wrds[i+1]
             try:
                 out_sentences = out_sentences + ' ' + self.dic2[w]
-                cnt = 0
+                i = i + 2
             except KeyError:
-                if cnt == 1:
-                    out_sentences = out_sentences + ' ' + wrds[i]
-                cnt = 1
-        if cnt == 1:
-            out_sentences = out_sentences + ' ' + wrds[i + 1]
+                out_sentences = out_sentences + ' ' + wrds[i]
+                i = i + 1
+        if i == L - 1:
+            out_sentences = out_sentences + ' ' + wrds[L-1]
         return out_sentences
 
     def space_correction_plus3(self, doc_string):