diff --git a/normalizer.py b/normalizer.py index fbd7d89..0e3e1e5 100644 --- a/normalizer.py +++ b/normalizer.py @@ -307,24 +307,63 @@ class Normalizer(): L = wrds.__len__() if L < 3: return doc_string - cnt = 1 - cnt2 = 0 - for i in range(0, L - 2): + + # NOTE: مشکل کد زیر : + # NOTE: این کد نیز مشکل ترکیبات هم پوشان را دارد به طوری که یک کلمه می تواند در سه ترکیب (یک بار انتها و یک بار وسط و یک بار ابتدای ترکیب) تکرار شود. + + # cnt = 1 + # cnt2 = 0 + # for i in range(0, L - 2): + # w = wrds[i] + wrds[i + 1] + wrds[i + 2] + # try: + # out_sentences = out_sentences + ' ' + self.dic3[w] + # cnt = 0 + # cnt2 = 2 + # except KeyError: + # if cnt == 1 and cnt2 == 0: + # out_sentences = out_sentences + ' ' + wrds[i] + # else: + # cnt2 -= 1 + # cnt = 1 + # if cnt == 1 and cnt2 == 0: + # out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] + # elif cnt == 1 and cnt2 == 1: + # out_sentences = out_sentences + ' ' + wrds[i + 2] + + # NOTE: کد جایگزین + + # cnt = 0 + # for i in range(0, L - 2): + # if cnt > 0: + # cnt -= 1 + # continue + # w = wrds[i] + wrds[i + 1] + wrds[i + 2] + # try: + # out_sentences = out_sentences + ' ' + self.dic3[w] + # cnt = 2 + # except KeyError: + # out_sentences = out_sentences + ' ' + wrds[i] + # if cnt == 2: + # out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] + # elif cnt == 1: + # out_sentences = out_sentences + ' ' + wrds[i + 2] + + # return out_sentences + + # NOTE: کد جایگزین دیگر + + i = 0 + while i < L - 2: w = wrds[i] + wrds[i + 1] + wrds[i + 2] - try: - out_sentences = out_sentences + ' ' + self.dic3[w] - cnt = 0 - cnt2 = 2 - except KeyError: - if cnt == 1 and cnt2 == 0: - out_sentences = out_sentences + ' ' + wrds[i] - else: - cnt2 -= 1 - cnt = 1 - if cnt == 1 and cnt2 == 0: - out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] - elif cnt == 1 and cnt2 == 1: - out_sentences = out_sentences + ' ' + wrds[i + 2] + if w in self.dic3: + out_sentences += ' ' + self.dic3[w] + i += 3 + else: + out_sentences += ' ' + wrds[i] + i += 1 + while i < L: + out_sentences += ' ' + wrds[i] + i += 1 return out_sentences def normalize(self, doc_string, new_line_elimination=False, return_dates = False):