From 7b9537dd55f4cf299d9f15fe01c8cf6845146f4d Mon Sep 17 00:00:00 2001 From: m_ghazizadeh Date: Mon, 30 Jun 2025 18:24:43 +0330 Subject: [PATCH 1/3] Remove redundant [] in two regex patterns where they aren't needed --- tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tokenizer.py b/tokenizer.py index 59694cd..946a7ad 100644 --- a/tokenizer.py +++ b/tokenizer.py @@ -16,7 +16,7 @@ class Tokenizer(): nums_list = re.findall(pattern, doc_string) doc_string = re.sub(pattern, 'floatingpointnumber', doc_string) - pattern = r'([!\.\?؟]+)[\n]*' + pattern = r'([!\.\?؟]+)\n*' tmp = re.findall(pattern, doc_string) doc_string = re.sub(pattern, self.add_tab, doc_string) @@ -32,7 +32,7 @@ class Tokenizer(): tmp = re.findall(pattern, doc_string) doc_string = re.sub(pattern, self.add_tab, doc_string) - pattern = r'[\n]+' + pattern = r'\n+' doc_string = re.sub(pattern, self.add_tab, doc_string) for number in nums_list: From d6f3f95c37d6c6ba92ecf555d6dec81145c24f22 Mon Sep 17 00:00:00 2001 From: m_ghazizadeh Date: Tue, 1 Jul 2025 14:58:17 +0330 Subject: [PATCH 2/3] Prevent space_correction_plus2() from repeating one word in two combinations --- normalizer.py | 54 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/normalizer.py b/normalizer.py index 64890b3..fbd7d89 100644 --- a/normalizer.py +++ b/normalizer.py @@ -250,18 +250,54 @@ class Normalizer(): L = wrds.__len__() if L < 2: return doc_string - cnt = 1 - for i in range(0, L - 1): - w = wrds[i] + wrds[i + 1] + + # NOTE: مشکل کد زیر : + # NOTE: یک کلمه می تواند هم با کلمه قبل و هم با کلمه بعد ترکیب موفق درست کند و این باعث تکرار یک تک کلمه در دو ترکیب می شود. 
+ + # cnt = 1 + # for i in range(0, L - 1): + # w = wrds[i] + wrds[i + 1] + # try: + # out_sentences = out_sentences + ' ' + self.dic2[w] + # cnt = 0 + # except KeyError: + # if cnt == 1: + # out_sentences = out_sentences + ' ' + wrds[i] + # cnt = 1 + # if cnt == 1: + # out_sentences = out_sentences + ' ' + wrds[i + 1] + + # NOTE: کد جایگزین + + # cnt = 1 + # for i in range(L-1): + # if cnt == 0: + # cnt = 1 + # continue + # w = wrds[i] + wrds[i+1] + # try: + # out_sentences = out_sentences + ' ' + self.dic2[w] + # cnt = 0 + # except KeyError: + # if cnt == 1: + # out_sentences = out_sentences + ' ' + wrds[i] + # cnt = 1 + # if cnt == 1: + # out_sentences = out_sentences + ' ' + wrds[L-1] + + # NOTE: کد جایگزین دیگر + + i = 0 + while i > L -1: + w = wrds[i] + wrds[i+1] try: out_sentences = out_sentences + ' ' + self.dic2[w] - cnt = 0 + i = i + 2 except KeyError: - if cnt == 1: - out_sentences = out_sentences + ' ' + wrds[i] - cnt = 1 - if cnt == 1: - out_sentences = out_sentences + ' ' + wrds[i + 1] + out_sentences = out_sentences + ' ' + wrds[i] + i = i + 1 + if i == L - 1: + out_sentences = out_sentences + ' ' + wrds[L-1] return out_sentences def space_correction_plus3(self, doc_string): From 93142f70e92b22e7820490d69abadd66604b5692 Mon Sep 17 00:00:00 2001 From: m_ghazizadeh Date: Tue, 1 Jul 2025 14:59:56 +0330 Subject: [PATCH 3/3] Prevent space_correction_plus3() from repeating one word in three combinations --- normalizer.py | 73 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/normalizer.py b/normalizer.py index fbd7d89..0e3e1e5 100644 --- a/normalizer.py +++ b/normalizer.py @@ -307,24 +307,63 @@ class Normalizer(): L = wrds.__len__() if L < 3: return doc_string - cnt = 1 - cnt2 = 0 - for i in range(0, L - 2): + + # NOTE: مشکل کد زیر : + # NOTE: این کد نیز مشکل ترکیبات هم پوشان را دارد به طوری که یک کلمه می تواند در سه ترکیب (یک بار انتها و یک بار وسط و یک بار ابتدای ترکیب) تکرار شود. 
+ + # cnt = 1 + # cnt2 = 0 + # for i in range(0, L - 2): + # w = wrds[i] + wrds[i + 1] + wrds[i + 2] + # try: + # out_sentences = out_sentences + ' ' + self.dic3[w] + # cnt = 0 + # cnt2 = 2 + # except KeyError: + # if cnt == 1 and cnt2 == 0: + # out_sentences = out_sentences + ' ' + wrds[i] + # else: + # cnt2 -= 1 + # cnt = 1 + # if cnt == 1 and cnt2 == 0: + # out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] + # elif cnt == 1 and cnt2 == 1: + # out_sentences = out_sentences + ' ' + wrds[i + 2] + + # NOTE: کد جایگزین + + # cnt = 0 + # for i in range(0, L - 2): + # if cnt > 0: + # cnt -= 1 + # continue + # w = wrds[i] + wrds[i + 1] + wrds[i + 2] + # try: + # out_sentences = out_sentences + ' ' + self.dic3[w] + # cnt = 2 + # except KeyError: + # out_sentences = out_sentences + ' ' + wrds[i] + # if cnt == 2: + # out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] + # elif cnt == 1: + # out_sentences = out_sentences + ' ' + wrds[i + 2] + + # return out_sentences + + # NOTE: کد جایگزین دیگر + + i = 0 + while i < L - 2: w = wrds[i] + wrds[i + 1] + wrds[i + 2] - try: - out_sentences = out_sentences + ' ' + self.dic3[w] - cnt = 0 - cnt2 = 2 - except KeyError: - if cnt == 1 and cnt2 == 0: - out_sentences = out_sentences + ' ' + wrds[i] - else: - cnt2 -= 1 - cnt = 1 - if cnt == 1 and cnt2 == 0: - out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] - elif cnt == 1 and cnt2 == 1: - out_sentences = out_sentences + ' ' + wrds[i + 2] + if w in self.dic3: + out_sentences += ' ' + self.dic3[w] + i += 3 + else: + out_sentences += ' ' + wrds[i] + i += 1 + while i < L: + out_sentences += ' ' + wrds[i] + i += 1 return out_sentences def normalize(self, doc_string, new_line_elimination=False, return_dates = False):