130 lines
3.8 KiB
Python
130 lines
3.8 KiB
Python
import json
import logging
import os
import re

from pandas import read_excel
|
|
def remove_signs():
    """Return the default dataset text with round brackets removed.

    The text is obtained from ``read_file()``; every ``(`` and ``)``
    character is deleted. Nothing is written back to disk.

    Returns:
        The bracket-free text.
    """
    # The local is deliberately not called ``str`` so the builtin type
    # stays usable inside this function.
    text = read_file()
    return re.sub(r"\(|\)", '', text)
|
|
|
|
def read_file():
    """Return the contents of ``./data/DATASET_2.txt`` decoded as UTF-8.

    Returns:
        The file's text, or ``''`` when the file could be opened but
        reading or decoding it failed.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
    """
    path = './data/DATASET_2.txt'
    with open(path, 'r', encoding='utf-8') as file:
        try:
            return file.read()
        except (OSError, UnicodeDecodeError) as error:
            # Best-effort read: keep the empty-string fallback, but report
            # the failure instead of discarding it silently.
            logging.getLogger(__name__).warning(
                "could not read %s: %s", path, error)
            return ''
|
|
|
|
def read_file_by_address(file_address):
    """Return the contents of the file at *file_address* decoded as UTF-8.

    Args:
        file_address: Path of the text file to read.

    Returns:
        The file's text, or ``''`` when the file could be opened but
        reading or decoding it failed.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
    """
    with open(file_address, 'r', encoding='utf-8') as file:
        try:
            return file.read()
        except (OSError, UnicodeDecodeError) as error:
            # Best-effort read: keep the empty-string fallback, but report
            # the failure instead of discarding it silently.
            logging.getLogger(__name__).warning(
                "could not read %s: %s", file_address, error)
            return ''
|
|
|
|
def save_to_file(result):
    """Append *result* to ``./data/DATASET_3.txt`` using UTF-8.

    The file is created when it does not exist; existing content is kept
    and *result* is written after it.

    Args:
        result: Text to append.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Append mode already positions writes at the end of the file, so no
    # prior read is needed, and the ``with`` block closes the handle.
    with open('./data/DATASET_3.txt', 'a', encoding='utf-8') as file:
        file.write(result)
|
|
|
|
def save_to_file_by_address(file_address, text):
    """Append *text* to the file at *file_address* using UTF-8.

    The file is created when it does not exist; existing content is kept
    and *text* is written after it.

    Args:
        file_address: Path of the file to append to.
        text: Text to append.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Append mode already positions writes at the end of the file, so no
    # prior read is needed, and the ``with`` block closes the handle.
    with open(file_address, 'a', encoding='utf-8') as file:
        file.write(text)
|
|
|
|
|
|
def read_from_excel(file_address, column_name):
    """Load an Excel file and return one of its columns.

    Args:
        file_address: Path of the Excel file, passed to pandas'
            ``read_excel`` with default options.
        column_name: Label of the column to extract.

    Returns:
        The result of indexing the loaded frame with *column_name*.
    """
    # Read the workbook, then pick out the requested column.
    return read_excel(file_address)[column_name]
|
|
|
|
def add_columndata_to_excel(file_address, column_name, columndata):
    """Add (or overwrite) a column in an Excel file and save it in place.

    The file is loaded with pandas' ``read_excel``, *columndata* is
    assigned to the column labelled *column_name*, and the resulting frame
    is written back to the same path without the index column.

    Args:
        file_address: Path of the Excel file to read and then rewrite.
        column_name: Label of the column to set.
        columndata: Values assigned to that column.
    """
    # Load the existing sheet.
    frame = read_excel(file_address)

    # Attach the new column to the loaded data.
    frame[column_name] = columndata

    # Write the updated data back over the original file.
    frame.to_excel(file_address, index=False)
|
|
|
|
def write_to_json(dict, file_address):
    """Serialise *dict* as JSON and append it to the file at *file_address*.

    The object is rendered with two-space indentation and non-ASCII
    characters are written as-is (not escaped).

    Note:
        The file is opened in append mode. Calling this more than once for
        the same path leaves several JSON documents back to back in one
        file, which ``json.load`` cannot parse as a single document.

    Args:
        dict: JSON-serialisable object to write (the name shadows the
            builtin but is kept for keyword-argument compatibility).
        file_address: Path of the output file; created if missing.
    """
    # Render first, so a serialisation error never touches the file.
    serialized = json.dumps(dict, ensure_ascii=False, indent=2)

    # Append the rendered text to the target file.
    with open(file_address, 'a+', encoding='utf-8') as handle:
        handle.write(serialized)
|
|
|
|
def read_from_json(file_address):
    """Load a JSON file and return its top-level items as a new list.

    Args:
        file_address: Path of a UTF-8 encoded JSON file.

    Returns:
        A list built by iterating the parsed document: the elements of a
        JSON array, or the keys of a JSON object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
        TypeError: If the parsed document is not iterable (e.g. a number).
    """
    with open(file_address, 'r', encoding='utf-8') as file:
        loaded_data = json.load(file)

    # list() copies the iterable directly; no manual append loop is needed.
    return list(loaded_data)
|
|
|
|
|
|
# Characters treated as alternative date separators; each is mapped to "/".
_DATE_SEPARATOR_TABLE = str.maketrans({
    '.': '/',
    '،': '/',  # Arabic comma
    'ر': '/',  # Arabic letter reh
})

# Month (1-12), day (1-31) and year (1280-1289, 1300-1399, 1400-1479, or any
# two digits), in that order, delimited by word boundaries. The "/" between
# the parts is optional. The first top-level alternative is a subset of the
# second; the pattern is kept exactly as it was.
_SEPARATED_DATE_RE = re.compile(
    r'\b(?:(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9])|(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9]|[0-9]{2}))\b'
)


def separated_date_format_finder(date_ner):
    """Report whether *date_ner* contains a month/day/year style date.

    Before matching, every ``.``, Arabic comma and Arabic letter reh in
    the text is replaced by ``/``. The text is then searched for the
    numeric date pattern described above.

    Note:
        Because the separators are optional in the pattern, a bare run of
        digits such as ``"1234"`` is also reported as a date.

    Args:
        date_ner: Text to inspect.

    Returns:
        ``True`` if at least one match is found, otherwise ``False``.
    """
    normalized = date_ner.translate(_DATE_SEPARATOR_TABLE)
    # search() stops at the first hit, which is all the caller needs.
    return _SEPARATED_DATE_RE.search(normalized) is not None
|
|
|
|
# dataset = remove_signs()
|
|
# save_to_file(dataset)
|
|
|
|
if __name__ == "__main__":
    # Running this module as a script currently does nothing.
    #
    # Scratch code that used to sit here (all of it commented out) read the
    # "content" column of a file with read_from_excel(), wrapped each value
    # as {"content": ..., "real_keywords": "", "ai_keywords": ""}, stored
    # the resulting list with write_to_json() under
    # "./Flair_NER/data/test_sections.json", and re-loaded it with
    # read_from_json() to print the number of rows.
    pass