# Flair_NER/funcs.py
#
# 130 lines
# 3.8 KiB
# Python

import re
import os
import json
from pandas import read_excel
def remove_signs():
    """Return the default dataset text with all parentheses stripped.

    The text is obtained from ``read_file()``; every ``(`` and ``)``
    character is deleted and the rest of the text is returned unchanged.
    """
    dataset_text = read_file()
    parenthesis_re = re.compile(r"\(|\)")
    return parenthesis_re.sub('', dataset_text)
def read_file():
    """Read ``./data/DATASET_2.txt`` as UTF-8 and return its contents.

    Returns:
        The file's text, or ``''`` when the file could be opened but its
        contents could not be read or decoded.

    Raises:
        OSError: If the file cannot be opened (for example, it does not
            exist). Opening happens outside the best-effort ``try``.
    """
    with open('./data/DATASET_2.txt', 'r', encoding='utf-8') as file:
        try:
            return file.read()
        except (OSError, UnicodeDecodeError):
            # Best effort: an unreadable or undecodable file yields an empty
            # string. Only I/O and decoding failures are swallowed, so
            # KeyboardInterrupt and programming errors still propagate.
            return ''
def read_file_by_address(file_address):
    """Read the UTF-8 text file at ``file_address`` and return its contents.

    Args:
        file_address: Path of the file to read.

    Returns:
        The file's text, or ``''`` when the file could be opened but its
        contents could not be read or decoded.

    Raises:
        OSError: If the file cannot be opened (for example, it does not
            exist). Opening happens outside the best-effort ``try``.
    """
    with open(file_address, 'r', encoding='utf-8') as file:
        try:
            return file.read()
        except (OSError, UnicodeDecodeError):
            # Best effort: an unreadable or undecodable file yields an empty
            # string. Only I/O and decoding failures are swallowed, so
            # KeyboardInterrupt and programming errors still propagate.
            return ''
def save_to_file(result):
    """Append ``result`` to ``./data/DATASET_3.txt`` using UTF-8.

    The file is created if it does not exist; existing content is kept and
    ``result`` is written after it.

    Args:
        result: Text to append.

    Raises:
        OSError: If the file cannot be opened for appending (for example,
            the ``./data`` directory is missing).
    """
    # The ``with`` block closes the file, and append mode always writes at the
    # end, so no prior read or explicit close() is needed.
    with open('./data/DATASET_3.txt', 'a', encoding='utf-8') as file:
        file.write(result)
def save_to_file_by_address(file_address, text):
    """Append ``text`` to the file at ``file_address`` using UTF-8.

    The file is created if it does not exist; existing content is kept and
    ``text`` is written after it.

    Args:
        file_address: Path of the file to append to.
        text: Text to append.

    Raises:
        OSError: If the file cannot be opened for appending.
    """
    # The ``with`` block closes the file, and append mode always writes at the
    # end, so no prior read or explicit close() is needed.
    with open(file_address, 'a', encoding='utf-8') as file:
        file.write(text)
def read_from_excel(file_address, column_name):
    """Load an Excel file and return the data stored under ``column_name``.

    Args:
        file_address: Location of the Excel file, passed to
            ``pandas.read_excel`` with its default options.
        column_name: Key used to index the loaded data.

    Returns:
        Whatever indexing the loaded data with ``column_name`` yields
        (presumably a single column when ``column_name`` is one header).
    """
    return read_excel(file_address)[column_name]
def add_columndata_to_excel(file_address, column_name, columndata):
    """Set a column in an Excel file and write the result back in place.

    The file is loaded with ``pandas.read_excel``, ``columndata`` is assigned
    under ``column_name``, and the data is saved to the same ``file_address``
    without the index column.

    Args:
        file_address: Path of the Excel file to read and then overwrite.
        column_name: Name of the column to assign.
        columndata: Values to store in that column.
    """
    sheet = read_excel(file_address)
    sheet[column_name] = columndata
    sheet.to_excel(file_address, index=False)
def write_to_json(dict, file_address):
    """Serialize ``dict`` to JSON text and append it to ``file_address``.

    The object is rendered with a two-space indent and non-ASCII characters
    are written as-is (not escaped). The file is opened in append mode, so
    calling this repeatedly on one path leaves several JSON documents back
    to back in the same file.

    Args:
        dict: JSON-serializable object to write (the name shadows the
            builtin; it is kept because it is part of the signature).
        file_address: Path of the UTF-8 file to append to.
    """
    # Serialize before opening so a failed dump never touches the file.
    serialized = json.dumps(dict, indent=2, ensure_ascii=False)
    with open(file_address, 'a+', encoding='utf-8') as out:
        out.write(serialized)
def read_from_json(file_address):
    """Load a JSON file and return its top-level items as a list.

    Args:
        file_address: Path of the UTF-8 JSON file to read.

    Returns:
        A new list built by iterating the parsed JSON value: the elements
        of a JSON array, or the keys of a JSON object.
    """
    with open(file_address, 'r', encoding='utf-8') as source:
        parsed = json.load(source)
    return list(parsed)
def separated_date_format_finder(date_ner):
    """Tell whether ``date_ner`` contains a numeric, separator-style date.

    Each ``.``, ``،`` and ``ر`` character is first turned into ``/``. The
    text is then searched for, between word boundaries, a number 1-12, a
    number 1-31 and a year (128x, 13xx, 1400-1479, or any two digits), each
    optionally separated by ``/``.

    Args:
        date_ner: Text to inspect.

    Returns:
        True if at least one such date is found, otherwise False.
    """
    normalized = date_ner
    for separator in ('.', '،', 'ر'):
        normalized = normalized.replace(separator, '/')
    date_pattern = r'\b(?:(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9])|(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9]|[0-9]{2}))\b'
    # A single hit is enough; search() stops at the first match.
    return re.search(date_pattern, normalized) is not None
# dataset = remove_signs()
# save_to_file(dataset)
if __name__ == "__main__":
    # Manual scratch area: every step below is commented out, so running this
    # module as a script currently does nothing.
    # NOTE(review): file_address names a .json file but is passed to
    # read_from_excel below; confirm the intended input path before
    # re-enabling any of this.
    # file_address = "./Flair_NER/data/test_sections.json"
    # column_name = "content"
    # data = read_from_excel(file_address, column_name)
    # sections_text = []
    # sections = []
    # dest_address = "./Flair_NER/data/test_sections.json"
    # for item in data:
    # sections.append({"content" : item, "real_keywords":"", "ai_keywords":""})
    # write_to_json(sections, dest_address)
    # for section in sections:
    # write_to_json(section, dest_address)
    # data_rows = read_from_json(file_address)
    # print(len(data_rows))
    # for data in data_rows:
    # h = data
    pass