# ai_dataset/main_qa_data/deldar_ner.py
#
# 99 lines
# 3.4 KiB
# Python

import os
import re
from bs4 import BeautifulSoup
from funcs import read_from_json, save_to_file_by_address
# Input location is resolved against the current working directory, so the
# script has to be launched from the directory that contains main_qa_data/.
base_address = os.getcwd()
html_address = f"{base_address}/main_qa_data/data/htmlner.json"

# Load the records and keep only positions 300..499 for this run.
# NOTE(review): assumes read_from_json returns a sliceable sequence of dicts
# carrying "id" and "html" keys (that is how the loop below uses them) — confirm.
data = read_from_json(html_address)[300:500]

# Accumulator for the per-record HTML fragments of the preview page.
html_file = ''
# Wrapper markup expected around every stored "html" value. It is removed
# before the fragment is rewritten and put back only for parsing.
start_part = "<html><body style='direction: rtl;'>"
end_part = "</body></html>"

# Highlight colour -> entity label, listed in the order the colours are tested.
NER_LABELS = {
    "#8ee": "ORG",
    "#ee6": "DATE",
    "#e8e": "NUM",
    "#ee8": "LOC",
    "#cc8": "PERCENT",
}

# For each record: strip the page wrapper, replace every colour-highlighted
# <span> with a tooltip-enabled span naming its entity type, and append the
# result to the html_file accumulator.
for i, line in enumerate(data):
    doc_id = line["id"]
    content = line["html"]

    # Remove the wrapper as an exact prefix/suffix. str.lstrip()/str.rstrip()
    # take a *set of characters*, so they would also eat the leading "<s" of a
    # fragment starting with "<span ..." or the trailing ">" of "</span>".
    ners = content
    if ners.startswith(start_part):
        ners = ners[len(start_part):]
    if ners.endswith(end_part):
        ners = ners[:-len(end_part)]
    if ners == "":
        # Nothing annotated for this record; it is left out of the page.
        continue

    html_ner = start_part + ners + end_part
    parsed_html = BeautifulSoup(html_ner, 'html.parser')
    span_tags = parsed_html.find_all('span', style=re.compile(r'background-color'))

    for span in span_tags:
        text = span.text
        # The serialised tag is normalised to single-quoted attributes so it
        # can be located inside `ners`.
        # NOTE(review): assumes the stored fragments use single quotes — confirm.
        raw_html = str(span).replace('"', "'")

        # Read the colour from the style attribute itself. Splitting the
        # serialised tag on its text fails for empty text and mis-detects the
        # colour whenever the text also occurs inside the tag (e.g. "8" vs "#e8e").
        style_attr = span.get("style") or ""
        for colour, label in NER_LABELS.items():
            if f"background-color: {colour}" in style_attr:
                new_html = (
                    f"<span style='background-color: {colour}' class='tooltip'>"
                    f"{text}<span class='tooltiptext'>{label}</span></span>"
                )
                ners = ners.replace(raw_html, new_html)
                break

    html_file += f"<h5>{doc_id}</h5><div>{ners}</div><hr>"
    print(f"{i}/{len(data)}")
# Stylesheet for the hover tooltips: elements with class "tooltip" reveal
# their nested "tooltiptext" child (the entity label) on hover.
style = """<style>
.tooltip {
position: relative;
display: inline-block;
border-bottom: 1px dotted black;
}
.tooltip .tooltiptext {
visibility: hidden;
width: 120px;
background-color: black;
color: #fff;
text-align: center;
border-radius: 6px;
padding: 5px 0;
position: absolute;
z-index: 1;
bottom: 100%;
left: 50%;
margin-left: -60px;
/* Fade in tooltip - takes 1 second to go from 0% to 100% opac: */
opacity: 0;
transition: opacity 1s;
}
.tooltip:hover .tooltiptext {
visibility: visible;
opacity: 1;
}
</style>"""

# Wrap the accumulated per-record fragments in a right-to-left page.
html_s = "<html>"
body_s = "<body style='direction: rtl;'>"
# The closing markup is defined here so page assembly does not depend on a
# name assigned elsewhere in the script; an empty record slice then still
# yields a valid (empty) page instead of a NameError.
closing_part = "</body></html>"
html_file = html_s + style + body_s + html_file + closing_part

# Write the preview next to the input data, relative to the working directory.
result_address = base_address + "/main_qa_data/data/htmlner.htm"
save_to_file_by_address(result_address, html_file)