99 lines
3.4 KiB
Python
99 lines
3.4 KiB
Python
"""Build an HTML preview page of NER-highlighted text with hover tooltips.

Reads records from ``main_qa_data/data/htmlner.json`` (relative to the
current working directory), rewrites every colour-highlighted ``<span>`` so
that it carries a tooltip naming its entity type, and writes one combined
page to ``main_qa_data/data/htmlner.htm``.

Each record is accessed as ``line["id"]`` and ``line["html"]``.
NOTE(review): ``read_from_json`` / ``save_to_file_by_address`` come from the
project-local ``funcs`` module; their exact contracts are not visible here.
"""
import os
import re

from bs4 import BeautifulSoup

from funcs import read_from_json, save_to_file_by_address

# Wrapper removed from each record's HTML before processing; `end_part` is
# also reused to close the final page, so both live at module level (they
# used to be assigned inside the loop, which broke the final assembly when
# there were no records to iterate).
start_part = "<html><body style='direction: rtl;'>"
end_part = "</body></html>"

# Highlight colour -> entity label shown in the tooltip. Order matches the
# original if/elif chain, so the first matching colour wins.
NER_LABELS = (
    ("#8ee", "ORG"),
    ("#ee6", "DATE"),
    ("#e8e", "NUM"),
    ("#ee8", "LOC"),
    ("#cc8", "PERCENT"),
)

# Compiled once; selects spans whose style attribute mentions a background colour.
_HIGHLIGHT_STYLE_RE = re.compile(r'background-color')


def _strip_wrapper(content, prefix, suffix):
    """Return *content* without one leading *prefix* and one trailing *suffix*.

    ``str.lstrip`` / ``str.rstrip`` treat their argument as a *set of
    characters*, so they would also eat the start of ``<span ...`` and the
    closing ``>`` of ``</span>``. Explicit prefix/suffix slicing avoids that.
    """
    if prefix and content.startswith(prefix):
        content = content[len(prefix):]
    if suffix and content.endswith(suffix):
        content = content[:-len(suffix)]
    return content


def _label_for_style(style_value):
    """Return ``(color, label)`` for the first known colour in *style_value*, else ``None``."""
    for color, label in NER_LABELS:
        if f"background-color: {color}" in style_value:
            return color, label
    return None


def _add_tooltips(fragment):
    """Return *fragment* with each recognised highlight span turned into a tooltip span.

    The fragment is parsed only to discover the spans; the rewrite itself is
    a textual replace on the original string, so markup outside the
    highlighted spans is left exactly as it was.
    """
    parsed_html = BeautifulSoup(start_part + fragment + end_part, 'html.parser')
    for span in parsed_html.find_all('span', style=_HIGHLIGHT_STYLE_RE):
        # Read the colour from the style attribute itself. Splitting the
        # serialized tag on the span text (the previous approach) fails for
        # empty text and for text that also occurs inside the opening tag
        # (e.g. "8" inside "#e8e").
        match = _label_for_style(span.get('style', ''))
        if match is None:
            continue
        color, label = match
        text = span.text
        # BeautifulSoup serializes attributes with double quotes; the stored
        # markup uses single quotes, so normalise before the textual replace.
        raw_html = str(span).replace('"', "'")
        new_html = (
            f"<span style='background-color: {color}' class='tooltip'>{text}"
            f"<span class='tooltiptext'>{label}</span></span>"
        )
        fragment = fragment.replace(raw_html, new_html)
    return fragment


base_address = os.getcwd()
html_address = os.path.join(base_address, "main_qa_data", "data", "htmlner.json")
data = read_from_json(html_address)

# Only records 300..499 are rendered.
data = data[300:500]

sections = []
for i, line in enumerate(data):
    item_id = line["id"]
    ners = _strip_wrapper(line["html"], start_part, end_part)
    if not ners:
        # Nothing to show for this record (also skips the progress line,
        # as before).
        continue

    ners = _add_tooltips(ners)
    sections.append("<h5>" + str(item_id) + "</h5>" + "<div>" + ners + "</div><hr>")
    print(f"{i}/{len(data)}")

html_file = "".join(sections)

style = """<style>
.tooltip {
position: relative;
display: inline-block;
border-bottom: 1px dotted black;
}

.tooltip .tooltiptext {
visibility: hidden;
width: 120px;
background-color: black;
color: #fff;
text-align: center;
border-radius: 6px;
padding: 5px 0;
position: absolute;
z-index: 1;
bottom: 100%;
left: 50%;
margin-left: -60px;

/* Fade in tooltip - takes 1 second to go from 0% to 100% opac: */
opacity: 0;
transition: opacity 1s;
}

.tooltip:hover .tooltiptext {
visibility: visible;
opacity: 1;
}
</style>"""

html_s = "<html>"
body_s = "<body style='direction: rtl;'>"
html_file = html_s + style + body_s + html_file + end_part

result_address = os.path.join(base_address, "main_qa_data", "data", "htmlner.htm")
save_to_file_by_address(result_address, html_file)