"""Replace colour-highlighted NER ``<span>`` elements in stored HTML records.

Reads the records from ``main_qa_data/data/htmlner.json`` under the current
working directory, keeps the slice ``[300:500]`` and, for every record,
replaces each ``<span>`` whose ``style`` names one of the known NER background
colours with the replacement text configured for that colour.
"""
import os
import re

from bs4 import BeautifulSoup

from funcs import read_from_json, save_to_file_by_address

base_address = os.getcwd()
html_address = base_address + "/main_qa_data/data/htmlner.json"
data = read_from_json(html_address)
html_file = ''

# Ordered (style fragment, replacement) pairs, checked first-match-wins in the
# same order as the original if/elif chain.  Every replacement is currently
# the empty string, so a matched span is removed from the record's HTML.
_NER_REPLACEMENTS = (
    ("background-color: #8ee", ""),  # ORG
    ("background-color: #ee6", ""),  # DATE
    ("background-color: #e8e", ""),  # NUM
    ("background-color: #ee8", ""),  # LOC
    ("background-color: #cc8", ""),  # PERCENT
)

# Compiled once; selects spans whose style attribute mentions a background colour.
_HIGHLIGHT_STYLE = re.compile(r'background-color')

data = data[300:500]
for i, line in enumerate(data):
    # NOTE(review): `id` shadows the builtin; the name is kept because code
    # following this section may still refer to it.
    id = line["id"]
    content = line["html"]
    start_part = ""
    end_part = ""
    # NOTE: str.lstrip/str.rstrip treat their argument as a *set of characters*,
    # not as a prefix/suffix.  With the empty strings above both calls are no-ops.
    ners = content.lstrip(start_part)
    ners = ners.rstrip(end_part)
    if ners == "":
        continue
    html_ner = start_part + ners + end_part
    parsed_html = BeautifulSoup(html_ner, 'html.parser')
    span_tags = parsed_html.find_all('span', style=_HIGHLIGHT_STYLE)
    for span in span_tags:
        # The record is expected to use single-quoted attributes, while
        # BeautifulSoup serialises them with double quotes.
        raw_html = str(span).replace('"', "'")
        # Inspect the span's own style attribute instead of splitting the
        # serialised tag on its text: splitting raises ValueError for a span
        # with empty text and picks the wrong fragment when the text also
        # occurs inside the opening tag.
        style = span.get('style', '')
        for marker, new_html in _NER_REPLACEMENTS:
            if marker in style:
                ners = ners.replace(raw_html, new_html)
                break
    # NOTE(review): the disabled statement below continues on the following
    # lines; it is left exactly as found.
    #ners = ners.replace("", "

" +str(id)+"