ai_dataset/import_data/label_tokens.py

"""Extract key phrases marked as !...! from a .docx file and write all tokens
as BIO tags (B-key / I-key / O), one tab-separated token/tag pair per line."""

import re

from docx import Document

def extract_key_phrases(file_path):
    """Collect every phrase wrapped in !...! markers across all paragraphs."""
    doc = Document(file_path)
    key_phrases = []
    pattern = re.compile(r'!(.*?)!')
    for paragraph in doc.paragraphs:
        matches = pattern.findall(paragraph.text)
        key_phrases.extend(matches)
    return key_phrases

def extract_text_from_docx(file_path):
    """Concatenate the text of all paragraphs into a single space-separated string."""
    doc = Document(file_path)
    text = ''
    for paragraph in doc.paragraphs:
        text += paragraph.text + ' '
    return text.strip()

def tokenize_and_tag(text, key_phrases):
    """Tokenize the text and tag key-phrase tokens with B-key/I-key, all others with O."""
    # Longest phrases first, so multi-word phrases are matched before shorter ones.
    key_phrases_sorted = sorted(key_phrases, key=len, reverse=True)
    # Strip the !...! markers so key phrases appear as plain tokens in the text.
    for key_phrase in key_phrases_sorted:
        text = text.replace(f'!{key_phrase}!', f' {key_phrase} ')
    # Split on whitespace; the \S+ branch matches first, so punctuation stays attached.
    tokens = re.findall(r'\S+|[^\s\w]', text)
    tagged_tokens = []
    i = 0
    while i < len(tokens):
        token = tokens[i]
        matched = False
        for key_phrase in key_phrases_sorted:
            key_tokens = key_phrase.split()
            # Skip empty phrases (e.g. from "!!"); they would otherwise raise an IndexError.
            if not key_tokens:
                continue
            if tokens[i:i + len(key_tokens)] == key_tokens:
                # First token of the phrase gets B-key, the remaining tokens I-key.
                tagged_tokens.append((key_tokens[0], 'B-key'))
                for key_token in key_tokens[1:]:
                    tagged_tokens.append((key_token, 'I-key'))
                i += len(key_tokens)
                matched = True
                break
        if not matched:
            tagged_tokens.append((token, 'O'))
            i += 1
    return tagged_tokens

def write_tokens_to_file(tagged_tokens, output_file):
    """Write one tab-separated "token<TAB>tag" pair per line."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for token, tag in tagged_tokens:
            f.write(f"{token}\t{tag}\n")

if __name__ == '__main__':
    file_path = 'D:/project/output.docx'
    output_file = 'output.txt'
    text = extract_text_from_docx(file_path)
    key_phrases = extract_key_phrases(file_path)
    tagged_tokens = tokenize_and_tag(text, key_phrases)
    write_tokens_to_file(tagged_tokens, output_file)
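
# Illustration only (hypothetical input, not from the project data): a paragraph
# reading "We apply !named entity recognition! to clinical notes." would be
# written to output.txt as tab-separated token/tag pairs like:
#
#   We            O
#   apply         O
#   named         B-key
#   entity        I-key
#   recognition   I-key
#   to            O
#   clinical      O
#   notes.        O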