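"""Extract the key phrases wrapped in !...! markers from a .docx file and
write BIO-style token labels (B-key, I-key, O) to a tab-separated text file."""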
import re

from docx import Document


def extract_key_phrases(file_path):
    """Collect every phrase wrapped in !...! markers in the document."""
    doc = Document(file_path)
    key_phrases = []
    # Non-greedy match of the text between a pair of ! markers.
    pattern = re.compile(r'!(.*?)!')

    for paragraph in doc.paragraphs:
        matches = pattern.findall(paragraph.text)
        key_phrases.extend(matches)

    return key_phrases


def extract_text_from_docx(file_path):
    """Return the full document text as one space-joined string."""
    doc = Document(file_path)
    text = ''

    for paragraph in doc.paragraphs:
        text += paragraph.text + ' '

    return text.strip()


def tokenize_and_tag(text, key_phrases):
    """Tokenize text and label each token with a BIO tag (B-key, I-key, O)."""
    # Longer phrases first, so that in the matching loop below a shorter
    # phrase cannot shadow a longer one starting at the same position.
    key_phrases_sorted = sorted(key_phrases, key=len, reverse=True)
    for key_phrase in key_phrases_sorted:
        # Strip the !...! markers but keep the phrase itself in the text.
        text = text.replace(f'!{key_phrase}!', f' {key_phrase} ')

    # Whitespace tokenization: '\S+' is tried first, so the '[^\s\w]' branch
    # never fires and punctuation stays attached to the preceding word.
    tokens = re.findall(r'\S+|[^\s\w]', text)
    tagged_tokens = []
    i = 0

    while i < len(tokens):
        token = tokens[i]
        matched = False

        for key_phrase in key_phrases_sorted:
            key_tokens = key_phrase.split()
            if tokens[i:i + len(key_tokens)] == key_tokens:
                # First token of a phrase is B-key, the remaining ones I-key.
                tagged_tokens.append((key_tokens[0], 'B-key'))
                for key_token in key_tokens[1:]:
                    tagged_tokens.append((key_token, 'I-key'))
                i += len(key_tokens)
                matched = True
                break

        if not matched:
            tagged_tokens.append((token, 'O'))
            i += 1

    return tagged_tokens


def write_tokens_to_file(tagged_tokens, output_file):
    """Write one 'token<TAB>tag' pair per line to output_file."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for token, tag in tagged_tokens:
            f.write(f"{token}\t{tag}\n")


if __name__ == '__main__':
    file_path = 'D:/project/output.docx'
    output_file = 'output.txt'

    text = extract_text_from_docx(file_path)
    key_phrases = extract_key_phrases(file_path)
    tagged_tokens = tokenize_and_tag(text, key_phrases)
    write_tokens_to_file(tagged_tokens, output_file)
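
# Illustrative trace (hypothetical input, not taken from the project files):
# if a paragraph in the source .docx reads
#     We evaluate !machine learning! models.
# then extract_key_phrases() returns ['machine learning'] and output.txt
# contains the tab-separated lines:
#     We          O
#     evaluate    O
#     machine     B-key
#     learning    I-key
#     models.     O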