from docx import Document
import re


def extract_key_phrases(file_path):
    """Collect every phrase wrapped in !...! markers across the document."""
    doc = Document(file_path)
    key_phrases = []
    pattern = re.compile(r'!(.*?)!')
    for paragraph in doc.paragraphs:
        key_phrases.extend(pattern.findall(paragraph.text))
    return key_phrases


def extract_text_from_docx(file_path):
    """Join all paragraph text into a single space-separated string."""
    doc = Document(file_path)
    return ' '.join(paragraph.text for paragraph in doc.paragraphs).strip()


def tokenize(text):
    # Runs of word characters become one token; each punctuation mark becomes
    # its own token. (The original pattern r'\S+|[^\s\w]' never reached its
    # second alternative, so punctuation stayed glued to words.)
    return re.findall(r'\w+|[^\s\w]', text)


def tokenize_and_tag(text, key_phrases):
    # Longest phrases first, so a shorter phrase embedded in a longer one
    # cannot pre-empt the longer match.
    key_phrases_sorted = sorted(key_phrases, key=len, reverse=True)
    # Strip the !...! markers, padding with spaces so each phrase remains a
    # cleanly separated token run.
    for key_phrase in key_phrases_sorted:
        text = text.replace(f'!{key_phrase}!', f' {key_phrase} ')
    tokens = tokenize(text)
    # Tokenize the phrases with the same rules as the text; with a plain
    # whitespace split, a phrase containing punctuation could never match.
    key_token_lists = [tokenize(kp) for kp in key_phrases_sorted]
    tagged_tokens = []
    i = 0
    while i < len(tokens):
        matched = False
        for key_tokens in key_token_lists:
            # The emptiness check guards against a stray '!!' in the source,
            # which would otherwise match everywhere and loop forever.
            if key_tokens and tokens[i:i + len(key_tokens)] == key_tokens:
                # BIO scheme: first token of a phrase is B-key, the rest I-key.
                tagged_tokens.append((key_tokens[0], 'B-key'))
                tagged_tokens.extend((kt, 'I-key') for kt in key_tokens[1:])
                i += len(key_tokens)
                matched = True
                break
        if not matched:
            tagged_tokens.append((tokens[i], 'O'))
            i += 1
    return tagged_tokens


def write_tokens_to_file(tagged_tokens, output_file):
    """Write one tab-separated token/tag pair per line (CoNLL-style)."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for token, tag in tagged_tokens:
            f.write(f"{token}\t{tag}\n")


file_path = 'D:/project/output.docx'
output_file = 'output.txt'

text = extract_text_from_docx(file_path)
key_phrases = extract_key_phrases(file_path)
tagged_tokens = tokenize_and_tag(text, key_phrases)
write_tokens_to_file(tagged_tokens, output_file)
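

# A minimal sanity check, as a sketch: the sentence and phrase list below are
# hypothetical, not taken from any real .docx file. It feeds marker-annotated
# text through tokenize_and_tag and prints the pairs that
# write_tokens_to_file would emit.
sample_text = 'The !machine learning! model beats the !baseline system!.'
sample_phrases = ['machine learning', 'baseline system']
for token, tag in tokenize_and_tag(sample_text, sample_phrases):
    print(f'{token}\t{tag}')
# Expected output:
#   The        O
#   machine    B-key
#   learning   I-key
#   model      O
#   beats      O
#   the        O
#   baseline   B-key
#   system     I-key
#   .          O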