# word_txt.py
# Extract the text of a Word document, clean it up, and save it as UTF-8.

import os
import re
import sys

import textract
  4. def read_word_text(docx_path):
  5. # 使用 textract 提取 Word 文档中的文本
  6. text = textract.process(docx_path).decode('utf-8')
  7. return text
  8. def clean_text(text):
  9. # 1. 移除段内不必要的换行
  10. # 将段落内的换行符替换为空格
  11. text = re.sub(r'(\S)\n(\S)', r'\1 \2', text)
  12. # 2. 保留段落之间的换行
  13. # 合并连续的空行
  14. cleaned_text = re.sub(r'\n\s*\n+', '\n\n', text)
  15. # 3. 去除每行开头和结尾的空白字符
  16. cleaned_text = re.sub(r'^\s+|\s+$', '', cleaned_text, flags=re.MULTILINE)
  17. return cleaned_text
  18. def remove_unwanted_content(text):
  19. # 1. 去除目录
  20. text = re.sub(r'^目录.*?\n', '', text, flags=re.DOTALL | re.MULTILINE)
  21. # 2. 去除页眉和页脚
  22. # 假设页眉和页脚通常是每页的第一行和最后一行
  23. lines = text.split('\n')
  24. new_lines = []
  25. for i in range(0, len(lines), 2): # 每两行处理一次
  26. if i + 1 < len(lines):
  27. # 去除页眉和页脚
  28. new_lines.append(lines[i + 1])
  29. text = '\n'.join(new_lines)
  30. # 3. 去除注释
  31. text = re.sub(r'$.*?$', '', text) # 去除括号内的注释
  32. text = re.sub(r'\*.*?\*', '', text) # 去除星号内的注释
  33. # 4. 去除空行和多余的空白
  34. text = re.sub(r'\n\s*\n+', '\n\n', text)
  35. return text
  36. def format_special_terms(text):
  37. # 识别图、表、Figure、Table等关键词,并在它们之前插入换行符
  38. text = re.sub(r'(?<!\n)(图|表|Figure|Table)\s+', r'\n\g<1> ', text)
  39. return text
  40. def save_text_as_utf8(text, output_path):
  41. # 将文本以 UTF-8 格式保存到文件
  42. with open(output_path, 'w', encoding='utf-8') as file:
  43. file.write(text)
  44. def main(docx_path):
  45. # 提取 Word 文件名
  46. docx_filename = os.path.basename(docx_path)
  47. docx_name, _ = os.path.splitext(docx_filename)
  48. # 创建新文件夹
  49. output_dir = 'word_output'
  50. if not os.path.exists(output_dir):
  51. os.makedirs(output_dir)
  52. # 构建输出文件路径
  53. output_path = os.path.join(output_dir, f"{docx_name}.txt")
  54. # 提取文本
  55. word_text = read_word_text(docx_path)
  56. # 清理文本
  57. cleaned_text = clean_text(word_text)
  58. # 去除不需要的信息
  59. cleaned_text = remove_unwanted_content(cleaned_text)
  60. # 格式化特殊术语
  61. final_text = format_special_terms(cleaned_text)
  62. # 保存清理后的文本
  63. save_text_as_utf8(final_text, output_path)
  64. print(f"Text extracted, cleaned, and saved to {output_path} in UTF-8 format.")
  65. if __name__ == "__main__":
  66. docx_path = r'C:\Users\Machenike\Desktop\火工大\hgd240914\1图像目标智能识别软件设计方案.docx' # 请确保这里的路径正确指向您的 Word 文件
  67. main(docx_path)