pdf_txt.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. # import fitz # PyMuPDF
  2. # import re
  3. # import os
  4. #
  5. #
  6. # def read_pdf_text(pdf_path):
  7. # # 打开 PDF 文件
  8. # document = fitz.open(pdf_path)
  9. #
  10. # # 初始化一个空字符串来存储所有页面的文本
  11. # pdf_text = ''
  12. #
  13. # # 遍历每一页
  14. # for page_num in range(len(document)):
  15. # page = document.load_page(page_num) # 加载页面
  16. # text = page.get_text("text") # 提取页面上的文本
  17. # pdf_text += text + '\n' # 将文本添加到总文本中,每页之间用换行分隔
  18. #
  19. # return pdf_text
  20. #
  21. #
  22. # def clean_text(text):
  23. # # 1. 移除段内不必要的换行
  24. # # 将段落内的换行符替换为空格
  25. # text = re.sub(r'(\S)\n(\S)', r'\1 \2', text)
  26. #
  27. # # 2. 保留段落之间的换行
  28. # # 合并连续的空行
  29. # cleaned_text = re.sub(r'\n\s*\n+', '\n\n', text)
  30. #
  31. # # 3. 去除每行开头和结尾的空白字符
  32. # cleaned_text = re.sub(r'^\s+|\s+$', '', cleaned_text, flags=re.MULTILINE)
  33. #
  34. # return cleaned_text
  35. #
  36. #
  37. # def remove_unwanted_content(text):
  38. # # 1. 去除目录
  39. # text = re.sub(r'^目录.*?\n', '', text, flags=re.DOTALL | re.MULTILINE)
  40. #
  41. # # 2. 去除页眉和页脚
  42. # # 假设页眉和页脚通常是每页的第一行和最后一行
  43. # lines = text.split('\n')
  44. # new_lines = []
  45. # for i in range(0, len(lines), 2): # 每两行处理一次
  46. # if i + 1 < len(lines):
  47. # # 去除页眉和页脚
  48. # new_lines.append(lines[i + 1])
  49. # text = '\n'.join(new_lines)
  50. #
  51. # # 3. 去除注释
  52. # text = re.sub(r'$.*?$', '', text) # 去除括号内的注释
  53. # text = re.sub(r'\*.*?\*', '', text) # 去除星号内的注释
  54. #
  55. # # 4. 去除空行和多余的空白
  56. # text = re.sub(r'\n\s*\n+', '\n\n', text)
  57. #
  58. # return text
  59. #
  60. #
  61. # def format_special_terms(text):
  62. # # 识别图、表、Figure、Table等关键词,并在它们之前插入换行符
  63. # text = re.sub(r'(?<!\n)(图|表|Figure|Table)\s+', r'\n\g<1> ', text)
  64. # return text
  65. #
  66. #
  67. # def save_text_as_utf8(text, output_path):
  68. # # 将文本以 UTF-8 格式保存到文件
  69. # with open(output_path, 'w', encoding='utf-8') as file:
  70. # file.write(text)
  71. #
  72. #
  73. # def main(pdf_path):
  74. # # 提取 PDF 文件名
  75. # pdf_filename = os.path.basename(pdf_path)
  76. # pdf_name, _ = os.path.splitext(pdf_filename)
  77. #
  78. # # 创建新文件夹
  79. # output_dir = 'pdf_output'
  80. # if not os.path.exists(output_dir):
  81. # os.makedirs(output_dir)
  82. #
  83. # # 构建输出文件路径
  84. # output_path = os.path.join(output_dir, f"{pdf_name}.txt")
  85. #
  86. # # 提取文本
  87. # pdf_text = read_pdf_text(pdf_path)
  88. #
  89. # # 清理文本
  90. # cleaned_text = clean_text(pdf_text)
  91. #
  92. # # 去除不需要的信息
  93. # cleaned_text = remove_unwanted_content(cleaned_text)
  94. #
  95. # # 格式化特殊术语
  96. # final_text = format_special_terms(cleaned_text)
  97. #
  98. # # 保存清理后的文本
  99. # save_text_as_utf8(final_text, output_path)
  100. #
  101. # print(f"Text extracted, cleaned, and saved to {output_path} in UTF-8 format.")
  102. #
  103. #
  104. # if __name__ == "__main__":
  105. # pdf_path = r'C:\Users\Machenike\Desktop\火工大\hgd240914\1图像目标智能识别软件设计方案.pdf' # 请确保这里的路径正确指向您的 PDF 文件
  106. # main(pdf_path)
  107. # import pdfplumber
  108. #
  109. # def read_chinese_pdf_text(pdf_path):
  110. # text = ""
  111. # with pdfplumber.open(pdf_path) as pdf:
  112. # for page in pdf.pages:
  113. # text += page.extract_text()
  114. # return text
  115. # if __name__ == '__main__':
  116. # path = r'C:\Users\Machenike\Desktop\火工大\hgd240914\1图像目标智能识别软件设计方案.pdf'
  117. # data = read_chinese_pdf_text(path)
  118. # print(data)
  119. import os
  120. import pdfplumber
  121. import re
  122. def read_chinese_pdf_text(pdf_path):
  123. project_root = os.getcwd()
  124. output_dir = os.path.join(project_root, 'output')
  125. if not os.path.exists(output_dir):
  126. os.makedirs(output_dir)
  127. base_name = os.path.splitext(os.path.basename(pdf_path))[0]
  128. output_file = os.path.join(output_dir, f"{base_name}.txt")
  129. text = ""
  130. with pdfplumber.open(pdf_path) as pdf:
  131. for page in pdf.pages:
  132. height = page.height
  133. crop_area = (0, height * 0.1, page.width, height * 0.9)
  134. cropped_page = page.crop(crop_area)
  135. page_text = cropped_page.extract_text() or ''
  136. lines = page_text.split('\n')
  137. processed_lines = []
  138. current_line = ""
  139. for line in lines:
  140. line = line.strip() # 去除每行首尾的空白字符
  141. if len(line) == 0: # 如果是空行,则直接添加换行
  142. if current_line:
  143. processed_lines.append(current_line)
  144. current_line = ""
  145. processed_lines.append('')
  146. continue
  147. # 检查是否是表格或图片的名称
  148. if re.match(r'(表|图)\s*\d+', line):
  149. if current_line:
  150. processed_lines.append(current_line)
  151. current_line = ""
  152. processed_lines.append(line)
  153. processed_lines.append('')
  154. continue
  155. # 判断是否需要合并行
  156. if (not current_line.endswith(('。', '!', '?', '.', '!', '?', ','))) and not line.startswith(' '):
  157. # 当前行不是句子结尾,且下一行没有缩进,合并这两行
  158. current_line += line
  159. else:
  160. if current_line:
  161. processed_lines.append(current_line)
  162. current_line = line
  163. if current_line: # 添加最后一行
  164. processed_lines.append(current_line)
  165. text += '\n'.join(processed_lines) + '\n'
  166. with open(output_file, 'w', encoding='utf-8') as file:
  167. file.write(text)
  168. return output_file
  169. if __name__ == '__main__':
  170. path = r'C:\Users\Machenike\Desktop\火工大\hgd240914\1图像目标智能识别软件设计方案.pdf'
  171. result_file = read_chinese_pdf_text(path)
  172. print(f"Text has been extracted and saved to {result_file}")