123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187 |
- # import fitz # PyMuPDF
- # import re
- # import os
- #
- #
- # def read_pdf_text(pdf_path):
- # # 打开 PDF 文件
- # document = fitz.open(pdf_path)
- #
- # # 初始化一个空字符串来存储所有页面的文本
- # pdf_text = ''
- #
- # # 遍历每一页
- # for page_num in range(len(document)):
- # page = document.load_page(page_num) # 加载页面
- # text = page.get_text("text") # 提取页面上的文本
- # pdf_text += text + '\n' # 将文本添加到总文本中,每页之间用换行分隔
- #
- # return pdf_text
- #
- #
- # def clean_text(text):
- # # 1. 移除段内不必要的换行
- # # 将段落内的换行符替换为空格
- # text = re.sub(r'(\S)\n(\S)', r'\1 \2', text)
- #
- # # 2. 保留段落之间的换行
- # # 合并连续的空行
- # cleaned_text = re.sub(r'\n\s*\n+', '\n\n', text)
- #
- # # 3. 去除每行开头和结尾的空白字符
- # cleaned_text = re.sub(r'^\s+|\s+$', '', cleaned_text, flags=re.MULTILINE)
- #
- # return cleaned_text
- #
- #
- # def remove_unwanted_content(text):
- # # 1. 去除目录
- # text = re.sub(r'^目录.*?\n', '', text, flags=re.DOTALL | re.MULTILINE)
- #
- # # 2. 去除页眉和页脚
- # # 假设页眉和页脚通常是每页的第一行和最后一行
- # lines = text.split('\n')
- # new_lines = []
- # for i in range(0, len(lines), 2): # 每两行处理一次
- # if i + 1 < len(lines):
- # # 去除页眉和页脚
- # new_lines.append(lines[i + 1])
- # text = '\n'.join(new_lines)
- #
- # # 3. 去除注释
- # text = re.sub(r'$.*?$', '', text) # 去除括号内的注释
- # text = re.sub(r'\*.*?\*', '', text) # 去除星号内的注释
- #
- # # 4. 去除空行和多余的空白
- # text = re.sub(r'\n\s*\n+', '\n\n', text)
- #
- # return text
- #
- #
- # def format_special_terms(text):
- # # 识别图、表、Figure、Table等关键词,并在它们之前插入换行符
- # text = re.sub(r'(?<!\n)(图|表|Figure|Table)\s+', r'\n\g<1> ', text)
- # return text
- #
- #
- # def save_text_as_utf8(text, output_path):
- # # 将文本以 UTF-8 格式保存到文件
- # with open(output_path, 'w', encoding='utf-8') as file:
- # file.write(text)
- #
- #
- # def main(pdf_path):
- # # 提取 PDF 文件名
- # pdf_filename = os.path.basename(pdf_path)
- # pdf_name, _ = os.path.splitext(pdf_filename)
- #
- # # 创建新文件夹
- # output_dir = 'pdf_output'
- # if not os.path.exists(output_dir):
- # os.makedirs(output_dir)
- #
- # # 构建输出文件路径
- # output_path = os.path.join(output_dir, f"{pdf_name}.txt")
- #
- # # 提取文本
- # pdf_text = read_pdf_text(pdf_path)
- #
- # # 清理文本
- # cleaned_text = clean_text(pdf_text)
- #
- # # 去除不需要的信息
- # cleaned_text = remove_unwanted_content(cleaned_text)
- #
- # # 格式化特殊术语
- # final_text = format_special_terms(cleaned_text)
- #
- # # 保存清理后的文本
- # save_text_as_utf8(final_text, output_path)
- #
- # print(f"Text extracted, cleaned, and saved to {output_path} in UTF-8 format.")
- #
- #
- # if __name__ == "__main__":
- # pdf_path = r'C:\Users\Machenike\Desktop\火工大\hgd240914\1图像目标智能识别软件设计方案.pdf' # 请确保这里的路径正确指向您的 PDF 文件
- # main(pdf_path)
- # import pdfplumber
- #
- # def read_chinese_pdf_text(pdf_path):
- # text = ""
- # with pdfplumber.open(pdf_path) as pdf:
- # for page in pdf.pages:
- # text += page.extract_text()
- # return text
- # if __name__ == '__main__':
- # path = r'C:\Users\Machenike\Desktop\火工大\hgd240914\1图像目标智能识别软件设计方案.pdf'
- # data = read_chinese_pdf_text(path)
- # print(data)
import os
import re
import sys

import pdfplumber
# A buffered line ending with one of these is treated as complete and is not
# merged with the line that follows. Both ASCII and full-width '!'/'?' are
# listed because the input is Chinese text.
# NOTE(review): ',' is kept from the original list; confirm that a trailing
# comma is really meant to stop merging.
_SENTENCE_END = ('。', '!', '?', '!', '?', '.', ',')

# Matches a table/figure caption at the start of a line, e.g. "表 1" or "图2".
_CAPTION_RE = re.compile(r'(表|图)\s*\d+')


def _merge_wrapped_lines(page_text):
    """Re-join the hard-wrapped lines of one page into logical lines.

    Rules, applied line by line:
      * a blank line flushes the buffered line and is kept as '';
      * a table/figure caption flushes the buffer and is emitted on its own,
        followed by '';
      * otherwise the line is appended (without a separator) to the buffer,
        unless the buffer already ends with sentence punctuation or the new
        line starts with whitespace, in which case a new buffer is started.

    Args:
        page_text: Text of a single page, lines separated by '\n'.

    Returns:
        List of merged lines (strings) for that page.
    """
    merged = []
    pending = ""
    for raw_line in page_text.split('\n'):
        # Indentation has to be checked on the raw line: once stripped, the
        # line can never start with whitespace.
        indented = raw_line[:1].isspace()
        line = raw_line.strip()

        if not line:
            if pending:
                merged.append(pending)
                pending = ""
            merged.append('')
            continue

        if _CAPTION_RE.match(line):
            if pending:
                merged.append(pending)
                pending = ""
            merged.append(line)
            merged.append('')
            continue

        if pending.endswith(_SENTENCE_END) or indented:
            # Previous sentence is finished, or this line opens a new
            # paragraph: flush and start a fresh buffer.
            if pending:
                merged.append(pending)
            pending = line
        else:
            # Continuation of a wrapped line; joined without a space.
            pending += line

    if pending:
        merged.append(pending)
    return merged


def read_chinese_pdf_text(pdf_path):
    """Extract the text of a PDF and save it as a UTF-8 .txt file.

    For every page, the top 10% and bottom 10% of the page height are
    cropped away before text extraction, and the extracted lines are
    re-joined by ``_merge_wrapped_lines``. The result is written to
    ``<current working directory>/output/<pdf base name>.txt``; the
    ``output`` directory is created if it does not exist.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Path of the text file that was written.
    """
    output_dir = os.path.join(os.getcwd(), 'output')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_file = os.path.join(output_dir, f"{base_name}.txt")

    page_chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Build the crop box from the page's own bounding box so that
            # pages whose origin is not (0, 0) are still cropped inside
            # their bounds.
            x0, top, x1, _bottom = page.bbox
            height = page.height
            crop_area = (x0, top + height * 0.1, x1, top + height * 0.9)
            page_text = page.crop(crop_area).extract_text() or ''
            page_chunks.append('\n'.join(_merge_wrapped_lines(page_text)) + '\n')

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(''.join(page_chunks))
    return output_file
if __name__ == '__main__':
    # Script entry point: convert one PDF and report where the text went.
    # An optional first command-line argument overrides the built-in path,
    # so the script can be reused without editing the source.
    default_path = r'C:\Users\Machenike\Desktop\火工大\hgd240914\1图像目标智能识别软件设计方案.pdf'
    path = sys.argv[1] if len(sys.argv) > 1 else default_path
    result_file = read_chinese_pdf_text(path)
    print(f"Text has been extracted and saved to {result_file}")
|