pdfReader.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. import io
  2. from pdfminer.converter import TextConverter
  3. from pdfminer.pdfinterp import PDFPageInterpreter
  4. from pdfminer.pdfinterp import PDFResourceManager
  5. from pdfminer.pdfpage import PDFPage
  6. import re
  7. from get_result import get_ner_result, get_re_result
  8. import requests
  9. # 将pdf文件中的文本提取到一个字符串中
  10. def extract_text_from_pdf(pdf_path):
  11. if pdf_path.startswith('http'): # 如果是从URL读取PDF文件
  12. # 从URL中读取PDF文件的二进制数据
  13. response = requests.get(pdf_path)
  14. fh = io.BytesIO(response.content)
  15. else:
  16. fh = open(pdf_path, 'rb')
  17. # 创建PDF资源管理器
  18. rsrcmgr = PDFResourceManager()
  19. # 创建一个字符串对象
  20. retstr = io.StringIO()
  21. # 创建一个PDF设备对象
  22. device = TextConverter(rsrcmgr, retstr, laparams=None)
  23. # 创建一个PDF解析器对象
  24. interpreter = PDFPageInterpreter(rsrcmgr, device)
  25. # 处理文档当中的每个页面
  26. for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
  27. interpreter.process_page(page)
  28. # 获取解析器生成的文本并关闭设备
  29. text = retstr.getvalue()
  30. device.close()
  31. retstr.close()
  32. if not fh.closed:
  33. fh.close()
  34. # 将文本字符串中的每行结尾的“\n”替换为空字符串
  35. text = re.sub(r'\n$', '', text, flags=re.MULTILINE)
  36. # 返回提取到的文本字符串
  37. return text
  38. # 对文本进行分句
  39. def split_sentences(text):
  40. # 可以根据实际情况修改分句的正则表达式
  41. sentence_pattern = re.compile(r'([。;!?\\?!])')
  42. sentences = sentence_pattern.split(text)
  43. # 将分句符与句子合并
  44. merged_sentences = []
  45. for i in range(0, len(sentences)-1, 2):
  46. merged_sentences.append(sentences[i] + sentences[i+1])
  47. if len(sentences) % 2 == 1:
  48. merged_sentences.append(sentences[-1])
  49. return merged_sentences