12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
- import io
- from pdfminer.converter import TextConverter
- from pdfminer.pdfinterp import PDFPageInterpreter
- from pdfminer.pdfinterp import PDFResourceManager
- from pdfminer.pdfpage import PDFPage
- import re
- from get_result import get_ner_result, get_re_result
- import requests
- # 将pdf文件中的文本提取到一个字符串中
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF into a single string.

    Args:
        pdf_path: Either a local filesystem path, or a string starting with
            ``http``, in which case the document is downloaded with
            ``requests`` and parsed from memory.

    Returns:
        The text produced by pdfminer's ``TextConverter`` for every page,
        after removing each newline that is immediately followed by another
        newline or by the end of the text.

    Raises:
        requests.HTTPError: If the download answers with an error status.
        OSError: If the local file cannot be opened.
    """
    if pdf_path.startswith('http'):
        # Remote document: fetch the raw bytes and wrap them in a seekable
        # in-memory stream so pdfminer can treat it like a file.
        # NOTE(review): no timeout is passed, so a stalled server blocks
        # this call indefinitely - consider adding one.
        response = requests.get(pdf_path)
        # Fail loudly on 4xx/5xx instead of handing an error page to the
        # PDF parser.
        response.raise_for_status()
        fh = io.BytesIO(response.content)
    else:
        fh = open(pdf_path, 'rb')

    # The context managers guarantee the input stream and the output buffer
    # are closed even when pdfminer raises part-way through the document.
    with fh, io.StringIO() as retstr:
        rsrcmgr = PDFResourceManager()
        # laparams=None: no layout analysis parameters are supplied.
        device = TextConverter(rsrcmgr, retstr, laparams=None)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed by the ``with`` block.
            text = retstr.getvalue()
        finally:
            device.close()

    # With re.MULTILINE, ``$`` matches at the end of the string and right
    # before every newline, so this drops a trailing newline and collapses
    # runs of consecutive newlines down to one; single newlines between
    # non-empty lines are kept.
    return re.sub(r'\n$', '', text, flags=re.MULTILINE)
- # 对文本进行分句
def split_sentences(text):
    """Split text into sentences, keeping each terminator with its sentence.

    Args:
        text: The string to split.

    Returns:
        A list of strings. Every piece that precedes a terminator is returned
        with that terminator appended; text after the last terminator is
        returned as a final element only when it is non-empty, so a text that
        ends with a terminator (or an empty text) produces no empty trailing
        sentence.
    """
    # Adjust this pattern to the punctuation of the corpus being processed.
    # NOTE(review): the character class also contains a literal backslash
    # (``\\``) and repeats ``?`` and ``!`` - confirm the backslash is intended.
    sentence_pattern = re.compile(r'([。;!?\\?!])')

    # Because the pattern has one capturing group, split() returns an
    # odd-length list: body, terminator, body, terminator, ..., tail.
    pieces = sentence_pattern.split(text)

    # Pair every body with the terminator that follows it; zip() stops before
    # the unpaired tail.
    merged_sentences = [
        body + terminator
        for body, terminator in zip(pieces[::2], pieces[1::2])
    ]

    # The tail is empty when the text ends with a terminator; skip it then so
    # callers never receive a blank sentence.
    tail = pieces[-1]
    if tail:
        merged_sentences.append(tail)
    return merged_sentences
|