用到的包

pdfminer3k

代码

import os
import re

from pdfminer.pdfinterp import PDFResourceManager,process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open

def readPDF(pdffile):
    """Extract the text of a PDF and return it as one string.

    Args:
        pdffile: File object handed straight to ``process_pdf``. The caller
            opens it and stays responsible for closing it.

    Returns:
        Everything the ``TextConverter`` wrote into its in-memory buffer
        while ``process_pdf`` ran.

    Raises:
        Whatever pdfminer raises while parsing; the converter and the buffer
        are released before the exception propagates.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdffile)
        finally:
            # Close the converter even when parsing fails part-way through.
            device.close()

        # Read the buffer only after the device is closed, as the original
        # code did, and before the buffer itself is closed.
        return retstr.getvalue()
    finally:
        retstr.close()

def handleData(filename):
    """Append the anchor/pin lines of ``txt/<filename>`` to ``result/<filename>``.

    A line is kept when it contains any of the substrings
    "Load bearing anchor", "Torsion anchor" or "Pins" (case-sensitive).
    Kept lines are written unchanged and in their original order.

    Args:
        filename: Name of the text file inside ``txt/``; the same name is
            used for the output file inside ``result/``.

    Returns:
        None. The result file is opened in append mode, so repeated calls
        add to existing content. No file is created when nothing matches.

    Raises:
        OSError: If the input file cannot be read or the result file cannot
            be written (for example, a missing directory).
    """
    keywords = ("Load bearing anchor", "Torsion anchor", "Pins")

    # The context manager closes the input even if decoding fails mid-file.
    with open('txt/' + filename, 'r', encoding='utf-8') as src:
        matches = [
            line for line in src
            if any(keyword in line for keyword in keywords)
        ]

    # Open the result file once instead of once per matching line, and only
    # when there is something to write, so no empty result file appears.
    if matches:
        with open('result/' + filename, 'a', encoding='utf8') as dst:
            dst.writelines(matches)

# Batch driver: for every entry in the 'pdf' folder, extract its text into
# 'txt/<name>.txt', then filter that text into 'result/<name>.txt'.
path = 'pdf'
pdfList = os.listdir(path)

# Make sure the output folders exist so the first write does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs('txt', exist_ok=True)
os.makedirs('result', exist_ok=True)

for li in pdfList:
    # Release the PDF handle as soon as its text has been extracted.
    with open(os.path.join(path, li), "rb") as pdffile:
        content = readPDF(pdffile)

    # Replace only the real extension. The previous regex '.pdf' had an
    # unescaped, unanchored dot, so a name such as 'mypdf.pdf' was rewritten
    # to 'm.txt.txt'.
    txt_name = os.path.splitext(li)[0] + '.txt'
    file1 = 'txt/' + txt_name
    with open(file1, 'w', encoding='utf8') as f:
        f.write(content)

    handleData(txt_name)