从 PDF 中提取数据并导出到 Excel

几个月前，我写了一个脚本来自动化以下流程。

  1. 列出文件夹内的 .pdf 文件。
  2. 从每个pdf文件中提取数据
  3. 将提取的数据保存在Excel表格中

处理不超过 15 个 PDF 文件时，脚本可以正常工作；但如果文件更多，脚本就无法正常运行。我认为它是在第 3 步崩溃的，但我不能确定。

我加了一些检查点（打印找到的文件数、打印提取的数据等），但为了能够保存含有不间断空格（non-breaking space）的数据，我需要加入这段代码：

import sys reload(sys) sys.setdefaultencoding('Cp1252') 

加入这几行之后，我在 Python shell 中就看不到任何输出了，所以我不知道脚本是在什么时候崩溃的。

我猜可能是内存方面的问题，但我需要你们的帮助。

如果你们能检查我的代码并给我一些建议，我将不胜感激。

谢谢,

我的完整脚本：

 # Extract delivery data from every PDF in the current directory into Expenses.xlsx.
import sys

# Python 2 only: switch the implicit str/unicode codec to Cp1252.
# reload(sys) also resets sys.stdout/sys.stderr to the interpreter's original
# streams, which makes print output disappear in shells that replace them
# (e.g. IDLE).  Keep the current streams and put them back afterwards so the
# progress messages below stay visible.
_saved_stdout, _saved_stderr = sys.stdout, sys.stderr
reload(sys)
sys.setdefaultencoding('Cp1252')
sys.stdout, sys.stderr = _saved_stdout, _saved_stderr

import os
from glob import glob
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re
import xlsxwriter
import time


# Line that precedes the consignee block in the extracted text.
# NOTE(review): this literal looks like a mis-decoded "Destino Mercancía"; it
# must match the bytes pdfminer emits, so confirm it against a real PDF (and
# the encoding this file is saved in) before changing it.
DESTINO_MARKER = "Destino MercancÃa"

# Column titles, in worksheet order.
HEADER_ROW = [
    "FECHA",
    "CLIENTE",
    "PROVEEDOR",
    "REF. CLIENTE",
    "REMITENTE",
    "DESTINATARIO",
    "DIRECCION DEST.",
    "CODIGO POSTAL DEST.",
    "POBLACION DEST.",
    "PROVINCIA DEST.",
    "Nº BULTOS",
    "PESO",
    "COSTE",
    "PVP",
    "E-mail CONFIRMACIÓN",
]


def find_ext(dr, ext):
    """Return the paths inside directory *dr* whose names end in ``.<ext>``."""
    # The original called the undefined name ``path``; only ``os`` is imported.
    return glob(os.path.join(dr, "*.{}".format(ext)))


# PDF files (either extension case) located in the current working directory.
files = [f for f in os.listdir('.')
         if os.path.isfile(f) and f.endswith(('.pdf', '.PDF'))]


def convert_pdf_to_txt(path):
    """Return the text of every page of the PDF at *path* as a UTF-8 string.

    The text is read from the converter's buffer once, after all pages have
    been processed, so each page's text appears exactly once in the result.
    The file, the converter device and the buffer are closed even when
    pdfminer raises while processing a page.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    fp = open(path, 'rb')
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # maxpages=0 and an empty page-number set are passed through
        # unchanged from the original call.
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        fp.close()
        device.close()
        retstr.close()


def _line_at(lines, idx):
    """Return ``lines[idx]``, or '' when *idx* is the -1 sentinel or past the end."""
    if 0 <= idx < len(lines):
        return lines[idx]
    return ''


def _extract_fields(txtList):
    """Pull the consignee fields out of the extracted text lines.

    Returns a tuple ``(nombre, direccion, codigo, poblacion, provincia,
    pedido, bultos)``.  A field whose marker line is missing, or whose
    expected position lies beyond the end of the text, is returned as ''.
    When a marker occurs more than once, the last occurrence wins.
    """
    destinatarioIdx = direcionNumIdx = codigoNumIdx = -1
    poblacionIdx = provinciaIdx = pedidoIdx = bultosIdx = -1

    for idx, line in enumerate(txtList):
        if line == DESTINO_MARKER:
            destinatarioIdx = idx + 1
            direcionNumIdx = idx + 2
            # Postal code and town are read from the same line: the digits
            # are kept for the code and removed for the town.
            codigoNumIdx = idx + 3
            poblacionIdx = idx + 3
            provinciaIdx = idx + 4
        if line == "Nº de Pedido":
            pedidoIdx = idx + 1
        if "Bultos" in line:
            bultosIdx = idx + 2

    # TODO: handle notes whose address spans two lines; the original carried
    # an unfinished, disabled sketch that appears to shift the postal code,
    # town and province one line further down in that case.

    nombre = _line_at(txtList, destinatarioIdx)
    # NOTE(review): pattern and replacement are identical as written, so this
    # substitution changes nothing; it was presumably meant to repair a
    # mis-decoded "É" -- confirm the intended pattern.
    nombre = re.sub("É", "É", nombre)

    direccion = _line_at(txtList, direcionNumIdx)

    codigo = re.sub(r"\D", "", _line_at(txtList, codigoNumIdx))

    poblacion = re.sub("[0-9]", "", _line_at(txtList, poblacionIdx))
    # Removes every whitespace run, including spaces inside the town name.
    poblacion = re.sub(r"\s+", "", poblacion, flags=re.UNICODE)

    provincia = _line_at(txtList, provinciaIdx)
    pedido = _line_at(txtList, pedidoIdx)
    bultos = re.sub(r"\s+", "", _line_at(txtList, bultosIdx), flags=re.UNICODE)

    return nombre, direccion, codigo, poblacion, provincia, pedido, bultos


# Today's date as DD-MM-YYYY.
fecha_de_hoy = time.strftime("%d-%m-%Y")

# Create a workbook and add a worksheet.
workbook = xlsxwriter.Workbook('Expenses.xlsx')
worksheet = workbook.add_worksheet()

# Start from the first cell. Rows and columns are zero indexed.
row = 0
col = 0

try:
    # Header row.
    for offset, title in enumerate(HEADER_ROW):
        worksheet.write(row, col + offset, title)
    row += 1

    print(len(files))

    # One worksheet row per PDF.
    for w, factura in enumerate(files):
        print(w)
        print(factura)

        # Convert each PDF only once; the original converted it twice per
        # iteration and discarded the first result.
        txtList = convert_pdf_to_txt(factura).splitlines()

        (nombre_destinatario, direccion_destinatario, codigo_destinatario,
         poblacion_destinatario, provincia_destinatario, pedido_destinatario,
         bultos_destinatario) = _extract_fields(txtList)

        print("Nombre Destinatario")
        print(nombre_destinatario)
        print("Direccion destinatario")
        print(direccion_destinatario)
        print("codigo destinatario")
        print(codigo_destinatario)
        print("poblacion destinatario")
        print(poblacion_destinatario)
        print("Provincia destinatario")
        print(provincia_destinatario)
        print("Nº pedido destinatario")
        print(pedido_destinatario)
        print("Nº bultos envío")
        print(bultos_destinatario)

        valores = [
            fecha_de_hoy,
            "SIDAC",
            "PROVEEDOR",
            pedido_destinatario,
            "SIDAC",
            nombre_destinatario,
            direccion_destinatario,
            codigo_destinatario,
            poblacion_destinatario,
            provincia_destinatario,
            bultos_destinatario,
            "PESO",
            "COSTE",
            "PVP",
            "trafico@buendialogistica.com",
        ]
        for offset, valor in enumerate(valores):
            worksheet.write(row, col + offset, valor)
        row += 1
finally:
    # Always close the workbook so rows written before a failure are saved;
    # any exception raised above still propagates after the close.
    workbook.close()