从PDF中提取数据并导出到excel

几个月，我做脚本自动化下一个过程。

使文件夹内的.pdf文件列表。
从每个pdf文件中提取数据
将提取的数据保存在Excel表格中

当处理多达15个PDF文件，但是如果我尝试更多不工作的脚本工作知情。我认为在3号进程中崩溃，但我不能确定。

我写检查点（打印find的文件数，提取的打印数据等），但为了能够保存不间断的空间数据，我需要把这个代码：

import sys reload(sys) sys.setdefaultencoding('Cp1252')

当我把这一行，我没有看到任何东西在Python shell，所以我不知道什么时候脚本崩溃。

我想也许可能是关于记忆的东西，但我需要你的帮助。

我apreciate如果你可以检查我的代码，并给我的build议

谢谢，

我所有的脚本：

 import sys reload(sys) sys.setdefaultencoding('Cp1252') import os from glob import glob from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.converter import TextConverter from pdfminer.layout import LAParams from pdfminer.pdfpage import PDFPage from cStringIO import StringIO import re import xlsxwriter import time def find_ext(dr, ext): return glob(path.join(dr,"*.{}".format(ext))) files = [f for f in os.listdir('.') if os.path.isfile(f)] files = filter(lambda f: f.endswith(('.pdf','.PDF')), files) def convert_pdf_to_txt(path): rsrcmgr = PDFResourceManager() retstr = StringIO() codec = 'utf-8' laparams = LAParams() device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) fp = file(path, 'rb') interpreter = PDFPageInterpreter(rsrcmgr, device) password = "" maxpages = 0 caching = True pagenos=set() fstr = '' for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True): interpreter.process_page(page) str = retstr.getvalue() fstr += str fp.close() device.close() retstr.close() return fstr fecha_de_hoy =(time.strftime("%d/%m/%Y")) fecha_de_hoy = re.sub("/", "-", fecha_de_hoy) # Create a workbook and add a worksheet. workbook = xlsxwriter.Workbook('Expenses.xlsx') worksheet = workbook.add_worksheet() # Start from the first cell. Rows and columns are zero indexed. row = 0 col = 0 # Iterate over the data and write it out row by row. worksheet.write(row, col, "FECHA") worksheet.write(row, col + 1, "CLIENTE") worksheet.write(row, col + 2, "PROVEEDOR" ) worksheet.write(row, col + 3, "REF. CLIENTE" ) worksheet.write(row, col + 4, "REMITENTE") worksheet.write(row, col + 5, "DESTINATARIO") worksheet.write(row, col + 6, "DIRECCION DEST.") worksheet.write(row, col + 7, "CODIGO POSTAL DEST.") worksheet.write(row, col + 8, "POBLACION DEST.") worksheet.write(row, col + 9, "PROVINCIA DEST.") worksheet.write(row, col + 10, "Nº BULTOS") worksheet.write(row, col + 11, "PESO") worksheet.write(row, col + 12, "COSTE") worksheet.write(row, col + 13, "PVP") worksheet.write(row, col + 14, "E-mail CONFIRMACIÓN") row+=1 e = len(files) lengthlist = e w=0 print e while w < lengthlist: print w print files[w] factura = files[w] string = convert_pdf_to_txt(factura) txtList = convert_pdf_to_txt(factura).splitlines() destinatarioIdx, direcionNumIdx, codigoNumIdx, poblacionIdx, provinciaIdx, pedidoIdx, bultosIdx = -1, -1, -1, -1, -1, -1, -1 for idx, line in enumerate(txtList): if line == "Destino MercancÃa": destinatarioIdx = idx +1 direcionNumIdx = idx +2 codigoNumIdx = idx +3 poblacionIdx = idx +3 provinciaIdx = idx +4 if line == "NÂº de Pedido": pedidoIdx = idx +1 if "Bultos" in line: bultosIdx = idx + 2 nombre_destinatario = txtList[destinatarioIdx] if destinatarioIdx != -1 else '' nombre_destinatario = re.sub("Ã‰", "É", nombre_destinatario) direccion_destinatario = txtList[direcionNumIdx] if direcionNumIdx != -1 else '' codigo_destinatario = txtList[codigoNumIdx] if codigoNumIdx != -1 else '' codigo_destinatario = re.sub("\D", "", codigo_destinatario) poblacion_destinatario = txtList[poblacionIdx] if poblacionIdx != -1 else '' poblacion_destinatario = re.sub("[0-9]", "", poblacion_destinatario) poblacion_destinatario = re.sub(r"\s+", "", poblacion_destinatario, flags=re.UNICODE) provincia_destinatario = txtList[provinciaIdx] if provinciaIdx != -1 else '' pedido_destinatario = txtList[pedidoIdx] if pedidoIdx != -1 else '' bultos_destinatario = txtList[bultosIdx] if bultosIdx != -1 else '' bultos_destinatario = re.sub(r"\s+", "", bultos_destinatario, flags=re.UNICODE) #ARREGLAR EXCEPCIONES '''for idx, line in enumerate(txtList): if line == "Destino MercancÃa": destinatarioIdx = idx +1 direcionNumIdx = idx +2 codigoNumIdx = idx +3 if codigoNumIdx < 1000: direcion1 = idx +2 direccion2 = idx +3 direcionNumIdx = (direcion1, direccion2) codigoNumIdx = idx +4 poblacionIdx = idx +4 provinciaIdx = idx +5''' print "Nombre Destinatario" print nombre_destinatario print "Direccion destinatario" print direccion_destinatario print "codigo destinatario" print codigo_destinatario print "poblacion destinatario" print poblacion_destinatario print "Provincia destinatario" print provincia_destinatario print "Nº pedido destinatario" print pedido_destinatario print "Nº bultos envío" print bultos_destinatario # Iterate over the data and write it out row by row. worksheet.write(row, col, fecha_de_hoy) worksheet.write(row, col + 1, "SIDAC") worksheet.write(row, col + 2, "PROVEEDOR" ) worksheet.write(row, col + 3, pedido_destinatario ) worksheet.write(row, col + 4, "SIDAC") worksheet.write(row, col + 5, nombre_destinatario) worksheet.write(row, col + 6, direccion_destinatario) worksheet.write(row, col + 7, codigo_destinatario) worksheet.write(row, col + 8, poblacion_destinatario) worksheet.write(row, col + 9, provincia_destinatario) worksheet.write(row, col + 10, bultos_destinatario) worksheet.write(row, col + 11, "PESO") worksheet.write(row, col + 12, "COSTE") worksheet.write(row, col + 13, "PVP") worksheet.write(row, col + 14, "trafico@buendialogistica.com") w+=1 row+=1 workbook.close()

从PDF中提取数据并导出到excel

将随机数分配给Excel中的名称列

怎样才能有效地做到PANDAS中Excel的MATCH函数（小于）？

连接function：拆分和合并单元格

在Excel 2010中更新公式中的任何单元格

OptionButton编号循环

如何在Excel工作表中为整个列创build和分配自定义格式的date和时间

修剪bash中的string中的新行字符

重复嵌套的IF语句

脚本在特定的时间在特定的工作簿中运行vba代码

Excel VBA – 列出所有的VBA环境variables