使用Python在文件夹结构中的文档中查找关键字

我想知道是否有人知道如何在文档中查找关键字,然后将这些文字导出到Excel文档中。

例如,我在文件夹A>文件夹B中有一个名为“test”的文档。我想要进入该文件夹,找到名为“test”的文档,打开该文件,查找关键字“test”,然后将“testing”导出到一个Excel文件。

我问的原因是有成千上万的文件夹需要这样处理。我已经看了其他解决方案,它们建议使用pdfminer,但我不知道如何跨多个文件夹/子文件夹运行,并将关键词导出到Excel文档。

这是我迄今为止写的代码。它能够将PDF转换为文本,然后从文本中查找关键字。但我不知道如何将这些数据导出到Excel文件/列。谢谢。

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

try:
    # Python 2: the original script used the C-accelerated StringIO.
    from cStringIO import StringIO
except ImportError:
    # Python 3: cStringIO no longer exists.
    from io import StringIO

import xlsxwriter

# Create a workbook and add a worksheet.
workbook = xlsxwriter.Workbook('Stafford_Capital.xlsx')
worksheet = workbook.add_worksheet()


def convert_pdf_to_txt(path):
    """Extract the text of every page of the PDF at *path*.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The text that the pdfminer ``TextConverter`` wrote for all pages,
        concatenated in page order.

    Raises:
        IOError / OSError: if *path* cannot be opened.
        Any exception raised by pdfminer while parsing is propagated; the
        file handle, converter device and buffer are still closed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = PDFPage.get_pages(
                fp,
                set(),  # empty set of page numbers, as in the original call
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # The buffer accumulates across pages, so it is read exactly once
        # after the loop; reading it per page would repeat earlier pages.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()


def _store_field(record, key, lines, index):
    """Set record[key] = lines[index], skipping indexes past the end."""
    if index < len(lines):
        record[key] = lines[index]


# Convert once and reuse the result for both the debug print and parsing.
string = convert_pdf_to_txt("test.pdf")
print(string)

# Drop empty lines so that the fixed offsets below count non-blank lines only.
lines = list(filter(bool, string.split('\n')))

custData = {}
for i, line in enumerate(lines):
    # Each marker line is followed, at a fixed offset, by the value to keep.
    if 'Lead:' in line:
        _store_field(custData, 'Name', lines, i + 2)
    elif 'Date:Date:Date:Date:' in line:
        _store_field(custData, 'Fund Manager', lines, i + 2)
    elif 'Priority:' in line:
        _store_field(custData, 'Industry', lines, i + 2)
        _store_field(custData, 'Date', lines, i + 1)
        _store_field(custData, 'Deal Size', lines, i + 3)
    elif 'DEAL QUALIFYING MEMORANDUM' in line:
        _store_field(custData, 'Owner', lines, i + 2)
    elif 'Fund Manager' in line:
        _store_field(custData, 'Investment Type', lines, i + 2)
print(custData)

row = 0
col = 0

# Write one column per field: the field name in the first row and its
# value in the row below.
for item, descrip in custData.items():
    worksheet.write(row, col, item)
    worksheet.write(row + 1, col, descrip)
    col += 1

workbook.close()