如何在C#中将PDF文件转换为Excel
我想从PDF文件的表格中提取一些数据(例如电子邮件地址),并使用提取到的这些电子邮件地址给这些人发送电子邮件。
我通过网络搜索了解到的情况:
-
我必须将PDF文件转换为Excel来轻松读取数据,并根据需要使用它们。
-
我发现了一些免费的DLL,例如iTextSharp或PDFsharp。
但是我没有找到任何能帮助我在C#中做到这一点的代码。 有没有解决办法?
您绝对不必将PDF转换为Excel。 首先,请确定您的PDF是否包含文本数据,或者是否为扫描图像。 如果它包含文本数据,那么使用“一些免费的DLL”是正确的做法。 我推荐iTextSharp,因为它很受欢迎,而且易于使用。
接下来是有争议的部分。 如果您不需要坚如磐石的解决方案,那么将整个PDF读取为字符串,然后使用正则表达式检索电子邮件将是最简单的方法。
以下是使用iTextSharp阅读PDF并提取电子邮件的示例(不完美):
/// <summary>
/// Reads every page of a PDF with iTextSharp and returns the extracted text as a single string.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>The text of all pages, appended in page order with no separator between pages.</returns>
public string PdfToString(string fileName)
{
    var sb = new StringBuilder();
    var reader = new PdfReader(fileName);
    try
    {
        // iTextSharp page numbers are 1-based.
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            var strategy = new SimpleTextExtractionStrategy();

            // GetTextFromPage already returns a .NET string. The previous
            // Encoding.Default -> UTF-8 -> string round-trip added nothing and replaced
            // every character outside the system ANSI code page with '?', so the text
            // is appended as-is.
            sb.Append(PdfTextExtractor.GetTextFromPage(reader, page, strategy));
        }
    }
    finally
    {
        // Release the underlying file even when extraction of a page throws.
        reader.Close();
    }

    return sb.ToString();
}

// Captures whatever sits between the labels "Email Address " and " Passport No".
// Adjust the expression to the layout of your own document.
Regex emailRegex = new Regex("Email Address (?<email>.+?) Passport No");

/// <summary>
/// Lazily yields the "email" capture group of every <c>emailRegex</c> match in the given text.
/// </summary>
/// <param name="content">Text to search, e.g. the result of <see cref="PdfToString"/>.</param>
/// <returns>One string per match, in the order the matches occur.</returns>
public IEnumerable<string> ExtractEmails(string content)
{
    foreach (Match m in emailRegex.Matches(content))
    {
        yield return m.Groups["email"].Value;
    }
}
使用Bytescout PDF Extractor SDK,我们可以将整个页面提取为CSV,如下所示。
// Exports each page of a PDF to CSV files using the Bytescout CSVExtractor,
// with a TableDetector supplying the page count and page rectangle.
string pdfPath = "C:\\sample.pdf";

CSVExtractor csvExtractor = new CSVExtractor();
csvExtractor.RegistrationName = "demo";
csvExtractor.RegistrationKey = "demo";

TableDetector tableDetector = new TableDetector();
tableDetector.RegistrationKey = "demo";
tableDetector.RegistrationName = "demo";

// Both components need the document loaded.
csvExtractor.LoadDocumentFromFile(pdfPath);
tableDetector.LoadDocumentFromFile(pdfPath);

int totalPages = tableDetector.GetPageCount();

for (int pageNo = 1; pageNo <= totalPages; pageNo++)
{
    int tableNo = 1;

    // Always export at least once per page, then repeat for as long as the
    // detector reports another table.
    while (true)
    {
        // The extraction area is the page rectangle reported by the detector.
        var left = tableDetector.GetPageRect_Left(pageNo);
        var top = tableDetector.GetPageRect_Top(pageNo);
        var width = tableDetector.GetPageRect_Width(pageNo);
        var height = tableDetector.GetPageRect_Height(pageNo);
        csvExtractor.SetExtractionArea(left, top, width, height);

        // One CSV file per (page, table) pair.
        string csvPath = string.Concat("C:\\page-", pageNo, "-table-", tableNo, ".csv");
        csvExtractor.SavePageCSVToFile(pageNo, csvPath);
        tableNo++;

        if (!tableDetector.FindNextTable())
        {
            break;
        }
    }
}
/// <summary>
/// Builds an Excel workbook (<c>&lt;name&gt;_rez.xls</c>) from a PDF that has already been split
/// into one PDF per page (<c>&lt;name&gt;_1.pdf</c>, <c>&lt;name&gt;_2.pdf</c>, ...) next to the original.
/// Each page PDF is exported to CSV with the Bytescout CSVExtractor, and each CSV
/// becomes one worksheet. The intermediate <c>&lt;name&gt;-CSVs</c> folder is deleted on success.
/// </summary>
/// <param name="fileNames">Path of the original (unsplit) PDF file.</param>
public void Convert(string fileNames)
{
    // Work with an absolute path so that the extractor, Excel and System.IO all
    // resolve the derived paths against the same directory.
    string pdfPath = System.IO.Path.GetFullPath(fileNames);
    string directory = System.IO.Path.GetDirectoryName(pdfPath) ?? string.Empty;
    string ext = System.IO.Path.GetExtension(pdfPath);

    // NOTE(review): the case-sensitive ".pdf" removal is kept on purpose so the
    // derived names keep matching the per-page files produced elsewhere; confirm
    // against the splitter before switching to GetFileNameWithoutExtension.
    string baseName = System.IO.Path.GetFileName(pdfPath).Replace(".pdf", "");

    // Only the page count is needed from iTextSharp; close the reader right away
    // so the PDF is not kept open for the rest of the conversion.
    int pageCount;
    iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(pdfPath);
    try
    {
        pageCount = reader.NumberOfPages;
    }
    finally
    {
        reader.Close();
    }

    string outfilePDFExcel1 = System.IO.Path.Combine(directory, baseName + "_rez" + ".xls");

    // The CSV folder lives next to the PDF. It used to be created and read under a
    // hard-coded developer path while the CSVs were written next to the PDF, so the
    // method only worked for files stored in that one folder.
    string pathString = System.IO.Path.Combine(directory, baseName + "-CSVs");
    System.IO.Directory.CreateDirectory(pathString);

    // NOTE(review): the extractor is never explicitly released here; check the SDK
    // documentation for the recommended cleanup call.
    CSVExtractor extractor = new CSVExtractor();
    extractor.RegistrationName = "demo";
    extractor.RegistrationKey = "demo";

    for (int i = 0; i < pageCount; i++)
    {
        string pageNumber = (i + 1).ToString();
        string outfilePDF = System.IO.Path.Combine(directory, baseName + "_" + pageNumber + ext);
        extractor.LoadDocumentFromFile(outfilePDF);

        string outfile = System.IO.Path.Combine(pathString, "Sheet_" + pageNumber + ".csv");
        extractor.SaveCSVToFile(outfile);
    }

    Excel.Application xlApp = new Microsoft.Office.Interop.Excel.Application();
    if (xlApp == null)
    {
        Console.WriteLine("Excel is not properly installed!!");
        return;
    }

    object misValue = System.Reflection.Missing.Value;
    Excel.Workbook xlWorkBook = null;
    bool workbookClosed = false;
    try
    {
        xlWorkBook = xlApp.Workbooks.Add(misValue);

        // Natural ordering so that Sheet_10 sorts after Sheet_9.
        string[] cvsFiles = Directory.GetFiles(pathString);
        Array.Sort(cvsFiles, new AlphanumComparatorFast());

        for (int i = 0; i < cvsFiles.Length; i++)
        {
            Microsoft.Office.Interop.Excel.Worksheet xlWorkSheet = xlWorkBook.Sheets[i + 1];
            try
            {
                // Make sure a sheet exists for the next CSV before filling this one.
                if (i < cvsFiles.Length - 1)
                {
                    xlWorkBook.Worksheets.Add(Type.Missing, xlWorkSheet, Type.Missing, Type.Missing);
                }

                FillSheetFromCsv(xlWorkSheet, cvsFiles[i]);
            }
            finally
            {
                releaseObject(xlWorkSheet);
            }
        }

        xlWorkBook.SaveAs(outfilePDFExcel1, Microsoft.Office.Interop.Excel.XlFileFormat.xlWorkbookNormal,
            misValue, misValue, misValue, misValue,
            Microsoft.Office.Interop.Excel.XlSaveAsAccessMode.xlExclusive,
            misValue, misValue, misValue, misValue, misValue);
        xlWorkBook.Close(true, misValue, misValue);
        workbookClosed = true;
    }
    finally
    {
        // Always shut Excel down, otherwise a failed conversion leaves a hidden
        // EXCEL.EXE process behind.
        if (xlWorkBook != null)
        {
            if (!workbookClosed)
            {
                // Discard the half-built workbook so Quit does not wait on a save prompt.
                xlWorkBook.Close(false, misValue, misValue);
            }

            releaseObject(xlWorkBook);
        }

        xlApp.Quit();
        releaseObject(xlApp);
    }

    // Remove the intermediate CSV files.
    var dir = new DirectoryInfo(pathString);
    dir.Attributes = dir.Attributes & ~FileAttributes.ReadOnly;
    dir.Delete(true);
}

/// <summary>
/// Copies the double-quoted fields of one CSV file into a worksheet, one CSV line per row.
/// </summary>
/// <param name="sheet">Worksheet that receives the values, starting at row 1.</param>
/// <param name="csvPath">CSV file to read.</param>
private static void FillSheetFromCsv(Microsoft.Office.Interop.Excel.Worksheet sheet, string csvPath)
{
    int sheetRow = 1;
    using (StreamReader csv = new StreamReader(File.OpenRead(csvPath)))
    {
        while (!csv.EndOfStream)
        {
            string line = csv.ReadLine();
            Console.WriteLine(line);
            try
            {
                // Splitting on '"' leaves the quoted field contents at the odd indexes;
                // the even indexes hold the separators between fields.
                string[] pieces = line.Split('"');
                for (int col = 1; col < pieces.Length; col += 2)
                {
                    if (pieces[col] != "")
                    {
                        // Field k (1-based) sits at piece index 2k-1, so it goes straight
                        // to column k. This yields the same layout as writing to every
                        // second column and deleting the empty ones afterwards.
                        sheet.Cells[sheetRow, (col + 1) / 2] = pieces[col];
                    }
                }

                sheetRow++;
            }
            catch (Exception e)
            {
                // Best effort: a line that cannot be written is reported and skipped
                // instead of being dropped silently. The row index is not advanced.
                Console.Error.WriteLine("Skipping line in " + csvPath + " (sheet row " + sheetRow + "): " + e.Message);
            }
        }
    }
}