如何导出使用POI的Excel中的embedded文件?

我编写了一个java基础程序,下面是使用Apache POI在Excel表格中embedded3种文件(ppt,doc,txt)。 现在这个文件,我想用它的原始格式导出。 这个怎么做?

参考链接是使用Apache POI将文件embedded到Excel中 。 我从这个链接制作了节目。

总之我想在embedded式文件上导出function。

我已经使用下面的代码尝试了上面的问题,但它不适用于在Excel表中导出embedded的文件:

这是试图解决的代码:

public static void main(String[] args) throws IOException { String fileName = "ole_ppt_in_xls.xls"; ReadExcel(fileName); } public static void ReadExcel(String fileName) throws IOException { FileInputStream inputFileStream = new FileInputStream(fileName); POIFSFileSystem fs = new POIFSFileSystem(inputFileStream); HSSFWorkbook workbook = new HSSFWorkbook(fs); for (HSSFObjectData obj : workbook.getAllEmbeddedObjects()) { // the OLE2 Class Name of the object String oleName = obj.getOLE2ClassName(); System.out.println(oleName); if (oleName.equals("Worksheet")) { System.out.println("Worksheet"); DirectoryNode dn = (DirectoryNode) obj.getDirectory(); HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(dn, fs, false); } else if (oleName.equals("Document")) { System.out.println("Document"); DirectoryNode dn = (DirectoryNode) obj.getDirectory(); HWPFDocument embeddedWordDocument = new HWPFDocument(dn, fs); } else if (oleName.equals("Presentation")) { System.out.println("Presentation"); DirectoryNode dn = (DirectoryNode) obj.getDirectory(); SlideShow embeddedPowerPointDocument = new SlideShow( new HSLFSlideShow(dn, fs)); } else if (oleName.equals("Presentation")) { System.out.println("Presentation"); DirectoryNode dn = (DirectoryNode) obj.getDirectory(); SlideShow embeddedPowerPointDocument = new SlideShow( new HSLFSlideShow(dn, fs)); }else { System.out.println("Else part "); if (obj.hasDirectoryEntry()) { System.out.println("obj.hasDirectoryEntry()"+obj.hasDirectoryEntry()); // The DirectoryEntry is a DocumentNode. Examine its entries DirectoryNode dn = (DirectoryNode) obj.getDirectory(); for (Iterator entries = dn.getEntries(); entries.hasNext();) { Entry entry = (Entry) entries.next(); System.out.println(oleName + "." + entry.getName()); } } else { System.out.println("Else part 22"); byte[] objectData = obj.getObjectData(); } } } } 

以上程序的输出画面: 在这里输入图像说明

那么,如何导出function实现?

 import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.util.ArrayList; /** * Demonstrates how you can extract embedded data from a .xlsx file */ public class GetEmbedded { public static void main(String[] args) throws Exception { String path = "SomeExcelFile.xlsx" XSSFWorkbook workbook = new XSSFWorkbook(new FileInputStream(new File(path))); for (PackagePart pPart : workbook.getAllEmbedds()) { String contentType = pPart.getContentType(); if (contentType.equals("application/vnd.ms-excel")) { //This is to read xls workbook embedded to xlsx file HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(pPart.getInputStream()); int countOfSheetXls=embeddedWorkbook.getNumberOfSheets(); } else if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) { //This is to read xlsx workbook embedded to xlsx file if(pPart.getPartName().getName().equals("/xl/embeddings/Microsoft_Excel_Worksheet12.xlsx")){ //"/xl/embeddings/Microsoft_Excel_Worksheet12.xlsx" - Can read an Excel from a particular sheet // This is the worksheet from the Parent Excel-sheet-12 XSSFWorkbook embeddedWorkbook = new XSSFWorkbook(pPart.getInputStream()); int countOfSheetXlsx=embeddedWorkbook.getNumberOfSheets(); ArrayList<String> sheetNames= new ArrayList<String>(); for(int i=0;i<countOfSheetXlsx;i++){ String name=workbook.getSheetName(i); sheetNames.add(name); } } } } } } 

这部分是如何从使用Apache POI的xls文件中获取名称的图片的复本, 我为此编写了原始粘贴 。

根据请求,我还添加了一个如何添加和embedded与OLE 1.0包装的帮助的示例 – 在此期间,我已经添加到POI的代码,所以现在更容易。 对于基于OOXML的文件有一个看看这个答案 。

所以代码遍历DrawingPatriarch的所有形状,并提取图片和embedded文件。

我已经添加了完整的代码 – 而不是一个片段 – 这个答案,因为我期待下一个“为什么我不能导出这种embedded”来很快…

 package poijartest; import java.awt.Color; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.Closeable; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.lang.reflect.Method; import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.imageio.ImageIO; import org.apache.poi.ddf.EscherComplexProperty; import org.apache.poi.ddf.EscherOptRecord; import org.apache.poi.ddf.EscherProperty; import org.apache.poi.hpsf.ClassID; import org.apache.poi.hslf.usermodel.HSLFSlideShow; import org.apache.poi.hssf.usermodel.HSSFClientAnchor; import org.apache.poi.hssf.usermodel.HSSFObjectData; import org.apache.poi.hssf.usermodel.HSSFPatriarch; import org.apache.poi.hssf.usermodel.HSSFPicture; import org.apache.poi.hssf.usermodel.HSSFPictureData; import org.apache.poi.hssf.usermodel.HSSFShape; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFSimpleShape; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.Ole10NativeException; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.sl.usermodel.AutoShape; import org.apache.poi.sl.usermodel.ShapeType; import org.apache.poi.sl.usermodel.Slide; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.WorkbookFactory; import org.apache.poi.util.IOUtils; import org.apache.poi.xssf.usermodel.XSSFDrawing; import org.apache.poi.xssf.usermodel.XSSFPicture; import org.apache.poi.xssf.usermodel.XSSFPictureData; import org.apache.poi.xssf.usermodel.XSSFShape; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTPicture; /** * Tested with POI 3.16-beta1 * * 17.12.2014: original version for * http://apache-poi.1045710.n5.nabble.com/How-to-get-the-full-file-name-of-a-picture-in-xls-file-td5717205.html * * 17.12.2016: added sample/dummy data for * https://stackoverflow.com/questions/41101012/how-to-export-embeded-file-which-from-excel-using-poi */ public class EmbeddedReader { private File excel_file; private ImageReader image_reader; public static void main(String[] args) throws Exception { File sample = new File("bla.xls"); getSampleEmbedded(sample); ImageReader ir = new ImageReader(sample); for (EmbeddedData ed : ir.embeddings) { System.out.println(ed.filename); FileOutputStream fos = new FileOutputStream(ed.filename); IOUtils.copy(ed.is, fos); fos.close(); } ir.close(); } static void getSampleEmbedded(File sample) throws IOException { HSSFWorkbook wb = new HSSFWorkbook(); int storageId = wb.addOlePackage(getSamplePPT(), "dummy.ppt", "dummy.ppt", "dummy.ppt"); int picId = wb.addPicture(getSamplePng(), HSSFPicture.PICTURE_TYPE_PNG); HSSFSheet sheet = wb.createSheet(); HSSFPatriarch pat = sheet.createDrawingPatriarch(); HSSFClientAnchor anc = pat.createAnchor(0, 0, 0, 0, 1, 1, 3, 6); HSSFObjectData od = pat.createObjectData(anc, storageId, picId); od.setNoFill(true); wb.write(sample); wb.close(); } static byte[] getSamplePng() throws IOException { ClassLoader cl = Thread.currentThread().getContextClassLoader(); URL imgUrl = cl.getResource("javax/swing/plaf/metal/icons/ocean/directory.gif"); BufferedImage img = ImageIO.read(imgUrl); ByteArrayOutputStream bos = new ByteArrayOutputStream(); ImageIO.write(img, "PNG", bos); return bos.toByteArray(); } static byte[] getSamplePPT() throws IOException { HSLFSlideShow ppt = new HSLFSlideShow(); Slide<?,?> slide = ppt.createSlide(); AutoShape<?,?> sh1 = slide.createAutoShape(); sh1.setShapeType(ShapeType.STAR_32); sh1.setAnchor(new java.awt.Rectangle(50, 50, 100, 200)); sh1.setFillColor(Color.red); ByteArrayOutputStream bos = new ByteArrayOutputStream(); ppt.write(bos); ppt.close(); POIFSFileSystem poifs = new POIFSFileSystem(new ByteArrayInputStream(bos.toByteArray())); poifs.getRoot().setStorageClsid(ClassID.PPT_SHOW); bos.reset(); poifs.writeFilesystem(bos); poifs.close(); return bos.toByteArray(); } public EmbeddedReader(String excel_path) throws IOException { excel_file = new File(excel_path); image_reader = new ImageReader(excel_file); } public String[] get_file_names() { ArrayList<String> file_names = new ArrayList<String>(); for (EmbeddedData ed : image_reader.embeddings) { file_names.add(ed.filename); } return file_names.toArray(new String[file_names.size()]); } public InputStream get_stream(String file_name) { InputStream input_stream = null; for (EmbeddedData ed : image_reader.embeddings) { if(file_name.equals(ed.filename)) { input_stream = ed.is; break; } } return input_stream; } static class ImageReader implements Closeable { EmbeddedExtractor extractors[] = { new Ole10Extractor(), new PdfExtractor(), new WordExtractor(), new ExcelExtractor(), new FsExtractor() }; List<EmbeddedData> embeddings = new ArrayList<EmbeddedData>(); Workbook wb; public ImageReader(File excelfile) throws IOException { try { wb = WorkbookFactory.create(excelfile); Sheet receiptImages = wb.getSheet("Receipt images"); if (wb instanceof XSSFWorkbook) { addSheetPicsAndEmbedds((XSSFSheet)receiptImages); } else { addAllEmbedds((HSSFWorkbook)wb); addSheetPics((HSSFSheet)receiptImages); } } catch (Exception e) { // todo: error handling } } protected void addSheetPicsAndEmbedds(XSSFSheet sheet) throws IOException { if (sheet == null) return; XSSFDrawing draw = sheet.createDrawingPatriarch(); for (XSSFShape shape : draw.getShapes()) { if (!(shape instanceof XSSFPicture)) continue; XSSFPicture picture = (XSSFPicture)shape; XSSFPictureData pd = picture.getPictureData(); PackagePart pp = pd.getPackagePart(); CTPicture ctPic = picture.getCTPicture(); String filename = null; try { filename = ctPic.getNvPicPr().getCNvPr().getName(); } catch (Exception e) {} if (filename == null || "".equals(filename)) { filename = new File(pp.getPartName().toString()).getName(); } EmbeddedData ed = new EmbeddedData(); ed.filename = fileNameWithoutPath(filename); ed.is = pp.getInputStream(); embeddings.add(ed); } } protected void addAllEmbedds(HSSFWorkbook hwb) throws IOException { for (HSSFObjectData od : hwb.getAllEmbeddedObjects()) { String alternativeName = getAlternativeName(od); if (od.hasDirectoryEntry()) { DirectoryNode src = (DirectoryNode)od.getDirectory(); for (EmbeddedExtractor ee : extractors) { if (ee.canExtract(src)) { EmbeddedData ed = ee.extract(src); if (ed.filename == null || ed.filename.startsWith("MBD") || alternativeName != null) { ed.filename = alternativeName; } ed.filename = fileNameWithoutPath(ed.filename); ed.source = "object"; embeddings.add(ed); break; } } } } } protected String getAlternativeName(HSSFShape shape) { EscherOptRecord eor = reflectEscherOptRecord(shape); if (eor == null) return null; for (EscherProperty ep : eor.getEscherProperties()) { if ("groupshape.shapename".equals(ep.getName()) && ep.isComplex()) { return new String(((EscherComplexProperty)ep).getComplexData(), Charset.forName("UTF-16LE")); } } return null; } protected void addSheetPics(HSSFSheet sheet) { if (sheet == null) return; int picIdx=0; int emfIdx = 0; HSSFPatriarch patriarch = sheet.getDrawingPatriarch(); if (patriarch == null) return; // Loop through the objects for (HSSFShape shape : patriarch.getChildren()) { if (!(shape instanceof HSSFPicture)) { continue; } HSSFPicture picture = (HSSFPicture) shape; if (picture.getShapeType() != HSSFSimpleShape.OBJECT_TYPE_PICTURE) continue; HSSFPictureData pd = picture.getPictureData(); byte pictureBytes[] = pd.getData(); int pictureBytesOffset = 0; int pictureBytesLen = pictureBytes.length; String filename = picture.getFileName(); // try to find an alternative name if (filename == null || "".equals(filename)) { filename = getAlternativeName(picture); } // default to dummy name if (filename == null || "".equals(filename)) { filename = "picture"+(picIdx++); } filename = filename.trim(); // check for emf+ embedded pdf (poor mans style :( ) // Mac Excel 2011 embeds pdf files with this method. boolean validFile = true; if (pd.getFormat() == Workbook.PICTURE_TYPE_EMF) { validFile = false; int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes()); if (idxStart != -1) { int idxEnd = indexOf(pictureBytes, idxStart, "%%EOF".getBytes()); if (idxEnd != -1) { pictureBytesOffset = idxStart; pictureBytesLen = idxEnd-idxStart+6; validFile = true; } } else { // This shape was not a Mac Excel 2011 embedded pdf file. // So this is a shape related to a regular embedded object // Lets update the object filename with the shapes filename // if the object filename is of format ARGF1234.pdf EmbeddedData ed_obj = embeddings.get(emfIdx); Pattern pattern = Pattern.compile("^[A-Z0-9]{8}\\.[pdfPDF]{3}$"); Matcher matcher = pattern.matcher(ed_obj.filename); if(matcher.matches()) { ed_obj.filename = filename; } emfIdx += 1; } } EmbeddedData ed = new EmbeddedData(); ed.filename = fileNameWithoutPath(filename); ed.is = new ByteArrayInputStream(pictureBytes, pictureBytesOffset, pictureBytesLen); if(fileNotInEmbeddings(ed.filename) && validFile) { embeddings.add(ed); } } } private static EscherOptRecord reflectEscherOptRecord(HSSFShape shape) { try { Method m = HSSFShape.class.getDeclaredMethod("getOptRecord"); m.setAccessible(true); return (EscherOptRecord)m.invoke(shape); } catch (Exception e) { // todo: log ... well actually "should not happen" ;) return null; } } private String fileNameWithoutPath(String filename) { int last_index = filename.lastIndexOf("\\"); return filename.substring(last_index + 1); } private boolean fileNotInEmbeddings(String filename) { boolean exists = true; for(EmbeddedData ed : embeddings) { if(ed.filename.equals(filename)) { exists = false; } } return exists; } public void close() throws IOException { Iterator<EmbeddedData> ed = embeddings.iterator(); while (ed.hasNext()) { ed.next().is.close(); } wb.close(); } } static class EmbeddedData { String filename; InputStream is; String source; } static abstract class EmbeddedExtractor { abstract boolean canExtract(DirectoryNode dn); abstract EmbeddedData extract(DirectoryNode dn) throws IOException; protected EmbeddedData extractFS(DirectoryNode dn, String filename) throws IOException { assert(canExtract(dn)); POIFSFileSystem dest = new POIFSFileSystem(); copyNodes(dn, dest.getRoot()); EmbeddedData ed = new EmbeddedData(); ed.filename = filename; ByteArrayOutputStream bos = new ByteArrayOutputStream(); dest.writeFilesystem(bos); dest.close(); ed.is = new ByteArrayInputStream(bos.toByteArray()); return ed; } } static class Ole10Extractor extends EmbeddedExtractor { public boolean canExtract(DirectoryNode dn) { ClassID clsId = dn.getStorageClsid(); return ClassID.OLE10_PACKAGE.equals(clsId); } public EmbeddedData extract(DirectoryNode dn) throws IOException { try { Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn); EmbeddedData ed = new EmbeddedData(); ed.filename = new File(ole10.getFileName()).getName(); ed.is = new ByteArrayInputStream(ole10.getDataBuffer()); return ed; } catch (Ole10NativeException e) { throw new IOException(e); } } } static class PdfExtractor extends EmbeddedExtractor { static ClassID PdfClassID = new ClassID("{B801CA65-A1FC-11D0-85AD-444553540000}"); public boolean canExtract(DirectoryNode dn) { ClassID clsId = dn.getStorageClsid(); return (PdfClassID.equals(clsId) || dn.hasEntry("CONTENTS")); } public EmbeddedData extract(DirectoryNode dn) throws IOException { EmbeddedData ed = new EmbeddedData(); ed.is = dn.createDocumentInputStream("CONTENTS"); ed.filename = dn.getName()+".pdf"; return ed; } } static class WordExtractor extends EmbeddedExtractor { public boolean canExtract(DirectoryNode dn) { ClassID clsId = dn.getStorageClsid(); return (ClassID.WORD95.equals(clsId) || ClassID.WORD97.equals(clsId) || dn.hasEntry("WordDocument")); } public EmbeddedData extract(DirectoryNode dn) throws IOException { return extractFS(dn, dn.getName()+".doc"); } } static class ExcelExtractor extends EmbeddedExtractor { public boolean canExtract(DirectoryNode dn) { ClassID clsId = dn.getStorageClsid(); return (ClassID.EXCEL95.equals(clsId) || ClassID.EXCEL97.equals(clsId) || dn.hasEntry("Workbook") /*...*/); } public EmbeddedData extract(DirectoryNode dn) throws IOException { return extractFS(dn, dn.getName()+".xls"); } } static class FsExtractor extends EmbeddedExtractor { public boolean canExtract(DirectoryNode dn) { return true; } public EmbeddedData extract(DirectoryNode dn) throws IOException { return extractFS(dn, dn.getName()+".dat"); } } private static void copyNodes(DirectoryNode src, DirectoryNode dest) throws IOException { for (Entry e : src) { if (e instanceof DirectoryNode) { DirectoryNode srcDir = (DirectoryNode)e; DirectoryNode destDir = (DirectoryNode)dest.createDirectory(srcDir.getName()); destDir.setStorageClsid(srcDir.getStorageClsid()); copyNodes(srcDir, destDir); } else { InputStream is = src.createDocumentInputStream(e); dest.createDocument(e.getName(), is); is.close(); } } } /** * Knuth-Morris-Pratt Algorithm for Pattern Matching * Finds the first occurrence of the pattern in the text. */ private static int indexOf(byte[] data, int offset, byte[] pattern) { int[] failure = computeFailure(pattern); int j = 0; if (data.length == 0) return -1; for (int i = offset; i < data.length; i++) { while (j > 0 && pattern[j] != data[i]) { j = failure[j - 1]; } if (pattern[j] == data[i]) { j++; } if (j == pattern.length) { return i - pattern.length + 1; } } return -1; } /** * Computes the failure function using a boot-strapping process, * where the pattern is matched against itself. */ private static int[] computeFailure(byte[] pattern) { int[] failure = new int[pattern.length]; int j = 0; for (int i = 1; i < pattern.length; i++) { while (j > 0 && pattern[j] != pattern[i]) { j = failure[j - 1]; } if (pattern[j] == pattern[i]) { j++; } failure[i] = j; } return failure; } } 

必需的jar文件列表:

  1. 公地编解码器1.10.jar
  2. dom4j.jar
  3. POI-3.16-beta1.jar
  4. POI-OOXML-3.8.jar
  5. POI-OOXML-模式-3.9.jar
  6. POI暂存器-3.9.jar
  7. XMLBeans的-2.3.0.jar

这是我的整个代码实现:

  import java.awt.Color; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.Closeable; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.lang.reflect.Method; import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.imageio.ImageIO; import org.apache.poi.ddf.EscherComplexProperty; import org.apache.poi.ddf.EscherOptRecord; import org.apache.poi.ddf.EscherProperty; import org.apache.poi.hpsf.ClassID; import org.apache.poi.hslf.HSLFSlideShow; import org.apache.poi.hssf.usermodel.HSSFClientAnchor; import org.apache.poi.hssf.usermodel.HSSFObjectData; import org.apache.poi.hssf.usermodel.HSSFPatriarch; import org.apache.poi.hssf.usermodel.HSSFPicture; import org.apache.poi.hssf.usermodel.HSSFPictureData; import org.apache.poi.hssf.usermodel.HSSFShape; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFSimpleShape; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.Ole10NativeException; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.sl.usermodel.AutoShape; import org.apache.poi.sl.usermodel.Slide; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.WorkbookFactory; import org.apache.poi.util.IOUtils; import org.apache.poi.xssf.usermodel.XSSFDrawing; import org.apache.poi.xssf.usermodel.XSSFPicture; import org.apache.poi.xssf.usermodel.XSSFPictureData; import org.apache.poi.xssf.usermodel.XSSFShape; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTPicture; public class EmbeddedReader { public static final OleType OLE10_PACKAGE = new OleType("{0003000C-0000-0000-C000-000000000046}"); public static final OleType PPT_SHOW = new OleType("{64818D10-4F9B-11CF-86EA-00AA00B929E8}"); public static final OleType XLS_WORKBOOK = new OleType("{00020841-0000-0000-C000-000000000046}"); public static final OleType TXT_ONLY = new OleType("{5e941d80-bf96-11cd-b579-08002b30bfeb}"); public static final OleType EXCEL97 = new OleType("{00020820-0000-0000-C000-000000000046}"); public static final OleType EXCEL95 = new OleType("{00020810-0000-0000-C000-000000000046}"); public static final OleType WORD97 = new OleType("{00020906-0000-0000-C000-000000000046}"); public static final OleType WORD95 = new OleType("{00020900-0000-0000-C000-000000000046}"); public static final OleType POWERPOINT97 = new OleType("{64818D10-4F9B-11CF-86EA-00AA00B929E8}"); public static final OleType POWERPOINT95 = new OleType("{EA7BAE70-FB3B-11CD-A903-00AA00510EA3}"); public static final OleType EQUATION30 = new OleType("{0002CE02-0000-0000-C000-000000000046}"); public static final OleType PdfClassID = new OleType("{B801CA65-A1FC-11D0-85AD-444553540000}"); private File excel_file; private ImageReader image_reader; static class OleType { final String classId; OleType(String classId) { this.classId = classId; } ClassID getClassID() { ClassID cls = new ClassID(); byte clsBytes[] = cls.getBytes(); String clsStr = classId.replaceAll("[{}-]", ""); for (int i = 0; i < clsStr.length(); i += 2) { clsBytes[i / 2] = (byte) Integer.parseInt( clsStr.substring(i, i + 2), 16); } return cls; } } public static void main(String[] args) throws Exception { File sample = new File("D:\\ole_ppt_in_xls.xls"); ImageReader ir = new ImageReader(sample); for (EmbeddedData ed : ir.embeddings) { FileOutputStream fos = new FileOutputStream(System.getProperty("user.home") + "/Desktop" + "/sumit/"+ ed.filename); IOUtils.copy(ed.is, fos); fos.close(); } ir.close(); } static byte[] getSamplePng() throws IOException { ClassLoader cl = Thread.currentThread().getContextClassLoader(); URL imgUrl = cl.getResource("javax/swing/plaf/metal/icons/ocean/directory.gif"); BufferedImage img = ImageIO.read(imgUrl); ByteArrayOutputStream bos = new ByteArrayOutputStream(); ImageIO.write(img, "PNG", bos); return bos.toByteArray(); } public EmbeddedReader(String excel_path) throws IOException { excel_file = new File(excel_path); image_reader = new ImageReader(excel_file); } public String[] get_file_names() { ArrayList<String> file_names = new ArrayList<String>(); for (EmbeddedData ed : image_reader.embeddings) { file_names.add(ed.filename); } return file_names.toArray(new String[file_names.size()]); } public InputStream get_stream(String file_name) { InputStream input_stream = null; for (EmbeddedData ed : image_reader.embeddings) { if (file_name.equals(ed.filename)) { input_stream = ed.is; break; } } return input_stream; } static class ImageReader implements Closeable { EmbeddedExtractor extractors[] = { new Ole10Extractor(),new PdfExtractor(), new WordExtractor(), new ExcelExtractor(),new FsExtractor() }; List<EmbeddedData> embeddings = new ArrayList<EmbeddedData>(); Workbook wb; public ImageReader(File excelfile) throws IOException { try { wb = WorkbookFactory.create(excelfile); Sheet receiptImages = wb.getSheet("Receipt images"); if (wb instanceof XSSFWorkbook) { addSheetPicsAndEmbedds((XSSFSheet) receiptImages); } else { addAllEmbedds((HSSFWorkbook) wb); addSheetPics((HSSFSheet) receiptImages); } } catch (Exception e) { e.printStackTrace(); } } protected void addSheetPicsAndEmbedds(XSSFSheet sheet)throws IOException { if (sheet == null) return; XSSFDrawing draw = sheet.createDrawingPatriarch(); for (XSSFShape shape : draw.getShapes()) { if (!(shape instanceof XSSFPicture)) continue; XSSFPicture picture = (XSSFPicture) shape; XSSFPictureData pd = picture.getPictureData(); PackagePart pp = pd.getPackagePart(); CTPicture ctPic = picture.getCTPicture(); String filename = null; try { filename = ctPic.getNvPicPr().getCNvPr().getName(); } catch (Exception e) { } if (filename == null || "".equals(filename)) { filename = new File(pp.getPartName().toString()).getName(); } EmbeddedData ed = new EmbeddedData(); ed.filename = fileNameWithoutPath(filename); ed.is = pp.getInputStream(); embeddings.add(ed); } } protected void addAllEmbedds(HSSFWorkbook hwb) throws IOException { for (HSSFObjectData od : hwb.getAllEmbeddedObjects()) { String alternativeName = getAlternativeName(od); if (od.hasDirectoryEntry()) { DirectoryNode src = (DirectoryNode) od.getDirectory(); for (EmbeddedExtractor ee : extractors) { if (ee.canExtract(src)) { EmbeddedData ed = ee.extract(src); if (ed.filename == null || ed.filename.startsWith("MBD")|| alternativeName != null) { if (alternativeName != null) { ed.filename = alternativeName; } } ed.filename = fileNameWithoutPath(ed.filename); ed.source = "object"; embeddings.add(ed); break; } } } } } protected String getAlternativeName(HSSFShape shape) { EscherOptRecord eor = reflectEscherOptRecord(shape); if (eor == null) { return null; } for (EscherProperty ep : eor.getEscherProperties()) { if ("groupshape.shapename".equals(ep.getName()) && ep.isComplex()) { return new String( ((EscherComplexProperty) ep).getComplexData(), Charset.forName("UTF-16LE")); } } return null; } protected void addSheetPics(HSSFSheet sheet) { if (sheet == null) return; int picIdx = 0; int emfIdx = 0; HSSFPatriarch patriarch = sheet.getDrawingPatriarch(); if (patriarch == null) return; // Loop through the objects for (HSSFShape shape : patriarch.getChildren()) { if (!(shape instanceof HSSFPicture)) { continue; } HSSFPicture picture = (HSSFPicture) shape; if (picture.getShapeType() != HSSFSimpleShape.OBJECT_TYPE_PICTURE) continue; HSSFPictureData pd = picture.getPictureData(); byte pictureBytes[] = pd.getData(); int pictureBytesOffset = 0; int pictureBytesLen = pictureBytes.length; String filename = picture.getFileName(); // try to find an alternative name if (filename == null || "".equals(filename)) { filename = getAlternativeName(picture); } // default to dummy name if (filename == null || "".equals(filename)) { filename = "picture" + (picIdx++); } filename = filename.trim(); // check for emf+ embedded pdf (poor mans style :( ) // Mac Excel 2011 embeds pdf files with this method. boolean validFile = true; if (pd.getFormat() == Workbook.PICTURE_TYPE_EMF) { validFile = false; int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes()); if (idxStart != -1) { int idxEnd = indexOf(pictureBytes, idxStart,"%%EOF".getBytes()); if (idxEnd != -1) { pictureBytesOffset = idxStart; pictureBytesLen = idxEnd - idxStart + 6; validFile = true; } } else { // This shape was not a Mac Excel 2011 embedded pdf file. // So this is a shape related to a regular embedded object // Lets update the object filename with the shapes filename // if the object filename is of format ARGF1234.pdf EmbeddedData ed_obj = embeddings.get(emfIdx); Pattern pattern = Pattern .compile("^[A-Z0-9]{8}\\.[pdfPDF]{3}$"); Matcher matcher = pattern.matcher(ed_obj.filename); if (matcher.matches()) { ed_obj.filename = filename; } emfIdx += 1; } } EmbeddedData ed = new EmbeddedData(); ed.filename = fileNameWithoutPath(filename); ed.is = new ByteArrayInputStream(pictureBytes, pictureBytesOffset, pictureBytesLen); if (fileNotInEmbeddings(ed.filename) && validFile) { embeddings.add(ed); } } } private static EscherOptRecord reflectEscherOptRecord(HSSFShape shape) { try { Method m = HSSFShape.class.getDeclaredMethod("getOptRecord"); m.setAccessible(true); return (EscherOptRecord) m.invoke(shape); } catch (Exception e) { e.printStackTrace(); return null; } } private String fileNameWithoutPath(String filename) { int last_index = filename.lastIndexOf("\\"); return filename.substring(last_index + 1); } private boolean fileNotInEmbeddings(String filename) { boolean exists = true; for (EmbeddedData ed : embeddings) { if (ed.filename.equals(filename)) { exists = false; } } return exists; } public void close() throws IOException { Iterator<EmbeddedData> ed = embeddings.iterator(); while (ed.hasNext()) { ed.next().is.close(); } wb.close(); } } static class EmbeddedData { String filename; InputStream is; String source; } static abstract class EmbeddedExtractor { abstract boolean canExtract(DirectoryNode dn); abstract EmbeddedData extract(DirectoryNode dn) throws IOException; protected EmbeddedData extractFS(DirectoryNode dn, String filename) throws IOException { assert (canExtract(dn)); POIFSFileSystem dest = new POIFSFileSystem(); copyNodes(dn, dest.getRoot()); EmbeddedData ed = new EmbeddedData(); ed.filename = filename; ByteArrayOutputStream bos = new ByteArrayOutputStream(); dest.writeFilesystem(bos); bos.close(); ed.is = new ByteArrayInputStream(bos.toByteArray()); return ed; } } static class Ole10Extractor extends EmbeddedExtractor { public boolean canExtract(DirectoryNode dn) { ClassID clsId = dn.getStorageClsid(); return OLE10_PACKAGE.equals(clsId); } public EmbeddedData extract(DirectoryNode dn) throws IOException { try { Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn); EmbeddedData ed = new EmbeddedData(); ed.filename = new File(ole10.getFileName()).getName(); ed.is = new ByteArrayInputStream(ole10.getDataBuffer()); return ed; } catch (Ole10NativeException e) { e.printStackTrace(); throw new IOException(e); } } } static class PdfExtractor extends EmbeddedExtractor { public boolean canExtract(DirectoryNode dn) { ClassID clsId = dn.getStorageClsid(); return (PdfClassID.equals(clsId) || dn.hasEntry("CONTENTS")); } public EmbeddedData extract(DirectoryNode dn) throws IOException { EmbeddedData ed = new EmbeddedData(); ed.is = dn.createDocumentInputStream("CONTENTS"); ed.filename = dn.getName() + ".pdf"; return ed; } } static class WordExtractor extends EmbeddedExtractor { public boolean canExtract(DirectoryNode dn) { ClassID clsId = dn.getStorageClsid(); return (WORD95.equals(clsId) || WORD97.equals(clsId) || dn.hasEntry("WordDocument")); } public EmbeddedData extract(DirectoryNode dn) throws IOException { return extractFS(dn, dn.getName() + ".doc"); } } static class ExcelExtractor extends EmbeddedExtractor { public boolean canExtract(DirectoryNode dn) { ClassID clsId = dn.getStorageClsid(); return (EXCEL95.equals(clsId) || EXCEL97.equals(clsId) || dn .hasEntry("Workbook") /* ... */); } public EmbeddedData extract(DirectoryNode dn) throws IOException { return extractFS(dn, dn.getName() + ".xls"); } } static class FsExtractor extends EmbeddedExtractor { public boolean canExtract(DirectoryNode dn) { return true; } public EmbeddedData extract(DirectoryNode dn) throws IOException { return extractFS(dn, dn.getName() + ".dat"); } } private static void copyNodes(DirectoryNode src, DirectoryNode dest) throws IOException { for (Entry e : src) { if (e instanceof DirectoryNode) { DirectoryNode srcDir = (DirectoryNode) e; DirectoryNode destDir = (DirectoryNode) dest .createDirectory(srcDir.getName()); destDir.setStorageClsid(srcDir.getStorageClsid()); copyNodes(srcDir, destDir); } else { InputStream is = src.createDocumentInputStream(e); dest.createDocument(e.getName(), is); is.close(); } } } /** * Knuth-Morris-Pratt Algorithm for Pattern Matching Finds the first * occurrence of the pattern in the text. */ private static int indexOf(byte[] data, int offset, byte[] pattern) { int[] failure = computeFailure(pattern); int j = 0; if (data.length == 0) return -1; for (int i = offset; i < data.length; i++) { while (j > 0 && pattern[j] != data[i]) { j = failure[j - 1]; } if (pattern[j] == data[i]) { j++; } if (j == pattern.length) { return i - pattern.length + 1; } } return -1; } /** * Computes the failure function using a boot-strapping process, where the * pattern is matched against itself. */ private static int[] computeFailure(byte[] pattern) { int[] failure = new int[pattern.length]; int j = 0; for (int i = 1; i < pattern.length; i++) { while (j > 0 && pattern[j] != pattern[i]) { j = failure[j - 1]; } if (pattern[j] == pattern[i]) { j++; } failure[i] = j; } return failure; } 

}

To simplify the processing of embedded data, I've added an extractor class to POI, which will be available in POI 3.16-beta2 or a nightly until then.

The following will extract the objects of .xls/x files – all which is left, is to write the embedded bytes somewhere. It's possible to extend the extractor classes by simply extending EmbeddedExtractor and provide your own iterator() method.

 import java.io.FileInputStream; import java.io.InputStream; import org.apache.poi.ss.extractor.EmbeddedData; import org.apache.poi.ss.extractor.EmbeddedExtractor; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.WorkbookFactory; public class BlaExtract { public static void main(String[] args) throws Exception { InputStream fis = new FileInputStream("bla.xlsx"); Workbook wb = WorkbookFactory.create(fis); fis.close(); EmbeddedExtractor ee = new EmbeddedExtractor(); for (Sheet s : wb) { for (EmbeddedData ed : ee.extractAll(s)) { System.out.println(ed.getFilename()+" ("+ed.getContentType()+") - "+ed.getEmbeddedData().length+" bytes"); } } wb.close(); } }