PDF parsing in Java using Apache PDFBox

Apache PDFBox is an open-source Java library for manipulating PDF documents. We can use PDFBox to read PDFs, fill in forms, create new PDFs, save pages as images, and more.

In this tutorial, we will see how to read the text of a PDF, convert the pages of a PDF to images, and extract the images embedded in a PDF.

 

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
public class PdfTest {
    public static void main(String[] args) throws IOException {
        try {
            PDDocument doc = PDDocument.load(new File("filename.pdf"));
            String text = new PDFTextStripper().getText(doc);

            System.out.println("Content in PDF\n---------------------------------");
            System.out.println(text);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

 

To convert the PDF document to Images:

 

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
public class PdfTest {
    public static void main(String[] args) throws IOException {
        try {
            PDDocument document = PDDocument.load(new File("filename.pdf"));
            PDFRenderer pdfRenderer = new PDFRenderer(document);
            // Get number of pages in PDF document and iterate
            for (int countOfPage = 0; countOfPage < document.getNumberOfPages(); ++countOfPage)
            {
                BufferedImage bim = pdfRenderer.renderImageWithDPI(countOfPage, 300, ImageType.RGB);
                // suffix in filename will be used as the file format
                ImageIO.write(bim, "png", new File (""+countOfPage+"Cucum.png"));
            }
            document.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    }

 

To extract the images embedded in a PDF document:

 

package org.example;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
public class PdfTest {
    public static void main(String[] args) throws IOException {
        
        PDDocument doc = PDDocument.load(new File("filename.pdf"));
        // Get number of pages in pdf and iterate
        PDPageTree list = doc.getPages();
        for (PDPage page : list) {
            PDResources pdResources = page.getResources();
            int i = 1;
            for (COSName name : pdResources.getXObjectNames()) {
                PDXObject pobject = pdResources.getXObject(name);
                if (pobject instanceof PDImageXObject) {
                    PDImageXObject img = (PDImageXObject)pobject;
                    String filename = "extracted-image-" + i + ".png";
                    ImageIO.write(img.getImage(), "png", new File(filename));
                    i++;
                }
            }
        }
    }
}

 

Related Tutorials

Related Questions






Read more