/**************************************************************************** * * File: TextExt.java * * Usage: java TextExt PDF-InputFile * * Description: Prints out the text of PDF-InputFile. * * Version: 1.03 (18-November-2005) * * Author: Philip Renggli, PDF Tools AG * * Copyright: Copyright (C) 2005 PDF Tools AG, Switzerland * Permission to use, copy, modify, and distribute this * software and its documentation for any purpose and without * fee is hereby granted, provided that the above copyright * notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting * documentation. This software is provided "as is" without * express or implied warranty. * ***************************************************************************/ import com.pdftools.expa.*; public class TextExt { public static void main(String[] args) { try { // open input file Document thePDF = new Document(args[0], ""); // loop through all pages for(int curPage = 1; curPage <= thePDF.getPageCount(); curPage++) { // set the page number thePDF.setPageNo(curPage); // get the content Content theContent = thePDF.getPage().getContent(); // extract words theContent.resetContent(true); theContent.resetContent(false); // loop through all text tokens on the page System.out.println("** Begin Page " + curPage + " of " + thePDF.getPageCount() + " **"); double dY_old = -1.0; while(theContent.getNextText() != null) { // compare Y position with the position of the last token if(theContent.getText().getYPos().length == 0) { System.out.println(); break; } float dY_new = theContent.getText().getYPos()[0]; if(dY_old == dY_new) System.out.print(" "); else if (dY_old != -1.0) System.out.println(); System.out.print(theContent.getText().getUnicodeString()); // save the Y position dY_old = dY_new; } } thePDF.close(); } catch (Throwable e) { e.printStackTrace(); } } }