// csharp_pain/Scraping/COM/samples/Java/TextExt.java

/****************************************************************************
 *
 * File:            TextExt.java
 *
 * Usage:           java TextExt PDF-InputFile
 *
 * Description:     Prints out the text of PDF-InputFile.
 *
 * Version:         1.03  (18-November-2005)
 *
 * Author:          Philip Renggli, PDF Tools AG
 *
 * Copyright:       Copyright (C) 2005 PDF Tools AG, Switzerland
 *                  Permission to use, copy, modify, and distribute this
 *                  software and its documentation for any purpose and without
 *                  fee is hereby granted, provided that the above copyright
 *                  notice appear in all copies and that both that copyright
 *                  notice and this permission notice appear in supporting
 *                  documentation.  This software is provided "as is" without
 *                  express or implied warranty.
 *
 ***************************************************************************/

import com.pdftools.expa.*;

/**
 * Command-line sample that prints the text of a PDF file to standard output.
 *
 * <p>Usage: {@code java TextExt PDF-InputFile}
 */
public class TextExt {

    /**
     * Opens the PDF named by {@code args[0]} and prints the text tokens of
     * every page, preceded by a {@code ** Begin Page n of m **} header.
     *
     * <p>Tokens whose first Y position equals that of the previous token are
     * written on the same output line separated by a single space; a change
     * in Y position starts a new output line.
     *
     * <p>If no input file is given, a usage message is written to standard
     * error and the method returns. Any failure raised while reading the
     * document is reported with a stack trace; the document is closed in
     * either case.
     *
     * @param args command-line arguments; {@code args[0]} is the PDF input file
     */
    public static void main(String[] args) {
        if (args == null || args.length < 1) {
            System.err.println("Usage: java TextExt PDF-InputFile");
            return;
        }

        try {
            // open input file
            // NOTE(review): the second constructor argument is passed as an
            // empty string; presumably the document password -- confirm.
            Document thePDF = new Document(args[0], "");
            try {
                // loop through all pages (page numbers are 1-based here)
                for (int curPage = 1; curPage <= thePDF.getPageCount(); curPage++) {
                    // set the page number
                    thePDF.setPageNo(curPage);

                    // get the content
                    Content theContent = thePDF.getPage().getContent();

                    // extract words
                    // NOTE(review): resetContent is called twice with opposite
                    // flags; kept exactly as in the original sample because the
                    // flag's meaning is not visible here -- confirm against the
                    // library documentation whether both calls are needed.
                    theContent.resetContent(true);
                    theContent.resetContent(false);

                    System.out.println("** Begin Page " + curPage + " of " + thePDF.getPageCount() + " **");

                    // true while a token has been printed on the current output
                    // line and that line has not been terminated yet
                    boolean lineOpen = false;
                    float lastY = 0f;

                    // loop through all text tokens on the page
                    while (theContent.getNextText() != null) {
                        // a token without any Y position ends processing of this page
                        // NOTE(review): remaining tokens on the page are skipped,
                        // as in the original -- confirm this is intended.
                        if (theContent.getText().getYPos().length == 0) {
                            System.out.println();
                            lineOpen = false;
                            break;
                        }

                        // compare Y position with the position of the last token
                        float curY = theContent.getText().getYPos()[0];
                        if (lineOpen) {
                            if (lastY == curY) {
                                System.out.print(" ");
                            } else {
                                System.out.println();
                            }
                        }
                        System.out.print(theContent.getText().getUnicodeString());

                        // save the Y position
                        lastY = curY;
                        lineOpen = true;
                    }

                    // terminate the last text line so the next page header (or
                    // the end of the output) does not run into it
                    if (lineOpen) {
                        System.out.println();
                    }
                }
            } finally {
                // release the document even when page processing fails
                thePDF.close();
            }
        } catch (Throwable e) {
            // top-level boundary of the sample: the library's failure types are
            // not known here, so everything is reported rather than propagated
            e.printStackTrace();
        }
    }
}