csharp_pain/Scraping/COM/samples/Java/TextExt.java

/****************************************************************************
 *
 * File:            TextExt.java
 *
 * Usage:           java TextExt PDF-InputFile
 *
 * Description:     Prints out the text of PDF-InputFile.
 *
 * Version:         1.03  (18-November-2005)
 *
 * Author:          Philip Renggli, PDF Tools AG   
 * 
 * Copyright:       Copyright (C) 2005 PDF Tools AG, Switzerland
 *                  Permission to use, copy, modify, and distribute this
 *                  software and its documentation for any purpose and without
 *                  fee is hereby granted, provided that the above copyright
 *                  notice appear in all copies and that both that copyright
 *                  notice and this permission notice appear in supporting
 *                  documentation.  This software is provided "as is" without
 *                  express or implied warranty.
 *
 ***************************************************************************/

import com.pdftools.expa.*;

/**
 * Command-line sample that prints the text of a PDF file to standard output,
 * page by page, using the {@code com.pdftools.expa} extraction API.
 *
 * <p>Usage: {@code java TextExt PDF-InputFile}
 */
public class TextExt {

    /**
     * Opens the PDF named by {@code args[0]} and prints its text tokens.
     *
     * <p>Each page is introduced by a "** Begin Page x of n **" banner.
     * Consecutive tokens that share the same first Y position are written on
     * one output line separated by a single space; a change in Y position
     * starts a new output line.
     *
     * <p>Any error raised while opening or reading the document is reported
     * via its stack trace; the document is closed in all cases.
     *
     * @param args command-line arguments; {@code args[0]} is the input PDF path
     */
    public static void main(String[] args) {
        // Without an input file there is nothing to do; say so instead of
        // failing with an ArrayIndexOutOfBoundsException stack trace.
        if (args == null || args.length < 1) {
            System.err.println("Usage: java TextExt PDF-InputFile");
            return;
        }

        Document thePDF = null;
        try {
            // open input file (empty password)
            thePDF = new Document(args[0], "");

            // the page count does not change while reading, so query it once
            int pageCount = thePDF.getPageCount();

            // loop through all pages
            for (int curPage = 1; curPage <= pageCount; curPage++) {
                // set the page number
                thePDF.setPageNo(curPage);

                // get the content
                Content theContent = thePDF.getPage().getContent();

                // extract words
                // NOTE(review): resetContent is invoked twice with opposite
                // flags, exactly as in the original sample; presumably the
                // second call determines the effective mode - confirm against
                // the API documentation before removing either call.
                theContent.resetContent(true);
                theContent.resetContent(false);

                System.out.println("** Begin Page " + curPage + " of " + pageCount + " **");

                // lineOpen is true while text has been printed on the current
                // output line without a terminating newline. Using a flag
                // (rather than a magic Y value) keeps a real Y position of
                // -1.0 from being mistaken for "no previous token".
                boolean lineOpen = false;
                float lastY = 0.0f;

                // loop through all text tokens on the page
                while (theContent.getNextText() != null) {
                    float[] yPos = theContent.getText().getYPos();

                    // a token without position data ends the page
                    if (yPos.length == 0) {
                        System.out.println();
                        lineOpen = false;
                        break;
                    }

                    // compare Y position with the position of the last token
                    float curY = yPos[0];
                    if (lineOpen) {
                        if (curY == lastY) {
                            System.out.print(" ");
                        } else {
                            System.out.println();
                        }
                    }
                    System.out.print(theContent.getText().getUnicodeString());

                    // save the Y position
                    lastY = curY;
                    lineOpen = true;
                }

                // terminate the last line so the next page banner starts on
                // a line of its own
                if (lineOpen) {
                    System.out.println();
                }
            }
        } catch (Throwable e) {
            e.printStackTrace();
        } finally {
            // always release the document, even if extraction failed
            if (thePDF != null) {
                try {
                    thePDF.close();
                } catch (Throwable e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
initial commit 2014-06-26 15:13:46 +00:00			`/****************************************************************************`
			`*`
			`* File: TextExt.java`
			`*`
			`* Usage: java TextExt PDF-InputFile`
			`*`
			`* Description: Prints out the text of PDF-InputFile.`
			`*`
			`* Version: 1.03 (18-November-2005)`
			`*`
			`* Author: Philip Renggli, PDF Tools AG`
			`*`
			`* Copyright: Copyright (C) 2005 PDF Tools AG, Switzerland`
			`* Permission to use, copy, modify, and distribute this`
			`* software and its documentation for any purpose and without`
			`* fee is hereby granted, provided that the above copyright`
			`* notice appear in all copies and that both that copyright`
			`* notice and this permission notice appear in supporting`
			`* documentation. This software is provided "as is" without`
			`* express or implied warranty.`
			`*`
			`***************************************************************************/`

			`import com.pdftools.expa.*;`

			`public class TextExt {`

			`public static void main(String[] args) {`
			`try {`
			`// open input file`
			`Document thePDF = new Document(args[0], "");`

			`// loop through all pages`
			`for(int curPage = 1; curPage <= thePDF.getPageCount(); curPage++)`
			`{`
			`// set the page number`
			`thePDF.setPageNo(curPage);`

			`// get the content`
			`Content theContent = thePDF.getPage().getContent();`

			`// extract words`
			`theContent.resetContent(true);`
			`theContent.resetContent(false);`

			`// loop through all text tokens on the page`
			`System.out.println(" Begin Page " + curPage + " of " + thePDF.getPageCount() + " ");`
			`double dY_old = -1.0;`
			`while(theContent.getNextText() != null)`
			`{`
			`// compare Y position with the position of the last token`
			`if(theContent.getText().getYPos().length == 0)`
			`{`
			`System.out.println();`
			`break;`
			`}`
			`float dY_new = theContent.getText().getYPos()[0];`
			`if(dY_old == dY_new)`
			`System.out.print(" ");`
			`else if (dY_old != -1.0)`
			`System.out.println();`
			`System.out.print(theContent.getText().getUnicodeString());`
			`// save the Y position`
			`dY_old = dY_new;`
			`}`
			`}`
			`thePDF.close();`
			`} catch (Throwable e) {`
			`e.printStackTrace();`
			`}`
			`}`
			`}`