// csharp_pain/Scraping/COM/samples/Java/TextExt.java
// 73 lines, 2.6 KiB, Java
// 2014-06-26 15:13:46 +00:00
/****************************************************************************
*
* File: TextExt.java
*
* Usage: java TextExt PDF-InputFile
*
* Description: Prints out the text of PDF-InputFile.
*
* Version: 1.03 (18-November-2005)
*
* Author: Philip Renggli, PDF Tools AG
*
* Copyright: Copyright (C) 2005 PDF Tools AG, Switzerland
* Permission to use, copy, modify, and distribute this
* software and its documentation for any purpose and without
* fee is hereby granted, provided that the above copyright
* notice appear in all copies and that both that copyright
* notice and this permission notice appear in supporting
* documentation. This software is provided "as is" without
* express or implied warranty.
*
***************************************************************************/
import com.pdftools.expa.*;
/**
 * Command-line sample that prints the text of a PDF file to standard output,
 * one page after the other, using the {@code com.pdftools.expa} API.
 *
 * <p>Usage: {@code java TextExt PDF-InputFile}
 */
public class TextExt {

    /**
     * Opens the PDF named by {@code args[0]} and prints its text tokens page by page.
     *
     * <p>Each page starts with a {@code ** Begin Page n of m **} header line.
     * Consecutive tokens whose first Y position is equal are printed on the
     * same output line separated by a single space; a change of Y position
     * starts a new output line. Any failure is reported as a stack trace on
     * standard error; the method itself does not throw.
     *
     * @param args command-line arguments; {@code args[0]} is the input PDF file
     */
    public static void main(String[] args) {
        // Without an input file there is nothing to do: show the usage line
        // instead of failing with an ArrayIndexOutOfBoundsException.
        if (args == null || args.length == 0) {
            System.err.println("Usage: java TextExt PDF-InputFile");
            return;
        }

        try {
            // open input file; the second argument is passed as an empty string
            // (presumably the document password - confirm against the API docs)
            Document thePDF = new Document(args[0], "");
            try {
                // loop through all pages (page numbers are 1-based here)
                for (int curPage = 1; curPage <= thePDF.getPageCount(); curPage++) {
                    // set the page number
                    thePDF.setPageNo(curPage);

                    // get the content
                    Content theContent = thePDF.getPage().getContent();

                    // NOTE(review): resetContent is called with true and then
                    // immediately with false; kept exactly as in the original
                    // sample. Confirm against the API docs whether both calls
                    // are intended or the second one overrides the first.
                    theContent.resetContent(true);
                    theContent.resetContent(false);

                    System.out.println("** Begin Page " + curPage + " of " + thePDF.getPageCount() + " **");

                    // true while a token has been printed on this page and its
                    // output line has not been terminated yet
                    boolean lineOpen = false;
                    // Y position of the previously printed token; only
                    // meaningful while lineOpen is true
                    float lastY = 0.0f;

                    // loop through all text tokens on the page
                    while (theContent.getNextText() != null) {
                        // a token without any Y position ends the page output
                        if (theContent.getText().getYPos().length == 0) {
                            System.out.println();
                            lineOpen = false;
                            break;
                        }

                        // compare Y position with the position of the last token
                        float curY = theContent.getText().getYPos()[0];
                        if (lineOpen) {
                            if (lastY == curY) {
                                System.out.print(" ");
                            } else {
                                System.out.println();
                            }
                        }
                        System.out.print(theContent.getText().getUnicodeString());

                        // save the Y position
                        lastY = curY;
                        lineOpen = true;
                    }

                    // terminate the last text line so the next page header
                    // starts on a line of its own
                    if (lineOpen) {
                        System.out.println();
                    }
                }
            } finally {
                // release the document even when extraction fails half-way
                thePDF.close();
            }
        } catch (Throwable e) {
            // Deliberately broad: this is the top-level boundary of a sample
            // program, and every failure is reported the same way.
            e.printStackTrace();
        }
    }
}