/****************************************************************************
 *
 * File:         TextExt.java
 *
 * Usage:        java TextExt PDF-InputFile
 *
 * Description:  Prints out the text of PDF-InputFile.
 *
 * Version:      1.03 (18-November-2005)
 *
 * Author:       Philip Renggli, PDF Tools AG
 *
 * Copyright:    Copyright (C) 2005 PDF Tools AG, Switzerland
 *               Permission to use, copy, modify, and distribute this
 *               software and its documentation for any purpose and without
 *               fee is hereby granted, provided that the above copyright
 *               notice appear in all copies and that both that copyright
 *               notice and this permission notice appear in supporting
 *               documentation. This software is provided "as is" without
 *               express or implied warranty.
 *
 ***************************************************************************/
|
||
|
|
||
|
import com.pdftools.expa.*;
|
||
|
|
||
|
public class TextExt {
|
||
|
|
||
|
public static void main(String[] args) {
|
||
|
try {
|
||
|
// open input file
|
||
|
Document thePDF = new Document(args[0], "");
|
||
|
|
||
|
// loop through all pages
|
||
|
for(int curPage = 1; curPage <= thePDF.getPageCount(); curPage++)
|
||
|
{
|
||
|
// set the page number
|
||
|
thePDF.setPageNo(curPage);
|
||
|
|
||
|
// get the content
|
||
|
Content theContent = thePDF.getPage().getContent();
|
||
|
|
||
|
// extract words
|
||
|
theContent.resetContent(true);
|
||
|
theContent.resetContent(false);
|
||
|
|
||
|
// loop through all text tokens on the page
|
||
|
System.out.println("** Begin Page " + curPage + " of " + thePDF.getPageCount() + " **");
|
||
|
double dY_old = -1.0;
|
||
|
while(theContent.getNextText() != null)
|
||
|
{
|
||
|
// compare Y position with the position of the last token
|
||
|
if(theContent.getText().getYPos().length == 0)
|
||
|
{
|
||
|
System.out.println();
|
||
|
break;
|
||
|
}
|
||
|
float dY_new = theContent.getText().getYPos()[0];
|
||
|
if(dY_old == dY_new)
|
||
|
System.out.print(" ");
|
||
|
else if (dY_old != -1.0)
|
||
|
System.out.println();
|
||
|
System.out.print(theContent.getText().getUnicodeString());
|
||
|
// save the Y position
|
||
|
dY_old = dY_new;
|
||
|
}
|
||
|
}
|
||
|
thePDF.close();
|
||
|
} catch (Throwable e) {
|
||
|
e.printStackTrace();
|
||
|
}
|
||
|
}
|
||
|
}
|