137 lines
3.6 KiB
C++
137 lines
3.6 KiB
C++
/****************************************************************************
|
|
*
|
|
* File: textext.cpp
|
|
*
|
|
* Usage: textext input.pdf output.txt
|
|
*
|
|
* Description: Extracts Text of a PDF document and prints it to a
|
|
* unicode file
|
|
*
|
|
* Version: 1.03 (12-July-2005)
|
|
*
|
|
* Author: Philip Renggli, PDF Tools AG
|
|
*
|
|
* Copyright: Copyright (C) 2005 PDF Tools AG, Switzerland
|
|
* Permission to use, copy, modify, and distribute this
|
|
* software and its documentation for any purpose and without
|
|
* fee is hereby granted, provided that the above copyright
|
|
* notice appear in all copies and that both that copyright
|
|
* notice and this permission notice appear in supporting
|
|
* documentation. This software is provided "as is" without
|
|
* express or implied warranty.
|
|
*
|
|
***************************************************************************/
|
|
|
|
#include <stdio.h>
|
|
#include <windows.h>
|
|
#include <fcntl.h>
|
|
#include <stdlib.h>
|
|
#include <io.h>
|
|
#include "expa_c.h"
|
|
|
|
void print(const WCHAR* wzStr, FILE* out)
|
|
{
|
|
while(*wzStr)
|
|
{
|
|
if (*wzStr == '\n')
|
|
{
|
|
fputc('\r', out);
|
|
fputc(0, out);
|
|
}
|
|
fputc(*wzStr % 256, out);
|
|
fputc(*wzStr / 256, out);
|
|
wzStr++;
|
|
}
|
|
}
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
TExpaDocument pDocument;
|
|
TExpaContent pContent;
|
|
TExpaText pText;
|
|
int iPage;
|
|
const unsigned short* sText;
|
|
FILE* out;
|
|
|
|
if (argc < 3)
|
|
{
|
|
printf("Usage: textext input.pdf output.txt");
|
|
return 3;
|
|
}
|
|
|
|
ExpaInitialize();
|
|
|
|
/* Create the object */
|
|
pDocument = ExpaCreateObject();
|
|
|
|
/* Open the document */
|
|
if (!ExpaDocOpen(pDocument, argv[1], ""))
|
|
{
|
|
printf("error opening PDF file %s...\n",argv[1]);
|
|
return 1;
|
|
}
|
|
|
|
/* Create a Unicode Output file */
|
|
out = fopen(argv[2], "wb");
|
|
if (!out){
|
|
fprintf(stderr, "Couldn't create file %s\n", argv[2]);
|
|
return 2;
|
|
}
|
|
//#ifdef WIN32
|
|
_setmode(_fileno(stdout), _O_BINARY);
|
|
//#endif
|
|
fprintf(out, "\377\376");
|
|
|
|
/* loop through all page and render them */
|
|
for(iPage = 1; iPage <= ExpaDocGetPageCount(pDocument); iPage++)
|
|
{
|
|
float y;
|
|
|
|
/* set the page number */
|
|
ExpaDocSetPageNo(pDocument, iPage);
|
|
|
|
/* get the content */
|
|
pContent = ExpaPageGetContent(ExpaDocGetPage(pDocument));
|
|
|
|
/* extract words */
|
|
ExpaContentResetContent(pContent, 1);
|
|
ExpaContentSetBreakWords(pContent, true);
|
|
|
|
y = -1;
|
|
while (pText = ExpaContentGetNextText(pContent))
|
|
{
|
|
int nLength;
|
|
|
|
nLength = ExpaTextGetStringLength(pText);
|
|
if (nLength)
|
|
{
|
|
TPDFVector* pos;
|
|
|
|
/* print the text, add newline if the Y coordinate changes. */
|
|
pos = ExpaTextGetPosition(pText);
|
|
if (pos[0].y != y)
|
|
print(L"\n", out);
|
|
|
|
/* write the text token */
|
|
sText = ExpaTextGetUnicodeString(pText);
|
|
if (sText)
|
|
print(sText, out);
|
|
|
|
/* add a line break or a space */
|
|
print(L" ", out);
|
|
|
|
/* save the Y position */
|
|
y = pos[0].y;
|
|
}
|
|
}
|
|
}
|
|
/* Close the text file */
|
|
fclose(out);
|
|
|
|
/* Close and Destroy the object */
|
|
ExpaDocClose(pDocument);
|
|
ExpaDestroyObject(pDocument);
|
|
ExpaUnInitialize();
|
|
return 0;
|
|
}
|
|
|