csharp_pain/Scraping/COM/samples/CPP/textext/textext.cpp

138 lines
3.6 KiB
C++
Raw Permalink Normal View History

2014-06-26 15:13:46 +00:00
/****************************************************************************
*
* File: textext.cpp
*
* Usage: textext input.pdf output.txt
*
* Description: Extracts Text of a PDF document and prints it to a
* unicode file
*
* Version: 1.03 (12-July-2005)
*
* Author: Philip Renggli, PDF Tools AG
*
* Copyright: Copyright (C) 2005 PDF Tools AG, Switzerland
* Permission to use, copy, modify, and distribute this
* software and its documentation for any purpose and without
* fee is hereby granted, provided that the above copyright
* notice appear in all copies and that both that copyright
* notice and this permission notice appear in supporting
* documentation. This software is provided "as is" without
* express or implied warranty.
*
***************************************************************************/
#include <stdio.h>
#include <windows.h>
#include <fcntl.h>
#include <stdlib.h>
#include <io.h>
#include "expa_c.h"
void print(const WCHAR* wzStr, FILE* out)
{
while(*wzStr)
{
if (*wzStr == '\n')
{
fputc('\r', out);
fputc(0, out);
}
fputc(*wzStr % 256, out);
fputc(*wzStr / 256, out);
wzStr++;
}
}
/*
 * Entry point: textext input.pdf output.txt
 *
 * Opens the PDF named by argv[1], walks every page, and writes each
 * non-empty text token to the file named by argv[2] using print(), preceded
 * by the two bytes 0xFF 0xFE. A line break is written whenever the Y
 * coordinate of a token differs from that of the previous token on the page;
 * every token is followed by a single space.
 *
 * Exit codes:
 *   0  success
 *   1  the PDF could not be opened
 *   2  the output file could not be created
 *   3  too few command line arguments
 *   4  an error occurred while writing or closing the output file
 */
int main(int argc, char* argv[])
{
    TExpaDocument pDocument;
    TExpaContent pContent;
    TExpaText pText;
    int iPage;
    int exitCode = 0;
    int writeFailed;
    const unsigned short* sText;
    FILE* out;

    if (argc < 3)
    {
        fprintf(stderr, "Usage: textext input.pdf output.txt\n");
        return 3;
    }

    ExpaInitialize();

    /* Create the object */
    pDocument = ExpaCreateObject();

    /* Open the document */
    if (!ExpaDocOpen(pDocument, argv[1], ""))
    {
        fprintf(stderr, "error opening PDF file %s...\n", argv[1]);
        exitCode = 1;
        /* nothing was opened, so skip ExpaDocClose but still release the object */
        goto destroy_object;
    }

    /* Create a Unicode output file */
    out = fopen(argv[2], "wb");
    if (!out)
    {
        fprintf(stderr, "Couldn't create file %s\n", argv[2]);
        exitCode = 2;
        goto close_document;
    }

    /*
     * Kept from the original: switch stdout to binary mode.
     * NOTE(review): the text is written to `out`, which is already opened
     * with "wb", so this call looks unnecessary -- confirm before removing.
     */
    _setmode(_fileno(stdout), _O_BINARY);

    /* Byte order mark: 0xFF 0xFE */
    fputs("\377\376", out);

    /* loop through all pages and extract their text */
    for (iPage = 1; iPage <= ExpaDocGetPageCount(pDocument); iPage++)
    {
        float y;

        /* set the page number */
        ExpaDocSetPageNo(pDocument, iPage);

        /* get the content */
        pContent = ExpaPageGetContent(ExpaDocGetPage(pDocument));

        /* extract words */
        ExpaContentResetContent(pContent, 1);
        ExpaContentSetBreakWords(pContent, true);

        /*
         * Start value for the "previous Y" of each page; the first token
         * triggers a line break unless its Y coordinate is exactly -1.
         */
        y = -1;

        while ((pText = ExpaContentGetNextText(pContent)))
        {
            TPDFVector* pos;

            /* skip empty tokens */
            if (!ExpaTextGetStringLength(pText))
                continue;

            /* start a new line if the Y coordinate changed (exact comparison) */
            pos = ExpaTextGetPosition(pText);
            if (pos[0].y != y)
                print(L"\n", out);

            /* write the text token */
            sText = ExpaTextGetUnicodeString(pText);
            if (sText)
                print(sText, out);

            /* separate tokens with a space */
            print(L" ", out);

            /* save the Y position */
            y = pos[0].y;
        }
    }

    /*
     * Close the text file. The stream error flag has to be read before
     * fclose(); fclose() itself can fail while flushing buffered data.
     */
    writeFailed = ferror(out);
    if (fclose(out) != 0)
        writeFailed = 1;
    if (writeFailed)
    {
        fprintf(stderr, "Error writing file %s\n", argv[2]);
        exitCode = 4;
    }

close_document:
    /* Close and destroy the object */
    ExpaDocClose(pDocument);
destroy_object:
    ExpaDestroyObject(pDocument);
    ExpaUnInitialize();
    return exitCode;
}