138 lines
4.1 KiB
C++
138 lines
4.1 KiB
C++
|
/****************************************************************************
 *
 * File: comparser.cpp
 *
 * Usage: comparser input.pdf
 *
 * Description: Very simple text extraction through the COM interface.
 *
 * Version: 1.01 (5-April-2004)
 *
 * Author: Hans Baerfuss, PDF Tools AG
 *
 * Copyright: Copyright (C) 2004 PDF Tools AG, Switzerland
 * Permission to use, copy, modify, and distribute this
 * software and its documentation for any purpose and without
 * fee is hereby granted, provided that the above copyright
 * notice appear in all copies and that both that copyright
 * notice and this permission notice appear in supporting
 * documentation. This software is provided "as is" without
 * express or implied warranty.
 *
 ***************************************************************************/
|
||
|
|
||
|
#include "stdafx.h"
|
||
|
#include "pdfparser.h"
|
||
|
#include "pdfparser_i.c"
|
||
|
|
||
|
// Prints the program banner, the command-line synopsis and the list of
// exit codes to standard output.
//
// argv0: program name as invoked; substituted into the synopsis line.
void Usage(char* argv0)
{
  // Banner, synopsis and exit-code table are emitted with a single call;
  // only the synopsis line contains a conversion (%s for the program name).
  static const char kHelpFormat[] =
    "PDF Export API - C++ COM Sample\n"
    "Usage: %s input.pdf\n"
    "Errors: 1 - couldn't open input file\n"
    " 3 - too few arguments\n"
    " 4 - couldn't create COM object\n";

  printf(kHelpFormat, argv0);
}
|
||
|
|
||
|
int main(int argc, char* argv[])
|
||
|
{
|
||
|
// Check if there are enough arguments.
|
||
|
if (argc < 2)
|
||
|
{
|
||
|
Usage(argv[0]);
|
||
|
return 3;
|
||
|
}
|
||
|
|
||
|
// Initialize COM.
|
||
|
::CoInitialize(NULL);
|
||
|
|
||
|
// Create the COM object.
|
||
|
IPDFDocument* pDocument = NULL;
|
||
|
HRESULT hResult = ::CoCreateInstance(CLSID_Document, NULL, CLSCTX_INPROC_SERVER, IID_IPDFDocument, (void**) &pDocument);
|
||
|
if (FAILED(hResult))
|
||
|
{
|
||
|
fprintf(stderr, "Couldn't create COM object\n");
|
||
|
::CoUninitialize();
|
||
|
return 4;
|
||
|
}
|
||
|
|
||
|
// Open the PDF input document.
|
||
|
USES_CONVERSION;
|
||
|
VARIANT_BOOL bDone;
|
||
|
BSTR fn = T2OLE(argv[1]);
|
||
|
BSTR password = OLESTR("");
|
||
|
hResult = pDocument->Open(fn, password, &bDone);
|
||
|
if (FAILED(hResult) || bDone == 0)
|
||
|
{
|
||
|
fprintf(stderr, "Couldn't open input file\n");
|
||
|
pDocument->Release();
|
||
|
::CoUninitialize();
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
// Traverse the pages.
|
||
|
long nPageCount;
|
||
|
hResult = pDocument->get_PageCount(&nPageCount);
|
||
|
for (long i = 1; i <= nPageCount; i++)
|
||
|
{
|
||
|
printf("Page no. %d\n", i);
|
||
|
hResult = pDocument->put_PageNo(i);
|
||
|
|
||
|
// Get the page interface.
|
||
|
IPDFPage* pPage = NULL;
|
||
|
hResult = pDocument->get_Page(&pPage);
|
||
|
if (!pPage)
|
||
|
continue;
|
||
|
|
||
|
// Get the content interface.
|
||
|
IPDFContent* pContent = NULL;
|
||
|
pPage->get_Content(&pContent);
|
||
|
if (!pContent)
|
||
|
continue;
|
||
|
|
||
|
// Traverse the objects in the page content.
|
||
|
for (;;)
|
||
|
{
|
||
|
TPDFContentObject iResult;
|
||
|
hResult = pContent->GetNextObject(&iResult);
|
||
|
switch (iResult)
|
||
|
{
|
||
|
// Exit when there are no more objects.
|
||
|
case eNone:
|
||
|
goto _exitloop;
|
||
|
|
||
|
// Extract the text.
|
||
|
case eText:
|
||
|
{
|
||
|
IPDFText* pText = NULL;
|
||
|
hResult = pContent->get_Text(&pText);
|
||
|
|
||
|
BSTR s = NULL;
|
||
|
hResult = pText->get_UnicodeString(&s);
|
||
|
if (s)
|
||
|
wprintf(L"Text: %s\r\n", s);
|
||
|
::SysFreeString(s);
|
||
|
pText->Release();
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
_exitloop:
|
||
|
|
||
|
// Release the content and page object.
|
||
|
pContent->Release();
|
||
|
pPage->Release();
|
||
|
}
|
||
|
|
||
|
// Close the document.
|
||
|
hResult = pDocument->Close();
|
||
|
|
||
|
// Release the document object.
|
||
|
pDocument->Release();
|
||
|
|
||
|
// Uninitialize COM.
|
||
|
::CoUninitialize();
|
||
|
return 0;
|
||
|
}
|
||
|
|