csharp_pain/Scraping/COM/samples/CPP/comparser/comparser.cpp

138 lines
4.1 KiB
C++
Raw Permalink Normal View History

2014-06-26 15:13:46 +00:00
/****************************************************************************
*
* File: comparser.cpp
*
* Usage: comparser input.pdf
*
* Description: Very simple text extraction through the COM interface.
*
* Version: 1.01 (5-April-2004)
*
* Author: Hans Baerfuss, PDF Tools AG
*
* Copyright: Copyright (C) 2004 PDF Tools AG, Switzerland
* Permission to use, copy, modify, and distribute this
* software and its documentation for any purpose and without
* fee is hereby granted, provided that the above copyright
* notice appear in all copies and that both that copyright
* notice and this permission notice appear in supporting
* documentation. This software is provided "as is" without
* express or implied warranty.
*
***************************************************************************/
#include "stdafx.h"
#include "pdfparser.h"
#include "pdfparser_i.c"
// Prints the command-line synopsis followed by the list of process exit
// codes to stdout.
//   argv0 - program name substituted into the "Usage:" line.
void Usage(char* argv0)
{
    // Banner and synopsis.
    printf("PDF Export API - C++ COM Sample\n");
    printf("Usage: %s input.pdf\n", argv0);

    // Exit code legend, one code per line.
    printf("Errors: 1 - couldn't open input file\n");
    printf(" 3 - too few arguments\n");
    printf(" 4 - couldn't create COM object\n");
}
int main(int argc, char* argv[])
{
// Check if there are enough arguments.
if (argc < 2)
{
Usage(argv[0]);
return 3;
}
// Initialize COM.
::CoInitialize(NULL);
// Create the COM object.
IPDFDocument* pDocument = NULL;
HRESULT hResult = ::CoCreateInstance(CLSID_Document, NULL, CLSCTX_INPROC_SERVER, IID_IPDFDocument, (void**) &pDocument);
if (FAILED(hResult))
{
fprintf(stderr, "Couldn't create COM object\n");
::CoUninitialize();
return 4;
}
// Open the PDF input document.
USES_CONVERSION;
VARIANT_BOOL bDone;
BSTR fn = T2OLE(argv[1]);
BSTR password = OLESTR("");
hResult = pDocument->Open(fn, password, &bDone);
if (FAILED(hResult) || bDone == 0)
{
fprintf(stderr, "Couldn't open input file\n");
pDocument->Release();
::CoUninitialize();
return 1;
}
// Traverse the pages.
long nPageCount;
hResult = pDocument->get_PageCount(&nPageCount);
for (long i = 1; i <= nPageCount; i++)
{
printf("Page no. %d\n", i);
hResult = pDocument->put_PageNo(i);
// Get the page interface.
IPDFPage* pPage = NULL;
hResult = pDocument->get_Page(&pPage);
if (!pPage)
continue;
// Get the content interface.
IPDFContent* pContent = NULL;
pPage->get_Content(&pContent);
if (!pContent)
continue;
// Traverse the objects in the page content.
for (;;)
{
TPDFContentObject iResult;
hResult = pContent->GetNextObject(&iResult);
switch (iResult)
{
// Exit when there are no more objects.
case eNone:
goto _exitloop;
// Extract the text.
case eText:
{
IPDFText* pText = NULL;
hResult = pContent->get_Text(&pText);
BSTR s = NULL;
hResult = pText->get_UnicodeString(&s);
if (s)
wprintf(L"Text: %s\r\n", s);
::SysFreeString(s);
pText->Release();
}
break;
}
}
_exitloop:
// Release the content and page object.
pContent->Release();
pPage->Release();
}
// Close the document.
hResult = pDocument->Close();
// Release the document object.
pDocument->Release();
// Uninitialize COM.
::CoUninitialize();
return 0;
}