/****************************************************************************
 *
 * File:         comparser.cpp
 *
 * Usage:        comparser input.pdf
 *
 * Description:  Very simple text extraction through the COM interface.
 *
 * Version:      1.01 (5-April-2004)
 *
 * Author:       Hans Baerfuss, PDF Tools AG
 *
 * Copyright:    Copyright (C) 2004 PDF Tools AG, Switzerland
 *               Permission to use, copy, modify, and distribute this
 *               software and its documentation for any purpose and without
 *               fee is hereby granted, provided that the above copyright
 *               notice appear in all copies and that both that copyright
 *               notice and this permission notice appear in supporting
 *               documentation. This software is provided "as is" without
 *               express or implied warranty.
 *
 ***************************************************************************/

#include "stdafx.h"
#include "pdfparser.h"
#include "pdfparser_i.c"

/**
 * Prints the command-line synopsis and the list of process exit codes.
 *
 * @param argv0  Program name as invoked (argv[0]); echoed in the usage line.
 */
void Usage(char* argv0)
{
    printf("PDF Export API - C++ COM Sample\n"
           "Usage: %s input.pdf\n", argv0);
    printf("Errors: 1 - couldn't open input file\n"
           " 3 - too few arguments\n"
           " 4 - couldn't create COM object\n");
}

/**
 * Walks the objects of one page content and prints every text object
 * to stdout as a "Text: ..." line.
 *
 * The walk stops when GetNextObject reports eNone or fails; objects that are
 * not text, or whose text interface cannot be obtained, are skipped.
 *
 * @param pContent  Content interface of the current page; must not be NULL.
 *                  The caller keeps ownership and releases it.
 */
static void PrintContentText(IPDFContent* pContent)
{
    for (;;)
    {
        // Pre-set to eNone so that a failing call which leaves the
        // out-parameter untouched still terminates the loop.
        TPDFContentObject objectType = eNone;
        HRESULT hr = pContent->GetNextObject(&objectType);
        if (FAILED(hr) || objectType == eNone)
            break;

        if (objectType != eText)
            continue;

        IPDFText* pText = NULL;
        pContent->get_Text(&pText);
        if (!pText)
            continue;   // no text interface available for this object

        BSTR s = NULL;
        pText->get_UnicodeString(&s);
        if (s)
            wprintf(L"Text: %s\r\n", s);
        ::SysFreeString(s);     // safe for NULL
        pText->Release();
    }
}

/**
 * Program entry point: opens the PDF named on the command line and prints
 * the text objects of every page.
 *
 * @return 0 on success,
 *         1 if the input file could not be opened or its page count
 *           could not be read,
 *         3 if too few arguments were given,
 *         4 if COM could not be initialized or the COM object could not
 *           be created.
 */
int main(int argc, char* argv[])
{
    // Check if there are enough arguments.
    if (argc < 2)
    {
        Usage(argv[0]);
        return 3;
    }

    // Initialize COM. On failure CoUninitialize must not be called.
    HRESULT hResult = ::CoInitialize(NULL);
    if (FAILED(hResult))
    {
        fprintf(stderr, "Couldn't initialize COM\n");
        return 4;
    }

    // Create the COM object.
    IPDFDocument* pDocument = NULL;
    hResult = ::CoCreateInstance(CLSID_Document, NULL, CLSCTX_INPROC_SERVER,
                                 IID_IPDFDocument, (void**) &pDocument);
    if (FAILED(hResult) || !pDocument)
    {
        fprintf(stderr, "Couldn't create COM object\n");
        ::CoUninitialize();
        return 4;
    }

    // Open the PDF input document.
    // argv is always narrow (char*), so convert with A2OLE rather than the
    // TCHAR-dependent T2OLE, and hand the callee real BSTRs (length-prefixed)
    // instead of plain OLECHAR pointers.
    USES_CONVERSION;
    VARIANT_BOOL bDone = VARIANT_FALSE;
    BSTR fn = ::SysAllocString(A2OLE(argv[1]));
    BSTR password = ::SysAllocString(OLESTR(""));
    if (fn && password)
        hResult = pDocument->Open(fn, password, &bDone);
    else
        hResult = E_OUTOFMEMORY;
    ::SysFreeString(password);
    ::SysFreeString(fn);

    if (FAILED(hResult) || bDone == 0)
    {
        fprintf(stderr, "Couldn't open input file\n");
        pDocument->Release();
        ::CoUninitialize();
        return 1;
    }

    // Traverse the pages.
    long nPageCount = 0;
    hResult = pDocument->get_PageCount(&nPageCount);
    if (FAILED(hResult))
    {
        fprintf(stderr, "Couldn't read page count of input file\n");
        pDocument->Close();
        pDocument->Release();
        ::CoUninitialize();
        return 1;
    }

    for (long i = 1; i <= nPageCount; i++)
    {
        printf("Page no. %ld\n", i);

        // Select the page; skip it if the document refuses the page number.
        hResult = pDocument->put_PageNo(i);
        if (FAILED(hResult))
        {
            fprintf(stderr, "Couldn't select page %ld\n", i);
            continue;
        }

        // Get the page interface.
        IPDFPage* pPage = NULL;
        pDocument->get_Page(&pPage);
        if (!pPage)
            continue;

        // Get the content interface and extract its text.
        IPDFContent* pContent = NULL;
        pPage->get_Content(&pContent);
        if (pContent)
        {
            PrintContentText(pContent);
            pContent->Release();
        }

        // Always release the page, even when it had no content.
        pPage->Release();
    }

    // Close the document.
    hResult = pDocument->Close();

    // Release the document object.
    pDocument->Release();

    // Uninitialize COM.
    ::CoUninitialize();
    return 0;
}