/**************************************************************************** * * File: textext.cpp * * Usage: textext PDF-input * * Description: Extracts Text of a PDF document and prints it to a * unicode file * * Version: 1.03 (12-July-2005) * * Author: Philip Renggli, PDF Tools AG * * Copyright: Copyright (C) 2005 PDF Tools AG, Switzerland * Permission to use, copy, modify, and distribute this * software and its documentation for any purpose and without * fee is hereby granted, provided that the above copyright * notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting * documentation. This software is provided "as is" without * express or implied warranty. * ***************************************************************************/ #include #include #include #include #include #include "expa_c.h" void print(const WCHAR* wzStr, FILE* out) { while(*wzStr) { if (*wzStr == '\n') { fputc('\r', out); fputc(0, out); } fputc(*wzStr % 256, out); fputc(*wzStr / 256, out); wzStr++; } } int main(int argc, char* argv[]) { TExpaDocument pDocument; TExpaContent pContent; TExpaText pText; int iPage; const unsigned short* sText; FILE* out; if (argc < 3) { printf("Usage: textext input.pdf output.txt"); return 3; } ExpaInitialize(); /* Create the object */ pDocument = ExpaCreateObject(); /* Open the document */ if (!ExpaDocOpen(pDocument, argv[1], "")) { printf("error opening PDF file %s...\n",argv[1]); return 1; } /* Create a Unicode Output file */ out = fopen(argv[2], "wb"); if (!out){ fprintf(stderr, "Couldn't create file %s\n", argv[2]); return 2; } //#ifdef WIN32 _setmode(_fileno(stdout), _O_BINARY); //#endif fprintf(out, "\377\376"); /* loop through all page and render them */ for(iPage = 1; iPage <= ExpaDocGetPageCount(pDocument); iPage++) { float y; /* set the page number */ ExpaDocSetPageNo(pDocument, iPage); /* get the content */ pContent = ExpaPageGetContent(ExpaDocGetPage(pDocument)); /* extract words */ ExpaContentResetContent(pContent, 1); ExpaContentSetBreakWords(pContent, true); y = -1; while (pText = ExpaContentGetNextText(pContent)) { int nLength; nLength = ExpaTextGetStringLength(pText); if (nLength) { TPDFVector* pos; /* print the text, add newline if the Y coordinate changes. */ pos = ExpaTextGetPosition(pText); if (pos[0].y != y) print(L"\n", out); /* write the text token */ sText = ExpaTextGetUnicodeString(pText); if (sText) print(sText, out); /* add a line break or a space */ print(L" ", out); /* save the Y position */ y = pos[0].y; } } } /* Close the text file */ fclose(out); /* Close and Destroy the object */ ExpaDocClose(pDocument); ExpaDestroyObject(pDocument); ExpaUnInitialize(); return 0; }