/****************************************************************************
 *
 * Project:     TextExt, C#.NET
 *
 * Description: Extracts text of a page of a PDF document using the
 *              3-Heights PDF Extraction Tool.
 *
 * Version:     1.01 (5-October-2005)
 *
 * Author:      Philip Renggli, PDF Tools AG
 *
 * Copyright:   Copyright (C) 2005 PDF Tools AG, Switzerland
 *              Permission to use, copy, modify, and distribute this
 *              software and its documentation for any purpose and without
 *              fee is hereby granted, provided that the above copyright
 *              notice appear in all copies and that both that copyright
 *              notice and this permission notice appear in supporting
 *              documentation. This software is provided "as is" without
 *              express or implied warranty.
 *
 ***************************************************************************/

using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;

namespace TextExt
{
    /// <summary>
    /// Main window of the sample: lets the user pick a PDF file and a page
    /// number, then shows the text extracted from that page in a text box.
    /// </summary>
    public class Form1 : System.Windows.Forms.Form
    {
        private System.Windows.Forms.Button Browse;
        private System.Windows.Forms.TextBox txtInput;
        private System.Windows.Forms.Button ExtractText;
        private AxMSComDlg.AxCommonDialog axCommonDialog;
        private System.Windows.Forms.Label label1;
        private System.Windows.Forms.Label label2;
        private System.Windows.Forms.TextBox txtOutput;
        private System.Windows.Forms.TextBox txtPageNo;
        private System.Windows.Forms.Panel panel1;
        private System.Windows.Forms.Label label3;
        private System.Windows.Forms.Label label4;
        private System.Windows.Forms.Label label5;
        private System.ComponentModel.Container components = null;

        /// <summary>
        /// Creates the form and builds its controls.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Disposes the component container (if any) and then the form itself.
        /// </summary>
        /// <param name="disposing">
        /// True when called from an explicit Dispose; only then is the
        /// component container disposed.
        /// </param>
        protected override void Dispose( bool disposing )
        {
            if( disposing )
            {
                if (components != null)
                {
                    components.Dispose();
                }
            }
            base.Dispose( disposing );
        }

        #region Windows Form Designer generated code
        /// <summary>
        /// Required method for Designer support.
        /// Do not modify the contents of this method with the code editor.
        /// </summary>
        private void InitializeComponent()
        {
            System.Resources.ResourceManager resources = new System.Resources.ResourceManager(typeof(Form1));
            this.Browse = new System.Windows.Forms.Button();
            this.txtInput = new System.Windows.Forms.TextBox();
            this.ExtractText = new System.Windows.Forms.Button();
            this.axCommonDialog = new AxMSComDlg.AxCommonDialog();
            this.txtOutput = new System.Windows.Forms.TextBox();
            this.label1 = new System.Windows.Forms.Label();
            this.txtPageNo = new System.Windows.Forms.TextBox();
            this.label2 = new System.Windows.Forms.Label();
            this.panel1 = new System.Windows.Forms.Panel();
            this.label3 = new System.Windows.Forms.Label();
            this.label4 = new System.Windows.Forms.Label();
            this.label5 = new System.Windows.Forms.Label();
            ((System.ComponentModel.ISupportInitialize)(this.axCommonDialog)).BeginInit();
            this.SuspendLayout();
            //
            // Browse
            //
            this.Browse.BackColor = System.Drawing.SystemColors.ControlLight;
            this.Browse.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((System.Byte)(0)));
            this.Browse.ForeColor = System.Drawing.Color.Black;
            this.Browse.Location = new System.Drawing.Point(472, 488);
            this.Browse.Name = "Browse";
            this.Browse.Size = new System.Drawing.Size(24, 24);
            this.Browse.TabIndex = 8;
            this.Browse.Text = "...";
            this.Browse.Click += new System.EventHandler(this.Browse_Click);
            //
            // txtInput
            //
            this.txtInput.Location = new System.Drawing.Point(120, 488);
            this.txtInput.Name = "txtInput";
            this.txtInput.Size = new System.Drawing.Size(344, 20);
            this.txtInput.TabIndex = 7;
            this.txtInput.Text = "";
            //
            // ExtractText
            //
            this.ExtractText.BackColor = System.Drawing.SystemColors.ControlLight;
            this.ExtractText.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((System.Byte)(0)));
            this.ExtractText.ForeColor = System.Drawing.Color.Black;
            this.ExtractText.Location = new System.Drawing.Point(8, 544);
            this.ExtractText.Name = "ExtractText";
            this.ExtractText.Size = new System.Drawing.Size(96, 24);
            this.ExtractText.TabIndex = 6;
            this.ExtractText.Text = "Extract Text";
            this.ExtractText.Click += new System.EventHandler(this.ExtractText_Click);
            //
            // axCommonDialog
            //
            this.axCommonDialog.Enabled = true;
            this.axCommonDialog.Location = new System.Drawing.Point(464, 520);
            this.axCommonDialog.Name = "axCommonDialog";
            this.axCommonDialog.OcxState = ((System.Windows.Forms.AxHost.State)(resources.GetObject("axCommonDialog.OcxState")));
            this.axCommonDialog.Size = new System.Drawing.Size(32, 32);
            this.axCommonDialog.TabIndex = 9;
            //
            // txtOutput
            //
            this.txtOutput.Location = new System.Drawing.Point(8, 32);
            this.txtOutput.MaxLength = 200000;
            this.txtOutput.Multiline = true;
            this.txtOutput.Name = "txtOutput";
            this.txtOutput.ScrollBars = System.Windows.Forms.ScrollBars.Both;
            this.txtOutput.Size = new System.Drawing.Size(496, 448);
            this.txtOutput.TabIndex = 10;
            this.txtOutput.Text = "";
            //
            // label1
            //
            this.label1.BackColor = System.Drawing.Color.FromArgb(((System.Byte)(122)), ((System.Byte)(182)), ((System.Byte)(215)));
            this.label1.Font = new System.Drawing.Font("Verdana", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((System.Byte)(0)));
            this.label1.ForeColor = System.Drawing.Color.FromArgb(((System.Byte)(0)), ((System.Byte)(102)), ((System.Byte)(153)));
            this.label1.Location = new System.Drawing.Point(0, 488);
            this.label1.Name = "label1";
            this.label1.Size = new System.Drawing.Size(112, 18);
            this.label1.TabIndex = 11;
            this.label1.Text = " FILE NAME";
            //
            // txtPageNo
            //
            this.txtPageNo.Location = new System.Drawing.Point(120, 512);
            this.txtPageNo.Name = "txtPageNo";
            this.txtPageNo.Size = new System.Drawing.Size(40, 20);
            this.txtPageNo.TabIndex = 12;
            this.txtPageNo.Text = "1";
            //
            // label2
            //
            this.label2.BackColor = System.Drawing.Color.FromArgb(((System.Byte)(122)), ((System.Byte)(182)), ((System.Byte)(215)));
            this.label2.Font = new System.Drawing.Font("Verdana", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((System.Byte)(0)));
            this.label2.ForeColor = System.Drawing.Color.FromArgb(((System.Byte)(0)), ((System.Byte)(102)), ((System.Byte)(153)));
            this.label2.Location = new System.Drawing.Point(0, 512);
            this.label2.Name = "label2";
            this.label2.Size = new System.Drawing.Size(112, 18);
            this.label2.TabIndex = 13;
            this.label2.Text = " PAGE NUMBER";
            //
            // panel1
            //
            this.panel1.BackColor = System.Drawing.Color.FromArgb(((System.Byte)(174)), ((System.Byte)(209)), ((System.Byte)(226)));
            this.panel1.Location = new System.Drawing.Point(-8, 0);
            this.panel1.Name = "panel1";
            this.panel1.Size = new System.Drawing.Size(120, 624);
            this.panel1.TabIndex = 15;
            //
            // label3
            //
            this.label3.BackColor = System.Drawing.Color.FromArgb(((System.Byte)(255)), ((System.Byte)(153)), ((System.Byte)(102)));
            this.label3.Font = new System.Drawing.Font("Verdana", 9F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((System.Byte)(0)));
            this.label3.ForeColor = System.Drawing.Color.White;
            this.label3.Location = new System.Drawing.Point(120, 8);
            this.label3.Name = "label3";
            this.label3.Size = new System.Drawing.Size(384, 19);
            this.label3.TabIndex = 16;
            this.label3.Text = " 3-Heights PDF Extract Tool - Extract Text Sample";
            //
            // label4
            //
            this.label4.Location = new System.Drawing.Point(120, 544);
            this.label4.Name = "label4";
            this.label4.Size = new System.Drawing.Size(384, 16);
            this.label4.TabIndex = 17;
            this.label4.Text = "This is a text extraction sample for the 3-Heights PDF Extract Tool.";
            //
            // label5
            //
            this.label5.Location = new System.Drawing.Point(120, 560);
            this.label5.Name = "label5";
            this.label5.Size = new System.Drawing.Size(384, 16);
            this.label5.TabIndex = 18;
            this.label5.Text = "Copyright (C) 2005 PDF Tools AG, Switzerland";
            //
            // Form1
            //
            this.AutoScaleBaseSize = new System.Drawing.Size(5, 13);
            this.BackColor = System.Drawing.Color.White;
            this.ClientSize = new System.Drawing.Size(512, 581);
            this.Controls.AddRange(new System.Windows.Forms.Control[] {
                this.label5,
                this.label4,
                this.label3,
                this.txtPageNo,
                this.txtInput,
                this.label2,
                this.label1,
                this.txtOutput,
                this.axCommonDialog,
                this.Browse,
                this.ExtractText,
                this.panel1});
            this.Name = "Form1";
            ((System.ComponentModel.ISupportInitialize)(this.axCommonDialog)).EndInit();
            this.ResumeLayout(false);
        }
        #endregion

        /// <summary>
        /// Application entry point: runs the message loop with a new Form1.
        /// </summary>
        [STAThread]
        static void Main()
        {
            Application.Run(new Form1());
        }

        /// <summary>
        /// Shows the common "open file" dialog, pre-set to the current input
        /// path, and copies the dialog's file name back into the input box.
        /// </summary>
        private void Browse_Click(object sender, System.EventArgs e)
        {
            axCommonDialog.FileName = txtInput.Text;
            axCommonDialog.ShowOpen();
            txtInput.Text = axCommonDialog.FileName;
        }

        /// <summary>
        /// Opens the document named in the input box, extracts the text of the
        /// requested page and writes it to the output box. Tokens that share a
        /// y-coordinate with the previous token are separated by a blank;
        /// a change of y-coordinate starts a new line.
        /// </summary>
        private void ExtractText_Click(object sender, System.EventArgs e)
        {
            PDFPARSERLib.DocumentClass document = new PDFPARSERLib.DocumentClass();
            if (!document.Open(txtInput.Text, ""))
            {
                ShowWarning("The file could not be opened.");
                return;
            }

            try
            {
                // clear the text control
                txtOutput.Text = "";

                // check the page range; invalid text is reported, not thrown
                int iPageNo;
                if (!TryParsePageNumber(txtPageNo.Text, out iPageNo)
                    || iPageNo < 1 || iPageNo > document.PageCount)
                {
                    ShowWarning("Please enter a page number between 1 and "
                        + document.PageCount + ".");
                    return;
                }

                // set the page number
                document.PageNo = iPageNo;

                // get a content handle
                PDFPARSERLib.Content content = document.Page.Content;
                if (content == null)
                {
                    // no content on page
                    return;
                }

                // get one word per token
                content.BreakWords = true;
                // account page rotation
                content.Reset(true);

                // Collect the text in a buffer and assign it to the control once,
                // instead of copying the whole string for every token.
                System.Text.StringBuilder output = new System.Text.StringBuilder();

                // keep track of the last y-coordinate
                float y_old = -1.0f;
                while (content.GetNextText() != null)
                {
                    // get a text handle
                    PDFPARSERLib.Text text = content.Text;
                    float y_new = 1.0f;
                    if (text.Length > 0)
                    {
                        y_new = (float) ((object[])text.YPos)[0];
                    }

                    // insert a blank if y-coordinate is the same as in previous token
                    // insert a new line if it changes
                    output.Append(y_old == y_new ? " " : "\r\n");
                    output.Append(text.UnicodeString);
                    y_old = y_new;
                }

                txtOutput.Text = output.ToString();
            }
            finally
            {
                // Close the document on every path so it does not stay open
                // until the COM wrapper is finalized.
                document.Close();
            }
        }

        /// <summary>
        /// Parses a page number without letting a parse failure escape.
        /// </summary>
        /// <param name="text">Text entered by the user.</param>
        /// <param name="pageNo">The parsed value, or 0 when parsing failed.</param>
        /// <returns>True when <paramref name="text"/> is a valid Int32.</returns>
        private static bool TryParsePageNumber(string text, out int pageNo)
        {
            pageNo = 0;
            if (text == null)
            {
                return false;
            }

            try
            {
                pageNo = Int32.Parse(text.Trim());
                return true;
            }
            catch (FormatException)
            {
                return false;
            }
            catch (OverflowException)
            {
                return false;
            }
        }

        /// <summary>
        /// Shows a warning message box owned by this form.
        /// </summary>
        /// <param name="message">Text to display.</param>
        private void ShowWarning(string message)
        {
            MessageBox.Show(this, message, this.label3.Text.Trim(),
                MessageBoxButtons.OK, MessageBoxIcon.Warning);
        }
    }
}