299 lines
11 KiB
C#
299 lines
11 KiB
C#
|
/****************************************************************************
 *
 * Project:     TextExt, C#.NET
 *
 * Description: Extracts text of a page of a PDF document using the
 *              3-Heights PDF Extraction Tool.
 *
 * Version:     1.01 (5-October-2005)
 *
 * Author:      Philip Renggli, PDF Tools AG
 *
 * Copyright:   Copyright (C) 2005 PDF Tools AG, Switzerland
 *              Permission to use, copy, modify, and distribute this
 *              software and its documentation for any purpose and without
 *              fee is hereby granted, provided that the above copyright
 *              notice appear in all copies and that both that copyright
 *              notice and this permission notice appear in supporting
 *              documentation. This software is provided "as is" without
 *              express or implied warranty.
 *
 ***************************************************************************/
|
|||
|
|
|||
|
using System;
|
|||
|
using System.Drawing;
|
|||
|
using System.Collections;
|
|||
|
using System.ComponentModel;
|
|||
|
using System.Windows.Forms;
|
|||
|
using System.Data;
|
|||
|
|
|||
|
namespace TextExt
|
|||
|
{
|
|||
|
public class Form1 : System.Windows.Forms.Form
|
|||
|
{
|
|||
|
// --- Buttons ---------------------------------------------------------------
// "..." button; its Click event is wired to Browse_Click.
private System.Windows.Forms.Button Browse;
// "Extract Text" button; its Click event is wired to ExtractText_Click.
private System.Windows.Forms.Button ExtractText;

// --- Text boxes ------------------------------------------------------------
// Path of the input file.
private System.Windows.Forms.TextBox txtInput;
// Page number to extract (initial text "1").
private System.Windows.Forms.TextBox txtPageNo;
// Multiline box that receives the extracted text.
private System.Windows.Forms.TextBox txtOutput;

// --- ActiveX common dialog used by Browse_Click ------------------------------
private AxMSComDlg.AxCommonDialog axCommonDialog;

// --- Static labels and the side panel --------------------------------------
private System.Windows.Forms.Label label1;
private System.Windows.Forms.Label label2;
private System.Windows.Forms.Label label3;
private System.Windows.Forms.Label label4;
private System.Windows.Forms.Label label5;
private System.Windows.Forms.Panel panel1;

// Component container; disposed in Dispose(bool) when it is not null.
private System.ComponentModel.Container components = null;
|
|||
|
|
|||
|
/// <summary>
/// Creates the form and builds its controls by calling InitializeComponent.
/// </summary>
public Form1()
{
    this.InitializeComponent();
}
|
|||
|
|
|||
|
/// <summary>
/// Disposes the component container (if one exists) when called with
/// <paramref name="disposing"/> set to true, then forwards to the base class.
/// </summary>
/// <param name="disposing">
/// True to also dispose the <c>components</c> container; the value is passed
/// on unchanged to <c>base.Dispose</c>.
/// </param>
protected override void Dispose(bool disposing)
{
    bool hasComponents = components != null;
    if (disposing && hasComponents)
    {
        components.Dispose();
    }

    base.Dispose(disposing);
}
|
|||
|
|
|||
|
#region Windows Form Designer generated code
|
|||
|
/// <summary>
/// Required method for Designer support.
/// The contents of this method must not be modified with the code editor.
/// </summary>
private void InitializeComponent()
{
    System.Resources.ResourceManager resources = new System.Resources.ResourceManager(typeof(Form1));

    // Create all controls first; they are configured one by one below.
    Browse = new System.Windows.Forms.Button();
    txtInput = new System.Windows.Forms.TextBox();
    ExtractText = new System.Windows.Forms.Button();
    axCommonDialog = new AxMSComDlg.AxCommonDialog();
    txtOutput = new System.Windows.Forms.TextBox();
    label1 = new System.Windows.Forms.Label();
    txtPageNo = new System.Windows.Forms.TextBox();
    label2 = new System.Windows.Forms.Label();
    panel1 = new System.Windows.Forms.Panel();
    label3 = new System.Windows.Forms.Label();
    label4 = new System.Windows.Forms.Label();
    label5 = new System.Windows.Forms.Label();
    ((System.ComponentModel.ISupportInitialize)axCommonDialog).BeginInit();
    SuspendLayout();

    // ---- Browse ("..." button) ----
    Browse.BackColor = System.Drawing.SystemColors.ControlLight;
    Browse.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, (byte)0);
    Browse.ForeColor = System.Drawing.Color.Black;
    Browse.Location = new System.Drawing.Point(472, 488);
    Browse.Name = "Browse";
    Browse.Size = new System.Drawing.Size(24, 24);
    Browse.TabIndex = 8;
    Browse.Text = "...";
    Browse.Click += new System.EventHandler(Browse_Click);

    // ---- txtInput (input file path) ----
    txtInput.Location = new System.Drawing.Point(120, 488);
    txtInput.Name = "txtInput";
    txtInput.Size = new System.Drawing.Size(344, 20);
    txtInput.TabIndex = 7;
    txtInput.Text = "";

    // ---- ExtractText button ----
    ExtractText.BackColor = System.Drawing.SystemColors.ControlLight;
    ExtractText.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, (byte)0);
    ExtractText.ForeColor = System.Drawing.Color.Black;
    ExtractText.Location = new System.Drawing.Point(8, 544);
    ExtractText.Name = "ExtractText";
    ExtractText.Size = new System.Drawing.Size(96, 24);
    ExtractText.TabIndex = 6;
    ExtractText.Text = "Extract Text";
    ExtractText.Click += new System.EventHandler(ExtractText_Click);

    // ---- axCommonDialog (ActiveX control, state loaded from resources) ----
    axCommonDialog.Enabled = true;
    axCommonDialog.Location = new System.Drawing.Point(464, 520);
    axCommonDialog.Name = "axCommonDialog";
    axCommonDialog.OcxState = (System.Windows.Forms.AxHost.State)resources.GetObject("axCommonDialog.OcxState");
    axCommonDialog.Size = new System.Drawing.Size(32, 32);
    axCommonDialog.TabIndex = 9;

    // ---- txtOutput (multiline result box) ----
    txtOutput.Location = new System.Drawing.Point(8, 32);
    txtOutput.MaxLength = 200000;
    txtOutput.Multiline = true;
    txtOutput.Name = "txtOutput";
    txtOutput.ScrollBars = System.Windows.Forms.ScrollBars.Both;
    txtOutput.Size = new System.Drawing.Size(496, 448);
    txtOutput.TabIndex = 10;
    txtOutput.Text = "";

    // ---- label1 (file name caption) ----
    label1.BackColor = System.Drawing.Color.FromArgb(122, 182, 215);
    label1.Font = new System.Drawing.Font("Verdana", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, (byte)0);
    label1.ForeColor = System.Drawing.Color.FromArgb(0, 102, 153);
    label1.Location = new System.Drawing.Point(0, 488);
    label1.Name = "label1";
    label1.Size = new System.Drawing.Size(112, 18);
    label1.TabIndex = 11;
    label1.Text = " FILE NAME";

    // ---- txtPageNo (page number input) ----
    txtPageNo.Location = new System.Drawing.Point(120, 512);
    txtPageNo.Name = "txtPageNo";
    txtPageNo.Size = new System.Drawing.Size(40, 20);
    txtPageNo.TabIndex = 12;
    txtPageNo.Text = "1";

    // ---- label2 (page number caption) ----
    label2.BackColor = System.Drawing.Color.FromArgb(122, 182, 215);
    label2.Font = new System.Drawing.Font("Verdana", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, (byte)0);
    label2.ForeColor = System.Drawing.Color.FromArgb(0, 102, 153);
    label2.Location = new System.Drawing.Point(0, 512);
    label2.Name = "label2";
    label2.Size = new System.Drawing.Size(112, 18);
    label2.TabIndex = 13;
    label2.Text = " PAGE NUMBER";

    // ---- panel1 (coloured side panel) ----
    panel1.BackColor = System.Drawing.Color.FromArgb(174, 209, 226);
    panel1.Location = new System.Drawing.Point(-8, 0);
    panel1.Name = "panel1";
    panel1.Size = new System.Drawing.Size(120, 624);
    panel1.TabIndex = 15;

    // ---- label3 (title bar) ----
    label3.BackColor = System.Drawing.Color.FromArgb(255, 153, 102);
    label3.Font = new System.Drawing.Font("Verdana", 9F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, (byte)0);
    label3.ForeColor = System.Drawing.Color.White;
    label3.Location = new System.Drawing.Point(120, 8);
    label3.Name = "label3";
    label3.Size = new System.Drawing.Size(384, 19);
    label3.TabIndex = 16;
    label3.Text = " 3-Heights PDF Extract Tool - Extract Text Sample";

    // ---- label4 (description line) ----
    label4.Location = new System.Drawing.Point(120, 544);
    label4.Name = "label4";
    label4.Size = new System.Drawing.Size(384, 16);
    label4.TabIndex = 17;
    label4.Text = "This is a text extraction sample for the 3-Heights PDF Extract Tool.";

    // ---- label5 (copyright line) ----
    label5.Location = new System.Drawing.Point(120, 560);
    label5.Name = "label5";
    label5.Size = new System.Drawing.Size(384, 16);
    label5.TabIndex = 18;
    label5.Text = "Copyright (C) 2005 PDF Tools AG, Switzerland";

    // ---- Form1 ----
    AutoScaleBaseSize = new System.Drawing.Size(5, 13);
    BackColor = System.Drawing.Color.White;
    ClientSize = new System.Drawing.Size(512, 581);
    // The order of the controls is kept exactly as generated by the designer.
    Controls.AddRange(new System.Windows.Forms.Control[]
    {
        label5,
        label4,
        label3,
        txtPageNo,
        txtInput,
        label2,
        label1,
        txtOutput,
        axCommonDialog,
        Browse,
        ExtractText,
        panel1
    });
    Name = "Form1";
    ((System.ComponentModel.ISupportInitialize)axCommonDialog).EndInit();
    ResumeLayout(false);
}
|
|||
|
#endregion
|
|||
|
|
|||
|
/// <summary>
/// Application entry point: creates a <c>Form1</c> instance and hands it to
/// <c>Application.Run</c>.
/// </summary>
[STAThread]
static void Main()
{
    Form1 mainForm = new Form1();
    Application.Run(mainForm);
}
|
|||
|
|
|||
|
/// <summary>
/// Handles a click on the "..." button: seeds the common dialog with the
/// current content of <c>txtInput</c>, shows the Open dialog, and copies the
/// dialog's file name back into <c>txtInput</c>.
/// </summary>
/// <param name="sender">Source of the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Browse_Click(object sender, System.EventArgs e)
{
    // Start the dialog from whatever path is currently in the text box.
    string currentPath = txtInput.Text;
    axCommonDialog.FileName = currentPath;

    axCommonDialog.ShowOpen();

    // Take over whatever file name the dialog reports afterwards.
    string selectedPath = axCommonDialog.FileName;
    txtInput.Text = selectedPath;
}
|
|||
|
|
|||
|
/// <summary>
/// Handles a click on the "Extract Text" button: opens the file named in
/// <c>txtInput</c>, reads the text tokens of the page given in
/// <c>txtPageNo</c>, and shows them in <c>txtOutput</c>. Tokens whose first
/// y-coordinate equals that of the previous token are joined with a blank;
/// otherwise a new line is started.
/// </summary>
/// <remarks>
/// Changes compared to the previous version: the page number is parsed once
/// with <c>Int32.TryParse</c> (invalid input no longer throws), the output is
/// assembled in a <c>StringBuilder</c> and assigned to the text box once, and
/// the user is told when the file cannot be opened or the page number is
/// invalid instead of the click silently doing nothing.
/// </remarks>
/// <param name="sender">Source of the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void ExtractText_Click(object sender, System.EventArgs e)
{
    PDFPARSERLib.DocumentClass document = new PDFPARSERLib.DocumentClass();
    // NOTE(review): the document is never explicitly closed or released here;
    // confirm whether the library expects a matching close call.

    // Second argument is passed as an empty string (presumably the password - TODO confirm).
    if (!document.Open(txtInput.Text, ""))
    {
        // failed to open
        MessageBox.Show(this, "The file could not be opened.", "Extract Text");
        return;
    }

    // clear the text control
    txtOutput.Text = "";

    // check the page range; TryParse turns non-numeric input into a normal
    // "invalid page" case instead of an unhandled exception
    int iPageNo;
    bool isValidPage = Int32.TryParse(txtPageNo.Text, out iPageNo)
        && iPageNo > 0
        && iPageNo <= document.PageCount;
    if (!isValidPage)
    {
        // page number missing, not a number, or out of range
        MessageBox.Show(this, "The page number is not valid for this document.", "Extract Text");
        return;
    }

    // set the page number
    document.PageNo = iPageNo;

    // get a content handle
    PDFPARSERLib.Content content = document.Page.Content;
    if (content == null)
    {
        // no content on page: the output simply stays empty
        return;
    }

    // get one word per token
    content.BreakWords = true;
    // account page rotation
    content.Reset(true);

    // keep track of the last y-coordinate
    float y_old = -1.0f;

    // Collect the text locally and update the control once at the end; appending
    // to txtOutput.Text per token copies the whole string on every iteration.
    System.Text.StringBuilder extracted = new System.Text.StringBuilder();

    while (content.GetNextText() != null)
    {
        // get a text handle
        PDFPARSERLib.Text text = content.Text;

        // Tokens without characters keep the default y of 1.0.
        float y_new = 1.0f;
        if (text.Length > 0)
        {
            y_new = (float)((object[])text.YPos)[0];
        }

        // insert a blank if y-coordinate is the same as in previous token,
        // insert a new line if it changes (exact comparison kept from the
        // original; presumably tokens on one line report identical values -
        // TODO confirm)
        extracted.Append(y_old == y_new ? " " : "\r\n");
        extracted.Append(text.UnicodeString);

        y_old = y_new;
    }

    txtOutput.Text = extracted.ToString();
}
|
|||
|
}
|
|||
|
}
|