csharp_pain/Scraping/COM/samples/CS.NET/TextExt/Backup/Form1.cs
2014-06-26 17:13:46 +02:00

298 lines
11 KiB
C#
Raw Blame History

/****************************************************************************
*
* Project: TextExt, C#.NET
*
* Description: Extracts text of a page of a PDF document using the
* 3-Heights PDF Extraction Tool.
*
* Version: 1.01 (5-October-2005)
*
* Author: Philip Renggli, PDF Tools AG
*
* Copyright: Copyright (C) 2005 PDF Tools AG, Switzerland
* Permission to use, copy, modify, and distribute this
* software and its documentation for any purpose and without
* fee is hereby granted, provided that the above copyright
* notice appear in all copies and that both that copyright
* notice and this permission notice appear in supporting
* documentation. This software is provided "as is" without
* express or implied warranty.
*
***************************************************************************/
using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
namespace TextExt
{
public class Form1 : System.Windows.Forms.Form
{
// All controls below are instantiated and configured in InitializeComponent().

// "..." button; its Click event is wired to Browse_Click.
private System.Windows.Forms.Button Browse;
// Holds the input file name handed to Document.Open in ExtractText_Click.
private System.Windows.Forms.TextBox txtInput;
// "Extract Text" button; its Click event is wired to ExtractText_Click.
private System.Windows.Forms.Button ExtractText;
// ActiveX common dialog host used by Browse_Click to show the file-open dialog.
private AxMSComDlg.AxCommonDialog axCommonDialog;
// Caption " FILE NAME".
private System.Windows.Forms.Label label1;
// Caption " PAGE NUMBER".
private System.Windows.Forms.Label label2;
// Multi-line text box that receives the extracted text.
private System.Windows.Forms.TextBox txtOutput;
// Page number entered by the user (initial text "1"); parsed in ExtractText_Click.
private System.Windows.Forms.TextBox txtPageNo;
// Colored background panel positioned along the left edge of the form.
private System.Windows.Forms.Panel panel1;
// Title banner at the top of the form.
private System.Windows.Forms.Label label3;
// One-line description of the sample.
private System.Windows.Forms.Label label4;
// Copyright notice.
private System.Windows.Forms.Label label5;
// Designer component container; never assigned in the visible code, disposed in Dispose(bool) when non-null.
private System.ComponentModel.Container components = null;
/// <summary>
/// Creates the form and builds its controls through the designer-generated
/// <c>InitializeComponent</c> method.
/// </summary>
public Form1() : base()
{
    this.InitializeComponent();
}
/// <summary>
/// Disposes the designer component container, if one exists, when the form is
/// disposed explicitly, and then forwards to the base implementation.
/// </summary>
/// <param name="disposing">
/// When <c>true</c>, the <c>components</c> container is disposed as well;
/// the value is always passed on to <c>base.Dispose</c>.
/// </param>
protected override void Dispose(bool disposing)
{
    bool hasComponentsToRelease = disposing && components != null;
    if (hasComponentsToRelease)
    {
        components.Dispose();
    }

    base.Dispose(disposing);
}
#region Windows Form Designer generated code
/// <summary>
/// Required method for Designer support.
/// The contents of this method must not be modified with the code editor.
/// Creates every control of the form, sets its properties, wires the two
/// button Click handlers and adds the controls to the form.
/// </summary>
private void InitializeComponent()
{
// Resource manager for Form1; used below to load the serialized OcxState of axCommonDialog.
System.Resources.ResourceManager resources = new System.Resources.ResourceManager(typeof(Form1));
this.Browse = new System.Windows.Forms.Button();
this.txtInput = new System.Windows.Forms.TextBox();
this.ExtractText = new System.Windows.Forms.Button();
this.axCommonDialog = new AxMSComDlg.AxCommonDialog();
this.txtOutput = new System.Windows.Forms.TextBox();
this.label1 = new System.Windows.Forms.Label();
this.txtPageNo = new System.Windows.Forms.TextBox();
this.label2 = new System.Windows.Forms.Label();
this.panel1 = new System.Windows.Forms.Panel();
this.label3 = new System.Windows.Forms.Label();
this.label4 = new System.Windows.Forms.Label();
this.label5 = new System.Windows.Forms.Label();
// Start of the ActiveX host's initialization; paired with EndInit() at the end of this method.
((System.ComponentModel.ISupportInitialize)(this.axCommonDialog)).BeginInit();
// Layout is suspended while controls are configured; paired with ResumeLayout(false) below.
this.SuspendLayout();
//
// Browse
//
this.Browse.BackColor = System.Drawing.SystemColors.ControlLight;
this.Browse.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((System.Byte)(0)));
this.Browse.ForeColor = System.Drawing.Color.Black;
this.Browse.Location = new System.Drawing.Point(472, 488);
this.Browse.Name = "Browse";
this.Browse.Size = new System.Drawing.Size(24, 24);
this.Browse.TabIndex = 8;
this.Browse.Text = "...";
this.Browse.Click += new System.EventHandler(this.Browse_Click);
//
// txtInput
//
this.txtInput.Location = new System.Drawing.Point(120, 488);
this.txtInput.Name = "txtInput";
this.txtInput.Size = new System.Drawing.Size(344, 20);
this.txtInput.TabIndex = 7;
this.txtInput.Text = "";
//
// ExtractText
//
this.ExtractText.BackColor = System.Drawing.SystemColors.ControlLight;
this.ExtractText.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((System.Byte)(0)));
this.ExtractText.ForeColor = System.Drawing.Color.Black;
this.ExtractText.Location = new System.Drawing.Point(8, 544);
this.ExtractText.Name = "ExtractText";
this.ExtractText.Size = new System.Drawing.Size(96, 24);
this.ExtractText.TabIndex = 6;
this.ExtractText.Text = "Extract Text";
this.ExtractText.Click += new System.EventHandler(this.ExtractText_Click);
//
// axCommonDialog
//
this.axCommonDialog.Enabled = true;
this.axCommonDialog.Location = new System.Drawing.Point(464, 520);
this.axCommonDialog.Name = "axCommonDialog";
// The control's persisted state is read from the form's resources under the key "axCommonDialog.OcxState".
this.axCommonDialog.OcxState = ((System.Windows.Forms.AxHost.State)(resources.GetObject("axCommonDialog.OcxState")));
this.axCommonDialog.Size = new System.Drawing.Size(32, 32);
this.axCommonDialog.TabIndex = 9;
//
// txtOutput
//
this.txtOutput.Location = new System.Drawing.Point(8, 32);
this.txtOutput.MaxLength = 200000;
this.txtOutput.Multiline = true;
this.txtOutput.Name = "txtOutput";
this.txtOutput.ScrollBars = System.Windows.Forms.ScrollBars.Both;
this.txtOutput.Size = new System.Drawing.Size(496, 448);
this.txtOutput.TabIndex = 10;
this.txtOutput.Text = "";
//
// label1
//
this.label1.BackColor = System.Drawing.Color.FromArgb(((System.Byte)(122)), ((System.Byte)(182)), ((System.Byte)(215)));
this.label1.Font = new System.Drawing.Font("Verdana", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((System.Byte)(0)));
this.label1.ForeColor = System.Drawing.Color.FromArgb(((System.Byte)(0)), ((System.Byte)(102)), ((System.Byte)(153)));
this.label1.Location = new System.Drawing.Point(0, 488);
this.label1.Name = "label1";
this.label1.Size = new System.Drawing.Size(112, 18);
this.label1.TabIndex = 11;
this.label1.Text = " FILE NAME";
//
// txtPageNo
//
this.txtPageNo.Location = new System.Drawing.Point(120, 512);
this.txtPageNo.Name = "txtPageNo";
this.txtPageNo.Size = new System.Drawing.Size(40, 20);
this.txtPageNo.TabIndex = 12;
// Initial page number shown to the user.
this.txtPageNo.Text = "1";
//
// label2
//
this.label2.BackColor = System.Drawing.Color.FromArgb(((System.Byte)(122)), ((System.Byte)(182)), ((System.Byte)(215)));
this.label2.Font = new System.Drawing.Font("Verdana", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((System.Byte)(0)));
this.label2.ForeColor = System.Drawing.Color.FromArgb(((System.Byte)(0)), ((System.Byte)(102)), ((System.Byte)(153)));
this.label2.Location = new System.Drawing.Point(0, 512);
this.label2.Name = "label2";
this.label2.Size = new System.Drawing.Size(112, 18);
this.label2.TabIndex = 13;
this.label2.Text = " PAGE NUMBER";
//
// panel1
//
this.panel1.BackColor = System.Drawing.Color.FromArgb(((System.Byte)(174)), ((System.Byte)(209)), ((System.Byte)(226)));
this.panel1.Location = new System.Drawing.Point(-8, 0);
this.panel1.Name = "panel1";
this.panel1.Size = new System.Drawing.Size(120, 624);
this.panel1.TabIndex = 15;
//
// label3
//
this.label3.BackColor = System.Drawing.Color.FromArgb(((System.Byte)(255)), ((System.Byte)(153)), ((System.Byte)(102)));
this.label3.Font = new System.Drawing.Font("Verdana", 9F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((System.Byte)(0)));
this.label3.ForeColor = System.Drawing.Color.White;
this.label3.Location = new System.Drawing.Point(120, 8);
this.label3.Name = "label3";
this.label3.Size = new System.Drawing.Size(384, 19);
this.label3.TabIndex = 16;
this.label3.Text = " 3-Heights PDF Extract Tool - Extract Text Sample";
//
// label4
//
this.label4.Location = new System.Drawing.Point(120, 544);
this.label4.Name = "label4";
this.label4.Size = new System.Drawing.Size(384, 16);
this.label4.TabIndex = 17;
this.label4.Text = "This is a text extraction sample for the 3-Heights PDF Extract Tool.";
//
// label5
//
this.label5.Location = new System.Drawing.Point(120, 560);
this.label5.Name = "label5";
this.label5.Size = new System.Drawing.Size(384, 16);
this.label5.TabIndex = 18;
this.label5.Text = "Copyright (C) 2005 PDF Tools AG, Switzerland";
//
// Form1
//
this.AutoScaleBaseSize = new System.Drawing.Size(5, 13);
this.BackColor = System.Drawing.Color.White;
this.ClientSize = new System.Drawing.Size(512, 581);
// panel1 is added last; the labels and input controls are listed before it.
this.Controls.AddRange(new System.Windows.Forms.Control[] {
this.label5,
this.label4,
this.label3,
this.txtPageNo,
this.txtInput,
this.label2,
this.label1,
this.txtOutput,
this.axCommonDialog,
this.Browse,
this.ExtractText,
this.panel1});
this.Name = "Form1";
((System.ComponentModel.ISupportInitialize)(this.axCommonDialog)).EndInit();
this.ResumeLayout(false);
}
#endregion
/// <summary>
/// Application entry point: creates the main form and hands it to
/// <c>Application.Run</c>. Marked <c>[STAThread]</c>, as in the original sample.
/// </summary>
[STAThread]
static void Main()
{
    Form1 mainForm = new Form1();
    Application.Run(mainForm);
}
/// <summary>
/// Click handler of the "..." button: seeds the common dialog with the current
/// input path, shows the Open dialog, and copies the dialog's file name back
/// into the input text box.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Browse_Click(object sender, System.EventArgs e)
{
    // Pre-select whatever path is currently typed in.
    string currentPath = this.txtInput.Text;
    this.axCommonDialog.FileName = currentPath;

    this.axCommonDialog.ShowOpen();

    // Reflect the dialog's file name in the input box.
    string dialogPath = this.axCommonDialog.FileName;
    this.txtInput.Text = dialogPath;
}
/// <summary>
/// Click handler of the "Extract Text" button. Opens the PDF named in
/// <c>txtInput</c>, selects the page entered in <c>txtPageNo</c>, and writes
/// the page's text into <c>txtOutput</c>, one word per token: tokens sharing
/// the y-coordinate of the previous token are separated by a blank, all
/// others start on a new line.
/// </summary>
/// <remarks>
/// Failures (document cannot be opened, page number is not a valid integer,
/// page number is out of range) are reported with a message box instead of
/// being silently ignored or surfacing as an unhandled exception.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void ExtractText_Click(object sender, System.EventArgs e)
{
    PDFPARSERLib.DocumentClass document = new PDFPARSERLib.DocumentClass();
    if (!document.Open(txtInput.Text, ""))
    {
        // failed to open
        MessageBox.Show(this, "The PDF document could not be opened.", "TextExt");
        return;
    }

    try
    {
        // clear the text control
        txtOutput.Text = "";

        // check the page range
        int iPageNo;
        if (!TryParsePageNumber(txtPageNo.Text, out iPageNo))
        {
            MessageBox.Show(this, "The page number must be a whole number.", "TextExt");
            return;
        }
        if (iPageNo <= 0 || iPageNo > document.PageCount)
        {
            // page number out of range
            MessageBox.Show(this, "The page number must be between 1 and " + document.PageCount + ".", "TextExt");
            return;
        }

        // set the page number
        document.PageNo = iPageNo;

        // get a content handle
        PDFPARSERLib.Content content = document.Page.Content;
        if (content == null)
        {
            // no content on page: the output stays empty
            return;
        }

        // get one word per token
        content.BreakWords = true;
        // account page rotation
        content.Reset(true);

        // Collect the page text in a buffer and assign it to the control once;
        // appending to txtOutput.Text per token re-copies the whole text and
        // updates the control on every iteration.
        System.Text.StringBuilder extracted = new System.Text.StringBuilder();

        // keep track of the last y-coordinate
        float y_old = -1.0f;
        while (content.GetNextText() != null)
        {
            // get a text handle
            PDFPARSERLib.Text text = content.Text;
            float y_new = 1.0f;
            if (text.Length > 0)
            {
                y_new = (float) ((object[])text.YPos)[0];
            }

            // insert a blank if y-coordinate is the same as in previous token
            // insert a new line if it changes
            extracted.Append(y_old == y_new ? " " : "\r\n");
            extracted.Append(text.UnicodeString);
            y_old = y_new;
        }

        txtOutput.Text = extracted.ToString();
    }
    finally
    {
        // Close the document on every path so the input file is released
        // promptly rather than whenever the COM wrapper gets finalized.
        document.Close();
    }
}

// Parses a user-typed page number; returns false (and 0) instead of throwing
// when the text is not a valid 32-bit integer.
// NOTE(review): Int32.TryParse would be simpler on .NET 2.0+, but the designer
// code in this file looks .NET 1.x-era, so only Int32.Parse is used here.
private static bool TryParsePageNumber(string text, out int pageNo)
{
    try
    {
        pageNo = Int32.Parse(text);
        return true;
    }
    catch (FormatException)
    {
        pageNo = 0;
        return false;
    }
    catch (OverflowException)
    {
        pageNo = 0;
        return false;
    }
}
}
}