csharp_pain/Scraping/COM/samples/VB/TextExt.frm

235 lines
8.3 KiB
Text
Raw Permalink Normal View History

2014-06-26 15:13:46 +00:00
VERSION 5.00
Object = "{F9043C88-F6F2-101A-A3C9-08002B2F49FB}#1.2#0"; "comdlg32.ocx"
Begin VB.Form Form1
BackColor = &H00FFFFFF&
BorderStyle = 1 'Fixed Single
Caption = "Extract Text"
ClientHeight = 9285
ClientLeft = 45
ClientTop = 435
ClientWidth = 5550
Icon = "TextExt.frx":0000
LinkTopic = "Form1"
MaxButton = 0 'False
MinButton = 0 'False
ScaleHeight = 9285
ScaleWidth = 5550
StartUpPosition = 3 'Windows Default
Begin VB.CommandButton GetText
Caption = "Extract"
Height = 375
Left = 2280
TabIndex = 9
Top = 8760
Width = 1095
End
Begin MSComDlg.CommonDialog FileDialog
Left = 4080
Top = 8760
_ExtentX = 847
_ExtentY = 847
_Version = 393216
End
Begin VB.CommandButton Browse
BackColor = &H00C0C0C0&
Caption = "..."
Height = 375
Left = 5040
TabIndex = 4
Top = 8400
Width = 375
End
Begin VB.TextBox PDFFile
Height = 285
Left = 1080
TabIndex = 3
Top = 8400
Width = 3855
End
Begin VB.Frame Frame1
BackColor = &H00AC7A3E&
BorderStyle = 0 'None
ForeColor = &H00AC7A3E&
Height = 7455
Left = 120
TabIndex = 1
Top = 720
Width = 5295
Begin VB.TextBox TextOut
BorderStyle = 0 'None
Height = 7035
Left = 50
MultiLine = -1 'True
ScrollBars = 2 'Vertical
TabIndex = 2
Top = 360
Width = 5190
End
Begin VB.Label Label2
BackStyle = 0 'Transparent
Caption = "Output Text"
BeginProperty Font
Name = "Verdana"
Size = 9
Charset = 0
Weight = 700
Underline = 0 'False
Italic = 0 'False
Strikethrough = 0 'False
EndProperty
ForeColor = &H00FFFFFF&
Height = 255
Left = 120
TabIndex = 5
Top = 90
Width = 2055
End
End
Begin VB.TextBox TextLogo
BorderStyle = 0 'None
BeginProperty Font
Name = "Verdana"
Size = 12.75
Charset = 0
Weight = 700
Underline = 0 'False
Italic = 0 'False
Strikethrough = 0 'False
EndProperty
ForeColor = &H00404040&
Height = 375
Left = 0
TabIndex = 0
Text = " pdf-tools.com"
Top = 120
Width = 5535
End
Begin VB.Frame Frame3
BackColor = &H00E0E0E0&
BorderStyle = 0 'None
Height = 615
Left = 0
TabIndex = 7
Top = 0
Width = 5535
End
Begin VB.Frame Frame2
BackColor = &H00E0E0E0&
BorderStyle = 0 'None
Height = 9375
Left = 0
TabIndex = 6
Top = 0
Width = 1215
Begin VB.Label Label1
BackColor = &H00E0E0E0&
BackStyle = 0 'Transparent
Caption = "PDF File"
BeginProperty Font
Name = "Verdana"
Size = 8.25
Charset = 0
Weight = 700
Underline = 0 'False
Italic = 0 'False
Strikethrough = 0 'False
EndProperty
ForeColor = &H00AC7A3E&
Height = 255
Left = 120
TabIndex = 8
Top = 8400
Width = 855
End
End
End
Attribute VB_Name = "Form1"
Attribute VB_GlobalNameSpace = False
Attribute VB_Creatable = False
Attribute VB_PredeclaredId = True
Attribute VB_Exposed = False
' List Fonts and Text Extraction
' ------------------------------
'
' Visual Basic 6 sample for the 3-Heights PDF Extract Tool API
' http://www.pdf-tools.com
'
' Copyright (C) 2005 PDF Tools AG, Switzerland
' Permission to use, copy, modify, and distribute this
' software and its documentation for any purpose and without
' fee is hereby granted, provided that the above copyright
' notice appear in all copies and that both that copyright
' notice and this permission notice appear in supporting
' documentation. This software is provided "as is" without
' express or implied warranty.
' Click handler of the "Extract" button.
'
' Opens the PDF named in the PDFFile text box, lists the base name of every
' font resource, then appends the text of the first pages (at most MAX_PAGES)
' and shows the result in the TextOut text box. Text fragments that share the
' Y position of the preceding fragment are joined with a space; any other
' fragment starts a new line.
' Shows a message box when the document cannot be opened.
Private Sub GetText_Click()
    ' Limit the number of pages due to the limitation of the text control.
    Const MAX_PAGES As Long = 10

    Dim pdf As New PDFPARSERLib.Document
    Dim oContent As PDFPARSERLib.Content
    Dim oText As PDFPARSERLib.Text
    Dim cFontRes As PDFPARSERLib.Font
    Dim X As Single, Y As Single, Yold As Single, FontSize As Single
    Dim CurPage As Long
    Dim LastPage As Long
    Dim sOutput As String       ' Complete output; assigned to the control once
    Dim sPageText As String     ' Separate string per page to improve the
                                ' string concatenation operation

    If pdf.Open(PDFFile.Text) Then
        ' List fonts
        sOutput = "- - - Fonts - - -" & vbCrLf & vbCrLf
        Set cFontRes = pdf.GetFirstFontResource
        While Not cFontRes Is Nothing
            sOutput = sOutput & cFontRes.BaseName & vbCrLf
            Set cFontRes = pdf.GetNextFontResource
        Wend

        ' List text
        If pdf.PageCount > MAX_PAGES Then
            LastPage = MAX_PAGES
        Else
            LastPage = pdf.PageCount
        End If

        For CurPage = 1 To LastPage
            pdf.PageNo = CurPage                ' set the current page number
            Set oContent = pdf.Page.Content     ' get the page's content
            If Not (oContent Is Nothing) Then
                oContent.BreakWords = True      ' extract words
                sOutput = sOutput & vbCrLf & "- - - Page " & CurPage & " - - -" & vbCrLf
                sPageText = ""
                ' Reset the line tracker for every page so that the first
                ' fragment of a page is never joined to the previous page's
                ' last line. (-1 is the sentinel the sample uses for "no
                ' previous Y"; presumably no real text sits at exactly -1.)
                Yold = -1
                Do
                    ' GetNextText yields Nothing once the page has no more text.
                    If oContent.GetNextText Is Nothing Then Exit Do
                    Set oText = oContent.Text   ' at this point text properties can be accessed
                    If Not (oText Is Nothing) Then
                        FontSize = oText.FontSize   ' the font size (read for demonstration only)
                        If oText.Length > 0 Then
                            X = oText.XPos(0)       ' the X position (read for demonstration only)
                            Y = oText.YPos(0)       ' the Y position
                            If Yold = Y Then
                                ' Same Y as the previous fragment: continue the line.
                                sPageText = sPageText & " " & oText.UnicodeString
                            Else
                                ' Different Y: start a new line.
                                sPageText = sPageText & vbCrLf & oText.UnicodeString
                            End If
                            Yold = Y
                        End If
                    End If
                Loop
                sOutput = sOutput & sPageText
            Else
                sOutput = sOutput & vbCrLf & vbCrLf & "- - - There is no content on page " _
                    & CurPage & " - - -" & vbCrLf
            End If
        Next CurPage
        pdf.Close

        ' Update the control a single time instead of re-reading and
        ' re-assigning its Text property for every font and page.
        TextOut.Text = sOutput
    Else
        MsgBox "Couldn't open input file"
    End If
End Sub
' Click handler of the "..." button.
'
' Shows the common "Open" dialog, pre-selecting the file currently entered in
' the PDFFile text box, and copies the chosen file name back into that text
' box. The text box is left untouched when the dialog yields no file name.
Private Sub Browse_Click()
    On Error GoTo DialogFailed

    FileDialog.FileName = PDFFile.Text
ShowDialog:
    FileDialog.ShowOpen
    ' Only write back a real selection so that free text typed by the user
    ' is not wiped when no file name comes back from the dialog.
    If Len(FileDialog.FileName) > 0 Then
        PDFFile.Text = FileDialog.FileName
    End If
    Exit Sub

DialogFailed:
    ' The text box may hold arbitrary text; the common dialog rejects an
    ' invalid initial name with cdlInvalidFileName. Retry once without an
    ' initial name (the Len check prevents an endless retry loop).
    If Err.Number = cdlInvalidFileName And Len(FileDialog.FileName) > 0 Then
        FileDialog.FileName = ""
        Resume ShowDialog
    End If
    ' cdlCancel is only raised when CancelError is enabled; treat it as a
    ' plain cancel. Anything else is reported instead of ending the program.
    If Err.Number <> cdlCancel Then
        MsgBox "Couldn't show the file dialog: " & Err.Description
    End If
End Sub