65.9K
CodeProject is changing. Read more.
Home

Decode /FlateDecode PDF Stream To Plain Text using ZLib Inflate Function in VB

starIconstarIconstarIconstarIcon
emptyStarIcon
starIcon

4.31/5 (8 votes)

Dec 9, 2008

CPOL
viewsIcon

138494

downloadIcon

2652

Do you want to use zLIB Inflate function in VB? - This sample decodes a PDF (stream...endstream) to plain text. Extract plain text from PDF file

Introduction

This example shows how to use the ZLib Inflate & Deflate function in VB6. This example illustrates how to decode a PDF file to extract plain text.

Background 

This is similar to The Code Project article, Code to Extract Plain Text from a PDF File, but this project does not remove any internal PDF text. I leave the processing up to you. I might update this project later.

Using the Code 

Open the PDF file and get all bytes:

ReDim TheBytes(FileLen(filenam$) - 1)

Open filenam$ For Binary Access Read As #1
    Get #1, , TheBytes()
Close #1

'Convert the entire byte array to a string
sStr = StrConv(TheBytes, vbUnicode)

'Search for stream and endstream
lStart = InStr(1, sStr, "stream")
Do While lStart > 0
    lEnd = InStr(lStart, sStr, "endstream")
    If lEnd > 0 Then
        'Get the stream without the text Stream
        sStream = Mid(sStr, lStart + 6, lEnd - lStart - 6)
        
        'check if to remove the crlf after stream
        If Left(sStream, 2) = vbCrLf Then sStream = Mid(sStream, 3)

        'Convert this stream portion back to a byte array
        TheBytes = StrConv(sStream, vbFromUnicode)
        
        'Decode this portion
        Module4.UncompressData TheBytes, xbBufferOut
        
        'Display the results
        txtUncompressed = txtUncompressed & vbCrLf & vbCrLf & vbCrLf & _
					StrConv(xbBufferOut, vbUnicode)
        
        'Search the next stream where we left off
        lStart = InStr(lEnd + 8, sStr, "stream")
    Else: lStart = 0
    End If
Loop 

'This is the code for Module4 
 Option Explicit

 Private Const Z_FINISH As Long = 4

 Public Enum ZLIB_CompressionLevelConstants
    Z_NO_COMPRESSION = 0
    Z_BEST_SPEED = 1
    Z_BEST_COMPRESSION = 9
    Z_DEFAULT_COMPRESSION = (-1)
 End Enum

 Private Type zStream
    next_in As Long
    avail_in As Long
    total_in As Long
    next_out As Long
    avail_out As Long
    total_out As Long
    msg As Long
    state As Long
    zalloc As Long
    zfree As Long
    opaque As Long
    data_type As Long
    adler As Long
    reserved As Long
 End Type

 Private Declare Function ArrPtr Lib "msvbvm60.dll" _
			Alias "VarPtr" (Ptr() As Any) As Long
 Private Declare Sub CopyMemory Lib "kernel32.dll" Alias "RtlMoveMemory" _
	(ByRef Destination As Any, ByRef Source As Any, ByVal Length As Long)
 Private Declare Function deflate Lib "zlib.dll" _
	(vStream As zStream, Optional ByVal vflush As Long = Z_FINISH) As Long
 Private Declare Function deflateEnd Lib "zlib.dll" (vStream As zStream) As Long
 Private Declare Function deflateInit Lib "zlib.dll" Alias "deflateInit_" _
	(strm As zStream, ByVal Level As Long, ByVal version As String, _
	ByVal stream_size As Long) As Long
 Private Declare Function inflate Lib "zlib.dll" _
	(vStream As zStream, Optional ByVal vflush As Long = 1) As Long
 Private Declare Function inflateEnd Lib "zlib.dll" (vStream As zStream) As Long
 Private Declare Function inflateInit Lib "zlib.dll" Alias "inflateInit_" _
	(strm As zStream, ByVal version As String, ByVal stream_size As Long) As Long

 Private msVersion As String
 Private mnChunkSize As Long

Public Property Get ZLIB_ChunkSize() As Long
    If mnChunkSize = 0 Then
        mnChunkSize = &H10000
    End If
    ZLIB_ChunkSize = mnChunkSize
End Property
Public Property Let ZLIB_ChunkSize(ByVal Value As Long)
    mnChunkSize = Value
End Property

Public Property Get ZLIB_Version() As String
    If LenB(msVersion) = 0 Then
        msVersion = "1.1.2.0"
    End If
    ZLIB_Version = msVersion
End Property

Public Property Let ZLIB_Version(ByRef Value As String)
    msVersion = Value
End Property

Public Function CompressData(ByRef vxbInput() As Byte, _
	ByRef vxbOutput() As Byte, Optional vnStart As Long = 0, _
	Optional vnMaxSize As Long = 0, Optional veCompressionLevel _
	As ZLIB_CompressionLevelConstants = Z_DEFAULT_COMPRESSION) As Boolean
Dim tStream As zStream
Dim rc As Long
Dim xbCopy() As Byte
 With tStream
 
    If deflateInit(tStream, veCompressionLevel, ZLIB_Version, Len(tStream)) = 0 Then
        CompressData = True
        
        CopyMemory rc, ByVal ArrPtr(vxbInput), 4
        If rc Then
            CopyMemory .avail_in, ByVal rc + 16, 4
            .avail_in = .avail_in - vnStart
        End If
        If .avail_in > 0 And vnStart < .avail_in Then
            If vnMaxSize <> 0 And vnMaxSize < .avail_in Then
                .avail_in = vnMaxSize
            End If
            .next_in = VarPtr(vxbInput(vnStart))

            CopyMemory rc, ByVal ArrPtr(vxbOutput), 4
            If rc Then
                CopyMemory rc, ByVal rc + 12, 4

                If rc + vnStart = .next_in Then
                    xbCopy = vxbInput
                    .next_in = VarPtr(xbCopy(vnStart))
                ElseIf vnStart Then
                    ReDim vxbOutput(vnStart - 1)
                    CopyMemory vxbOutput(0), vxbInput(0), vnStart - 1
                End If
            Else
                vxbOutput = vxbInput
            End If

            .avail_out = .avail_in + 12
 
            ReDim Preserve vxbOutput(.total_out - 1 + .avail_out + vnStart)
 
            .next_out = VarPtr(vxbOutput(vnStart + .total_out))

            CompressData = deflate(tStream, 4) = 1

            If .total_out Or vnStart Then
                ReDim Preserve vxbOutput(.total_out + vnStart - 1)
            Else
                Erase vxbOutput
            End If
        End If

        deflateEnd tStream
    End If
 End With
End Function

 Public Function UncompressData(ByRef vxbInput() As Byte, _
	ByRef vxbOutput() As Byte, Optional vnStart As Long = 0, _
	Optional vnMaxSize As Long = 0, _
	Optional ByVal vnUncompressedSize As Long = 0) As Boolean
 Dim tStream As zStream
 Dim rc As Long
 
 Dim xbCopy() As Byte
    With tStream
        
        If inflateInit(tStream, ZLIB_Version, Len(tStream)) = 0 Then
            UncompressData = True
            
            CopyMemory rc, ByVal ArrPtr(vxbInput), 4
            If rc Then
                CopyMemory .avail_in, ByVal rc + 16, 4
                .avail_in = .avail_in - vnStart
            End If
            If .avail_in > 0 And vnStart < .avail_in Then
                
                If vnMaxSize <> 0 And vnMaxSize < .avail_in Then
                    .avail_in = vnMaxSize
                End If
                .next_in = VarPtr(vxbInput(vnStart))
    
                CopyMemory rc, ByVal ArrPtr(vxbOutput), 4
                If rc Then
                    CopyMemory rc, ByVal rc + 12, 4
    
                If rc + vnStart = .next_in Then
                    xbCopy = vxbInput
                    .next_in = VarPtr(xbCopy(vnStart))
                ElseIf vnStart Then
                    ReDim xbDataOut(vnStart - 1)
                    CopyMemory vxbOutput(0), vxbInput(0), vnStart - 1
                End If
            ElseIf vnStart Then
                vxbOutput = vxbInput
            End If
    
            If vnUncompressedSize Then
                .avail_out = vnUncompressedSize
            Else
                .avail_out = .avail_in * 2
            End If

            Do
                ReDim Preserve vxbOutput(.total_out - 1 + .avail_out + vnStart)
               
                .next_out = VarPtr(vxbOutput(vnStart + .total_out))
    
                rc = inflate(tStream)
                If rc Then
                    UncompressData = rc > 0
                    Exit Do
                End If
    
             
                .avail_out = ZLIB_ChunkSize
            Loop Until rc = 1
    
            If .total_out Or vnStart Then
                ReDim Preserve vxbOutput(.total_out + vnStart - 1)
            Else
                Erase vxbOutput
            End If
        End If
    
        inflateEnd tStream
    End If
 End With
End Function

History

  • 9th December, 2008: Initial post

I might update this project to show how to process the text.