Introduction
This application will tell you if a scanned image file (JPG, GIF, PNG or TIF) is blank.
One you detected a blank file, you can:
- Add
_deleted
to the filename of a blank image - Place blank images to to_be_deleted folder
- Delete blank images
At first, I thought that this was a simple problem and all I had to do is look at each pixel in the image and see how many of them are white. If the number of white pixels exceeded some specified “Content Density” threshold (like 99%) I would consider the image blank. However, this approach did not work because most of the pixels in a scanned image of a blank page are not white but a mix of various colors.
To address this problem, I identified the most common color. I did this by creating the histogram for each color. The histogram is basically a table that gives you the number of times each color is used (like red-5, green – 7). And I picked the most common color from the table. So if the blank sheet was pink, I will still be able to see that it is blank because most of the pixels were of the most common color: pink.
There still was a problem because two pixels that looked exactly the same had different color numbers. System.Drawing.Color
gives you over 16 million colors (255^3). So I converted the color to one of 216 common colors I found here. This gave me the ability to group color that looked more or less similar. The color conversion function looked for the most similar common color by calculating the color distance between the color and a common color. The color distance is the sum of absolute values of the differences between reds, greens and blues.
Yet, again, I ran into a problem because white pixels did not look white once I zoomed in. The white “pixels” was a cluster of pixels made up of pixels of various colors. Each one was not white but the cluster itself looked white. To deal with this problem, I expended the definition of pixels to a box 5x5 of 25 pixels.
Another problem I ran into had to do with margins. A scanned white sheet of paper can have a blank margin around. This can make the program think that it was looking at a non-white sheet of paper. To address this problem, I set to ignore 200 pixels margin.
At this point, program runs satisfactorily but very slow. I solved by saving the file in grayscale and reducing the file size to 200 px.
Using the Code
Here is the code:
Imports System.IO
Imports System.Drawing.Imaging
Public Class frmMain
Dim iPixelSize As Integer = 3
Private Sub Form1_Load(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles MyBase.Load
Dim oAppRegistry As New AppSetting
cbFileType.SelectedIndex = oAppRegistry.GetValueDef("FileType", "0")
cbAction.SelectedIndex = oAppRegistry.GetValueDef("Action", "0")
For i As Integer = 100 To 1 Step -1
Dim sVal As String = i & "%"
If i = 1 Or i = 25 Or i = 50 Or i = 75 Then
sVal = i & "% or more of all pixels have to be white for the file to be deleted"
ElseIf i = 100 Then
sVal = "100% of all pixels have to be white for the file to be deleted"
End If
cbDensity.Items.Add(sVal)
Next
cbDensity.SelectedIndex = oAppRegistry.GetValueDef("Density", "1")
txtIgnoreMargin.Text = oAppRegistry.GetValueDef("IgnoreMargin", "200")
txtPixelSize.Text = oAppRegistry.GetValueDef("PixelSize", "5")
txtFrom.Text = oAppRegistry.GetValue("From")
chkSubfolders.Checked = oAppRegistry.GetValue("Subfolders") = "1"
chkResize.Checked = oAppRegistry.GetValue("Resize") = "1"
chkShowHistogram.Checked = oAppRegistry.GetValue("Histogram") = "1"
End Sub
Private Sub frmMain_FormClosing(ByVal sender As Object, ByVal e As System.Windows.Forms.FormClosingEventArgs) Handles Me.FormClosing
Dim oAppRegistry As New AppSetting
oAppRegistry.SetValue("From", txtFrom.Text)
oAppRegistry.SetValue("FileType", cbFileType.SelectedIndex)
oAppRegistry.SetValue("Action", cbAction.SelectedIndex)
oAppRegistry.SetValue("Density", cbDensity.SelectedIndex)
oAppRegistry.SetValue("IgnoreMargin", txtIgnoreMargin.Text)
oAppRegistry.SetValue("PixelSize", txtPixelSize.Text)
oAppRegistry.SetValue("Subfolders", IIf(chkSubfolders.Checked, 1, 0))
oAppRegistry.SetValue("Resize", IIf(chkResize.Checked, 1, 0))
oAppRegistry.SetValue("Histogram", IIf(chkShowHistogram.Checked, 1, 0))
oAppRegistry.SaveData()
End Sub
Function GetRequiredDensity() As Integer
Dim sDensity As String = Split(cbDensity.SelectedItem, "%")(0)
Return CInt(sDensity)
End Function
Private Sub btnProcess_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btnProcess.Click
Dim sFromPath As String = txtFrom.Text
If Not System.IO.Directory.Exists(sFromPath) Then
MsgBox("Folder does not exist")
Exit Sub
End If
iPixelSize = txtPixelSize.Text
sHistogramHtml = ""
btnProcess.Enabled = False
txtOutput.Text = ""
txtOutput.Text += "Starting..." & vbCrLf
ProccessFolder(sFromPath)
txtOutput.Text += "Done!"
btnProcess.Enabled = True
If sHistogramHtml <> "" Then
Dim sAssPath As String = System.Reflection.Assembly.GetExecutingAssembly().Location
Dim sPath As String = System.IO.Path.GetDirectoryName(sAssPath)
Dim sFilePath As String = System.IO.Path.Combine(sPath, "Histogram.htm")
If IO.File.Exists(sFilePath) Then
IO.File.Delete(sFilePath)
End If
Dim oStreamWriter As New System.IO.StreamWriter(sFilePath, True)
oStreamWriter.Write(sHistogramHtml)
oStreamWriter.Close()
System.Diagnostics.Process.Start(sFilePath)
End If
End Sub
Sub ProccessFolder(ByVal sFolderPath As String)
Dim sFileType As String = cbFileType.SelectedItem
Dim iRequiredDensity As Integer = GetRequiredDensity()
Dim sAction As String = cbAction.SelectedItem
Dim iMargin As Integer = 0
Try
iMargin = IIf(IsNumeric(txtIgnoreMargin.Text), txtIgnoreMargin.Text, 0)
Catch ex As Exception
End Try
Dim oFiles As String() = Directory.GetFiles(sFolderPath)
ProgressBar1.Maximum = oFiles.Length
For i As Integer = 0 To oFiles.Length - 1
Dim sFromFilePath As String = oFiles(i)
Dim oFileInfo As New FileInfo(sFromFilePath)
Dim sExt As String = PadExt(oFileInfo.Extension)
If sFileType = "All" OrElse sExt = sFileType Then
Dim dTime As DateTime = DateTime.Now
Dim iPercent As Integer = GetWhitePecent(sFromFilePath, iMargin)
Dim iSecondsToReadFile As Integer = (DateTime.Now).Subtract(dTime).TotalSeconds()
txtOutput.Text += sFromFilePath & vbTab & " has " & iPercent.ToString("0.00") & "% of white pixels. File was read in " & iSecondsToReadFile & " seconds." & vbCrLf
If iPercent >= iRequiredDensity Then
Select Case sAction
Case "Delete blank images"
oFileInfo.Delete()
Case "Add _deleted to the filename of a blank image"
Dim sNewFileName As String = IO.Path.GetFileNameWithoutExtension(sFromFilePath) & "_deleted." & sExt
Dim sNewFilePath As String = sFolderPath & "\" & sNewFileName
If IO.File.Exists(sNewFilePath) Then
txtOutput.Text += "File " & oFileInfo.Name & " cannot be renamed to " & sNewFileName & " because file with this name already exists." & vbCrLf
Else
oFileInfo.MoveTo(sNewFilePath)
End If
Case "Place blank images to to_be_deleted folder"
Dim sTempFolderPath As String = sFolderPath & "\to_be_deleted"
If IO.Directory.Exists(sTempFolderPath) = False Then
IO.Directory.CreateDirectory(sTempFolderPath)
End If
If IO.File.Exists(sTempFolderPath & "\" & oFileInfo.Name) Then
txtOutput.Text += "File " & oFileInfo.Name & " cannot be moved to " & sTempFolderPath & " because file with this name already exists there." & vbCrLf
Else
oFileInfo.MoveTo(sTempFolderPath & "\" & oFileInfo.Name)
End If
End Select
End If
End If
ProgressBar1.Value = i
Next
ProgressBar1.Value = 0
If chkSubfolders.Checked Then
Dim oFolders As String() = Directory.GetDirectories(sFolderPath)
For i As Integer = 0 To oFolders.Length - 1
Dim sChildFolder As String = oFolders(i)
Dim iPos As Integer = sChildFolder.LastIndexOf("\")
Dim sFolderName As String = sChildFolder.Substring(iPos + 1)
If sFolderName <> "to_be_deleted" Then
ProccessFolder(sChildFolder)
End If
Next
End If
End Sub
Private Function PadExt(ByVal s As String) As String
s = UCase(s)
If s.Length > 3 Then
s = s.Substring(1, 3)
End If
Return s
End Function
Dim oKnownColorLookup As New Hashtable
Private Function GetColor2(ByVal oColor As Color) As String
If oKnownColorLookup.ContainsKey(oColor) Then
Return oKnownColorLookup(oColor)
End If
Dim iColorDistance As Long = 10000000
Dim oColors As ArrayList = GetWebSafeColors()
For Each sColor As String In oColors
Dim iDistance As Integer = CompareColors(ColorTranslator.FromHtml("#" & sColor), oColor)
If iDistance < iColorDistance Then iColorDistance = iDistance
Next
For Each sColor As String In oColors
Dim iDistance As Integer = CompareColors(ColorTranslator.FromHtml("#" & sColor), oColor)
If iDistance = iColorDistance Then
oKnownColorLookup.Add(oColor, sColor)
Return sColor
End If
Next
Return ""
End Function
Public Shared Function CompareColors(ByVal a As Color, ByVal b As Color) As Integer
Return Math.Abs(CInt(a.R) - CInt(b.R)) + Math.Abs(CInt(a.G) - CInt(b.G)) + Math.Abs(CInt(a.B) - CInt(b.B))
End Function
Dim sHistogramHtml As String = ""
Private Function GetMostCommonColor(ByRef oBitmap As Bitmap, ByVal sFilePath As String) As String
Dim oColorHistogram As New Hashtable
Dim iTotalCount As Integer = 0
For x As Integer = 0 To oBitmap.Width - 1 - iPixelSize Step iPixelSize
For y As Integer = 0 To oBitmap.Height - 1 - iPixelSize Step iPixelSize
Dim oColor As Color = GetColorPixel(oBitmap, x, y)
Dim sColor As String = GetColor2(oColor)
If oColorHistogram.ContainsKey(sColor) Then
oColorHistogram(sColor) = CInt(oColorHistogram(sColor)) + 1
Else
oColorHistogram.Add(sColor, 1)
End If
iTotalCount += 1
Next
Next
Dim oDataView As DataView = SortHashtable(oColorHistogram)
If chkShowHistogram.Checked Then
sHistogramHtml += "<h2>Histogram for " & sFilePath & "</h2>" & vbCrLf
sHistogramHtml += "<table border=1>" & vbCrLf
For iRow As Long = 0 To oDataView.Count - 1
Dim sKey As String = oDataView(iRow)("key")
Dim sValue As String = oDataView(iRow)("value")
Dim iPecent As Double = (CDbl(sValue) / iTotalCount) * 100
If iPecent > 1 Then
sHistogramHtml += vbTab & "<tr><td bgcolor='" & sKey & "'> </td><td>" & sKey & "</td><td>"
sHistogramHtml += Math.Round(iPecent, 2) & "%</td></tr>" & vbCrLf
End If
Next
sHistogramHtml += "</table>" & vbCrLf
End If
If oColorHistogram.Count > 0 Then
Dim sKey As String = oDataView(0)("key")
Dim sValue As String = oDataView(0)("value")
Return sKey
End If
Return ""
End Function
Public Function MakeGrayscale(ByRef original As Bitmap) As Bitmap
Dim destWidth As Integer = original.Width
Dim destHeight As Integer = original.Height
If chkResize.Checked Then
Dim Width As Integer = 200
Dim Height As Integer = 200
Dim sourceWidth As Integer = destWidth
Dim sourceHeight As Integer = destHeight
Dim nPercent As Single = 0
Dim nPercentW As Single = 0
Dim nPercentH As Single = 0
nPercentW = (CSng(Width) / CSng(sourceWidth))
nPercentH = (CSng(Height) / CSng(sourceHeight))
If nPercentH < nPercentW Then
nPercent = nPercentH
Else
nPercent = nPercentW
End If
destWidth = CInt((sourceWidth * nPercent))
destHeight = CInt((sourceHeight * nPercent))
End If
Dim newBitmap As Bitmap = New Bitmap(destWidth, destHeight)
Dim g As Graphics = Graphics.FromImage(newBitmap)
Dim colorMatrix As ColorMatrix = New ColorMatrix(New Single()() {New Single() {0.3F, 0.3F, 0.3F, 0, 0}, New Single() {0.59F, 0.59F, 0.59F, 0, 0}, New Single() {0.11F, 0.11F, 0.11F, 0, 0}, New Single() {0, 0, 0, 1, 0}, New Single() {0, 0, 0, 0, 1}})
Dim attributes As ImageAttributes = New ImageAttributes()
attributes.SetColorMatrix(colorMatrix)
g.DrawImage(original, New Rectangle(0, 0, destWidth, destHeight), 0, 0, original.Width, original.Height, GraphicsUnit.Pixel, attributes)
g.Dispose()
original.Dispose()
Return newBitmap
End Function
Private Function GetWhitePecent(ByVal sFilePath As String, ByVal iMargin As Integer) As Integer
Dim oBitmap As Bitmap = New Bitmap(sFilePath)
oBitmap = MakeGrayscale(oBitmap)
Dim iTotalCount As Long = 0
Dim iCount As Long = 0
Dim sMostCommonColor As String = GetMostCommonColor(oBitmap, sFilePath)
Dim iWidth As Integer = oBitmap.Width
Dim iHeight As Integer = oBitmap.Height
For x As Integer = iMargin To iWidth - 1 - iPixelSize - iMargin Step iPixelSize
For y As Integer = iMargin To iHeight - 1 - iPixelSize - iMargin Step iPixelSize
Dim oColor As Color = GetColorPixel(oBitmap, x, y)
If GetColor2(oColor) = sMostCommonColor Then
iCount += 1
End If
iTotalCount += 1
Next
Next
If iTotalCount = 0 Then
MsgBox("Magin is too large " & txtIgnoreMargin.Text)
txtIgnoreMargin.Text = "0"
Return 0
End If
Return (CDbl(iCount) / iTotalCount) * 100
End Function
Private Function GetColorPixel(ByRef oBitmap As Bitmap, ByVal x As Integer, ByVal y As Integer) As Color
If iPixelSize = 1 Then Return oBitmap.GetPixel(x, y)
Dim iA As Integer = 0
Dim iR As Integer = 0
Dim iG As Integer = 0
Dim iB As Integer = 0
Dim iCount As Integer = 0
For iX As Integer = 0 To iPixelSize
For iY As Integer = 0 To iPixelSize
Dim oColor As Color = oBitmap.GetPixel(x + iX, y + iY)
iA += CInt(oColor.A)
iR += CInt(oColor.R)
iG += CInt(oColor.G)
iB += CInt(oColor.B)
iCount += 1
Next
Next
Return Color.FromArgb((iA / iCount), (iR / iCount), (iG / iCount), (iB / iCount))
End Function
Private Function SortHashtable(ByVal oHash As Hashtable) As DataView
Dim oTable As New Data.DataTable
oTable.Columns.Add(New Data.DataColumn("key", System.Type.GetType("System.String")))
oTable.Columns.Add(New Data.DataColumn("value", System.Type.GetType("System.Int32")))
For Each oEntry As Collections.DictionaryEntry In oHash
Dim oDataRow As DataRow = oTable.NewRow()
oDataRow("key") = oEntry.Key
oDataRow("value") = oEntry.Value
oTable.Rows.Add(oDataRow)
Next
Dim oDataView As DataView = New DataView(oTable)
oDataView.Sort = "value DESC"
Return oDataView
End Function
Private Sub btnFromFolder_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btnFromFolder.Click
fldFrom.ShowDialog()
txtFrom.Text = fldFrom.SelectedPath
End Sub
Private Function GetWebSafeColors() As ArrayList
Dim o As New ArrayList
o.Add("000000")
o.Add("000033")
o.Add("000066")
o.Add("000099")
o.Add("0000cc")
o.Add("0000ff")
o.Add("003300")
o.Add("003333")
o.Add("003366")
o.Add("003399")
o.Add("0033cc")
o.Add("0033ff")
o.Add("006600")
o.Add("006633")
o.Add("006666")
o.Add("006699")
o.Add("0066cc")
o.Add("0066ff")
o.Add("009900")
o.Add("009933")
o.Add("009966")
o.Add("009999")
o.Add("0099cc")
o.Add("0099ff")
o.Add("00cc00")
o.Add("00cc33")
o.Add("00cc66")
o.Add("00cc99")
o.Add("00cccc")
o.Add("00ccff")
o.Add("00ff00")
o.Add("00ff33")
o.Add("00ff66")
o.Add("00ff99")
o.Add("00ffcc")
o.Add("00ffff")
o.Add("330000")
o.Add("330033")
o.Add("330066")
o.Add("330099")
o.Add("3300cc")
o.Add("3300ff")
o.Add("333300")
o.Add("333333")
o.Add("333366")
o.Add("333399")
o.Add("3333cc")
o.Add("3333ff")
o.Add("336600")
o.Add("336633")
o.Add("336666")
o.Add("336699")
o.Add("3366cc")
o.Add("3366ff")
o.Add("339900")
o.Add("339933")
o.Add("339966")
o.Add("339999")
o.Add("3399cc")
o.Add("3399ff")
o.Add("33cc00")
o.Add("33cc33")
o.Add("33cc66")
o.Add("33cc99")
o.Add("33cccc")
o.Add("33ccff")
o.Add("33ff00")
o.Add("33ff33")
o.Add("33ff66")
o.Add("33ff99")
o.Add("33ffcc")
o.Add("33ffff")
o.Add("660000")
o.Add("660033")
o.Add("660066")
o.Add("660099")
o.Add("6600cc")
o.Add("6600ff")
o.Add("663300")
o.Add("663333")
o.Add("663366")
o.Add("663399")
o.Add("6633cc")
o.Add("6633ff")
o.Add("666600")
o.Add("666633")
o.Add("666666")
o.Add("666699")
o.Add("6666cc")
o.Add("6666ff")
o.Add("669900")
o.Add("669933")
o.Add("669966")
o.Add("669999")
o.Add("6699cc")
o.Add("6699ff")
o.Add("66cc00")
o.Add("66cc33")
o.Add("66cc66")
o.Add("66cc99")
o.Add("66cccc")
o.Add("66ccff")
o.Add("66ff00")
o.Add("66ff33")
o.Add("66ff66")
o.Add("66ff99")
o.Add("66ffcc")
o.Add("66ffff")
o.Add("990000")
o.Add("990033")
o.Add("990066")
o.Add("990099")
o.Add("9900cc")
o.Add("9900ff")
o.Add("993300")
o.Add("993333")
o.Add("993366")
o.Add("993399")
o.Add("9933cc")
o.Add("9933ff")
o.Add("996600")
o.Add("996633")
o.Add("996666")
o.Add("996699")
o.Add("9966cc")
o.Add("9966ff")
o.Add("999900")
o.Add("999933")
o.Add("999966")
o.Add("999999")
o.Add("9999cc")
o.Add("9999ff")
o.Add("99cc00")
o.Add("99cc33")
o.Add("99cc66")
o.Add("99cc99")
o.Add("99cccc")
o.Add("99ccff")
o.Add("99ff00")
o.Add("99ff33")
o.Add("99ff66")
o.Add("99ff99")
o.Add("99ffcc")
o.Add("99ffff")
o.Add("cc0000")
o.Add("cc0033")
o.Add("cc0066")
o.Add("cc0099")
o.Add("cc00cc")
o.Add("cc00ff")
o.Add("cc3300")
o.Add("cc3333")
o.Add("cc3366")
o.Add("cc3399")
o.Add("cc33cc")
o.Add("cc33ff")
o.Add("cc6600")
o.Add("cc6633")
o.Add("cc6666")
o.Add("cc6699")
o.Add("cc66cc")
o.Add("cc66ff")
o.Add("cc9900")
o.Add("cc9933")
o.Add("cc9966")
o.Add("cc9999")
o.Add("cc99cc")
o.Add("cc99ff")
o.Add("cccc00")
o.Add("cccc33")
o.Add("cccc66")
o.Add("cccc99")
o.Add("cccccc")
o.Add("ccccff")
o.Add("ccff00")
o.Add("ccff33")
o.Add("ccff66")
o.Add("ccff99")
o.Add("ccffcc")
o.Add("ccffff")
o.Add("ff0000")
o.Add("ff0033")
o.Add("ff0066")
o.Add("ff0099")
o.Add("ff00cc")
o.Add("ff00ff")
o.Add("ff3300")
o.Add("ff3333")
o.Add("ff3366")
o.Add("ff3399")
o.Add("ff33cc")
o.Add("ff33ff")
o.Add("ff6600")
o.Add("ff6633")
o.Add("ff6666")
o.Add("ff6699")
o.Add("ff66cc")
o.Add("ff66ff")
o.Add("ff9900")
o.Add("ff9933")
o.Add("ff9966")
o.Add("ff9999")
o.Add("ff99cc")
o.Add("ff99ff")
o.Add("ffcc00")
o.Add("ffcc33")
o.Add("ffcc66")
o.Add("ffcc99")
o.Add("ffcccc")
o.Add("ffccff")
o.Add("ffff00")
o.Add("ffff33")
o.Add("ffff66")
o.Add("ffff99")
o.Add("ffffcc")
o.Add("ffffff")
Return o
End Function
Private Sub chkResize_CheckedChanged(sender As Object, e As EventArgs) Handles chkResize.CheckedChanged
HideSampleMargin()
End Sub
Private Sub chkGrayscale_CheckedChanged(sender As Object, e As EventArgs)
HideSampleMargin()
End Sub
Private Sub HideSampleMargin()
If chkResize.Checked = False = False Then
txtIgnoreMargin.Text = "200"
Else
txtIgnoreMargin.Text = "3"
End If
End Sub
End Class
Public Class AppSetting
Private oDS As New Data.DataSet
Private oTable As New Data.DataTable
Private sFilePath As String = ""
Public Sub New()
Dim sAssPath As String = System.Reflection.Assembly.GetExecutingAssembly().Location
Dim sPath As String = System.IO.Path.GetDirectoryName(sAssPath)
sFilePath = System.IO.Path.Combine(sPath, "Settings.xml")
LoadData()
End Sub
Private Sub LoadData()
oDS = New Data.DataSet()
If System.IO.File.Exists(sFilePath) Then
oDS.ReadXml(sFilePath)
If oDS.Tables.Count > 0 Then
oTable = oDS.Tables(0)
Exit Sub
End If
End If
oTable = New Data.DataTable()
oTable.Columns.Add(New Data.DataColumn("key"))
oTable.Columns.Add(New Data.DataColumn("value"))
oDS.Tables.Add(oTable)
End Sub
Public Sub SaveData()
oTable.DataSet.WriteXml(sFilePath, XmlWriteMode.WriteSchema)
End Sub
Public Sub SetValue(ByVal sKey As String, ByVal sValue As String)
Dim oDataRow As DataRow
Dim oDataRows As DataRow() = oTable.Select("key = '" & Replace(sKey, "'", "''") & "'")
If oDataRows.Length > 0 Then
oDataRows(0)("value") = sValue
Else
oDataRow = oTable.NewRow()
oDataRow("key") = sKey
oDataRow("value") = sValue
oTable.Rows.Add(oDataRow)
End If
End Sub
Public Function GetValue(ByVal sKey As String) As String
Dim oDataRows As DataRow() = oTable.Select("key = '" & Replace(sKey, "'", "''") & "'")
If oDataRows.Length > 0 Then
Return oDataRows(0)("value") & ""
End If
Return ""
End Function
Public Function GetValueDef(ByVal sKey As String, ByVal sDefVal As String) As String
Dim sValue As String = GetValue(sKey)
If sValue <> "" Then
Return sValue
End If
Return sDefVal
End Function
End Class
History
- 27th October, 2014: Initial version
Igor is a business intelligence consultant working in Tampa, Florida. He has a BS in Finance from University of South Carolina and Masters in Information Management System from University of South Florida. He also has following professional certifications: MCSD, MCDBA, MCAD.