Click here to Skip to main content
Rate this: bad
good
Please Sign up or sign in to vote.
See more: WinXP C# .NET4
Hello, everybody. The code below, works very slowly, about 10 seconds per iteration. How to improve him?
 
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
 
namespace Compare
{
    public class Comparer
    {
        public static List<string> CompareLength(string file, List<string> arr)
        {
            List<string> result = new List<string>();
            foreach (string item in arr)
                if (new FileInfo(item).Length == new FileInfo(file).Length)
                    result.Add(item);
            return result;
        }
        public static void CompareFilesRec(List<string> array)
        {
            List<string> outp = new List<string>();
 
            int i = 0;
            while (i < array.Count)
            {
                List<string> fromAll = Comparer.CompareLength(array[i], array);
                string chosen = array[i];
                if (fromAll.Count != 0)
                    foreach (var f2 in fromAll)
                    {
                        if (Checksum.CompareHash(chosen, f2))
                        {
                            outp.Add(f2);
                            array.Remove(f2);
                        }
                        else { i++; }
 
                    }
                Comparer.ShowResults(outp);
                outp.Clear();
            }
        }
        public static void ShowResults(List<string> results)
        {
            if (results.Count >= 2)
            {
                foreach (var element in results)
                {
                    Console.WriteLine(element);
                }
                Console.WriteLine();
            }
        }
    }
}
 
Checksum.CompareHash
        static public bool CompareHash(string file1, string file2)
        {
            long FirstSize = new FileInfo(file1).Length;
            long SecondSize = new FileInfo(file2).Length;
 
            byte[] buffer1 = new byte[FirstSize];
            byte[] buffer2 = new byte[SecondSize];
 
            try
            {
                using (var mmf1 = MemoryMappedFile.CreateFromFile(file1, FileMode.OpenOrCreate, null, 0))
                using (var mmf2 = MemoryMappedFile.CreateFromFile(file2, FileMode.OpenOrCreate, null, 0))
                using (var reader1 = mmf1.CreateViewStream())
                using (var reader2 = mmf2.CreateViewStream())
                {
                    try
                    {
                        reader1.Read(buffer1, 0, (int)FirstSize);
                        reader2.Read(buffer2, 0, (int)SecondSize);
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine("Error: {0}", ex.Message);
                    }
 

                    if (buffer1.Length != buffer2.Length || Crc8.ComputeChecksum(buffer1) != Crc8.ComputeChecksum(buffer2))
                        return false;
                }
            }
            catch (IOException) { }
            catch (UnauthorizedAccessException) { }
            return true;
        }
 

So, I realized that solution, check it:
namespace Compare
{
    public class Comparer
    {
        public static List<string> CompareLength(string file, List<string> arr)
        {
            List<string> result = new List<string>();
            foreach (string item in arr)
                if (new FileInfo(item).Length == new FileInfo(file).Length && Checksum.CompareHash(file,item))//here I added hash check. To avoid another loop or something.
                    result.Add(item);
            return result;
        }
        public static void CompareFilesRec(List<string> array)
        {
            Dictionary<long, List<string>> di = new Dictionary<long,List<string>>();
 
            int i = 0;
            while (i < array.Count)
            {
                long size = new FileInfo(array[i]).Length;
 
                List<string> fromAll = Comparer.CompareLength(array[i], array);
 
                if (di.ContainsKey(size))
                {
                    fromAll.Clear();
                    array.Remove(array[i]);
                }
                else
                {
                    di.Add(size, fromAll);
                    i++;
                }
            }
            ShowResults(di);
        }
        public static void ShowResults(Dictionary<long, List<string>> dic)
        {
            foreach(var element in dic)
            {
                foreach (var outer in element.Value)
                {
                    Console.WriteLine(outer);
                }
                Console.WriteLine();
            }
        }
    }
}
 
Also i changed the hashing function:
namespace Compare
{
    public class Checksum
    {
        static public bool CompareHash(string file1, string file2)
        {
            long FirstSize = new FileInfo(file1).Length;
            long SecondSize = new FileInfo(file2).Length;
 
            SHA256 sha1 = SHA256Managed.Create();
            SHA256 sha2 = SHA256Managed.Create();
 
            byte[] buffer1 = new byte[FirstSize];
            byte[] buffer2 = new byte[SecondSize];
 
            try
            {
                using (var mmf1 = MemoryMappedFile.CreateFromFile(file1, FileMode.OpenOrCreate, null, 0))
                using (var mmf2 = MemoryMappedFile.CreateFromFile(file2, FileMode.OpenOrCreate, null, 0))
                using (var reader1 = mmf1.CreateViewStream())
                using (var reader2 = mmf2.CreateViewStream())
                {
                    try
                    {
                        reader1.Read(buffer1, 0, (int)FirstSize);
                        reader2.Read(buffer2, 0, (int)SecondSize);
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine("Error: {0}", ex.Message);
                    }
 

                    if (buffer1.Length != buffer2.Length && sha1.ComputeHash(buffer1) != sha2.ComputeHash(buffer2))
                        return false;
                }
            }
            catch (IOException) { }
            catch (UnauthorizedAccessException) { }
            return true;
        }
    }
}
 
Please, tell me my errors in code above...
Posted 3-Apr-13 10:23am
Je7511
Edited 6-Apr-13 5:55am
v4
Comments
Sergey Alexandrovich Kryukov at 3-Apr-13 15:43pm
   
Please, what's the purpose of comparison? Same exact content or not? Or something else?
 
Of course, there are some problems already apparent. Why comparing hash? Why passing file names to length comparison? You are wasting some time on it.
 
—SA
Je7 at 3-Apr-13 15:47pm
   
Yes, same exact content is the purpose of comparison.
 
The logic: we receive simple list of files -> get element from list -> compare chosen's file length with another files, with the same length -> collect similar files and output them.
 
What you can advice to do?
Sergey Alexandrovich Kryukov at 3-Apr-13 16:03pm
   
No, "similar" is not "exact same content". Can you strictly formulate it. Do you want to sort the list into sets of identical files... It's hard to understand, what exactly.
Anyway, in your code, there is an huge amount of repeated operations: getting file info again and again, passing file names instead of some data. Where is Checksum.CompareHash?..
—SA
Je7 at 3-Apr-13 16:23pm
   
Ok. I need "exact same content". Checksum.CompareHash added above.
Matt T Heffron at 3-Apr-13 16:25pm
   
You didn't complete the edit of the question above. we don't see the Checksum.CompareHash...
[EDIT] Now it's there.[/EDIT]
Matt T Heffron at 3-Apr-13 16:34pm
   
You *should* at this point already know the lengths match for two files checking the Checksum, so that is redundant (but trivial). An 8bit CRC (I'm assuming Crc8 is 8 bits, not 8 bytes) is not going to be a very good basis for declaring two files are identical. (There's a 1 in 256 chance of "false positive".) Try System.Security.Cryptography.SHA256 or SHA512
Je7 at 3-Apr-13 16:48pm
   
I have CRC8, yes, and 8 means 8 bits...so, i need to change it, and what did you mean, that I should know the lengths match for two files checking the Checksum?
Matt T Heffron at 3-Apr-13 16:51pm
   
When you call Checksum.CompareHash in your code, it is between files that you have already determined have the same lengths. This will not be true in the "general case", but it is in your case. Anyway, my solution outlines a way with (almost) no redundant work.
Je7 at 3-Apr-13 17:01pm
   
Ok, I'll try to increase performance, using your solution. It is very interesting way.

1 solution

Rate this: bad
good
Please Sign up or sign in to vote.

Solution 1

No code, just suggestions/hints.
Ok, first remove redundant work. (There's plenty of it.)
 
Create a Dictionary<ulong, List<string>>
Make one pass through the list of all filenames.
For each filename, get the file length.
Use the .TryGetValue to get the list of files with that length.
If there is no such list, then create a new List<string> and insert it into the Dictionary.
Add the filename to the list.
 
After all of the filenames have been "processed".
Go through all of the .Values in the Dictionary.
If the length is >1 then perform a similar process on each filename in the list, with the checksum/hash as the Dictionary key instead of the length (use a different Dictionary).
All of the lists in this Dictionary with more than one entry are sets of identical files.
 
Length checking is done only once per file and the checksum/hash is computed only once per file (and only for the files that could possibly be matched).
  Permalink  
v4
Comments
Sergey Alexandrovich Kryukov at 3-Apr-13 16:45pm
   
Yes, there is too much of redundancy, but finding an near-optimum implementation requires good thinking and work. Your suggestions make sense, my 5.
—SA
Matt T Heffron at 3-Apr-13 16:48pm
   
Thanks.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

  Print Answers RSS
0 George Jonsson 175
1 Kornfeld Eliyahu Peter 169
2 Zoltán Zörgő 139
3 PIEBALDconsult 130
4 OriginalGriff 120
0 OriginalGriff 6,165
1 DamithSL 4,658
2 Maciej Los 4,107
3 Kornfeld Eliyahu Peter 3,649
4 Sergey Alexandrovich Kryukov 3,342


Advertise | Privacy | Mobile
Web02 | 2.8.141220.1 | Last Updated 6 Apr 2013
Copyright © CodeProject, 1999-2014
All Rights Reserved. Terms of Service
Layout: fixed | fluid

CodeProject, 503-250 Ferrand Drive Toronto Ontario, M3C 3G8 Canada +1 416-849-8900 x 100