Click here to Skip to main content
12,064,498 members (48,547 online)
Rate this:
 
Please Sign up or sign in to vote.
See more: WinXP C# Windows .NET .NET4
Hello, everybody. The code below, works very slowly, about 10 seconds per iteration. How to improve him?

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
 
namespace Compare
{
    public class Comparer
    {
        public static List<string> CompareLength(string file, List<string> arr)
        {
            List<string> result = new List<string>();
            foreach (string item in arr)
                if (new FileInfo(item).Length == new FileInfo(file).Length)
                    result.Add(item);
            return result;
        }
        public static void CompareFilesRec(List<string> array)
        {
            List<string> outp = new List<string>();
 
            int i = 0;
            while (i < array.Count)
            {
                List<string> fromAll = Comparer.CompareLength(array[i], array);
                string chosen = array[i];
                if (fromAll.Count != 0)
                    foreach (var f2 in fromAll)
                    {
                        if (Checksum.CompareHash(chosen, f2))
                        {
                            outp.Add(f2);
                            array.Remove(f2);
                        }
                        else { i++; }
 
                    }
                Comparer.ShowResults(outp);
                outp.Clear();
            }
        }
        public static void ShowResults(List<string> results)
        {
            if (results.Count >= 2)
            {
                foreach (var element in results)
                {
                    Console.WriteLine(element);
                }
                Console.WriteLine();
            }
        }
    }
}

Checksum.CompareHash
static public bool CompareHash(string file1, string file2)
{
    long FirstSize = new FileInfo(file1).Length;
    long SecondSize = new FileInfo(file2).Length;
 
    byte[] buffer1 = new byte[FirstSize];
    byte[] buffer2 = new byte[SecondSize];
 
    try
    {
        using (var mmf1 = MemoryMappedFile.CreateFromFile(file1, FileMode.OpenOrCreate, null, 0))
        using (var mmf2 = MemoryMappedFile.CreateFromFile(file2, FileMode.OpenOrCreate, null, 0))
        using (var reader1 = mmf1.CreateViewStream())
        using (var reader2 = mmf2.CreateViewStream())
        {
            try
            {
                reader1.Read(buffer1, 0, (int)FirstSize);
                reader2.Read(buffer2, 0, (int)SecondSize);
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error: {0}", ex.Message);
            }
 

            if (buffer1.Length != buffer2.Length || Crc8.ComputeChecksum(buffer1) != Crc8.ComputeChecksum(buffer2))
                return false;
        }
    }
    catch (IOException) { }
    catch (UnauthorizedAccessException) { }
    return true;
}


So, I realized that solution, check it:
namespace Compare
{
    public class Comparer
    {
        public static List<string> CompareLength(string file, List<string> arr)
        {
            List<string> result = new List<string>();
            foreach (string item in arr)
                if (new FileInfo(item).Length == new FileInfo(file).Length && Checksum.CompareHash(file,item))//here I added hash check. To avoid another loop or something.
                    result.Add(item);
            return result;
        }
        public static void CompareFilesRec(List<string> array)
        {
            Dictionary<long, List<string>> di = new Dictionary<long,List<string>>();
 
            int i = 0;
            while (i < array.Count)
            {
                long size = new FileInfo(array[i]).Length;
 
                List<string> fromAll = Comparer.CompareLength(array[i], array);
 
                if (di.ContainsKey(size))
                {
                    fromAll.Clear();
                    array.Remove(array[i]);
                }
                else
                {
                    di.Add(size, fromAll);
                    i++;
                }
            }
            ShowResults(di);
        }
        public static void ShowResults(Dictionary<long, List<string>> dic)
        {
            foreach(var element in dic)
            {
                foreach (var outer in element.Value)
                {
                    Console.WriteLine(outer);
                }
                Console.WriteLine();
            }
        }
    }
}

Also i changed the hashing function:
namespace Compare
{
    public class Checksum
    {
        static public bool CompareHash(string file1, string file2)
        {
            long FirstSize = new FileInfo(file1).Length;
            long SecondSize = new FileInfo(file2).Length;
 
            SHA256 sha1 = SHA256Managed.Create();
            SHA256 sha2 = SHA256Managed.Create();
 
            byte[] buffer1 = new byte[FirstSize];
            byte[] buffer2 = new byte[SecondSize];
 
            try
            {
                using (var mmf1 = MemoryMappedFile.CreateFromFile(file1, FileMode.OpenOrCreate, null, 0))
                using (var mmf2 = MemoryMappedFile.CreateFromFile(file2, FileMode.OpenOrCreate, null, 0))
                using (var reader1 = mmf1.CreateViewStream())
                using (var reader2 = mmf2.CreateViewStream())
                {
                    try
                    {
                        reader1.Read(buffer1, 0, (int)FirstSize);
                        reader2.Read(buffer2, 0, (int)SecondSize);
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine("Error: {0}", ex.Message);
                    }
 

                    if (buffer1.Length != buffer2.Length && sha1.ComputeHash(buffer1) != sha2.ComputeHash(buffer2))
                        return false;
                }
            }
            catch (IOException) { }
            catch (UnauthorizedAccessException) { }
            return true;
        }
    }
}

Please, tell me my errors in code above...
Posted 3-Apr-13 10:23am
Je7515
Edited 6-Apr-13 5:55am
v4
Comments
   
Please, what's the purpose of comparison? Same exact content or not? Or something else?

Of course, there are some problems already apparent. Why comparing hash? Why passing file names to length comparison? You are wasting some time on it.

—SA
Je7 3-Apr-13 15:47pm
   
Yes, same exact content is the purpose of comparison.

The logic: we receive simple list of files -> get element from list -> compare chosen's file length with another files, with the same length -> collect similar files and output them.

What you can advice to do?
   
No, "similar" is not "exact same content". Can you strictly formulate it. Do you want to sort the list into sets of identical files... It's hard to understand, what exactly.
Anyway, in your code, there is an huge amount of repeated operations: getting file info again and again, passing file names instead of some data. Where is Checksum.CompareHash?..
—SA
Je7 3-Apr-13 16:23pm
   
Ok. I need "exact same content". Checksum.CompareHash added above.
Matt T Heffron 3-Apr-13 16:25pm
   
You didn't complete the edit of the question above. we don't see the Checksum.CompareHash...
[EDIT] Now it's there.[/EDIT]
Matt T Heffron 3-Apr-13 16:34pm
   
You *should* at this point already know the lengths match for two files checking the Checksum, so that is redundant (but trivial). An 8bit CRC (I'm assuming Crc8 is 8 bits, not 8 bytes) is not going to be a very good basis for declaring two files are identical. (There's a 1 in 256 chance of "false positive".) Try System.Security.Cryptography.SHA256 or SHA512
Je7 3-Apr-13 16:48pm
   
I have CRC8, yes, and 8 means 8 bits...so, i need to change it, and what did you mean, that I should know the lengths match for two files checking the Checksum?
Matt T Heffron 3-Apr-13 16:51pm
   
When you call Checksum.CompareHash in your code, it is between files that you have already determined have the same lengths. This will not be true in the "general case", but it is in your case. Anyway, my solution outlines a way with (almost) no redundant work.
Je7 3-Apr-13 17:01pm
   
Ok, I'll try to increase performance, using your solution. It is very interesting way.

1 solution

Rate this: bad
 
good
Please Sign up or sign in to vote.

Solution 1

No code, just suggestions/hints.
Ok, first remove redundant work. (There's plenty of it.)

Create a Dictionary<ulong, List<string>>
Make one pass through the list of all filenames.
For each filename, get the file length.
Use the .TryGetValue to get the list of files with that length.
If there is no such list, then create a new List<string> and insert it into the Dictionary.
Add the filename to the list.

After all of the filenames have been "processed".
Go through all of the .Values in the Dictionary.
If the length is >1 then perform a similar process on each filename in the list, with the checksum/hash as the Dictionary key instead of the length (use a different Dictionary).
All of the lists in this Dictionary with more than one entry are sets of identical files.

Length checking is done only once per file and the checksum/hash is computed only once per file (and only for the files that could possibly be matched).
  Permalink  
v4
Comments
   
Yes, there is too much of redundancy, but finding an near-optimum implementation requires good thinking and work. Your suggestions make sense, my 5.
—SA
Matt T Heffron 3-Apr-13 16:48pm
   
Thanks.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

  Print Answers RSS
Top Experts
Last 24hrsThis month


Advertise | Privacy | Mobile
Web02 | 2.8.160204.4 | Last Updated 6 Apr 2013
Copyright © CodeProject, 1999-2016
All Rights Reserved. Terms of Service
Layout: fixed | fluid

CodeProject, 503-250 Ferrand Drive Toronto Ontario, M3C 3G8 Canada +1 416-849-8900 x 100