Click here to Skip to main content
Rate this: bad
good
Please Sign up or sign in to vote.
See more: WinXP C# Visual-Studio
The recource using is almost good(10-20% CPU, RAM starts from 8Mb and grows), file comparer is normal, but program is slow on 7000+ files. About 20-30 minutes. How optimize? Maybe i shouldn't use recursive method in functions? Maybe the structure is not optimized? Please, help.
using System;
using System.Collections.Generic;
using System.IO;
using System.Diagnostics;
 
namespace ConsoleApplication3
{
    class Comparer
    {
        static void Main(string[] args)
        {
            Stopwatch s1 = new Stopwatch();
            s1.Start();
            switch (args.Length)
            {
                case 1:
                    {
                        CompareFilesRec(LookIn(args[0]));
                        s1.Stop();
                        break;
                    }
                default:
                    {
                        Console.WriteLine("Type only one argument.");
                        break;
                    }
 
            }
            
            Console.WriteLine("{0} ms", s1.ElapsedMilliseconds);
        }
 
        static bool CheckFile(string file)
        {
            FileInfo someFileInfo = new FileInfo(file);//берем інфу про файл в змінну someFileInfo
            if (someFileInfo.Length >= 2147483648 || someFileInfo.Length < 1)//перевірка
                return false;
            else return true;
        }
        static bool CheckDirEmpty(string dir)
        {
            DirectoryInfo someDir = new DirectoryInfo(dir);
            if (someDir.GetFiles().Length > 0)
                return false;
            else return true;
        }
 
        static List<string> LookIn(string path)
        {
            Stopwatch s1 = new Stopwatch();
            s1.Start();
            /*Ініціалізую і об*являю два списки:для файлів і папок*/
            List<string> files = new List<string>();
            List<string> dirs = new List<string>();
 
            /*Шукаєм всі доступні файли*/
            try
            {
                /*Добавляю знайдені папки і файли в список*/
                files.AddRange(Directory.GetFiles(path));
                dirs.AddRange(Directory.GetDirectories(path));    
            }
            catch (UnauthorizedAccessException) { }
            catch (DirectoryNotFoundException) { }
            catch (ArgumentOutOfRangeException) { }
            catch (IOException) { }
 
            for (int i = files.Count - 1; i >= 0; i--)
            {
                if (!CheckFile(files[i]))
                    files.RemoveAt(i);
            }
            for (int k = dirs.Count - 1; k >= 0; k--)
            {
                if(CheckDirEmpty(dirs[k]))
                    dirs.RemoveAt(k);
            }
            
            /*"Заглядаєм" за файлами в кожну директорію...*/
            foreach (string dir in dirs)
            {
                files.AddRange(LookIn(dir));//...і додаєм до списку
            }
            s1.Stop();
            Console.WriteLine("LookIN  = {0} ms", s1.ElapsedMilliseconds);
            return files;//повертаєм повний список знайдених файлів
        }
 
        static void CompareFilesRec(List<string> array)
        {
            Stopwatch s1 = new Stopwatch();
            s1.Start();
            List<KeyValuePair<long, string>> yeah = new List<KeyValuePair<long, string>>();
            List<string> Trash = new List<string>();
            List<string> outp = new List<string>();
 
            for (int j = array.Count-1; j >= 0; j--)
            {
                FileInfo fii = new FileInfo(array[j]);
                yeah.Add(new KeyValuePair<long, string>(fii.Length, array[j]));
            }
            array.Clear();
 
            foreach (var el in yeah)
            {
                if (!array.Contains(el.Value) && !Trash.Contains(el.Value))
                {
                    foreach (var ele in yeah)
                    {
                        if (el.Key == ele.Key && !array.Contains(ele.Value))
                        {
                            array.Add(ele.Value);
                        }
                        else
                        {
                            if (!Trash.Contains(ele.Value))
                            {
                                Trash.Add(ele.Value);
                            }
                        }
                    }
                }
            }
            yeah.Clear();
            /*TODO*/
            //byte crc = Crc8.ComputeChecksum(1, 2);
            int Arr = ComputeByteChecksum(array[0]);
            foreach (string f in array)
            {
                int File = ComputeByteChecksum(f);
                if (f != array[0] && Arr == File)
                    outp.Add(f);
                if (Arr != File)
                    if (!Trash.Contains(f))
                         Trash.Add(f);
            }
            outp.Add(array[0]);
            array.Clear();
            /*TODO end*/
            if (outp.Count > 1)
            {
                foreach (string fi in outp)
                {
                    Console.WriteLine(fi);
                }
                outp.Clear();
                Console.WriteLine();
            }
            
            if (Trash.Count > 1)
            {
                CompareFilesRec(Trash);
            }
            Trash.Clear();
            s1.Stop();
            Console.WriteLine("Comparing = {0} ms", s1.ElapsedMilliseconds);
        }
        static int ComputeByteChecksum(string path)
        {
            using (var reader = new BinaryReader(File.OpenRead(path)))
            {
                byte b1 = reader.ReadByte();
                reader.BaseStream.Position = reader.BaseStream.Position = reader.BaseStream.Length >> 1;
                byte b2 = reader.ReadByte();
                reader.BaseStream.Position = reader.BaseStream.Length - 1;
                byte b3 = reader.ReadByte();
                reader.Close();
                byte crc = Crc8.ComputeChecksum(1, 2);
                return Crc8.ComputeChecksum(b1, crc) + Crc8.ComputeChecksum(b2, crc) + Crc8.ComputeChecksum(b3, crc); 
            }
        }
    }
    public static class Crc8
    {
        static byte[] table = new byte[256];
 
        const byte poly = 0xd5;
 
        public static byte ComputeChecksum(params byte[] bytes)
        {
            byte crc = 0;
            if (bytes != null && bytes.Length > 0)
            {
                foreach (byte b in bytes)
                {
                    crc = table[crc ^ b];
                }
            }
            return crc;
        }
 
        static Crc8()
        {
            for (int i = 0; i < 256; ++i)
            {
                int temp = i;
                for (int j = 0; j < 8; ++j)
                {
                    if ((temp & 0x80) != 0)
                    {
                        temp = (temp << 1) ^ poly;
                    }
                    else
                    {
                        temp <<= 1;
                    }
                }
                table[i] = (byte)temp;
            }
        }
    }
}
Posted 25-Nov-12 0:31am
Je7511
Edited 25-Nov-12 0:32am
v2

1 solution

Rate this: bad
good
Please Sign up or sign in to vote.

Solution 1

You could look at a couple of approaches;
 
But first, what are you comparing? You are rolling your own CRCs, it maybe better to used some of the optimised .Net methods for building the CRCs or use larger hashes to reduce collission risk.
 
You could use a multithreaded approach, first build and index of files you are comparing, then get worker threads to then parrallel hash the files etc. the limiting factor is likely to be disk throughput, so you could keep increasing the thread count until just before you bottlekneck on the HDD subsystem.
 
Multithreading doesn't always help, so it would be beneficial to benchmark different approachs;
e.g. 1 thread - 1 file at a time
e.g. multiple threads - multiple files in parrallel
e.g. your own crc vs .Net classes (probably more optimised)
  Permalink  
Comments
Je7 at 25-Nov-12 6:24am
   
I just need to compare files(their content) in all directories. In my solution i take onle 3 bytes from file. In the beginning, in the half of file and in the end. So, how i understand you: create some hash array from that files(using multithreading) and than compare hashes?
DaveAuld at 25-Nov-12 6:33am
   
Have a look at this: http://www.codeproject.com/Articles/28512/Duplicate-Files-Finder it is doing a directory trawl and file comparison using MD5 hash. source is available.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



Advertise | Privacy | Mobile
Web04 | 2.8.141220.1 | Last Updated 25 Nov 2012
Copyright © CodeProject, 1999-2014
All Rights Reserved. Terms of Service
Layout: fixed | fluid

CodeProject, 503-250 Ferrand Drive Toronto Ontario, M3C 3G8 Canada +1 416-849-8900 x 100