// Sound Fingerprinting framework
// https://code.google.com/p/soundfingerprinting/
// Code license: GNU General Public License v2
// ciumac.sergiu@gmail.com
using System;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using DuplicateTracks.Audio;
using DuplicateTracks.DataAccess;
using DuplicateTracks.Fingerprinting;
using DuplicateTracks.Model;
using Un4seen.Bass.AddOn.Tags;
namespace DuplicateTracks.ViewModel
{
/// <summary>
/// Class which prepares the data for Repository analysis of the tracks (does all the "dirty job")
/// </summary>
public class RepositoryGateway
{
#region Constants
/// <summary>
/// Maximum track length (track's bigger than this value will be discarded)
/// </summary>
private const int MAX_TRACK_LENGTH = 60*10; /*10 min - maximal track length*/
/// <summary>
/// Number of milliseconds to process from each song
/// </summary>
private const int MILLISECONDS_TO_PROCESS = 30*1000;
/// <summary>
/// Starting processing point
/// </summary>
private const int MILLISECONDS_START = 20*1000;
/// <summary>
/// Minimum track length (track's less than this value will be discarded)
/// </summary>
private const int MIN_TRACK_LENGTH = (MILLISECONDS_TO_PROCESS + MILLISECONDS_START)/1000 + 1;
/// <summary>
/// Incremental static stride size (1024 samples from the start)
/// </summary>
private const int STRIDE_SIZE_INCREMENTAL = 1024;
/// <summary>
/// Number of LSH tables
/// </summary>
private const int NUMBER_OF_HASH_TABLES = 25;
/// <summary>
/// Number of Min Hash keys per 1 hash function (1 LSH table)
/// </summary>
private const int NUMBER_OF_KEYS = 4;
/// <summary>
/// Path to permutations (generated using greedy algorithm)
/// </summary>
private const string PATH_TO_PERMUTATIONS = "perms.csv";
/// <summary>
/// Number of threshold votes for a file to be considerate a duplicate
/// </summary>
private const int THRESHOLD_VOTES = 8;
/// <summary>
/// Separator in the .csv files
/// </summary>
private const string SEPARATOR = ",";
/// <summary>
/// Number of samples per fingerprint (8192 correspond to 1.48 sec granularity)
/// </summary>
private const int SAMPLES_IN_FINGERPRINT = 8192;
#endregion
#region Read-only components
/// <summary>
/// Creational stride (used in hashing audio objects)
/// </summary>
private readonly IStride _createStride;
/// <summary>
/// Permutations provider
/// </summary>
private readonly IPermutations _permutations;
/// <summary>
/// Bass proxy used in reading from files
/// </summary>
private readonly BassProxy _proxy;
/// <summary>
/// Query stride (used in querying the database)
/// </summary>
private readonly IStride _queryStride;
/// <summary>
/// Repository for storage, permutations, algorithm
/// </summary>
private readonly Repository _repository;
/// <summary>
/// Storage for hash signatures and tracks
/// </summary>
private readonly IStorage _storage;
#endregion
/// <summary>
/// Flag for abortion
/// </summary>
private bool _aborted;
private bool _isRunning = false;
/// <summary>
/// Repository gateway parameter less constructor
/// </summary>
public RepositoryGateway()
{
_storage = new RamStorage(NUMBER_OF_HASH_TABLES); /*Number of LSH Tables, used for storage purposes*/
_permutations = new LocalPermutations(PATH_TO_PERMUTATIONS, SEPARATOR); /*Permutations*/
_repository = new Repository(_storage, _permutations);
_proxy = new BassProxy(); /*audio proxy used in reading the file*/
_createStride = new IncrementalStaticStride(STRIDE_SIZE_INCREMENTAL, SAMPLES_IN_FINGERPRINT);
_queryStride = new IncrementalRandomStride(STRIDE_SIZE_INCREMENTAL, SAMPLES_IN_FINGERPRINT, SAMPLES_IN_FINGERPRINT);
}
/// <summary>
/// Process the tracks asynchronously (get their path location, fingerprint content, hash fingerprint into storage)
/// </summary>
/// <param name = "paths">Paths to be processed</param>
/// <param name = "fileFilters">File filters used</param>
/// <param name = "callback">Callback invoked once processing ends</param>
/// <param name = "trackProcessed">Callback invoked once 1 track is processed</param>
/// <param name = "fileLookup">File lookup callback, invoked each time a file is found, and tags are read</param>
public void ProcessTracksAsync(IEnumerable<Item> paths, string[] fileFilters,
Action<List<Track>, Exception> callback, Action<Track> trackProcessed, Action<string> fileLookup)
{
List<string> files = new List<string>();
foreach (Item path in paths)
{
if (path.IsFolder)
files.AddRange(Helper.GetMusicFiles(path.Path, fileFilters, true)); //get music file names
else
files.Add(path.Path);
}
int fileCount = files.Count;
int threadCount = 1; //find the appropriate number of threads that will process the data
if (fileCount < 10)
threadCount = 1;
else if (fileCount < 50)
threadCount = 2;
else if (fileCount < 200)
threadCount = 3;
else
threadCount = 4;
int itemsPerThread = files.Count/threadCount;
int rest = files.Count%threadCount;
int threadsProcessed = 0;
List<Track> tracks = new List<Track>();
for (int i = 0; i < threadCount; i++)
{
int start = i*itemsPerThread; //Define start and end indexes
int end = (i == threadCount - 1) ? i*itemsPerThread + itemsPerThread + rest : i*itemsPerThread + itemsPerThread;
List<string> partOfFiles = files.GetRange(start, end - start);
Func<List<string>, Action<Track>, Action<string>, List<Track>> func = ProcessFiles; /*Process files synchronously on each file range*/
Exception exception = null;
func.BeginInvoke(
partOfFiles,
trackProcessed,
fileLookup,
(result) =>
{
try
{
List<Track> p = func.EndInvoke(result);
if (p != null)
tracks.AddRange(p);
}
catch (Exception e)
{
exception = e; /*if an exception occurs it will be forwarded back to main*/
}
// ReSharper disable AccessToModifiedClosure
Interlocked.Increment(ref threadsProcessed);
// ReSharper restore AccessToModifiedClosure
lock (this)
{
if (threadsProcessed == threadCount)
{
if(!_aborted)
callback.Invoke(tracks, exception);
_isRunning = false;
}
}
}, func);
}
}
/// <summary>
/// Process files synchronously (get fingerprint signatures, hash them into storage)
/// </summary>
/// <param name = "files">List of files to be hashed</param>
/// <param name = "processed">Callback invoked once 1 track is processed</param>
/// <param name = "fileLookup">File lookup callback, invoked each time a file is found on the drive, and tags are read</param>
/// <returns>List of processed tracks</returns>
public List<Track> ProcessFiles(List<string> files, Action<Track> processed, Action<string> fileLookup)
{
_isRunning = true;
List<Track> tracks = new List<Track>();
foreach (string file in files) //filter files that can be processed
{
if (_aborted) return null;
Track track = new Track();
TAG_INFO tags = _proxy.GetTagInfoFromFile(file); //get file tags
if (fileLookup != null)
fileLookup.Invoke(file);
string artist, title;
double duration;
if (tags == null)
{
/*The song does not contain any tags*/
artist = "Unknown";
title = "Unknown";
duration = 60;
}
else
{
artist = tags.artist;
title = tags.title;
duration = tags.duration;
}
if (String.IsNullOrEmpty(artist)) /*assign a name to music files that don't have tags*/
artist = "Unknown";
if (String.IsNullOrEmpty(title)) /*assign a title to music files that don't have tags*/
title = "Unknown";
if (duration < MIN_TRACK_LENGTH || duration > MAX_TRACK_LENGTH) /*check the duration of a music file*/
continue;
track.Artist = artist;
track.Title = title;
track.TrackLength = duration;
track.Path = Path.GetFullPath(file);
tracks.Add(track);
}
/*preprocessing stage ended, now make sure to do the actual job*/
_repository.ProcessTracks(tracks, _proxy, _queryStride, _createStride,
MILLISECONDS_TO_PROCESS, MILLISECONDS_START, NUMBER_OF_HASH_TABLES,
NUMBER_OF_KEYS, processed);
return _aborted ? null : tracks;
}
/// <summary>
/// Find duplicate files for specific tracks
/// </summary>
/// <param name = "tracks">Tracks to search in</param>
/// <returns>Set of tracks that are duplicate</returns>
public HashSet<Track>[] FindDuplicates(List<Track> tracks)
{
return _repository.FindDuplicates(tracks, THRESHOLD_VOTES);
}
/// <summary>
/// Find all duplicate files from the storage
/// </summary>
/// <returns>Set of tracks that are duplicate</returns>
public HashSet<Track>[] FindAllDuplicates()
{
return _repository.FindDuplicates(_storage.GetAllTracks(), THRESHOLD_VOTES);
}
/// <summary>
/// Abort processing the files (at any stage)
/// </summary>
public void AbortProcessing()
{
_aborted = true;
_repository.AbortProcessing();
while(_isRunning)
{
Thread.Sleep(10);
}
_repository.ClearStorage();
_aborted = false;
_repository.ResetProcessing();
}
}
}