You have probably got your answers already, but I might have found an even faster method. I have to admit that I had only 18 MB of test data with around 225k lines. Still, it might be worth giving it a try. I made a small test comparing File.ReadLines with my MemoryMappedFile-based approach.
using System;
using System.IO;
using System.IO.MemoryMappedFiles;
using System.Collections.Generic;
using System.Text;
namespace MMText
{
/// <summary>
/// Reads a text file line by line through a read-only memory-mapped view.
/// </summary>
/// <remarks>
/// The view is sized to the exact file length. A view created with the default
/// size is rounded up to the system page size, and the zero padding after the
/// real end of the file would otherwise be decoded as extra text.
/// </remarks>
public class MemoryMappedTextFileReader:IDisposable
{
    // Null when the file is empty: a zero-length file cannot be memory-mapped.
    readonly MemoryMappedFile memoryMappedFile;

    // Length of the file in bytes, captured when the reader is constructed.
    readonly long fileLength;

    /// <summary>
    /// Opens <paramref name="fileName"/> for reading via a memory-mapped file.
    /// </summary>
    /// <param name="fileName">Path of an existing text file.</param>
    /// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null.</exception>
    /// <exception cref="FileNotFoundException">The file does not exist.</exception>
    public MemoryMappedTextFileReader(string fileName)
    {
        fileLength = new FileInfo(fileName).Length;
        if (fileLength > 0) {
            // Read-only access is sufficient here and also works for files
            // the caller is not allowed to write to.
            memoryMappedFile = MemoryMappedFile.CreateFromFile(fileName, FileMode.Open, null, 0, MemoryMappedFileAccess.Read);
        }
    }

    /// <summary>
    /// Lazily enumerates the lines of the file, decoded as UTF-8 unless a byte
    /// order mark indicates another encoding.
    /// </summary>
    /// <returns>The lines of the file in order; no lines for an empty file.</returns>
    /// <exception cref="ObjectDisposedException">
    /// The reader was disposed before enumeration started.
    /// </exception>
    public IEnumerable<string> ReadLines()
    {
        if (disposed) {
            throw new ObjectDisposedException(GetType().Name);
        }
        if (memoryMappedFile == null) {
            // Empty file: nothing was mapped, so there is nothing to read.
            yield break;
        }
        // The explicit size keeps the page-alignment padding out of the stream.
        using (var memoryMappedViewStream = memoryMappedFile.CreateViewStream(0, fileLength, MemoryMappedFileAccess.Read))
        using (StreamReader sr = new StreamReader(memoryMappedViewStream, Encoding.UTF8, true, 4096))
        {
            string line;
            while ((line = sr.ReadLine()) != null) {
                yield return line;
            }
        }
    }

    #region IDisposable implementation
    bool disposed = false;

    /// <summary>Releases the memory-mapped file held by this reader.</summary>
    public void Dispose()
    {
        Dispose(true);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Releases the mapping when <paramref name="disposing"/> is true.
    /// Calling it more than once has no further effect.
    /// </summary>
    /// <param name="disposing">True when called from <see cref="Dispose()"/>.</param>
    protected virtual void Dispose(bool disposing)
    {
        if (disposed)
            return;
        if (disposing && memoryMappedFile != null) {
            memoryMappedFile.Dispose();
        }
        disposed = true;
    }
    #endregion
}
}
And the test:
using System;
using System.IO;
using System.Diagnostics;
namespace MMText
{
/// <summary>
/// Console benchmark: counts the lines of one log file twice, first with
/// File.ReadLines and then with MemoryMappedTextFileReader, and prints the
/// elapsed time and the average time per line for each pass.
/// </summary>
class Program
{
    public static void Main(string[] args)
    {
        const string fileName = @"D:\TEMP\setupapi.dev.20140929_185959.log";

        // Pass 1: the framework's own lazy line reader.
        long count = 0;
        Stopwatch timer = Stopwatch.StartNew();
        foreach (string unused in File.ReadLines(fileName))
        {
            count++;
        }
        timer.Stop();
        Report("ReadLines - Reading {0} lines took: {1}. Average: {2} ms/line", count, timer);

        // Pass 2: the memory-mapped reader; creating and disposing it is part of the timed region.
        count = 0;
        timer = Stopwatch.StartNew();
        using (var reader = new MemoryMappedTextFileReader(fileName))
        {
            foreach (string unused in reader.ReadLines())
            {
                count++;
            }
        }
        timer.Stop();
        Report("MMF - Reading {0} lines took: {1}. Average: {2} ms/line", count, timer);

        Console.Write("Press any key to continue . . . ");
        Console.ReadKey(true);
    }

    // Prints one result line: {0} = line count, {1} = hh:mm:ss.cc, {2} = average ms per line.
    static void Report(string format, long lineCount, Stopwatch timer)
    {
        TimeSpan elapsed = timer.Elapsed;
        // Milliseconds / 10 turns the millisecond part into hundredths of a second.
        string clock = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", elapsed.Hours, elapsed.Minutes, elapsed.Seconds, elapsed.Milliseconds / 10);
        float msPerLine = 1.0f * timer.ElapsedMilliseconds / lineCount;
        Console.WriteLine(format, lineCount, clock, msPerLine);
    }
}
}
Here are the results:
ReadLines - Reading 225661 lines took: 00:00:00.35. Average: 0,001564293 ms/line
MMF - Reading 225662 lines took: 00:00:00.29. Average: 0,001320559 ms/line
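The timings may differ from run to run, but the ratio stays about the same. You might have noticed the difference of one line. Interesting. Opening the file with FAR Manager's editor shows 225662 lines, so I don't know what ReadLines is missing there. (One likely explanation, not verified here: a view created with CreateViewStream() is rounded up to the system page size, so the zero padding after the real end of the file can be read as one extra line by the MMF reader.)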
Still, one has to be careful with MMF. If you take this path, you should also read this:
http://blogs.msdn.com/b/bclteam/archive/2011/06/06/memory-mapped-file-quirks.aspx
[Update: added memory usage tests]
I have updated the test application like this:
/// <summary>
/// Benchmark entry point: counts the lines of one log file with File.ReadLines
/// and with MemoryMappedTextFileReader, printing for each pass the elapsed time,
/// the average time per line, and the growth of the AppDomain's
/// MonitoringTotalAllocatedMemorySize counter during the pass.
/// </summary>
public static void Main(string[] args)
{
    // Switched on before the AppDomain allocation counters below are read.
    AppDomain.MonitoringIsEnabled = true;

    const string fileName = @"D:\TEMP\setupapi.dev.20140929_185959.log";
    long count = 0;

    // Pass 1: File.ReadLines.
    Stopwatch timer = Stopwatch.StartNew();
    long readLinesAllocBefore = AppDomain.CurrentDomain.MonitoringTotalAllocatedMemorySize;
    foreach (string unused in File.ReadLines(fileName))
    {
        count++;
    }
    long readLinesAllocAfter = AppDomain.CurrentDomain.MonitoringTotalAllocatedMemorySize;
    timer.Stop();

    TimeSpan elapsed = timer.Elapsed;
    // Milliseconds / 10 turns the millisecond part into hundredths of a second.
    string clock = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", elapsed.Hours, elapsed.Minutes, elapsed.Seconds, elapsed.Milliseconds / 10);
    float msPerLine = 1.0f * timer.ElapsedMilliseconds / count;
    Console.WriteLine("ReadLines - Reading {0} lines took: {1}. Average: {2} ms/line. Memory usage: {3}", count, clock, msPerLine, readLinesAllocAfter - readLinesAllocBefore);

    // Pass 2: memory-mapped reader; creating and disposing it is part of the measurement.
    count = 0;
    timer = Stopwatch.StartNew();
    long mappedAllocBefore = AppDomain.CurrentDomain.MonitoringTotalAllocatedMemorySize;
    using (var reader = new MemoryMappedTextFileReader(fileName))
    {
        foreach (string unused in reader.ReadLines())
        {
            count++;
        }
    }
    long mappedAllocAfter = AppDomain.CurrentDomain.MonitoringTotalAllocatedMemorySize;
    timer.Stop();

    elapsed = timer.Elapsed;
    clock = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", elapsed.Hours, elapsed.Minutes, elapsed.Seconds, elapsed.Milliseconds / 10);
    msPerLine = 1.0f * timer.ElapsedMilliseconds / count;
    Console.WriteLine("MMF - Reading {0} lines took: {1}. Average: {2} ms/line. Memory usage: {3}", count, clock, msPerLine, mappedAllocAfter - mappedAllocBefore);

    Console.Write("Press any key to continue . . . ");
    Console.ReadKey(true);
}
And here are the results:
ReadLines - Reading 225661 lines took: 00:00:00.36. Average: 0,001613039 ms/line. Memory usage: 41667828
MMF - Reading 225662 lines took: 00:00:00.35. Average: 0,001586443 ms/line. Memory usage: 37764368
As you can see, the MMF approach consumes even less memory.