What Happens during a PLINQ Query?
Introduction
This article assumes that the reader has some experience in working with LINQ. The first section will demonstrate a few examples of some basic LINQ queries. Try and recall the underlying principles. We want to make an inquiry on a sequence of data elements, define which elements will fit our desired criteria, and then output the results of the query. Shown below is a basic LINQ query. We declare and initialize a small sequence to signify the two possible binary bits, 0 and 1. We then query them and perform an operation on each element based on the results of that query:
using System;
using System.Linq;
using System.Collections.Generic;
/// <summary>
/// Prints every 4-bit binary string, from 0000 up to 1111, one per line.
/// </summary>
public class Program
{
    /// <summary>
    /// Builds the cross product of the bit set with itself four times and
    /// writes each combination to the console.
    /// </summary>
    public static void Main()
    {
        int[] bits = { 0, 1 };

        // Each nested SelectMany plays the role of one "from" clause; the
        // outermost lambda supplies the most significant bit.
        IEnumerable<string> nibbles =
            bits.SelectMany(b4 =>
                bits.SelectMany(b3 =>
                    bits.SelectMany(b2 =>
                        bits.Select(b1 =>
                            String.Format("{0}{1}{2}{3}", b4, b3, b2, b1)))));

        foreach (string nibble in nibbles)
        {
            Console.WriteLine(nibble);
        }
    }
}
outputs:
0000
0001
0010
0011
0100
0101
0110
0111
1000
1001
1010
1011
1100
1101
1110
1111
This next example involves writing a simple class called Contact. We get and set its properties and then consume that class in the main body of the code. Notice the generics: in List<Contact>, we have defined the type in order to pass it as a parameter (in almost the same way you pass a parameter to a method):
using System;
using System.Linq;
using System.Collections.Generic;
/// <summary>
/// Lists the contacts that live in California.
/// </summary>
public class Program
{
    /// <summary>
    /// Filters the sample contacts by state, projects each match into a
    /// name/state pair, and prints one line per match.
    /// </summary>
    public static void Main()
    {
        var californians =
            from person in Contact.GetInfo()
            where person.State == "CA"
            select new
            {
                Name = person.FirstName + " " + person.LastName,
                State = person.State
            };

        foreach (var entry in californians)
        {
            Console.WriteLine("{0}, ({1})", entry.Name, entry.State);
        }
    }

    /// <summary>
    /// A person with a first name, a last name, and a state.
    /// </summary>
    public class Contact
    {
        public string FirstName { get; set; }
        public string LastName { get; set; }
        public string State { get; set; }

        /// <summary>
        /// Returns the fixed list of seven sample contacts.
        /// </summary>
        public static List<Contact> GetInfo()
        {
            var people = new List<Contact>();
            people.Add(Create("Bob", "Johnson", "NC"));
            people.Add(Create("Fat", "Albert", "MA"));
            people.Add(Create("John", "Skippy", "CT"));
            people.Add(Create("Joe", "Smith", "NY"));
            people.Add(Create("Sue", "Lane", "CA"));
            people.Add(Create("Mathew", "Blue", "WA"));
            people.Add(Create("Jake", "Ruth", "CA"));
            return people;
        }

        // Builds one contact from its three fields.
        private static Contact Create(string firstName, string lastName, string state)
        {
            var contact = new Contact();
            contact.FirstName = firstName;
            contact.LastName = lastName;
            contact.State = state;
            return contact;
        }
    }
}
Outputs:
Sue Lane, (CA)
Jake Ruth, (CA)
Now let’s try to group data by recording the call activity of those contacts. This requires writing another class that will sit alongside the previous class. After defining those types, we will consume them by writing code that groups this data after filtering the query. Again we will use the List class of the System.Collections.Generic namespace:
using System;
using System.Linq;
using System.Collections.Generic;
/// <summary>
/// Prints, for every contact, the calls made to or from the contact's phone
/// number, grouped first by year and then by month name.
/// </summary>
public class Program
{
    /// <summary>
    /// Joins the sample contacts to the sample call log on phone number,
    /// groups the matching calls by year and month, and writes the result
    /// to the console as an indented report.
    /// </summary>
    public static void Main()
    {
        List<Contact> contacts = Contact.GetInfo();
        List<CallLog> callLog = CallLog.GetInfo();

        var q = from contact in contacts
                select new
                {
                    Name = contact.FirstName + " " +
                           contact.LastName,
                    // Contacts with no Phone (null) match no call and so get
                    // an empty YearGroup.
                    YearGroup = from call in callLog
                                where call.Number == contact.Phone
                                group call by call.When.Year
                                into groupYear
                                select new
                                {
                                    Year = groupYear.Key,
                                    // The month name comes from the current
                                    // culture's "MMMM" format.
                                    MonthGroup =
                                        from c in groupYear
                                        group c by c.When.ToString("MMMM")
                                }
                };

        foreach (var con in q)
        {
            Console.WriteLine("Customer: {0}", con.Name);
            foreach (var year in con.YearGroup)
            {
                Console.WriteLine(" Year:{0}", year.Year);
                foreach (var month in year.MonthGroup)
                {
                    Console.WriteLine(" Month:{0}", month.Key);
                    foreach (var call in month)
                    {
                        Console.WriteLine(" {0} - for {1} minutes",
                            call.When, call.Duration);
                    }
                }
            }
        }
    }

    /// <summary>
    /// A person in the address book. Phone is the value that call log
    /// entries are matched against.
    /// </summary>
    public class Contact
    {
        public string FirstName { get; set; }
        public string LastName { get; set; }
        public string Phone { get; set; }
        public string Extension { get; set; }
        public DateTime DateOfBirth { get; set; }
        public string State { get; set; }

        /// <summary>
        /// Returns the fixed list of sample contacts. Some entries leave
        /// Phone and DateOfBirth unset.
        /// </summary>
        public static List<Contact> GetInfo()
        {
            return new List<Contact> {
                new Contact {FirstName = "Billy", LastName = "Willy", DateOfBirth = new DateTime(1945,10,19), Phone = "885 983 8858", State = "CA" },
                new Contact {FirstName = "Maria", LastName = "Silly", State = "CA" },
                new Contact {FirstName = "Bernadette", LastName = "Gottshall", State = "WA" },
                new Contact {FirstName = "Armando", LastName = "Valdes", DateOfBirth = new DateTime(1973,12,09), Phone = "848 553 8487", State = "WA" },
                new Contact {FirstName = "Adam", LastName = "Gauwain", DateOfBirth = new DateTime(1959,10,03), Phone = "115 999 1154", State = "AK" },
                new Contact {FirstName = "Chris", LastName = "Gauwain", State = "AK" },
                new Contact {FirstName = "Anthony", LastName = "Gauwain", State = "CA" },
                new Contact {FirstName = "Jeffery", LastName = "Deane", DateOfBirth = new DateTime(1950,12,16), Phone = "677 602 6774", State = "CA" },
                new Contact {FirstName = "Collin", LastName = "Zeeman", DateOfBirth = new DateTime(1935,02,10), Phone = "603 303 6030", State = "FL" },
                new Contact {FirstName = "Stewart", LastName = "Kagel", DateOfBirth = new DateTime(1950,02,20), Phone = "546 607 5462", State = "WA" },
                new Contact {FirstName = "Chance", LastName = "Lard", DateOfBirth = new DateTime(1951,10,21), Phone = "278 918 2789", State = "WA" },
                new Contact {FirstName = "Blaine", LastName = "Reifsteck", DateOfBirth = new DateTime(1946,05,18), Phone = "715 920 7157", Extension = "", State = "TX" },
                new Contact {FirstName = "Mack", LastName = "Kamph", DateOfBirth = new DateTime(1977,09,17), Phone = "364 202 3644", State = "TX" },
                new Contact {FirstName = "Ariel", LastName = "Hazelgrove", DateOfBirth = new DateTime(1922,05,23), Phone = "165 737 1656", State = "OR" }
            };
        }
    }

    /// <summary>
    /// One phone call: the number involved, its direction, when it happened,
    /// and its duration (printed by Main as minutes).
    /// </summary>
    public class CallLog
    {
        public string Number { get; set; }
        public int Duration { get; set; }
        public bool Incoming { get; set; }
        public DateTime When { get; set; }
        public string Extension { get; set; }

        /// <summary>
        /// Returns the fixed list of sample call log entries.
        /// </summary>
        public static List<CallLog> GetInfo()
        {
            return new List<CallLog> {
                new CallLog { Number = "885 983 8858", Extension = "", Duration = 2, Incoming = true, When = new DateTime(2006, 8, 7, 8, 12, 0)},
                new CallLog { Number = "165 737 1656", Extension = "", Duration = 15, Incoming = true, When = new DateTime(2006, 8, 7, 9, 23, 0) },
                new CallLog { Number = "364 202 3644", Extension = "", Duration = 1, Incoming = false, When = new DateTime(2006, 8, 7, 10, 5, 0) },
                new CallLog { Number = "603 303 6030", Extension = "", Duration = 2, Incoming = false, When = new DateTime(2006, 8, 7, 10, 35, 0) },
                new CallLog { Number = "546 607 5462", Extension = "", Duration = 4, Incoming = true, When = new DateTime(2006, 8, 7, 11, 15, 0) },
                new CallLog { Number = "885 983 8858", Extension = "", Duration = 15, Incoming = false, When = new DateTime(2006, 8, 7, 13, 12, 0) },
                new CallLog { Number = "885 983 8858", Extension = "", Duration = 3, Incoming = true, When = new DateTime(2006, 8, 7, 13, 47, 0) },
                new CallLog { Number = "546 607 5462", Extension = "", Duration = 1, Incoming = false, When = new DateTime(2006, 8, 7, 20, 34, 0) },
                new CallLog { Number = "546 607 5462", Extension = "", Duration = 3, Incoming = false, When = new DateTime(2006, 8, 8, 10, 10, 0) },
                new CallLog { Number = "603 303 6030", Extension = "", Duration = 23, Incoming = false, When = new DateTime(2006, 8, 8, 10, 40, 0) },
                new CallLog { Number = "848 553 8487", Extension = "", Duration = 3, Incoming = false, When = new DateTime(2006, 8, 8, 14, 0, 0) },
                new CallLog { Number = "848 553 8487", Extension = "", Duration = 7, Incoming = true, When = new DateTime(2006, 8, 8, 14, 37, 0) },
                new CallLog { Number = "278 918 2789", Extension = "", Duration = 6, Incoming = true, When = new DateTime(2006, 8, 8, 15, 23, 0) },
                new CallLog { Number = "364 202 3644", Extension = "", Duration = 20, Incoming = true, When = new DateTime(2006, 8, 8, 17, 12, 0) },
                new CallLog { Number = "885 983 8858", Extension = "", Duration = 5, Incoming = true, When = new DateTime(2006, 7, 12, 8, 12, 0)},
                new CallLog { Number = "165 737 1656", Extension = "", Duration = 12, Incoming = true, When = new DateTime(2006, 6, 14, 9, 23, 0) },
                new CallLog { Number = "364 202 3644", Extension = "", Duration = 10, Incoming = false, When = new DateTime(2006, 7, 9, 10, 5, 0) },
                new CallLog { Number = "603 303 6030", Extension = "", Duration = 22, Incoming = false, When = new DateTime(2006, 7, 5, 10, 35, 0) },
                new CallLog { Number = "546 607 5462", Extension = "", Duration = 9, Incoming = true, When = new DateTime(2006, 6, 7, 11, 15, 0) },
                new CallLog { Number = "885 983 8858", Extension = "", Duration = 10, Incoming = false, When = new DateTime(2006, 6, 7, 13, 12, 0) },
                new CallLog { Number = "885 983 8858", Extension = "", Duration = 21, Incoming = true, When = new DateTime(2006, 7, 7, 13, 47, 0) },
                new CallLog { Number = "546 607 5462", Extension = "",Duration = 7, Incoming = false, When = new DateTime(2006, 7, 7, 20, 34, 0) },
                new CallLog { Number = "546 607 5462", Extension = "", Duration = 2, Incoming = false, When = new DateTime(2006, 6, 8, 10, 10, 0) },
                new CallLog { Number = "603 303 6030", Extension = "", Duration = 3, Incoming = false, When = new DateTime(2006, 6, 8, 10, 40, 0) },
                new CallLog { Number = "848 553 8487", Extension = "", Duration = 32, Incoming = false, When = new DateTime(2006, 7, 8, 14, 0, 0) },
                new CallLog { Number = "848 553 8487", Extension = "", Duration = 13, Incoming = true, When = new DateTime(2006, 7, 8, 14, 37, 0) },
                new CallLog { Number = "278 918 2789", Extension = "", Duration = 16, Incoming = true, When = new DateTime(2006, 5, 8, 15, 23, 0) },
                new CallLog { Number = "364 202 3644", Extension = "", Duration = 24, Incoming = true, When = new DateTime(2006, 6, 8, 17, 12, 0) }
            };
        }
    }
}
Outputs:
Customer: Billy Willy
Year:2006
Month:August
8/7/2006 8:12:00 AM - for 2 minutes
8/7/2006 1:12:00 PM - for 15 minutes
8/7/2006 1:47:00 PM - for 3 minutes
Month:July
7/12/2006 8:12:00 AM - for 5 minutes
7/7/2006 1:47:00 PM - for 21 minutes
Month:June
6/7/2006 1:12:00 PM - for 10 minutes
Customer: Maria Silly
Customer: Bernadette Gottshall
Customer: Armando Valdes
Year:2006
Month:August
8/8/2006 2:00:00 PM - for 3 minutes
8/8/2006 2:37:00 PM - for 7 minutes
Month:July
7/8/2006 2:00:00 PM - for 32 minutes
7/8/2006 2:37:00 PM - for 13 minutes
Customer: Adam Gauwain
Customer: Chris Gauwain
Customer: Anthony Gauwain
Customer: Jeffery Deane
Customer: Collin Zeeman
Year:2006
Month:August
8/7/2006 10:35:00 AM - for 2 minutes
8/8/2006 10:40:00 AM - for 23 minutes
Month:July
7/5/2006 10:35:00 AM - for 22 minutes
Month:June
6/8/2006 10:40:00 AM - for 3 minutes
Customer: Stewart Kagel
Year:2006
Month:August
8/7/2006 11:15:00 AM - for 4 minutes
8/7/2006 8:34:00 PM - for 1 minutes
8/8/2006 10:10:00 AM - for 3 minutes
Month:June
6/7/2006 11:15:00 AM - for 9 minutes
6/8/2006 10:10:00 AM - for 2 minutes
Month:July
7/7/2006 8:34:00 PM - for 7 minutes
Customer: Chance Lard
Year:2006
Month:August
8/8/2006 3:23:00 PM - for 6 minutes
Month:May
5/8/2006 3:23:00 PM - for 16 minutes
Customer: Blaine Reifsteck
Customer: Mack Kamph
Year:2006
Month:August
8/7/2006 10:05:00 AM - for 1 minutes
8/8/2006 5:12:00 PM - for 20 minutes
Month:July
7/9/2006 10:05:00 AM - for 10 minutes
Month:June
6/8/2006 5:12:00 PM - for 24 minutes
Customer: Ariel Hazelgrove
Year:2006
Month:August
8/7/2006 9:23:00 AM - for 15 minutes
Month:June
6/14/2006 9:23:00 AM - for 12 minutes
PLINQ: What Happens during a Parallel LINQ Query?
You should never parallelize small queries, and speeding up a query is not always as simple as adding .AsParallel. The first step when a LINQ query is passed to the Parallel LINQ execution engine is query analysis. The query pattern is examined to determine whether it is a candidate for parallelization, that is, whether running it in parallel is likely to be an improvement with respect to time and memory resources. If it isn’t, then the query will be executed sequentially. The next step is to partition the data. Parallel LINQ has a number of data partitioning schemes it uses to segment the data for different worker threads. For example, if an input source has one hundred elements, when this query is run on a dual-core machine, the Parallel LINQ execution engine might break the data into two partitions of 50 elements for parallel execution. This is a simplistic case, however, and probably not used in practice. The built-in partition schemes at the time of writing are called chunk, range, striped, and hash. Each of these data partitioning schemes suits a certain set of source data types and operator scenarios; the parallel runtime chooses the appropriate one and partitions the data for execution when a worker thread asks for more work. Once data is partitioned, it is executed in parallel using the underlying Parallel Framework extensions. After each thread has completed, its results must be merged into the final result sequence (or value). PLINQ allows some control over when results are made available: either immediately after a partition is processed, or buffered until all partitions have completed. This control is provided by the WithMergeOptions extension method, which takes a ParallelMergeOptions value of NotBuffered, AutoBuffered (the default), or FullyBuffered.
To appreciate the benefit of PLINQ queries, the following example shows the results and performance of a LINQ query before and after parallelization. Geonames.org is an online geographic database of over eight million place names. The files are available for download under a Creative Commons license, and the website contains regional subsets of data and also a complete file of all place names (780MB when I downloaded it for this example – that is a large text file!).
The files are in a specific tab-separated text structure, with over 15 columns of data, including elevation, name, and a two-letter country code. The example query will simply iterate this file and return a sorted list of all entries with an elevation greater than 8,000 meters. The exact download link is http://download.geonames.org/export/dump/allCountries.zip. Unzip the file to your Desktop, and then copy and paste the file to, say, your C: directory. We will perform both a sequential and a PLINQ query on this file. It is a large text file, not a database (meaning we will need no connection string).
Examine the code below. Note that .NET 4 introduced new methods on the File and Directory classes. The older File.ReadAllLines returns a string[] of all lines in a file, so the whole file must be loaded before you can use any of it. The new File.ReadLines method returns an IEnumerable<string>, which allows you to iterate a file one line at a time.
using System;
using System.Linq;
using System.IO;
using System.Diagnostics;
using System.Text;
/// <summary>
/// Sequential LINQ baseline: scans the GeoNames dump and prints every place
/// whose elevation exceeds 8,000 m, highest first, followed by the elapsed time.
/// </summary>
public class Program {
    // Zero-based positions of the columns read from each tab-separated row.
    private const int NameColumn = 1;
    private const int CountryColumn = 8;
    private const int ElevationColumn = 15;

    /// <summary>
    /// Reads c:\AllCountries.txt line by line, filters and sorts it with a
    /// sequential LINQ query, and writes the matches and the timing to the console.
    /// </summary>
    public static void Main() {
        Stopwatch watch = Stopwatch.StartNew();

        // An absolute path is used as-is; combining it with the current
        // directory would have no effect.
        var lines = File.ReadLines(@"c:\AllCountries.txt");

        var q = from line in lines
                let fields = line.Split('\t')
                let elevation = ParseElevation(fields)
                where elevation > 8000
                orderby elevation descending
                select new {
                    // Safe to index: ParseElevation yields 0 for rows that
                    // are too short to reach the elevation column.
                    name = fields[NameColumn],
                    elevation = elevation,
                    country = fields[CountryColumn]
                };

        foreach (var x in q) {
            Console.WriteLine("{0} ({1}m) - located in {2}",
                x.name, x.elevation, x.country);
        }

        Console.WriteLine("Elapsed time: {0}ms",
            watch.ElapsedMilliseconds);
    }

    /// <summary>
    /// Returns the elevation stored in a row, or 0 when the row is too
    /// short, the column is empty, or the value is not a valid integer.
    /// </summary>
    /// <param name="fields">The tab-separated columns of one row.</param>
    private static int ParseElevation(string[] fields) {
        if (fields.Length <= ElevationColumn) {
            return 0;
        }

        // The file is machine-written, so parse with the invariant culture
        // rather than whatever culture the current thread happens to use.
        int elevation;
        bool parsed = int.TryParse(
            fields[ElevationColumn],
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out elevation);
        return parsed ? elevation : 0;
    }
}
The results for this sequential query are:
Mount Everest (8848m) - located in NP
K2 (8611m) - located in PK
Señal Salumpite (8600m) - located in PE
Kanchenjunga (8586m) - located in NP
Lo-tzu Feng (8516m) - located in NP
Makalu (8463m) - located in NP
Qowowuyag (8188m) - located in CN
Dhaulagiri (8167m) - located in NP
Manaslu (8163m) - located in NP
Nanga Parbat (8125m) - located in PK
Annapurna Himal (8091m) - located in NP
Annapurna I (8091m) - located in NP
Gasherbrum Shan (8080m) - located in PK
Broad Feng (8051m) - located in PK
Gasherbrum II Feng (8034m) - located in PK
Xixabangma Feng (8027m) - located in CN
Broad Peak Central (8011m) - located in PK
Elapsed time: 26005ms
Here is a Parallel LINQ query of the same:
using System;
using System.Linq;
using System.IO;
using System.Diagnostics;
using System.Text;
/// <summary>
/// PLINQ version of the elevation scan: reads the GeoNames dump and prints
/// every place whose elevation exceeds 8,000 m, highest first, followed by
/// the elapsed time.
/// </summary>
public class Program {
    // Zero-based positions of the columns read from each tab-separated row.
    private const int NameColumn = 1;
    private const int CountryColumn = 8;
    private const int ElevationColumn = 15;

    /// <summary>
    /// Reads c:\AllCountries.txt line by line, filters and sorts it with a
    /// parallel query, and writes the matches and the timing to the console.
    /// </summary>
    public static void Main() {
        Stopwatch watch = Stopwatch.StartNew();

        // An absolute path is used as-is; combining it with the current
        // directory would have no effect.
        var lines = File.ReadLines(@"c:\AllCountries.txt");

        // AsParallel hands the line sequence to PLINQ, so the split, parse,
        // and filter steps can run on several worker threads.
        var q = from line in lines.AsParallel()
                let fields = line.Split('\t')
                let elevation = ParseElevation(fields)
                where elevation > 8000
                orderby elevation descending
                select new {
                    // Safe to index: ParseElevation yields 0 for rows that
                    // are too short to reach the elevation column.
                    name = fields[NameColumn],
                    elevation = elevation,
                    country = fields[CountryColumn]
                };

        foreach (var x in q) {
            Console.WriteLine("{0} ({1}m) - located in {2}",
                x.name, x.elevation, x.country);
        }

        Console.WriteLine("Elapsed time: {0}ms",
            watch.ElapsedMilliseconds);
    }

    /// <summary>
    /// Returns the elevation stored in a row, or 0 when the row is too
    /// short, the column is empty, or the value is not a valid integer.
    /// Touches no shared state, so it can be called from any worker thread.
    /// </summary>
    /// <param name="fields">The tab-separated columns of one row.</param>
    private static int ParseElevation(string[] fields) {
        if (fields.Length <= ElevationColumn) {
            return 0;
        }

        // The file is machine-written, so parse with the invariant culture
        // rather than whatever culture the current thread happens to use.
        int elevation;
        bool parsed = int.TryParse(
            fields[ElevationColumn],
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out elevation);
        return parsed ? elevation : 0;
    }
}
The results of the PLINQ query are the same, but the elapsed time when I ran this program was 16328 ms, nearly 10 seconds quicker. Now notice this next program. It makes use of another text file, FeatureCodes.txt that I downloaded and copied into my C: directory. This file is available on the web site mentioned above:
using System;
using System.Linq;
using System.IO;
using System.Diagnostics;
using System.Text;
/// <summary>
/// PLINQ scan of the GeoNames dump that prints every place above 6,000 m,
/// highest first, together with the description of its feature code taken
/// from a second file, followed by the elapsed time.
/// </summary>
public class Program {
    // Zero-based positions of the columns read from each tab-separated row.
    private const int NameColumn = 1;
    private const int FeatureClassColumn = 6;
    private const int FeatureCodeColumn = 7;
    private const int CountryColumn = 8;
    private const int ElevationColumn = 15;

    /// <summary>
    /// Loads the feature-code descriptions from C:\FeatureCodes.txt, then
    /// filters and sorts c:\AllCountries.txt with a parallel query and writes
    /// the matches and the timing to the console.
    /// </summary>
    public static void Main() {
        Stopwatch watch = Stopwatch.StartNew();

        // Absolute paths are used as-is; combining them with the current
        // directory would have no effect.
        var descriptions = LoadFeatureDescriptions(@"C:\FeatureCodes.txt");
        var lines = File.ReadLines(@"c:\AllCountries.txt");

        // The dictionary is filled before the query starts and only read
        // afterwards, so the worker threads can share it.
        var q = from line in lines.AsParallel()
                let fields = line.Split('\t')
                let elevation = ParseElevation(fields)
                where elevation > 6000
                orderby elevation descending
                select new {
                    name = fields[NameColumn],
                    elevation = elevation,
                    country = fields[CountryColumn],
                    description = DescribeFeature(descriptions, fields)
                };

        foreach (var x in q) {
            Console.WriteLine("{0} ({1}m) - A {2} in {3}",
                x.name,
                x.elevation,
                x.description,
                x.country);
        }

        Console.WriteLine();
        Console.WriteLine("Elapsed time: {0}ms", watch.ElapsedMilliseconds);
    }

    /// <summary>
    /// Reads the feature-code file into a lookup keyed by class and code.
    /// The first column is read as a one-character class, a separator, and
    /// the code; the second column is the description. Lines that do not have
    /// that shape are skipped, and the first description seen for a key wins.
    /// </summary>
    /// <param name="path">Path of the tab-separated feature-code file.</param>
    private static System.Collections.Generic.Dictionary<string, string> LoadFeatureDescriptions(string path) {
        var descriptions = new System.Collections.Generic.Dictionary<string, string>();

        // The table is read sequentially: it is built once, before the
        // large query runs.
        foreach (string line in File.ReadLines(path)) {
            string[] columns = line.Split('\t');
            if (columns.Length < 2 || columns[0].Length < 2) {
                continue;
            }

            string key = MakeKey(columns[0][0], columns[0].Remove(0, 2));
            if (!descriptions.ContainsKey(key)) {
                descriptions.Add(key, columns[1]);
            }
        }

        return descriptions;
    }

    /// <summary>
    /// Returns the description for a row's feature class and code, or an
    /// empty string when the class is blank or the pair is not in the table.
    /// </summary>
    /// <param name="descriptions">Lookup built by LoadFeatureDescriptions.</param>
    /// <param name="fields">The tab-separated columns of one row.</param>
    private static string DescribeFeature(
        System.Collections.Generic.Dictionary<string, string> descriptions,
        string[] fields) {
        string featureClass = fields[FeatureClassColumn];
        if (featureClass.Length == 0) {
            return "";
        }

        string description;
        string key = MakeKey(featureClass[0], fields[FeatureCodeColumn]);
        return descriptions.TryGetValue(key, out description) ? description : "";
    }

    // Combines a class character and a code into one dictionary key. The
    // class is always a single character, so distinct pairs never collide.
    private static string MakeKey(char featureClass, string featureCode) {
        return featureClass + "." + featureCode;
    }

    /// <summary>
    /// Returns the elevation stored in a row, or 0 when the row is too
    /// short, the column is empty, or the value is not a valid integer.
    /// </summary>
    /// <param name="fields">The tab-separated columns of one row.</param>
    private static int ParseElevation(string[] fields) {
        if (fields.Length <= ElevationColumn) {
            return 0;
        }

        // The file is machine-written, so parse with the invariant culture
        // rather than whatever culture the current thread happens to use.
        int elevation;
        bool parsed = int.TryParse(
            fields[ElevationColumn],
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out elevation);
        return parsed ? elevation : 0;
    }
}
Compiling and executing this code yields the following:
Mount Everest (8848m) - A mountain in NP
K2 (8611m) - A mountain in PK
Señal Salumpite (8600m) - A triangulation station in PE
Kanchenjunga (8586m) - A mountain in NP
Lo-tzu Feng (8516m) - A peak in NP
Makalu (8463m) - A mountain in NP
Qowowuyag (8188m) - A mountain in CN
Dhaulagiri (8167m) - A peak in NP
Manaslu (8163m) - A mountain in NP
Nanga Parbat (8125m) - A peak in PK
Annapurna Himal (8091m) - A area in NP
Annapurna I (8091m) - A mountain in NP
Gasherbrum Shan (8080m) - A mountain in PK
Broad Feng (8051m) - A mountain in PK
Gasherbrum II Feng (8034m) - A mountain in PK
Xixabangma Feng (8027m) - A mountain in CN
Broad Peak Central (8011m) - A mountain in PK
Gyachung Kang (7952m) - A mountain in CN
Gasherbrum III Feng (7952m) - A mountain in PK
Annapurna II (7937m) - A mountain in NP
Gasherbrum IV Feng (7925m) - A mountain in PK
Ngozumpa Kang I (7916m) - A mountain in NP
Himal Chuli (7893m) - A peak in NP
. . . .. and on . . . . . .
Carter Horn (6031m) - A cape in US
Cerro Ciénaga Grande (6030m) - A mountain in AR
Nevado Hualca Hualca (6025m) - A mountain in PE
Nevado Rasac (6017m) - A mountain in PE
Cerro de San Francisco (6016m) - A mountain in CL
Mitre (6013m) - A peak in PK
Dhampus Peak (6012m) - A mountain in NP
Cerro Uturunco (6008m) - A mountain in BO
Koh-e Sar-e Tund (6001m) - A mountain in AF
Elapsed time: 17570ms
What if we wanted to examine an assembly to query any obsolete methods? Examine this code:
using System;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Threading;
using System.Threading.Tasks;
/// <summary>
/// Reports the public methods marked obsolete in the assembly that defines
/// System.Object.
/// </summary>
public static class Program {
    /// <summary>Entry point; delegates to Go.</summary>
    public static void Main() {
        Go();
    }

    /// <summary>
    /// Runs the obsolete-method report against the assembly that contains
    /// System.Object.
    /// </summary>
    public static void Go() {
        Assembly coreAssembly = typeof(Object).Assembly;
        ObsoleteMethods(coreAssembly);
    }

    /// <summary>
    /// Writes one "Type / Method / Message" entry to the console for every
    /// public instance or static method on an exported type of the assembly
    /// that carries ObsoleteAttribute, ordered by the type's full name.
    /// </summary>
    /// <param name="assembly">The assembly whose exported types are scanned.</param>
    private static void ObsoleteMethods(Assembly assembly) {
        Type marker = typeof(ObsoleteAttribute);
        const BindingFlags publicMembers =
            BindingFlags.Public | BindingFlags.Instance | BindingFlags.Static;

        ParallelQuery<string> report = assembly.GetExportedTypes()
            .AsParallel()
            // Pair every exported type with each of its public methods.
            .SelectMany(
                owner => owner.GetMethods(publicMembers),
                (owner, member) => new { owner, member })
            .Where(pair => Attribute.IsDefined(pair.member, marker))
            .OrderBy(pair => pair.owner.FullName)
            // The attribute is fetched only for methods that passed the
            // filter, so the cast always has an instance to work on.
            .Select(pair => String.Format(
                "Type={0}\nMethod={1}\nMessage={2}\n",
                pair.owner.FullName,
                pair.member.ToString(),
                ((ObsoleteAttribute)Attribute.GetCustomAttribute(pair.member, marker)).Message));

        foreach (string entry in report) {
            Console.WriteLine(entry);
        }
    }
}
Compiling and executing this code results in an extensive list of obsolete methods, each shown with its declaring type and its deprecation message.
References
- CLR via C#, 3rd Edition, Jeffrey Richter
- Linq to Objects using C# 4.0, Troy Magennis