myTable row count is 150000 rows. I need to compare the values in NAME column. If I detect similarity <= similarity level and if the NUM column values are the same I should log it.
They way I loop through this table takes forever. What other solutions are there?
DataTable dt1 = new DataTable();
dt1.Load(DbInfo.DataRdr(Conn, "SELECT * FROM myTable"));
for (int i = 0; i < dt1.Rows.Count; i++)
{
for (int j = 0; i + 1 < dt1.Rows.Count; j++)
{
if (dt1.Rows[i]["NUM"].ToString() == dt1.Rows[j]["NUM"].ToString())
{
if (dt1.Rows[i]["Name"].ToString().
LevenshteinDistance(dt1.Rows[j]["Name"].ToString()) <= 10)
{
Logging.Write(...);
}
}
}
}
public static int LevenshteinDistance(this string s, string t)
{
if (s == null)
throw new ArgumentNullException("s");
if (t == null)
throw new ArgumentNullException("t");
int n = s.Length;
int m = t.Length;
int[,] d = new int[n+1,m+1];
if (n == 0 || m == 0)
return Math.Max(m, n);
for (int i = 0; i <= n; i++)
{
d[i, 0] = i;
}
for (int i = 0; i < m; i++)
{
d[0, i] = i;
}
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
int cost = (t[j] == s[i]) ? 0 : 1;
d[i + 1, j + 1] = Math.Min(Math.Min(d[i, j + 1] + 1, d[i + 1, j] + 1), d[i, j] + cost);
}
}
return d[n, m];
}myTable row count is 150000 rows.
I need to compare the values in NAME column. If I detect similarity <= similarity level and if the NUM column values are the same I should log it.
They way I loop through this table takes forever. What other solutions are there?
<pre lang="cs">DataTable dt1 = new DataTable();
dt1.Load(DbInfo.DataRdr(Conn, "SELECT * FROM myTable"));
for (int i = 0; i < dt1.Rows.Count; i++)
{
for (int j = 0; j < dt1.Rows.Count; j++)
{
if (dt1.Rows[i]["Name"].ToString().LevenshteinDistance(dt1.Rows[j] ["Name"].ToString()) <= 10)
{
if (dt1.Rows[i]["NUM"].ToString() == dt1.Rows[i]["NUM"].ToString())
{
Logging.Write(...);
}
}
}
}