I have a list of ~20,000 email addresses, some of which I know to be fraudulent attempts to get around a "1 per e-mail" limit, such as [email protected], [email protected], [email protected], etc. I want to find similar email addresses for evaluation. Currently I'm using a Levenshtein algorithm to check each e-mail against the others in the list and report any with an edit distance of less than 2. However, this is painstakingly slow. Is there a more efficient approach?
The test code I'm using now is:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Threading;
namespace LevenshteinAnalyzer
{
class Program
{
const string INPUT_FILE = @"C:\Input.txt";
const string OUTPUT_FILE = @"C:\Output.txt";
static void Main(string[] args)
{
var inputWords = File.ReadAllLines(INPUT_FILE);
var outputWords = new SortedSet<string>();
for (var i = 0; i < inputWords.Length; i++)
{
if (i % 100 == 0)
Console.WriteLine("Processing record #" + i);
var word1 = inputWords[i].ToLower();
for (var n = i + 1; n < inputWords.Length; n++)
{
if (i == n) continue;
var word2 = inputWords[n].ToLower();
if (word1 == word2) continue;
if (outputWords.Contains(word1)) continue;
if (outputWords.Contains(word2)) continue;
var distance = LevenshteinAlgorithm.Compute(word1, word2);
if (distance <= 2)
{
outputWords.Add(word1);
outputWords.Add(word2);
}
}
}
File.WriteAllLines(OUTPUT_FILE, outputWords.ToArray());
Console.WriteLine("Found {0} words", outputWords.Count);
}
}
}
Edit: Some of the stuff I'm trying to catch looks like:
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]