In C#, what is the fastest way to detect duplicate characters in a String and remove them (removal including 1st instance of the duplicated character)?
Example Input: nbHHkRvrXbvkn
Example Output: RrX
In C#, what is the fastest way to detect duplicate characters in a String and remove them (removal including 1st instance of the duplicated character)?
Example Input: nbHHkRvrXbvkn
Example Output: RrX
Fastest as in fewest-lines-of-code:
var s = "nbHHkRvrXbvkn";
var duplicates = s.Where(ch => s.Count(c => c == ch) > 1);
var result = new string(s.Except(duplicates).ToArray()); // = "RrX"
Fastest as in fastest-performance would probably be something like this (does not preserve order):
var h1 = new HashSet<char>();
var h2 = new HashSet<char>();
foreach (var ch in "nbHHkRvrXbvkn")
{
if (!h1.Add(ch))
{
h2.Add(ch);
}
}
h1.ExceptWith(h2); // remove duplicates
var chars = new char[h1.Count];
h1.CopyTo(chars);
var result = new string(chars); // = "RrX"
Performance test
When in doubt -- test it :)
Yuriy Faktorovich's answer 00:00:00.2360900 Luke's answer 00:00:00.2225683 My 'few lines' answer 00:00:00.5318395 My 'fast' answer 00:00:00.1842144
This algorithm is general, can be applied to any language
Here is a pretty fast one preserving order. But I'd be a little worried about how LINQ does Group and Where:
string s = "nbHHkRvrXbvkn";
Console.WriteLine(
s.ToCharArray()
.GroupBy(c => c)
.Where(g => g.Count() == 1)
.Aggregate(new StringBuilder(), (b, g) => b.Append(g.Key)));
Edit: This one beats Luke's in some cases still slower than dtb's, but it preserves the order
private static string MyMethod(string s)
{
StringBuilder sb = new StringBuilder(s.Length);
foreach (var g in s.ToCharArray().GroupBy(c => c))
if (g.Count() == 1) sb.Append(g.Key);
return sb.ToString();
}
This one should be pretty quick (and it preserves the original order):
public static string RemoveDuplicates(string source)
{
HashSet<char> found = new HashSet<char>();
HashSet<char> dupes = new HashSet<char>();
foreach (char c in source)
{
if (!found.Add(c))
{
dupes.Add(c);
}
}
StringBuilder sb = new StringBuilder(source.Length);
foreach (char c in source)
{
if (!dupes.Contains(c))
{
sb.Append(c);
}
}
return sb.ToString();
}
This preserves order and, based on my tests, is 4 times faster than using a HashSet. This assumes your character range is 0-255 but you can extend that easily. If you're planning on using this in a loop, move the int[] c = new int[255];
out and in the function do an Array.Clear(c,0,255)
.
private static string RemoveDuplicates(string s)
{
int[] c = new int[255];
for (int i = 0; i < s.Length; i++)
{
c[s[i]]++;
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < s.Length; i++)
{
if (c[s[i]] == 1) sb.Append(s[i]);
}
return sb.ToString();
}
Following is strictly O(n) and fastest.
public static string RemoveDuplicates(this string input)
{
var lastIndex = new int[255];
var result = new char[input.Length + 1];
int i = 1;
int pos;
char ch2;
int start = 1;
foreach (char ch in input)
{
pos = lastIndex[ch];
if (pos == 0)
{
lastIndex[ch] = i;
result[i++] = ch;
}
else if (pos > 0)
{
ch2 = result[start];
result[pos] = ch2;
lastIndex[ch2] = pos;
start++;
lastIndex[ch] = -1;
}
}
return new String(result, start, i-start);
}
If you want to take full character set in consideration you can replace lastIndex with Dictionary.
public static string RemoveDuplicates(this string input)
{
var lastIndex = new Dictionary<char, int>(input.Length);
var result = new char[input.Length + 1];
int i = 1;
int pos;
char ch2;
int start = 1;
foreach (char ch in input)
{
lastIndex.TryGetValue(ch, out pos);
if (pos == 0)
{
lastIndex[ch] = i;
result[i++] = ch;
}
else if (pos > 0)
{
ch2 = result[start];
result[pos] = ch2;
lastIndex[ch2] = pos;
start++;
lastIndex[ch] = -1;
}
}
return new String(result, start, i-start);
}
If you are ok with unsafe code and do not want to use Dictionary following is the solution.
[DllImport("Kernel32.dll", EntryPoint = "RtlZeroMemory", SetLastError = false)]
static extern void ZeroMemory(IntPtr dest, IntPtr size);
static int[] lastIndex = new int[Char.MaxValue];
public static string RemoveDuplicates(this string input)
{
unsafe
{
fixed (int* pointer = lastIndex)
ZeroMemory((IntPtr)pointer, (IntPtr)lastIndex.Length);
}
var result = new char[input.Length + 1];
int i = 1;
int pos;
char ch2;
int start = 1;
foreach (char ch in input)
{
pos = lastIndex[ch];
if (pos == 0)
{
lastIndex[ch] = i;
result[i++] = ch;
}
else if (pos > 0)
{
ch2 = result[start];
result[pos] = ch2;
lastIndex[ch2] = pos;
start++;
lastIndex[ch] = -1;
}
}
return new String(result, start, i-start);
}