I have a tool that compares two CSV files and then buckets each cell into one of six buckets. Basically, it reads in the CSV files (using the fast CSV reader, credit: http://www.codeproject.com/KB/database/CsvReader.aspx), creates a dictionary for each file keyed on the columns the user specifies, and then iterates through the dictionaries comparing the values and writing a result CSV file.
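At a high level, the flow looks like this (a simplified sketch; actFile and the method names here are just placeholders for the real code):

// Simplified outline -- ReadCsvIntoDictionary / CompareDictionaries are placeholder names
SortedDictionary<string, string[]> dictExpected = ReadCsvIntoDictionary(expFile, keyColumns);
SortedDictionary<string, string[]> dictActual = ReadCsvIntoDictionary(actFile, keyColumns);

// Walk both dictionaries, bucket every cell into one of the six buckets,
// and stream the classified cells out to the result CSV.
CompareDictionaries(dictExpected, dictActual, resultsFile);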
While it is blazing fast, it is very inefficient in terms of memory usage. I cannot compare files larger than about 150 MB on my box with 3 GB of physical memory.
Here is the code snippet that reads the expected file. At the end of this piece, memory usage is close to 500 MB according to Task Manager.
// Read Expected
long rowNumExp;
System.IO.StreamReader readerStreamExp = new System.IO.StreamReader(@expFile);
SortedDictionary<string, string[]> dictExp = new SortedDictionary<string, string[]>();
List<string[]> listDupExp = new List<string[]>();

using (CsvReader readerCSVExp = new CsvReader(readerStreamExp, hasHeaders, 4096))
{
    readerCSVExp.SkipEmptyLines = false;
    readerCSVExp.DefaultParseErrorAction = ParseErrorAction.ThrowException;
    readerCSVExp.MissingFieldAction = MissingFieldAction.ParseError;
    fieldCountExp = readerCSVExp.FieldCount;

    string keyExp;
    string[] rowExp = null;

    while (readerCSVExp.ReadNextRecord())
    {
        // Row number as it appears in the file (offset by one extra for the header row)
        if (hasHeaders == true)
        {
            rowNumExp = readerCSVExp.CurrentRecordIndex + 2;
        }
        else
        {
            rowNumExp = readerCSVExp.CurrentRecordIndex + 1;
        }

        try
        {
            rowExp = new string[fieldCount + 1];
        }
        catch (Exception exExpOutOfMemory)
        {
            MessageBox.Show(exExpOutOfMemory.Message);
            Environment.Exit(1);
        }

        // Build the composite key from the user-specified key columns
        keyExp = readerCSVExp[keyColumns[0] - 1];
        for (int i = 1; i < keyColumns.Length; i++)
        {
            keyExp = keyExp + "|" + readerCSVExp[keyColumns[i] - 1];
        }

        try
        {
            readerCSVExp.CopyCurrentRecordTo(rowExp);
        }
        catch (Exception exExpCSVOutOfMemory)
        {
            MessageBox.Show(exExpCSVOutOfMemory.Message);
            Environment.Exit(1);
        }

        try
        {
            rowExp[fieldCount] = rowNumExp.ToString();
        }
        catch (Exception exExpRowNumOutOfMemory)
        {
            MessageBox.Show(exExpRowNumOutOfMemory.Message);
            Environment.Exit(1);
        }

        // Dedup Expected
        if (!(dictExp.ContainsKey(keyExp)))
        {
            dictExp.Add(keyExp, rowExp);
        }
        else
        {
            listDupExp.Add(rowExp);
        }
    }

    logFile.WriteLine("Done Reading Expected File at " + DateTime.Now);
    Console.WriteLine("Done Reading Expected File at " + DateTime.Now + "\r\n");
    logFile.WriteLine("Done Creating Expected Dictionary at " + DateTime.Now);
    logFile.WriteLine("Done Identifying Expected Duplicates at " + DateTime.Now + "\r\n");
}
Is there anything I could do to make it more memory efficient? Anything I could do differently above to consume less memory?
Any ideas are welcome.
Thanks guys for all the feedback.
I have incorporated the suggested changes and now store the index of the row in the dictionaries instead of the row itself.
Here is the same code fragment with the new implementation.
// Read Expected
long rowNumExp;
SortedDictionary<string, long> dictExp = new SortedDictionary<string, long>();
System.Text.StringBuilder keyExp = new System.Text.StringBuilder();

while (readerCSVExp.ReadNextRecord())
{
    if (hasHeaders == true)
    {
        rowNumExp = readerCSVExp.CurrentRecordIndex + 2;
    }
    else
    {
        rowNumExp = readerCSVExp.CurrentRecordIndex + 1;
    }

    for (int i = 0; i < keyColumns.Length - 1; i++)
    {
        keyExp.Append(readerCSVExp[keyColumns[i] - 1]);
        keyExp.Append("|");
    }
    keyExp.Append(readerCSVExp[keyColumns[keyColumns.Length - 1] - 1]);

    // Dedup Expected
    if (!(dictExp.ContainsKey(keyExp.ToString())))
    {
        dictExp.Add(keyExp.ToString(), rowNumExp);
    }
    else
    {
        // Process Expected Duplicates
        string dupExp;
        for (int i = 0; i < fieldCount; i++)
        {
            if (i >= fieldCountExp)
            {
                dupExp = null;
            }
            else
            {
                dupExp = readerCSVExp[i];
            }

            foreach (int keyColumn in keyColumns)
            {
                if (i == keyColumn - 1)
                {
                    resultCell = "duplicateEXP: '" + dupExp + "'";
                    resultCell = CreateCSVField(resultCell);
                    resultsFile.Write(resultCell);
                    comSumCol = comSumCol + 1;
                    countDuplicateExp = countDuplicateExp + 1;
                }
                else
                {
                    if (checkPTColumns(i + 1, passthroughColumns) == false)
                    {
                        resultCell = "'" + dupExp + "'";
                        resultCell = CreateCSVField(resultCell);
                        resultsFile.Write(resultCell);
                        countDuplicateExp = countDuplicateExp + 1;
                    }
                    else
                    {
                        resultCell = "PASSTHROUGH duplicateEXP: '" + dupExp + "'";
                        resultCell = CreateCSVField(resultCell);
                        resultsFile.Write(resultCell);
                    }
                    comSumCol = comSumCol + 1;
                }
            }

            if (comSumCol <= fieldCount)
            {
                resultsFile.Write(csComma);
            }
        }

        if (comSumCol == fieldCount + 1)
        {
            resultsFile.Write(csComma + rowNumExp);
            comSumCol = comSumCol + 1;
        }
        if (comSumCol == fieldCount + 2)
        {
            resultsFile.Write(csComma);
            comSumCol = comSumCol + 1;
        }
        if (comSumCol > fieldCount + 2)
        {
            comSumRow = comSumRow + 1;
            resultsFile.Write(csCrLf);
            comSumCol = 1;
        }
    }

    keyExp.Clear();
}

logFile.WriteLine("Done Reading Expected File at " + DateTime.Now + "\r\n");
Console.WriteLine("Done Reading Expected File at " + DateTime.Now + "\r\n");
logFile.WriteLine("Done Analyzing Expected Duplicates at " + DateTime.Now + "\r\n");
Console.WriteLine("Done Analyzing Expected Duplicates at " + DateTime.Now + "\r\n");
logFile.Flush();
However, the problem is that I need both data sets in memory: I iterate through both dictionaries looking for matches, mismatches, duplicates, and dropouts based on the key.
Using this approach of storing the row index, I am still using a lot of memory, because for random access I now have to use the cached version of the CSV reader. So although the dictionary is much smaller, the caching of the data makes up for the savings and I still end up with roughly the same memory usage.
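To make the comparison step concrete, this is roughly what that pass looks like (a simplified sketch, not the real code; dictAct is the dictionary built the same way from the actual file, and CompareRows / WriteDropoutExpected / WriteDropoutActual stand in for the code that fetches the full rows back from the cached readers and writes the result cells):

// Simplified sketch of the comparison pass (dictAct and the called methods are placeholders)
foreach (KeyValuePair<string, long> exp in dictExp)
{
    long rowNumAct;
    if (dictAct.TryGetValue(exp.Key, out rowNumAct))
    {
        // Key exists in both files: fetch both rows via the cached readers
        // and compare field by field to classify each cell as match or mismatch.
        CompareRows(exp.Key, exp.Value, rowNumAct);
        dictAct.Remove(exp.Key);   // so whatever remains below is a true dropout
    }
    else
    {
        // Key only in expected: dropout from the actual file
        WriteDropoutExpected(exp.Key, exp.Value);
    }
}

// Whatever is left in dictAct never matched a key in expected
foreach (KeyValuePair<string, long> act in dictAct)
{
    WriteDropoutActual(act.Key, act.Value);
}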
Hope I am making sense... :)
One option is to get rid of the dictionaries entirely and just loop through the two files, but I am not sure whether that would be as fast as comparing two dictionaries.
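Something like a sorted-merge pass is what I have in mind there (a rough sketch only, assuming both files have already been sorted on the key columns and ignoring duplicates; actFile, BuildKey and the Write... methods are placeholders):

// Rough sketch of a dictionary-free merge pass over two key-sorted CSV files.
using (CsvReader expReader = new CsvReader(new System.IO.StreamReader(@expFile), hasHeaders, 4096))
using (CsvReader actReader = new CsvReader(new System.IO.StreamReader(@actFile), hasHeaders, 4096))
{
    bool hasExp = expReader.ReadNextRecord();
    bool hasAct = actReader.ReadNextRecord();

    while (hasExp && hasAct)
    {
        string keyE = BuildKey(expReader, keyColumns);
        string keyA = BuildKey(actReader, keyColumns);
        int cmp = string.CompareOrdinal(keyE, keyA);

        if (cmp == 0)
        {
            WriteMatchOrMismatch(expReader, actReader);   // compare field by field
            hasExp = expReader.ReadNextRecord();
            hasAct = actReader.ReadNextRecord();
        }
        else if (cmp < 0)
        {
            WriteDropoutExpected(expReader);              // key missing from actual
            hasExp = expReader.ReadNextRecord();
        }
        else
        {
            WriteDropoutActual(actReader);                // key missing from expected
            hasAct = actReader.ReadNextRecord();
        }
    }

    // Drain whichever file still has records left
    while (hasExp) { WriteDropoutExpected(expReader); hasExp = expReader.ReadNextRecord(); }
    while (hasAct) { WriteDropoutActual(actReader); hasAct = actReader.ReadNextRecord(); }
}

This way only one record per file is ever in memory at a time, but it hinges on the files being pre-sorted by the key.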
Any inputs are much appreciated.