views:

556

answers:

2
Dim re As New Regex("((?<field>[^"",\r\n]+)|""(?<field>([^""]|"""")+)"")((?<rowbreak>,\r\n|\r\n|\n|$)|,)")


3700,Collin,Franc,,[email protected],Collins,Francine,,[email protected],"Enel North America, Inc.",One T Drive,Suite 220,MyCity,MA,77774,1,[email protected],,,,,3700,
3701,Steur,Larry,,[email protected],Steur,Larry,,[email protected],"St. Petersburg Corp, Inc.",10490 Gady Blvd,,MyCity,FL,33772,1,[email protected],,,,,3701
3705,Giger,Tina,CFO,[email protected],Giger,Tina,CFO,[email protected],Net Technologies,23 Calab Rd,Suite 202,Calabas,CA,77777,1,[email protected],,,,,3705,

The RegEx I'm using doesn't work in that if there are empty columns it "slides" the next column over into it's place. What can I do to allow so that the following makes five columns for each row?

1,2,3,4,5
Test,,,Test,
Test,,Test,,

Here's my full code:

<%@ Page Language="VB" MasterPageFile="~/_Common/MasterPage.master" %>

<%@ Import Namespace="System.Data" %>
<%@ Import Namespace="System.IO" %>
<%@ Import Namespace="System.Text.RegularExpressions" %>
<%@ Import Namespace="System.Xml" %>

<script runat="server">

    Private Sub Page_Load(ByVal sender As Object, ByVal e As System.EventArgs)
    
        ' call the parser
        Dim dt As DataTable = ParseCSVFile(Server.MapPath("Documents/Test.csv"))
    
        ' bind the resulting DataTable to a DataGrid Web Control
        GridView1.DataSource = dt
        GridView1.DataBind()
    End Sub

    Public Function ParseCSV(ByVal inputString As String) As DataTable
    
        Dim dt As New DataTable()
    
        ' declare the Regular Expression that will match versus the input string
        Dim re As New Regex("((?<field>[^"",\r\n]+)|""(?<field>([^""]|"""")+)"")((?<rowbreak>,\r\n|\r\n|\n|$)|,)")
    
        Dim colArray As New ArrayList()
        Dim rowArray As New ArrayList()
    
        Dim colCount As Integer = 0
        Dim maxColCount As Integer = 0
        Dim rowbreak As String = ""
        Dim field As String = ""
    
        Dim mc As MatchCollection = re.Matches(inputString)
    
        For Each m As Match In mc
        
            ' retrieve the field and replace two double-quotes with a single double-quote
            field = m.Result("${field}").Replace("""""", """")
        
            rowbreak = m.Result("${rowbreak}")
        
            If field.Length > 0 Then
                colArray.Add(field)
                colCount += 1
            End If
        
            If rowbreak.Length > 0 Then
            
                ' add the column array to the row Array List
                rowArray.Add(colArray.ToArray())
            
                ' create a new Array List to hold the field values
                colArray = New ArrayList()
            
                If colCount > maxColCount Then
                    maxColCount = colCount
                End If
            
                colCount = 0
            End If
        Next
    
        If rowbreak.Length = 0 Then
            ' this is executed when the last line doesn't
            ' end with a line break
            rowArray.Add(colArray.ToArray())
            If colCount > maxColCount Then
                maxColCount = colCount
            End If
        End If
    
        ' create the columns for the table
        For i As Integer = 0 To maxColCount - 1
            dt.Columns.Add([String].Format("col{0:000}", i))
        Next
    
        ' convert the row Array List into an Array object for easier access
        Dim ra As Array = rowArray.ToArray()
        For i As Integer = 0 To ra.Length - 1
        
            ' create a new DataRow
            Dim dr As DataRow = dt.NewRow()
        
            ' convert the column Array List into an Array object for easier access
            Dim ca As Array = DirectCast((ra.GetValue(i)), Array)
        
            ' add each field into the new DataRow
            For j As Integer = 0 To ca.Length - 1
                dr(j) = ca.GetValue(j)
            Next
        
            ' add the new DataRow to the DataTable
            dt.Rows.Add(dr)
        Next
    
        ' in case no data was parsed, create a single column
        If dt.Columns.Count = 0 Then
            dt.Columns.Add("NoData")
        End If
    
        Return dt
    End Function

    Public Function ParseCSVFile(ByVal path As String) As DataTable
    
        Dim inputString As String = ""
    
        ' check that the file exists before opening it
        If File.Exists(path) Then
        
            Dim sr As New StreamReader(path)
            inputString = sr.ReadToEnd()
            sr.Close()
        
        End If
    
        Return ParseCSV(inputString)
    End Function

</script>


<asp:Content ID="Content1" ContentPlaceHolderID="head" Runat="Server">

</asp:Content>

<asp:Content ID="Content2" ContentPlaceHolderID="ContentPlaceHolder1" Runat="Server">
    <asp:FileUpload ID="FileUpload1" runat="server" /> 
    <asp:Button ID="btnUpload" runat="server" Text="Upload" />
    <asp:GridView ID="GridView1" runat="server">
    </asp:GridView>
</asp:Content>

I actually would prefer doing this with Linq but I couldn't find any examples that used CSV files that had quoted values (for values that contained commas in the value itself).

** UPDATE **

I changed the RegEx to this and it's getting me closer:

Dim re As New Regex("(?<field>,)|((?<field>[^"",\r\n]+)|""(?<field>([^""]|"""")+)"")(,|(?<rowbreak>\r\n|\n|$))")

now it puts all the data in the correct columns but there's just a comma in the empty columns

+1  A: 

Why use regular expressions? You have a pretty simple parsing job - use commas as separators, unless they are inside quotes. So you can iterate through each line character by character, setting flags when you hit an oppening quote, and unsetting it when you hit an ending quote. While "inside quote" you ignore the commas. (I gave up on trying to do this using regular expressions)... Try this:

private DataTable GetDataTableFromCsv(string csvContent)
    {
        DataTable dt = new DataTable();
        string all = csvContent;
        string[] lines = all.Split(new string[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);
        foreach (String s in lines)
        {
            bool isInsideQuote = false;
            List<string> values = new List<string>();
            string accumulator = "";
            for (int j = 0; j < s.Length; j++)// (char c in s)
            {
                char c = s[j];
                if (c == '"')
                {
                    if (!isInsideQuote)
                    {
                        isInsideQuote = true;
                    }
                    else
                    {
                        isInsideQuote = false;
                    }
                    accumulator += "\"";
                }
                else if (c == ',' && !isInsideQuote)
                {
                    values.Add(accumulator);
                    accumulator = ""; //reset
                }
                else if (j == s.Length - 1)
                {
                    accumulator += c;
                    values.Add(accumulator);
                    accumulator = "";
                }
                else
                {
                    accumulator += c;
                }
            }
            //first time, create relevant columns
            if (dt.Columns.Count == 0)
            {
                foreach (string colExample in values)
                {
                    dt.Columns.Add();
                }
                //list of strings now contains an example.
            }
            DataRow dr = dt.NewRow();
            for (int i = 0; i < values.Count; i++)
            {
                dr[i] = values[i];
            }
            dt.Rows.Add(dr);
        }
        return dt;
    }
Ilya Tchivilev
A: 

This doesn't work for values that have an issue with a comma that is encapsulated.