tags:

views:

319

answers:

3

For my application I have to parse a CSV file using Erlang. The following is the code that parses CSV using Erlang:

%% Reads the whole file named Fn into memory and parses its contents as CSV.
%% Crashes with a badmatch if file:read_file/1 does not return {ok, _}.
%% Returns the result of parse/1 applied to the file's bytes as a list.
parse_file(Fn) ->
    {ok, Bin} = file:read_file(Fn),
    Chars = binary_to_list(Bin),
    parse(Chars).

%% Parses Data (a character list) into a list of lines, in input order.
%% parse/2 accumulates lines newest-first, so its result is reversed here.
parse(Data) ->
    ReversedLines = parse(Data, []),
    lists:reverse(ReversedLines).

%% Consumes Data one line at a time via parse_line/1, pushing each parsed
%% line onto Acc. Returns Acc (most recent line first) once Data is empty.
parse(Data, Acc) ->
    case Data of
        [] ->
            Acc;
        _ ->
            {Row, Remaining} = parse_line(Data),
            parse(Remaining, [Row | Acc])
    end.

%% Parses one line from the front of Data.
%% Returns {Fields, Rest}: Fields in left-to-right order, Rest being the
%% input left over after the line. parse_line/2 hands the fields back in
%% reverse order, hence the lists:reverse/1.
parse_line(Data) ->
    {RevFields, Rest} = parse_line(Data, []),
    {lists:reverse(RevFields), Rest}.

%% Parses one CSV record from the front of Data, pushing each field onto Acc.
%% Returns {Acc1, Rest}: Acc1 is Acc with this record's fields pushed on
%% (so in reverse order), Rest is the input after the record's line
%% terminator (CRLF, LF or CR) or [] at end of input.
%%
%% Every comma separates exactly two fields. Previously ",," was collapsed
%% into a single empty field and a leading or trailing comma produced no
%% field at all, so records containing runs of empty values lost columns.
%% A line terminator (or end of input) met where a record would start still
%% yields no fields, as before.
parse_line([13,10|Data], Acc) -> {Acc, Data};
parse_line([10|Data], Acc) -> {Acc, Data};
parse_line([13|Data], Acc) -> {Acc, Data};
parse_line([], Acc) -> {Acc, []};
parse_line(Data, Acc) -> parse_line_field(Data, Acc).

%% Reads one (possibly empty) field, then inspects what follows it.
parse_line_field(Data, Acc) ->
    {Fld, Tail} = parse_field(Data),
    parse_line_sep(Tail, [Fld|Acc]).

%% A comma means another field follows, even if that field is empty.
%% Anything else is either the end of the record or stray characters after a
%% closing quote; parse_line/2 handles both (strays start a new field, which
%% is what the original code did).
parse_line_sep([$,|Data], Acc) -> parse_line_field(Data, Acc);
parse_line_sep(Data, Acc) -> parse_line(Data, Acc).

%% Parses a single field from the front of Data.
%% A field that opens with a double quote is read by parse_fieldq/2 (the
%% opening quote is dropped); any other field is read by parse_field/2.
%% Both helpers return the characters in reverse, so they are reversed here.
%% Returns {Field, Rest}.
parse_field(Data) ->
    {RevFld, Rest} =
        case Data of
            [$" | Quoted] -> parse_fieldq(Quoted, "");
            _ -> parse_field(Data, "")
        end,
    {lists:reverse(RevFld), Rest}.

%% Reads an unquoted field, pushing characters onto Acc until a comma, CR,
%% LF or the end of input is reached. The terminating character is NOT
%% consumed: it is left at the head of the returned remainder.
%% Returns {RevChars, Rest} with the field's characters in reverse order.
parse_field([Ch | _] = Rest, Acc) when Ch =:= $,; Ch =:= 13; Ch =:= 10 ->
    {Acc, Rest};
parse_field([], Acc) ->
    {Acc, []};
parse_field([Ch | Rest], Acc) ->
    parse_field(Rest, [Ch | Acc]).

%% Reads the body of a quoted field; the input starts just after the opening
%% double quote (34). A pair of double quotes stands for one literal quote;
%% a single double quote closes the field and is consumed.
%% Returns {RevChars, Rest} with the field's characters in reverse order.
%%
%% If the input ends before a closing quote is seen, the characters read so
%% far are returned as the field instead of crashing with function_clause,
%% mirroring how the unquoted reader treats end of input.
parse_fieldq([34,34|Tail], Acc) -> parse_fieldq(Tail, [34|Acc]);
parse_fieldq([34|Tail], Acc) -> {Acc, Tail};
parse_fieldq([Ch|Tail], Acc) -> parse_fieldq(Tail, [Ch|Acc]);
parse_fieldq([], Acc) -> {Acc, []}.

This code works fine but has two issues: 1 - The code parses using double quotes ("") and commas (,) to separate each value, but in the following example, where the First Name contains a double-quoted string within it, the parser creates one more field.

"Type","First Name","Last Name","Email"
"Contact","Ashwani  Garg ------"All Pain Will End."","","[email protected]"

result:-
[["contact"],["Ashwani  Garg ------"],["All Pain Will End."],[],["[email protected]"]]

expected result:-
[["contact"],["Ashwani  Garg ------All Pain Will End."],[],["[email protected]"]]

2 - For the following kind of CSV, it truncates some values: First Name,Last Name,Middle Name,Name,Nickname,E-mail Address,Home Street,Home City,Home Postal Code,Home State,Home Country/Region,Home Phone,Home Fax,Mobile Phone,Personal Web Page,Business Street,Business City,Business Postal Code,Business State,Business Country/Region,Business Web Page,Business Phone,Business Fax,Pager,Company,Job Title,Department,Office Location,Notes

    Affection,,,Affection,,,,,,,,+919845141544,,+919845141544,,,,,,,,,,,,,,,
    result:-
    [["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[]]
    expected result:-
   [["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]

Please help me. For reference, please use the following link: http://ppolv.wordpress.com/2008/02/25/parsing-csv-in-erlang/

+3  A: 
%% Opens File for line-by-line reading and parses every line as CSV.
%% Crashes with a badmatch if file:open/2 does not return {ok, _}.
%% Returns whatever parse/3 returns once the end of the file is reached.
%%
%% read_ahead is requested because file:read_line/1 on a raw file is
%% documented as inefficient unless the file is opened with read-ahead
%% buffering; it does not change what is read.
parse(File) ->
  {ok, F} = file:open(File, [read, raw, read_ahead]),
  parse(F, file:read_line(F), []).

%% Drives the read loop: the second argument is the result of the previous
%% file:read_line/1 call on F, Done holds the parsed lines newest-first.
%%   eof             - closes F and returns the parsed lines in file order.
%%   {ok, Line}      - parses Line and reads the next one.
%%   {error, Reason} - closes F and raises {read_line_failed, Reason}.
%%
%% file:read_line/1 returns {ok, Data}, not the bare line, and Data keeps its
%% trailing "\n". The line is therefore unwrapped and the newline stripped
%% before it is handed to parse_line/1; passing the tuple through unchanged
%% made the field parser fail with function_clause.
parse(F, eof, Done) ->
  file:close(F),
  lists:reverse(Done);

parse(F, {ok, Line}, Done) ->
  parse(F, file:read_line(F), [parse_line(strip_newline(Line))|Done]);

parse(F, {error, Reason}, _Done) ->
  %% Do not leak the file handle when reading fails part-way through.
  file:close(F),
  erlang:error({read_line_failed, Reason}).

%% Removes a single trailing "\n" from Line, if present.
strip_newline(Line) ->
  case lists:reverse(Line) of
    [$\n | Rev] -> lists:reverse(Rev);
    _ -> Line
  end.



%% Splits one CSV line (a string) into its fields by starting parse_line/2
%% with an empty list of already-parsed fields.
parse_line(Line) ->
  NoFieldsYet = [],
  parse_line(Line, NoFieldsYet).

%% Continues parsing Line; Fields holds the fields parsed so far, newest
%% first. Returns the fields in left-to-right order once Line is exhausted.
%%
%% A comma seen after at least one field is a separator and is skipped
%% before the next field is read. A comma seen when no field has been parsed
%% yet means the line starts with an empty field, so the comma is left in
%% place for parse_field/2, which then records the empty field. Previously
%% that comma was skipped and the leading empty field was lost
%% (",a" produced ["a"]).
parse_line([], Fields) -> lists:reverse(Fields);
parse_line("," ++ _ = Line, []) -> parse_field(Line, []);
parse_line("," ++ Line, Fields) -> parse_field(Line, Fields);
parse_line(Line, Fields) -> parse_field(Line, Fields).

%% Starts reading one field from the front of Line with an empty buffer.
%% A leading double quote is dropped and the rest is read as a quoted field
%% by parse_field_q/3; otherwise parse_field/3 reads an unquoted field.
parse_field(Line, Fields) ->
  case Line of
    [$" | Quoted] -> parse_field_q(Quoted, [], Fields);
    _ -> parse_field(Line, [], Fields)
  end.

%% Reads an unquoted field. Buf collects the field's characters in reverse.
%% At end of input, or at a comma (which is left in place for parse_line/2),
%% the buffered characters are reversed, pushed onto Fields and control
%% returns to parse_line/2. Any other character is added to Buf.
parse_field([], Buf, Fields) ->
  parse_line([], [lists:reverse(Buf) | Fields]);
parse_field([$, | _] = Line, Buf, Fields) ->
  parse_line(Line, [lists:reverse(Buf) | Fields]);
parse_field([C | Rest], Buf, Fields) ->
  parse_field(Rest, [C | Buf], Fields).

%% Convenience entry point: reads a quoted field from Line starting with an
%% empty character buffer. (In the code shown here parse_field/2 calls
%% parse_field_q/3 directly, so nothing visible uses this arity.)
parse_field_q(Line, Fields) ->
  EmptyBuf = [],
  parse_field_q(Line, EmptyBuf, Fields).
%% Reads the body of a quoted field (the opening quote is already consumed).
%% Buf collects the field's characters in reverse.
%%   ""  - an escaped quote: one literal double quote is added to Buf.
%%   "   - closes the field: Buf is reversed, pushed onto Fields and parsing
%%         continues in parse_line/2 with whatever follows the quote.
%%   []  - the line ended before a closing quote: the characters read so far
%%         are kept as the field instead of crashing with function_clause,
%%         matching how parse_field/3 treats end of input.
parse_field_q("\"\"" ++ Line, Buf, Fields) -> parse_field_q(Line, [$"|Buf], Fields);
parse_field_q("\"" ++ Line, Buf, Fields) -> parse_line(Line, [lists:reverse(Buf)|Fields]);
parse_field_q([C|Line], Buf, Fields) -> parse_field_q(Line, [C|Buf], Fields);
parse_field_q([], Buf, Fields) -> parse_line([], [lists:reverse(Buf)|Fields]).

without file:read_line :

%% Reads the whole of File into memory and parses it as CSV without using
%% file:read_line/1. Crashes with a badmatch if file:read_file/1 does not
%% return {ok, _}. Returns the result of parse/2 started with no lines.
parse_file(File) ->
  {ok, Bin} = file:read_file(File),
  Chars = binary_to_list(Bin),
  parse(Chars, []).

%% Splits Data into lines and parses each with parse_line/1.
%% Done holds the parsed lines newest-first; once Data is empty they are
%% returned in input order.
%%
%% The line-break pattern tries "\r\n" before "\r" and "\n". With the old
%% order ("\r|\n|\r\n") the "\r" alternative always won, so a CRLF was split
%% into two breaks and every Windows-style line was followed by a spurious
%% empty line in the result.
parse([], Done) ->
  lists:reverse(Done);

parse(Data, Done) ->
  %% {parts, 2} splits off only the first line; the remainder, if any, is
  %% processed on the next iteration.
  {Line, Rest} = case re:split(Data, "\r\n|\r|\n", [{return, list}, {parts, 2}]) of
                   [L,R] -> {L,R};
                   [L]   -> {L,[]}
                 end,
  parse(Rest, [parse_line(Line)|Done]).
Zed
Thanks for your reply. I have run your script; it works well and sorts also, but check the output: [["Type","First Name","Last Name","Email"], [[]], ["Contact","null",[],"[email protected]"], [[]] You can see there is one [[]] entry after every record that is not of any use. Can we avoid that?
Abhimanyu
OK, I changed the code to put Strings into an extra list.
Zed
** exception error: undefined function file:read_line/1 in function csv_erl_parser:parse/1
Abhimanyu
Upgrade your Erlang install to R13B02, or tell me if that's not possible...
Zed
It's not possible because it depends on the Ubuntu version...
Abhimanyu
OK, I added code to read whole file to memory and work from there. parse_line is the same as before.
Zed
but still the problem is same as stated first..
Abhimanyu
Honestly, I don't understand what you mean... do you have double newlines in your file, or what?
Zed
Filtered = [E || E <- P, E =/= [[]]]. You can filter the [[]]'s from the result like this.
Zed
Hi Zed, if a field consists of a value containing a comma separator, like "abhimanyu,singh", then it will parse this value as two lists, but it is one value only...
Abhimanyu
Try this one :)
Zed
thanx a lot for your help..
Abhimanyu
If it's working fine, accept the answer, so people will know it's been correctly answered. This stands for all your outstanding questions. Your acceptance rate is quite low.
Zed
+1  A: 

Reading lines from a file was also discussed in Trapexit. It should be trivial to adapt that to your needs:

http://www.trapexit.org/Reading%5FLines%5Ffrom%5Fa%5FFile

Roberto Aloi
The first URL seems to be the same as the one at the end of the question ;)
Zed
lol. Sorry about that. I didn't see it.
Roberto Aloi
+2  A: 

A side issue:

How are you creating the CSV input? It doesn't appear to be valid CSV (not that there is a particularly rigorous specification for CSV though).

Typically to use double quotes inside a CSV field they need to be escaped as a pair of double quotes, so your example would be:

"Type","First Name","Last Name","Email"
"Contact","Ashwani  Garg ------""All Pain Will End.""","","[email protected]"

This will import fine into open office spreadsheet, whereas your original example does not.

Rob Charlton