ansaurus

Question

Splitting a string only when the delimiter is not enclosed in quotation marks

Answer 1

+2 A:

What you are asking for is essentially a Javascript CSV parser. Do a Google search on "Javascript CSV Parser" and you'll get lots of hits, many with complete scripts. See also http://stackoverflow.com/questions/1293147/javascript-code-to-parse-csv-data

Jim Garrison 2009-12-30 19:22:14

Just to be pedantic, 'lexer' is a more appropriate term than 'parser' for this problem.

trinithis 2009-12-31 04:51:19

Answer 2

+1 A:

// Sample input: comma-separated fields, one of which is double-quoted and
// itself contains a comma.
var str = 'text, foo, "haha, dude", bar';
// Collects, left to right, every run of lowercase letters OR every span that
// opens with ' or " and runs (non-greedily) up to the next occurrence of that
// same quote character; the quotes are kept in the matched text.
var fragments = str.match(/[a-z]+|(['"]).*?\1/g);

Even better (supports escaped " or ' inside the strings):

// Sample input. The backslash is doubled in the literal so that the string
// itself really contains \" (a single-quoted literal turns a lone \" into a
// bare ", which would leave no escaped quote to demonstrate).
var str = 'text_123 space, foo, "text, here\\", dude", bar, \'one, two\', blob';
// Three alternatives, tried in this order at each position:
//   1. an unquoted field: begins and ends on a character that is not a quote,
//      a comma or whitespace (so surrounding blanks are dropped); it may be a
//      single character long
//   2. a double-quoted field in which backslash escapes the next character
//   3. a single-quoted field with the same escaping rule
// Each quote style needs its own alternative because a backreference such as
// \1 has no group meaning inside a character class.
var fragments = str.match(/[^"',\s](?:[^"',]*[^"',\s])?|"(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*'/g);

// Result:
0: text_123 space
1: foo
2: "text, here\", dude"
3: bar
4: 'one, two'
5: blob

watain 2009-12-30 19:23:22

Doesn't handle swapped `'` and `"` delimiters, non-alpha characters.

Jim Garrison 2009-12-30 19:26:16

Changed it, now it does :)

watain 2009-12-30 19:54:54

I ran it with a simple test, `"a,b"` and it returned `null`. Apparently it doesn't pick up words that consist of less than 3 characters

Andreas Grech 2009-12-30 21:21:31

Some other problems: **1)** `\1` doesn't act as a group reference inside a character class (it just matches `1`); **2)** you only need two backslashes in the regex to match a literal backslash, not four; **3)** to put a backslash-quote in your target string, you have to use `\\"` in the string literal, not `\"`; **4)** unless you're told otherwise by the originator, whitespace should be treated as significant in CSV data (meaning the first part of your regex should be simply `["',\s]+`).

Alan Moore 2009-12-31 14:49:31

Answer 3

A:

If you can control the input to enforce that the string will be enclosed in double-quotes " and that all elements within the string will be enclosed in single-quotes ', and that no element can CONTAIN a single-quote, then you can split on , '. If you CAN'T control the input, then using a regular expression to sort/filter/split the input would be about as useful as using a regular expression to match against xhtml (see: http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454)

md5sum 2009-12-30 19:24:03

I don't see what the thread you linked to has to do with this one. Okay, I wouldn't use regex in this case, but the problem faced here has little to do with trying to parse (x)html using regex. *That* can't be done because of the recursive nature of (x)html, but this problem isn't about that at all. It seems that in every thread with the word "regex" in it, the thread (#1732348) is posted or linked to...

Bart Kiers 2009-12-30 19:57:16

The point is that if you aren't sure WHAT your input might contain, then there's no regular expression that's gonna parse it properly.

md5sum 2009-12-30 20:09:30

My point is that the post #1732348 has little to do with this one. And the fact you don't know exactly what your input is, is exactly what regex is about: you define a pattern of the possible variations.

Bart Kiers 2009-12-30 20:29:53

Answer 4

+1 A:

Well, I already have a jackhammer of a solution written (general code written for something else), so just for kicks . . .

/**
 * Creates a lexer with an empty rule list and no error lexeme.
 *
 * Every argument is treated as an option; unrecognized values are ignored:
 *   - Lexer.USE_NEW                sets useNew to true
 *   - Lexer.SET_INDEX (the function itself)
 *                                  sets setIndex to Lexer.DEFAULT_INDEX
 *   - an instance of Lexer.SET_INDEX
 *                                  sets setIndex to that instance's indexProp
 * When several index options are passed, the last one wins.
 */
function Lexer () {
  this.setIndex = false;
  this.useNew = false;
  var optionCount = arguments.length;
  for (var n = 0; n !== optionCount; n += 1) {
    var option = arguments [n];
    if (option === Lexer.USE_NEW) {
      this.useNew = true;
      continue;
    }
    if (option === Lexer.SET_INDEX) {
      this.setIndex = Lexer.DEFAULT_INDEX;
      continue;
    }
    if (option instanceof Lexer.SET_INDEX) {
      this.setIndex = option.indexProp;
    }
  }
  this.rules = [];
  this.errorLexeme = null;
}

// Sentinel lexeme: lex() removes every occurrence of this exact object from
// its result, so a rule mapped to it consumes text without producing output.
Lexer.NULL_LEXEME = {};

// Ready-made value that can be handed to setErrorLexeme(); it only customizes
// its string form.
Lexer.ERROR_LEXEME = { 
  toString: function () {
    return "[object Lexer.ERROR_LEXEME]";
  }
};

// Property name used for index stamping when none is given explicitly.
Lexer.DEFAULT_INDEX = "index";

// Constructor option: compared by identity in Lexer(); switches useNew on.
Lexer.USE_NEW = {};

/**
 * Constructor option selecting the property name under which lexemes get
 * their index recorded.
 *
 * Usable three ways: passed to Lexer() as-is (meaning Lexer.DEFAULT_INDEX),
 * or instantiated with or without `new` to name a specific property.
 *
 * @param {string} [indexProp] property name; defaults to Lexer.DEFAULT_INDEX
 *   when undefined
 * @returns {Lexer.SET_INDEX} a new instance when called without `new`
 */
Lexer.SET_INDEX = function (indexProp) {
  // Called as a plain function: re-enter as a constructor. The function is
  // referenced by its property rather than through arguments.callee, which is
  // unavailable in strict mode, and `new X.apply (...)` would try to construct
  // Function.prototype.apply instead of X.
  if ( !(this instanceof Lexer.SET_INDEX)) {
    return new Lexer.SET_INDEX (indexProp);
  }
  if (indexProp === undefined) {
    indexProp = Lexer.DEFAULT_INDEX;
  }
  this.indexProp = indexProp;
};

(function () {
  // New.apply (Ctor, args) behaves like `new Ctor (args[0], args[1], ...)`.
  // One trampoline function is generated per argument count and cached, so
  // the source text is only compiled the first time a given arity is seen.
  var New = (function () {
    var trampolines = [];

    // Builds a function that constructs its `this` value with `arity`
    // positional arguments taken from its own argument list.
    function makeTrampoline (arity) {
      var refs = [];
      for (var k = 0; k < arity; ++k) {
        refs.push ("a[" + k + "]");
      }
      return new Function ("var a=arguments;return new this(" + refs.join () + ");");
    }

    return function () {
      var arity = arguments.length;
      var trampoline = trampolines [arity];
      if (!trampoline) {
        trampoline = makeTrampoline (arity);
        // Only the first 100 arities are cached, keeping the table bounded.
        if (arity < 100) {
          trampolines [arity] = trampoline;
        }
      }
      return trampoline.apply (this, arguments);
    };
  }) ();

  // Pairs of [RegExp boolean property, flag letter]. Properties an engine
  // does not know read as undefined and are simply skipped, so listing the
  // newer flags is harmless on older runtimes.
  var flagMap = [
      ["global", "g"]
    , ["ignoreCase", "i"]
    , ["multiline", "m"]
    , ["sticky", "y"]
    , ["dotAll", "s"]
    , ["unicode", "u"]
    , ["unicodeSets", "v"]
    , ["hasIndices", "d"]
    ];

  /**
   * Rebuilds the flag string of a regular expression from its boolean
   * properties, so the expression can be recreated without changing how it
   * matches.
   *
   * @param {RegExp} regex expression to inspect
   * @returns {string} the letters of every flag set on `regex`, in flagMap order
   */
  function getFlags (regex) {
    var flags = "";
    for (var i = 0; i < flagMap.length; ++i) {
      if (regex [flagMap [i] [0]]) {
        flags += flagMap [i] [1];
      }
    }
    return flags;
  }

  /**
   * Builds a predicate that rejects exactly one value.
   *
   * @param {*} x value to exclude
   * @returns {function(*): boolean} true for every argument that is not
   *   strictly equal to `x`
   */
  function not (x) {
    var isDifferent = function (candidate) {
      return !(candidate === x);
    };
    return isDifferent;
  }

  /**
   * Pairs a pattern with the lexeme it produces.
   *
   * A regex that already has the global flag is stored as-is (the very same
   * object); any other regex is replaced by a copy built from its source with
   * "g" added in front of the flags reported by getFlags.
   *
   * @param {RegExp} regex pattern for this rule
   * @param {*} lexeme value, or function producing the value, for a match
   */
  function Rule (regex, lexeme) {
    this.regex = regex.global
      ? regex
      : new RegExp (regex.source, "g" + getFlags (regex));
    this.lexeme = lexeme;
  }

  // Records `at` on `lexeme` under the property name `prop` and returns the
  // lexeme. Does nothing when stamping is off (`prop` falsy) or when the
  // lexeme is a primitive, which cannot carry a property (and would throw in
  // strict mode).
  function stampIndex (lexeme, prop, at) {
    var canHoldProps = lexeme
      && (typeof lexeme === "object" || typeof lexeme === "function");
    if (prop && canHoldProps) {
      lexeme [prop] = at;
    }
    return lexeme;
  }

  Lexer.prototype = {
      constructor: Lexer

    // Appends a rule; `regex` and `lexeme` are passed to Rule unchanged.
    , addRule: function (regex, lexeme) {
        var rule = new Rule (regex, lexeme);
        this.rules.push (rule);
      }

    // Sets the lexeme emitted for text no rule covers. A falsy value
    // (the initial null) makes lex() skip such text silently.
    , setErrorLexeme: function (lexeme) {
        this.errorLexeme = lexeme;
      }

    // Turns a rule's lexeme into an output value for one match.
    // Non-function lexemes are returned as they are. Function lexemes receive
    // the match array's entries (full match, then capture groups) followed by
    // the match index and the input string; they are constructed via New when
    // useNew is set and called with a null `this` otherwise.
    , runLexeme: function (lexeme, exec) {
        if (typeof lexeme !== "function") {
          return lexeme;
        }
        var args = exec.concat (exec.index, exec.input);
        if (this.useNew) {
          return New.apply (lexeme, args);
        }
        return lexeme.apply (null, args);
      }

    // Splits `str` into lexemes and returns them as an array with every
    // Lexer.NULL_LEXEME entry removed.
    //
    // At each scan position every rule is searched forward from that position;
    // the earliest match wins, ties going to the longest match and then to the
    // rule added first. Text in front of the winning match is reported with
    // the error lexeme if one is set, otherwise skipped. When nothing matches
    // at all, an error lexeme ends the scan; without one the rest is skipped.
    //
    // When setIndex holds a property name, each emitted object lexeme gets
    // that property set: matched lexemes to the index where their match
    // starts, error lexemes to the index where the uncovered text starts.
    , lex: function (str) {
        var index = 0;
        var lexemes = [];
        var indexProp = this.setIndex;
        while (index < str.length) {
          var bestExec = null;
          var bestRule = null;
          for (var i = 0; i < this.rules.length; ++i) {
            var rule = this.rules [i];
            rule.regex.lastIndex = index;
            var exec = rule.regex.exec (str);
            // An empty match right at the scan position consumes nothing;
            // accepting it would leave the scan stuck here forever.
            if (!exec || (exec [0].length === 0 && exec.index === index)) {
              continue;
            }
            var doUpdate = !bestExec 
              || (exec.index < bestExec.index)
              || (exec.index === bestExec.index && exec [0].length > bestExec [0].length)
              ;
            if (doUpdate) {
              bestExec = exec;
              bestRule = rule;
            }
          }
          if (!bestExec) {
            if (this.errorLexeme) {
              lexemes.push (stampIndex (this.errorLexeme, indexProp, index));
              return lexemes.filter (not (Lexer.NULL_LEXEME));
            }
            // No winning rule, hence nothing to read an end position from:
            // step over one character and try again.
            ++index;
            continue;
          }
          if (this.errorLexeme && index !== bestExec.index) {
            lexemes.push (stampIndex (this.errorLexeme, indexProp, index));
          }
          var lexeme = this.runLexeme (bestRule.lexeme, bestExec);
          lexemes.push (stampIndex (lexeme, indexProp, bestExec.index));
          // End of the winning match, taken from the match itself so it stays
          // right even if two rules share one regex object.
          index = bestExec.index + bestExec [0].length;
        }
        return lexemes.filter (not (Lexer.NULL_LEXEME));
      }
  };
}) ();

// Fallback for engines without a native Array.prototype.filter; installed
// only when the method is missing.
if (!Array.prototype.filter) {
  // Returns a new array holding, in order, the elements for which `fun`
  // returns a truthy value. `fun` is called as fun (value, index, array) with
  // the optional second argument as its `this`; holes are skipped.
  Array.prototype.filter = function (fun) {
    var total = this.length >>> 0;
    var kept = [];
    var thisp = arguments [1];
    var pos = 0;
    while (pos < total) {
      if (pos in this) {
        // Read the element before invoking the callback, so the stored value
        // is the one the callback was shown even if it modifies the array.
        var item = this [pos];
        if (fun.call (thisp, item, pos, this)) {
          kept.push (item);
        }
      }
      ++pos;
    }
    return kept;
  };
}

Now to use the code for your problem:

/**
 * Removes leading and trailing whitespace.
 *
 * @param {string} str text to clean up
 * @returns {string} `str` without whitespace at either end
 */
function trim (str) {
  var withoutLeading = str.replace (/^\s+/, "");
  return withoutLeading.replace (/\s+$/, "");
}

// Lexer configured to cut a string at commas that are not inside quotes.
// Rules, in the order they are registered:
//   1. a field containing a double-quoted section
//   2. a field containing a single-quoted section
//   3. a plain field (no comma and no quote of either kind)
//   4. the comma itself, mapped to NULL_LEXEME so it is dropped from the output
// Fields are passed through trim; text no rule covers yields ERROR_LEXEME.
var splitter = (function (lexer) {
  lexer.setErrorLexeme (Lexer.ERROR_LEXEME);
  lexer.addRule (/[^,"]*"[^"]*"[^,"]*/g, trim);
  lexer.addRule (/[^,']*'[^']*'[^,']*/g, trim);
  lexer.addRule (/[^,"']+/g, trim);
  lexer.addRule (/,/g, Lexer.NULL_LEXEME);
  return lexer;
}) (new Lexer ());

// Sample inputs: unquoted, single-quoted and double-quoted fields.
var strs = [
  "peanut, butter, jelly",
  "peanut, 'butter, bread', 'jelly'",
  'peanut, "butter, bread", "jelly"'
];

// One array of fields per sample input.
// Array.prototype.map is used for brevity; it is not available in every
// browser.
var splitStrs = strs.map (function (input) {
  return splitter.lex (input);
});

trinithis 2009-12-30 20:18:38

ansaurus

tags:

views:

answers:

Splitting a string only when the delimiter is not enclosed in quotation marks

related questions