views:

112

answers:

2

I modified a regex I got here. I needed to change it because I needed it to match the following additional criteria:

  1. Dates with only Month and Year
  2. Full dates in the form mm dd, yyyy
  3. Dates with year only
  4. Input with extraneous info (like Vol. 51, No. 1, Mar 2008)

This is what I have so far. I did this with RegexBuddy to help me parse the logic, but it's so complex I'm not certain I have the most efficient solution.

\b(?:((Jan(uary)?|Feb(ruary)?|Ma(r(ch)?|y)|Apr(il)?|Ju((ly?)|(ne?))|Aug(ust)?|Oct(ober)?|(Sept|Nov|Dec)(ember)?)|((((Jan(uary)?|Ma(r(ch)?|y)|Jul(y)?|Aug(ust)?|Oct(ober)?|Dec(ember)?) 31)|((Jan(uary)?|Ma(r(ch)?|y)|Apr(il)?|Ju((ly?)|(ne?))|Aug(ust)?|Oct(ober)?|(Sept|Nov|Dec)(ember)?) (0?[1-9]|([12]\d)|30))|(Feb(ruary)? (0?[1-9]|1\d|2[0-8]|(29(?=, ((1[6-9]|[2-9]\d)(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))))),)) ((1[6-9]|[2-9]\d)\d{2}))|((1[6-9]|[2-9]\d)\d{2})

Is there anything that could be done to simplify it while preserving the functionality of both the original regex and my additional criteria?

Here is the code where I implement this, if it helps you see what I'm trying to do. The output of the parseDate function is supposed to be a string date in the form "yyyy mm dd" (i.e., example 4 should output "2008 Mar"):

// Generalized RegEx helper: runs `ex` against `haystack` and returns the full
// text of the match. Yields undefined when the pattern does not match.
function returnRegExMatch(ex,haystack) {
  var found = ex.exec(haystack);
  if (!found) { return; }
  return found[0];
}

// date extraction (uses returnRegExMatch)
//
// Finds the first date-like fragment in `date` and returns it normalised as
// "yyyy", "yyyy Mon" or "yyyy Mon d", where Mon is the three-letter month
// abbreviation and d is the day without zero padding. Returns "" when the
// input contains nothing that looks like a date.
//
// Recognised fragments (years 1600-9999, month names case-sensitive):
//   - "Month d, yyyy"  (day must be valid for the month; Feb 29 only in
//                       leap years)
//   - "Month yyyy"
//   - "yyyy"
// Months may be spelled out or abbreviated; "Sep", "Sept" and "September"
// are all accepted.
function parseDate(date) {
  // The pattern is assembled from named, non-capturing pieces because only
  // the overall match is used.
  var yearPat = "(?:1[6-9]|[2-9]\\d)\\d{2}";
  // Leap years: divisible by 4 but not by 100, or divisible by 400.
  var leapYearPat = "(?:(?:1[6-9]|[2-9]\\d)(?:0[48]|[2468][048]|[13579][26])" +
    "|(?:16|[2468][048]|[3579][26])00)";
  var febPat = "Feb(?:ruary)?";
  var months31Pat = "(?:Jan(?:uary)?|Ma(?:r(?:ch)?|y)|July?|Aug(?:ust)?" +
    "|Oct(?:ober)?|Dec(?:ember)?)";
  var monthsNotFebPat = "(?:Jan(?:uary)?|Ma(?:r(?:ch)?|y)|Apr(?:il)?" +
    "|Ju(?:ly?|ne?)|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?" +
    "|(?:Nov|Dec)(?:ember)?)";
  var anyMonthPat = "(?:" + febPat + "|" + monthsNotFebPat + ")";
  // "Month d," with the day range checked per month; the Feb 29 branch
  // peeks ahead at the year to confirm it is a leap year.
  var monthDayPat = "(?:" +
      months31Pat + " 31" +
      "|" + monthsNotFebPat + " (?:0?[1-9]|[12]\\d|30)" +
      "|" + febPat + " (?:0?[1-9]|1\\d|2[0-8]|29(?=, " + leapYearPat + "))" +
    "),";
  // Month-only is tried before month-and-day; a bare year is the fallback.
  var dateRe = new RegExp(
    "\\b(?:(?:" + anyMonthPat + "|" + monthDayPat + ") " + yearPat + ")" +
    "|" + yearPat
  );

  //strip anything other than a valid date
  var matched = returnRegExMatch(dateRe,date);
  if (!matched) { return ""; }

  var yearRe = /[0-9][0-9][0-9][0-9]/;
  var monthRe = /Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/;
  var dayRe = /[0-9]?[0-9],/;

  var year = returnRegExMatch(yearRe,matched);
  var month = returnRegExMatch(monthRe,matched);
  // The day is the only number followed by a comma; parseInt drops the
  // comma and gives NaN (falsy) when there is no day.
  var day = parseInt(returnRegExMatch(dayRe,matched),10);

  var dateReturned = "";
  if (year) { dateReturned = year; }
  if (month) { dateReturned = dateReturned + " " + month; }
  if (month && day) { dateReturned = dateReturned + " " + day; }

  return dateReturned;
}

Thanks!

EDIT Thanks to all who took time to respond. You guys did what I was hoping for, pointing out the most ridiculous things in my implementation. I decided to simplify the main regex quite a bit. Here's the result:

\b(?:(?:Jan(?:uary)?|Feb(?:ruary)?|Ma(?:r(?:ch)?|y)|Apr(?:il)?|Ju(?:(?:ly?)|(?:ne?))|Aug(?:ust)?|Oct(?:ober)?|(?:Sept|Nov|Dec)(?:ember)?) (?:\d{1,2}, )?)?\d{4}

This doesn't worry about detecting invalid dates based on leap years or the like. @Bart convinced me that this is probably better done with native JS than with regex. Thanks to @Tim too for pointing out the need for non-capturing parentheses.

If anyone has further suggestions for how I should refine this regex please fire away.

+4  A: 

I must say that I'm having trouble grokking this monster :)

Two things that are immediately apparent:

  1. It would be more efficient to use non-capturing parentheses (?:...) than regular parentheses if you're not planning on using their (sub-)matches later on.

  2. If you have your parentheses nested to ten levels, something is wrong. It might work, but it's a b*tch to maintain. Or understand.

I'll check with RegexMagic if there's maybe a better way to get what you need. But since nobody is forcing you to do all you want to do in a single regex, why not break up the problem into components, using a single, simpler regex for each?

Tim Pietzcker
+1 break 'em up
mobrule
+3  A: 

How about something like this:

#!/usr/bin/js

// Maps an English month name or abbreviation (any letter case) to its month
// number, 1 = January ... 12 = December. Returns undefined for anything
// that is not a recognised month, including non-string input.
function getMonth(monthStr) {
    // A plain object is used as the lookup table; the own-property check
    // below keeps inherited names such as "constructor" from being mistaken
    // for months.
    var monthMap = {
        jan: 1,  january: 1,
        feb: 2,  february: 2,
        mar: 3,  march: 3,
        apr: 4,  april: 4,
        may: 5,
        jun: 6,  june: 6,
        jul: 7,  july: 7,
        aug: 8,  august: 8,
        sep: 9,  sept: 9, september: 9,
        oct: 10, october: 10,
        nov: 11, november: 11,
        dec: 12, december: 12
    };
    if (typeof monthStr !== 'string') return undefined;
    var key = monthStr.toLowerCase();
    return Object.prototype.hasOwnProperty.call(monthMap, key) ? monthMap[key] : undefined;
}

// Gregorian leap-year test: true for years divisible by 4, except century
// years, which must also be divisible by 400.
function isLeapYear(year) {
    if (year % 4 !== 0) { return false; }
    if (year % 100 !== 0) { return true; }
    return year % 400 === 0;
}

// Checks that `str` consists solely of decimal digits (so "0" and "007"
// pass, while "-1", "1.5" and "" do not). Returns the regex match array
// (truthy) on success and null otherwise. null/undefined input yields null
// instead of throwing; other non-string values are converted to a string
// before the check.
function isPositiveNumber(str) {
    if (str === null || str === undefined) return null;
    return String(str).match(/^\d+$/);
}

// Parses a date written as "Month d, yyyy" (comma optional, month spelled
// out or abbreviated as understood by getMonth) and validates it.
//
// Returns a Date set to that year/month/day when the date exists in the
// calendar, or false when the input is malformed or names an impossible date
// (e.g. "Nov 31, 2009", "Feb 29, 1900", day 0). The Date is created with
// `new Date()` and only its year/month/day are overwritten, so it keeps the
// time of day at which parseDate was called.
function parseDate(date) {
    if (typeof date !== 'string') return false;

    var tokens = date.split(/,?\s+/);
    // Month, day and year are all required; bail out before touching
    // tokens that are not there.
    if (tokens.length < 3) return false;

    var m = getMonth(tokens[0]);
    var d = tokens[1];
    var y = tokens[2];

    if(!isPositiveNumber(d) || typeof m !== 'number' || !m || !isPositiveNumber(y)) return false;

    // Compare as numbers rather than relying on string coercion.
    var day = parseInt(d, 10);
    var year = parseInt(y, 10);

    var daysInMonth = [31, isLeapYear(year) ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];
    var maxDay = daysInMonth[m-1];

    // Day 0 would otherwise roll back to the last day of the previous month.
    if(!(day >= 1 && day <= maxDay)) return false;

    var dateObj = new Date();
    // Setting year, month and day in one call avoids intermediate overflow
    // from whatever today's date happens to be.
    dateObj.setFullYear(year, m-1, day);
    return dateObj;
}

// Sample inputs covering a valid date, a day that does not exist in the
// month, and Feb 29 in a non-leap year, a leap year and a non-leap century.
var tests = ['January 31, 2009', 'Nov 31, 2009', 'Feb 29, 2001', 'Feb 29, 2000', 'Feb 29, 1900'];

// Indexed loop rather than for...in, so only the array elements are visited,
// in order. `print` is not defined in this file; presumably it is the output
// function of the JS shell named in the shebang.
for(var i = 0; i < tests.length; i++) {
    var date = parseDate(tests[i]);
    print(date ? tests[i]+" is a valid date, parsed as: "+date : tests[i]+" invalid");
}

Output:

January 31, 2009 is a valid date, parsed as: Sat Jan 31 2009 20:31:33 GMT+0100 (CET)
Nov 31, 2009 invalid
Feb 29, 2001 invalid
Feb 29, 2000 is a valid date, parsed as: Tue Feb 29 2000 20:31:33 GMT+0100 (CET)
Feb 29, 1900 invalid
Bart Kiers
Bravo! A huge improvement in readability. If you count the time it takes to try to understand this when debugging, this is vastly more efficient too. (I think efficiency calculations should always measure run-time plus programmer-time.)
Jeremy Stein
Thanks! Just one clarification: I want the parseDate function to return a string not a Date object. The string format is a little weird, but it's what I need for the application where this code goes. I've edited my question to reflect that.
dtjohnso
Well, then you either need to adjust my example slightly, or perhaps better: use the SimpleDateFormat class to format the Date into a String. Let me know if you run into problems.
Bart Kiers