The original version I posted here as an answer had a problem in that it only worked while there was more than one "Regex" that matched the current expression. That is, as soon as only one Regex matched, it would return a token - whereas most people want the Regex to be "greedy". This was especially the case for things such as "quoted strings".
The only solution that sits on top of Regex is to read the input line-by-line (which means you cannot have tokens that span multiple lines). I can live with this - it is, after all, a poor man's lexer! Besides, it's usually useful to get line number information out of the Lexer in any case.
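So, here's a new version that addresses these issues. Credit also goes to an external source that was linked at this point in the original post (the link is missing from this copy).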
/// <summary>
/// Abstraction over "something that can recognise a token at the front of a
/// piece of text" - a regular expression or any equivalent mechanism.
/// </summary>
public interface IMatcher
{
    /// <summary>
    /// Try to match against the supplied text and report how much of it
    /// was consumed.
    /// </summary>
    /// <param name="text">The text to be matched</param>
    /// <returns>
    /// The number of characters that matched. The lexer in this file treats
    /// only values greater than zero as a successful match and takes that
    /// many characters from the start of the text.
    /// </returns>
    int Match(string text);
}
/// <summary>
/// An <see cref="IMatcher"/> backed by a regular expression that is only
/// allowed to match at the very start of the text it is given.
/// </summary>
class RegexMatcher : IMatcher
{
    private readonly Regex regex;

    /// <summary>
    /// Build a matcher from a regular expression pattern.
    /// </summary>
    /// <param name="regex">
    /// The pattern to match. It must not include its own start anchor; one
    /// is added here.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="regex"/> is null.
    /// </exception>
    public RegexMatcher(string regex)
    {
        if (regex == null)
        {
            throw new ArgumentNullException("regex");
        }

        // The pattern is wrapped in a non-capturing group so that the '^'
        // anchor applies to the whole pattern. Without the group, a pattern
        // with top-level alternation such as "a|b" would become "^a|b",
        // leaving the "b" branch free to match anywhere in the text.
        // A non-capturing group is used so that numbered backreferences
        // inside the caller's pattern (e.g. \1) keep their meaning.
        this.regex = new Regex("^(?:" + regex + ")");
    }

    /// <summary>
    /// Match the pattern against the start of <paramref name="text"/>.
    /// </summary>
    /// <param name="text">The text to be matched</param>
    /// <returns>
    /// The length of the match, or 0 when the pattern does not match at the
    /// start of the text (a zero-length match also yields 0).
    /// </returns>
    public int Match(string text)
    {
        Match m = regex.Match(text);
        return m.Success ? m.Length : 0;
    }

    /// <summary>
    /// Returns the anchored pattern actually used for matching.
    /// </summary>
    public override string ToString()
    {
        return regex.ToString();
    }
}
/// <summary>
/// Pairs a matcher with the token value that the lexer reports whenever
/// that matcher succeeds.
/// </summary>
public class TokenDefinition
{
    /// <summary>Matcher built from the pattern given to the constructor.</summary>
    public readonly IMatcher Matcher;

    /// <summary>Caller-supplied value that identifies this kind of token.</summary>
    public readonly object Token;

    /// <summary>
    /// Create a definition whose matcher is a <see cref="RegexMatcher"/>
    /// over the given pattern.
    /// </summary>
    /// <param name="regex">Regular expression pattern for the token.</param>
    /// <param name="token">Value to report when the pattern matches.</param>
    public TokenDefinition(string regex, object token)
    {
        Token = token;
        Matcher = new RegexMatcher(regex);
    }
}
/// <summary>
/// A simple line-oriented lexer. Input is read one line at a time from a
/// <see cref="TextReader"/>, so a token can never span more than one line.
/// At each step the token definitions are tried in array order and the
/// first one that matches a non-empty prefix of the remaining line wins.
/// Empty lines are skipped.
/// </summary>
public class Lexer : IDisposable
{
    private readonly TextReader reader;
    private readonly TokenDefinition[] tokenDefinitions;

    // Unconsumed tail of the current line; null once the reader is exhausted.
    private string lineRemaining;
    private string tokenContents;
    private object currentToken;

    // 1-based number of the line currently held in lineRemaining.
    private int lineNumber = 0;

    // Number of characters already consumed from the current line.
    private int position = 0;

    // Line on which the most recently returned token was found. Kept apart
    // from lineNumber because Next() reads ahead to the following line as
    // soon as the current one is used up.
    private int tokenLineNumber;

    /// <summary>
    /// Create a lexer and read the first non-empty line from the reader.
    /// </summary>
    /// <param name="reader">
    /// Source of the text to tokenize. It is disposed by <see cref="Dispose"/>.
    /// </param>
    /// <param name="tokenDefinitions">
    /// Token definitions, tried in order for every token.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="reader"/> or <paramref name="tokenDefinitions"/> is null.
    /// </exception>
    public Lexer(TextReader reader, TokenDefinition[] tokenDefinitions)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }
        if (tokenDefinitions == null)
        {
            throw new ArgumentNullException("tokenDefinitions");
        }

        this.reader = reader;
        this.tokenDefinitions = tokenDefinitions;
        nextLine();

        // Until the first token is read, report the line that was just loaded.
        tokenLineNumber = lineNumber;
    }

    // Load the next non-empty line (or null at end of input) and reset the
    // per-line position. lineNumber counts every line read, including the
    // empty ones that are skipped.
    private void nextLine()
    {
        do
        {
            lineRemaining = reader.ReadLine();
            ++lineNumber;
            position = 0;
        } while (lineRemaining != null && lineRemaining.Length == 0);
    }

    /// <summary>
    /// Advance to the next token.
    /// </summary>
    /// <returns>
    /// true when a token was read and <see cref="Token"/>,
    /// <see cref="TokenContents"/> and <see cref="LineNumber"/> describe it;
    /// false when the input is exhausted.
    /// </returns>
    /// <exception cref="FormatException">
    /// No token definition matches the start of the remaining text.
    /// </exception>
    public bool Next()
    {
        if (lineRemaining == null)
        {
            return false;
        }

        foreach (TokenDefinition def in tokenDefinitions)
        {
            int matched = def.Matcher.Match(lineRemaining);
            if (matched > 0)
            {
                // Record the line before nextLine() can move past it, so
                // LineNumber is correct for the last token on a line.
                tokenLineNumber = lineNumber;

                position += matched;
                currentToken = def.Token;
                tokenContents = lineRemaining.Substring(0, matched);
                lineRemaining = lineRemaining.Substring(matched);
                if (lineRemaining.Length == 0)
                {
                    nextLine();
                }
                return true;
            }
        }

        throw new FormatException(string.Format("Unable to match against any tokens at line {0} position {1} \"{2}\"",
            lineNumber, position, lineRemaining));
    }

    /// <summary>Text of the token most recently read by <see cref="Next"/>.</summary>
    public string TokenContents
    {
        get { return tokenContents; }
    }

    /// <summary>
    /// Token value (from the matching <see cref="TokenDefinition"/>) of the
    /// token most recently read by <see cref="Next"/>.
    /// </summary>
    public object Token
    {
        get { return currentToken; }
    }

    /// <summary>
    /// 1-based number of the line containing the token most recently read
    /// by <see cref="Next"/>.
    /// </summary>
    public int LineNumber
    {
        get { return tokenLineNumber; }
    }

    /// <summary>Disposes the underlying reader.</summary>
    public void Dispose()
    {
        reader.Dispose();
    }
}
Example program:
string sample = @"( one (two 456 -43.2 "" \"" quoted"" ))";

// The lexer tries these in order and takes the first one that matches, so
// the order matters: FLOAT must come before INT (otherwise "-43.2" would
// lex as INT "-43"), and both must come before SYMBOL, which also accepts
// '-' and '+'.
var defs = new TokenDefinition[]
{
    // Thanks to Steven Levithan for this great quoted string regex
    new TokenDefinition(@"([""'])(?:\\\1|.)*?\1", "QUOTED-STRING"),
    // Thanks to http://www.regular-expressions.info/floatingpoint.html
    new TokenDefinition(@"[-+]?\d*\.\d+([eE][-+]?\d+)?", "FLOAT"),
    new TokenDefinition(@"[-+]?\d+", "INT"),
    new TokenDefinition(@"#t", "TRUE"),
    new TokenDefinition(@"#f", "FALSE"),
    new TokenDefinition(@"[*<>\?\-+/A-Za-z->!]+", "SYMBOL"),
    new TokenDefinition(@"\.", "DOT"),
    new TokenDefinition(@"\(", "LEFT"),
    new TokenDefinition(@"\)", "RIGHT"),
    new TokenDefinition(@"\s", "SPACE")
};

TextReader r = new StringReader(sample);

// Lexer.Dispose() disposes the reader it was given, so a single using
// statement releases both.
using (Lexer l = new Lexer(r, defs))
{
    while (l.Next())
    {
        Console.WriteLine("Token: {0} Contents: {1}", l.Token, l.TokenContents);
    }
}
Output:
Token: LEFT Contents: (
Token: SPACE Contents:
Token: SYMBOL Contents: one
Token: SPACE Contents:
Token: LEFT Contents: (
Token: SYMBOL Contents: two
Token: SPACE Contents:
Token: INT Contents: 456
Token: SPACE Contents:
Token: FLOAT Contents: -43.2
Token: SPACE Contents:
Token: QUOTED-STRING Contents: " \" quoted"
Token: SPACE Contents:
Token: RIGHT Contents: )
Token: RIGHT Contents: )