Here's a way to parse your meta-language with pyparsing:
import pyparsing as p
# Punctuation of the meta-language. Each literal is suppressed so that it is
# matched but left out of the parse results.
lbrace = p.Literal('{').suppress()
rbrace = p.Literal('}').suppress()
equals = p.Literal('=').suppress()
slash = p.Literal('/').suppress()

# A name: first character from `alphas`, remaining characters from `alphanums`.
identifier = p.Word(p.alphas, p.alphanums)
# A double-quoted string that uses backslash as its escape character.
qs = p.QuotedString('"', '\\')

# Opening tag: {name attr=value attr="quoted value" ...}
# The attribute list is wrapped in its own Group so that an empty list still
# produces an (empty) 'attributes' entry.
tag = p.Group(
    lbrace
    + identifier.setResultsName('tag_name')
    + p.Group(p.ZeroOrMore(
        p.Group(
            identifier.setResultsName('attr_name')
            + equals
            + (identifier | qs).setResultsName('value')
        ).setResultsName('attribute')
    )).setResultsName('attributes')
    + rbrace
).setResultsName('tag')

# Closing tag: {/name}
close_tag = p.Group(
    lbrace
    + slash
    + identifier.setResultsName('tag_name')
    + rbrace
).setResultsName('closetag')

# Either kind of tag; `tag` is tried first and cannot match "{/", because it
# requires an identifier right after the opening brace.
any_tag = tag | close_tag

s = """
what
{foo} {/foo} {bar baz="bat" bat="baz"}{b}dongs{/b} moredongs{/bar}
"""

# scanString is unpacked here as (tokens, start, end) for every match found in
# the text; only the tokens are used, rendered as XML.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# unlike the Python 2-only print statement.
# NOTE(review): asXML() appears to exist only in older pyparsing releases
# (reportedly removed in 3.0) -- confirm against the installed version.
print(''.join(tok.asXML() for tok, _start, _end in any_tag.scanString(s)))
Output:
<tag>
<tag>
<tag_name>foo</tag_name>
<attributes>
</attributes>
</tag>
</tag>
<closetag>
<closetag>
<tag_name>foo</tag_name>
</closetag>
</closetag>
<tag>
<tag>
<tag_name>bar</tag_name>
<attributes>
<attribute>
<attr_name>baz</attr_name>
<value>bat</value>
</attribute>
<attribute>
<attr_name>bat</attr_name>
<value>baz</value>
</attribute>
</attributes>
</tag>
</tag>
<tag>
<tag>
<tag_name>b</tag_name>
<attributes>
</attributes>
</tag>
</tag>
<closetag>
<closetag>
<tag_name>b</tag_name>
</closetag>
</closetag>
<closetag>
<closetag>
<tag_name>bar</tag_name>
</closetag>
</closetag>
You should look at what scanString does to see how to use this in your own code: as the loop above shows, it yields a (tokens, start, end) triple for each match. Getting the text between tags is left as an exercise for the reader. Hint: use a list as a stack for tags.