I am currently the following code based on Chapter 12.5 of the Python Cookbook:
from xml.parsers import expat
class Element(object):
def __init__(self, name, attributes):
self.name = name
self.attributes = attributes
self.cdata = ''
self.children = []
def addChild(self, element):
self.children.append(element)
def getAttribute(self,key):
return self.attributes.get(key)
def getData(self):
return self.cdata
def getElements(self, name=''):
if name:
return [c for c in self.children if c.name == name]
else:
return list(self.children)
class Xml2Obj(object):
def __init__(self):
self.root = None
self.nodeStack = []
def StartElement(self, name, attributes):
element = Element(name.encode(), attributes)
if self.nodeStack:
parent = self.nodeStack[-1]
parent.addChild(element)
else:
self.root = element
self.nodeStack.append(element)
def EndElement(self, name):
self.nodeStack.pop()
def CharacterData(self,data):
if data.strip():
data = data.encode()
element = self.nodeStack[-1]
element.cdata += data
def Parse(self, filename):
Parser = expat.ParserCreate()
Parser.StartElementHandler = self.StartElement
Parser.EndElementHandler = self.EndElement
Parser.CharacterDataHandler = self.CharacterData
ParserStatus = Parser.Parse(open(filename).read(),1)
return self.root
I am working with XML docs about 1 GB in size. Does anyone know a faster way to parse these?