The following code contains a recursion function which is giving an error:
I think the problem is the len(childLinks)==0 which cant be true bcoz web
page will definately contain link so len of childLinks will never be zero
and this recursive function will go into infinite loop.What should i do in
order to come out of the infinite loop.
import sgmllib
import urllib, sgmllib
import sys
sys.setrecursionlimit(1000)
class Parser(sgmllib.SGMLParser):
def parse(self, s):
"Parse the given string 's'."
self.feed(s)
self.close()
def __init__(self, verbose=0):
"Initialise an object, passing 'verbose' to the superclass."
sgmllib.SGMLParser.__init__(self, verbose)
self.hyperlinks = []
self.inside_a_element = 0
def start_a(self, attributes):
"Process a hyperlink and its 'attributes'."
for name, value in attributes:
if name == "href":
self.hyperlinks.append(value)
self.inside_a_element = 1
def end_a(self):
"Record the end of a hyperlink."
self.inside_a_element = 0
def get_hyperlinks(self):
"Return the list of hyperlinks."
return self.hyperlinks
def process_page(self, parentNode):
print "parent :: ", parentNode.nodeName
file= urllib.urlopen(parentNode.nodeName)
data = file.read()
parser = Parser()
parser.parse(data)
childLinks = parser.get_hyperlinks()
if len(childLinks) == 0:
print "Leaf Node :: ", parentNode.nodeName
return
for childLink in childLinks:
childNode = Node(childLink)
parentNode.addChild(childNode)
print " Child :: ", childNode.nodeName
self.process_page(childNode)
class Node(object):
def __init__(self, nodeName=None, children=[]):
self.nodeName = nodeName
self.children = children
def print_info(self):
print "RootNode", "<" + self.rootNode + ">"
def getNodeName(self):
return nodeName
def setNodeName(self,value):
self.nodeName=value
def getParentNode(self):
return parentNode
def setChildren(children):
self.children=children
def addChild(self, child):
self.children.append(child)
def getChildren(self):
return children
def setRootNode(self,rootNode):
self.rootNode="d://PythonSample/Page.html"
def getRootNode(self):
return self.rootNode
def getFirstChild():
firstChild=Node("d://PythonSample/Page.html")
return firstChild
class TreeCreator:
def __init__(self, startURL):
self.startURL = startURL
def createTree(self):
parser=Parser()
node = Node(startURL)
parser.process_page(node)
return node
def printTree(self, node):
print node.nodeName
children = node.children
if len(children) == 0 :
return
for childNode in children:
print " ->" ,childNode.nodeName
#self.printTree(childNode)
if name == 'main':
startURL="http://ocado.com"
treeCreator =TreeCreator(startURL)
nodeTree = treeCreator.createTree()
treeCreator.printTree(nodeTree);
Kindly suggest something.