import random
import time
import urllib2

# char, hash_url, agents, and the Url model are assumed to be defined
# elsewhere; getSource is defined below.
links_list = char.getLinks(words)
for source_url in links_list:
    try:
        print 'Downloading URL: ' + source_url
        urldict = hash_url(source_url)
        source_url_short = urldict['url_short']
        source_url_hash = urldict['url_short_hash']
        # Only download URLs we haven't stored yet.
        if Url.objects.filter(source_url_short=source_url_short).count() == 0:
            try:
                htmlSource = getSource(source_url)
            except Exception:
                htmlSource = '-'
                print '\thtmlSource got an error...'
            new_u = Url(source_url=source_url,
                        source_url_short=source_url_short,
                        source_url_hash=source_url_hash,
                        html=htmlSource)
            new_u.save()
            time.sleep(3)  # be polite between requests
        else:
            print '\tAlready in database'
    except Exception:
        print '\tError with downloading URL..'
        time.sleep(3)
def getSource(theurl, unicode=1, moved=0):  # the 'unicode' flag is unused
    # If the caller says the page has moved, follow the redirect first
    # and fetch the final URL instead.
    if moved == 1:
        theurl = urllib2.urlopen(theurl).geturl()
    urlReq = urllib2.Request(theurl)
    # Rotate through a list of User-Agent strings to look less like a bot.
    urlReq.add_header('User-Agent', random.choice(agents))
    urlResponse = urllib2.urlopen(urlReq)
    htmlSource = urlResponse.read()
    # Round-trip through UTF-8; this raises UnicodeDecodeError if the
    # page isn't valid UTF-8, so bad pages fall into the except above.
    htmlSource = htmlSource.decode('utf-8').encode('utf-8')
    return htmlSource
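
The snippet never shows hash_url, but from how it's used it has to return a dict with 'url_short' and 'url_short_hash' keys. Here's a minimal sketch of what it might look like; the truncation length and the MD5 choice are my assumptions, not the original implementation:

import hashlib

def hash_url(url):
    # Hypothetical: shorten the URL so it fits a CharField, then hash
    # the shortened form for quick lookups.
    url_short = url[:255]
    return {
        'url_short': url_short,
        'url_short_hash': hashlib.md5(url_short).hexdigest(),
    }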
Basically, this code takes a list of URLs, downloads each one, and saves the HTML source to the database. That's all.
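
For context, the Url model the loop saves into would look roughly like this in Django; the field names are taken from the constructor call above, but the field types and lengths are guesses:

from django.db import models

class Url(models.Model):
    # Assumed definitions -- the actual model isn't shown in the snippet.
    source_url = models.TextField()
    source_url_short = models.CharField(max_length=255, db_index=True)
    source_url_hash = models.CharField(max_length=32)
    html = models.TextField()

Indexing source_url_short makes sense here because the loop runs a filter(...).count() against it for every URL to check for duplicates.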