Hi all,
As you surely know, multithreading lets me download files from the Internet faster. But if I send lots of requests to the same website at once, I could get blacklisted.
So could you help me implement something like: "I've got a list of URLs. I want to download all of these files, but if 10 downloads are already running, wait for a free slot."
I'd appreciate any help. Thanks.
binoua
This is the code I'm using (doesn't work).
class PDBDownloader(threading.Thread):
    """Worker thread that downloads PDB files for IDs taken from a queue.

    Each worker repeatedly pulls a PDB ID from the shared queue, fetches
    ``<prefix><ID>.pdb`` and writes the body to ``<ID>.pdb`` in the current
    working directory.  The number of simultaneous downloads is therefore
    bounded by the number of worker threads started on the same queue.

    The ``run`` loop never returns, so the thread should be marked as a
    daemon if the program is expected to exit once the queue is drained.

    Requires Python 2.6+ (``with`` statement, ``except ... as``).
    """

    # Base URL; the PDB ID and a '.pdb' suffix are appended to it.
    prefix = 'http://www.rcsb.org/pdb/files/'

    def __init__(self, queue):
        """Create a worker bound to *queue*.

        queue -- object providing ``get()`` and ``task_done()`` (for example
                 a ``Queue.Queue``) that yields PDB ID strings.
        """
        threading.Thread.__init__(self)
        self.queue = queue
        self.pdbid = None    # ID currently being processed
        self.urlstr = ''     # URL built for the current ID
        self.content = ''    # body of the most recent successful download

    def run(self):
        """Process PDB IDs from the queue forever.

        Every item taken from the queue is acknowledged with ``task_done()``
        whether or not it could be downloaded; otherwise a single failure
        would leave ``queue.join()`` blocked forever.  Failures are reported
        on stdout and the worker moves on to the next ID.
        """
        while True:
            self.pdbid = self.queue.get()
            try:
                self.urlstr = self.prefix + self.pdbid + '.pdb'
                print('downloading %s' % self.pdbid)
                self.download()
                filename = '%s.pdb' % self.pdbid
                with open(filename, 'wt') as f:
                    f.write(self.content)
            except Exception as e:
                # Thread boundary: report and keep the worker alive so the
                # pool does not shrink (and no stale content gets written).
                print('download of %s failed: %s' % (self.pdbid, e))
            finally:
                self.queue.task_done()

    def download(self):
        """Fetch ``self.urlstr`` and store the response body in ``self.content``.

        Raises OstDownloadException if the URL cannot be opened.  Errors
        raised while reading the already-opened response propagate unchanged.
        """
        try:
            f = urllib2.urlopen(self.urlstr)
        # HTTPError is a subclass of URLError, so it must be handled first.
        except urllib2.HTTPError as e:
            msg = ('HTTPError while downloading file %s at %s. '
                   'Details: %s.' % (self.pdbid, self.urlstr, str(e)))
            raise OstDownloadException(msg)
        except urllib2.URLError:
            msg = ('URLError while downloading file %s at %s. '
                   'RCSB erveur unavailable.' % (self.pdbid, self.urlstr))
            raise OstDownloadException(msg)
        except Exception as e:
            raise OstDownloadException(str(e))
        else:
            try:
                self.content = f.read()
            finally:
                # Always release the connection, even if read() fails.
                f.close()
if __name__ == '__main__':
    # Imported here so this script block is self-contained (Python 2 module).
    import Queue

    # Upper bound on simultaneous downloads, to avoid hammering the server.
    MAX_CONCURRENT_DOWNLOADS = 10

    pdblist = ['1BTA', '3EAM', '1EGJ', '2BV9', '2X6A']

    # Shared work queue; it must exist before the workers are created.
    queue = Queue.Queue()

    # Start at most MAX_CONCURRENT_DOWNLOADS workers: each worker handles one
    # download at a time, so the pool size is the concurrency limit.
    for _ in xrange(min(MAX_CONCURRENT_DOWNLOADS, len(pdblist))):
        worker = PDBDownloader(queue)
        # Daemon threads do not keep the process alive once the main thread
        # returns from queue.join(); the worker loop itself never exits.
        worker.setDaemon(True)
        worker.start()

    for pdbid in pdblist:
        queue.put(pdbid)

    # Block until every queued ID has been acknowledged with task_done().
    queue.join()