Using privoxy as http-proxy in front of tor works for me - here's a crawler-template:
import urllib2
import httplib
from BeautifulSoup import BeautifulSoup
from time import sleep
class Scraper(object):
def __init__(self, options, args):
if options.proxy is None:
options.proxy = "http://localhost:8118/"
self._open = self._get_opener(options.proxy)
def _get_opener(self, proxy):
proxy_handler = urllib2.ProxyHandler({'http': proxy})
opener = urllib2.build_opener(proxy_handler)
return opener.open
def get_soup(self, url):
soup = None
while soup is None:
try:
request = urllib2.Request(url)
request.add_header('User-Agent', 'foo bar useragent')
soup = BeautifulSoup(self._open(request))
except (httplib.IncompleteRead, httplib.BadStatusLine,
urllib2.HTTPError, ValueError, urllib2.URLError), err:
sleep(1)
return soup
class PageType(Scraper):
_URL_TEMPL = "http://foobar.com/baz/%s"
def items_from_page(self, url):
nextpage = None
soup = self.get_soup(url)
items = []
for item in soup.findAll("foo"):
items.append(item["bar"])
nexpage = item["href"]
return nextpage, items
def get_items(self):
nextpage, items = self._categories_from_page(self._START_URL % "start.html")
while nextpage is not None:
nextpage, newitems = self.items_from_page(self._URL_TEMPL % nextpage)
items.extend(newitems)
return items()
pt = PageType()
print pt.get_items()