I started this script as a Calibre recipe. When I found out that Calibre can't do what I want, I installed Spyder and I am now turning it into plain Python.
At this stage I am trying to get a list of URLs from an index page. From each of those URLs I want to collect further URLs (a sort of index of indexes). To reach the second-level index I need a cookie from the site.
I originally thought mechanize was failing to open the second-level page, so that BeautifulSoup had nothing to parse, but judging from the output below mechanize does return a response; it is the parsing step that blows up. I am not sure what the real problem is. What did I do wrong?
I marked the line where it fails in the code; if you look at the code you can't miss it. I also added the error I get when I run it.
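For the cookie part, my understanding from the mechanize docs is that one Browser with an explicit cookie jar keeps the site's session cookie across requests. This is only a minimal sketch of what I mean (fetch_report and its argument are illustration names, not from my script):

import cookielib, mechanize

INDEX = 'http://maya.tase.co.il/'

# One shared browser: the cookie the site sets when we open the index
# is then sent automatically on every follow-up request.
cj = cookielib.CookieJar()
br = mechanize.Browser()
br.set_cookiejar(cj)
br.open(INDEX)

def fetch_report(href):
    # href is a relative link scraped from the index page
    return br.open(INDEX + 'bursa/' + href)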
My code so far is:
#import urllib2
from BeautifulSoup import BeautifulSoup  # For processing HTML
#from BeautifulSoup import BeautifulStoneSoup  # For processing XML
#import BeautifulSoup  # To get everything
import re, mechanize

INDEX = 'http://maya.tase.co.il/'

def tag_to_string(tag):
    # Standalone replacement for Calibre's self.tag_to_string:
    # join all the text nodes inside the tag.
    return ''.join(tag.findAll(text=True)).strip()

def parse_index():
    feeds = []
    for title, url in [
        (u"Feed", u"http://maya.tase.co.il/bursa/index.asp?view=search&company_group=3000&arg_comp=&srh_comp_lb=1007&srh_from=2010-01-01&srh_until=2010-09-28&srh_anaf=-1&srh_event=9999&is_urgent=0&srh_company_press="),
        #(u"Yesterday's announcements", u"http://maya.tase.co.il/bursa/index.asp?view=yesterday"),
    ]:
        articles = make_links(url)
        if articles:
            feeds.append((title, articles))
    return feeds

def make_links(url):
    current_articles = []
    br = mechanize.Browser()
    page = br.open(url)
    # page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    print 'url is', url
    print 'The soup is: ', soup
    for item in soup.findAll('a', attrs={'class': 'A3'}):
        print 'item is: ', item
        if not re.search('javascript', item['href']):
            temp2 = INDEX + 'bursa/' + item['href']
            print 'url1 is', temp2
            br1 = mechanize.Browser()
            print br1.open(INDEX)      # open the index first so the site sets its cookie
            # page1 = urllib2.urlopen(temp2)
            page1 = br1.open(temp2)    # the problem is here
            print '1111'
            print 'page1 is', page1
            soup1 = BeautifulSoup(page1)  # <-- the traceback below is raised on this line
            for item1 in soup1.findAll('iframe'):
                print 'item1 is:', item1
                txt = item1['src']
                print 'FOUND GOOD URL'
                # Detect direct .htm documents hosted under mayafiles
                re1 = '.*?'          # Non-greedy match on filler
                re2 = '(mayafiles)'  # Variable Name 1
                re3 = '(.)'          # Any Single Character 1
                re4 = '.*?'          # Non-greedy match on filler
                re5 = 'htm'          # Uninteresting: word
                re6 = '.*?'          # Non-greedy match on filler
                re7 = '(htm)'        # Word 1
                rg = re.compile(re1 + re2 + re3 + re4 + re5 + re6 + re7, re.IGNORECASE | re.DOTALL)
                m = rg.search(txt)
                if m:
                    # a direct .htm link -- use it as is
                    print '(%s)(%s)(%s)' % (m.group(1), m.group(2), m.group(3))
                    url = item1['src']
                else:
                    # anything else goes through the pdf2html converter
                    url = 'http://www.pdfdownload.org/pdf2html/pdf2html.php?url=' + item1['src'] + '&images=yes'
                print 'url is: ', url
                title = tag_to_string(item)
                print 'title is: ', title
                current_articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
    return current_articles

parse_index()
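To narrow it down, the failure reproduces with a single report page taken from my output below (a minimal sketch; the report_cd value is copied from the log, nothing else is assumed):

from BeautifulSoup import BeautifulSoup
import mechanize

br1 = mechanize.Browser()
br1.open('http://maya.tase.co.il/')  # pick up the site cookie first
page1 = br1.open('http://maya.tase.co.il/bursa/report.asp?report_cd=570152')
html = page1.read()
print 'fetched', len(html), 'bytes'  # the fetch itself succeeds
soup1 = BeautifulSoup(html)          # this line raises the HTMLParseError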
This is the error message:
url1 is http://maya.tase.co.il/bursa/report.asp?report_cd=570152
<response_seek_wrapper at 0x4bd0a08 whose wrapped object = <closeable_response at 0x4bd0e68 whose fp = <socket._fileobject object at 0x04AD36B0>>>
1111
page1 is <response_seek_wrapper at 0x4bd0828 whose wrapped object = <closeable_response at 0x4bd3f80 whose fp = <socket._fileobject object at 0x04AD3870>>>
Traceback (most recent call last):
  File "C:\Users\Berkowitz\.spyder\.temp.py", line 99, in <module>
    parse_index()
  File "C:\Users\Berkowitz\.spyder\.temp.py", line 31, in parse_index
    articles = make_links(url)
  File "C:\Users\Berkowitz\.spyder\.temp.py", line 64, in make_links
    soup1 = BeautifulSoup(page1)
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1499, in __init__
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1230, in __init__
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1263, in _feed
  File "C:\Python26\lib\HTMLParser.py", line 108, in feed
    self.goahead(0)
  File "C:\Python26\lib\HTMLParser.py", line 148, in goahead
    k = self.parse_starttag(i)
  File "C:\Python26\lib\HTMLParser.py", line 226, in parse_starttag
    endpos = self.check_for_whole_start_tag(i)
  File "C:\Python26\lib\HTMLParser.py", line 301, in check_for_whole_start_tag
    self.error("malformed start tag")
  File "C:\Python26\lib\HTMLParser.py", line 115, in error
    raise HTMLParseError(message, self.getpos())
HTMLParser.HTMLParseError: malformed start tag, at line 271, column 473
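One workaround I am considering is wrapping the parse so one bad page does not kill the whole run (a sketch; page1 here stands for the response from br1.open(temp2) in my script):

from HTMLParser import HTMLParseError
from BeautifulSoup import BeautifulSoup

try:
    soup1 = BeautifulSoup(page1.read())
except HTMLParseError, e:
    print 'skipping report, bad markup:', e
    soup1 = None

I have also read that BeautifulSoup 3.0.x, which is built on the more forgiving SGMLParser, handles broken markup better than the HTMLParser backend used in 3.1 (my traceback shows HTMLParser), so downgrading may be another option. Is that the right direction, or is something else wrong in my code?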