I started this script as a Calibre recipe. When I found out that Calibre cannot do what I want, I installed Spyder, and I am now in the process of turning it into a regular Python script.

At this stage I am trying to get a list of URLs from an index page. From each of those URLs I want to extract further URLs (a sort of index of indexes). To reach the second-level index I need to get a cookie from the site.
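
For reference, this is the cookie step as I understand it, as a minimal stand-alone sketch (the explicit cookielib.CookieJar is my addition; as far as I know mechanize keeps cookies per Browser even without it, and the report URL is the one from my output below):

import cookielib
import mechanize

br = mechanize.Browser()
br.set_cookiejar(cookielib.CookieJar())  # mechanize stores the site's cookie here

br.open('http://maya.tase.co.il/')       # first request: the site sets its cookie
# second request: the same browser sends the cookie back automatically
page = br.open('http://maya.tase.co.il/bursa/report.asp?report_cd=570152')
print page.geturl()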

mechanize does not seem to open the page, so BeautifulSoup cannot parse my second-level index. I am not sure that this is the actual problem, but it is my best guess. What did I do wrong?

I marked the problem line in the code; if you look at the code you can't miss it. I also added the error I get when I run it.
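
One check I plan to add inside make_links, right after the marked line, to tell a failed fetch apart from a failed parse (a sketch; dumping only the first 500 characters is arbitrary):

html = page1.read()              # mechanize responses are file-like, so read the body once
print page1.geturl(), len(html)  # a real URL and a non-trivial length mean the fetch worked
print html[:500]                 # eyeball the start of the markup before parsing
soup1 = BeautifulSoup(html)      # BeautifulSoup accepts a string as well as a file object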

My code so far:

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3.x, for processing HTML
import re
import mechanize

INDEX = 'http://maya.tase.co.il/'

def parse_index():
    feeds = []
    for title, url in [
            (u"Feed", u"http://maya.tase.co.il/bursa/index.asp?view=search&company_group=3000&arg_comp=&srh_comp_lb=1007&srh_from=2010-01-01&srh_until=2010-09-28&srh_anaf=-1&srh_event=9999&is_urgent=0&srh_company_press="),
            #(u"Yesterday's announcements", u"http://maya.tase.co.il/bursa/index.asp?view=yesterday"),
            ]:
        articles = make_links(url)
        if articles:
            feeds.append((title, articles))
    return feeds

def make_links(url):
    title = 'Temp'
    current_articles = []
    br = mechanize.Browser()
    page = br.open(url)
    soup = BeautifulSoup(page)
    print 'url is', url
    print 'The soup is: ', soup
    for item in soup.findAll('a', attrs={'class': 'A3'}):
        print 'item is: ', item
        # skip javascript: pseudo-links; only real hrefs lead to report pages
        if not re.search('javascript', item['href']):
            br1 = mechanize.Browser()
            temp2 = INDEX + 'bursa/' + item['href']
            print 'url1 is', temp2
            # visit the site root first so the server sets its cookie,
            # then request the report page with the same browser
            print br1.open(INDEX)
            page1 = br1.open(temp2)               # the problem is here
            print '1111'
            print 'page1 is', page1
            soup1 = BeautifulSoup(page1)
            print '2222'
            for item1 in soup1.findAll('iframe'):
                print 'item1 is:', item1
                txt = item1['src']
                # non-greedy filler around 'mayafiles', any one character,
                # then 'htm' twice (pattern generated with txt2re)
                rg = re.compile(r'.*?(mayafiles)(.).*?htm.*?(htm)',
                                re.IGNORECASE | re.DOTALL)
                m = rg.search(txt)
                if m:
                    print 'FOUND GOOD URL'
                    print "(" + m.group(1) + ")(" + m.group(2) + ")(" + m.group(3) + ")\n"
                    url = item1['src']
                else:
                    # not an htm report, so route it through the pdf2html service
                    url = ('http://www.pdfdownload.org/pdf2html/pdf2html.php?url='
                           + item1['src'] + '&images=yes')

                print 'url is: ', url
                # tag_to_string() is a Calibre helper and does not exist outside
                # a recipe class, so join the link's text nodes instead
                title = ''.join(item.findAll(text=True))
                print 'title is: ', title
                current_articles.append({'title': title, 'url': url,
                                         'description': '', 'date': ''})

    return current_articles

parse_index()

This is the error message I get:

url1 is http://maya.tase.co.il/bursa/report.asp?report_cd=570152
<response_seek_wrapper at 0x4bd0a08 whose wrapped object = <closeable_response at 0x4bd0e68 whose fp = <socket._fileobject object at 0x04AD36B0>>>
1111
page1 is <response_seek_wrapper at 0x4bd0828 whose wrapped object = <closeable_response at 0x4bd3f80 whose fp = <socket._fileobject object at 0x04AD3870>>>
Traceback (most recent call last):
  File "C:\Users\Berkowitz\.spyder\.temp.py", line 99, in <module>
    parse_index()
  File "C:\Users\Berkowitz\.spyder\.temp.py", line 31, in parse_index
    articles = make_links(url)
  File "C:\Users\Berkowitz\.spyder\.temp.py", line 64, in make_links
    soup1 = BeautifulSoup(page1)
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1499, in __init__
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1230, in __init__
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1263, in _feed
  File "C:\Python26\lib\HTMLParser.py", line 108, in feed
    self.goahead(0)
  File "C:\Python26\lib\HTMLParser.py", line 148, in goahead
    k = self.parse_starttag(i)
  File "C:\Python26\lib\HTMLParser.py", line 226, in parse_starttag
    endpos = self.check_for_whole_start_tag(i)
  File "C:\Python26\lib\HTMLParser.py", line 301, in check_for_whole_start_tag
    self.error("malformed start tag")
  File "C:\Python26\lib\HTMLParser.py", line 115, in error
    raise HTMLParseError(message, self.getpos())
HTMLParser.HTMLParseError: malformed start tag, at line 271, column 473
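
Reading the traceback again, mechanize does seem to return the page (page1 prints as a normal response object); the failure happens inside BeautifulSoup, whose underlying HTMLParser rejects a malformed start tag at line 271 of the downloaded page. As far as I can tell, BeautifulSoup 3.1 parses with Python's strict HTMLParser, so broken markup on the site itself would explain this. BeautifulSoup 3 has a markupMassage argument for pre-cleaning the markup before parsing; a sketch of that mechanism (the appended pattern is only a placeholder, since I don't know which tag on the page is actually malformed):

import copy
import re
from BeautifulSoup import BeautifulSoup

html = page1.read()

# start from BeautifulSoup's built-in clean-up rules and append one of our own;
# this example pattern repairs comments written as '<!-' instead of '<!--'
massage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
massage.append((re.compile('<!-([^-])'), lambda m: '<!--' + m.group(1)))

soup1 = BeautifulSoup(html, markupMassage=massage)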