i have the following script i am using to scrap data from my uni website and insert into a GAE Db
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
import re
import datetime
__author__ = "Nash Rafeeq"
url = "http://webspace.apiit.edu.my/schedule/timetable.jsp"
viewurl = "http://localhost:8000/timekeeper/intake/checkintake/"
inserturl = "http://localhost:8000/timekeeper/intake/addintake/"
print url
mech = Browser()
try:
page = mech.open(url)
html = page.read()
except Exception, err:
print str(err)
#print html
soup = BeautifulSoup(html)
soup.prettify()
tables = soup.find('select')
for options in tables:
intake = options.string
#print intake
try:
#print viewurl+intake
page = mech.open(viewurl+intake)
html = page.read()
print html
if html=="Exist in database":
print intake, " Exist in the database skiping"
else:
page = mech.open(inserturl+intake)
html = page.read()
print html
if html=="Ok":
print intake, "added to the database"
else:
print "Error adding ", intake, " to database"
except Exception, err:
print str(err)
i am wondering what would be the best way to optimize this script so i can run it on the app engine servers. as it is, it is now scraping over 300 entries and take well over 10 mins to insert all the data on my local machine
the model that is being used to store the data is
class Intake(db.Model):
intake=db.StringProperty(multiline=False, required=True)
#@permerlink
def get_absolute_url(self):
return "/timekeeper/%s/" % self.intake
class Meta:
db_table = "Intake"
verbose_name_plural = "Intakes"
ordering = ['intake']