The following code screen-scrapes fishersci.com for three pieces of information (the product name, the product URL, and the catalog number) and saves them in three table columns: rec_item, rec_url, and rec_cat, respectively.
# lib/tasks/inventory_courses_new_item.rake
# Back-fills the Fisher Scientific recommendation columns on Inventory records.
#
# For every record whose column is still nil, the record's `item` is searched
# on fishersci.com and the first ".ptitlelink" element of the results page is
# used:
#   rec_item - the link text with leading whitespace removed
#   rec_url  - the link href prefixed with the site host
#   rec_cat  - the first letters-then-digits token found in the href
#              (presumably the catalog number; confirm against real hrefs)
#
# Each column is handled in its own pass, so a record missing all three is
# fetched three times. When the page has no ".ptitlelink" element, nil is
# written, which leaves the record eligible for the next run.
task :fetch_new_courses => :environment do
  require 'nokogiri'
  require 'open-uri'

  site = "http://www.fishersci.com"
  # Lazy filler, then the first token made of letter(s), digit(s) and any
  # trailing alphanumerics; capture group 1 is that token.
  catalog_pattern = /.*?((?:[a-z][a-z]*[0-9]+[a-z0-9]*))/i

  # Returns the first product-title link node on the search results page for
  # the record's item, or nil when the page has none.
  first_hit = lambda do |inventory|
    url = "http://www.fishersci.com/wps/portal/SEARCHRESULTS?keyWord=#{CGI.escape(inventory.item)}+&restrictedCategoryId=&N=0&Ntk=all&rpp=15&suppCatNoOnOff=false&imagesOnOff=false&highlightOnOff=false&teaserOnOff=true&store=Scientific&type=preference&showAdvanceOptions="
    Nokogiri::HTML(open(url)).at_css(".ptitlelink")
  end

  # Returns the href of the first hit, or nil when there is no hit.
  first_href = lambda do |inventory|
    hit = first_hit.call(inventory)
    hit && hit[:href]
  end

  Inventory.find_all_by_rec_item(nil).each do |inventory|
    hit = first_hit.call(inventory)
    name = hit && hit.text.sub(/\s*/, '')
    inventory.update_attribute(:rec_item, name)
  end

  Inventory.find_all_by_rec_url(nil).each do |inventory|
    href = first_href.call(inventory)
    # Store nil rather than the bare host when nothing was found; a non-nil
    # value would drop the record out of the nil lookup above and it would
    # never be retried.
    inventory.update_attribute(:rec_url, href && "#{site}#{href}")
  end

  Inventory.find_all_by_rec_cat(nil).each do |inventory|
    href = first_href.call(inventory)
    inventory.update_attribute(:rec_cat, href && href[catalog_pattern, 1])
  end
end
However, each search returns a TON of data, not just one product. (To see proof, run the following Ruby script.)
require 'rubygems'
require 'nokogiri'
require 'open-uri'
# Proof-of-concept: fetch one search results page and print, for every
# ".product" element on it, the title link text followed by the first
# letters-then-digits token found in the link's href.
url = "http://www.fishersci.com/wps/portal/SEARCHRESULTS?keyWord=lead+nitrate&restrictedCategoryId=&N=0&Ntk=all&rpp=15&suppCatNoOnOff=true&imagesOnOff=false&highlightOnOff=false&teaserOnOff=true&store=Scientific&type=preference&showAdvanceOptions="
# Alternative query, kept for reference:
#url = "http://www.fishersci.com/wps/portal/SEARCHRESULTS?keyWord=sodium+chloride&restrictedCategoryId=&N=0&Ntk=all&rpp=&suppCatNoOnOff=&imagesOnOff=&highlightOnOff=&teaserOnOff=&store=Scientific&type=&showAdvanceOptions="
doc = Nokogiri::HTML(open(url))
puts doc.at_css("title").text

# Lazy filler followed by the token to capture (group 1), case-insensitive.
token_pattern = Regexp.new('.*?' + '((?:[a-z][a-z]*[0-9]+[a-z0-9]*))', Regexp::IGNORECASE)

doc.css(".product").each do |entry|
  anchor = entry.at_css(".ptitlelink")
  puts anchor.text
  # Disabled experiments, kept for reference:
  #price = item.at_css(".PriceCompare .BodyS, .PriceXLBold").text[/\$[0-9\.]+/]
  #puts "#{title} - #{price}"
  #puts "#{titleshort}"
  #puts "http://www.fishersci.com#{link}"
  found = token_pattern.match(anchor[:href])
  puts found[1] if found
end
I'm looking for coding ideas for looping through the entire collection and saving all of the data, rather than just the first item, as my first example above does. Thanks!