views:

220

answers:

0

The following code screen-scrapes fishersci.com for three pieces of information (the product name, the product URL, and the catalog number) and saves them into three table columns: rec_item, rec_url, and rec_cat, respectively.

 # lib/tasks/inventory_courses_new_item.rake
# Fills in missing Fisher Scientific data on Inventory records.
#
# For every Inventory row whose rec_item, rec_url or rec_cat is nil, searches
# fishersci.com for the row's `item` and stores, taken from the FIRST search
# hit only (the first ".ptitlelink" element on the results page):
#   rec_item - the link text with leading whitespace removed
#   rec_url  - the site root followed by the link's href
#   rec_cat  - the first letters-then-digits token found in the href
#
# Rows whose search yields no hit are left untouched (still nil), so a later
# run can retry them instead of storing a placeholder value.
task :fetch_new_courses => :environment do
  require 'cgi'
  require 'nokogiri'
  require 'open-uri'

  site_root = "http://www.fishersci.com"
  # First alphanumeric token (one or more letters, then digits) in an href.
  catalog_pattern = /.*?((?:[a-z][a-z]*[0-9]+[a-z0-9]*))/i

  # Runs the site search for +keyword+ and returns the first ".ptitlelink"
  # node of the results page, or nil when the page contains none.
  first_hit = lambda do |keyword|
    url = "#{site_root}/wps/portal/SEARCHRESULTS?keyWord=#{CGI.escape(keyword)}+&restrictedCategoryId=&N=0&Ntk=all&rpp=15&suppCatNoOnOff=false&imagesOnOff=false&highlightOnOff=false&teaserOnOff=true&store=Scientific&type=preference&showAdvanceOptions="
    # NOTE(review): Kernel#open no longer accepts URLs on Ruby 3.0+; switch to
    # URI.open if this task is moved to a newer Ruby.
    Nokogiri::HTML(open(url)).at_css(".ptitlelink")
  end

  Inventory.find_all_by_rec_item(nil).each do |inventory|
    hit = first_hit.call(inventory.item)
    next if hit.nil?

    inventory.update_attribute(:rec_item, hit.text.sub(/\s*/, ''))
  end

  Inventory.find_all_by_rec_url(nil).each do |inventory|
    hit = first_hit.call(inventory.item)
    href = hit.nil? ? nil : hit[:href]
    # Without an href there is nothing to link to; storing just the site root
    # would mark the row as "done" with a useless value.
    next if href.nil?

    inventory.update_attribute(:rec_url, "#{site_root}#{href}")
  end

  Inventory.find_all_by_rec_cat(nil).each do |inventory|
    hit = first_hit.call(inventory.item)
    next if hit.nil?

    found = catalog_pattern.match(hit[:href].to_s)
    next if found.nil?

    inventory.update_attribute(:rec_cat, found[1])
  end
end

However, each search returns a TON of data (to see proof, run the following Ruby script):

require 'rubygems'
require 'nokogiri'
require 'open-uri'

# Proof-of-concept: prints the page title, then for every ".product" entry on
# a fishersci.com search-results page prints the product title followed by the
# catalog-number-like token extracted from the product link.
url = "http://www.fishersci.com/wps/portal/SEARCHRESULTS?keyWord=lead+nitrate&restrictedCategoryId=&N=0&Ntk=all&rpp=15&suppCatNoOnOff=true&imagesOnOff=false&highlightOnOff=false&teaserOnOff=true&store=Scientific&type=preference&showAdvanceOptions="
#url = "http://www.fishersci.com/wps/portal/SEARCHRESULTS?keyWord=sodium+chloride&restrictedCategoryId=&N=0&Ntk=all&rpp=&suppCatNoOnOff=&imagesOnOff=&highlightOnOff=&teaserOnOff=&store=Scientific&type=&showAdvanceOptions="
# NOTE(review): Kernel#open no longer accepts URLs on Ruby 3.0+; use URI.open there.
doc = Nokogiri::HTML(open(url))
puts doc.at_css("title").text

# First alphanumeric token (one or more letters, then digits) in an href.
# Built once here rather than once per product.
catalog_pattern = /.*?((?:[a-z][a-z]*[0-9]+[a-z0-9]*))/i

doc.css(".product").each do |product|
  title_link = product.at_css(".ptitlelink")
  # Skip entries without a title link instead of crashing on nil.
  next if title_link.nil?

  puts title_link.text

  found = catalog_pattern.match(title_link[:href].to_s)
  puts found[1] if found
end

I'm looking for coding ideas to loop through the entire collection and save all of the data, rather than just the first item as my first example above does. Thanks!