tags:

views:

61

answers:

2

I'm making progress but I've run into a new problem.

This is the new code:

#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use HTML::TreeBuilder;

my $url = 'http://oreilly.com/store/complete.html';
my $page = get( $url ) or die $!;
my $p = HTML::TreeBuilder->new_from_content( $page );
my($book);
my($edition);

my @links = $p->look_down(
    _tag => 'a',
    href => qr{^ /Qhttp://www.oreilly.com/catalog/\E \w+ $}x
);

my @rows = map { $_->parent->parent } @links;

my @books;
for my $row (@rows) {
    my %book;
    my @cells = $row->look_down( _tag => 'td' );
    $book{title}    =$cells[0]->as_trimmed-text;
    $book{price}    =$cells[2]->as_trimmed-text;
    $book{price} =~ s/^\$//;

    $book{url}      = get_url( $cells[0] );
    $book{ebook}    = get_url( $cells[3] );
    $book{safari}   = get_url( $cells[4] );
    $book{examples} = get_url( $cells[5] );
    push @books, \%book;
}

sub get_url {
    my $node = shift;
    my @hrefs = $node->look_down( _tag => 'a');
    return unless @hrefs;
    my $url = $hrefs[0]->atr('href');
    $url =~ s/\s+$//;
    return $url;
}

$p = $p->delete; #we don't need this anymore.

{
    my $count = 1;
    my @perlbooks = sort { $a->{price} <=> $b->{price} }
                    grep { $_->{title} =~/perl/i } @books;
    print $count++, "\t", $_->{price}, "\t", $_->{title} for @perlbooks;
}

{
    my @perlbooks = grep { $_->{title} =~ /perl/i } @books;
    my @javabooks = grep { $_->{title} =~ /java/i } @books;
    my $diff =  @javabooks - @perlbooks;
    print "There are ".@perlbooks." Perl books and ".@javabooks. " Java books. $diff more Java than Perl.";
}

for my $book ( $books[34] ) {
    my $url = $book->{url};
    my $page = get( $url );
    my $tree = HTML::TreeBuilder->new_from_content( $page );
    my ($pubinfo) = $tree->look_down(
                                    _tag => 'span',
                                    class => 'secondary2'
    );
    my $html = $pubinfo->as_HTML; print $html;
    my ($pages) = $html =~ /(\d+) pages/,
    my ($edition) = $html =~ /(\d)(?:st|nd|rd|th) Edition/;
    my ($date) = $html =~ /(\w+ (19|20)\d\d)/;

    print "\n$pages $edition $date\n";

    my ($img_node) = $tree->look_down(
                                    _tag => 'img',
                                    src  => qr{^/catalog/covers/},
    );
    my $img_url = 'http://www.oreilly.com'.$img_node-&gt;attr('src');
    my $cover = get( $img_url );
    # now save $cover to disk
}

Now I'm getting these errors,

Bareword "text" not allowed while "strict subs" in use at ./SpiderTutorial_19_06.pl line 23. Bareword "text" not allowed while "strict subs" in use at ./SpiderTutorial_19_06.pl line 24. Execution of ./SpiderTutorial_19_06.pl aborted due to compilation errors.

Any help would be greatly appreciated.

+4  A: 

I don't know the original program but most likely as_trimmed-text should be as_trimmed_text.

musiKk
or perhaps as_trimmed->text ?
ysth
@ysth: I thought so too but found `as_trimmed_text` more likely. The documentation for HTML::Element confirms it.
musiKk
+3  A: 

The problem is the method name as_trimmed-text. Hyphens aren't allowed in names in perl. You probably meant as_trimmed_text. Now it parsed as $cells[0]->as_trimmed() - text().

Leon Timmermans