I've been knocking up a little pet project over the last two days: a web crawler written in Perl.
I have no real experience with Perl (only what I've picked up in those two days). My script is as follows:
ACTC.pm:
#!/usr/bin/perl
use strict;
use warnings;
use URI;
use URI::http;
use File::Basename;
use DBI;
use HTML::Parser;
use LWP::Simple;
require LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->max_redirect(0);
package Crawler;
sub new {
    my $class = shift;
    my $self = {
        _url      => shift,
        _max_link => 0,
        _local    => 1,
    };
    bless $self, $class;
    return $self;
}
sub trim {
    my ( $self, $string ) = @_;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}
sub process_image {
    my ( $self, $process_image ) = @_;
    $self->{_process_image} = $process_image;
}
sub local {
    my ( $self, $local ) = @_;
    $self->{_local} = $local;
}
sub max_link {
    my ( $self, $max_link ) = @_;
    $self->{_max_link} = $max_link;
}
sub x_more {
    my ( $self, $x_more ) = @_;
    $self->{_x_more} = $x_more;
}
sub resolve_href {
    my ( $self, $base, $href ) = @_;
    my $u = URI->new_abs( $href, $base );
    return $u->canonical;
}
sub write {
    # Take a list, not a single scalar, so the whole array gets written out.
    my ( $self, $ref, @data ) = @_;
    open my $fh, '>', 'c:/perlscripts/' . $ref . '_' . $self->{_process_image} . '.txt'
        or die "Cannot open $ref file for writing: $!";
    foreach ( @data ) {
        print $fh $self->trim($_) . "\n";
    }
    close( $fh );
}
sub scrape {
    my ( $self, $DBhost, $DBuser, $DBpass, $DBname ) = @_;
    my ( @m_error_array, @m_href_array, @href_array, $dbh, $query, $result, $array );
    if ( defined( $self->{_process_image} ) && -e 'c:/perlscripts/href_w_' . $self->{_process_image} . '.txt' ) {
        # Resume from the state files written out by a previous run.
        open ERROR_W,  '<c:/perlscripts/error_w_'  . $self->{_process_image} . '.txt';
        open M_HREF_W, '<c:/perlscripts/m_href_w_' . $self->{_process_image} . '.txt';
        open HREF_W,   '<c:/perlscripts/href_w_'   . $self->{_process_image} . '.txt';
        @m_error_array = <ERROR_W>;
        @m_href_array  = <M_HREF_W>;
        @href_array    = <HREF_W>;
        close( ERROR_W );
        close( M_HREF_W );
        close( HREF_W );
    } else {
        @href_array = ( $self->{_url} );
    }
    my $z = 0;
    while ( @href_array ) {
        if ( defined( $self->{_x_more} ) && $z == $self->{_x_more} ) {
            print "died";
            last;
        }
        my $href = shift( @href_array );
        if ( defined( $self->{_process_image} ) && scalar @href_array != 0 ) {
            # Checkpoint the queues to disk so the crawl can be resumed later.
            $self->write( 'm_href_w', @m_href_array );
            $self->write( 'href_w',   @href_array );
            $self->write( 'error_w',  @m_error_array );
        }
        $self->{_link_count} = scalar @m_href_array;
        my $info = URI->new($href);
        if ( ! defined( $info->host ) ) {
            push( @m_error_array, $href );
        } else {
            my $host = $info->host;
            $host =~ s/^www\.//;
            $self->{_current_page} = $href;
            my $redirect_limit = 10;
            my $y = 0;
            my ( $response, $responseCode );
            # max_redirect is set to 0 above, so follow 301/302 responses by hand.
            while ( $y <= $redirect_limit ) {
                $response     = $ua->get($href);
                $responseCode = $response->code;
                if ( $responseCode == 200 || $responseCode == 301 || $responseCode == 302 ) {
                    if ( $responseCode == 301 || $responseCode == 302 ) {
                        $href = $self->resolve_href( $href, $response->header('Location') );
                    } else {
                        last;
                    }
                } else {
                    last;
                }
                $y++;
            }
            if ( $y != $redirect_limit && $responseCode == 200 ) {
                print $href . "\n";
                # Keep the list of crawled pages on the object as an array ref so it persists.
                push @{ $self->{_url_list} ||= [] }, $href;
                my $DSN = "dbi:mysql:$DBname:$DBhost:3306";
                $dbh = DBI->connect( $DSN, $DBuser, $DBpass ) or die $DBI::errstr;
                # Use a placeholder rather than interpolating $href straight into the SQL.
                $result = $dbh->prepare("INSERT INTO `$host` (URL) VALUES (?)");
                if ( ! $result->execute($href) ) {
                    # First time this host is seen: create its table, then store the URL.
                    $result = $dbh->prepare("CREATE TABLE `$host` ( `ID` INT( 255 ) NOT NULL AUTO_INCREMENT , `URL` VARCHAR( 255 ) NOT NULL , PRIMARY KEY ( `ID` )) ENGINE = MYISAM ;");
                    $result->execute();
                    $dbh->do( "INSERT INTO `$host` (URL) VALUES (?)", undef, $href );
                    print "Host added: " . $host . "\n";
                }
                my $content = $response->content;
                die "get failed: " . $href if ( ! defined $content );
                my @pageLinksArray = ( $content =~ m/href=["']([^"']*)["']/g );
                foreach ( @pageLinksArray ) {
                    my $link = $self->trim($_);
                    if ( $self->{_max_link} != 0 && scalar @m_href_array > $self->{_max_link} ) {
                        last;
                    }
                    my $new_href = $self->resolve_href( $href, $link );
                    if ( $new_href =~ m{^http://} ) {
                        if ( substr( $new_href, -1 ) ne "#" ) {
                            my $base = $self->{_url};
                            my %values_index;
                            @values_index{@m_href_array} = ();
                            # "!~" is needed here: "! $new_href =~ m/$base/" negates the string before matching.
                            if ( $new_href !~ m/\Q$base\E/ ) {
                                if ( $self->{_local} eq "true" && ! exists $values_index{$new_href} ) {
                                    push( @m_href_array, $new_href );
                                    push( @href_array,   $new_href );
                                }
                            } elsif ( $self->{_local} eq "true" && ! exists $values_index{$new_href} ) {
                                push( @m_href_array, $new_href );
                                push( @href_array,   $new_href );
                            }
                        }
                    }
                }
            } else {
                push( @m_error_array, $href );
            }
        }
        $z++;    # count processed pages so the x_more limit can actually be reached
    }
}
1;
new_spider.pl:
#!/usr/bin/perl
use strict;
use warnings;
use ACTC;
my ($object, $url, $uri);
print "Starting Poing: (url): ";
chomp($url = <>);
$object = Crawler->new( $url );
$object->process_image('process_image_name');
$object->local('true');
$object->max_link(0);
$object->x_more(9999999);
$object->scrape( 'localhost', 'root', '', 'crawl' );
#print $object->{_url} . "\n";
#print $object->{_process_image};
The script isn't complete yet and some of the functions aren't working correctly, but after running it I had indexed about 1,500 pages in an hour, which seems quite slow.
It started off whipping through results but has become quite sluggish, spitting out roughly one URL per second.
Can anyone give me any tips on how to increase performance?
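My own suspicion is that the slowdown comes from two places: %values_index is rebuilt from the whole of @m_href_array for every single link on every page (so the duplicate check gets more expensive as the crawl grows), and DBI->connect is called once per page. Below is a minimal sketch of what I'm thinking of switching to, assuming those really are the culprits: a %seen hash that lives for the whole run and a single database handle prepared once outside the loop. The crawl_urls table, the connection details and the loop body are just placeholders for illustration, not my real code.

#!/usr/bin/perl
use strict;
use warnings;
use DBI;

# One connection and one prepared statement for the whole run instead of one per page.
my $dbh    = DBI->connect( 'dbi:mysql:crawl:localhost:3306', 'root', '', { RaiseError => 1 } );
my $insert = $dbh->prepare('INSERT INTO `crawl_urls` (URL) VALUES (?)');

my %seen;                                  # constant-time duplicate check
my @queue = ('http://www.example.com/');   # placeholder start URL

while ( my $href = shift @queue ) {
    next if $seen{$href}++;                # skip anything already crawled or queued
    $insert->execute($href);
    # ... fetch the page, extract links, push unseen ones onto @queue ...
}

In my actual scrape loop that would mean moving the connect/prepare above the while loop and replacing the @values_index{@m_href_array} slice with a single %seen lookup. Does that look like the right direction, or is the bottleneck likely somewhere else?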