The Code
Save this code to a text file named adwords.pl:
#!/usr/bin/perl
# usage: perl adwords.pl results.html
#
use strict;
use HTML::TokeParser;
use warnings;

# Main script: scan each HTML file named on the command line for ad links,
# collect one record per ad, and print the records to STDOUT as CSV.
# Diagnostics go to STDERR so they never pollute the CSV output.

# At least one input file is required.  $! is not part of the message
# because no system call has failed at this point.
die "I need at least one file\n"
    unless @ARGV;

# One hashref per ad, with keys: adwords, href, desc, url.
my @Ads;

FILE:
for my $file (@ARGV) {
    # Skip if the file doesn't exist; you could add more file testing here.
    unless (-e $file) {
        warn "What??: $file -- $! \n-- skipping --\n";
        next FILE;
    }

    # The constructor returns undef (with $! set) when the file cannot be
    # opened, e.g. because it is unreadable; skip it instead of dying on
    # the method call below.
    my $p = HTML::TokeParser->new($file);
    unless ($p) {
        warn "What??: $file -- $! \n-- skipping --\n";
        next FILE;
    }

    TOKEN:
    while (my $token = $p->get_token) {
        # Only start tags for <a> elements are of interest.
        next TOKEN unless $token->[0] eq 'S' and $token->[1] eq 'a';

        # Ad links are the anchors whose id is "aw" followed by digits.
        # \d+ (not a single \d) so an id such as aw10 is not dropped.
        my $attr = $token->[2];
        my $id   = $attr->{id};
        next TOKEN unless defined $id and $id =~ /^aw\d+$/;

        my $link = defined $attr->{href} ? $attr->{href} : '';
        my $href = '';
        if ($link =~ /pagead/) {
            # The target is carried in the adurl parameter; it is kept as
            # found, without unescaping.
            my ($url) = $link =~ /adurl=([^\&]+)/;
            $href = $url if defined $url;
        }
        elsif ($link =~ m{^/url\?}) {
            # The target is carried in the q parameter, which may be the
            # first parameter (after "?") or a later one (after "&").
            my ($url) = $link =~ /[?&]q=([^&]+)/;
            if (defined $url) {
                # Decode every %XX escape in a single pass, so an escaped
                # percent sign (%25) is never decoded twice.
                $url =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;
                $href = $url;
            }
        }

        # Link text up to </a>, then the text that follows up to </font>.
        my $adwords = $p->get_trimmed_text('/a');
        my $desc    = $p->get_trimmed_text('/font');

        # The last whitespace-delimited word of the description is taken
        # as the displayed URL.
        my ($shown) = $desc =~ /(\S+)$/;

        push @Ads, {
            adwords => $adwords,
            href    => $href,
            desc    => $desc,
            url     => defined $shown ? $shown : '',
        };
    }
}

# The header names exactly the four fields every row contains.  The former
# "Interest" column is gone: nothing in this script collects such a value,
# so the header had one more column than the data rows.
print quoted( qw( AdWords HREF Description URL ) );
for my $ad (@Ads) {
    print quoted( @$ad{qw( adwords href desc url )} );
}
# Format a list of values as one CSV record.
#
# Each argument becomes a double-quoted field; the fields are joined with
# commas and the record ends with a newline.  A double quote embedded in a
# value is doubled (the usual CSV escape) so it cannot terminate the field
# early.  Undefined values become empty fields.  The arguments are copied
# before escaping, so the caller's variables are never modified.
#
# Returns the record as a single string; with no arguments that is just "\n".
sub quoted {
    my @fields = map { defined $_ ? $_ : '' } @_;
    s/"/""/g for @fields;
    return join( ",", map { "\"$_\"" } @fields ) . "\n";
}