#!/usr/local/bin/perl -w # XLinks.pl # Complete documentation at http://www.sfu.ca/~ajdelore/XLinks/ # Anthony DeLorenzo use strict; my $VERSION = "1.2"; #### USER CONFIGURATION OPTIONS #### # Don't forget to change the path to perl on the first line. # The base directory of the site, without trailing slash my $base = "/home/ajdelore/pub_html"; # The name of the file to start at, in the base directory. my $startfile = "index.html"; # The name of your default index file for directories my $indexfile = "index.html"; # Valid file extensions for XHTML documents -- others will not be # crawled or parsed. my @extensions = ('html'); # Should the program print output to screen(0) or file (1); # Include a name and path for a logfile, if set to 1. # Do not comment out if not using. my $output_to_file = 0; my $logfile = ""; # Should program output be: # 0: minimal (print failed links only) # 1: verbose (print all links checked) my $verbose = 1; #### END CONFIGURATION #### use URI::file; use XML::XPath; use XML::XPath::XMLParser; use LWP::UserAgent; if ($output_to_file) { open (LOGFILE,">$logfile") or die "Couldn't open logfile"; select LOGFILE; } chdir ($base) or die "Couldn't access directory $base"; $startfile = URI::file->new_abs($startfile,$base); my (@files, @files_checked, %uris_ok, %uris_failed); my $uris_checked = 0; push @files, ($startfile); push @files_checked, ($startfile); my $ua = LWP::UserAgent->new; PARSE: while ( scalar @files > 0 ) { my $file_uri = URI->new(pop @files); my @path_segments = $file_uri->path_segments; my $filename = pop @path_segments; my $ext = (split /\./,$filename)[1]; next PARSE unless ( grep { $_ eq $ext } @extensions ); chdir (join('/', @path_segments)); my $base_uri = URI::file->cwd; print "Trying to parse $file_uri\n"; my $parser = XML::XPath->new(filename => $filename); my $nofollow = 0; my $path = ("/html/head/meta[\@name='XLinks']"); foreach my $meta ( ($parser->find($path))->get_nodelist) { my $content = lc $meta->getAttribute('content'); if ($content eq 'nocheck') { print " Found meta 'nocheck' directive. Ignoring file.\n\n"; next PARSE; } elsif ($content eq 'nofollow') { print " Found meta 'nofollow' directive.\n"; $nofollow = 1; } } foreach my $path ('//a','//link','//img') { CHECK: foreach my $node ( ($parser->find($path))->get_nodelist) { next CHECK if $node->getAttribute('check') eq 'no'; unless ( $path eq '//img' ) { my $href = $node->getAttribute('href'); next if $href =~ /\#[\w\d]+$/; my $uri = URI->new_abs($href, $base_uri); next if $uri->scheme eq 'mailto'; if ( $uri->scheme eq 'file' ) { if ( $uri =~ /\/$/ ) { $uri .= $indexfile } if (check_uri($uri)) { $nofollow = 1 if $node->getAttribute('check') eq 'nofollow'; next CHECK if $nofollow; foreach (@files_checked) { next CHECK if URI::eq($_, $uri) } push @files, ($uri); push @files_checked, ($uri); } } else { check_uri($uri) } } else { my $src = $node->findvalue('@src'); my $uri = URI->new_abs($src, $base_uri); check_uri($uri); } } } print "\n"; } print "\nSUMMARY\n"; print "Pages Checked: ", scalar(@files_checked), "\n"; print "Links Checked: ", $uris_checked, "\n"; print "Unique Links Checked: ", scalar(keys(%uris_ok)) + scalar(keys(%uris_failed)), "\n"; print "Unique Links Pass/Fail: ", scalar(keys(%uris_ok)), " / ", scalar(keys(%uris_failed)), "\n\n"; sub check_uri { $uris_checked++; my $uri = shift; if ( defined $uris_ok{$uri} ) { print " Valid ($uris_ok{$uri}) $uri\n" if $verbose; return 1; } elsif ( defined $uris_failed{$uri} ) { print " Failed ($uris_failed{$uri}) $uri\n" if $verbose; return 0; } else { my $req = HTTP::Request-> new ('HEAD',$uri); my $res = $ua->request($req); if ( $res->is_success ) { $uris_ok{$uri} = $res->code; print " Valid ($uris_ok{$uri}) $uri\n" if $verbose; return 1; } else { $uris_failed{$uri} = $res->code; print " Failed ($uris_failed{$uri}) $uri\n" if $verbose; return 0; } } }