curl/perl/contrib/checklinks.pl.in
Daniel Stenberg 67a83c1b34 David Shaw finally removed all traces of Gopher and we are now officially
not supporting it. It hasn't been functioning for years anyway, so this is
just finally stating what already was true. And a cleanup at the same time.
2006-01-16 22:14:37 +00:00

337 lines
6.3 KiB
Perl

#!@PERL@
#
# checklinks.pl
#
# This script extracts all links from a HTML page and checks their validity.
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Version: 0.7 Sept 30, 1998
#
# HISTORY
#
# 0.5 - Cuts off the #-part from links before checking.
#
# 0.6 - Now deals with error codes 3XX better and follows the Location:
# properly.
# - Added the -x flag that only checks http:// -links
#
# 0.7 - Ok, http://www.viunga.se/main.html didn't realize this had no path
# but a document. Now it does.
#
#
$in="";
argv:
if($ARGV[0] eq "-v" ) {
$verbose = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-i" ) {
$usestdin = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-l" ) {
$linenumber = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-h" ) {
$help = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-x" ) {
$external = 1;
shift @ARGV;
goto argv;
}
$geturl = $ARGV[0];
if(($geturl eq "") || $help) {
print "Usage: $0 [-hilvx] <full URL>\n",
" Use a traling slash for directory URLs!\n",
" -h This help text\n",
" -i Read the initial page from stdin\n",
" -l Line number report for BAD links\n",
" -v Verbose mode\n",
" -x Check non-local (external?) links only\n";
exit;
}
if($ARGV[1] eq "-") {
print "We use stdin!\n";
$usestdin = 1;
}
# This is necessary from where I tried this:
#$proxy =" -x 194.237.142.41:80";
# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
$linkcheck = "curl -s -m 20 -I$proxy";
# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
$linkcheckfull = "curl -s -m 20 -i$proxy";
# htmlget, URL will be appended to the right of this command line
$htmlget = "curl -s$proxy";
# Parse the input URL and split it into the relevant parts:
sub SplitURL {
my $inurl = $_[0];
if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = $4;
}
elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = "";
if($getpath !~ /\//) {
$getpath ="";
$getdocument = $3;
}
}
elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = "";
$getdocument = "";
}
else {
print "Couldn't parse the specified URL, retry please!\n";
exit;
}
}
&SplitURL($geturl);
#print "protocol = $getprotocol\n";
#print "server = $getserver\n";
#print "path = $getpath\n";
#print "document = $getdocument\n";
#exit;
if(!$usestdin) {
open(HEADGET, "$linkcheck $geturl|") ||
die "Couldn't get web page for some reason";
headget:
while(<HEADGET>) {
# print $_;
if($_ =~ /HTTP\/.*3\d\d /) {
$pagemoved=1;
}
elsif($pagemoved &&
($_ =~ /^Location: (.*)/)) {
$geturl = $1;
&SplitURL($geturl);
$pagemoved++;
last headget;
}
}
close(HEADGET);
if($pagemoved == 1) {
print "Page is moved but we don't know where. Did you forget the ",
"traling slash?\n";
exit;
}
open(WEBGET, "$htmlget $geturl|") ||
die "Couldn't get web page for some reason";
while(<WEBGET>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
# print $line."\n";
$in=$in.$line;
}
close(WEBGET);
}
else {
while(<STDIN>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
$in=$in.$line;
}
}
#print length($in)."\n";
sub LinkWorks {
my $check = $_[0];
# URL encode:
# $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;
@doc = `$linkcheck \"$check\"`;
$head = 1;
# print "COMMAND: $linkcheck \"$check\"\n";
# print $doc[0]."\n";
boo:
if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
$error = $1;
if($error < 400 ) {
return "GOOD";
}
else {
if($head && ($error >= 500)) {
# This server doesn't like HEAD!
@doc = `$linkcheckfull \"$check\"`;
$head = 0;
goto boo;
}
return "BAD";
}
}
return "BAD";
}
sub GetLinks {
my $in = $_[0];
my @result;
getlinkloop:
while($in =~ /[^<]*(<[^>]+>)/g ) {
# we have a tag in $1
$tag = $1;
if($tag =~ /^<!--/) {
# this is a comment tag, ignore it
}
else {
if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ )>]*)/i) {
$url=$2;
if($url =~ /^\"(.*)\"$/) {
# this was a "string" now $1 has removed the quotes:
$url=$1;
}
$url =~ s/([^\#]*)\#.*/$1/g;
if($url eq "") {
# if the link was nothing than a #-link it may now have
# been emptied completely so then we skip the rest
next getlinkloop;
}
if($done{$url}) {
# if this url already is done, do next
$done{$url}++;
next getlinkloop;
}
$done{$url} = 1; # this is "done"
push @result, $url;
if($tag =~ /< *([^ ]+)/) {
# print "TAG: $1\n";
$tagtype{$url}=$1;
}
}
}
}
return @result;
}
@links = &GetLinks($in);
linkloop:
for(@links) {
$url = $_;
if($url =~ /^([^:]+):/) {
$prot = $1;
# if($prot !~ /(http|ftp)/i) {
if($prot !~ /http/i) {
# this is an unsupported protocol, we ignore this
next linkloop;
}
$link = $url;
}
else {
if($external) {
next linkloop;
}
# this is a link on the save server:
if($url =~ /^\//) {
# from root
$link = "$getprotocol://$getserver$url";
}
else {
# from the scanned page's dir
$nyurl=$url;
if(length($getpath) &&
($getpath !~ /\/$/) &&
($nyurl !~ /^\//)) {
# lacks ending slash, add one to the document part:
$nyurl = "/".$nyurl;
}
$link = "$getprotocol://$getserver/$getpath$nyurl";
}
}
#print "test $link\n";
#$success = "GOOD";
$success = &LinkWorks($link);
$count = $done{$url};
$allcount += $count;
print "$success $count <".$tagtype{$url}."> $link $url\n";
# If bad and -l, present the line numbers of the usage
if("BAD" eq $success) {
$badlinks++;
if($linenumber) {
$line =1;
for(@indoc) {
if($_ =~ /$url/) {
print " line $line\n";
}
$line++;
}
}
}
}
if($verbose) {
print "$allcount links were checked";
if($badlinks > 0) {
print ", $badlinks were found bad";
}
print "\n";
}