# HG changeset patch
# User haftmann
# Date 1118038723 -7200
# Node ID fbe2fc30a177e447750658facec40fd1b6501c1d
# Parent ea4e64b2f25a71b7be89d8467e4b4cf1458f095d
added sunbroy2

diff -r ea4e64b2f25a -r fbe2fc30a177 Admin/mirror-main
--- a/Admin/mirror-main Sun Jun 05 23:07:29 2005 +0200
+++ b/Admin/mirror-main Mon Jun 06 08:18:43 2005 +0200
@@ -12,6 +12,9 @@
 HOST=$(hostname)
 
 case ${HOST} in
+    sunbroy2)
+        DEST=/home/html/isabelle/html-data
+        ;;
     atbroy1)
        DEST=/home/html/isabelle/html-data
        ;;
diff -r ea4e64b2f25a -r fbe2fc30a177 Admin/website/build/cl-1.0.1.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Admin/website/build/cl-1.0.1.pl Mon Jun 06 08:18:43 2005 +0200
@@ -0,0 +1,2112 @@
+#!/usr/local/bin/perl
+#
+# Checklinks 1.0.1
+#
+# Starting at one or more seed HTML files, recursively check the
+# validity of all links on the site.  Major features:
+#
+#   * Local URLs are read from the filesystem when possible (much
+#     faster than going through HTTP server).
+#   * Basic server-side includes (aka SSI or SHTML) are checked.
+#   * Latest standards are supported-- HTML 4.0, HTTP 1.1, URIs
+#     according to RFC 2396.
+#   * Links are traversed breadth-first.
+#
+# To list command-line options, run "cl -?" or see &usage() below.
+#
+# TO CONFIGURE:
+#
+#   1) Set $LOCAL_HOST and $DOCUMENT_ROOT, just below.  If you don't, the
+#      program will try to guess them in set_needed_globals(), but it's more
+#      reliable if you enter them here.
+#
+#   2) If needed, set any further server configuration below-- things like
+#      path aliases and so forth.  If you have the srm.conf file, you can
+#      feed it into this script with "-c srm.conf"; otherwise, the default
+#      settings will probably work OK.
+#
+# You can set a few parameters with the undocumented "-D <name>=<value>"
+# command-line option, e.g. "-D LOCAL_HOST=www.myhost.com".
+#
+# Further comments, including an overview of script internals, are at
+# the end of this file.
+#
+# Copyright (C) 1998, 2000 by James Marshall, james@jmarshall.com
+# see http://www.jmarshall.com/tools/cl/ for more info
+#
+#
+# CHANGES IN 1.0.1:
+#
+# This is just a bug fix release.  Fixes include:
+#   . Aliases are handled correctly now.  Sorry 'bout that.
+#   . A redirect + relative URL no longer results in infinitely
+#     recursing URLs.
+#   . More HTML tags are searched for links.
+#   . Non-HTML files are no longer searched for links.
+#   . There were other minor bug fixes.
+#
+#----------------------------------------------------------------------
+
+#use strict ;
+
+my( $LOCAL_HOST, $DOCUMENT_ROOT, $USER_DIR, @DIRECTORY_INDEX,
+    %ALIAS, %ALIAS_MATCH, %SCRIPT_ALIAS, %SCRIPT_ALIAS_MATCH, %UN_ALIAS,
+    @SHTML_EXTENSIONS, @CGI_EXTENSIONS, @INCLUDE_PATTERNS, @EXCLUDE_PATTERNS,
+    @INCLUDE_STATUS, @EXCLUDE_STATUS,
+    $verbose_report, $max_depth, $file_check, $full_http_check,
+    $MAX_REDIRECTS, $MAX_ATTEMPTS, $HTML_BY_NAME, $SUPPORT_NCSA_BUG,
+    @NO_PROXY, $DOC_ROOT_DEV, $DOC_ROOT_INODE, $DOC_ROOT_EXISTS, $CWD,
+    %html_urls, %non_html_urls, %e_to_ch,
+
+    %home_dir, %dir_to_user, %inode_to_user,
+
+    %url, @urlstoget,
+
+    $debug, $CL_VERSION,
+  ) ;
+
+
+#----- User Configuration ---------------------------------------------
+
+# This should be 'localhost', or a hostname of the Web server.  URLs at
+# this host will be assumed to be local; URLs not at this host will not be
+# traversed into.  If this names a remote host, the program will not work.
+# Note that 'localhost' doesn't necessarily point to your local Web server.
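Aside, not part of the patch: $LOCAL_HOST decides which URLs are treated as local, and $DOCUMENT_ROOT is where those local URLs are read from disk, so the two must describe the same server. A minimal standalone sanity check, using the values this patch configures ($host and $docroot are just those values, inlined):

    use strict ;
    use Socket qw(inet_ntoa) ;

    my $host    = 'isabelle.in.tum.de' ;     # value set below
    my $docroot = '/home/proj/isabelle' ;    # value set below

    # Resolve the hostname, then confirm the document root is readable.
    my $addr = gethostbyname($host) ;
    die "cannot resolve $host\n" unless defined $addr ;
    print "$host resolves to ", inet_ntoa($addr), "\n" ;
    die "$docroot is not a readable directory\n" unless -d $docroot && -r _ ;
    print "document root OK: $docroot\n" ;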
+ +# $LOCAL_HOST= 'localhost' ; +# $LOCAL_HOST= 'www.example.com' ; +$LOCAL_HOST='isabelle.in.tum.de'; + +# This is your root Web directory, i.e. the directory that the Web server +# sends the user if the URL "http://$LOCAL_HOST" is requested. It's in +# the configuration file srm.conf (and is read by -c option). +# If you don't know the document root of your server, but you don't need +# it because you're only checking URLs whose path starts with ~, put a +# non-existent path here rather than leave it blank (a hack). + +# $DOCUMENT_ROOT= '/home/www/htdocs' ; +$DOCUMENT_ROOT='/home/proj/isabelle'; + +#----- variables equivalent to srm.conf entries + +# These globals are from the equivalent entries in srm.conf, etc. +# See the command-line option -c , to read values directly +# from srm.conf instead. + +# $USER_DIR= 'public_html' ; +$USER_DIR='.html-data'; +@DIRECTORY_INDEX= qw( index.html index.cgi index.shtml ) ; + +# Used in &url_to_filename(), and possibly elsewhere +# Note that ALIAS_MATCH and SCRIPT_ALIAS_MATCH use Perl (not standard) regexps. +# If order of multiple e.g. "Alias" directives is important, this may not work. +%ALIAS= () ; +%ALIAS_MATCH= () ; +%SCRIPT_ALIAS= () ; +%SCRIPT_ALIAS_MATCH= () ; + +# The list of file extensions to interpret as CGI scripts or +# server-parsed HTML files. +# These are not specific settings in srm.conf, but are combinations of +# AddHandler directives and possibly AddType directives. +@CGI_EXTENSIONS= qw( .cgi ) ; +@SHTML_EXTENSIONS= qw( .shtml ) ; + +#----- end of variables equivalent to srm.conf entries + +# Specify patterns here to only include URLs that match at least one +# pattern. As a special case, an empty list includes all URLs, i.e. +# does not restrict URLs by name (except perhaps by @EXCLUDE_PATTERNS). +# This can be added to or cleared with the -I command-line option. +@INCLUDE_PATTERNS= () ; + +# Specify patterns here to cause matching URLs to be excluded, +# e.g. '\?' means ignore all URLs that query. +# This can be added to or cleared with the -X command-line option. +# @EXCLUDE_PATTERNS= qw( \? ) ; + +# Only report URLs whose status codes start with one of these patterns. +# As a special case, an empty list reports all URLs, i.e. does not +# restrict URLs by status code (except perhaps by @EXCLUDE_STATUS). +# This can be added to or cleared with the -i command-line option. +@INCLUDE_STATUS= () ; + +# Don't report URLs whose status codes start with these patterns. Default +# is qw( 200 ). +# This can be added to or cleared with the -x command-line option. +@EXCLUDE_STATUS= qw( 200 ) ; + +# For 302 or 303 HTTP redirection, redirect no more than this many times. +$MAX_REDIRECTS= 5 ; + +# If a connection times out, etc., attempt no more than this many times. +$MAX_ATTEMPTS= 5 ; + +# The old version determined whether a file was HTML by the -T test (text +# file), and so traversed all HTML-like links in any text file that wasn't +# a CGI script. It's probably more appropriate to check the file +# extension, to exclude source code, .txt files, etc. Leave $HTML_BY_NAME +# set to use the filename, or unset it to traverse all HTML-like links in +# any text files, as the old version did. +$HTML_BY_NAME= 1 ; + +# Some old NCSA servers, including 1.5.2, don't report the HTTP version +# correctly in the status line; they return e.g. "HTTP 200 OK". To allow +# this, leave the variable here set. +$SUPPORT_NCSA_BUG= 1 ; + + +#----- DO NOT CHANGE ANYTHING BELOW THIS LINE, unless you want to... 
+#----------------------------------------------------------------------
+
+#----- Further Global Variable Initialization --------------------------
+
+$CL_VERSION= '1.0.1' ;
+
+$ENV{'http_proxy'}||= $ENV{'HTTP_PROXY'} ;
+@NO_PROXY= split(/[\s,]+/, $ENV{'no_proxy'} || $ENV{'NO_PROXY'} ) ;
+
+
+# If output's not going directly to terminal, this ensures autoflushing.
+$|= 1 ;
+
+#----- End of Configuration --------------------------------------------
+
+use strict 'vars' ;
+use IO::Socket ;
+
+&usage unless @ARGV ;
+
+# Process command-line options
+&getopts ;
+
+# Make any final needed adjustments to globals, after the hard-coded
+# values above and any options have been processed.
+&adjust_all_globals ;
+
+# Default to "." if no starting filenames given.
+# 3-6-98: Anh, decided against it.
+#@ARGV= ('.') unless @ARGV ;
+
+# &add_url() sets $url{$_} and pushes to @urlstoget, only if not already
+# added, plus any other initialization.
+# Only add a file if it can be accessed with a URL.
+foreach my $arg (@ARGV) {
+    if ($arg=~ m#^http://#i) {
+        &add_url($arg, '-', 0) ;
+    } else {
+        my($URL)= &filename_to_url($arg, $CWD) ;
+        if (defined($URL)) {
+            &add_url($URL, '-', 0) ;
+        } else {
+            die "ERROR: $arg is not accessible through the Web server.\n" ;
+        }
+    }
+}
+
+
+# Check the URLs, in order.  @urlstoget may grow and rearrange.
+while (@urlstoget) {
+    my($url)= shift(@urlstoget) ;
+    if ( !$url->{'ishtml'} or !$url->{'islocal'} or $url->{'dontfollow'}
+         or (length($max_depth) and $url->{'depth'} > $max_depth ) ) {
+        &verify_url($url) ;            # may set ishtml=true
+    }
+    if ( $url->{'ishtml'} and $url->{'islocal'} and !$url->{'dontfollow'}
+         and (!length($max_depth) or $url->{'depth'} <= $max_depth ) ) {
+        my($HTML)= &load_url($url) ;   # may set ishtml=false
+        # 11-30-99 JSM: fixed to handle rel URLs in redirected pages correctly
+        my($base_url)= $url->{'location'} || $url->{'URL'} ;
+        &extract_urls($HTML, $base_url, $url->{'URL'}, $url->{'depth'}+1)
+            if $url->{'ishtml'} ;      # big, calls &add_url()
+    }
+
+    # If we get an error response that may be corrected with another
+    # attempt, put it back in the queue.  Such errors include 408,
+    # 503, 504, and the homegrown codes 600, 601, 602, and 603.
+    if ($url->{'status'}=~ /^(408|503|504|600|601|602|603)\b/ ) {
+        push(@urlstoget, $url) if ( $url->{'numtries'} < $MAX_ATTEMPTS ) ;
+    }
+
+}
+
+&make_report() ;
+
+exit ;
+
+
+
+#----- Process command-line options -----------------------------------
+
+# Process any command-line options.
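Aside, not part of the patch: the sub that follows is a hand-rolled single-letter option parser. For orientation, a rough sketch of the same interface with the core Getopt::Std module (simplified: the real -I/-X/-i/-x options accumulate across repeats, which getopts() does not do):

    use strict ;
    use Getopt::Std ;

    # -v -f -h -q take no argument; -d -c -I -X -i -x take one.
    my %opt ;
    getopts('vfhqd:c:I:X:i:x:', \%opt) or die "bad options\n" ;
    print "max depth: $opt{d}\n" if defined $opt{d} ;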
+sub getopts { + my($opt, $param) ; + while ($ARGV[0]=~ /^-/) { + $opt= shift(@ARGV) ; + ($opt, $param)= $opt=~ /^-(.)(.*)/ ; + + # Turn on verbose reporting + if ($opt eq 'v') { + $verbose_report= ($param ne '-') ; + + # User-specified patterns to exclude ('' to clear list) + } elsif ($opt eq 'I') { + $param= shift(@ARGV) unless length($param) ; + if (length($param)) { push(@INCLUDE_PATTERNS, $param) } + else { @INCLUDE_PATTERNS= () } + + # User-specified patterns to exclude ('' to clear list) + } elsif ($opt eq 'X') { + $param= shift(@ARGV) unless length($param) ; + if (length($param)) { push(@EXCLUDE_PATTERNS, $param) } + else { @EXCLUDE_PATTERNS= () } + + # User-specified response codes to ignore ('' to clear list) + } elsif ($opt eq 'i') { + $param= shift(@ARGV) unless length($param) ; + if (length($param)) { push(@INCLUDE_STATUS, $param) } + else { @INCLUDE_STATUS= () } + + # User-specified response codes to ignore ('' to clear list) + } elsif ($opt eq 'x') { + $param= shift(@ARGV) unless length($param) ; + if (length($param)) { push(@EXCLUDE_STATUS, $param) } + else { @EXCLUDE_STATUS= () } + + # Maximum traversal depth + } elsif ($opt eq 'd') { + $param= shift(@ARGV) unless length($param) ; + $max_depth= $param ; + + # Make it a "file check"-- only read local files, do not use HTTP + } elsif ($opt eq 'f') { + $file_check= ($param ne '-') ; + + # Use HTTP for all URL's, even local files + } elsif ($opt eq 'h') { + $full_http_check= ($param ne '-') ; + + # Read configuration parameters from srm.conf-like file + } elsif ($opt eq 'c') { + $param= shift(@ARGV) unless length($param) ; + &read_srm_conf($param) ; + + # Print current configuration parameters + } elsif ($opt eq 'q') { + &print_config ; + exit ; # jsm-- should we exit? + + # Allow certain parameters to be defined via the command line + } elsif ($opt eq 'D') { + $param= shift(@ARGV) unless length($param) ; + $debug=1, unshift(@ARGV,$param), next if $param=~ /^-/ ; + my($name,$value)= split(/=/, $param, 2) ; + $value= 1 unless length($value) ; + if ($name=~ /^(LOCAL_HOST|DOCUMENT_ROOT|USER_DIR|DEBUG|debug)$/) { + eval "\$$name= \$value" ; + #$$name= $value ; # this doesn't work, because of initial my() + } + + } elsif ($opt eq '?') { + &usage ; + + # End command-line option processing on "--" + } elsif ($opt eq '-') { + return ; + + } else { + print STDERR + "Illegal option-- '$opt'. Enter \"$0 -?\" for help.\n" ; + exit ; + } + + } + + if ($file_check and $full_http_check) { + print STDERR "You cannot use both the -f and the -h options.\n" ; + exit ; + } + +} + + +# Read appropriate values from the given file, typically srm.conf. If a +# directory is named, default to filename "srm.conf". +# Note that opening "-" will open STDIN. +sub read_srm_conf { + my($fname)= @_ ; + local(*SRM) ; + + # default to srm.conf if only a directory is named + if (-d $fname) { + $fname=~ s#/$## ; + $fname.= "/srm.conf" ; + } + + # Clear old values + $DOCUMENT_ROOT= $USER_DIR= '' ; + @DIRECTORY_INDEX= @CGI_EXTENSIONS= @SHTML_EXTENSIONS= () ; + %ALIAS= %ALIAS_MATCH= %SCRIPT_ALIAS= %SCRIPT_ALIAS_MATCH= () ; + + open(SRM, "<$fname") || die "Can't open $fname: $!" 
+    ;
+    while (<SRM>) {
+        s/#.*// ;
+        next unless /\S/ ;
+        my($name, @param)= /(\S+)/g ;
+
+        if ($name eq 'DocumentRoot') {
+            $DOCUMENT_ROOT= $param[0] ;
+
+        } elsif ($name eq 'UserDir') {
+            $USER_DIR= $param[0] ;
+
+        } elsif ($name eq 'DirectoryIndex') {
+            @DIRECTORY_INDEX= @param ;
+
+        } elsif ($name eq 'Alias') {
+            $ALIAS{$param[0]}= $param[1] ;
+
+        } elsif ($name eq 'AliasMatch') {
+            $ALIAS_MATCH{$param[0]}= $param[1] ;
+
+        } elsif ($name eq 'ScriptAlias') {
+            $SCRIPT_ALIAS{$param[0]}= $param[1] ;
+
+        } elsif ($name eq 'ScriptAliasMatch') {
+            $SCRIPT_ALIAS_MATCH{$param[0]}= $param[1] ;
+
+        } elsif ($name eq 'AddHandler') {
+            if ($param[0] eq 'cgi-script') {
+                push(@CGI_EXTENSIONS, $param[1]) ;
+            } elsif ($param[0] eq 'server-parsed') {
+                push(@SHTML_EXTENSIONS, $param[1]) ;
+            }
+        }
+
+    }
+    close(SRM) ;
+}
+
+
+# Make any final settings to global variables, after the hard-coded values
+# and command-line options have been processed.
+# Most non-user-configurable globals are also set here.
+sub adjust_all_globals {
+
+    # Standardize $USER_DIR to never have trailing slash
+    $USER_DIR=~ s#/$## ;
+
+    # If no $LOCAL_HOST set, try to read it from first URL in list, or
+    # use the string 'localhost' if that URL contains no hostname.
+    unless (length($LOCAL_HOST)) {
+        $LOCAL_HOST= (&parse_url($ARGV[0]))[1] || 'localhost' ;
+        print STDERR "LOCAL_HOST set to \"\L$LOCAL_HOST\E\"\n" ;
+    }
+    $LOCAL_HOST= lc($LOCAL_HOST) ;
+
+    # If no $DOCUMENT_ROOT, try to guess it from $HOME, username, $USER_DIR.
+    unless (length($DOCUMENT_ROOT)) {
+        my($home) ;
+        unless ($home= $ENV{'HOME'}) {
+            my($uname)= getpwuid($<) || $ENV{'USER'} || `whoami` || `id -un` ;
+            chomp($uname) ;
+            &read_home_dirs unless %home_dir ;   # only read when needed
+            $home= $home_dir{$uname} ;
+        }
+        $DOCUMENT_ROOT= "$home/$USER_DIR" ;
+
+        die "Could not determine DOCUMENT_ROOT; edit the $0 script to set it.\n"
+            unless (-d $DOCUMENT_ROOT) ;
+
+        print STDERR "DOCUMENT_ROOT set to \"$DOCUMENT_ROOT\"\n" ;
+    }
+    $DOCUMENT_ROOT=~ s#/$## ;
+
+    # Allows &filename_to_url() to unalias as best as possible.  Note that
+    # use of &filename_to_url() can be avoided by the user; see note in
+    # that routine.
+    %UN_ALIAS= (reverse (%ALIAS, %SCRIPT_ALIAS) ) ;
+
+    # These are to compare equivalency to later, in &filename_to_url().
+    ($DOC_ROOT_DEV, $DOC_ROOT_INODE)= stat("$DOCUMENT_ROOT/.") ;
+
+    $DOC_ROOT_EXISTS= -e _ ;
+
+    # Set CWD from shell variable, else from `pwd`.
+    $CWD= $ENV{'PWD'} || `pwd` || die "couldn't run pwd: $!" ;
+    chomp($CWD) ;
+
+
+    # These are used by &extract_urls().
+    # This is a complete list of URL-type attributes defined in HTML 4.0,
+    # plus any others I found, like nonstandard ones or from an earlier HTML.
+    # Only a few of these are commonly used, as of early 1998.
+    # The set in %html_urls could possibly link to HTML resources, while the
+    # set in %non_html_urls could not.  The %special(.*) sets, here for
+    # reference only, include URL attributes that require special handling.
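Aside, not part of the patch: the %html_urls and %non_html_urls tables that follow map a tag name to the attributes whose values are URLs, so extraction reduces to a table lookup per parsed tag. Roughly, as in this self-contained sketch (urls_in_tag() is illustrative, not a routine of the script):

    use strict ;
    my %html_urls = ( 'a' => ['href'], 'frame' => ['src', 'longdesc'] ) ;

    # Given a tag name and its attribute hash, return the URL-valued attributes.
    sub urls_in_tag {
        my ($name, $attrib, $table) = @_ ;
        return grep { defined }
               map  { $attrib->{$_} } @{ $table->{$name} || [] } ;
    }

    print urls_in_tag('a', { href => 'http://localhost/x.html' }, \%html_urls), "\n" ;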
+ + %html_urls= ( 'a' => [ 'href' ], + 'area' => [ 'href' ], + 'frame' => [ 'src', 'longdesc' ], + 'link' => [ 'href', 'urn' ], + 'img' => [ 'longdesc', 'usemap' ], + 'q' => [ 'cite' ], + 'blockquote' => [ 'cite' ], + 'ins' => [ 'cite' ], + 'del' => [ 'cite' ], + 'object' => [ 'usemap' ], + 'input' => [ 'usemap' ], + 'iframe' => [ 'src', 'longdesc' ], + 'ilayer' => [ 'src' ], + 'layer' => [ 'src' ], + 'fig' => [ 'imagemap' ], + 'overlay' => [ 'imagemap' ], + 'meta' => [ 'url' ], + 'note' => [ 'src' ], + ) ; + + %non_html_urls= ( 'body' => [ 'background' ], + 'img' => [ 'src', 'lowsrc', 'dynsrc' ], + 'input' => [ 'src' ], + 'script' => [ 'src', 'for' ], + + 'fig' => [ 'src' ], + 'overlay' => [ 'src' ], + 'select' => [ 'src' ], + 'ul' => [ 'src' ], + 'h1' => [ 'src' ], + 'h2' => [ 'src' ], + 'h3' => [ 'src' ], + 'h4' => [ 'src' ], + 'h5' => [ 'src' ], + 'h6' => [ 'src' ], + 'hr' => [ 'src' ], + 'table' => [ 'src' ], + 'td' => [ 'src' ], + 'th' => [ 'src' ], + 'tr' => [ 'src' ], + + 'bgsound' => [ 'src' ], + 'embed' => [ 'src' ], + ) ; + + # %special_urls= ( 'base' => [ 'href' ] ) ; + # + # %special_html_urls= ( 'object' => [ 'codebase', 'data' ] ) ; + # + # %special_non_html_urls= + # ( 'head' => [ 'profile' ], + # 'object' => [ 'codebase', 'archive', 'classid' ], + # 'applet' => [ 'codebase', 'code', 'object', 'archive' ], + # 'form' => [ 'action', 'script' ] + # ) ; + + + # This is a translation from entity character references to characters, + # used in &HTMLunescape(). + # This simplified version only supports " < > &, but that + # should be enough for URL-type attributes. + # See http://www.w3.org/TR/REC-html40/sgml/entities.html for full entity + # list. + + %e_to_ch= (quot => '"', + 'lt' => '<', + 'gt' => '>', + amp => '&') ; + +} + + +#---------------------------------------------------------------------- + +# Add the URL to our data structures; specifically, to %url and @urlstoget. +# Returns a pointer to the structure in %url, or undef if already defined +# or on error. +# Currently, this always receives the URL with the host name lowercase, +# either from &absolute_url() or from using $LOCAL_HOST. +sub add_url { + my($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow)= @_ ; + + # Allow the user to restrict URL patterns: URLs must be in + # @INCLUDE_PATTERNS but not in @EXCLUDE_PATTERNS (but only restrict + # by @INCLUDE_PATTERNS if it's not empty). + return undef if @INCLUDE_PATTERNS && + !grep( $URL=~ /$_/, @INCLUDE_PATTERNS ) ; + return undef if grep( $URL=~ /$_/, @EXCLUDE_PATTERNS ) ; + + # Canonicalize URL, so we don't get a page multiple times + $URL= &canonicalize($URL) ; + + # for obscure case involving a
+    # <FORM>-extracted URL being
+    # overwritten by <A> extraction (don't fret over this)
+    $url{$URL}{'dontfollow'}&&= $dontfollow  if $url{$URL} ;
+
+    # Don't add the record a second time!  Or will infinitely traverse.
+    return undef if $url{$URL} ;   # or add to @referers, for 301 correction...?
+
+    # Only HTTP URLs are currently supported
+    return undef unless $URL=~ /^http:/i ;
+
+    # Any self-referral here indicates a bug in the program.  It's happened.
+    die "PROGRAM ERROR: $URL shows its first referer as itself.\n"
+        if $referer eq $URL ;
+
+    my(%u) ;
+    @u{qw(URL referer depth ishtml iscgi dontfollow)}=
+        ($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow) ;
+    $u{'islocal'}= ($URL=~ m#^http://\Q$LOCAL_HOST\E/#io) + 0 ;  # make length>0
+    if ($u{'islocal'}) {
+#        $u{'filename'}= &url_to_filename($URL) ;
+        @u{'filename', 'location'}= &url_to_filename($URL) ;
+        $u{'iscgi'}= &is_cgi($u{'filename'}, $URL) if $u{'iscgi'} eq '' ;
+
+        # 2-27-00 JSM: Detect ishtml by filename, not -T test.
+        if ( $u{'ishtml'} eq '' ) {
+            $u{'ishtml'}= $HTML_BY_NAME
+                ? ( !$u{'iscgi'} && -e $u{'filename'} &&
+                    $u{'filename'}=~ /\.html?$/i ) + 0
+                : (!$u{'iscgi'} && -e $u{'filename'} && -T _) + 0 ;
+        }
+#        $u{'ishtml'}= (!$u{'iscgi'} && -e $u{'filename'} && -T _) + 0
+#            unless length($u{'ishtml'}) ;
+
+    }
+
+    # If we're only doing a file check, don't add URLs that require HTTP
+    return undef if ($file_check and (!$u{'islocal'} or $u{'iscgi'}) ) ;
+
+    push(@urlstoget, \%u) ;
+    $url{$URL}= \%u ;
+
+    # return \%u ;   # unneeded because of previous statement
+}
+
+
+# Guess if a file is a CGI script or not.  Returns true if the (regular) file
+# is executable, has one of @CGI_EXTENSIONS, or if the URL is in a
+# ScriptAlias'ed directory.
+# $fname must be absolute path, but $URL is optional (saves time if available).
+# Note that URLs like "/path/script.cgi?a=b" are handled correctly-- the
+# previously extracted filename is tested for CGI-ness, while the URL is
+# checked for ScriptAlias matching (which is unaffected by final query
+# strings or PATH_INFO).
+sub is_cgi {
+    my($fname, $URL)= @_ ;
+    return 1 if (-x $fname && ! -d _ ) ;   # should we really do this?
+    foreach (@CGI_EXTENSIONS) { return 1 if $fname=~ /\Q$_\E$/i }
+
+    $URL= &filename_to_url($fname) unless length($URL) ;  # currently unused
+    my($URLpath)= $URL=~ m#^http://[^/]*(.*)#i ;
+    foreach (keys %SCRIPT_ALIAS)       { return 1 if $URLpath=~ /^\Q$_\E/ }
+    foreach (keys %SCRIPT_ALIAS_MATCH) { return 1 if $URLpath=~ /^$_/ }
+
+    return 0 ;
+}
+
+
+# Put the URL in such a form that two URLs that point to the same resource
+# have the same URL, to avoid superfluous retrievals.
+# Host name is lowercased elsewhere-- this routine is only called from
+# &add_url; see note there.  To lowercase the host name here would be
+# inefficient.
+sub canonicalize {
+    my($URL)= @_ ;
+
+    $URL=~ s/#.*// ;    # remove any "#" fragment from end of URL
+
+    return $URL ;
+}
+
+
+#----- File reading/downloading routines (includes networking) --------
+
+# Verify that a URL exists, and set $url->{'status'} accordingly.  Do
+# this either by checking the local filesystem or by using the HTTP HEAD
+# method for remote sites or CGI scripts.
+# Set $url->{'ishtml'} accordingly if discovered from Content-Type:.
+# This does not support various Redirect directives in srm.conf.
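Aside, not part of the patch: verify_url() below picks the cheapest adequate check: a filesystem test for a plain local file, an HTTP HEAD for everything else. The decision, reduced to a sketch (head_status() is a hypothetical stand-in for the HTTP path):

    use strict ;

    # Returns a status string in the same style the script uses.
    sub status_for {
        my ($islocal, $iscgi, $filename, $url) = @_ ;
        if ($islocal && !$iscgi) {
            return -e $filename ? "200 Local File Exists" : "404 File Not Found" ;
        }
        return head_status($url) ;        # hypothetical HEAD-request helper
    }
    sub head_status { return "200 OK" }   # stub so the sketch runs standalone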
+sub verify_url { + my($url)= @_ ; + + print STDERR "verifying $url->{'URL'}\n" if $debug ; + + + # Depending on the state of $url->{islocal, iscgi, dontfollow} and + # $full_http_check, take appropriate actions to check/set the + # status code for this URL. + + # NOTE: In some situations, specifically when checking a CGI script + # named in a (thus implying that dontfollow is set), + # and using HTTP to check the URL (because the script is remote or + # $full_http_check is set), the HTTP response code may not be + # accurate. This is because there is no form data sent with the + # request, as there normally would be. In these cases, a cautionary + # note is appended to $url->{'status'}. Additionally, an empty + # $url->{'status'} is changed to an explanatory note (maybe we should + # do that in load_url() too?). + + # Use HEAD if file is remote, or if $full_http_check is set. + if (!$url->{'islocal'} or $full_http_check) { + &load_url_using_HTTP($url, 'HEAD') ; + $url->{'status'}= '[no status returned]' + unless length($url->{'status'}) ; + $url->{'status'}.= ' (NOTE: Form was not submitted normally)' + if $url->{'dontfollow'} ; + + # URL is local: If it's not CGI, do a normal local file check + } elsif (!$url->{'iscgi'}) { + $url->{'status'}= (-e $url->{'filename'}) + ? "200 Local File Exists" : "404 File Not Found" ; + + # URL is local CGI: Use HEAD unless dontfollow is set + } elsif (!$url->{'dontfollow'}) { + &load_url_using_HTTP($url, 'HEAD') ; + + # Else it's a local CGI with dontfollow set: Check for executable file + } else { + $url->{'status'}= + (! -e $url->{'filename'}) ? "404 File Not Found" + : (! -x $url->{'filename'}) ? "403 Local File Is Not Executable" + : "200 Local Executable File Exists" + + } + + +# Old verify routine below: +# +# # If is a local non-CGI file, check it directly from the filesystem +# if ($url->{'islocal'} and !$url->{'iscgi'} and !$full_http_check) { +# $url->{'status'}= (-e $url->{'filename'}) +# ? "200 Local File Exists" : "404 File Not Found" ; +# +# # Otherwise, download its HEAD from its HTTP server +# } else { +# &load_url_using_HTTP($url, 'HEAD') ; +# } + + +} + + + +# Load entire file/resource and return its contents, setting $url->{'status'} +# accordingly. Do this either by checking the local filesystem or by +# using the HTTP GET method for remote sites or CGI scripts. +# Set $url->{'ishtml'} accordingly if discovered from Content-Type:. +# This does not support various Redirect directives in srm.conf. +sub load_url { + my($url)= @_ ; + my($HTML) ; + + print STDERR "loading $url->{'URL'}\n" if $debug ; + + # If is a local non-CGI file, read it directly from the filesystem + if ($url->{'islocal'} and !$url->{'iscgi'} and !$full_http_check) { + my($iscgi) ; + ($HTML, $url->{'ssierrs'}, $iscgi)= + &read_expanded_file($url->{'filename'}, $url->{'URL'}) ; + $url->{'status'}= + !defined($HTML) + ? sprintf("450 Can't read file: %s (%s)", $!, $!+0) + : @{$url->{'ssierrs'}} + ? sprintf("451 SSI Error(s) (%s total)", + scalar @{$url->{'ssierrs'}}) + + : "200 Local File Read OK" ; + + # $url->{'iscgi'} may be set if an SHTML file included CGI calls. + # Don't set it if we're doing a file check, in which case we'll + # keep whatever $HTML we could get. 
+ $url->{'iscgi'}= $iscgi unless $file_check ; + } + + # Otherwise (or if rereckoned), download the resource from its HTTP server + if (!$url->{'islocal'} or $url->{'iscgi'} or $full_http_check) { + (undef, undef, $HTML)= &load_url_using_HTTP($url, 'GET') ; + } + + # Note that this will be set even when URL is to be reloaded, like + # for a 601 (timeout) response. + $url->{'hasbeenloaded'}= 1 ; + return $HTML ; +} + + +# Read a local file and return its contents. If a file is SSI (aka SHTML), +# expand any SSI directives as needed, recursively +# including nested files. +# This is used for all local reads, SHTML or not, but the vast bulk of this +# routine is for SHTML files. +# +# If file is SHTML, this routine also returns a structure of error data, +# and a boolean saying if this file needs to be downloaded via HTTP +# for a complete check (e.g. includes CGI calls). +# +# $fname must be canonicalized absolute path, but $URL parameter is optional. +# %$parents contains all "include"-ancestors of the file, to prevent loops. +# If omitted, assumes no ancestors (and a fresh hash is started). +# +# This routine seems much bigger and more complex than it needs to be. +# It could be one third the size and much simpler if we didn't have to +# worry about full error reporting on nested includes. +# +# Note: This routine was made to mimic what Apache would return to a client. +# However, the result differs from Apache's in two slight ways, both +# involving nested SSI within , and both +# apparent bugs in Apache 1.1 (may be fixed in later versions): +# +# 1) If a value contains no "/" (i.e. in current directory), +# then Apache always parses the included file as SHTML, regardless of +# extension. This routine checks @SHTML_EXTENSIONS for all included +# files. +# 2) If a value containing a "/" loads an SHTML file +# containing a tag with a relative path, the directive +# fails in Apache. This routine tries to guess the correct path/URL. +# +# +# Notes on this routine, and SHTML files in general: +# +# At first thought, it seems like we could load each included file +# only once, instead of once for every file that includes it. +# However, because of the fact that relative URLs are resolved +# relative to the top-level including file, the top-level file will +# need to be expanded every time. (It's legal (though of questionable +# wisdom) to include a file from e.g. both /a/index.shtml and +# /b/index.shtml, so links from the included file point to different +# URLs.) +# +# Note that while URLs in included files (e.g. ) are +# resolved relative to the top-level including file, nested include tags +# are resolved relative to the direct includer. +# +# We could possibly be more efficient in time (but costly in memory) +# by storing the expanded contents and $errlist of each included file, +# since those will be constant (except $errlist's include-loop +# reporting might vary somewhat). There are probably other ways to +# eek out savings of time and memory, at the cost of complexity. +# +# The main loop here is inside of an s/// statement. Unusual, but it's an +# appropriate way to handle the recursion. Recursion is needed, since each +# included file may or may not be SHTML. +# +# $iscgi is set if a file includes " tag that points to a CGI file, or if +# any of its include-children sets $iscgi. +# +# +# Notes to help clarify data structures, if (God forbid) you have to modify +# this routine: +# +# Each error is a list of files in an "include chain", and $errlist is a +# list of errors. 
$errlist is associated with the current $HTML. Each +# error in $errlist is associated with some tag in $HTML, as iterated in +# the s/// loop. When this routine returns ($HTML, $errlist), the +# errors in $errlist should all have as their first element tags that +# were found in $HTML. +# +# Each error's final element (the "leaf") is associated with a tag that +# tried to load an invalid file. Each leaf will always be the complete +# set of errors for that tag (i.e. it has no children, since it couldn't +# load the file). +# +# If the file can be validly loaded, then we may have 0 or multiple errors +# associated with this file/tag (and returned from this routine). Each +# file's $errlist is an accumulation of all of its children's $errlist's. +# +# Errors that come from loading a child are associated with the child's +# $HTML and tags. Before adding them to the parent's (current) $errlist, +# they must have the CHILD's path/tag unshifted onto the front of the +# include chain; all errors are then added to the current $errlist. +# This ensures that: +# a) The errors are now all associated with the tag that loaded the child. +# b) The errors in the current $errlist are always associated with tags +# from the current $HTML. +# + +sub read_expanded_file { + my($fname, $URL, $parents)= @_ ; + my($HTML, $errlist, $iscgi) ; + my($isshtml) ; + + $parents->{$fname}= 1 ; + + $HTML= &read_file($fname) ; + return(undef) unless defined($HTML) ; + + foreach (@SHTML_EXTENSIONS) { + $isshtml= 1, last if ($fname=~ /\Q$_\E$/i) ; + } + + if ($isshtml) { + $errlist= [] ; + $iscgi= ($HTML=~ /.'}]); + last GET_CHILD ; + } + if ($childfname=~ m#\.\.(/|$)#) { + push(@$errlist, [ {'path' => $childfname, + 'tag' => $tagst, 'errmsg' => + 'Paths can not contain "../" in ' + . '.'}]); + last GET_CHILD ; + } + $childfname= ($fname=~ m#(.*/)#)[0] . $childfname ; + + } else { + push(@$errlist, [ {'path' => '', + 'tag' => $tagst, 'errmsg' => + 'Tag must contain either the "file" or ' + . '"virtual" attribute.'}]); + last GET_CHILD ; + + } + + # canonicalize filename for %$parents + 1 while $childfname=~ s#/\.(/|$)#/# ; + 1 while $childfname=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ; + + # Guarantee that file exists, is regular, and is readable + unless (-e $childfname) { + push(@$errlist, [{'path' => $childfname, 'tag' => $tagst, + 'errmsg' => 'File not found'} ] ) ; + last GET_CHILD ; + } + unless (-f $childfname) { + push(@$errlist, [{'path' => $childfname, 'tag' => $tagst, + 'errmsg' => 'File is not a regular' + . ' file.' } ] ) ; + last GET_CHILD ; + } + unless (-r $childfname) { + push(@$errlist, [{'path' => $childfname, 'tag' => $tagst, + 'errmsg' => 'File is not readable by' + . ' current user.' } ] ) ; + last GET_CHILD ; + } + + # Guard against include loops + if ($parents->{$childfname}) { + push(@$errlist, [{'path' => $childfname, 'tag' => $tagst, + 'errmsg' => 'An "include" loop exists' + . ' involving this file.' } ] ) ; + last GET_CHILD ; + } + + + # Get the included file, with any error data + ($childHTML, $childerrlist, $childiscgi)= + &read_expanded_file($childfname, $childURL, $parents) ; + + # Log if there was any error reading the file + push(@$errlist, [{'path' => $childfname, 'tag' => $tagst, + 'errmsg' => "Can't read file: $!." 
+                      } ] )
+            unless defined($childHTML) ;
+
+        # Add any errors to the current (parent) error list
+        foreach my $error (@$childerrlist) {
+            unshift(@$error,
+                    { 'path' => $childfname, 'tag' => $tagst } ) ;
+        }
+        push(@$errlist, @$childerrlist) ;
+
+        # Parent is a CGI if any of its children is a CGI
+        $iscgi||= $childiscgi ;
+
+        }    # GET_CHILD
+
+
+        $childHTML ;    # final value to replace in main s/// construct
+
+        }    # do {}
+
+        }gie ;   # $HTML=~ s{} {}
+
+    }   # if ($isshtml)
+
+    delete $parents->{$fname} ;
+
+    return($HTML, $errlist, $iscgi) ;
+}
+
+
+
+# Returns the contents of the named file, or undef on error.
+sub read_file {
+    my($fname)= @_ ;
+    local(*F, $/) ;
+
+    undef $/ ;
+    open(F, "<$fname") || return undef ;
+    my($ret)= <F> ;
+    close(F) ;
+
+    return $ret ;
+}
+
+
+# Try to get the given URL with the given HTTP method, and return the
+# status line, headers, and body.
+# Set $url->{'status'} accordingly, and set $url->{'ishtml'} accordingly
+# if Content-Type: header is returned.
+# This is specific to this program, and calls the more general &get_url().
+# This could be slightly more efficient if 302 or 303 was handled in the
+# calling routine, where it could take advantage of a new URL being local.
+sub load_url_using_HTTP {
+    my($url, $method)= @_ ;
+    my($status_line, $headers, $body) ;
+
+    # We should not get here if $file_check is set
+    die "mistakenly called load_url_using_HTTP($url->{'URL'})" if $file_check ;
+
+    GETFILE: {
+        ($status_line, $headers, $body)=
+            &get_url( ($url->{'location'} || $url->{'URL'}), $method) ;
+
+        # If HEAD failed (as on some servers), sigh and use GET
+        ($status_line, $headers, $body)=
+            &get_url( ($url->{'location'} || $url->{'URL'}), 'GET')
+            unless length($status_line) ;
+
+        ($url->{'status'})= $status_line=~ m#^HTTP/[\d.]+\s+(.*)# ;
+
+        # 2-27-00 JSM: Allow old NCSA servers to not include the HTTP version.
+        if ($SUPPORT_NCSA_BUG and $url->{'status'} eq '') {
+            ($url->{'status'})= $status_line=~ m#^HTTP(?:/[\d.]+)?\s+(.*)# ;
+        }
+
+        # Redirect to new location if status is 302 or 303
+        if ($url->{'status'}=~ /^(301|302|303)\b/) {
+            ($url->{'location'})= $headers=~ m#^Location:[ \t]+(\S+)#im ;
+            last GETFILE unless length($url->{'location'}) ;
+            $url->{'location'}=
+                &absolute_url($url->{'location'}, $url->{'URL'}) ;
+            redo GETFILE
+                if ($url->{'status'}=~ /^(302|303)\b/)
+                    && (++$url->{'numredirects'} <= $MAX_REDIRECTS) ;
+        }
+    }
+
+    $url->{'numtries'}++ ;
+    $url->{'lasttried'}= time ;
+
+    # If successful response included Content-Type:, set ishtml accordingly
+    $url->{'ishtml'}= (lc($1) eq 'text/html') + 0
+        if $url->{'status'}=~ /^2/
+            and $headers=~ m#^content-type:[ \t]*(\S+)#im ;
+
+    print STDERR "status: $status_line\n" if $debug ;
+
+    return($status_line, $headers, $body) ;
+}
+
+
+# Request the HTTP resource at the given absolute URL using the given method,
+# and return the response status line, headers, and body.
+# jsm-- in the future, this should support downloading to a file, in case
+# the download is too large to fit in memory.
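Aside, not part of the patch: get_url() below speaks HTTP/1.1 directly over a socket, which is why it must deal with 100-continue responses, chunked bodies, and Content-Length itself. The bare request/response exchange it builds on looks like this (minimal sketch, HEAD only, no body handling):

    use strict ;
    use IO::Socket::INET ;

    my $host = 'localhost' ;              # illustrative host
    my $s = IO::Socket::INET->new(PeerAddr => $host, PeerPort => 80,
                                  Proto => 'tcp')
        or die "connect failed: $@\n" ;
    $s->autoflush(1) ;                    # as in the script: very important
    print $s "HEAD / HTTP/1.1\015\012",
             "Host: $host\015\012",
             "Connection: close\015\012\015\012" ;
    my $status_line = <$s> ;              # e.g. "HTTP/1.1 200 OK"
    print $status_line ;
    close($s) ;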
+sub get_url { + my($URL, $method)= @_ ; + my($host, $uri, $endhost) ; + my($S, $rin) ; + my($response, $status_line, $headers, $body, $status_code) ; + my($content_length) ; + $method= uc($method) ; + $method= 'GET' unless length($method) ; + + ($host, $uri)= $URL=~ m#^http://([^/]*)(.*)$#i ; + $uri= '/' unless length($uri) ; + $endhost= $host ; + + # use an HTTP proxy if $ENV{'http_proxy'} is set + USEPROXY: { + last USEPROXY unless $host=~ /\./ ; + if (length($ENV{'http_proxy'})) { + foreach (@NO_PROXY) { + last USEPROXY if $host=~ /$_$/i ; + } + ($host)= $ENV{'http_proxy'}=~ m#^(?:http://)?([^/]*)#i ; + $uri= $URL ; + } + } + + # Open socket + $S= IO::Socket::INET->new(PeerAddr => $host, # may contain :port + PeerPort => 80, # default if none in PeerAddr + Proto => 'tcp') ; + return("HTTP/1.1 600 Can't create socket: $@") unless defined($S) ; + $S->autoflush() ; # very important!! + + # Send HTTP 1.1 request + print $S "$method $uri HTTP/1.1\015\012", + "Host: $endhost\015\012", + "Connection: close\015\012", + "User-agent: CheckLinks/$CL_VERSION\015\012", + "\015\012" ; + + # Wait for socket response with select() + vec($rin= '', fileno($S), 1)= 1 ; + select($rin, undef, undef, 60) + || return("HTTP/1.1 601 Connection timed out") ; + + local($/)= "\012" ; + + # Handle "100 Continue" responses for HTTP 1.1: loop until non-1xx. + do { + $status_line= <$S> ; + $status_line=~ s/\015?\012$// ; + ($status_code)= $status_line=~ m#^HTTP/\d+\.\d+\s+(\d+)# ; + + $headers= '' ; + while (<$S>) { + last if /^\015?\012/ ; + $headers.= $_ ; + } + $headers=~ s/\015?\012[ \t]+/ /g ; + } until $status_code!~ /^1/ ; + + # Body length is determined by HTTP 1.1 spec, section 4.4: these + # certain conditions implying no body, then chunked encoding, + # then Content-length: header, then server closing connection. + if ($method eq 'HEAD' or $status_code=~ /^(1|204\b|304\b)/) { + $body= undef ; + + # else chunked encoding + } elsif ($headers=~ /^transfer-encoding:[ \t]*chunked\b/im) { + # 7-16-99: Old code was only saving last chunk. Fix using + # $this_chunk contributed by Mark Trotter. 
+ my($this_chunk, $chunk_size, $readsofar, $thisread) ; + while ($chunk_size= hex(<$S>)) { + $readsofar= 0 ; + while ($readsofar!=$chunk_size) { + last unless $thisread= + read($S, $this_chunk, $chunk_size-$readsofar, $readsofar) ; + $readsofar+= $thisread ; + } + return("HTTP/1.1 603 Incomplete chunked response", $headers, $body) + if $readsofar!=$chunk_size ; + $_= <$S> ; # clear CRLF after chunk + $body.= $this_chunk ; + } + + # Read footers if they exist + while (<$S>) { + last if /^\015?\012/ ; + $headers.= $_ ; + } + $headers=~ s/\015?\012[ \t]+/ /g ; + + + # else body length given in Content-length: + } elsif (($content_length)= $headers=~ /^content-length:[ \t]*(\d+)/im) { + my($readsofar, $thisread) ; + while ($readsofar!=$content_length) { + last unless $thisread= + read($S, $body, $content_length-$readsofar, $readsofar) ; + $readsofar+= $thisread ; + } + return(sprintf("HTTP/1.1 602 Incomplete response (%s of %s bytes)", + $readsofar+0, $content_length), + $headers, $body) + if $readsofar!=$content_length ; + + + # else body is entire socket output + } else { + local($/)= undef ; + $body= <$S> ; + } + + close($S) ; + + return($status_line, $headers, $body) ; +} + + +#----- URL-parsing routines ------------------------------------------- + +# The routines parse_url(), unparse_url(), and absolute_url() are based on +# different sections in the Internet Draft "Uniform Resource Identifiers +# (URI): Generic Syntax and Semantics", 11-18-97, by Berners-Lee, +# Fielding, and Masinter, filename draft-fielding-uri-syntax-01. + +# Parse a URL into its components, according to URI draft, sections 4.3, 4.4. +# This regular expression is straight from Appendix B, modified to use Perl 5. +# Returns scheme, site, path, query, and fragment. All but path may have +# the undefined value. +sub parse_url { + my($URL)= @_ ; + my($scheme, $site, $path, $query, $fragment)= + ($URL=~ m{^(?: ([^:/?\#]+):)? + (?: // ([^/?\#]*))? + ([^?\#]*) + (?: \? ([^\#]*))? + (?: \# (.*))? + }x + ) ; + + + # Un-URL-encode the path, to equivalate things like %7E --> ~ + # Note that in some situations, this may cause problems with URLs that + # contain the % character: if the unescaped URL is then used in + # relative URL calculation, it may be unescaped again (rare). + $path=~ s/\+/ /g ; + $path=~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge ; + + # Note that in HTTP, the presence of a host implies a path beginning with + # '/', so $path should be '/' for URLs like "http://www.somehost.com" + $path= '/' if !length($path) && length($site) && lc($scheme) eq 'http' ; + + return($scheme, $site, $path, $query, $fragment) ; + +} + + +# Returns a full URL string, given its components +# The full procedure is described in the URI draft, section 5.2, step 7. +sub unparse_url { + my($scheme, $site, $path, $query, $fragment)= @_ ; + my($URL) ; + + $URL= "$scheme:" if defined($scheme) ; + $URL.= "//$site" if defined($site) ; + $URL.= $path ; + $URL.= "?$query" if defined($query) ; + $URL.= "#$fragment" if defined($fragment) ; + + return $URL ; +} + + +# Returns a canonicalized absolute URL, given a relative URL and a base URL. +# The full procedure is described in the URI draft, section 5.2. +# Note that a relative URI of "#fragment" should be resolved to "the current +# document", not to an absolute URL. This presents a quandary for this +# routine: should it always return an absolute URL, thus violating the +# spec, or should it not always return an absolute URL, thus requiring any +# caller to check for this special case? 
This routine leaves that up to +# the caller, with $return_rel_fragment-- if set, stick to the spec; +# otherwise, always return an absolute URL. See section G.4 of the draft. +# Note that the pathname reduction in steps 6.c-f messes up any PATH_INFO +# that has ./ or ../ in it, which may be a bug in the spec. +sub absolute_url { + my($relurl, $baseurl, $return_rel_fragment)= @_ ; + my(@relurl, @baseurl) ; + + # parse_url() returns scheme, site, path, query, fragment + @relurl= &parse_url($relurl) ; # Step 1 + @baseurl= &parse_url($baseurl) ; + + COMBINE: { + + # Step 2 + # See note above about $return_rel_fragment + if ( $relurl[2] eq '' && + !defined($relurl[0]) && + !defined($relurl[1]) && + !defined($relurl[3]) ) { + @relurl[0..3]= @baseurl[0..3] ; + return $relurl if $return_rel_fragment ; # see note above + last COMBINE ; + } + + last COMBINE if defined($relurl[0]) ; # Step 3 + $relurl[0]= $baseurl[0] ; + + last COMBINE if defined($relurl[1]) ; # Step 4 + $relurl[1]= $baseurl[1] ; + + last COMBINE if $relurl[2]=~ m#^/# ; # Step 5 + + # Step 6-- resolve relative path + my($path)= $baseurl[2]=~ m#^(.*/)# ; # Step 6.a + $relurl[2]= $path . $relurl[2] ; # Step 6.b + + } # COMBINE + + # Put the remaining steps outside of the block to canonicalize the path. + # Arguably, this is not allowed. To avoid such arguments at the expense of + # path canonicalization, put steps 6.c-f back in the COMBINE block. + + 1 while $relurl[2]=~ s#(^|/)\./#$1# ; # Step 6.c + $relurl[2]=~ s#(^|/)\.$#$1# ; # Step 6.d + + # Step 6.e + my($oldpath) ; + while ($relurl[2]=~ s#(([^/]+)/\.\./)# ($2 eq '..') ? $1 : '' #ge) { + last if ($relurl[2] eq $oldpath) ; + $oldpath= $relurl[2] ; + } + + # Step 6.f + $relurl[2]=~ s#(([^/]+)/\.\.$)# ($2 eq '..') ? $1 : '' #ge ; + + # Step 6.g: allow leading ".." segments to remain in path + # Step 6.h: relurl[2] is already the buffer string + + # To canonicalize further, lowercase the hostname (is this valid for all + # schemes?) + $relurl[1]= lc($relurl[1]) if defined($relurl[1]) ; + + return &unparse_url(@relurl) ; # Step 7 +} + + + +# Convert a local URL into a canonicalized absolute path, or undef if +# not on this host or other error. +# Result should only be used as filename. +# Supports UserDir (e.g. public_html) for "/~username/path/file" URLs. +# Supports Alias, AliasMatch, ScriptAlias, and ScriptAliasMatch from srm.conf +# (but note use of Perl regex's instead of standard regex's). +# Inserts index.html, etc. (from @DIRECTORY_INDEX) if result is a directory, +# but just return directory name (ending in '/') if none of those exists. +# Removes PATH_INFO, if any, from filename. +# Directory names are always returned with trailing slash (which would not +# be appropriate if PATH_INFO was to be retained). +# While this routines makes some tests (e.g. if the file is a directory), +# it does not verify that file at the resulting $filename exists. +# Note that not all URLs point to files, so this routine is not always +# appropriate. In this program, the result from this routine is only +# used when we know the URL is not a CGI script (and is therefore a file), +# except in &is_cgi() itself, which tests if a file is a CGI script. +# If it weren't for &is_cgi(), we could ignore cases when the URL isn't +# a file. +# 12-1-99 JSM: Changed to also return "redirected" location, in case URL +# is a directory but not ending in a slash, so relative URLs will resolve +# correctly against the redirected URL. 
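Aside, not part of the patch: url_to_filename() below inverts what the web server does at request time. Stripped of the alias, ~user, and PATH_INFO handling, the core mapping is: refuse paths that escape the root, append the URL path to the document root, and try directory-index files for directories. A simplified self-contained sketch (simple_url_to_filename() is illustrative only):

    use strict ;

    sub simple_url_to_filename {
        my ($urlpath, $docroot, @index) = @_ ;
        return undef if $urlpath =~ m#^/\.\.(/|$)# ;   # ^/.. is not allowed
        my $path = "$docroot$urlpath" ;
        if (-d $path) {                                # directory: try index files
            $path .= '/' unless $path =~ m#/$# ;
            foreach my $i (@index) {
                return "$path$i" if -f "$path$i" ;
            }
        }
        return $path ;
    }

    print simple_url_to_filename('/', '/home/proj/isabelle', 'index.html'), "\n" ;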
+sub url_to_filename { + my($URL)= @_ ; + my($URLpath, $path, $location, $docroot, $user) ; + return undef unless $URL=~ m#^http://\Q$LOCAL_HOST\E/#io ; + $URLpath= (&parse_url($URL))[2] ; + die "couldn't get path from [$URL]" unless length($URLpath) ; + + # Note minor security hole: if this script is run setuid, then any + # file on the system could be read by using an ALIAS to point to the + # file. Note also that if a $URLpath such as "/alias/dir/../.." is + # passed to this routine, the alias will be substituted BEFORE the + # ".." path segments are traversed. A case like this probably a + # mistake in the URL anyway. + + # Make no more than one alias substitution-- is there a precedence order? + # Note that %(.*)_MATCH use Perl regex's, not standard regex's. + # 3-29-99 JSM: These all alias to actual directory, not to a resulting + # URL, so no further conversion should be done if one of these matches. + # 3-29-99 JSM: Changed ALIAS_MATCH and SCRIPT_ALIAS_MATCH blocks to + # allow $1-type substitution in targets; MUST TEST! + ALIAS: { + foreach (keys %ALIAS) + { $path= $URLpath, last ALIAS + if $URLpath=~ s/^\Q$_\E/$ALIAS{$_}/ } + foreach (keys %ALIAS_MATCH) + { $path= $URLpath, last ALIAS + if eval "\$URLpath=~ s/^\$_/$ALIAS_MATCH{$_}/" } + foreach (keys %SCRIPT_ALIAS) + { $path= $URLpath, last ALIAS + if $URLpath=~ s/^\Q$_\E/$SCRIPT_ALIAS{$_}/ } + foreach (keys %SCRIPT_ALIAS_MATCH) + { $path= $URLpath, last ALIAS + if eval "\$URLpath=~ s/^\$_/$SCRIPT_ALIAS_MATCH{$_}/" } + } + + + # If $path has been set in above ALIAS block, no further conversion is + # needed. + if ($path eq '') { + + # Must check for ^/.. before PATH_INFO check, in case $URL's path + # is e.g. '/../conf/access.conf' + return undef if $URLpath=~ m#^/\.\.(/|$)# ; # ^/.. is not allowed + + # Set $docroot and $path for this file, based on the URL (contains '~' ?) + if (length($USER_DIR) and ($user,$path)= $URLpath=~ m#^/~([^/]+)(.*)# ) { + &read_home_dirs unless %home_dir ; # only read when needed + return undef unless length($home_dir{$user}) ; + $docroot= "$home_dir{$user}/$USER_DIR" ; + $path= '/' unless length($path) ; + } else { + # If we have no $DOCUMENT_ROOT, we can't handle URLs without ~. + return undef unless $DOC_ROOT_EXISTS ; + $docroot= $DOCUMENT_ROOT ; + $path= $URLpath ; + } + + # Handle PATH_INFO: remove path segments until an existing file is named. + # Note that directories cannot have PATH_INFO after them. + unless (-e "$docroot$path") { + for (my($path2)= $path ; $path2=~ m#/# ; $path2=~ s#/[^/]*$##) { + last if -d "$docroot$path2" ; + $path= $path2, last if -e "$docroot$path2" ; + } + } + + # canonicalize path, and recheck for ^/.. (handles an obscure error, + # when $URL's path is e.g. '/a/b/../../..'; but must canonicalize + # after PATH_INFO check, in case path is e.g. '/a/b.cgi/../../..'). + 1 while $path=~ s#/\.(/|$)#/# ; + 1 while $path=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ; + return undef if $path=~ m#^/\.\.(/|$)# ; # ^/.. is not allowed + + $path= "$docroot$path" ; + + } + + + # Add "index.html", etc. if appropriate + if (-d $path) { + $path.= '/' unless $path=~ m#/$# ; + # 12-1-99 JSM: set "redirected" location also + $location= "$URL/" unless $URL=~ m#/$# ; + foreach (@DIRECTORY_INDEX) { + $path.= $_, last if -f "$path$_" ; + } + } + return ($path, $location) ; +} + + +# Convert a local (possibly relative) pathname into a canonicalized URL. +# If filename is relative and no $basepath is given, assume it's in the +# current directory. +# Supports UserDir (e.g. 
public_html) for "/~username/path/file" URLs. +# Each path segment is checked to see if it's the same as $DOCUMENT_ROOT, +# by comparing inodes. When a match is found, it's cut off the front, +# and an absolute URL is constructed. If $DOCUMENT_ROOT is never matched, +# then $USER_DIR is scanned for. If that doesn't match (i.e. the file +# is not served to the Web), undef is returned. +# Note that $DOC_ROOT_DEV and $DOC_ROOT_INODE are set at the start of the +# program for efficiency, but are an integral part of this routine. +# %ALIAS is supported using %UN_ALIAS, as best as possible. See next note +# on avoiding use of this routine. +# This is currently only used when parsing command-line filenames, and when +# an includes an +# (which may be an error anyway). Thus, it can be avoided if needed, such +# as when complex aliasing makes results ambiguous. +# jsm-- should this add/remove @DIRECTORY_INDEX, to avoid some duplication? +# 3-29-99 JSM: Changed UNALIAS handling-- if it's unaliased, then no other +# conversion is necessary. +sub filename_to_url { + my($path, $basepath)= @_ ; + my($URLpath) ; + unless ($path=~ m#^/#) { + $basepath= $CWD unless length($basepath) ; + $basepath.= '/' if -d $basepath && $basepath!~ m#/$# ; + $basepath=~ s#/[^/]*$#/# ; + $path= "$basepath$path" ; + } + + # canonicalize filename by removing ./ and ../ where appropriate + 1 while $path=~ s#/\.(/|$)#/# ; + 1 while $path=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ; + + # canonicalize directory to include final / + $path.= '/' if -d $path && $path!~ m#/$# ; + + # First, if path can be unaliased, return that. + # (Added 3-29-99 by JSM.) + foreach (keys %UN_ALIAS) + { $URLpath= $path, last if $path=~ s/^\Q$_\E/$UN_ALIAS{$_}/ } + return "http://\L$LOCAL_HOST\E$URLpath" if $URLpath ne '' ; + + # Then, check if file is under $DOCUMENT_ROOT tree, and convert if so. + if ($DOC_ROOT_EXISTS) { + my($doc_root)= $path ; + while ($doc_root=~ s#/[^/]*$##) { + my($dev,$inode)= stat("$doc_root/.") ; + if ( ($dev==$DOC_ROOT_DEV) && ($inode==$DOC_ROOT_INODE) ) { + $path=~ s/^$doc_root// ; +# foreach (keys %UN_ALIAS) +# { last if $path=~ s/^\Q$_\E/$UN_ALIAS{$_}/ } + return "http://\L$LOCAL_HOST\E$path" ; + } + } + } + + # Handle possible case of "~username/$USER_DIR/$path" + # I don't think %ALIAS applies here, does it? + # This misses some when $HOME/$USER_DIR points through a symbolic link, + # and $CWD isn't set to match %dir_to_user. Work around by avoiding + # this routine, e.g. using only URLs on command line. + if (length($USER_DIR)) { + if ($path=~ m#^(.*?)/$USER_DIR(/.*)# ) { + # First, see if path is in %dir_to_user + &read_home_dirs unless %dir_to_user ; # only read when needed + return "http://\L$LOCAL_HOST\E/~$dir_to_user{$1}$2" + if length($dir_to_user{$1}) ; + + # If not, then we must check inodes to equivalate directories + &read_inode_to_user unless %inode_to_user ; # only read when needed + my($dev,$inode)= stat("$1/.") ; + return "http://\L$LOCAL_HOST\E/~$inode_to_user{$dev}{$inode}$2" + if length($inode_to_user{$dev}{$inode}) ; + } + } + + return undef ; +} + + + +# Reads all users' home directory into %home_dir, from /etc/passwd. +# Also creates %dir_to_user, which is faster than %inode_to_user (below). +# Only used when $USER_DIR is used, for "/~username/path/file" URLs. +# 2-27-00 JSM: Changed to use getpwent, instead of reading /etc/passwd. 
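Aside, not part of the patch: getpwent() returns a whole passwd entry; slots 0 and 7 are the login name and home directory, which is all that ~user URL support needs. A self-contained demo of that slice, mirroring the loop in the sub below:

    use strict ;

    my %home ;
    setpwent ;                            # rewind the passwd database
    while (my @ent = getpwent) {
        my ($user, $dir) = @ent[0,7] ;    # login name, home directory
        $home{$user} ||= $dir ;
    }
    endpwent ;
    print scalar(keys %home), " accounts found\n" ;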
+sub read_home_dirs {
+    my($user, $homedir) ;
+
+    setpwent ;     # to rewind, in case getpwent has already been used
+    while ( ($user, $homedir)= (getpwent)[0,7] ) {
+        $home_dir{$user}= $homedir ;
+        $dir_to_user{$homedir}= $user
+            unless $dir_to_user{$homedir} ne '' ;
+    }
+    endpwent ;     # clean way to end getpwent processing
+}
+
+
+# Reads home directory inode information into %inode_to_user, from /etc/passwd.
+# Because this is time-consuming, it is only called if needed, and only once.
+# Only used when $USER_DIR is used, for "/~username/path/file" URLs.
+# On SPARCstation-10 with 3000 /etc/passwd records, this takes ~2 seconds.
+# 2-27-00 JSM: Changed to use already-existing %home_dir, instead of reading
+# /etc/passwd again.
+sub read_inode_to_user {
+    my($user, $homedir) ;
+    my($dev, $inode) ;
+    &read_home_dirs unless %home_dir ;    # only read when needed
+
+    while ( ($user, $homedir)= each %home_dir ) {
+        ($dev,$inode)= stat("$homedir/.") ;
+        $inode_to_user{$dev}{$inode}= $user
+            unless $inode_to_user{$dev}{$inode} ne '' ;
+    }
+}
+
+
+
+#----- Extracting URLs from HTML ------------------------------------
+
+
+# Parse an SGML tag, and return a hash structure with a "name" scalar and
+# an "attrib" hash.
+# Parses first tag in string, ignoring all surrounding text.
+# Results are canonicalized to lower case wherever case-insensitive.
+sub parse_tag {
+    my($tag)= @_ ;    # will be mangled
+    my($tagname,%attrib) ;
+
+    # only parse first tag in string
+    ($tag)= split(/>/, $tag) ;     # remove all after >
+    $tag=~ s/^([^<]*<)?\s*// ;     # remove pre-<, <, and leading blanks
+    ($tagname,$tag)= split(/\s+/, $tag, 2) ;   # split out tag name
+
+    # Extract name/value (possibly quoted), lowercase name, set $attrib{}.
+    # If quoted, is delimited by quotes; if not, delimited by whitespace.
+    $attrib{lc($1)}= &HTMLunescape($+)
+        while ($tag=~ s/\s*(\w+)\s*=\s*(([^"']\S*)|"([^"]*)"?|'([^']*)'?)//) ;
+
+    # now, get remaining non-valued (boolean) attributes
+    $tag=~ s/^\s*|\s*$//g ;            # skip leading/trailing blanks
+    foreach (split(/\s+/, $tag)) {
+        $_= lc($_) ;
+        $attrib{$_}= $_ ;    # booleans have values equal to their name
+    }
+
+    return { 'name'   => lc($tagname),
+             'attrib' => \%attrib } ;
+}
+
+
+# Unescape any HTML character references and return resulting string.
+# Support entity character references in %e_to_ch (which is incomplete),
+# plus "&#ddd;" and "&#xdd;" forms for values<256.
+# Note that not decoding a valid character is erroneous, in that a
+# subsequent re-escaping will not return the original string, because
+# of the ampersand.  Nonetheless, that's preferable to losing the data.
+# Q: Is there an appropriate general way to represent an unescaped string?
+sub HTMLunescape {
+    my($s)= @_ ;
+
+    # Try alpha, decimal, and hex representations, only substituting if valid
+    $s=~ s/&(([a-zA-Z][a-zA-Z0-9.-]*);?|#([0-9]+);?|#[Xx]([0-9a-fA-F]+);?)/
+             length($2) ? ( defined($e_to_ch{$2}) ? $e_to_ch{$2} : "&$1" )
+           : length($3) ? ( $3 < 256  ?  chr($3)  : "&$1" )
+           : length($4) ? ( hex($4) < 256 ? chr(hex($4)) : "&$1" )
+           :              "&$1"
+          /ge ;
+
+    return $s ;
+}
+
+
+# Given a block of HTML, extracts all URLs referenced in it, and adds them
+# to our data structures to be downloaded or checked (i.e. calls
+# &add_url()).
+# Note that %html_urls and %non_html_urls are set at the start of the
+# program for efficiency, but are an integral part of this routine.
+# Currently, this extracts all <.*?> patterns, which may not be valid if
+# "<" or ">" characters are e.g. inside a