Admin/website/build/cl-1.0.1.pl
changeset 16292 fbe2fc30a177
       
     1 #!/usr/local/bin/perl
       
     2 #
       
     3 #   Checklinks 1.0.1
       
     4 #
       
     5 #     Starting at one or more seed HTML files, recursively check the
       
     6 #     validity of all links on the site.  Major features:
       
     7 #
       
     8 #       * Local URLs are read from the filesystem when possible (much
       
     9 #           faster than going through HTTP server).
       
    10 #       * Basic server-side includes (aka SSI or SHTML) are checked.
       
    11 #       * Latest standards are supported-- HTML 4.0, HTTP 1.1, URIs
       
    12 #           according to RFC 2396.
       
    13 #       * Links are traversed breadth-first.
       
    14 #
       
    15 #   To list command-line options, run "cl -?" or see &usage() below.
       
    16 #
       
    17 #   TO CONFIGURE: 
       
    18 #
       
    19 #   1) Set $LOCAL_HOST and $DOCUMENT_ROOT, just below.  If you don't, the
       
    20 #      program will try to guess them in &adjust_all_globals(), but it's more
       
    21 #      reliable if you enter them here.
       
    22 #
       
    23 #   2) If needed, set any further server configuration below-- things like
       
    24 #      path aliases and so forth.  If you have the srm.conf file, you can 
       
    25 #      feed it into this script with "-c srm.conf"; otherwise, the default 
       
    26 #      settings will probably work OK.
       
    27 #
       
    28 #   You can set a few parameters with the undocumented "-D <name=value>"
       
    29 #   command-line option, e.g. "-D LOCAL_HOST=www.myhost.com".
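       #
       #   For illustration only (hypothetical host and paths), a couple of
       #   example invocations:
       #
       #     cl-1.0.1.pl -v index.html
       #     cl-1.0.1.pl -c srm.conf -D LOCAL_HOST=www.example.com http://www.example.com/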
       
    30 #
       
    31 #   Further comments, including an overview of script internals, are at
       
    32 #   the end of this file.
       
    33 #
       
    34 #   Copyright (C) 1998, 2000 by James Marshall, james@jmarshall.com
       
    35 #   see http://www.jmarshall.com/tools/cl/ for more info
       
    36 #
       
    37 #
       
    38 #   CHANGES IN 1.0.1:
       
    39 #
       
    40 #     This is just a bug fix release.  Fixes include:
       
    41 #       . Aliases are handled correctly now.  Sorry 'bout that.
       
    42 #       . A redirect + relative URL no longer results in infinitely
       
    43 #           recursing URLs.
       
    44 #       . More HTML tags are searched for links.
       
    45 #       . Non-HTML files are no longer searched for links.
       
    46 #       . There were other minor bug fixes.
       
    47 #
       
    48 #----------------------------------------------------------------------
       
    49 
       
    50 #use strict ;
       
    51 
       
    52 my( $LOCAL_HOST, $DOCUMENT_ROOT, $USER_DIR, @DIRECTORY_INDEX,
       
    53     %ALIAS, %ALIAS_MATCH, %SCRIPT_ALIAS, %SCRIPT_ALIAS_MATCH, %UN_ALIAS,
       
    54     @SHTML_EXTENSIONS, @CGI_EXTENSIONS, @INCLUDE_PATTERNS, @EXCLUDE_PATTERNS,
       
    55           @INCLUDE_STATUS, @EXCLUDE_STATUS,
       
    56     $verbose_report, $max_depth, $file_check, $full_http_check,
       
    57     $MAX_REDIRECTS, $MAX_ATTEMPTS, $HTML_BY_NAME, $SUPPORT_NCSA_BUG,
       
    58     @NO_PROXY, $DOC_ROOT_DEV, $DOC_ROOT_INODE, $DOC_ROOT_EXISTS, $CWD,
       
    59     %html_urls, %non_html_urls, %e_to_ch,
       
    60 
       
    61     %home_dir, %dir_to_user, %inode_to_user,
       
    62 
       
    63     %url, @urlstoget, 
       
    64 
       
    65     $debug, $CL_VERSION,
       
    66   ) ;
       
    67 
       
    68 
       
    69 #----- User Configuration ---------------------------------------------
       
    70 
       
    71 # This should be 'localhost', or a hostname of the Web server.  URLs at
       
    72 #   this host will be assumed to be local; URLs not at this host will not be
       
    73 #   traversed into. If this names a remote host, the program will not work.
       
    74 # Note that 'localhost' doesn't necessarily point to your local Web server.
       
    75 
       
    76 # $LOCAL_HOST= 'localhost' ;
       
    77 # $LOCAL_HOST= 'www.example.com' ;
       
    78 $LOCAL_HOST='isabelle.in.tum.de';
       
    79 
       
    80 # This is your root Web directory, i.e. the directory that the Web server
       
    81 #   sends the user if the URL "http://$LOCAL_HOST" is requested.  It's in
       
    82 #   the configuration file srm.conf (and is read by the -c option).
       
    83 # If you don't know the document root of your server, but you don't need
       
    84 #   it because you're only checking URLs whose path starts with ~, put a
       
    85 #   non-existent path here rather than leave it blank (a hack).
       
    86 
       
    87 # $DOCUMENT_ROOT= '/home/www/htdocs' ;
       
    88 $DOCUMENT_ROOT='/home/proj/isabelle';
       
    89 
       
    90 #----- variables equivalent to srm.conf entries 
       
    91 
       
    92 # These globals are from the equivalent entries in srm.conf, etc.
       
    93 # See the command-line option -c <config-file>, to read values directly 
       
    94 #   from srm.conf instead.
       
    95 
       
    96 # $USER_DIR= 'public_html' ;
       
    97 $USER_DIR='.html-data';
       
    98 @DIRECTORY_INDEX= qw( index.html index.cgi index.shtml ) ;
       
    99 
       
   100 # Used in &url_to_filename(), and possibly elsewhere
       
   101 # Note that ALIAS_MATCH and SCRIPT_ALIAS_MATCH use Perl (not standard) regexps.
       
   102 # If the order of multiple "Alias" (etc.) directives matters, this may not work.
       
   103 %ALIAS= () ;
       
   104 %ALIAS_MATCH= () ;
       
   105 %SCRIPT_ALIAS= () ;
       
   106 %SCRIPT_ALIAS_MATCH= () ;
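       # For example (hypothetical paths), the srm.conf lines
       #   "Alias /icons/ /usr/local/apache/icons/" and
       #   "ScriptAlias /cgi-bin/ /usr/local/apache/cgi-bin/" would correspond to:
       # %ALIAS= ( '/icons/' => '/usr/local/apache/icons/' ) ;
       # %SCRIPT_ALIAS= ( '/cgi-bin/' => '/usr/local/apache/cgi-bin/' ) ;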
       
   107 
       
   108 # The list of file extensions to interpret as CGI scripts or
       
   109 #   server-parsed HTML files.
       
   110 # These are not specific settings in srm.conf, but are combinations of
       
   111 #   AddHandler directives and possibly AddType directives.
       
   112 @CGI_EXTENSIONS=   qw( .cgi ) ;
       
   113 @SHTML_EXTENSIONS= qw( .shtml ) ;
       
   114 
       
   115 #----- end of variables equivalent to srm.conf entries 
       
   116 
       
   117 # Specify patterns here to only include URLs that match at least one
       
   118 #   pattern.  As a special case, an empty list includes all URLs, i.e.
       
   119 #   does not restrict URLs by name (except perhaps by @EXCLUDE_PATTERNS).
       
   120 # This can be added to or cleared with the -I command-line option.
       
   121 @INCLUDE_PATTERNS= () ;
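       # For example (hypothetical site), to check only URLs under /docs/:
       # @INCLUDE_PATTERNS= qw( ^http://www\.example\.com/docs/ ) ;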
       
   122 
       
   123 # Specify patterns here to cause matching URLs to be excluded,
       
   124 #   e.g. '\?' means ignore all URLs that contain a query string.
       
   125 # This can be added to or cleared with the -X command-line option.
       
   126 # @EXCLUDE_PATTERNS=  qw( \? ) ;
       
   127 
       
   128 # Only report URLs whose status codes start with one of these patterns.
       
   129 #   As a special case, an empty list reports all URLs, i.e. does not 
       
   130 #   restrict URLs by status code (except perhaps by @EXCLUDE_STATUS).
       
   131 # This can be added to or cleared with the -i command-line option.
       
   132 @INCLUDE_STATUS= () ;
       
   133 
       
   134 # Don't report URLs whose status codes start with these patterns.  Default
       
   135 #   is qw( 200 ).
       
   136 # This can be added to or cleared with the -x command-line option.
       
   137 @EXCLUDE_STATUS= qw( 200 ) ;
       
   138 
       
   139 # For 302 or 303 HTTP redirection, redirect no more than this many times.
       
   140 $MAX_REDIRECTS= 5 ;
       
   141 
       
   142 # If a connection times out, etc., attempt no more than this many times.
       
   143 $MAX_ATTEMPTS= 5 ;
       
   144 
       
   145 # The old version determined whether a file was HTML by the -T test (text
       
   146 #   file), and so traversed all HTML-like links in any text file that wasn't
       
   147 #   a CGI script.  It's probably more appropriate to check the file
       
   148 #   extension, to exclude source code, .txt files, etc.  Leave $HTML_BY_NAME
       
   149 #   set to use the filename, or unset it to traverse all HTML-like links in
       
   150 #   any text files, as the old version did.
       
   151 $HTML_BY_NAME= 1 ;
       
   152 
       
   153 # Some old NCSA servers, including 1.5.2, don't report the HTTP version
       
   154 #   correctly in the status line; they return e.g. "HTTP 200 OK".  To allow
       
   155 #   this, leave this variable set.
       
   156 $SUPPORT_NCSA_BUG= 1 ;
       
   157 
       
   158 
       
   159 #----- DO NOT CHANGE ANYTHING BELOW THIS LINE, unless you want to... ---
       
   160 
       
   161 #----- Further Global Variable Initialization --------------------------
       
   162 
       
   163 $CL_VERSION= '1.0.1' ;
       
   164 
       
   165 $ENV{'http_proxy'}||= $ENV{'HTTP_PROXY'} ;
       
   166 @NO_PROXY= split(/[\s,]+/, $ENV{'no_proxy'} || $ENV{'NO_PROXY'} ) ;
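       # For example (hypothetical values), with http_proxy=http://proxy.example.com:8080
       #   and no_proxy=example.com, &get_url() would send requests through the proxy
       #   except for hosts ending in "example.com".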
       
   167 
       
   168 
       
   169 # If output's not going directly to terminal, this ensures autoflushing.
       
   170 $|= 1 ;
       
   171 
       
   172 #----- End of Configuration --------------------------------------------
       
   173 
       
   174 use strict 'vars' ;
       
   175 use IO::Socket ;
       
   176 
       
   177 &usage unless @ARGV ;
       
   178 
       
   179 # Process command-line options
       
   180 &getopts ;
       
   181 
       
   182 # Make any final needed adjustments to globals, after the hard-coded
       
   183 #   values above and any options have been processed.
       
   184 &adjust_all_globals ;
       
   185 
       
   186 # Default to "." if no starting filenames given.
       
   187 # 3-6-98: Nah, decided against it.
       
   188 #@ARGV= ('.') unless @ARGV ;
       
   189 
       
   190 # &add_url() sets $url{$URL} and pushes to @urlstoget, only if not already
       
   191 #   added, plus any other initialization.
       
   192 # Only add a file if it can be accessed with a URL.
       
   193 foreach my $arg (@ARGV) {
       
   194     if ($arg=~ m#^http://#i) {
       
   195         &add_url($arg, '-', 0) ;
       
   196     } else {
       
   197         my($URL)= &filename_to_url($arg, $CWD) ;
       
   198         if (defined($URL)) {
       
   199             &add_url($URL, '-', 0) ;
       
   200         } else {
       
   201 	    die "ERROR: $arg is not accessible through the Web server.\n" ;
       
   202         }
       
   203     }
       
   204 }
       
   205 
       
   206 
       
   207 # Check the URLs, in order.  @urlstoget may grow and rearrange.
       
   208 while (@urlstoget) {
       
   209     my($url)= shift(@urlstoget) ;
       
   210     if ( !$url->{'ishtml'} or !$url->{'islocal'} or $url->{'dontfollow'}
       
   211          or (length($max_depth) and $url->{'depth'} > $max_depth ) ) {
       
   212         &verify_url($url) ;    # may set ishtml=true
       
   213     }
       
   214     if ( $url->{'ishtml'} and $url->{'islocal'} and !$url->{'dontfollow'}
       
   215          and (!length($max_depth) or $url->{'depth'} <= $max_depth ) ) {
       
   216         my($HTML)= &load_url($url) ;  # may set ishtml=false
       
   217         # 11-30-99 JSM: fixed to handle rel URLs in redirected pages correctly
       
   218         my($base_url)= $url->{'location'} || $url->{'URL'} ;
       
   219         &extract_urls($HTML, $base_url, $url->{'URL'}, $url->{'depth'}+1) 
       
   220             if $url->{'ishtml'} ;      # big, calls &add_url()
       
   221     }
       
   222 
       
   223     # If we get an error response that may be corrected with another
       
   224     #   attempt, put it back in the queue.  Such errors include 408,
       
   225     #   503, 504, and the homegrown codes 600, 601, 602, and 603.
       
   226     if ($url->{'status'}=~ /^(408|503|504|600|601|602|603)\b/ ) {
       
   227         push(@urlstoget, $url) if ( $url->{'numtries'} < $MAX_ATTEMPTS ) ;
       
   228     }
       
   229 
       
   230 }
       
   231 
       
   232 &make_report() ;
       
   233 
       
   234 exit ;
       
   235 
       
   236 
       
   237 
       
   238 #----- Process command-line options -----------------------------------
       
   239 
       
   240 # Process any command-line options.
       
   241 sub getopts {
       
   242     my($opt, $param) ;
       
   243     while ($ARGV[0]=~ /^-/) {
       
   244         $opt= shift(@ARGV) ;
       
   245         ($opt, $param)= $opt=~ /^-(.)(.*)/ ;
       
   246 
       
   247         # Turn on verbose reporting
       
   248         if ($opt eq 'v') {
       
   249             $verbose_report= ($param ne '-') ;
       
   250 
       
   251         # User-specified patterns to include ('' to clear list)
       
   252         } elsif ($opt eq 'I') {
       
   253             $param= shift(@ARGV) unless length($param) ;
       
   254             if (length($param)) { push(@INCLUDE_PATTERNS, $param) }
       
   255             else { @INCLUDE_PATTERNS= () }
       
   256 
       
   257         # User-specified patterns to exclude ('' to clear list)
       
   258         } elsif ($opt eq 'X') {
       
   259             $param= shift(@ARGV) unless length($param) ;
       
   260             if (length($param)) { push(@EXCLUDE_PATTERNS, $param) }
       
   261             else { @EXCLUDE_PATTERNS= () }
       
   262 
       
   263         # User-specified response codes to include in report ('' to clear list)
       
   264         } elsif ($opt eq 'i') {
       
   265             $param= shift(@ARGV) unless length($param) ;
       
   266             if (length($param)) { push(@INCLUDE_STATUS, $param) }
       
   267             else { @INCLUDE_STATUS= () }
       
   268 
       
   269         # User-specified response codes to exclude from report ('' to clear list)
       
   270         } elsif ($opt eq 'x') {
       
   271             $param= shift(@ARGV) unless length($param) ;
       
   272             if (length($param)) { push(@EXCLUDE_STATUS, $param) }
       
   273             else { @EXCLUDE_STATUS= () }
       
   274 
       
   275         # Maximum traversal depth
       
   276         } elsif ($opt eq 'd') {
       
   277             $param= shift(@ARGV) unless length($param) ;
       
   278             $max_depth= $param ;
       
   279 
       
   280         # Make it a "file check"-- only read local files, do not use HTTP
       
   281         } elsif ($opt eq 'f') {
       
   282             $file_check= ($param ne '-') ;
       
   283 
       
   284         # Use HTTP for all URL's, even local files
       
   285         } elsif ($opt eq 'h') {
       
   286             $full_http_check= ($param ne '-') ;
       
   287 
       
   288         # Read configuration parameters from srm.conf-like file
       
   289         } elsif ($opt eq 'c') {
       
   290             $param= shift(@ARGV) unless length($param) ;
       
   291             &read_srm_conf($param) ;
       
   292             
       
   293         # Print current configuration parameters
       
   294         } elsif ($opt eq 'q') {
       
   295             &print_config ;
       
   296             exit ;   # jsm-- should we exit?
       
   297 
       
   298         # Allow certain parameters to be defined via the command line
       
   299         } elsif ($opt eq 'D') {
       
   300             $param= shift(@ARGV) unless length($param) ;
       
   301             $debug=1, unshift(@ARGV,$param), next if $param=~ /^-/ ;
       
   302             my($name,$value)= split(/=/, $param, 2) ;
       
   303             $value= 1 unless length($value) ;
       
   304             if ($name=~ /^(LOCAL_HOST|DOCUMENT_ROOT|USER_DIR|DEBUG|debug)$/) {
       
   305                 eval "\$$name= \$value" ;
       
   306                 #$$name= $value ;  # this doesn't work, because of initial my()
       
   307             }
       
   308 
       
   309         } elsif ($opt eq '?') {
       
   310             &usage ;
       
   311 
       
   312         # End command-line option processing on "--"
       
   313         } elsif ($opt eq '-') {
       
   314             return ;
       
   315 
       
   316         } else {
       
   317             print STDERR 
       
   318                 "Illegal option-- '$opt'.  Enter \"$0 -?\" for help.\n" ;
       
   319             exit ;
       
   320         }
       
   321 
       
   322     }
       
   323 
       
   324     if ($file_check and $full_http_check) {
       
   325         print STDERR "You cannot use both the -f and the -h options.\n" ;
       
   326         exit ;
       
   327     }
       
   328 
       
   329 }
       
   330 
       
   331 
       
   332 # Read appropriate values from the given file, typically srm.conf.  If a
       
   333 #   directory is named, default to filename "srm.conf".
       
   334 # Note that opening "-" will open STDIN.
       
   335 sub read_srm_conf {
       
   336     my($fname)= @_ ;
       
   337     local(*SRM) ;
       
   338 
       
   339     # default to srm.conf if only a directory is named
       
   340     if (-d $fname) {
       
   341         $fname=~ s#/$## ;
       
   342         $fname.= "/srm.conf" ;
       
   343     }
       
   344 
       
   345     # Clear old values
       
   346     $DOCUMENT_ROOT= $USER_DIR= '' ;
       
   347     @DIRECTORY_INDEX= @CGI_EXTENSIONS= @SHTML_EXTENSIONS= () ;
       
   348     %ALIAS= %ALIAS_MATCH= %SCRIPT_ALIAS= %SCRIPT_ALIAS_MATCH= () ;
       
   349 
       
   350     open(SRM, "<$fname") || die "Can't open $fname: $!" ;
       
   351     while (<SRM>) {
       
   352         s/#.*// ;
       
   353         next unless /\S/ ;
       
   354         my($name, @param)= /(\S+)/g ;
       
   355 
       
   356         if ($name eq 'DocumentRoot') {
       
   357             $DOCUMENT_ROOT= $param[0] ;
       
   358 
       
   359         } elsif ($name eq 'UserDir') {
       
   360             $USER_DIR= $param[0] ;
       
   361 
       
   362         } elsif ($name eq 'DirectoryIndex') {
       
   363             @DIRECTORY_INDEX= @param ;
       
   364 
       
   365         } elsif ($name eq 'Alias') {
       
   366             $ALIAS{$param[0]}= $param[1] ;
       
   367 
       
   368         } elsif ($name eq 'AliasMatch') {
       
   369             $ALIAS_MATCH{$param[0]}= $param[1] ;
       
   370 
       
   371         } elsif ($name eq 'ScriptAlias') {
       
   372             $SCRIPT_ALIAS{$param[0]}= $param[1] ;
       
   373 
       
   374         } elsif ($name eq 'ScriptAliasMatch') {
       
   375             $SCRIPT_ALIAS_MATCH{$param[0]}= $param[1] ;
       
   376 
       
   377         } elsif ($name eq 'AddHandler') {
       
   378             if ($param[0] eq 'cgi-script') {
       
   379                 push(@CGI_EXTENSIONS, $param[1]) ;
       
   380             } elsif ($param[0] eq 'server-parsed') {
       
   381                 push(@SHTML_EXTENSIONS, $param[1]) ;
       
   382             }
       
   383         }
       
   384 
       
   385     }
       
   386     close(SRM) ;
       
   387 }
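       # A minimal srm.conf fragment that the routine above understands
       #   (hypothetical paths, for illustration only):
       #
       #   DocumentRoot /home/www/htdocs
       #   UserDir public_html
       #   DirectoryIndex index.html index.shtml
       #   Alias /icons/ /usr/local/apache/icons/
       #   ScriptAlias /cgi-bin/ /usr/local/apache/cgi-bin/
       #   AddHandler cgi-script .cgi
       #   AddHandler server-parsed .shtml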
       
   388 
       
   389 
       
   390 # Make any final settings to global variables, after the hard-coded values
       
   391 #   and command-line options have been processed.
       
   392 # Most non-user-configurable globals are also set here.
       
   393 sub adjust_all_globals {
       
   394 
       
   395     # Standardize $USER_DIR to never have trailing slash
       
   396     $USER_DIR=~ s#/$## ;
       
   397 
       
   398     # If no $LOCAL_HOST set, try to read it from first URL in list, or
       
   399     #   use the string 'localhost' if that URL contains no hostname.
       
   400     unless (length($LOCAL_HOST)) {
       
   401         $LOCAL_HOST= (&parse_url($ARGV[0]))[1] || 'localhost' ;
       
   402         print STDERR "LOCAL_HOST set to \"\L$LOCAL_HOST\E\"\n" ;
       
   403     }
       
   404     $LOCAL_HOST= lc($LOCAL_HOST) ;
       
   405 
       
   406     # If no $DOCUMENT_ROOT, try to guess it from $HOME, username, $USER_DIR.
       
   407     unless (length($DOCUMENT_ROOT)) {
       
   408         my($home) ;
       
   409         unless ($home= $ENV{'HOME'}) {
       
   410             my($uname)= getpwuid($<) || $ENV{'USER'} || `whoami` || `id -un` ;
       
   411             chomp($uname) ;
       
   412             &read_home_dirs unless %home_dir ;   # only read when needed
       
   413             $home= $home_dir{$uname} ;
       
   414         }
       
   415         $DOCUMENT_ROOT= "$home/$USER_DIR" ;
       
   416 
       
   417         die "Could not determine DOCUMENT_ROOT; edit the $0 script to set it.\n"
       
   418             unless (-d $DOCUMENT_ROOT) ;
       
   419 
       
   420         print STDERR "DOCUMENT_ROOT set to \"$DOCUMENT_ROOT\"\n" ;
       
   421     }
       
   422     $DOCUMENT_ROOT=~ s#/$## ;
       
   423 
       
   424     # Allows &filename_to_url() to unalias as best as possible.  Note that 
       
   425     #   use of &filename_to_url() can be avoided by the user; see note in 
       
   426     #   that routine.
       
   427     %UN_ALIAS= (reverse (%ALIAS, %SCRIPT_ALIAS) ) ;
       
   428 
       
   429     # These are saved so that &filename_to_url() can later check for equivalence.
       
   430     ($DOC_ROOT_DEV, $DOC_ROOT_INODE)= stat("$DOCUMENT_ROOT/.") ;
       
   431     
       
   432     $DOC_ROOT_EXISTS= -e _ ;
       
   433     
       
   434     # Set CWD from shell variable, else from `pwd`.
       
   435     $CWD= $ENV{'PWD'} || `pwd` || die "couldn't run pwd: $!" ;
       
   436     chomp($CWD) ;
       
   437 
       
   438 
       
   439     # These are used by &extract_urls().
       
   440     # This is a complete list of URL-type attributes defined in HTML 4.0,
       
   441 #   plus any others I found, like nonstandard ones or ones from earlier HTML versions.
       
   442     # Only a few of these are commonly used, as of early 1998.
       
   443     # The set in %html_urls could possibly link to HTML resources, while the
       
   444     #   set in %non_html_urls could not.  The %special(.*) sets, here for 
       
   445     #   reference only, include URL attributes that require special handling.
       
   446 
       
   447     %html_urls= ( 'a'          => [ 'href' ],
       
   448                   'area'       => [ 'href' ],
       
   449                   'frame'      => [ 'src', 'longdesc' ],
       
   450                   'link'       => [ 'href', 'urn' ],
       
   451                   'img'        => [ 'longdesc', 'usemap' ],
       
   452                   'q'          => [ 'cite' ],
       
   453                   'blockquote' => [ 'cite' ],
       
   454                   'ins'        => [ 'cite' ],
       
   455                   'del'        => [ 'cite' ],
       
   456                   'object'     => [ 'usemap' ],
       
   457                   'input'      => [ 'usemap' ],
       
   458                   'iframe'     => [ 'src', 'longdesc' ],
       
   459 		  'ilayer'     => [ 'src' ],
       
   460 		  'layer'      => [ 'src' ],
       
   461 		  'fig'        => [ 'imagemap' ],
       
   462 		  'overlay'    => [ 'imagemap' ],
       
   463 		  'meta'       => [ 'url' ],
       
   464 		  'note'       => [ 'src' ],
       
   465                 ) ;
       
   466 
       
   467     %non_html_urls= ( 'body'    => [ 'background' ], 
       
   468                       'img'     => [ 'src', 'lowsrc', 'dynsrc' ],
       
   469                       'input'   => [ 'src' ],
       
   470                       'script'  => [ 'src', 'for' ],
       
   471 
       
   472 		      'fig'     => [ 'src' ],
       
   473 		      'overlay' => [ 'src' ],
       
   474 		      'select'  => [ 'src' ],
       
   475 		      'ul'      => [ 'src' ],
       
   476 		      'h1'      => [ 'src' ],
       
   477 		      'h2'      => [ 'src' ],
       
   478 		      'h3'      => [ 'src' ],
       
   479 		      'h4'      => [ 'src' ],
       
   480 		      'h5'      => [ 'src' ],
       
   481 		      'h6'      => [ 'src' ],
       
   482 		      'hr'      => [ 'src' ],
       
   483 		      'table'   => [ 'src' ],
       
   484 		      'td'      => [ 'src' ],
       
   485 		      'th'      => [ 'src' ],
       
   486 		      'tr'      => [ 'src' ],
       
   487 
       
   488 		      'bgsound' => [ 'src' ],
       
   489 		      'embed'   => [ 'src' ],
       
   490                   ) ;
       
   491 
       
   492     # %special_urls= ( 'base' => [ 'href' ] ) ;
       
   493     #
       
   494     # %special_html_urls= ( 'object' => [ 'codebase', 'data' ] ) ;
       
   495     #
       
   496     # %special_non_html_urls=
       
   497     #      ( 'head'   => [ 'profile' ],
       
   498     #        'object' => [ 'codebase', 'archive', 'classid' ],
       
   499     #        'applet' => [ 'codebase', 'code', 'object', 'archive' ],
       
   500     #        'form'   => [ 'action', 'script' ]
       
   501     #      ) ;
       
   502 
       
   503 
       
   504     # This is a translation from entity character references to characters, 
       
   505     #   used in &HTMLunescape().
       
   506     # This simplified version only supports &quot; &lt; &gt; &amp;, but that
       
   507     #   should be enough for URL-type attributes.
       
   508     # See http://www.w3.org/TR/REC-html40/sgml/entities.html for full entity 
       
   509     #   list.
       
   510 
       
   511     %e_to_ch= (quot => '"', 
       
   512                'lt' => '<', 
       
   513                'gt' => '>', 
       
   514                amp  => '&') ;
       
   515 
       
   516 }
       
   517 
       
   518 
       
   519 #----------------------------------------------------------------------
       
   520 
       
   521 # Add the URL to our data structures; specifically, to %url and @urlstoget.
       
   522 # Returns a pointer to the structure in %url, or undef if already defined
       
   523 #   or on error.
       
   524 # Currently, this always receives the URL with the host name lowercase,
       
   525 #   either from &absolute_url() or from using $LOCAL_HOST.
       
   526 sub add_url {
       
   527     my($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow)= @_ ;
       
   528 
       
   529     # Allow the user to restrict URL patterns:  URLs must be in 
       
   530     #   @INCLUDE_PATTERNS but not in @EXCLUDE_PATTERNS (but only restrict 
       
   531     #   by @INCLUDE_PATTERNS if it's not empty).
       
   532     return undef if @INCLUDE_PATTERNS &&
       
   533                     !grep( $URL=~ /$_/, @INCLUDE_PATTERNS ) ;
       
   534     return undef if grep( $URL=~ /$_/, @EXCLUDE_PATTERNS ) ;
       
   535 
       
   536     # Canonicalize URL, so we don't get a page multiple times
       
   537     $URL= &canonicalize($URL) ;
       
   538     
       
   539     # for obscure case involving a <form action=___.cgi>-extracted URL being
       
   540     #   overwritten by <a href=___.cgi> extraction (don't fret over this)
       
   541     $url{$URL}{'dontfollow'}&&= $dontfollow if $url{$URL} ;
       
   542 
       
   543     # Don't add the record a second time!  Otherwise we would traverse infinitely.
       
   544     return undef if $url{$URL} ;  # or add to @referers, for 301 correction...?
       
   545 
       
   546     # Only HTTP URLs are currently supported
       
   547     return undef unless $URL=~ /^http:/i ;
       
   548 
       
   549     # Any self-referral here indicates a bug in the program.  It's happened.
       
   550     die "PROGRAM ERROR: $URL shows its first referer as itself.\n"
       
   551         if $referer eq $URL ;
       
   552 
       
   553     my(%u) ;
       
   554     @u{qw(URL referer depth ishtml iscgi dontfollow)}= 
       
   555         ($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow) ;
       
   556     $u{'islocal'}= ($URL=~ m#^http://\Q$LOCAL_HOST\E/#io) + 0 ; # make length>0
       
   557     if ($u{'islocal'}) {
       
   558 #        $u{'filename'}= &url_to_filename($URL) ;
       
   559         @u{'filename', 'location'}= &url_to_filename($URL) ;
       
   560         $u{'iscgi'}= &is_cgi($u{'filename'}, $URL)  if $u{'iscgi'} eq '' ;
       
   561 
       
   562 	# 2-27-00 JSM:  Detect ishtml by filename, not -T test.
       
   563 	if ( $u{'ishtml'} eq '' ) {
       
   564 	    $u{'ishtml'}=  $HTML_BY_NAME
       
   565 		?  ( !$u{'iscgi'} && -e $u{'filename'} && 
       
   566 		     $u{'filename'}=~ /\.html?$/i ) + 0
       
   567 		:  (!$u{'iscgi'} && -e $u{'filename'} && -T _) + 0 ;
       
   568 	}
       
   569 #        $u{'ishtml'}= (!$u{'iscgi'} && -e $u{'filename'} && -T _) + 0
       
   570 #            unless length($u{'ishtml'}) ;
       
   571 
       
   572     }
       
   573 
       
   574     #  If we're only doing a file check, don't add URLs that require HTTP
       
   575     return undef if ($file_check and (!$u{'islocal'} or $u{'iscgi'}) ) ;
       
   576 
       
   577     push(@urlstoget, \%u) ;
       
   578     $url{$URL}= \%u ;
       
   579 
       
   580     # return \%u ;   # unneeded because of previous statement
       
   581 }
       
   582 
       
   583 
       
   584 # Guess if a file is a CGI script or not.  Returns true if the (regular) file
       
   585 #   is executable, has one of @CGI_EXTENSIONS, or if the URL is in a 
       
   586 #   ScriptAlias'ed directory.
       
   587 # $fname must be absolute path, but $URL is optional (saves time if available).
       
   588 # Note that URLs like "/path/script.cgi?a=b" are handled correctly-- the
       
   589 #   previously extracted filename is tested for CGI-ness, while the URL is
       
   590 #   checked for ScriptAlias matching (which is unaffected by final query
       
   591 #   strings or PATH_INFO).
       
   592 sub is_cgi {
       
   593     my($fname, $URL)= @_ ;
       
   594     return 1 if (-x $fname && ! -d _ ) ;   # should we really do this?
       
   595     foreach (@CGI_EXTENSIONS) { return 1 if $fname=~ /\Q$_\E$/i }
       
   596 
       
   597     $URL= &filename_to_url($fname) unless length($URL) ;  # currently unused
       
   598     my($URLpath)= $URL=~ m#^http://[^/]*(.*)#i ;
       
   599     foreach (keys %SCRIPT_ALIAS)        { return 1 if $URLpath=~ /^\Q$_\E/ }
       
   600     foreach (keys %SCRIPT_ALIAS_MATCH)  { return 1 if $URLpath=~ /^$_/ }
       
   601 
       
   602     return 0 ;
       
   603 }
       
   604    
       
   605 
       
   606 # Put the URL in such a form that two URLs that point to the same resource
       
   607 #   have the same URL, to avoid superfluous retrievals.
       
   608 # Host name is lowercased elsewhere-- this routine is only called from
       
   609 #   &add_url; see note there.  To lowercase the host name here would be
       
   610 #   inefficient.
       
   611 sub canonicalize {
       
   612     my($URL)= @_ ;
       
   613 
       
   614     $URL=~ s/#.*// ;    # remove any "#" fragment from end of URL
       
   615 
       
   616     return $URL ;
       
   617 }
       
   618 
       
   619 
       
   620 #----- File reading/downloading routines (includes networking) --------
       
   621 
       
   622 # Verify that a URL exists, and set $url->{'status'} accordingly.  Do
       
   623 #   this either by checking the local filesystem or by using the HTTP HEAD 
       
   624 #   method for remote sites or CGI scripts.
       
   625 # Set $url->{'ishtml'} accordingly if discovered from Content-Type:.
       
   626 # This does not support various Redirect directives in srm.conf.
       
   627 sub verify_url {
       
   628     my($url)= @_ ;
       
   629 
       
   630     print STDERR "verifying $url->{'URL'}\n" if $debug ;
       
   631 
       
   632 
       
   633     # Depending on the state of $url->{islocal, iscgi, dontfollow} and
       
   634     #   $full_http_check, take appropriate actions to check/set the
       
   635     #   status code for this URL.
       
   636     
       
   637     # NOTE: In some situations, specifically when checking a CGI script
       
   638     #   named in a <form action> (thus implying that dontfollow is set),
       
   639     #   and using HTTP to check the URL (because the script is remote or
       
   640     #   $full_http_check is set), the HTTP response code may not be
       
   641     #   accurate.  This is because there is no form data sent with the
       
   642     #   request, as there normally would be.  In these cases, a cautionary
       
   643     #   note is appended to $url->{'status'}.  Additionally, an empty 
       
   644     #   $url->{'status'} is changed to an explanatory note (maybe we should
       
   645     #   do that in load_url() too?).
       
   646 
       
   647     # Use HEAD if file is remote, or if $full_http_check is set.
       
   648     if (!$url->{'islocal'} or $full_http_check) {
       
   649         &load_url_using_HTTP($url, 'HEAD') ;
       
   650         $url->{'status'}= '[no status returned]'
       
   651             unless length($url->{'status'}) ;
       
   652         $url->{'status'}.= ' (NOTE: Form was not submitted normally)'
       
   653             if $url->{'dontfollow'} ;
       
   654 
       
   655     # URL is local:  If it's not CGI, do a normal local file check
       
   656     } elsif (!$url->{'iscgi'}) {
       
   657         $url->{'status'}= (-e $url->{'filename'})  
       
   658             ? "200 Local File Exists"  : "404 File Not Found" ;
       
   659 
       
   660     # URL is local CGI:  Use HEAD unless dontfollow is set
       
   661     } elsif (!$url->{'dontfollow'}) {
       
   662         &load_url_using_HTTP($url, 'HEAD') ;
       
   663 
       
   664     # Else it's a local CGI with dontfollow set:  Check for executable file
       
   665     } else {
       
   666         $url->{'status'}= 
       
   667              (! -e $url->{'filename'})  ? "404 File Not Found"
       
   668            : (! -x $url->{'filename'})  ? "403 Local File Is Not Executable"
       
   669            :                              "200 Local Executable File Exists" ;
       
   670 
       
   671     }
       
   672         
       
   673 
       
   674 # Old verify routine below:
       
   675 #
       
   676 #    # If is a local non-CGI file, check it directly from the filesystem
       
   677 #    if ($url->{'islocal'} and !$url->{'iscgi'} and !$full_http_check) {
       
   678 #        $url->{'status'}= (-e $url->{'filename'})  
       
   679 #            ? "200 Local File Exists"  : "404 File Not Found" ;
       
   680 #
       
   681 #    # Otherwise, download its HEAD from its HTTP server
       
   682 #    } else {
       
   683 #        &load_url_using_HTTP($url, 'HEAD') ;
       
   684 #    }
       
   685 
       
   686 
       
   687 }
       
   688 
       
   689 
       
   690 
       
   691 # Load entire file/resource and return its contents, setting $url->{'status'}
       
   692 #    accordingly.  Do this either by checking the local filesystem or by 
       
   693 #    using the HTTP GET method for remote sites or CGI scripts.
       
   694 # Set $url->{'ishtml'} accordingly if discovered from Content-Type:.
       
   695 # This does not support various Redirect directives in srm.conf.
       
   696 sub load_url {
       
   697     my($url)= @_ ;
       
   698     my($HTML) ;
       
   699 
       
   700     print STDERR "loading $url->{'URL'}\n" if $debug ;
       
   701 
       
   702     # If is a local non-CGI file, read it directly from the filesystem
       
   703     if ($url->{'islocal'} and !$url->{'iscgi'} and !$full_http_check) {
       
   704         my($iscgi) ;
       
   705         ($HTML, $url->{'ssierrs'}, $iscgi)= 
       
   706             &read_expanded_file($url->{'filename'}, $url->{'URL'}) ;
       
   707         $url->{'status'}= 
       
   708             !defined($HTML)
       
   709                   ? sprintf("450 Can't read file: %s (%s)", $!, $!+0)
       
   710             : @{$url->{'ssierrs'}}
       
   711                   ? sprintf("451 SSI Error(s) (%s total)",
       
   712                             scalar @{$url->{'ssierrs'}})
       
   713 
       
   714             :       "200 Local File Read OK" ;
       
   715 
       
   716         # $url->{'iscgi'} may be set if an SHTML file included CGI calls.
       
   717         # Don't set it if we're doing a file check, in which case we'll
       
   718         #   keep whatever $HTML we could get.
       
   719         $url->{'iscgi'}= $iscgi unless $file_check ;
       
   720     }
       
   721 
       
   722     # Otherwise (or if reclassified as a CGI above), download the resource from its HTTP server
       
   723     if (!$url->{'islocal'} or $url->{'iscgi'} or $full_http_check) {
       
   724         (undef, undef, $HTML)= &load_url_using_HTTP($url, 'GET') ;
       
   725     }
       
   726 
       
   727     # Note that this will be set even when URL is to be reloaded, like
       
   728     #   for a 601 (timeout) response.
       
   729     $url->{'hasbeenloaded'}= 1 ;
       
   730     return $HTML ;
       
   731 }
       
   732 
       
   733 
       
   734 # Read a local file and return its contents.  If a file is SSI (aka SHTML),
       
   735 #   expand any SSI <!--#include--> directives as needed, recursively 
       
   736 #   including nested files.
       
   737 # This is used for all local reads, SHTML or not, but the vast bulk of this
       
   738 #   routine is for SHTML files.
       
   739 #
       
   740 # If file is SHTML, this routine also returns a structure of error data,
       
   741 #   and a boolean saying if this file needs to be downloaded via HTTP
       
   742 #   for a complete check (e.g. includes CGI calls).
       
   743 #
       
   744 # $fname must be canonicalized absolute path, but $URL parameter is optional.
       
   745 # %$parents contains all "include"-ancestors of the file, to prevent loops.
       
   746 #   If omitted, assumes no ancestors (and a fresh hash is started).
       
   747 #
       
   748 # This routine seems much bigger and more complex than it needs to be.
       
   749 #   It could be one third the size and much simpler if we didn't have to
       
   750 #   worry about full error reporting on nested includes.
       
   751 #
       
   752 # Note: This routine was made to mimic what Apache would return to a client.
       
   753 #   However, the result differs from Apache's in two slight ways, both
       
   754 #   involving nested SSI within <!--#include file="..." -->, and both
       
   755 #   apparent bugs in Apache 1.1 (may be fixed in later versions):
       
   756 #
       
   757 #   1) If a <file="..."> value contains no "/" (i.e. in current directory),
       
   758 #      then Apache always parses the included file as SHTML, regardless of
       
   759 #      extension.  This routine checks @SHTML_EXTENSIONS for all included
       
   760 #      files.
       
   761 #   2) If a <file="..."> value containing a "/" loads an SHTML file  
       
   762 #      containing a <virtual="..."> tag with a relative path, the directive
       
   763 #      fails in Apache.  This routine tries to guess the correct path/URL.
       
   764 #
       
   765 #
       
   766 # Notes on this routine, and SHTML files in general:
       
   767 #
       
   768 # At first thought, it seems like we could load each included file
       
   769 # only once, instead of once for every file that includes it.
       
   770 # However, because of the fact that relative URLs are resolved
       
   771 # relative to the top-level including file, the top-level file will
       
   772 # need to be expanded every time.  (It's legal (though of questionable
       
   773 # wisdom) to include a file from e.g. both /a/index.shtml and
       
   774 # /b/index.shtml, so links from the included file point to different
       
   775 # URLs.)
       
   776 #
       
   777 # Note that while URLs in included files (e.g. <a href="...">) are
       
   778 # resolved relative to the top-level including file, nested include tags
       
   779 # are resolved relative to the direct includer.
       
   780 #
       
   781 # We could possibly be more efficient in time (but costly in memory)
       
   782 # by storing the expanded contents and $errlist of each included file,
       
   783 # since those will be constant (except $errlist's include-loop
       
   784 # reporting might vary somewhat).  There are probably other ways to
       
   785 # eke out savings of time and memory, at the cost of complexity.
       
   786 #
       
   787 # The main loop here is inside of an s/// statement.  Unusual, but it's an
       
   788 # appropriate way to handle the recursion. Recursion is needed, since each
       
   789 # included file may or may not be SHTML.
       
   790 #
       
   791 # $iscgi is set if a file includes "<!--#exec", or if it contains an
       
   792 #   <!--#include virtual="..." --> tag that points to a CGI file, or if 
       
   793 #   any of its include-children sets $iscgi.
       
   794 #
       
   795 #
       
   796 # Notes to help clarify data structures, if (God forbid) you have to modify 
       
   797 # this routine:
       
   798 #
       
   799 # Each error is a list of files in an "include chain", and $errlist is a
       
   800 # list of errors.  $errlist is associated with the current $HTML.  Each
       
   801 # error in $errlist is associated with some tag in $HTML, as iterated in
       
   802 # the s/// loop.  When this routine returns ($HTML, $errlist), the
       
   803 # errors in $errlist should all have as their first element tags that
       
   804 # were found in $HTML.
       
   805 #
       
   806 # Each error's final element (the "leaf") is associated with a tag that
       
   807 # tried to load an invalid file.  Each leaf will always be the complete 
       
   808 # set of errors for that tag (i.e. it has no children, since it couldn't
       
   809 # load the file).
       
   810 #
       
   811 # If the file can be validly loaded, then we may have 0 or multiple errors
       
   812 # associated with this file/tag (and returned from this routine).  Each
       
   813 # file's $errlist is an accumulation of all of its children's $errlist's.
       
   814 #
       
   815 # Errors that come from loading a child are associated with the child's
       
   816 # $HTML and tags.  Before adding them to the parent's (current) $errlist,
       
   817 # they must have the CHILD's path/tag unshifted onto the front of the 
       
   818 # include chain; all errors are then added to the current $errlist.
       
   819 # This ensures that:
       
   820 #   a) The errors are now all associated with the tag that loaded the child.
       
   821 #   b) The errors in the current $errlist are always associated with tags
       
   822 #      from the current $HTML.
       
   823 #
       
   824 
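       # For example (hypothetical path), a directive like
       #   <!--#include virtual="/includes/footer.shtml" -->
       # is replaced in place by the expanded contents of that file, with the
       # "virtual" path resolved against the including document's URL.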
       
   825 sub read_expanded_file {
       
   826     my($fname, $URL, $parents)= @_ ;
       
   827     my($HTML, $errlist, $iscgi) ;
       
   828     my($isshtml) ;
       
   829 
       
   830     $parents->{$fname}= 1 ;
       
   831 
       
   832     $HTML= &read_file($fname) ;
       
   833     return(undef) unless defined($HTML) ;
       
   834 
       
   835     foreach (@SHTML_EXTENSIONS) {
       
   836         $isshtml= 1, last if ($fname=~ /\Q$_\E$/i) ;
       
   837     }
       
   838 
       
   839     if ($isshtml) {
       
   840         $errlist= [] ;
       
   841         $iscgi= ($HTML=~ /<!--#exec\b/i) ;
       
   842 
       
   843         $HTML=~ s{(<!--#include\b.*?>)} {
       
   844             do {
       
   845                 my($childfname, $childURL) ;
       
   846                 my($childHTML, $childerrlist, $childiscgi) ;
       
   847                 my($tagst) = $1 ;
       
   848                 my($tag)= &parse_tag($tagst) ;
       
   849               GET_CHILD: {
       
   850                   if (length($tag->{'attrib'}{'virtual'})) {
       
   851                       $URL= &filename_to_url($fname) unless length($URL) ;
       
   852                       $childURL= 
       
   853                           &absolute_url($tag->{'attrib'}{'virtual'}, $URL) ;
       
   854                       ($childfname)= &url_to_filename($childURL) ;
       
   855 
       
   856                       # If it's a CGI, don't follow it, but no error either
       
   857                       if (&is_cgi($childfname, $childURL)) {
       
   858                           $iscgi= 1 ;
       
   859                           last GET_CHILD ;
       
   860                       }
       
   861 
       
   862                   } elsif (length($tag->{'attrib'}{'file'})) {
       
   863                       $childfname= $tag->{'attrib'}{'file'} ;
       
   864                       if ($childfname=~ m#^(/|~)#) {
       
   865                           push(@$errlist, [ {'path' => $childfname, 
       
   866                                              'tag' => $tagst, 'errmsg' =>
       
   867                                   'Absolute paths are not allowed in '
       
   868                                 . '<!--#include file="..." -->.'}]);
       
   869                           last GET_CHILD ;
       
   870                       }
       
   871                       if ($childfname=~ m#\.\.(/|$)#) {
       
   872                           push(@$errlist, [ {'path' => $childfname, 
       
   873                                              'tag' => $tagst, 'errmsg' =>
       
   874                                   'Paths can not contain "../" in '
       
   875                                 . '<!--#include file="..." -->.'}]);
       
   876                           last GET_CHILD ;
       
   877                       }
       
   878                       $childfname= ($fname=~ m#(.*/)#)[0] . $childfname ;
       
   879 
       
   880                   } else {
       
   881                       push(@$errlist, [ {'path' => '',
       
   882                                          'tag' => $tagst, 'errmsg' =>
       
   883                               'Tag must contain either the "file" or '
       
   884                             . '"virtual" attribute.'}]);
       
   885                       last GET_CHILD ;
       
   886 
       
   887                   }
       
   888 
       
   889                   # canonicalize filename for %$parents
       
   890                   1 while $childfname=~ s#/\.(/|$)#/# ;
       
   891                   1 while $childfname=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ;
       
   892 
       
   893                   # Guarantee that file exists, is regular, and is readable
       
   894                   unless (-e $childfname) {
       
   895                       push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
       
   896                                         'errmsg' => 'File not found'} ] ) ;
       
   897                       last GET_CHILD ;
       
   898                   }
       
   899                   unless (-f $childfname) {
       
   900                       push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
       
   901                                         'errmsg' => 'File is not a regular'
       
   902                                             . ' file.' } ] ) ;
       
   903                       last GET_CHILD ;
       
   904                   }
       
   905                   unless (-r $childfname) {
       
   906                       push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
       
   907                                         'errmsg' => 'File is not readable by'
       
   908                                             . ' current user.' } ] ) ;
       
   909                       last GET_CHILD ;
       
   910                   }
       
   911 
       
   912                   # Guard against include loops
       
   913                   if ($parents->{$childfname}) {
       
   914                       push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
       
   915                                         'errmsg' => 'An "include" loop exists'
       
   916                                             . ' involving this file.' } ] ) ;
       
   917                       last GET_CHILD ;
       
   918                   }
       
   919 
       
   920 
       
   921                   # Get the included file, with any error data
       
   922                   ($childHTML, $childerrlist, $childiscgi)= 
       
   923                       &read_expanded_file($childfname, $childURL, $parents) ;
       
   924 
       
   925                   # Log if there was any error reading the file
       
   926                   push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
       
   927                                     'errmsg' => "Can't read file: $!." } ] )
       
   928                       unless defined($childHTML) ;
       
   929 
       
   930                   # Add any errors to the current (parent) error list
       
   931                   foreach my $error (@$childerrlist) {
       
   932                       unshift(@$error, 
       
   933                               { 'path' => $childfname, 'tag' => $tagst } ) ;
       
   934                   }
       
   935                   push(@$errlist, @$childerrlist) ;
       
   936 
       
   937                   # Parent is a CGI if any of its children is a CGI
       
   938                   $iscgi||= $childiscgi ;
       
   939             
       
   940               }   # GET_CHILD
       
   941 
       
   942 
       
   943               $childHTML ;   # final value to replace in main s/// construct
       
   944 
       
   945           }   # do {}
       
   946 
       
   947         }gie ;   # $HTML=~ s{} {}
       
   948 
       
   949     } # if ($isshtml)
       
   950 
       
   951     delete $parents->{$fname} ;
       
   952 
       
   953     return($HTML, $errlist, $iscgi) ;
       
   954 }
       
   955 
       
   956 
       
   957 
       
   958 # Returns the contents of the named file, or undef on error.
       
   959 sub read_file {
       
   960     my($fname)= @_ ;
       
   961     local(*F, $/) ;
       
   962  
       
   963     undef $/ ;
       
   964     open(F, "<$fname") || return undef ;
       
   965     my($ret)= <F> ;
       
   966     close(F) ;
       
   967     
       
   968     return $ret ;
       
   969 }
       
   970 
       
   971 
       
   972 # Try to get the given URL with the given HTTP method, and return the
       
   973 #   status line, headers, and body.
       
   974 # Set $url->{'status'} accordingly, and set $url->{'ishtml'} accordingly
       
   975 #   if Content-Type: header is returned.
       
   976 # This is specific to this program, and calls the more general &get_url().
       
   977 # This could be slightly more efficient if 302 or 303 was handled in the
       
   978 #   calling routine, where it could take advantage of a new URL being local.
       
   979 sub load_url_using_HTTP {
       
   980     my($url, $method)= @_ ;
       
   981     my($status_line, $headers, $body) ;
       
   982 
       
   983     # We should not get here if $file_check is set
       
   984     die "mistakenly called load_url_using_HTTP($url->{'URL'})" if $file_check ;
       
   985 
       
   986     GETFILE: {
       
   987         ($status_line, $headers, $body)=
       
   988             &get_url( ($url->{'location'} || $url->{'URL'}), $method) ;
       
   989 
       
   990         # If HEAD failed (as on some servers), sigh and use GET
       
   991         ($status_line, $headers, $body)=
       
   992             &get_url( ($url->{'location'} || $url->{'URL'}), 'GET')
       
   993                 unless length($status_line) ;
       
   994 
       
   995         ($url->{'status'})=  $status_line=~ m#^HTTP/[\d.]+\s+(.*)# ;
       
   996 
       
   997 	# 2-27-00 JSM:  Allow old NCSA servers to not include the HTTP version.
       
   998 	if ($SUPPORT_NCSA_BUG and $url->{'status'} eq '') {
       
   999 	    ($url->{'status'})=  $status_line=~ m#^HTTP(?:/[\d.]+)?\s+(.*)# ;
       
  1000 	}
       
  1001 
       
  1002         # Record the Location: header if status is 301, 302, or 303; follow it for 302 or 303
       
  1003         if ($url->{'status'}=~ /^(301|302|303)\b/) {
       
  1004             ($url->{'location'})= $headers=~ m#^Location:[ \t]+(\S+)#im ;
       
  1005             last GETFILE unless length($url->{'location'}) ;
       
  1006             $url->{'location'}= 
       
  1007                 &absolute_url($url->{'location'}, $url->{'URL'}) ;
       
  1008             redo GETFILE
       
  1009                 if    ($url->{'status'}=~ /^(302|303)\b/)
       
  1010                    && (++$url->{'numredirects'} <= $MAX_REDIRECTS) ;
       
  1011         }
       
  1012     }
       
  1013 
       
  1014     $url->{'numtries'}++ ;
       
  1015     $url->{'lasttried'}= time ;
       
  1016 
       
  1017     # If successful response included Content-Type:, set ishtml accordingly
       
  1018     $url->{'ishtml'}= (lc($1) eq 'text/html') + 0
       
  1019         if $url->{'status'}=~ /^2/
       
  1020            and $headers=~ m#^content-type:[ \t]*(\S+)#im ;
       
  1021 
       
  1022     print STDERR "status: $status_line\n" if $debug ;
       
  1023 
       
  1024     return($status_line, $headers, $body) ;
       
  1025 }
       
  1026 
       
  1027 
       
  1028 # Request the HTTP resource at the given absolute URL using the given method,
       
  1029 #   and return the response status line, headers, and body.
       
  1030 # jsm-- in the future, this should support downloading to a file, in case
       
  1031 #   the download is too large to fit in memory.
       
  1032 sub get_url {
       
  1033     my($URL, $method)= @_ ;
       
  1034     my($host, $uri, $endhost) ;
       
  1035     my($S, $rin) ;
       
  1036     my($response, $status_line, $headers, $body, $status_code) ;
       
  1037     my($content_length) ;
       
  1038     $method= uc($method) ;
       
  1039     $method= 'GET' unless length($method) ;
       
  1040 
       
  1041     ($host, $uri)= $URL=~ m#^http://([^/]*)(.*)$#i ;
       
  1042     $uri= '/' unless length($uri) ;
       
  1043     $endhost= $host ;
       
  1044 
       
  1045     # use an HTTP proxy if $ENV{'http_proxy'} is set
       
  1046     USEPROXY: {
       
  1047         last USEPROXY unless $host=~ /\./ ;
       
  1048         if (length($ENV{'http_proxy'})) {
       
  1049             foreach (@NO_PROXY) {
       
  1050                 last USEPROXY if $host=~ /$_$/i ;
       
  1051             }
       
  1052             ($host)= $ENV{'http_proxy'}=~ m#^(?:http://)?([^/]*)#i ;
       
  1053             $uri= $URL ;
       
  1054         }
       
  1055     }
       
  1056 
       
  1057     # Open socket
       
  1058     $S= IO::Socket::INET->new(PeerAddr => $host,  # may contain :port
       
  1059                               PeerPort => 80,     # default if none in PeerAddr
       
  1060                               Proto => 'tcp') ;
       
  1061     return("HTTP/1.1 600 Can't create socket: $@") unless defined($S) ;
       
  1062     $S->autoflush() ;   # very important!!
       
  1063 
       
  1064     # Send HTTP 1.1 request
       
  1065     print $S "$method $uri HTTP/1.1\015\012",
       
  1066              "Host: $endhost\015\012",
       
  1067              "Connection: close\015\012",
       
  1068              "User-agent: CheckLinks/$CL_VERSION\015\012",
       
  1069              "\015\012" ;
       
  1070 
       
  1071     # Wait for socket response with select()
       
  1072     vec($rin= '', fileno($S), 1)= 1 ;
       
  1073     select($rin, undef, undef, 60) 
       
  1074         || return("HTTP/1.1 601 Connection timed out") ;
       
  1075 
       
  1076     local($/)= "\012" ;
       
  1077 
       
  1078     # Handle "100 Continue" responses for HTTP 1.1: loop until non-1xx.
       
  1079     do {
       
  1080         $status_line= <$S> ;
       
  1081         $status_line=~ s/\015?\012$// ;
       
  1082         ($status_code)= $status_line=~ m#^HTTP/\d+\.\d+\s+(\d+)# ;
       
  1083 
       
  1084         $headers= '' ;
       
  1085         while (<$S>) {
       
  1086             last if /^\015?\012/ ;
       
  1087             $headers.= $_ ;
       
  1088         }
       
  1089         $headers=~ s/\015?\012[ \t]+/ /g ;
       
  1090     } until $status_code!~ /^1/ ;
       
  1091 
       
   1092     # Body length is determined by the HTTP 1.1 spec, section 4.4:  first
        
   1093     #   certain conditions implying no body, then chunked encoding, then the
        
   1094     #   Content-Length: header, then the server closing the connection.
       
  1095     if ($method eq 'HEAD' or $status_code=~ /^(1|204\b|304\b)/) {
       
  1096         $body= undef ;
       
  1097 
       
  1098     # else chunked encoding
       
  1099     } elsif ($headers=~ /^transfer-encoding:[ \t]*chunked\b/im) {
       
  1100         # 7-16-99:  Old code was only saving last chunk.  Fix using
       
  1101         #   $this_chunk contributed by Mark Trotter.
       
  1102         my($this_chunk, $chunk_size, $readsofar, $thisread) ;
       
  1103         while ($chunk_size= hex(<$S>)) {
       
  1104             $readsofar= 0 ;
       
  1105             while ($readsofar!=$chunk_size) {
       
  1106                 last unless $thisread=
       
  1107                     read($S, $this_chunk, $chunk_size-$readsofar, $readsofar) ;
       
  1108                 $readsofar+= $thisread ;
       
  1109             }
       
  1110             return("HTTP/1.1 603 Incomplete chunked response", $headers, $body)
       
  1111                 if $readsofar!=$chunk_size ;
       
  1112             $_= <$S> ;    # clear CRLF after chunk
       
  1113             $body.= $this_chunk ;
       
  1114         }
       
  1115 
       
  1116         # Read footers if they exist
       
  1117         while (<$S>) {
       
  1118             last if /^\015?\012/ ;
       
  1119             $headers.= $_ ;
       
  1120         }
       
  1121         $headers=~ s/\015?\012[ \t]+/ /g ;
       
  1122 
       
  1123 
       
  1124     # else body length given in Content-length:
       
  1125     } elsif (($content_length)= $headers=~ /^content-length:[ \t]*(\d+)/im) {
       
  1126         my($readsofar, $thisread) ;
       
  1127         while ($readsofar!=$content_length) {
       
  1128             last unless $thisread=
       
  1129                 read($S, $body, $content_length-$readsofar, $readsofar) ;
       
  1130             $readsofar+= $thisread ;
       
  1131         }
       
  1132         return(sprintf("HTTP/1.1 602 Incomplete response (%s of %s bytes)",
       
  1133                        $readsofar+0, $content_length),
       
  1134                $headers, $body)
       
  1135             if $readsofar!=$content_length ;
       
  1136 
       
  1137 
       
  1138     # else body is entire socket output
       
  1139     } else {
       
  1140         local($/)= undef ;
       
  1141         $body= <$S> ;
       
  1142     }
       
  1143 
       
  1144     close($S) ;
       
  1145 
       
  1146     return($status_line, $headers, $body) ;
       
  1147 }
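
        # Illustrative sketch, added for documentation only-- not called anywhere
        #   in this script, and the URL and sub name below are hypothetical.
        #   Shows how get_url()'s three return values are typically used.
        sub _example_get_url {
            my($status_line, $headers, $body)=
                &get_url('http://www.example.com/index.html', 'GET') ;
            my($status_code)= $status_line=~ m#^HTTP(?:/[\d.]+)?\s+(\d+)# ;
            if ($status_code=~ /^2/) {
                # success-- $body holds the entity body (undef for HEAD, 1xx, 204, 304)
            } elsif ($status_code=~ /^(301|302|303)\b/) {
                my($location)= $headers=~ m#^Location:[ \t]+(\S+)#im ;
                # follow $location if desired
            }
            return $status_code ;
        }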
       
  1148 
       
  1149 
       
  1150 #----- URL-parsing routines -------------------------------------------
       
  1151 
       
  1152 # The routines parse_url(), unparse_url(), and absolute_url() are based on
       
  1153 #   different sections in the Internet Draft "Uniform Resource Identifiers
       
  1154 #   (URI): Generic Syntax and Semantics", 11-18-97, by Berners-Lee,
       
  1155 #   Fielding, and Masinter, filename draft-fielding-uri-syntax-01.
       
  1156 
       
  1157 # Parse a URL into its components, according to URI draft, sections 4.3, 4.4.
       
  1158 # This regular expression is straight from Appendix B, modified to use Perl 5.
       
  1159 # Returns scheme, site, path, query, and fragment.  All but path may have
       
  1160 #   the undefined value.
       
  1161 sub parse_url {
       
  1162     my($URL)= @_ ;
       
  1163     my($scheme, $site, $path, $query, $fragment)=
       
  1164         ($URL=~ m{^(?:    ([^:/?\#]+):)?
       
  1165                    (?: // ([^/?\#]*))?
       
  1166                           ([^?\#]*)
       
  1167                    (?: \? ([^\#]*))?
       
  1168                    (?: \# (.*))?
       
  1169                  }x
       
  1170         ) ;
       
  1171         
       
  1172         
       
  1173     # Un-URL-encode the path, to equivalate things like %7E --> ~
       
  1174     # Note that in some situations, this may cause problems with URLs that
       
  1175     #   contain the % character: if the unescaped URL is then used in 
       
  1176     #   relative URL calculation, it may be unescaped again (rare).
       
  1177     $path=~ s/\+/ /g ;
       
  1178     $path=~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge ;
       
  1179 
       
  1180     # Note that in HTTP, the presence of a host implies a path beginning with
       
  1181     #   '/', so $path should be '/' for URLs like "http://www.somehost.com"
       
  1182     $path= '/' if !length($path) && length($site) && lc($scheme) eq 'http' ;
       
  1183 
       
  1184     return($scheme, $site, $path, $query, $fragment) ;
       
  1185 
       
  1186 }
       
  1187 
       
  1188 
       
  1189 # Returns a full URL string, given its components
       
  1190 # The full procedure is described in the URI draft, section 5.2, step 7.
       
  1191 sub unparse_url {
       
  1192     my($scheme, $site, $path, $query, $fragment)= @_ ;
       
  1193     my($URL) ;
       
  1194 
       
  1195     $URL= "$scheme:"    if defined($scheme) ;
       
  1196     $URL.= "//$site"    if defined($site) ;
       
  1197     $URL.= $path ;
       
  1198     $URL.= "?$query"    if defined($query) ;
       
  1199     $URL.= "#$fragment" if defined($fragment) ;
       
  1200 
       
  1201     return $URL ;
       
  1202 }
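
        # Illustrative sketch, added for documentation only-- not called anywhere;
        #   the URL and sub name are hypothetical.  Shows the components that
        #   parse_url() returns, and that unparse_url() reassembles them.  Note the
        #   path comes back unescaped, so the round trip is only exact when the
        #   path contains no %-escapes or '+' characters.
        sub _example_parse_unparse_url {
            my($scheme, $site, $path, $query, $fragment)=
                &parse_url('http://www.example.com/docs/page.html?q=1#top') ;
            # $scheme='http', $site='www.example.com', $path='/docs/page.html',
            # $query='q=1', $fragment='top'
            return &unparse_url($scheme, $site, $path, $query, $fragment) ;
            # gives back 'http://www.example.com/docs/page.html?q=1#top'
        }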
       
  1203 
       
  1204 
       
  1205 # Returns a canonicalized absolute URL, given a relative URL and a base URL.
       
  1206 # The full procedure is described in the URI draft, section 5.2.
       
  1207 # Note that a relative URI of "#fragment" should be resolved to "the current
       
  1208 #   document", not to an absolute URL.  This presents a quandary for this
       
  1209 #   routine:  should it always return an absolute URL, thus violating the
       
  1210 #   spec, or should it not always return an absolute URL, thus requiring any
       
  1211 #   caller to check for this special case?  This routine leaves that up to
       
  1212 #   the caller, with $return_rel_fragment-- if set, stick to the spec;
       
  1213 #   otherwise, always return an absolute URL.  See section G.4 of the draft.
       
  1214 # Note that the pathname reduction in steps 6.c-f messes up any PATH_INFO
       
  1215 #   that has ./ or ../ in it, which may be a bug in the spec.
       
  1216 sub absolute_url {
       
  1217     my($relurl, $baseurl, $return_rel_fragment)= @_ ;
       
  1218     my(@relurl, @baseurl) ;
       
  1219 
       
  1220     # parse_url() returns scheme, site, path, query, fragment
       
  1221     @relurl= &parse_url($relurl) ;      # Step 1
       
  1222     @baseurl= &parse_url($baseurl) ;
       
  1223 
       
  1224     COMBINE: {
       
  1225 
       
  1226         # Step 2
       
  1227         # See note above about $return_rel_fragment
       
  1228         if (  $relurl[2] eq '' && 
       
  1229               !defined($relurl[0]) &&
       
  1230               !defined($relurl[1]) &&
       
  1231               !defined($relurl[3]) ) {
       
  1232             @relurl[0..3]= @baseurl[0..3] ;
       
  1233             return $relurl if $return_rel_fragment ;   # see note above
       
  1234             last COMBINE ;
       
  1235         }
       
  1236 
       
  1237         last COMBINE if defined($relurl[0]) ;    # Step 3
       
  1238         $relurl[0]= $baseurl[0] ;
       
  1239 
       
  1240         last COMBINE if defined($relurl[1]) ;    # Step 4
       
  1241         $relurl[1]= $baseurl[1] ;
       
  1242 
       
  1243         last COMBINE if $relurl[2]=~ m#^/# ;     # Step 5
       
  1244 
       
  1245         # Step 6-- resolve relative path
       
  1246         my($path)= $baseurl[2]=~ m#^(.*/)# ;     # Step 6.a
       
  1247         $relurl[2]= $path . $relurl[2] ;         # Step 6.b
       
  1248         
       
  1249     } # COMBINE
       
  1250 
       
  1251     # Put the remaining steps outside of the block to canonicalize the path.
       
  1252     # Arguably, this is not allowed.  To avoid such arguments at the expense of
       
  1253     #   path canonicalization, put steps 6.c-f back in the COMBINE block.
       
  1254 
       
  1255     1 while $relurl[2]=~ s#(^|/)\./#$1# ;    # Step 6.c
       
  1256     $relurl[2]=~ s#(^|/)\.$#$1# ;            # Step 6.d
       
  1257 
       
  1258     # Step 6.e
       
  1259     my($oldpath) ;
       
  1260     while ($relurl[2]=~ s#(([^/]+)/\.\./)# ($2 eq '..')  ? $1  : '' #ge) {
       
  1261         last if ($relurl[2] eq $oldpath) ;
       
  1262         $oldpath= $relurl[2] ;
       
  1263     }
       
  1264 
       
  1265     # Step 6.f
       
  1266     $relurl[2]=~ s#(([^/]+)/\.\.$)# ($2 eq '..')  ? $1  : '' #ge ;
       
  1267 
       
  1268     # Step 6.g: allow leading ".." segments to remain in path
       
  1269     # Step 6.h: relurl[2] is already the buffer string
       
  1270 
       
  1271     # To canonicalize further, lowercase the hostname (is this valid for all
       
  1272     #   schemes?)
       
  1273     $relurl[1]= lc($relurl[1]) if defined($relurl[1]) ;
       
  1274 
       
  1275     return &unparse_url(@relurl) ;                  # Step 7
       
  1276 }
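
        # Illustrative sketch, added for documentation only-- not called anywhere;
        #   the base URL and sub name are hypothetical.  Shows a few resolutions
        #   that absolute_url() produces.
        sub _example_absolute_url {
            my($base)= 'http://www.example.com/a/b/page.html' ;
            &absolute_url('pic.gif', $base) ;
                # 'http://www.example.com/a/b/pic.gif'
            &absolute_url('../other.html', $base) ;
                # 'http://www.example.com/a/other.html'
            &absolute_url('#sec2', $base, 1) ;
                # '#sec2'  (left relative because $return_rel_fragment is set)
            return &absolute_url('#sec2', $base) ;
                # 'http://www.example.com/a/b/page.html#sec2'
        }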
       
  1277 
       
  1278 
       
  1279 
       
  1280 # Convert a local URL into a canonicalized absolute path, or undef if
       
  1281 #   not on this host or other error.
       
  1282 # Result should only be used as filename.
       
  1283 # Supports UserDir (e.g. public_html) for "/~username/path/file" URLs.
       
  1284 # Supports Alias, AliasMatch, ScriptAlias, and ScriptAliasMatch from srm.conf
       
  1285 #   (but note use of Perl regex's instead of standard regex's).
       
  1286 # Inserts index.html, etc. (from @DIRECTORY_INDEX) if result is a directory,
       
  1287 #   but just return directory name (ending in '/') if none of those exists.
       
  1288 # Removes PATH_INFO, if any, from filename.
       
  1289 # Directory names are always returned with trailing slash (which would not
       
  1290 #   be appropriate if PATH_INFO was to be retained).
       
   1291 # While this routine makes some tests (e.g. if the file is a directory),
        
   1292 #   it does not verify that the file at the resulting $filename exists.
       
  1293 # Note that not all URLs point to files, so this routine is not always
       
  1294 #   appropriate.  In this program, the result from this routine is only
       
  1295 #   used when we know the URL is not a CGI script (and is therefore a file),
       
  1296 #   except in &is_cgi() itself, which tests if a file is a CGI script.
       
  1297 #   If it weren't for &is_cgi(), we could ignore cases when the URL isn't
       
  1298 #   a file.
       
  1299 # 12-1-99 JSM:  Changed to also return "redirected" location, in case URL
       
  1300 #   is a directory but not ending in a slash, so relative URLs will resolve
       
  1301 #   correctly against the redirected URL.
       
  1302 sub url_to_filename {
       
  1303     my($URL)= @_ ;
       
  1304     my($URLpath, $path, $location, $docroot, $user) ;
       
  1305     return undef unless $URL=~ m#^http://\Q$LOCAL_HOST\E/#io ;
       
  1306     $URLpath= (&parse_url($URL))[2] ;
       
  1307     die "couldn't get path from [$URL]" unless length($URLpath) ;
       
  1308 
       
  1309     # Note minor security hole:  if this script is run setuid, then any 
       
  1310     #   file on the system could be read by using an ALIAS to point to the 
       
  1311     #   file.  Note also that if a $URLpath such as "/alias/dir/../.." is
       
  1312     #   passed to this routine, the alias will be substituted BEFORE the
       
   1313 #   ".." path segments are traversed.  A case like this is probably a
       
  1314     #   mistake in the URL anyway.
       
  1315 
       
  1316     # Make no more than one alias substitution-- is there a precedence order?
       
  1317     # Note that %(.*)_MATCH use Perl regex's, not standard regex's.
       
  1318     # 3-29-99 JSM:  These all alias to actual directory, not to a resulting
       
  1319     #   URL, so no further conversion should be done if one of these matches.
       
  1320     # 3-29-99 JSM:  Changed ALIAS_MATCH and SCRIPT_ALIAS_MATCH blocks to
       
  1321     #   allow $1-type substitution in targets; MUST TEST!
       
  1322     ALIAS: {
       
  1323         foreach (keys %ALIAS) 
       
  1324             { $path= $URLpath, last ALIAS 
       
  1325                   if $URLpath=~ s/^\Q$_\E/$ALIAS{$_}/ }
       
  1326         foreach (keys %ALIAS_MATCH) 
       
  1327             { $path= $URLpath, last ALIAS 
       
  1328                   if eval "\$URLpath=~ s/^\$_/$ALIAS_MATCH{$_}/" }
       
  1329         foreach (keys %SCRIPT_ALIAS) 
       
  1330             { $path= $URLpath, last ALIAS 
       
  1331                   if $URLpath=~ s/^\Q$_\E/$SCRIPT_ALIAS{$_}/ }
       
  1332         foreach (keys %SCRIPT_ALIAS_MATCH) 
       
  1333             { $path= $URLpath, last ALIAS 
       
  1334                   if eval "\$URLpath=~ s/^\$_/$SCRIPT_ALIAS_MATCH{$_}/" }
       
  1335     }
       
  1336 
       
  1337 
       
  1338     # If $path has been set in above ALIAS block, no further conversion is
       
  1339     #   needed.  
       
  1340     if ($path eq '') {
       
  1341 
       
  1342         # Must check for ^/.. before PATH_INFO check, in case $URL's path
       
  1343         #   is e.g. '/../conf/access.conf'
       
  1344         return undef if $URLpath=~ m#^/\.\.(/|$)# ;   # ^/.. is not allowed
       
  1345 
       
  1346         # Set $docroot and $path for this file, based on the URL (contains '~' ?)
       
  1347         if (length($USER_DIR) and ($user,$path)= $URLpath=~ m#^/~([^/]+)(.*)# ) {
       
  1348             &read_home_dirs unless %home_dir ;   # only read when needed
       
  1349             return undef unless length($home_dir{$user}) ;
       
  1350             $docroot= "$home_dir{$user}/$USER_DIR" ;
       
  1351             $path= '/' unless length($path) ;
       
  1352         } else {
       
  1353             # If we have no $DOCUMENT_ROOT, we can't handle URLs without ~.
       
  1354             return undef unless $DOC_ROOT_EXISTS ;
       
  1355             $docroot= $DOCUMENT_ROOT ;
       
  1356             $path= $URLpath ;
       
  1357         }
       
  1358 
       
  1359         # Handle PATH_INFO: remove path segments until an existing file is named.
       
  1360         # Note that directories cannot have PATH_INFO after them.
       
  1361         unless (-e "$docroot$path") {
       
  1362             for (my($path2)= $path ; $path2=~ m#/# ; $path2=~ s#/[^/]*$##) {
       
   1363                 last if -d "$docroot$path2" ;
        
   1364                 $path= $path2, last if -e "$docroot$path2" ;
        
   1365             }
       
  1366         }
       
  1367 
       
  1368         # canonicalize path, and recheck for ^/.. (handles an obscure error,
       
  1369         #   when $URL's path is e.g. '/a/b/../../..'; but must canonicalize
       
  1370         #   after PATH_INFO check, in case path is e.g. '/a/b.cgi/../../..').
       
  1371         1 while $path=~ s#/\.(/|$)#/# ;
       
  1372         1 while $path=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ;
       
  1373         return undef if $path=~ m#^/\.\.(/|$)# ;   # ^/.. is not allowed
       
  1374 
       
  1375         $path= "$docroot$path" ;
       
  1376 
       
  1377     }
       
  1378 
       
  1379 
       
  1380     # Add "index.html", etc. if appropriate
       
  1381     if (-d $path) {
       
  1382         $path.= '/' unless $path=~ m#/$# ;
       
  1383         # 12-1-99 JSM: set "redirected" location also
       
  1384         $location= "$URL/" unless $URL=~ m#/$# ;
       
  1385         foreach (@DIRECTORY_INDEX) {
       
  1386             $path.= $_, last if -f "$path$_" ;
       
  1387         }
       
  1388     }
       
  1389     return ($path, $location) ;
       
  1390 }
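
        # Illustrative sketch, added for documentation only-- not called anywhere.
        #   Assumes the hypothetical settings $LOCAL_HOST='www.example.com' and
        #   $DOCUMENT_ROOT='/usr/local/htdocs', that no Alias directives match,
        #   and that 'index.html' is in @DIRECTORY_INDEX and exists in the
        #   directory named below.
        sub _example_url_to_filename {
            my($path, $location)=
                &url_to_filename('http://www.example.com/docs') ;
            # $path     would be '/usr/local/htdocs/docs/index.html'
            # $location would be 'http://www.example.com/docs/'  (trailing-slash redirect)
            return $path ;
        }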
       
  1391 
       
  1392 
       
  1393 # Convert a local (possibly relative) pathname into a canonicalized URL.
       
  1394 # If filename is relative and no $basepath is given, assume it's in the 
       
  1395 #   current directory.
       
  1396 # Supports UserDir (e.g. public_html) for "/~username/path/file" URLs.
       
  1397 # Each path segment is checked to see if it's the same as $DOCUMENT_ROOT,
       
  1398 #   by comparing inodes.  When a match is found, it's cut off the front,
       
  1399 #   and an absolute URL is constructed.  If $DOCUMENT_ROOT is never matched,
       
  1400 #   then $USER_DIR is scanned for.  If that doesn't match (i.e. the file 
       
  1401 #   is not served to the Web), undef is returned.
       
  1402 # Note that $DOC_ROOT_DEV and $DOC_ROOT_INODE are set at the start of the
       
  1403 #   program for efficiency, but are an integral part of this routine.
       
  1404 # %ALIAS is supported using %UN_ALIAS, as best as possible.  See next note
       
  1405 #   on avoiding use of this routine.
       
  1406 # This is currently only used when parsing command-line filenames, and when
       
  1407 #   an <!--#include file="..." --> includes an <!--#include virtual="..." -->
       
  1408 #   (which may be an error anyway).  Thus, it can be avoided if needed, such
       
  1409 #   as when complex aliasing makes results ambiguous.
       
  1410 # jsm-- should this add/remove @DIRECTORY_INDEX, to avoid some duplication?
       
  1411 # 3-29-99 JSM:  Changed UNALIAS handling-- if it's unaliased, then no other
       
  1412 #   conversion is necessary.
       
  1413 sub filename_to_url {
       
  1414     my($path, $basepath)= @_ ;
       
  1415     my($URLpath) ;
       
  1416     unless ($path=~ m#^/#) {
       
  1417         $basepath= $CWD unless length($basepath) ;
       
  1418         $basepath.= '/' if -d $basepath && $basepath!~ m#/$# ;
       
  1419         $basepath=~ s#/[^/]*$#/# ;
       
  1420         $path= "$basepath$path"  ;
       
  1421     }
       
  1422 
       
  1423     # canonicalize filename by removing ./ and ../ where appropriate
       
  1424     1 while $path=~ s#/\.(/|$)#/# ;
       
  1425     1 while $path=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ;
       
  1426 
       
  1427     # canonicalize directory to include final /
       
  1428     $path.= '/' if -d $path && $path!~ m#/$# ;
       
  1429 
       
  1430     # First, if path can be unaliased, return that.
       
  1431     # (Added 3-29-99 by JSM.)
       
  1432     foreach (keys %UN_ALIAS) 
       
  1433         { $URLpath= $path, last if $path=~ s/^\Q$_\E/$UN_ALIAS{$_}/ }
       
  1434     return "http://\L$LOCAL_HOST\E$URLpath" if $URLpath ne '' ;
       
  1435 
       
  1436     # Then, check if file is under $DOCUMENT_ROOT tree, and convert if so.
       
  1437     if ($DOC_ROOT_EXISTS) {
       
  1438         my($doc_root)= $path ;
       
  1439         while ($doc_root=~ s#/[^/]*$##) {
       
  1440             my($dev,$inode)= stat("$doc_root/.") ;
       
  1441             if ( ($dev==$DOC_ROOT_DEV) && ($inode==$DOC_ROOT_INODE) ) {
       
  1442                 $path=~ s/^$doc_root// ;
       
  1443 #                foreach (keys %UN_ALIAS) 
       
  1444 #                    { last if $path=~ s/^\Q$_\E/$UN_ALIAS{$_}/ }
       
  1445                 return "http://\L$LOCAL_HOST\E$path" ;
       
  1446             }
       
  1447         }
       
  1448     }
       
  1449 
       
  1450     # Handle possible case of "~username/$USER_DIR/$path"
       
  1451     # I don't think %ALIAS applies here, does it?
       
  1452     # This misses some when $HOME/$USER_DIR points through a symbolic link,
       
  1453     #   and $CWD isn't set to match %dir_to_user.  Work around by avoiding
       
  1454     #   this routine, e.g. using only URLs on command line.
       
  1455     if (length($USER_DIR)) {
       
  1456         if ($path=~ m#^(.*?)/$USER_DIR(/.*)# ) {
       
  1457             # First, see if path is in %dir_to_user
       
  1458             &read_home_dirs unless %dir_to_user ;   # only read when needed
       
  1459             return "http://\L$LOCAL_HOST\E/~$dir_to_user{$1}$2"
       
  1460                 if length($dir_to_user{$1}) ;
       
  1461 
       
  1462             # If not, then we must check inodes to equivalate directories
       
  1463             &read_inode_to_user unless %inode_to_user ; # only read when needed
       
  1464             my($dev,$inode)= stat("$1/.") ;
       
  1465             return "http://\L$LOCAL_HOST\E/~$inode_to_user{$dev}{$inode}$2"
       
  1466                 if length($inode_to_user{$dev}{$inode}) ;
       
  1467         }
       
  1468     }
       
  1469 
       
  1470     return undef ;
       
  1471 }
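
        # Illustrative sketch, added for documentation only-- not called anywhere.
        #   Assumes the hypothetical settings $LOCAL_HOST='www.example.com',
        #   $DOCUMENT_ROOT='/usr/local/htdocs' (existing on disk, since the routine
        #   compares inodes via stat), $USER_DIR='public_html', a user "alice" with
        #   home directory /home/alice, and no aliases in %UN_ALIAS.
        sub _example_filename_to_url {
            &filename_to_url('/usr/local/htdocs/docs/page.html') ;
                # 'http://www.example.com/docs/page.html'
            return &filename_to_url('/home/alice/public_html/pic.gif') ;
                # 'http://www.example.com/~alice/pic.gif'
        }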
       
  1472 
       
  1473 
       
  1474 
       
   1475 # Reads all users' home directories into %home_dir, from /etc/passwd.
       
  1476 # Also creates %dir_to_user, which is faster than %inode_to_user (below).
       
  1477 # Only used when $USER_DIR is used, for "/~username/path/file" URLs.
       
  1478 # 2-27-00 JSM:  Changed to use getpwent, instead of reading /etc/passwd.
       
  1479 sub read_home_dirs {
       
  1480     my($user, $homedir) ;
       
  1481 
       
  1482     setpwent ;   # to rewind, in case getpwent has already been used
       
  1483     while ( ($user, $homedir)= (getpwent)[0,7] ) {
       
  1484         $home_dir{$user}= $homedir ;
       
  1485         $dir_to_user{$homedir}= $user
       
  1486             unless $dir_to_user{$homedir} ne '' ;
       
  1487     }
       
  1488     endpwent ;   # clean way to end getpwent processing
       
  1489 }
       
  1490 
       
  1491 
       
  1492 # Reads home directory inode information into %inode_to_user, from /etc/passwd.
       
  1493 # Because this is time-consuming, it is only called if needed, and only once.
       
  1494 # Only used when $USER_DIR is used, for "/~username/path/file" URLs.
       
  1495 # On SPARCstation-10 with 3000 /etc/passwd records, this takes ~2 seconds.
       
  1496 # 2-27-00 JSM:  Changed to use already-existing %home_dir, instead of reading
       
  1497 #   /etc/passwd again.
       
  1498 sub read_inode_to_user {
       
  1499     my($user, $homedir) ;
       
  1500     my($dev, $inode) ;
       
  1501     &read_home_dirs unless %home_dir ;   # only read when needed
       
  1502 
       
  1503     while ( ($user, $homedir)= each %home_dir ) {
       
  1504         ($dev,$inode)= stat("$homedir/.") ;
       
  1505         $inode_to_user{$dev}{$inode}= $user 
       
  1506             unless $inode_to_user{$dev}{$inode} ne '' ;
       
  1507     }
       
  1508 }
       
  1509 
       
  1510 
       
  1511 
       
  1512 #----- Extracting URLs from HTML ------------------------------------
       
  1513 
       
  1514 
       
  1515 # Parse an SGML tag, and return a hash structure with a "name" scalar and
       
  1516 #   an "attrib" hash.
       
  1517 # Parses first tag in string, ignoring all surrounding text.
       
  1518 # Results are canonicalized to lower case wherever case-insensitive.
       
  1519 sub parse_tag {
       
  1520     my($tag)= @_ ;       # will be mangled
       
  1521     my($tagname,%attrib) ;
       
  1522 
       
  1523     # only parse first tag in string
       
  1524     ($tag)= split(/>/, $tag) ;  # remove all after >
       
  1525     $tag=~ s/^([^<]*<)?\s*// ;  # remove pre-<, <, and leading blanks
       
  1526     ($tagname,$tag)= split(/\s+/, $tag, 2) ;        # split out tag name
       
  1527 
       
  1528     # Extract name/value (possibly quoted), lowercase name, set $attrib{}.
       
  1529     # If quoted, is delimited by quotes; if not, delimited by whitespace.
       
  1530     $attrib{lc($1)}= &HTMLunescape($+)
       
  1531         while ($tag=~ s/\s*(\w+)\s*=\s*(([^"']\S*)|"([^"]*)"?|'([^']*)'?)//) ;
       
  1532 
       
  1533     # now, get remaining non-valued (boolean) attributes
       
  1534     $tag=~ s/^\s*|\s*$//g ;             # skip leading/trailing blanks
       
  1535     foreach (split(/\s+/, $tag)) {
       
  1536         $_= lc($_) ;
       
  1537         $attrib{$_}= $_ ;   # booleans have values equal to their name
       
  1538     }
       
  1539 
       
  1540     return { 'name'   => lc($tagname), 
       
  1541              'attrib' => \%attrib       } ;
       
  1542 }
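
        # Illustrative sketch, added for documentation only-- not called anywhere;
        #   the tag and sub name are hypothetical.  Shows the structure that
        #   parse_tag() returns.
        sub _example_parse_tag {
            my($tag)= &parse_tag('<A HREF="../foo.html" TARGET=_top NOWRAP>') ;
            # $tag->{'name'}             eq 'a'
            # $tag->{'attrib'}{'href'}   eq '../foo.html'
            # $tag->{'attrib'}{'target'} eq '_top'
            # $tag->{'attrib'}{'nowrap'} eq 'nowrap'   (boolean attribute)
            return $tag ;
        }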
       
  1543 
       
  1544 
       
  1545 # Unescape any HTML character references and return resulting string.
       
  1546 # Support entity character references in %e_to_ch (which is incomplete),
       
   1547 #   plus "&#ddd;" (decimal) and "&#xhh;" (hex) forms for values < 256.
       
   1548 # Note that leaving a character reference undecoded is erroneous, in that a
       
  1549 #   subsequent re-escaping will not return the original string, because
       
  1550 #   of the ampersand.  Nonetheless, that's preferable to losing the data.
       
  1551 #   Q:  Is there an appropriate general way to represent an unescaped string?
       
  1552 sub HTMLunescape {
       
  1553     my($s)= @_ ;
       
  1554 
       
  1555     # Try alpha, decimal, and hex representations, only substituting if valid
       
  1556     $s=~ s/&(([a-zA-Z][a-zA-Z0-9.-]*);?|#([0-9]+);?|#[Xx]([0-9a-fA-F]+);?)/
       
  1557               length($2)   ? ( defined($e_to_ch{$2}) ? $e_to_ch{$2}  : "&$1" )
       
  1558             : length($3)   ? ( $3 < 256              ? chr($3)       : "&$1" )
       
  1559             : length($4)   ? ( hex($4) < 256         ? chr(hex($4))  : "&$1" )
       
  1560                            :   "&$1"
       
  1561           /ge ;
       
  1562 
       
  1563     return $s ;
       
  1564 }
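
        # Illustrative sketch, added for documentation only-- not called anywhere;
        #   the strings and sub name are hypothetical, and the first line assumes
        #   %e_to_ch maps 'amp' to '&'.
        sub _example_HTMLunescape {
            &HTMLunescape('a &amp; b') ;       # 'a & b'      (entity in %e_to_ch)
            &HTMLunescape('A&#66;C') ;         # 'ABC'        (decimal reference)
            &HTMLunescape('caf&#xE9;') ;       # "caf\xE9"    (hex reference, Latin-1 e-acute)
            return &HTMLunescape('&bogus;') ;  # '&bogus;'    (unknown-- left alone)
        }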
       
  1565 
       
  1566 
       
  1567 # Given a block of HTML, extracts all URLs referenced in it, and adds them
       
  1568 #   to our data structures to be downloaded or checked (i.e. calls 
       
  1569 #   &add_url()).
       
  1570 # Note that %html_urls and %non_html_urls are set at the start of the
       
  1571 #   program for efficiency, but are an integral part of this routine.
       
  1572 # Currently, this extracts all <.*?> patterns, which may not be valid if
       
  1573 #   "<" or ">" characters are e.g. inside a <script> element.
       
  1574 sub extract_urls {
       
  1575     my($HTML, $baseurl, $referer, $depth)= @_ ;
       
  1576     my(@tags) ;
       
  1577 
       
  1578     # Remove comments before extracting links, as pointed out by Tim Hunter.
       
  1579     $HTML=~ s/<!--.*?--.*?>//gs ;
       
  1580 
       
  1581     # We must look for <base> tag before all the work, so we must parse
       
  1582     #   all tags first.  :(  Therefore, we save this large array of
       
  1583     #   structures for efficiency, hoping we don't run out of memory.
       
  1584     my($i)= -1 ;  # to start at array element 0
       
  1585 
       
  1586     foreach ($HTML=~ /(<.*?>)/gs) {
       
  1587         $tags[++$i]= &parse_tag($_) ;
       
  1588         $baseurl= $tags[$i]{'attrib'}{'href'}
       
  1589             if ($tags[$i]{'name'} eq 'base') 
       
  1590                 and (length($tags[$i]{'attrib'}{'href'})) ;
       
  1591     }
       
  1592 
       
  1593     # For each tag, call &add_url() for each URL in the tag
       
  1594     foreach my $tag (@tags) {
       
  1595         next if $tag->{'name'}=~ m#^/# ;
       
  1596 
       
  1597         # Handle the "regular" tag-attributes, in %html_urls and %non_html_urls
       
  1598 
       
  1599         foreach (@{$html_urls{$tag->{'name'}}}) {
       
  1600             &add_url(&absolute_url($tag->{'attrib'}{$_}, $baseurl), 
       
  1601                      $referer, $depth) 
       
  1602                 if length($tag->{'attrib'}{$_}) ;
       
  1603         }
       
  1604 
       
  1605         foreach (@{$non_html_urls{$tag->{'name'}}}) {
       
  1606             &add_url(&absolute_url($tag->{'attrib'}{$_}, $baseurl), 
       
  1607                      $referer, $depth, 0) 
       
  1608                 if length($tag->{'attrib'}{$_}) ;
       
  1609         }
       
  1610 
       
  1611         # Now handle each tag-attribute that needs special attention
       
  1612 
       
  1613         if ($tag->{'name'} eq 'object') {
       
  1614             my($codebase)=
       
  1615                 &absolute_url($tag->{'attrib'}{'codebase'}, $baseurl) ;
       
  1616 
       
  1617             &add_url(&absolute_url($tag->{'attrib'}{'data'}, $codebase),
       
  1618                      $referer, $depth)
       
  1619                 if length($tag->{'attrib'}{'data'}) ;
       
  1620 
       
  1621             # There seems to be a contradiction between the HTML 4.0 spec
       
  1622             #   section 13.3.3 and RFC 1808 section 4 step 2b regarding
       
  1623             #   the URL resolution of e.g. classid="java:program.start".
       
  1624             #   For now, we'll stick with the RFC 1808 method.
       
  1625             &add_url(&absolute_url($tag->{'attrib'}{'classid'}, $codebase),
       
  1626                      $referer, $depth, 0)
       
  1627                 if length($tag->{'attrib'}{'classid'}) ;
       
  1628 
       
  1629             # <object> tag's "archive" attribute is a space-separated list
       
  1630             foreach (split(/\s+/, $tag->{'attrib'}{'archive'})) {
       
  1631                 &add_url(&absolute_url($_, $codebase), $referer, $depth, 0) ;
       
  1632             }
       
  1633 
       
  1634         } elsif ($tag->{'name'} eq 'head') {
       
  1635             # "profile" attribute is a space-separated list
       
  1636             foreach (split(/\s+/, $tag->{'attrib'}{'profile'})) {
       
  1637                 &add_url(&absolute_url($_, $baseurl), $referer, $depth, 0) ;
       
  1638             }
       
  1639 
       
  1640         } elsif ($tag->{'name'} eq 'applet') {
       
  1641             my($codebase)=
       
  1642                 &absolute_url($tag->{'attrib'}{'codebase'}, $baseurl) ;
       
  1643 
       
  1644             &add_url(&absolute_url($tag->{'attrib'}{'code'}, $codebase),
       
  1645                      $referer, $depth, 0)
       
  1646                 if length($tag->{'attrib'}{'code'}) ;
       
  1647 
       
  1648             &add_url(&absolute_url($tag->{'attrib'}{'object'}, $codebase),
       
  1649                      $referer, $depth, 0)
       
  1650                 if length($tag->{'attrib'}{'object'}) ;
       
  1651 
       
  1652             # <applet> tag's "archive" attribute is a comma-separated list
       
  1653             foreach (split(/\s*,\s*/, $tag->{'attrib'}{'archive'})) {
       
  1654                 &add_url(&absolute_url($_, $codebase), $referer, $depth, 0) ;
       
  1655             }
       
  1656 
       
  1657         # Check form script for existence only, but don't follow hyperlinks.
       
  1658         # Handles the unlikely case when a CGI is referred to by both a
       
  1659         #   <form action=___.cgi> tag and an <a href=___.cgi> tag: If a CGI
       
  1660         #   is only pointed to by <form action>, then don't follow the link
       
  1661         #   (i.e. set $url->{'dontfollow'} to verify, not load).  If a CGI
       
  1662         #   is called by at least one <a href> tag, then do follow the link.
       
  1663         } elsif ($tag->{'name'} eq 'form') {
       
  1664             &add_url(&absolute_url($tag->{'attrib'}{'action'}, $baseurl),
       
  1665                      $referer, $depth, undef, 1, 1)
       
  1666                 if length($tag->{'attrib'}{'action'}) ;
       
  1667 
       
  1668         }   # if ($tag->{'name'} eq '...')
       
  1669 
       
  1670 
       
  1671     }   # foreach $tag (@tags)
       
  1672 
       
  1673 }   # &extract_urls()
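
        # Illustrative sketch, added for documentation only-- not called anywhere;
        #   the sub name is hypothetical.  Shows the tag-scanning technique that
        #   extract_urls() uses-- pull out every <...> pattern, parse each with
        #   parse_tag(), and resolve against a <base href> if one is seen.  Unlike
        #   extract_urls(), this version only collects <a href> URLs, applies a
        #   <base> only to tags that follow it, and has no side effects on %url.
        sub _example_scan_tags {
            my($HTML, $baseurl)= @_ ;
            my(@found) ;
            $HTML=~ s/<!--.*?--.*?>//gs ;      # strip comments first
            foreach ($HTML=~ /(<.*?>)/gs) {
                my($tag)= &parse_tag($_) ;
                $baseurl= $tag->{'attrib'}{'href'}
                    if $tag->{'name'} eq 'base' and length($tag->{'attrib'}{'href'}) ;
                push(@found, &absolute_url($tag->{'attrib'}{'href'}, $baseurl))
                    if $tag->{'name'} eq 'a' and length($tag->{'attrib'}{'href'}) ;
            }
            return @found ;
        }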
       
  1674 
       
  1675 
       
  1676 
       
  1677 #----- output routines ------------------------------------------------
       
  1678 
       
  1679 # Generate final report
       
  1680 sub make_report {
       
  1681     if ($verbose_report) { &make_verbose_report }
       
  1682     else                 { &make_short_report }
       
  1683 }
       
  1684 
       
  1685 
       
  1686 
       
  1687 # Generate a verbose report of all URLs that have been checked
       
  1688 sub make_verbose_report {
       
  1689     my($rootlist)= join("\n    ", @ARGV) ;
       
  1690     my($numurls)= scalar keys %url ;
       
  1691 
       
  1692     print <<EOH ;
       
  1693 ======================================================================
       
  1694 Report of invalid links on $LOCAL_HOST, 
       
  1695 recursively followed starting with the files/URLs:
       
  1696 
       
  1697     $rootlist
       
  1698 
       
  1699 EOH
       
  1700 
       
  1701     if (@INCLUDE_PATTERNS) {
       
  1702         my($includelist)= join("\n    ", @INCLUDE_PATTERNS) ;
       
  1703         print <<EOH ;
       
  1704 Only including URLs matching at least one of the following patterns:
       
  1705     $includelist
       
  1706 
       
  1707 EOH
       
  1708     }
       
  1709 
       
  1710     if (@EXCLUDE_PATTERNS) {
       
  1711         my($excludelist)= join("\n    ", @EXCLUDE_PATTERNS) ;
       
  1712         print <<EOH ;
       
  1713 Excluding URLs matching any of the following patterns:
       
  1714     $excludelist
       
  1715 
       
  1716 EOH
       
  1717     }
       
  1718 
       
  1719     print "Total URLs checked: $numurls\n\n" ;
       
  1720 
       
  1721     print( "Only reporting response codes beginning with: ", 
       
  1722            join(", ", @INCLUDE_STATUS), "\n" )
       
  1723         if @INCLUDE_STATUS ;
       
  1724 
       
  1725     print( "Excluding response codes beginning with: ",
       
  1726            join(", ", @EXCLUDE_STATUS), "\n" )
       
  1727         if @EXCLUDE_STATUS ;
       
  1728 
       
  1729     print "Maximum traversal depth: $max_depth\n" if length($max_depth) ;
       
  1730     print "Only local files were checked; no CGI scripts were invoked.\n"
       
  1731         if $file_check ;
       
  1732     print "Local URLs were read from the filesystem where possible.\n"
       
  1733         unless $full_http_check ;
       
  1734 
       
  1735     print "\n" ;
       
  1736 
       
  1737     # Only report status errors if there are any.
       
  1738     # Using grep()s is slightly inefficient, but a *lot* cleaner.
       
  1739     my($has_statuserrs) ;
       
  1740     foreach my $URL (sort keys %url) {
       
  1741         next if @INCLUDE_STATUS &&
       
  1742                 !grep($url{$URL}{'status'}=~ /^$_/, @INCLUDE_STATUS) ;
       
  1743         next if grep($url{$URL}{'status'}=~ /^$_/, @EXCLUDE_STATUS) ;
       
  1744 
       
  1745         $has_statuserrs=1, last ;
       
  1746     }
       
  1747 
       
  1748 
       
  1749     if ($has_statuserrs) {
       
  1750         print <<EOH ;
       
  1751 ======================================================================
       
  1752 
       
  1753 RESULTS FROM ALL URLS WITH SELECTED RESPONSE STATUS CODES
       
  1754 ------------------------------------------------------------
       
  1755 
       
  1756 EOH
       
  1757 
       
  1758         foreach (sort keys %url) {
       
  1759             my($u)= $url{$_} ;
       
  1760 
       
  1761             next if @INCLUDE_STATUS &&
       
  1762                     !grep($u->{'status'}=~ /^$_/, @INCLUDE_STATUS) ;
       
  1763             next if grep($u->{'status'}=~ /^$_/, @EXCLUDE_STATUS) ;
       
  1764 
       
  1765             print "$u->{'URL'}\n", '-' x 50, "\n" ;
       
  1766             print "Status:   $u->{'status'}\n" ;
       
  1767             print "Moved to: $u->{'location'}\n" if $u->{'location'} ;
       
  1768             print "Depth:    $u->{'depth'}\n" ;
       
  1769 
       
  1770             for (my($URL)= $u->{'referer'}, my($tab)= '  ' ;
       
  1771                  length($url{$URL}) ;
       
  1772                  $URL= $url{$URL}{'referer'}, $tab.= '  ' )
       
  1773             {
       
  1774                 print "${tab}referred by $url{$URL}{'URL'}\n" ;
       
  1775                 die "PROGRAM ERROR: apparent infinite referer loop.\n"
       
  1776                     if length($tab)>200 ;      # simple-minded sanity check
       
  1777             }
       
  1778 
       
  1779             print "\n\n" ;
       
  1780         }  # URL
       
  1781 
       
  1782     }
       
  1783 
       
  1784 
       
  1785 
       
  1786     # Only report SSI errors if there are any
       
  1787     my($has_ssierrs) ;
       
  1788     foreach (sort keys %url) { 
       
  1789         $has_ssierrs=1, last if @{$url{$_}{'ssierrs'}} ;
       
  1790     }
       
  1791 
       
  1792 
       
  1793     if ($has_ssierrs) {
       
  1794         print <<EOH ;
       
  1795 ======================================================================
       
  1796 
       
  1797 PROBLEMS WITH SERVER-SIDE INCLUDE (AKA SHTML) DIRECTIVES
       
  1798 ------------------------------------------------------------
       
  1799 
       
  1800 EOH
       
  1801         foreach (sort keys %url) {
       
  1802             my($u)= $url{$_} ;
       
  1803             if (@{$u->{'ssierrs'}}) {
       
  1804                 print "$u->{'URL'}\n", '-' x 50, "\n" ;
       
  1805                 printf "Total %s SSI Errors:\n",  $#{$u->{'ssierrs'}}+1 ;
       
  1806                 foreach my $i (0..$#{$u->{'ssierrs'}}) {
       
  1807                     printf "  %s) ", $i+1 ;
       
  1808                     my($tab)= ' ' x (4+length($i+1)) ;
       
  1809                     my($tab2) ;
       
  1810                     foreach my $level (@{$u->{'ssierrs'}[$i]}) {
       
  1811                         print "${tab2}file:   $level->{'path'}\n" ;
       
  1812                         print "${tab}in tag: $level->{'tag'}\n" ;
       
  1813                         print "${tab}error:  $level->{'errmsg'}\n" 
       
  1814                             if $level->{'errmsg'} ;
       
  1815                         $tab.= '  ' ;
       
  1816                         $tab2= $tab ;
       
  1817                     }
       
  1818                 }
       
  1819                 print "\n\n" ;
       
  1820             }
       
  1821         }
       
  1822     }
       
  1823 
       
  1824 
       
  1825     unless ($has_statuserrs or $has_ssierrs) {
       
  1826         print <<EOH ;
       
  1827 ======================================================================
       
  1828 
       
  1829                     <<     NO ERRORS FOUND     >>
       
  1830 
       
  1831 EOH
       
  1832     }
       
  1833 
       
  1834     print '=' x 70, "\n" ;
       
  1835 
       
  1836 } # &make_verbose_report()
       
  1837 
       
  1838 
       
  1839 # Generate a one-line-per-URL report
       
  1840 sub make_short_report {
       
  1841     my($numurls)= scalar keys %url ;
       
  1842 
       
  1843     print "Total $numurls URLs checked\n" ;
       
  1844     URL: foreach my $URL (sort keys %url) {
       
  1845         next if @INCLUDE_STATUS &&
       
  1846                 !grep($url{$URL}{'status'}=~ /^$_/, @INCLUDE_STATUS) ;
       
  1847         next if grep($url{$URL}{'status'}=~ /^$_/, @EXCLUDE_STATUS) ;
       
  1848 
       
  1849         print "$url{$URL}{'URL'}\t$url{$URL}{'status'}\n" ;
       
  1850     }  # URL
       
  1851 }
       
  1852 
       
  1853 
       
  1854 
       
  1855 # Print a summary of usage and exit
       
  1856 sub usage {
       
  1857     print <<EOF ;
       
  1858 
       
  1859 Checklinks $CL_VERSION
       
  1860 
       
  1861 To recursively check all HTML links on the local site, enter:
       
  1862     $0  <options>  [ start-file | start-URL ] ...
       
  1863 
       
  1864 Options include:
       
  1865   -v                    Generate full (verbose) report, including full
       
  1866                           referral information and detailed SSI error
       
  1867                           reporting.
       
  1868   -I <include-pattern>  Only check URLs matching <include-pattern>.
       
  1869   -X <exclude-pattern>  Don't check URLs matching <exclude-pattern>.
       
  1870   -i <include-status>   Only report URLs whose response code starts with
       
  1871                           the pattern <include-status>.
       
  1872   -x <exclude-status>   Don't report URLs whose response code starts with
       
  1873                           the pattern <exclude-status>.  Default is to
       
  1874                           exclude "200" responses only (i.e. "-x 200").
       
  1875   -d <max-depth>        Traverse links no deeper than <max-depth>.
       
  1876   -f                    "File mode"-- only read files from the filesystem;
       
  1877                           do not go through the HTTP server at all.  This
       
  1878                           will skip all URLs that point to CGI scripts.
       
  1879   -h                    "HTTP mode"-- use HTTP to check ALL URLs, even
       
  1880                           if they could be read from the filesystem.
       
  1881                           Incompatible with "-f" option.
       
  1882   -c <config-file>      Read appropriate configuration parameters from
       
  1883                           <config-file>, typically srm.conf.  Use '-' to
       
  1884                           read from STDIN.  If a directory is named, use
       
  1885                           "srm.conf" in that directory.
       
  1886   -q                    Print current configuration parameters.
       
  1887   -?                    Print this help message.
       
  1888   --                    End command-line option processing.
       
  1889 
       
  1890 Don't stack options like "-vf"; use "-v -f" instead.
       
  1891 
       
  1892 For -I, -X, -i, and -x:
       
  1893   Values are interpreted as Perl 5 regular expressions.
       
  1894   Use multiple options to build a list (e.g. "-I include1 -I include2").
       
  1895   Use a value of '' to clear a list (e.g. -x '' means "report all responses",
       
  1896     "-x '' -x 401" means "report all but 401 responses").
       
  1897   As a special case, an empty -I or -i list implies no include-restrictions.
       
  1898   If an item is in both the include and exclude list, it is excluded.
       
  1899   Note that -I and -X restrict which URLs are traversed into, so they may
       
  1900     prune large areas of your Web structure.
       
  1901 
       
  1902 EOF
       
  1903     exit ;
       
  1904 }
       
  1905 
       
  1906 
       
  1907 #----- debugging routines below ---------------------------------------
       
  1908 
       
  1909 # Print current configuration settings
       
  1910 sub print_config {
       
  1911     print "\n----- OPTIONS SPECIFIC TO THIS EXECUTION -----------------------------\n\n" ;
       
  1912     
       
  1913     print "Include only URLs containing one of the following patterns:\n",
       
  1914           ( map { "    $_\n" } @INCLUDE_PATTERNS ),
       
  1915           "\n"
       
  1916         if @INCLUDE_PATTERNS ;
       
  1917 
       
  1918     print "Exclude URLs containing one of the following patterns:\n",
       
  1919           (map { "    $_\n" } @EXCLUDE_PATTERNS ),
       
  1920           "\n"
       
  1921         if @EXCLUDE_PATTERNS ;
       
  1922 
       
  1923     print "Only report response codes beginning with:  ",
       
  1924           join(", ", @INCLUDE_STATUS), "\n"
       
  1925         if @INCLUDE_STATUS ;
       
  1926 
       
  1927     print "Don't report response codes beginning with: ",
       
  1928           join(", ", @EXCLUDE_STATUS), "\n"
       
  1929         if @EXCLUDE_STATUS ;
       
  1930 
       
  1931     print "\nMaximum search depth:  $max_depth\n" if length($max_depth) ;
       
  1932 
       
  1933     print <<EOS ;
       
  1934 
       
  1935 ----- INSTALLATION PARAMETERS ----------------------------------------
       
  1936     
       
  1937 Local Host:           $LOCAL_HOST
       
  1938 Document Root:        $DOCUMENT_ROOT
       
  1939 User Web Directory:   $USER_DIR
       
  1940 Default Filename(s):  @DIRECTORY_INDEX
       
  1941 
       
  1942 File extensions indicating a CGI program:       @CGI_EXTENSIONS
       
  1943 File extensions indicating server-parsed HTML:  @SHTML_EXTENSIONS
       
  1944 
       
  1945 EOS
       
  1946 
       
  1947     print "Directory Aliases:\n",
       
   1948           ( map { sprintf("    %15s ==> %s\n", $_, $ALIAS{$_}) }
        
   1949                 sort keys %ALIAS ),
       
  1950           "\n"
       
  1951         if keys %ALIAS ;
       
  1952 
       
  1953     print "Directory Regular Expression Aliases:\n", 
       
   1954           ( map { sprintf("    %15s ==> %s\n", $_, $ALIAS_MATCH{$_}) }
        
   1955                 sort keys %ALIAS_MATCH ),
       
  1956           "\n"
       
  1957         if keys %ALIAS_MATCH ;
       
  1958 
       
  1959     print "Script Directory Aliases:\n", 
       
   1960           ( map { sprintf("    %15s ==> %s\n", $_, $SCRIPT_ALIAS{$_}) }
        
   1961                 sort keys %SCRIPT_ALIAS ),
       
  1962           "\n"
       
  1963         if keys %SCRIPT_ALIAS ;
       
  1964 
       
  1965     print "Script Directory Regular Expression Aliases:\n", 
       
   1966           ( map { sprintf("    %15s ==> %s\n", $_, $SCRIPT_ALIAS_MATCH{$_}) }
        
   1967                 sort keys %SCRIPT_ALIAS_MATCH ),
       
  1968           "\n"
       
  1969         if keys %SCRIPT_ALIAS_MATCH ;
       
  1970 
       
  1971     if ($ENV{'http_proxy'}) {
       
  1972         print "HTTP Proxy:  $ENV{'http_proxy'}\n" ;
       
  1973         print "Except to hosts ending in:  $ENV{'no_proxy'}\n" 
       
  1974             if $ENV{'no_proxy'} ;
       
  1975         print "\n" ;
       
  1976     } else {
       
  1977         print "Not using any HTTP Proxy.\n" ;
       
  1978     }
       
  1979 
       
  1980     print "Maximum HTTP redirections allowed for a URL:  $MAX_REDIRECTS\n",
       
  1981           "Maximum number of tries to get a single URL:  $MAX_ATTEMPTS\n\n";
       
  1982           
       
  1983     print '-' x 70, "\n\n" ;
       
  1984 
       
  1985 }
       
  1986 
       
  1987 
       
  1988 
       
  1989 # print configuration, in a style more for debugging (mostly from srm.conf)
       
  1990 sub print_debug_config {
       
  1991     print <<EOF ;
       
  1992 DOCUMENT_ROOT=    [$DOCUMENT_ROOT]
       
  1993 USER_DIR=         [$USER_DIR]
       
  1994 DIRECTORY_INDEX=  [@DIRECTORY_INDEX]
       
  1995 
       
  1996 CGI_EXTENSIONS=   [@CGI_EXTENSIONS]
       
  1997 SHTML_EXTENSIONS= [@SHTML_EXTENSIONS]
       
  1998 
       
  1999 EOF
       
  2000 
       
  2001     foreach (sort keys %ALIAS) 
       
  2002         { print "ALIAS{$_}= [$ALIAS{$_}]\n" }
       
  2003     foreach (sort keys %ALIAS_MATCH) 
       
  2004         { print "ALIAS_MATCH{$_}= [$ALIAS_MATCH{$_}]\n" }
       
  2005     foreach (sort keys %SCRIPT_ALIAS) 
       
  2006         { print "SCRIPT_ALIAS{$_}= [$SCRIPT_ALIAS{$_}]\n" }
       
  2007     foreach (sort keys %SCRIPT_ALIAS_MATCH) 
       
  2008         { print "SCRIPT_ALIAS_MATCH{$_}= [$SCRIPT_ALIAS_MATCH{$_}]\n" }
       
  2009 
       
  2010 }
       
  2011 
       
  2012 
       
  2013 #----------------------------------------------------------------------
       
  2014 #
       
  2015 #   SPARSE DOCUMENTATION ON PROGRAM INTERNALS:
       
  2016 #
       
  2017 #----------------------------------------------------------------------
       
  2018 #
       
  2019 #   URLs are read directly from the local filesystem if possible, or
       
  2020 #   downloaded or checked with HTTP GET and HEAD methods if not.  CGI
       
  2021 #   scripts are called through the HTTP server, even if local (since the
       
  2022 #   resource their URLs refer to are not files).
       
   2023 #   resources their URLs refer to are not files).
       
  2024 #
       
  2025 #   Global variables:
       
  2026 #
       
  2027 #     %url holds all data regarding all URLs, using the text URL as the key:
       
  2028 #
       
  2029 #     %url{$URL}{qw( URL filename referer depth
       
  2030 #                    ishtml iscgi islocal dontfollow
       
  2031 #                    status location numredirects numtries lasttried
       
  2032 #                    ssierrs
       
  2033 #                    hasbeenloaded)}  <== only for testing
       
  2034 #         URL is same as hash key
       
  2035 #         ishtml means is type text/html; links MAY be extracted.  (Used in
       
  2036 #           main loop.)
       
  2037 #         iscgi means is a CGI script; MUST NOT be read directly from 
       
  2038 #           filesystem. (Used in verify_url() and load_url().)
       
  2039 #         islocal means served by local server:
       
  2040 #             a) MAY be read directly from filesystem
       
  2041 #             b) links MAY be extracted
       
  2042 #         dontfollow means links MUST NOT be extracted.
       
  2043 #         status begins with a number, but may have text afterward.
       
  2044 #
       
  2045 #         islocal= (URL=~ m#^http://$LOCAL_HOST/#i) , i.e. redundant.
       
  2046 #         ishtml and iscgi are guesses, may be 0, 1, or '' for unset.
       
  2047 #         iscgi only matters for local URLs, but may safely be set for 
       
  2048 #           remote URLs.
       
  2049 #         dontfollow is a hack to handle an obscure CGI-related situation.
       
  2050 #
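        #     A hypothetical example entry (added for documentation; the field
        #       values below are illustrative only):
        #
        #         $url{'http://www.example.com/docs/page.html'}= {
        #             URL        => 'http://www.example.com/docs/page.html',
        #             filename   => '/usr/local/htdocs/docs/page.html',
        #             referer    => 'http://www.example.com/index.html',
        #             depth      => 2,
        #             ishtml     => 1,
        #             iscgi      => 0,
        #             islocal    => 1,
        #             dontfollow => '',
        #             status     => '200 OK',
        #             numtries   => 1,
        #         } ;
        #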
       
  2051 #     In addition to the standard HTTP status codes, 'status' may take one
       
  2052 #     of the following values:
       
  2053 #         450 Can't read file: $!
       
  2054 #         451 SSI Error(s) (__ total)
       
  2055 #         600 Can't create socket: $@
       
  2056 #         601 Connection timed out
       
  2057 #         602 Incomplete response (%s of %s bytes)
       
  2058 #         603 Incomplete chunked response
       
  2059 #
       
  2060 #     Handling of 602 and 603 is somewhat hackish-- overwrites real status
       
  2061 #       line with artificial one.
       
  2062 #
       
  2063 #     $url->{'ssierrs'} is a list of errors relating to SSI includes.
       
  2064 #       Each error has a list of filenames representing the include
       
  2065 #       chain.  Each element in the chain stores the absolute
       
  2066 #       pathname, the <!--#include--> tag which called it, and
       
  2067 #       possibly an error message (usually only on the last file in
       
  2068 #       the include-chain).  Thus, the entire structure is
       
  2069 #
       
  2070 #         $url->{'ssierrs'}[$errid][$levelsdeep]{qw( path tag errmsg )}
       
  2071 #
       
  2072 #       This is only a memory hog with many errors of deeply nested files.
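        #
        #       One possible shape, shown here as a hypothetical example added for
        #         documentation (the exact field values are illustrative only):
        #
        #         $url->{'ssierrs'}[0]=
        #           [ { path   => '/usr/local/htdocs/docs/page.shtml',
        #               tag    => '<!--#include file="missing.html" -->',
        #               errmsg => "can't read included file" } ] ;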
       
  2073 #
       
  2074 #
       
  2075 #     @urlstoget: canonicalized list of URLs left to get, in order
       
  2076 #         of appearance.  List elements are pointers to %url elements.
       
  2077 #         This array may grow from &extract_urls(), or be rearranged when
       
  2078 #         a URL needs to be retried.
       
  2079 #
       
  2080 #   Note that $url is normally a pointer, and $URL is normally a string.
       
  2081 #
       
  2082 #   Files/paths are canonicalized by removing unneeded "./" and "../"
       
  2083 #     segments.
       
  2084 #   Directories are canonicalized to always end in "/", including in URLs;
       
   2085 #     note that some system variables and local directory variables do not follow this convention.
       
  2086 #   URLs that point to "/" end in "/", e.g. "http://www.foo.com/".
       
  2087 #
       
  2088 #
       
  2089 #   This program has a lot of unused functionality for possible extension,
       
  2090 #   so don't get confused by routines that do more than they are used for.
       
  2091 #
       
  2092 #----------------------------------------------------------------------
       
  2093 #
       
  2094 # To do:
       
  2095 #
       
  2096 #   * If you want a different output format (like in HTML), it should 
       
  2097 #       be easy enough to add.
       
  2098 #   * This could be extended to check remote sites without much trouble.
       
  2099 #   * Should probably support directories that don't have index.html.
       
  2100 #   * Check that pointed-to fragments exist in target files
       
  2101 #   * keep track of servers and don't hit them too often
       
  2102 #   * support robots.txt
       
  2103 #   * add option to read list of URLs from file
       
  2104 #   * support HTTP Basic Authentication somehow.  Note that currently, this
       
  2105 #       reads local files even when they should be protected by .htaccess
       
  2106 #       (unless using -h option).
       
  2107 #   * Read list of local host names from gethostbyname() ?
       
  2108 #   * check read permissions on files?
       
  2109 #   * In the event multiple URLs redirect to the same URL, this could be a
       
  2110 #       little more efficient if it saved the redirected loads/downloads.
       
  2111 #
       
  2112 #----------------------------------------------------------------------