Admin/website/build/cl-1.0.1.pl
author wenzelm
Wed, 15 Feb 2006 21:34:55 +0100
changeset 19046 bc5c6c9b114e
parent 16292 fbe2fc30a177
permissions -rw-r--r--
removed distinct, renamed gen_distinct to distinct;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
16292
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
     1
#!/usr/local/bin/perl
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
     2
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
     3
#   Checklinks 1.0.1
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
     4
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
     5
#     Starting at one or more seed HTML files, recursively check the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
     6
#     validity of all links on the site.  Major features:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
     7
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
     8
#       * Local URLs are read from the filesystem when possible (much
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
     9
#           faster than going through HTTP server).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    10
#       * Basic server-side includes (aka SSI or SHTML) are checked.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    11
#       * Latest standards are supported-- HTML 4.0, HTTP 1.1, URIs
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    12
#           according to RFC 2396.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    13
#       * Links are traversed breadth-first.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    14
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    15
#   To list command-line options, run "cl -?" or see &usage() below.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    16
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    17
#   TO CONFIGURE: 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    18
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    19
#   1) Set $LOCAL_HOST and $DOCUMENT_ROOT, just below.  If you don't, the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    20
#      program will try to guess them in set_needed_globals(), but it's more
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    21
#      reliable if you enter them here.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    22
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    23
#   2) If needed, set any further server configuration below-- things like
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    24
#      path aliases and so forth.  If you have the srm.conf file, you can 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    25
#      feed it into this script with "-c srm.conf"; otherwise, the default 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    26
#      settings will probably work OK.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    27
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    28
#   You can set a few parameters with the undocumented "-D <name=value>"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    29
#   command-line option, e.g. "-D LOCAL_HOST=www.myhost.com".
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    30
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    31
#   Further comments, including an overview of script internals, are at
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    32
#   the end of this file.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    33
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    34
#   Copyright (C) 1998, 2000 by James Marshall, james@jmarshall.com
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    35
#   see http://www.jmarshall.com/tools/cl/ for more info
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    36
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    37
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    38
#   CHANGES IN 1.0.1:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    39
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    40
#     This is just a bug fix release.  Fixes include:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    41
#       . Aliases are handled correctly now.  Sorry 'bout that.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    42
#       . A redirect + relative URL no longer results in infinitely
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    43
#           recursing URLs.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    44
#       . More HTML tags are searched for links.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    45
#       . Non-HTML files are no longer searched for links.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    46
#       . There were other minor bug fixes.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    47
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    48
#----------------------------------------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    49
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    50
#use strict ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    51
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    52
my( $LOCAL_HOST, $DOCUMENT_ROOT, $USER_DIR, @DIRECTORY_INDEX,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    53
    %ALIAS, %ALIAS_MATCH, %SCRIPT_ALIAS, %SCRIPT_ALIAS_MATCH, %UN_ALIAS,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    54
    @SHTML_EXTENSIONS, @CGI_EXTENSIONS, @INCLUDE_PATTERNS, @EXCLUDE_PATTERNS,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    55
          @INCLUDE_STATUS, @EXCLUDE_STATUS,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    56
    $verbose_report, $max_depth, $file_check, $full_http_check,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    57
    $MAX_REDIRECTS, $MAX_ATTEMPTS, $HTML_BY_NAME, $SUPPORT_NCSA_BUG,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    58
    @NO_PROXY, $DOC_ROOT_DEV, $DOC_ROOT_INODE, $DOC_ROOT_EXISTS, $CWD,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    59
    %html_urls, %non_html_urls, %e_to_ch,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    60
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    61
    %home_dir, %dir_to_user, %inode_to_user,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    62
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    63
    %url, @urlstoget, 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    64
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    65
    $debug, $CL_VERSION,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    66
  ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    67
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    68
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    69
#----- User Configuration ---------------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    70
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    71
# This should be 'localhost', or a hostname of the Web server.  URLs at
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    72
#   this host will be assumed to be local; URLs not at this host will not be
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    73
#   traversed into. If this names a remote host, the program will not work.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    74
# Note that 'localhost' doesn't necessarily point to your local Web server.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    75
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    76
# $LOCAL_HOST= 'localhost' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    77
# $LOCAL_HOST= 'www.example.com' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    78
$LOCAL_HOST='isabelle.in.tum.de';
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    79
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    80
# This is your root Web directory, i.e. the directory that the Web server
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    81
#   sends the user if the URL "http://$LOCAL_HOST" is requested.  It's in
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    82
#   the configuration file srm.conf (and is read by -c option).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    83
# If you don't know the document root of your server, but you don't need
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    84
#   it because you're only checking URLs whose path starts with ~, put a
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    85
#   non-existent path here rather than leave it blank (a hack).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    86
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    87
# $DOCUMENT_ROOT= '/home/www/htdocs' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    88
$DOCUMENT_ROOT='/home/proj/isabelle';
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    89
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    90
#----- variables equivalent to srm.conf entries 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    91
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    92
# These globals are from the equivalent entries in srm.conf, etc.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    93
# See the command-line option -c <config-file>, to read values directly 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    94
#   from srm.conf instead.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    95
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    96
# $USER_DIR= 'public_html' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    97
$USER_DIR='.html-data';
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    98
@DIRECTORY_INDEX= qw( index.html index.cgi index.shtml ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
    99
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   100
# Used in &url_to_filename(), and possibly elsewhere
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   101
# Note that ALIAS_MATCH and SCRIPT_ALIAS_MATCH use Perl (not standard) regexps.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   102
# If order of multiple e.g. "Alias" directives is important, this may not work.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   103
%ALIAS= () ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   104
%ALIAS_MATCH= () ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   105
%SCRIPT_ALIAS= () ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   106
%SCRIPT_ALIAS_MATCH= () ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   107
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   108
# The list of file extensions to interpret as CGI scripts or
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   109
#   server-parsed HTML files.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   110
# These are not specific settings in srm.conf, but are combinations of
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   111
#   AddHandler directives and possibly AddType directives.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   112
@CGI_EXTENSIONS=   qw( .cgi ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   113
@SHTML_EXTENSIONS= qw( .shtml ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   114
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   115
#----- end of variables equivalent to srm.conf entries 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   116
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   117
# Specify patterns here to only include URLs that match at least one
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   118
#   pattern.  As a special case, an empty list includes all URLs, i.e.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   119
#   does not restrict URLs by name (except perhaps by @EXCLUDE_PATTERNS).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   120
# This can be added to or cleared with the -I command-line option.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   121
@INCLUDE_PATTERNS= () ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   122
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   123
# Specify patterns here to cause matching URLs to be excluded,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   124
#   e.g. '\?' means ignore all URLs that query.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   125
# This can be added to or cleared with the -X command-line option.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   126
# @EXCLUDE_PATTERNS=  qw( \? ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   127
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   128
# Only report URLs whose status codes start with one of these patterns.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   129
#   As a special case, an empty list reports all URLs, i.e. does not 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   130
#   restrict URLs by status code (except perhaps by @EXCLUDE_STATUS).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   131
# This can be added to or cleared with the -i command-line option.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   132
@INCLUDE_STATUS= () ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   133
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   134
# Don't report URLs whose status codes start with these patterns.  Default
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   135
#   is qw( 200 ).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   136
# This can be added to or cleared with the -x command-line option.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   137
@EXCLUDE_STATUS= qw( 200 ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   138
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   139
# For 302 or 303 HTTP redirection, redirect no more than this many times.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   140
$MAX_REDIRECTS= 5 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   141
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   142
# If a connection times out, etc., attempt no more than this many times.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   143
$MAX_ATTEMPTS= 5 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   144
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   145
# The old version determined whether a file was HTML by the -T test (text
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   146
#   file), and so traversed all HTML-like links in any text file that wasn't
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   147
#   a CGI script.  It's probably more appropriate to check the file
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   148
#   extension, to exclude source code, .txt files, etc.  Leave $HTML_BY_NAME
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   149
#   set to use the filename, or unset it to traverse all HTML-like links in
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   150
#   any text files, as the old version did.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   151
$HTML_BY_NAME= 1 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   152
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   153
# Some old NCSA servers, including 1.5.2, don't report the HTTP version
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   154
#   correctly in the status line; they return e.g. "HTTP 200 OK".  To allow
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   155
#   this, leave the variable here set.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   156
$SUPPORT_NCSA_BUG= 1 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   157
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   158
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   159
#----- DO NOT CHANGE ANYTHING BELOW THIS LINE, unless you want to... ---
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   160
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   161
#----- Further Global Variable Initialization --------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   162
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   163
$CL_VERSION= '1.0.1' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   164
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   165
$ENV{'http_proxy'}||= $ENV{'HTTP_PROXY'} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   166
@NO_PROXY= split(/[\s,]+/, $ENV{'no_proxy'} || $ENV{'NO_PROXY'} ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   167
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   168
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   169
# If output's not going directly to terminal, this ensures autoflushing.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   170
$|= 1 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   171
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   172
#----- End of Configuration --------------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   173
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   174
use strict 'vars' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   175
use IO::Socket ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   176
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   177
&usage unless @ARGV ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   178
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   179
# Process command-line options
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   180
&getopts ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   181
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   182
# Make any final needed adjustments to globals, after the hard-coded
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   183
#   values above and any options have been processed.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   184
&adjust_all_globals ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   185
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   186
# Default to "." if no starting filenames given.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   187
# 3-6-98: Anh, decided against it.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   188
#@ARGV= ('.') unless @ARGV ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   189
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   190
# &add_url() sets $url{$_} and pushes to @urlstoget, only if not already
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   191
#   added, plus any other initialization.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   192
# Only add a file if it can be accessed with a URL.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   193
foreach my $arg (@ARGV) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   194
    if ($arg=~ m#^http://#i) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   195
        &add_url($arg, '-', 0) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   196
    } else {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   197
        my($URL)= &filename_to_url($arg, $CWD) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   198
        if (defined($URL)) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   199
            &add_url($URL, '-', 0) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   200
        } else {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   201
	    die "ERROR: $arg is not accessible through the Web server.\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   202
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   203
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   204
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   205
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   206
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   207
# Check the URLs, in order.  @urlstoget may grow and rearrange.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   208
while (@urlstoget) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   209
    my($url)= shift(@urlstoget) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   210
    if ( !$url->{'ishtml'} or !$url->{'islocal'} or $url->{'dontfollow'}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   211
         or (length($max_depth) and $url->{'depth'} > $max_depth ) ) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   212
        &verify_url($url) ;    # may set ishtml=true
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   213
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   214
    if ( $url->{'ishtml'} and $url->{'islocal'} and !$url->{'dontfollow'}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   215
         and (!length($max_depth) or $url->{'depth'} <= $max_depth ) ) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   216
        my($HTML)= &load_url($url) ;  # may set ishtml=false
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   217
        # 11-30-99 JSM: fixed to handle rel URLs in redirected pages correctly
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   218
        my($base_url)= $url->{'location'} || $url->{'URL'} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   219
        &extract_urls($HTML, $base_url, $url->{'URL'}, $url->{'depth'}+1) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   220
            if $url->{'ishtml'} ;      # big, calls &add_url()
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   221
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   222
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   223
    # If we get an error response that may be corrected with another
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   224
    #   attempt, put it back in the queue.  Such errors include 408,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   225
    #   503, 504, and the homegrown codes 600, 601, 602, and 603.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   226
    if ($url->{'status'}=~ /^(408|503|504|600|601|602|603)\b/ ) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   227
        push(@urlstoget, $url) if ( $url->{'numtries'} < $MAX_ATTEMPTS ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   228
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   229
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   230
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   231
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   232
&make_report() ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   233
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   234
exit ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   235
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   236
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   237
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   238
#----- Process command-line options -----------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   239
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   240
# Process any command-line options.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   241
sub getopts {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   242
    my($opt, $param) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   243
    while ($ARGV[0]=~ /^-/) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   244
        $opt= shift(@ARGV) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   245
        ($opt, $param)= $opt=~ /^-(.)(.*)/ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   246
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   247
        # Turn on verbose reporting
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   248
        if ($opt eq 'v') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   249
            $verbose_report= ($param ne '-') ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   250
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   251
        # User-specified patterns to exclude ('' to clear list)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   252
        } elsif ($opt eq 'I') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   253
            $param= shift(@ARGV) unless length($param) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   254
            if (length($param)) { push(@INCLUDE_PATTERNS, $param) }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   255
            else { @INCLUDE_PATTERNS= () }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   256
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   257
        # User-specified patterns to exclude ('' to clear list)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   258
        } elsif ($opt eq 'X') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   259
            $param= shift(@ARGV) unless length($param) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   260
            if (length($param)) { push(@EXCLUDE_PATTERNS, $param) }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   261
            else { @EXCLUDE_PATTERNS= () }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   262
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   263
        # User-specified response codes to ignore ('' to clear list)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   264
        } elsif ($opt eq 'i') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   265
            $param= shift(@ARGV) unless length($param) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   266
            if (length($param)) { push(@INCLUDE_STATUS, $param) }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   267
            else { @INCLUDE_STATUS= () }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   268
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   269
        # User-specified response codes to ignore ('' to clear list)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   270
        } elsif ($opt eq 'x') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   271
            $param= shift(@ARGV) unless length($param) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   272
            if (length($param)) { push(@EXCLUDE_STATUS, $param) }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   273
            else { @EXCLUDE_STATUS= () }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   274
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   275
        # Maximum traversal depth
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   276
        } elsif ($opt eq 'd') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   277
            $param= shift(@ARGV) unless length($param) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   278
            $max_depth= $param ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   279
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   280
        # Make it a "file check"-- only read local files, do not use HTTP
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   281
        } elsif ($opt eq 'f') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   282
            $file_check= ($param ne '-') ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   283
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   284
        # Use HTTP for all URL's, even local files
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   285
        } elsif ($opt eq 'h') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   286
            $full_http_check= ($param ne '-') ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   287
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   288
        # Read configuration parameters from srm.conf-like file
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   289
        } elsif ($opt eq 'c') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   290
            $param= shift(@ARGV) unless length($param) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   291
            &read_srm_conf($param) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   292
            
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   293
        # Print current configuration parameters
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   294
        } elsif ($opt eq 'q') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   295
            &print_config ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   296
            exit ;   # jsm-- should we exit?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   297
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   298
        # Allow certain parameters to be defined via the command line
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   299
        } elsif ($opt eq 'D') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   300
            $param= shift(@ARGV) unless length($param) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   301
            $debug=1, unshift(@ARGV,$param), next if $param=~ /^-/ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   302
            my($name,$value)= split(/=/, $param, 2) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   303
            $value= 1 unless length($value) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   304
            if ($name=~ /^(LOCAL_HOST|DOCUMENT_ROOT|USER_DIR|DEBUG|debug)$/) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   305
                eval "\$$name= \$value" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   306
                #$$name= $value ;  # this doesn't work, because of initial my()
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   307
            }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   308
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   309
        } elsif ($opt eq '?') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   310
            &usage ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   311
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   312
        # End command-line option processing on "--"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   313
        } elsif ($opt eq '-') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   314
            return ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   315
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   316
        } else {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   317
            print STDERR 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   318
                "Illegal option-- '$opt'.  Enter \"$0 -?\" for help.\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   319
            exit ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   320
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   321
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   322
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   323
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   324
    if ($file_check and $full_http_check) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   325
        print STDERR "You cannot use both the -f and the -h options.\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   326
        exit ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   327
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   328
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   329
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   330
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   331
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   332
# Read appropriate values from the given file, typically srm.conf.  If a
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   333
#   directory is named, default to filename "srm.conf".
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   334
# Note that opening "-" will open STDIN.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   335
sub read_srm_conf {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   336
    my($fname)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   337
    local(*SRM) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   338
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   339
    # default to srm.conf if only a directory is named
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   340
    if (-d $fname) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   341
        $fname=~ s#/$## ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   342
        $fname.= "/srm.conf" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   343
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   344
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   345
    # Clear old values
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   346
    $DOCUMENT_ROOT= $USER_DIR= '' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   347
    @DIRECTORY_INDEX= @CGI_EXTENSIONS= @SHTML_EXTENSIONS= () ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   348
    %ALIAS= %ALIAS_MATCH= %SCRIPT_ALIAS= %SCRIPT_ALIAS_MATCH= () ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   349
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   350
    open(SRM, "<$fname") || die "Can't open $fname: $!" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   351
    while (<SRM>) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   352
        s/#.*// ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   353
        next unless /\S/ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   354
        my($name, @param)= /(\S+)/g ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   355
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   356
        if ($name eq 'DocumentRoot') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   357
            $DOCUMENT_ROOT= $param[0] ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   358
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   359
        } elsif ($name eq 'UserDir') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   360
            $USER_DIR= $param[0] ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   361
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   362
        } elsif ($name eq 'DirectoryIndex') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   363
            @DIRECTORY_INDEX= @param ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   364
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   365
        } elsif ($name eq 'Alias') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   366
            $ALIAS{$param[0]}= $param[1] ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   367
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   368
        } elsif ($name eq 'AliasMatch') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   369
            $ALIAS_MATCH{$param[0]}= $param[1] ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   370
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   371
        } elsif ($name eq 'ScriptAlias') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   372
            $SCRIPT_ALIAS{$param[0]}= $param[1] ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   373
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   374
        } elsif ($name eq 'ScriptAliasMatch') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   375
            $SCRIPT_ALIAS_MATCH{$param[0]}= $param[1] ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   376
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   377
        } elsif ($name eq 'AddHandler') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   378
            if ($param[0] eq 'cgi-script') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   379
                push(@CGI_EXTENSIONS, $param[1]) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   380
            } elsif ($param[0] eq 'server-parsed') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   381
                push(@SHTML_EXTENSIONS, $param[1]) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   382
            }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   383
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   384
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   385
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   386
    close(SRM) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   387
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   388
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   389
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   390
# Make any final settings to global variables, after the hard-coded values
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   391
#   and command-line options have been processed.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   392
# Most non-user-configurable globals are also set here.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   393
sub adjust_all_globals {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   394
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   395
    # Standardize $USER_DIR to never have trailing slash
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   396
    $USER_DIR=~ s#/$## ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   397
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   398
    # If no $LOCAL_HOST set, try to read it from first URL in list, or
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   399
    #   use the string 'localhost' if that URL contains no hostname.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   400
    unless (length($LOCAL_HOST)) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   401
        $LOCAL_HOST= (&parse_url($ARGV[0]))[1] || 'localhost' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   402
        print STDERR "LOCAL_HOST set to \"\L$LOCAL_HOST\E\"\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   403
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   404
    $LOCAL_HOST= lc($LOCAL_HOST) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   405
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   406
    # If no $DOCUMENT_ROOT, try to guess it from $HOME, username, $USER_DIR.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   407
    unless (length($DOCUMENT_ROOT)) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   408
        my($home) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   409
        unless ($home= $ENV{'HOME'}) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   410
            my($uname)= getpwuid($<) || $ENV{'USER'} || `whoami` || `id -un` ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   411
            chomp($uname) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   412
            &read_home_dirs unless %home_dir ;   # only read when needed
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   413
            $home= $home_dir{$uname} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   414
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   415
        $DOCUMENT_ROOT= "$home/$USER_DIR" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   416
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   417
        die "Could not determine DOCUMENT_ROOT; edit the $0 script to set it.\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   418
            unless (-d $DOCUMENT_ROOT) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   419
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   420
        print STDERR "DOCUMENT_ROOT set to \"$DOCUMENT_ROOT\"\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   421
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   422
    $DOCUMENT_ROOT=~ s#/$## ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   423
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   424
    # Allows &filename_to_url() to unalias as best as possible.  Note that 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   425
    #   use of &filename_to_url() can be avoided by the user; see note in 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   426
    #   that routine.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   427
    %UN_ALIAS= (reverse (%ALIAS, %SCRIPT_ALIAS) ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   428
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   429
    # These are to compare equivalency to later, in &filename_to_url().
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   430
    ($DOC_ROOT_DEV, $DOC_ROOT_INODE)= stat("$DOCUMENT_ROOT/.") ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   431
    
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   432
    $DOC_ROOT_EXISTS= -e _ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   433
    
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   434
    # Set CWD from shell variable, else from `pwd`.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   435
    $CWD= $ENV{'PWD'} || `pwd` || die "couldn't run pwd: $!" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   436
    chomp($CWD) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   437
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   438
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   439
    # These are used by &extract_urls().
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   440
    # This is a complete list of URL-type attributes defined in HTML 4.0,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   441
    #   plus any others I found, like nonstandard ones or from an earlier HTML.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   442
    # Only a few of these are commonly used, as of early 1998.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   443
    # The set in %html_urls could possibly link to HTML resources, while the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   444
    #   set in %non_html_urls could not.  The %special(.*) sets, here for 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   445
    #   reference only, include URL attributes that require special handling.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   446
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   447
    %html_urls= ( 'a'          => [ 'href' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   448
                  'area'       => [ 'href' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   449
                  'frame'      => [ 'src', 'longdesc' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   450
                  'link'       => [ 'href', 'urn' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   451
                  'img'        => [ 'longdesc', 'usemap' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   452
                  'q'          => [ 'cite' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   453
                  'blockquote' => [ 'cite' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   454
                  'ins'        => [ 'cite' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   455
                  'del'        => [ 'cite' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   456
                  'object'     => [ 'usemap' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   457
                  'input'      => [ 'usemap' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   458
                  'iframe'     => [ 'src', 'longdesc' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   459
		  'ilayer'     => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   460
		  'layer'      => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   461
		  'fig'        => [ 'imagemap' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   462
		  'overlay'    => [ 'imagemap' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   463
		  'meta'       => [ 'url' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   464
		  'note'       => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   465
                ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   466
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   467
    %non_html_urls= ( 'body'    => [ 'background' ], 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   468
                      'img'     => [ 'src', 'lowsrc', 'dynsrc' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   469
                      'input'   => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   470
                      'script'  => [ 'src', 'for' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   471
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   472
		      'fig'     => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   473
		      'overlay' => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   474
		      'select'  => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   475
		      'ul'      => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   476
		      'h1'      => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   477
		      'h2'      => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   478
		      'h3'      => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   479
		      'h4'      => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   480
		      'h5'      => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   481
		      'h6'      => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   482
		      'hr'      => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   483
		      'table'   => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   484
		      'td'      => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   485
		      'th'      => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   486
		      'tr'      => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   487
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   488
		      'bgsound' => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   489
		      'embed'   => [ 'src' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   490
                  ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   491
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   492
    # %special_urls= ( 'base' => [ 'href' ] ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   493
    #
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   494
    # %special_html_urls= ( 'object' => [ 'codebase', 'data' ] ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   495
    #
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   496
    # %special_non_html_urls=
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   497
    #      ( 'head'   => [ 'profile' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   498
    #        'object' => [ 'codebase', 'archive', 'classid' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   499
    #        'applet' => [ 'codebase', 'code', 'object', 'archive' ],
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   500
    #        'form'   => [ 'action', 'script' ]
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   501
    #      ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   502
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   503
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   504
    # This is a translation from entity character references to characters, 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   505
    #   used in &HTMLunescape().
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   506
    # This simplified version only supports &quot; &lt; &gt; &amp;, but that
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   507
    #   should be enough for URL-type attributes.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   508
    # See http://www.w3.org/TR/REC-html40/sgml/entities.html for full entity 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   509
    #   list.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   510
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   511
    %e_to_ch= (quot => '"', 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   512
               'lt' => '<', 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   513
               'gt' => '>', 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   514
               amp  => '&') ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   515
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   516
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   517
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   518
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   519
#----------------------------------------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   520
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   521
# Add the URL to our data structures; specifically, to %url and @urlstoget.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   522
# Returns a pointer to the structure in %url, or undef if already defined
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   523
#   or on error.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   524
# Currently, this always receives the URL with the host name lowercase,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   525
#   either from &absolute_url() or from using $LOCAL_HOST.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   526
sub add_url {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   527
    my($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   528
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   529
    # Allow the user to restrict URL patterns:  URLs must be in 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   530
    #   @INCLUDE_PATTERNS but not in @EXCLUDE_PATTERNS (but only restrict 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   531
    #   by @INCLUDE_PATTERNS if it's not empty).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   532
    return undef if @INCLUDE_PATTERNS &&
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   533
                    !grep( $URL=~ /$_/, @INCLUDE_PATTERNS ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   534
    return undef if grep( $URL=~ /$_/, @EXCLUDE_PATTERNS ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   535
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   536
    # Canonicalize URL, so we don't get a page multiple times
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   537
    $URL= &canonicalize($URL) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   538
    
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   539
    # for obscure case involving a <form action=___.cgi>-extracted URL being
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   540
    #   overwritten by <a href=___.cgi> extraction (don't fret over this)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   541
    $url{$URL}{'dontfollow'}&&= $dontfollow if $url{$URL} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   542
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   543
    # Don't add the record a second time!  Or will infinitely traverse.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   544
    return undef if $url{$URL} ;  # or add to @referers, for 301 correction...?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   545
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   546
    # Only HTTP URLs are currently supported
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   547
    return undef unless $URL=~ /^http:/i ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   548
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   549
    # Any self-referral here indicates a bug in the program.  It's happened.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   550
    die "PROGRAM ERROR: $URL shows its first referer as itself.\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   551
        if $referer eq $URL ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   552
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   553
    my(%u) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   554
    @u{qw(URL referer depth ishtml iscgi dontfollow)}= 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   555
        ($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   556
    $u{'islocal'}= ($URL=~ m#^http://\Q$LOCAL_HOST\E/#io) + 0 ; # make length>0
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   557
    if ($u{'islocal'}) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   558
#        $u{'filename'}= &url_to_filename($URL) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   559
        @u{'filename', 'location'}= &url_to_filename($URL) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   560
        $u{'iscgi'}= &is_cgi($u{'filename'}, $URL)  if $u{'iscgi'} eq '' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   561
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   562
	# 2-27-00 JSM:  Detect ishtml by filename, not -T test.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   563
	if ( $u{'ishtml'} eq '' ) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   564
	    $u{'ishtml'}=  $HTML_BY_NAME
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   565
		?  ( !$u{'iscgi'} && -e $u{'filename'} && 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   566
		     $u{'filename'}=~ /\.html?$/i ) + 0
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   567
		:  (!$u{'iscgi'} && -e $u{'filename'} && -T _) + 0 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   568
	}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   569
#        $u{'ishtml'}= (!$u{'iscgi'} && -e $u{'filename'} && -T _) + 0
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   570
#            unless length($u{'ishtml'}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   571
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   572
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   573
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   574
    #  If we're only doing a file check, don't add URLs that require HTTP
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   575
    return undef if ($file_check and (!$u{'islocal'} or $u{'iscgi'}) ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   576
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   577
    push(@urlstoget, \%u) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   578
    $url{$URL}= \%u ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   579
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   580
    # return \%u ;   # unneeded because of previous statement
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   581
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   582
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   583
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   584
# Guess if a file is a CGI script or not.  Returns true if the (regular) file
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   585
#   is executable, has one of @CGI_EXTENSIONS, or if the URL is in a 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   586
#   ScriptAlias'ed directory.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   587
# $fname must be absolute path, but $URL is optional (saves time if available).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   588
# Note that URLs like "/path/script.cgi?a=b" are handled correctly-- the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   589
#   previously extracted filename is tested for CGI-ness, while the URL is
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   590
#   checked for ScriptAlias matching (which is unaffected by final query
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   591
#   strings or PATH_INFO).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   592
sub is_cgi {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   593
    my($fname, $URL)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   594
    return 1 if (-x $fname && ! -d _ ) ;   # should we really do this?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   595
    foreach (@CGI_EXTENSIONS) { return 1 if $fname=~ /\Q$_\E$/i }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   596
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   597
    $URL= &filename_to_url($fname) unless length($URL) ;  # currently unused
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   598
    my($URLpath)= $URL=~ m#^http://[^/]*(.*)#i ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   599
    foreach (keys %SCRIPT_ALIAS)        { return 1 if $URLpath=~ /^\Q$_\E/ }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   600
    foreach (keys %SCRIPT_ALIAS_MATCH)  { return 1 if $URLpath=~ /^$_/ }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   601
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   602
    return 0 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   603
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   604
   
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   605
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   606
# Put the URL in such a form that two URLs that point to the same resource
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   607
#   have the same URL, to avoid superfluous retrievals.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   608
# Host name is lowercased elsewhere-- this routine is only called from
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   609
#   &add_url; see note there.  To lowercase the host name here would be
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   610
#   inefficient.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   611
sub canonicalize {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   612
    my($URL)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   613
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   614
    $URL=~ s/#.*// ;    # remove any "#" fragment from end of URL
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   615
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   616
    return $URL ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   617
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   618
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   619
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   620
#----- File reading/downloading routines (includes networking) --------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   621
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   622
# Verify that a URL exists, and set $url->{'status'} accordingly.  Do
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   623
#   this either by checking the local filesystem or by using the HTTP HEAD 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   624
#   method for remote sites or CGI scripts.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   625
# Set $url->{'ishtml'} accordingly if discovered from Content-Type:.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   626
# This does not support various Redirect directives in srm.conf.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   627
sub verify_url {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   628
    my($url)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   629
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   630
    print STDERR "verifying $url->{'URL'}\n" if $debug ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   631
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   632
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   633
    # Depending on the state of $url->{islocal, iscgi, dontfollow} and
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   634
    #   $full_http_check, take appropriate actions to check/set the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   635
    #   status code for this URL.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   636
    
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   637
    # NOTE: In some situations, specifically when checking a CGI script
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   638
    #   named in a <form action> (thus implying that dontfollow is set),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   639
    #   and using HTTP to check the URL (because the script is remote or
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   640
    #   $full_http_check is set), the HTTP response code may not be
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   641
    #   accurate.  This is because there is no form data sent with the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   642
    #   request, as there normally would be.  In these cases, a cautionary
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   643
    #   note is appended to $url->{'status'}.  Additionally, an empty 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   644
    #   $url->{'status'} is changed to an explanatory note (maybe we should
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   645
    #   do that in load_url() too?).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   646
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   647
    # Use HEAD if file is remote, or if $full_http_check is set.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   648
    if (!$url->{'islocal'} or $full_http_check) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   649
        &load_url_using_HTTP($url, 'HEAD') ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   650
        $url->{'status'}= '[no status returned]'
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   651
            unless length($url->{'status'}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   652
        $url->{'status'}.= ' (NOTE: Form was not submitted normally)'
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   653
            if $url->{'dontfollow'} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   654
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   655
    # URL is local:  If it's not CGI, do a normal local file check
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   656
    } elsif (!$url->{'iscgi'}) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   657
        $url->{'status'}= (-e $url->{'filename'})  
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   658
            ? "200 Local File Exists"  : "404 File Not Found" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   659
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   660
    # URL is local CGI:  Use HEAD unless dontfollow is set
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   661
    } elsif (!$url->{'dontfollow'}) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   662
        &load_url_using_HTTP($url, 'HEAD') ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   663
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   664
    # Else it's a local CGI with dontfollow set:  Check for executable file
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   665
    } else {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   666
        $url->{'status'}= 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   667
             (! -e $url->{'filename'})  ? "404 File Not Found"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   668
           : (! -x $url->{'filename'})  ? "403 Local File Is Not Executable"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   669
           :                              "200 Local Executable File Exists"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   670
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   671
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   672
        
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   673
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   674
# Old verify routine below:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   675
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   676
#    # If is a local non-CGI file, check it directly from the filesystem
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   677
#    if ($url->{'islocal'} and !$url->{'iscgi'} and !$full_http_check) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   678
#        $url->{'status'}= (-e $url->{'filename'})  
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   679
#            ? "200 Local File Exists"  : "404 File Not Found" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   680
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   681
#    # Otherwise, download its HEAD from its HTTP server
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   682
#    } else {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   683
#        &load_url_using_HTTP($url, 'HEAD') ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   684
#    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   685
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   686
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   687
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   688
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   689
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   690
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   691
# Load entire file/resource and return its contents, setting $url->{'status'}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   692
#    accordingly.  Do this either by checking the local filesystem or by 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   693
#    using the HTTP GET method for remote sites or CGI scripts.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   694
# Set $url->{'ishtml'} accordingly if discovered from Content-Type:.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   695
# This does not support various Redirect directives in srm.conf.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   696
sub load_url {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   697
    my($url)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   698
    my($HTML) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   699
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   700
    print STDERR "loading $url->{'URL'}\n" if $debug ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   701
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   702
    # If is a local non-CGI file, read it directly from the filesystem
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   703
    if ($url->{'islocal'} and !$url->{'iscgi'} and !$full_http_check) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   704
        my($iscgi) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   705
        ($HTML, $url->{'ssierrs'}, $iscgi)= 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   706
            &read_expanded_file($url->{'filename'}, $url->{'URL'}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   707
        $url->{'status'}= 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   708
            !defined($HTML)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   709
                  ? sprintf("450 Can't read file: %s (%s)", $!, $!+0)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   710
            : @{$url->{'ssierrs'}}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   711
                  ? sprintf("451 SSI Error(s) (%s total)",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   712
                            scalar @{$url->{'ssierrs'}})
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   713
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   714
            :       "200 Local File Read OK" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   715
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   716
        # $url->{'iscgi'} may be set if an SHTML file included CGI calls.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   717
        # Don't set it if we're doing a file check, in which case we'll
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   718
        #   keep whatever $HTML we could get.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   719
        $url->{'iscgi'}= $iscgi unless $file_check ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   720
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   721
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   722
    # Otherwise (or if rereckoned), download the resource from its HTTP server
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   723
    if (!$url->{'islocal'} or $url->{'iscgi'} or $full_http_check) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   724
        (undef, undef, $HTML)= &load_url_using_HTTP($url, 'GET') ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   725
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   726
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   727
    # Note that this will be set even when URL is to be reloaded, like
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   728
    #   for a 601 (timeout) response.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   729
    $url->{'hasbeenloaded'}= 1 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   730
    return $HTML ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   731
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   732
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   733
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   734
# Read a local file and return its contents.  If a file is SSI (aka SHTML),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   735
#   expand any SSI <!--#include--> directives as needed, recursively 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   736
#   including nested files.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   737
# This is used for all local reads, SHTML or not, but the vast bulk of this
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   738
#   routine is for SHTML files.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   739
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   740
# If file is SHTML, this routine also returns a structure of error data,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   741
#   and a boolean saying if this file needs to be downloaded via HTTP
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   742
#   for a complete check (e.g. includes CGI calls).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   743
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   744
# $fname must be canonicalized absolute path, but $URL parameter is optional.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   745
# %$parents contains all "include"-ancestors of the file, to prevent loops.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   746
#   If omitted, assumes no ancestors (and a fresh hash is started).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   747
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   748
# This routine seems much bigger and more complex than it needs to be.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   749
#   It could be one third the size and much simpler if we didn't have to
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   750
#   worry about full error reporting on nested includes.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   751
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   752
# Note: This routine was made to mimic what Apache would return to a client.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   753
#   However, the result differs from Apache's in two slight ways, both
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   754
#   involving nested SSI within <!--#include file="..." -->, and both
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   755
#   apparent bugs in Apache 1.1 (may be fixed in later versions):
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   756
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   757
#   1) If a <file="..."> value contains no "/" (i.e. in current directory),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   758
#      then Apache always parses the included file as SHTML, regardless of
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   759
#      extension.  This routine checks @SHTML_EXTENSIONS for all included
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   760
#      files.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   761
#   2) If a <file="..."> value containing a "/" loads an SHTML file  
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   762
#      containing a <virtual="..."> tag with a relative path, the directive
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   763
#      fails in Apache.  This routine tries to guess the correct path/URL.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   764
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   765
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   766
# Notes on this routine, and SHTML files in general:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   767
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   768
# At first thought, it seems like we could load each included file
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   769
# only once, instead of once for every file that includes it.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   770
# However, because of the fact that relative URLs are resolved
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   771
# relative to the top-level including file, the top-level file will
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   772
# need to be expanded every time.  (It's legal (though of questionable
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   773
# wisdom) to include a file from e.g. both /a/index.shtml and
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   774
# /b/index.shtml, so links from the included file point to different
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   775
# URLs.)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   776
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   777
# Note that while URLs in included files (e.g. <a href="...">) are
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   778
# resolved relative to the top-level including file, nested include tags
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   779
# are resolved relative to the direct includer.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   780
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   781
# We could possibly be more efficient in time (but costly in memory)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   782
# by storing the expanded contents and $errlist of each included file,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   783
# since those will be constant (except $errlist's include-loop
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   784
# reporting might vary somewhat).  There are probably other ways to
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   785
# eek out savings of time and memory, at the cost of complexity.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   786
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   787
# The main loop here is inside of an s/// statement.  Unusual, but it's an
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   788
# appropriate way to handle the recursion. Recursion is needed, since each
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   789
# included file may or may not be SHTML.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   790
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   791
# $iscgi is set if a file includes "<!--#exec", or if it contains an
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   792
#   <!--#include virtual="..." --> tag that points to a CGI file, or if 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   793
#   any of its include-children sets $iscgi.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   794
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   795
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   796
# Notes to help clarify data structures, if (God forbid) you have to modify 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   797
# this routine:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   798
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   799
# Each error is a list of files in an "include chain", and $errlist is a
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   800
# list of errors.  $errlist is associated with the current $HTML.  Each
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   801
# error in $errlist is associated with some tag in $HTML, as iterated in
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   802
# the s/// loop.  When this routine returns ($HTML, $errlist), the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   803
# errors in $errlist should all have as their first element tags that
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   804
# were found in $HTML.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   805
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   806
# Each error's final element (the "leaf") is associated with a tag that
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   807
# tried to load an invalid file.  Each leaf will always be the complete 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   808
# set of errors for that tag (i.e. it has no children, since it couldn't
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   809
# load the file).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   810
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   811
# If the file can be validly loaded, then we may have 0 or multiple errors
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   812
# associated with this file/tag (and returned from this routine).  Each
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   813
# file's $errlist is an accumulation of all of its children's $errlist's.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   814
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   815
# Errors that come from loading a child are associated with the child's
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   816
# $HTML and tags.  Before adding them to the parent's (current) $errlist,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   817
# they must have the CHILD's path/tag unshifted onto the front of the 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   818
# include chain; all errors are then added to the current $errlist.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   819
# This ensures that:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   820
#   a) The errors are now all associated with the tag that loaded the child.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   821
#   b) The errors in the current $errlist are always associated with tags
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   822
#      from the current $HTML.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   823
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   824
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   825
sub read_expanded_file {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   826
    my($fname, $URL, $parents)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   827
    my($HTML, $errlist, $iscgi) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   828
    my($isshtml) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   829
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   830
    $parents->{$fname}= 1 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   831
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   832
    $HTML= &read_file($fname) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   833
    return(undef) unless defined($HTML) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   834
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   835
    foreach (@SHTML_EXTENSIONS) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   836
        $isshtml= 1, last if ($fname=~ /\Q$_\E$/i) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   837
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   838
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   839
    if ($isshtml) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   840
        $errlist= [] ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   841
        $iscgi= ($HTML=~ /<!--#exec\b/i) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   842
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   843
        $HTML=~ s{(<!--#include\b.*?>)} {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   844
            do {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   845
                my($childfname, $childURL) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   846
                my($childHTML, $childerrlist, $childiscgi) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   847
                my($tagst) = $1 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   848
                my($tag)= &parse_tag($tagst) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   849
              GET_CHILD: {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   850
                  if (length($tag->{'attrib'}{'virtual'})) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   851
                      $URL= &filename_to_url($fname) unless length($URL) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   852
                      $childURL= 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   853
                          &absolute_url($tag->{'attrib'}{'virtual'}, $URL) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   854
                      ($childfname)= &url_to_filename($childURL) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   855
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   856
                      # If it's a CGI, don't follow it, but no error either
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   857
                      if (&is_cgi($childfname, $childURL)) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   858
                          $iscgi= 1 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   859
                          last GET_CHILD ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   860
                      }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   861
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   862
                  } elsif (length($tag->{'attrib'}{'file'})) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   863
                      $childfname= $tag->{'attrib'}{'file'} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   864
                      if ($childfname=~ m#^(/|~)#) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   865
                          push(@$errlist, [ {'path' => $childfname, 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   866
                                             'tag' => $tagst, 'errmsg' =>
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   867
                                  'Absolute paths are not allowed in '
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   868
                                . '<!--#include file="..." -->.'}]);
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   869
                          last GET_CHILD ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   870
                      }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   871
                      if ($childfname=~ m#\.\.(/|$)#) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   872
                          push(@$errlist, [ {'path' => $childfname, 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   873
                                             'tag' => $tagst, 'errmsg' =>
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   874
                                  'Paths can not contain "../" in '
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   875
                                . '<!--#include file="..." -->.'}]);
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   876
                          last GET_CHILD ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   877
                      }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   878
                      $childfname= ($fname=~ m#(.*/)#)[0] . $childfname ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   879
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   880
                  } else {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   881
                      push(@$errlist, [ {'path' => '',
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   882
                                         'tag' => $tagst, 'errmsg' =>
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   883
                              'Tag must contain either the "file" or '
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   884
                            . '"virtual" attribute.'}]);
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   885
                      last GET_CHILD ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   886
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   887
                  }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   888
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   889
                  # canonicalize filename for %$parents
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   890
                  1 while $childfname=~ s#/\.(/|$)#/# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   891
                  1 while $childfname=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   892
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   893
                  # Guarantee that file exists, is regular, and is readable
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   894
                  unless (-e $childfname) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   895
                      push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   896
                                        'errmsg' => 'File not found'} ] ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   897
                      last GET_CHILD ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   898
                  }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   899
                  unless (-f $childfname) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   900
                      push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   901
                                        'errmsg' => 'File is not a regular'
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   902
                                            . ' file.' } ] ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   903
                      last GET_CHILD ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   904
                  }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   905
                  unless (-r $childfname) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   906
                      push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   907
                                        'errmsg' => 'File is not readable by'
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   908
                                            . ' current user.' } ] ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   909
                      last GET_CHILD ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   910
                  }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   911
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   912
                  # Guard against include loops
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   913
                  if ($parents->{$childfname}) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   914
                      push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   915
                                        'errmsg' => 'An "include" loop exists'
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   916
                                            . ' involving this file.' } ] ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   917
                      last GET_CHILD ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   918
                  }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   919
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   920
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   921
                  # Get the included file, with any error data
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   922
                  ($childHTML, $childerrlist, $childiscgi)= 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   923
                      &read_expanded_file($childfname, $childURL, $parents) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   924
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   925
                  # Log if there was any error reading the file
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   926
                  push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   927
                                    'errmsg' => "Can't read file: $!." } ] )
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   928
                      unless defined($childHTML) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   929
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   930
                  # Add any errors to the current (parent) error list
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   931
                  foreach my $error (@$childerrlist) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   932
                      unshift(@$error, 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   933
                              { 'path' => $childfname, 'tag' => $tagst } ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   934
                  }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   935
                  push(@$errlist, @$childerrlist) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   936
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   937
                  # Parent is a CGI if any of its children is a CGI
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   938
                  $iscgi||= $childiscgi ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   939
            
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   940
              }   # GET_CHILD
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   941
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   942
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   943
              $childHTML ;   # final value to replace in main s/// construct
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   944
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   945
          }   # do {}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   946
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   947
        }gie ;   # $HTML=~ s{} {}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   948
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   949
    } # if ($isshtml)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   950
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   951
    delete $parents->{$fname} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   952
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   953
    return($HTML, $errlist, $iscgi) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   954
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   955
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   956
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   957
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   958
# Returns the contents of the named file, or undef on error.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   959
sub read_file {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   960
    my($fname)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   961
    local(*F, $/) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   962
 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   963
    undef $/ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   964
    open(F, "<$fname") || return undef ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   965
    my($ret)= <F> ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   966
    close(F) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   967
    
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   968
    return $ret ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   969
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   970
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   971
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   972
# Try to get the given URL with the given HTTP method, and return the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   973
#   status line, headers, and body.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   974
# Set $url->{'status'} accordingly, and set $url->{'ishtml'} accordingly
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   975
#   if Content-Type: header is returned.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   976
# This is specific to this program, and calls the more general &get_url().
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   977
# This could be slightly more efficient if 302 or 303 was handled in the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   978
#   calling routine, where it could take advantage of a new URL being local.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   979
sub load_url_using_HTTP {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   980
    my($url, $method)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   981
    my($status_line, $headers, $body) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   982
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   983
    # We should not get here if $file_check is set
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   984
    die "mistakenly called load_url_using_HTTP($url->{'URL'})" if $file_check ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   985
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   986
    GETFILE: {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   987
        ($status_line, $headers, $body)=
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   988
            &get_url( ($url->{'location'} || $url->{'URL'}), $method) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   989
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   990
        # If HEAD failed (as on some servers), sigh and use GET
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   991
        ($status_line, $headers, $body)=
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   992
            &get_url( ($url->{'location'} || $url->{'URL'}), 'GET')
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   993
                unless length($status_line) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   994
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   995
        ($url->{'status'})=  $status_line=~ m#^HTTP/[\d.]+\s+(.*)# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   996
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   997
	# 2-27-00 JSM:  Allow old NCSA servers to not include the HTTP version.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   998
	if ($SUPPORT_NCSA_BUG and $url->{'status'} eq '') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
   999
	    ($url->{'status'})=  $status_line=~ m#^HTTP(?:/[\d.]+)?\s+(.*)# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1000
	}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1001
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1002
        # Redirect to new location if status is 302 or 303
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1003
        if ($url->{'status'}=~ /^(301|302|303)\b/) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1004
            ($url->{'location'})= $headers=~ m#^Location:[ \t]+(\S+)#im ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1005
            last GETFILE unless length($url->{'location'}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1006
            $url->{'location'}= 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1007
                &absolute_url($url->{'location'}, $url->{'URL'}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1008
            redo GETFILE
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1009
                if    ($url->{'status'}=~ /^(302|303)\b/)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1010
                   && (++$url->{'numredirects'} <= $MAX_REDIRECTS) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1011
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1012
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1013
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1014
    $url->{'numtries'}++ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1015
    $url->{'lasttried'}= time ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1016
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1017
    # If successful response included Content-Type:, set ishtml accordingly
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1018
    $url->{'ishtml'}= (lc($1) eq 'text/html') + 0
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1019
        if $url->{'status'}=~ /^2/
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1020
           and $headers=~ m#^content-type:[ \t]*(\S+)#im ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1021
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1022
    print STDERR "status: $status_line\n" if $debug ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1023
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1024
    return($status_line, $headers, $body) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1025
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1026
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1027
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1028
# Request the HTTP resource at the given absolute URL using the given method,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1029
#   and return the response status line, headers, and body.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1030
# jsm-- in the future, this should support downloading to a file, in case
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1031
#   the download is too large to fit in memory.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1032
sub get_url {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1033
    my($URL, $method)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1034
    my($host, $uri, $endhost) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1035
    my($S, $rin) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1036
    my($response, $status_line, $headers, $body, $status_code) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1037
    my($content_length) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1038
    $method= uc($method) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1039
    $method= 'GET' unless length($method) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1040
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1041
    ($host, $uri)= $URL=~ m#^http://([^/]*)(.*)$#i ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1042
    $uri= '/' unless length($uri) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1043
    $endhost= $host ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1044
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1045
    # use an HTTP proxy if $ENV{'http_proxy'} is set
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1046
    USEPROXY: {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1047
        last USEPROXY unless $host=~ /\./ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1048
        if (length($ENV{'http_proxy'})) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1049
            foreach (@NO_PROXY) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1050
                last USEPROXY if $host=~ /$_$/i ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1051
            }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1052
            ($host)= $ENV{'http_proxy'}=~ m#^(?:http://)?([^/]*)#i ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1053
            $uri= $URL ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1054
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1055
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1056
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1057
    # Open socket
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1058
    $S= IO::Socket::INET->new(PeerAddr => $host,  # may contain :port
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1059
                              PeerPort => 80,     # default if none in PeerAddr
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1060
                              Proto => 'tcp') ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1061
    return("HTTP/1.1 600 Can't create socket: $@") unless defined($S) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1062
    $S->autoflush() ;   # very important!!
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1063
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1064
    # Send HTTP 1.1 request
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1065
    print $S "$method $uri HTTP/1.1\015\012",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1066
             "Host: $endhost\015\012",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1067
             "Connection: close\015\012",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1068
             "User-agent: CheckLinks/$CL_VERSION\015\012",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1069
             "\015\012" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1070
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1071
    # Wait for socket response with select()
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1072
    vec($rin= '', fileno($S), 1)= 1 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1073
    select($rin, undef, undef, 60) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1074
        || return("HTTP/1.1 601 Connection timed out") ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1075
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1076
    local($/)= "\012" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1077
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1078
    # Handle "100 Continue" responses for HTTP 1.1: loop until non-1xx.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1079
    do {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1080
        $status_line= <$S> ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1081
        $status_line=~ s/\015?\012$// ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1082
        ($status_code)= $status_line=~ m#^HTTP/\d+\.\d+\s+(\d+)# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1083
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1084
        $headers= '' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1085
        while (<$S>) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1086
            last if /^\015?\012/ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1087
            $headers.= $_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1088
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1089
        $headers=~ s/\015?\012[ \t]+/ /g ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1090
    } until $status_code!~ /^1/ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1091
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1092
    # Body length is determined by HTTP 1.1 spec, section 4.4:  these
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1093
    #   certain conditions implying no body, then chunked encoding,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1094
    #   then Content-length: header, then server closing connection.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1095
    if ($method eq 'HEAD' or $status_code=~ /^(1|204\b|304\b)/) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1096
        $body= undef ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1097
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1098
    # else chunked encoding
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1099
    } elsif ($headers=~ /^transfer-encoding:[ \t]*chunked\b/im) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1100
        # 7-16-99:  Old code was only saving last chunk.  Fix using
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1101
        #   $this_chunk contributed by Mark Trotter.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1102
        my($this_chunk, $chunk_size, $readsofar, $thisread) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1103
        while ($chunk_size= hex(<$S>)) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1104
            $readsofar= 0 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1105
            while ($readsofar!=$chunk_size) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1106
                last unless $thisread=
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1107
                    read($S, $this_chunk, $chunk_size-$readsofar, $readsofar) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1108
                $readsofar+= $thisread ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1109
            }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1110
            return("HTTP/1.1 603 Incomplete chunked response", $headers, $body)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1111
                if $readsofar!=$chunk_size ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1112
            $_= <$S> ;    # clear CRLF after chunk
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1113
            $body.= $this_chunk ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1114
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1115
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1116
        # Read footers if they exist
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1117
        while (<$S>) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1118
            last if /^\015?\012/ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1119
            $headers.= $_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1120
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1121
        $headers=~ s/\015?\012[ \t]+/ /g ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1122
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1123
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1124
    # else body length given in Content-length:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1125
    } elsif (($content_length)= $headers=~ /^content-length:[ \t]*(\d+)/im) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1126
        my($readsofar, $thisread) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1127
        while ($readsofar!=$content_length) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1128
            last unless $thisread=
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1129
                read($S, $body, $content_length-$readsofar, $readsofar) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1130
            $readsofar+= $thisread ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1131
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1132
        return(sprintf("HTTP/1.1 602 Incomplete response (%s of %s bytes)",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1133
                       $readsofar+0, $content_length),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1134
               $headers, $body)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1135
            if $readsofar!=$content_length ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1136
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1137
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1138
    # else body is entire socket output
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1139
    } else {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1140
        local($/)= undef ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1141
        $body= <$S> ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1142
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1143
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1144
    close($S) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1145
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1146
    return($status_line, $headers, $body) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1147
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1148
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1149
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1150
#----- URL-parsing routines -------------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1151
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1152
# The routines parse_url(), unparse_url(), and absolute_url() are based on
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1153
#   different sections in the Internet Draft "Uniform Resource Identifiers
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1154
#   (URI): Generic Syntax and Semantics", 11-18-97, by Berners-Lee,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1155
#   Fielding, and Masinter, filename draft-fielding-uri-syntax-01.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1156
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1157
# Parse a URL into its components, according to URI draft, sections 4.3, 4.4.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1158
# This regular expression is straight from Appendix B, modified to use Perl 5.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1159
# Returns scheme, site, path, query, and fragment.  All but path may have
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1160
#   the undefined value.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1161
sub parse_url {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1162
    my($URL)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1163
    my($scheme, $site, $path, $query, $fragment)=
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1164
        ($URL=~ m{^(?:    ([^:/?\#]+):)?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1165
                   (?: // ([^/?\#]*))?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1166
                          ([^?\#]*)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1167
                   (?: \? ([^\#]*))?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1168
                   (?: \# (.*))?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1169
                 }x
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1170
        ) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1171
        
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1172
        
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1173
    # Un-URL-encode the path, to equivalate things like %7E --> ~
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1174
    # Note that in some situations, this may cause problems with URLs that
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1175
    #   contain the % character: if the unescaped URL is then used in 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1176
    #   relative URL calculation, it may be unescaped again (rare).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1177
    $path=~ s/\+/ /g ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1178
    $path=~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1179
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1180
    # Note that in HTTP, the presence of a host implies a path beginning with
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1181
    #   '/', so $path should be '/' for URLs like "http://www.somehost.com"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1182
    $path= '/' if !length($path) && length($site) && lc($scheme) eq 'http' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1183
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1184
    return($scheme, $site, $path, $query, $fragment) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1185
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1186
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1187
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1188
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1189
# Returns a full URL string, given its components
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1190
# The full procedure is described in the URI draft, section 5.2, step 7.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1191
sub unparse_url {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1192
    my($scheme, $site, $path, $query, $fragment)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1193
    my($URL) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1194
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1195
    $URL= "$scheme:"    if defined($scheme) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1196
    $URL.= "//$site"    if defined($site) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1197
    $URL.= $path ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1198
    $URL.= "?$query"    if defined($query) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1199
    $URL.= "#$fragment" if defined($fragment) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1200
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1201
    return $URL ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1202
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1203
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1204
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1205
# Returns a canonicalized absolute URL, given a relative URL and a base URL.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1206
# The full procedure is described in the URI draft, section 5.2.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1207
# Note that a relative URI of "#fragment" should be resolved to "the current
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1208
#   document", not to an absolute URL.  This presents a quandary for this
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1209
#   routine:  should it always return an absolute URL, thus violating the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1210
#   spec, or should it not always return an absolute URL, thus requiring any
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1211
#   caller to check for this special case?  This routine leaves that up to
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1212
#   the caller, with $return_rel_fragment-- if set, stick to the spec;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1213
#   otherwise, always return an absolute URL.  See section G.4 of the draft.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1214
# Note that the pathname reduction in steps 6.c-f messes up any PATH_INFO
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1215
#   that has ./ or ../ in it, which may be a bug in the spec.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1216
sub absolute_url {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1217
    my($relurl, $baseurl, $return_rel_fragment)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1218
    my(@relurl, @baseurl) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1219
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1220
    # parse_url() returns scheme, site, path, query, fragment
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1221
    @relurl= &parse_url($relurl) ;      # Step 1
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1222
    @baseurl= &parse_url($baseurl) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1223
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1224
    COMBINE: {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1225
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1226
        # Step 2
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1227
        # See note above about $return_rel_fragment
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1228
        if (  $relurl[2] eq '' && 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1229
              !defined($relurl[0]) &&
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1230
              !defined($relurl[1]) &&
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1231
              !defined($relurl[3]) ) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1232
            @relurl[0..3]= @baseurl[0..3] ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1233
            return $relurl if $return_rel_fragment ;   # see note above
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1234
            last COMBINE ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1235
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1236
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1237
        last COMBINE if defined($relurl[0]) ;    # Step 3
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1238
        $relurl[0]= $baseurl[0] ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1239
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1240
        last COMBINE if defined($relurl[1]) ;    # Step 4
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1241
        $relurl[1]= $baseurl[1] ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1242
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1243
        last COMBINE if $relurl[2]=~ m#^/# ;     # Step 5
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1244
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1245
        # Step 6-- resolve relative path
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1246
        my($path)= $baseurl[2]=~ m#^(.*/)# ;     # Step 6.a
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1247
        $relurl[2]= $path . $relurl[2] ;         # Step 6.b
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1248
        
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1249
    } # COMBINE
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1250
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1251
    # Put the remaining steps outside of the block to canonicalize the path.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1252
    # Arguably, this is not allowed.  To avoid such arguments at the expense of
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1253
    #   path canonicalization, put steps 6.c-f back in the COMBINE block.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1254
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1255
    1 while $relurl[2]=~ s#(^|/)\./#$1# ;    # Step 6.c
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1256
    $relurl[2]=~ s#(^|/)\.$#$1# ;            # Step 6.d
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1257
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1258
    # Step 6.e
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1259
    my($oldpath) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1260
    while ($relurl[2]=~ s#(([^/]+)/\.\./)# ($2 eq '..')  ? $1  : '' #ge) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1261
        last if ($relurl[2] eq $oldpath) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1262
        $oldpath= $relurl[2] ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1263
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1264
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1265
    # Step 6.f
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1266
    $relurl[2]=~ s#(([^/]+)/\.\.$)# ($2 eq '..')  ? $1  : '' #ge ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1267
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1268
    # Step 6.g: allow leading ".." segments to remain in path
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1269
    # Step 6.h: relurl[2] is already the buffer string
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1270
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1271
    # To canonicalize further, lowercase the hostname (is this valid for all
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1272
    #   schemes?)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1273
    $relurl[1]= lc($relurl[1]) if defined($relurl[1]) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1274
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1275
    return &unparse_url(@relurl) ;                  # Step 7
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1276
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1277
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1278
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1279
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1280
# Convert a local URL into a canonicalized absolute path, or undef if
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1281
#   not on this host or other error.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1282
# Result should only be used as filename.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1283
# Supports UserDir (e.g. public_html) for "/~username/path/file" URLs.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1284
# Supports Alias, AliasMatch, ScriptAlias, and ScriptAliasMatch from srm.conf
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1285
#   (but note use of Perl regex's instead of standard regex's).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1286
# Inserts index.html, etc. (from @DIRECTORY_INDEX) if result is a directory,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1287
#   but just return directory name (ending in '/') if none of those exists.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1288
# Removes PATH_INFO, if any, from filename.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1289
# Directory names are always returned with trailing slash (which would not
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1290
#   be appropriate if PATH_INFO was to be retained).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1291
# While this routines makes some tests (e.g. if the file is a directory),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1292
#   it does not verify that file at the resulting $filename exists.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1293
# Note that not all URLs point to files, so this routine is not always
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1294
#   appropriate.  In this program, the result from this routine is only
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1295
#   used when we know the URL is not a CGI script (and is therefore a file),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1296
#   except in &is_cgi() itself, which tests if a file is a CGI script.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1297
#   If it weren't for &is_cgi(), we could ignore cases when the URL isn't
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1298
#   a file.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1299
# 12-1-99 JSM:  Changed to also return "redirected" location, in case URL
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1300
#   is a directory but not ending in a slash, so relative URLs will resolve
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1301
#   correctly against the redirected URL.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1302
sub url_to_filename {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1303
    my($URL)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1304
    my($URLpath, $path, $location, $docroot, $user) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1305
    return undef unless $URL=~ m#^http://\Q$LOCAL_HOST\E/#io ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1306
    $URLpath= (&parse_url($URL))[2] ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1307
    die "couldn't get path from [$URL]" unless length($URLpath) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1308
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1309
    # Note minor security hole:  if this script is run setuid, then any 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1310
    #   file on the system could be read by using an ALIAS to point to the 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1311
    #   file.  Note also that if a $URLpath such as "/alias/dir/../.." is
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1312
    #   passed to this routine, the alias will be substituted BEFORE the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1313
    #   ".." path segments are traversed.  A case like this probably a
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1314
    #   mistake in the URL anyway.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1315
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1316
    # Make no more than one alias substitution-- is there a precedence order?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1317
    # Note that %(.*)_MATCH use Perl regex's, not standard regex's.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1318
    # 3-29-99 JSM:  These all alias to actual directory, not to a resulting
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1319
    #   URL, so no further conversion should be done if one of these matches.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1320
    # 3-29-99 JSM:  Changed ALIAS_MATCH and SCRIPT_ALIAS_MATCH blocks to
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1321
    #   allow $1-type substitution in targets; MUST TEST!
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1322
    ALIAS: {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1323
        foreach (keys %ALIAS) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1324
            { $path= $URLpath, last ALIAS 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1325
                  if $URLpath=~ s/^\Q$_\E/$ALIAS{$_}/ }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1326
        foreach (keys %ALIAS_MATCH) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1327
            { $path= $URLpath, last ALIAS 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1328
                  if eval "\$URLpath=~ s/^\$_/$ALIAS_MATCH{$_}/" }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1329
        foreach (keys %SCRIPT_ALIAS) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1330
            { $path= $URLpath, last ALIAS 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1331
                  if $URLpath=~ s/^\Q$_\E/$SCRIPT_ALIAS{$_}/ }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1332
        foreach (keys %SCRIPT_ALIAS_MATCH) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1333
            { $path= $URLpath, last ALIAS 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1334
                  if eval "\$URLpath=~ s/^\$_/$SCRIPT_ALIAS_MATCH{$_}/" }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1335
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1336
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1337
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1338
    # If $path has been set in above ALIAS block, no further conversion is
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1339
    #   needed.  
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1340
    if ($path eq '') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1341
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1342
        # Must check for ^/.. before PATH_INFO check, in case $URL's path
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1343
        #   is e.g. '/../conf/access.conf'
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1344
        return undef if $URLpath=~ m#^/\.\.(/|$)# ;   # ^/.. is not allowed
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1345
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1346
        # Set $docroot and $path for this file, based on the URL (contains '~' ?)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1347
        if (length($USER_DIR) and ($user,$path)= $URLpath=~ m#^/~([^/]+)(.*)# ) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1348
            &read_home_dirs unless %home_dir ;   # only read when needed
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1349
            return undef unless length($home_dir{$user}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1350
            $docroot= "$home_dir{$user}/$USER_DIR" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1351
            $path= '/' unless length($path) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1352
        } else {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1353
            # If we have no $DOCUMENT_ROOT, we can't handle URLs without ~.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1354
            return undef unless $DOC_ROOT_EXISTS ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1355
            $docroot= $DOCUMENT_ROOT ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1356
            $path= $URLpath ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1357
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1358
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1359
        # Handle PATH_INFO: remove path segments until an existing file is named.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1360
        # Note that directories cannot have PATH_INFO after them.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1361
        unless (-e "$docroot$path") {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1362
            for (my($path2)= $path ; $path2=~ m#/# ; $path2=~ s#/[^/]*$##) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1363
                 last if -d "$docroot$path2" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1364
                 $path= $path2, last if -e "$docroot$path2" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1365
             }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1366
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1367
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1368
        # canonicalize path, and recheck for ^/.. (handles an obscure error,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1369
        #   when $URL's path is e.g. '/a/b/../../..'; but must canonicalize
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1370
        #   after PATH_INFO check, in case path is e.g. '/a/b.cgi/../../..').
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1371
        1 while $path=~ s#/\.(/|$)#/# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1372
        1 while $path=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1373
        return undef if $path=~ m#^/\.\.(/|$)# ;   # ^/.. is not allowed
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1374
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1375
        $path= "$docroot$path" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1376
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1377
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1378
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1379
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1380
    # Add "index.html", etc. if appropriate
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1381
    if (-d $path) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1382
        $path.= '/' unless $path=~ m#/$# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1383
        # 12-1-99 JSM: set "redirected" location also
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1384
        $location= "$URL/" unless $URL=~ m#/$# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1385
        foreach (@DIRECTORY_INDEX) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1386
            $path.= $_, last if -f "$path$_" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1387
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1388
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1389
    return ($path, $location) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1390
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1391
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1392
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1393
# Convert a local (possibly relative) pathname into a canonicalized URL.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1394
# If filename is relative and no $basepath is given, assume it's in the 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1395
#   current directory.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1396
# Supports UserDir (e.g. public_html) for "/~username/path/file" URLs.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1397
# Each path segment is checked to see if it's the same as $DOCUMENT_ROOT,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1398
#   by comparing inodes.  When a match is found, it's cut off the front,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1399
#   and an absolute URL is constructed.  If $DOCUMENT_ROOT is never matched,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1400
#   then $USER_DIR is scanned for.  If that doesn't match (i.e. the file 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1401
#   is not served to the Web), undef is returned.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1402
# Note that $DOC_ROOT_DEV and $DOC_ROOT_INODE are set at the start of the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1403
#   program for efficiency, but are an integral part of this routine.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1404
# %ALIAS is supported using %UN_ALIAS, as best as possible.  See next note
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1405
#   on avoiding use of this routine.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1406
# This is currently only used when parsing command-line filenames, and when
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1407
#   an <!--#include file="..." --> includes an <!--#include virtual="..." -->
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1408
#   (which may be an error anyway).  Thus, it can be avoided if needed, such
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1409
#   as when complex aliasing makes results ambiguous.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1410
# jsm-- should this add/remove @DIRECTORY_INDEX, to avoid some duplication?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1411
# 3-29-99 JSM:  Changed UNALIAS handling-- if it's unaliased, then no other
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1412
#   conversion is necessary.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1413
sub filename_to_url {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1414
    my($path, $basepath)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1415
    my($URLpath) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1416
    unless ($path=~ m#^/#) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1417
        $basepath= $CWD unless length($basepath) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1418
        $basepath.= '/' if -d $basepath && $basepath!~ m#/$# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1419
        $basepath=~ s#/[^/]*$#/# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1420
        $path= "$basepath$path"  ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1421
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1422
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1423
    # canonicalize filename by removing ./ and ../ where appropriate
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1424
    1 while $path=~ s#/\.(/|$)#/# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1425
    1 while $path=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1426
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1427
    # canonicalize directory to include final /
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1428
    $path.= '/' if -d $path && $path!~ m#/$# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1429
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1430
    # First, if path can be unaliased, return that.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1431
    # (Added 3-29-99 by JSM.)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1432
    foreach (keys %UN_ALIAS) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1433
        { $URLpath= $path, last if $path=~ s/^\Q$_\E/$UN_ALIAS{$_}/ }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1434
    return "http://\L$LOCAL_HOST\E$URLpath" if $URLpath ne '' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1435
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1436
    # Then, check if file is under $DOCUMENT_ROOT tree, and convert if so.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1437
    if ($DOC_ROOT_EXISTS) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1438
        my($doc_root)= $path ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1439
        while ($doc_root=~ s#/[^/]*$##) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1440
            my($dev,$inode)= stat("$doc_root/.") ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1441
            if ( ($dev==$DOC_ROOT_DEV) && ($inode==$DOC_ROOT_INODE) ) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1442
                $path=~ s/^$doc_root// ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1443
#                foreach (keys %UN_ALIAS) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1444
#                    { last if $path=~ s/^\Q$_\E/$UN_ALIAS{$_}/ }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1445
                return "http://\L$LOCAL_HOST\E$path" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1446
            }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1447
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1448
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1449
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1450
    # Handle possible case of "~username/$USER_DIR/$path"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1451
    # I don't think %ALIAS applies here, does it?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1452
    # This misses some when $HOME/$USER_DIR points through a symbolic link,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1453
    #   and $CWD isn't set to match %dir_to_user.  Work around by avoiding
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1454
    #   this routine, e.g. using only URLs on command line.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1455
    if (length($USER_DIR)) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1456
        if ($path=~ m#^(.*?)/$USER_DIR(/.*)# ) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1457
            # First, see if path is in %dir_to_user
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1458
            &read_home_dirs unless %dir_to_user ;   # only read when needed
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1459
            return "http://\L$LOCAL_HOST\E/~$dir_to_user{$1}$2"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1460
                if length($dir_to_user{$1}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1461
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1462
            # If not, then we must check inodes to equivalate directories
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1463
            &read_inode_to_user unless %inode_to_user ; # only read when needed
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1464
            my($dev,$inode)= stat("$1/.") ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1465
            return "http://\L$LOCAL_HOST\E/~$inode_to_user{$dev}{$inode}$2"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1466
                if length($inode_to_user{$dev}{$inode}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1467
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1468
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1469
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1470
    return undef ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1471
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1472
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1473
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1474
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1475
# Reads all users' home directory into %home_dir, from /etc/passwd.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1476
# Also creates %dir_to_user, which is faster than %inode_to_user (below).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1477
# Only used when $USER_DIR is used, for "/~username/path/file" URLs.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1478
# 2-27-00 JSM:  Changed to use getpwent, instead of reading /etc/passwd.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1479
sub read_home_dirs {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1480
    my($user, $homedir) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1481
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1482
    setpwent ;   # to rewind, in case getpwent has already been used
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1483
    while ( ($user, $homedir)= (getpwent)[0,7] ) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1484
        $home_dir{$user}= $homedir ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1485
        $dir_to_user{$homedir}= $user
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1486
            unless $dir_to_user{$homedir} ne '' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1487
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1488
    endpwent ;   # clean way to end getpwent processing
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1489
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1490
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1491
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1492
# Reads home directory inode information into %inode_to_user, from /etc/passwd.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1493
# Because this is time-consuming, it is only called if needed, and only once.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1494
# Only used when $USER_DIR is used, for "/~username/path/file" URLs.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1495
# On SPARCstation-10 with 3000 /etc/passwd records, this takes ~2 seconds.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1496
# 2-27-00 JSM:  Changed to use already-existing %home_dir, instead of reading
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1497
#   /etc/passwd again.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1498
sub read_inode_to_user {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1499
    my($user, $homedir) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1500
    my($dev, $inode) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1501
    &read_home_dirs unless %home_dir ;   # only read when needed
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1502
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1503
    while ( ($user, $homedir)= each %home_dir ) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1504
        ($dev,$inode)= stat("$homedir/.") ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1505
        $inode_to_user{$dev}{$inode}= $user 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1506
            unless $inode_to_user{$dev}{$inode} ne '' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1507
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1508
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1509
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1510
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1511
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1512
#----- Extracting URLs from HTML ------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1513
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1514
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1515
# Parse an SGML tag, and return a hash structure with a "name" scalar and
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1516
#   an "attrib" hash.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1517
# Parses first tag in string, ignoring all surrounding text.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1518
# Results are canonicalized to lower case wherever case-insensitive.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1519
sub parse_tag {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1520
    my($tag)= @_ ;       # will be mangled
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1521
    my($tagname,%attrib) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1522
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1523
    # only parse first tag in string
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1524
    ($tag)= split(/>/, $tag) ;  # remove all after >
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1525
    $tag=~ s/^([^<]*<)?\s*// ;  # remove pre-<, <, and leading blanks
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1526
    ($tagname,$tag)= split(/\s+/, $tag, 2) ;        # split out tag name
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1527
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1528
    # Extract name/value (possibly quoted), lowercase name, set $attrib{}.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1529
    # If quoted, is delimited by quotes; if not, delimited by whitespace.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1530
    $attrib{lc($1)}= &HTMLunescape($+)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1531
        while ($tag=~ s/\s*(\w+)\s*=\s*(([^"']\S*)|"([^"]*)"?|'([^']*)'?)//) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1532
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1533
    # now, get remaining non-valued (boolean) attributes
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1534
    $tag=~ s/^\s*|\s*$//g ;             # skip leading/trailing blanks
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1535
    foreach (split(/\s+/, $tag)) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1536
        $_= lc($_) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1537
        $attrib{$_}= $_ ;   # booleans have values equal to their name
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1538
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1539
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1540
    return { 'name'   => lc($tagname), 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1541
             'attrib' => \%attrib       } ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1542
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1543
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1544
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1545
# Unescape any HTML character references and return resulting string.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1546
# Support entity character references in %e_to_ch (which is incomplete),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1547
#   plus "$#ddd;" and "&#xdd;" forms for values<256.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1548
# Note that not decoding a valid character is erroneous, in that a
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1549
#   subsequent re-escaping will not return the original string, because
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1550
#   of the ampersand.  Nonetheless, that's preferable to losing the data.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1551
#   Q:  Is there an appropriate general way to represent an unescaped string?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1552
sub HTMLunescape {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1553
    my($s)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1554
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1555
    # Try alpha, decimal, and hex representations, only substituting if valid
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1556
    $s=~ s/&(([a-zA-Z][a-zA-Z0-9.-]*);?|#([0-9]+);?|#[Xx]([0-9a-fA-F]+);?)/
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1557
              length($2)   ? ( defined($e_to_ch{$2}) ? $e_to_ch{$2}  : "&$1" )
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1558
            : length($3)   ? ( $3 < 256              ? chr($3)       : "&$1" )
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1559
            : length($4)   ? ( hex($4) < 256         ? chr(hex($4))  : "&$1" )
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1560
                           :   "&$1"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1561
          /ge ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1562
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1563
    return $s ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1564
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1565
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1566
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1567
# Given a block of HTML, extracts all URLs referenced in it, and adds them
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1568
#   to our data structures to be downloaded or checked (i.e. calls 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1569
#   &add_url()).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1570
# Note that %html_urls and %non_html_urls are set at the start of the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1571
#   program for efficiency, but are an integral part of this routine.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1572
# Currently, this extracts all <.*?> patterns, which may not be valid if
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1573
#   "<" or ">" characters are e.g. inside a <script> element.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1574
sub extract_urls {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1575
    my($HTML, $baseurl, $referer, $depth)= @_ ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1576
    my(@tags) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1577
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1578
    # Remove comments before extracting links, as pointed out by Tim Hunter.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1579
    $HTML=~ s/<!--.*?--.*?>//gs ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1580
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1581
    # We must look for <base> tag before all the work, so we must parse
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1582
    #   all tags first.  :(  Therefore, we save this large array of
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1583
    #   structures for efficiency, hoping we don't run out of memory.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1584
    my($i)= -1 ;  # to start at array element 0
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1585
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1586
    foreach ($HTML=~ /(<.*?>)/gs) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1587
        $tags[++$i]= &parse_tag($_) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1588
        $baseurl= $tags[$i]{'attrib'}{'href'}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1589
            if ($tags[$i]{'name'} eq 'base') 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1590
                and (length($tags[$i]{'attrib'}{'href'})) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1591
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1592
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1593
    # For each tag, call &add_url() for each URL in the tag
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1594
    foreach my $tag (@tags) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1595
        next if $tag->{'name'}=~ m#^/# ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1596
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1597
        # Handle the "regular" tag-attributes, in %html_urls and %non_html_urls
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1598
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1599
        foreach (@{$html_urls{$tag->{'name'}}}) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1600
            &add_url(&absolute_url($tag->{'attrib'}{$_}, $baseurl), 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1601
                     $referer, $depth) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1602
                if length($tag->{'attrib'}{$_}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1603
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1604
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1605
        foreach (@{$non_html_urls{$tag->{'name'}}}) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1606
            &add_url(&absolute_url($tag->{'attrib'}{$_}, $baseurl), 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1607
                     $referer, $depth, 0) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1608
                if length($tag->{'attrib'}{$_}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1609
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1610
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1611
        # Now handle each tag-attribute that needs special attention
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1612
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1613
        if ($tag->{'name'} eq 'object') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1614
            my($codebase)=
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1615
                &absolute_url($tag->{'attrib'}{'codebase'}, $baseurl) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1616
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1617
            &add_url(&absolute_url($tag->{'attrib'}{'data'}, $codebase),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1618
                     $referer, $depth)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1619
                if length($tag->{'attrib'}{'data'}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1620
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1621
            # There seems to be a contradiction between the HTML 4.0 spec
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1622
            #   section 13.3.3 and RFC 1808 section 4 step 2b regarding
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1623
            #   the URL resolution of e.g. classid="java:program.start".
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1624
            #   For now, we'll stick with the RFC 1808 method.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1625
            &add_url(&absolute_url($tag->{'attrib'}{'classid'}, $codebase),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1626
                     $referer, $depth, 0)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1627
                if length($tag->{'attrib'}{'classid'}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1628
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1629
            # <object> tag's "archive" attribute is a space-separated list
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1630
            foreach (split(/\s+/, $tag->{'attrib'}{'archive'})) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1631
                &add_url(&absolute_url($_, $codebase), $referer, $depth, 0) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1632
            }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1633
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1634
        } elsif ($tag->{'name'} eq 'head') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1635
            # "profile" attribute is a space-separated list
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1636
            foreach (split(/\s+/, $tag->{'attrib'}{'profile'})) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1637
                &add_url(&absolute_url($_, $baseurl), $referer, $depth, 0) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1638
            }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1639
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1640
        } elsif ($tag->{'name'} eq 'applet') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1641
            my($codebase)=
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1642
                &absolute_url($tag->{'attrib'}{'codebase'}, $baseurl) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1643
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1644
            &add_url(&absolute_url($tag->{'attrib'}{'code'}, $codebase),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1645
                     $referer, $depth, 0)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1646
                if length($tag->{'attrib'}{'code'}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1647
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1648
            &add_url(&absolute_url($tag->{'attrib'}{'object'}, $codebase),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1649
                     $referer, $depth, 0)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1650
                if length($tag->{'attrib'}{'object'}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1651
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1652
            # <applet> tag's "archive" attribute is a comma-separated list
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1653
            foreach (split(/\s*,\s*/, $tag->{'attrib'}{'archive'})) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1654
                &add_url(&absolute_url($_, $codebase), $referer, $depth, 0) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1655
            }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1656
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1657
        # Check form script for existence only, but don't follow hyperlinks.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1658
        # Handles the unlikely case when a CGI is referred to by both a
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1659
        #   <form action=___.cgi> tag and an <a href=___.cgi> tag: If a CGI
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1660
        #   is only pointed to by <form action>, then don't follow the link
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1661
        #   (i.e. set $url->{'dontfollow'} to verify, not load).  If a CGI
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1662
        #   is called by at least one <a href> tag, then do follow the link.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1663
        } elsif ($tag->{'name'} eq 'form') {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1664
            &add_url(&absolute_url($tag->{'attrib'}{'action'}, $baseurl),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1665
                     $referer, $depth, undef, 1, 1)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1666
                if length($tag->{'attrib'}{'action'}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1667
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1668
        }   # if ($tag->{'name'} eq '...')
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1669
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1670
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1671
    }   # foreach $tag (@tags)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1672
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1673
}   # &extract_urls()
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1674
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1675
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1676
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1677
#----- output routines ------------------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1678
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1679
# Generate final report
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1680
sub make_report {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1681
    if ($verbose_report) { &make_verbose_report }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1682
    else                 { &make_short_report }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1683
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1684
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1685
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1686
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1687
# Generate a verbose report of all URLs that have been checked
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1688
sub make_verbose_report {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1689
    my($rootlist)= join("\n    ", @ARGV) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1690
    my($numurls)= scalar keys %url ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1691
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1692
    print <<EOH ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1693
======================================================================
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1694
Report of invalid links on $LOCAL_HOST, 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1695
recursively followed starting with the files/URLs:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1696
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1697
    $rootlist
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1698
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1699
EOH
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1700
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1701
    if (@INCLUDE_PATTERNS) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1702
        my($includelist)= join("\n    ", @INCLUDE_PATTERNS) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1703
        print <<EOH ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1704
Only including URLs matching at least one of the following patterns:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1705
    $includelist
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1706
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1707
EOH
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1708
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1709
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1710
    if (@EXCLUDE_PATTERNS) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1711
        my($excludelist)= join("\n    ", @EXCLUDE_PATTERNS) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1712
        print <<EOH ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1713
Excluding URLs matching any of the following patterns:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1714
    $excludelist
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1715
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1716
EOH
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1717
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1718
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1719
    print "Total URLs checked: $numurls\n\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1720
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1721
    print( "Only reporting response codes beginning with: ", 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1722
           join(", ", @INCLUDE_STATUS), "\n" )
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1723
        if @INCLUDE_STATUS ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1724
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1725
    print( "Excluding response codes beginning with: ",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1726
           join(", ", @EXCLUDE_STATUS), "\n" )
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1727
        if @EXCLUDE_STATUS ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1728
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1729
    print "Maximum traversal depth: $max_depth\n" if length($max_depth) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1730
    print "Only local files were checked; no CGI scripts were invoked.\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1731
        if $file_check ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1732
    print "Local URLs were read from the filesystem where possible.\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1733
        unless $full_http_check ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1734
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1735
    print "\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1736
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1737
    # Only report status errors if there are any.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1738
    # Using grep()s is slightly inefficient, but a *lot* cleaner.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1739
    my($has_statuserrs) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1740
    foreach my $URL (sort keys %url) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1741
        next if @INCLUDE_STATUS &&
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1742
                !grep($url{$URL}{'status'}=~ /^$_/, @INCLUDE_STATUS) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1743
        next if grep($url{$URL}{'status'}=~ /^$_/, @EXCLUDE_STATUS) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1744
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1745
        $has_statuserrs=1, last ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1746
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1747
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1748
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1749
    if ($has_statuserrs) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1750
        print <<EOH ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1751
======================================================================
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1752
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1753
RESULTS FROM ALL URLS WITH SELECTED RESPONSE STATUS CODES
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1754
------------------------------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1755
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1756
EOH
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1757
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1758
        foreach (sort keys %url) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1759
            my($u)= $url{$_} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1760
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1761
            next if @INCLUDE_STATUS &&
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1762
                    !grep($u->{'status'}=~ /^$_/, @INCLUDE_STATUS) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1763
            next if grep($u->{'status'}=~ /^$_/, @EXCLUDE_STATUS) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1764
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1765
            print "$u->{'URL'}\n", '-' x 50, "\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1766
            print "Status:   $u->{'status'}\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1767
            print "Moved to: $u->{'location'}\n" if $u->{'location'} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1768
            print "Depth:    $u->{'depth'}\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1769
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1770
            for (my($URL)= $u->{'referer'}, my($tab)= '  ' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1771
                 length($url{$URL}) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1772
                 $URL= $url{$URL}{'referer'}, $tab.= '  ' )
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1773
            {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1774
                print "${tab}referred by $url{$URL}{'URL'}\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1775
                die "PROGRAM ERROR: apparent infinite referer loop.\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1776
                    if length($tab)>200 ;      # simple-minded sanity check
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1777
            }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1778
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1779
            print "\n\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1780
        }  # URL
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1781
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1782
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1783
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1784
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1785
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1786
    # Only report SSI errors if there are any
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1787
    my($has_ssierrs) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1788
    foreach (sort keys %url) { 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1789
        $has_ssierrs=1, last if @{$url{$_}{'ssierrs'}} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1790
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1791
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1792
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1793
    if ($has_ssierrs) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1794
        print <<EOH ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1795
======================================================================
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1796
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1797
PROBLEMS WITH SERVER-SIDE INCLUDE (AKA SHTML) DIRECTIVES
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1798
------------------------------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1799
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1800
EOH
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1801
        foreach (sort keys %url) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1802
            my($u)= $url{$_} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1803
            if (@{$u->{'ssierrs'}}) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1804
                print "$u->{'URL'}\n", '-' x 50, "\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1805
                printf "Total %s SSI Errors:\n",  $#{$u->{'ssierrs'}}+1 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1806
                foreach my $i (0..$#{$u->{'ssierrs'}}) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1807
                    printf "  %s) ", $i+1 ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1808
                    my($tab)= ' ' x (4+length($i+1)) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1809
                    my($tab2) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1810
                    foreach my $level (@{$u->{'ssierrs'}[$i]}) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1811
                        print "${tab2}file:   $level->{'path'}\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1812
                        print "${tab}in tag: $level->{'tag'}\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1813
                        print "${tab}error:  $level->{'errmsg'}\n" 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1814
                            if $level->{'errmsg'} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1815
                        $tab.= '  ' ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1816
                        $tab2= $tab ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1817
                    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1818
                }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1819
                print "\n\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1820
            }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1821
        }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1822
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1823
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1824
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1825
    unless ($has_statuserrs or $has_ssierrs) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1826
        print <<EOH ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1827
======================================================================
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1828
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1829
                    <<     NO ERRORS FOUND     >>
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1830
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1831
EOH
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1832
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1833
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1834
    print '=' x 70, "\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1835
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1836
} # &make_verbose_report()
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1837
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1838
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1839
# Generate a one-line-per-URL report
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1840
sub make_short_report {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1841
    my($numurls)= scalar keys %url ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1842
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1843
    print "Total $numurls URLs checked\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1844
    URL: foreach my $URL (sort keys %url) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1845
        next if @INCLUDE_STATUS &&
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1846
                !grep($url{$URL}{'status'}=~ /^$_/, @INCLUDE_STATUS) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1847
        next if grep($url{$URL}{'status'}=~ /^$_/, @EXCLUDE_STATUS) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1848
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1849
        print "$url{$URL}{'URL'}\t$url{$URL}{'status'}\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1850
    }  # URL
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1851
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1852
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1853
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1854
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1855
# Print a summary of usage and exit
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1856
sub usage {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1857
    print <<EOF ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1858
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1859
Checklinks $CL_VERSION
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1860
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1861
To recursively check all HTML links on the local site, enter:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1862
    $0  <options>  [ start-file | start-URL ] ...
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1863
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1864
Options include:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1865
  -v                    Generate full (verbose) report, including full
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1866
                          referral information and detailed SSI error
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1867
                          reporting.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1868
  -I <include-pattern>  Only check URLs matching <include-pattern>.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1869
  -X <exclude-pattern>  Don't check URLs matching <exclude-pattern>.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1870
  -i <include-status>   Only report URLs whose response code starts with
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1871
                          the pattern <include-status>.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1872
  -x <exclude-status>   Don't report URLs whose response code starts with
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1873
                          the pattern <exclude-status>.  Default is to
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1874
                          exclude "200" responses only (i.e. "-x 200").
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1875
  -d <max-depth>        Traverse links no deeper than <max-depth>.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1876
  -f                    "File mode"-- only read files from the filesystem;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1877
                          do not go through the HTTP server at all.  This
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1878
                          will skip all URLs that point to CGI scripts.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1879
  -h                    "HTTP mode"-- use HTTP to check ALL URLs, even
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1880
                          if they could be read from the filesystem.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1881
                          Incompatible with "-f" option.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1882
  -c <config-file>      Read appropriate configuration parameters from
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1883
                          <config-file>, typically srm.conf.  Use '-' to
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1884
                          read from STDIN.  If a directory is named, use
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1885
                          "srm.conf" in that directory.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1886
  -q                    Print current configuration parameters.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1887
  -?                    Print this help message.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1888
  --                    End command-line option processing.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1889
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1890
Don't stack options like "-vf"; use "-v -f" instead.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1891
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1892
For -I, -X, -i, and -x:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1893
  Values are interpreted as Perl 5 regular expressions.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1894
  Use multiple options to build a list (e.g. "-I include1 -I include2").
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1895
  Use a value of '' to clear a list (e.g. -x '' means "report all responses",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1896
    "-x '' -x 401" means "report all but 401 responses").
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1897
  As a special case, an empty -I or -i list implies no include-restrictions.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1898
  If an item is in both the include and exclude list, it is excluded.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1899
  Note that -I and -X restrict which URLs are traversed into, so they may
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1900
    prune large areas of your Web structure.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1901
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1902
EOF
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1903
    exit ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1904
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1905
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1906
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1907
#----- debugging routines below ---------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1908
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1909
# Print current configuration settings
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1910
sub print_config {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1911
    print "\n----- OPTIONS SPECIFIC TO THIS EXECUTION -----------------------------\n\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1912
    
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1913
    print "Include only URLs containing one of the following patterns:\n",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1914
          ( map { "    $_\n" } @INCLUDE_PATTERNS ),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1915
          "\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1916
        if @INCLUDE_PATTERNS ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1917
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1918
    print "Exclude URLs containing one of the following patterns:\n",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1919
          (map { "    $_\n" } @EXCLUDE_PATTERNS ),
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1920
          "\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1921
        if @EXCLUDE_PATTERNS ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1922
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1923
    print "Only report response codes beginning with:  ",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1924
          join(", ", @INCLUDE_STATUS), "\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1925
        if @INCLUDE_STATUS ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1926
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1927
    print "Don't report response codes beginning with: ",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1928
          join(", ", @EXCLUDE_STATUS), "\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1929
        if @EXCLUDE_STATUS ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1930
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1931
    print "\nMaximum search depth:  $max_depth\n" if length($max_depth) ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1932
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1933
    print <<EOS ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1934
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1935
----- INSTALLATION PARAMETERS ----------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1936
    
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1937
Local Host:           $LOCAL_HOST
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1938
Document Root:        $DOCUMENT_ROOT
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1939
User Web Directory:   $USER_DIR
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1940
Default Filename(s):  @DIRECTORY_INDEX
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1941
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1942
File extensions indicating a CGI program:       @CGI_EXTENSIONS
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1943
File extensions indicating server-parsed HTML:  @SHTML_EXTENSIONS
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1944
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1945
EOS
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1946
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1947
    print "Directory Aliases:\n",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1948
          map { sprintf("    %15s ==> %s\n", $_, $ALIAS{$_}) }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1949
              sort keys %ALIAS, 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1950
          "\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1951
        if keys %ALIAS ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1952
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1953
    print "Directory Regular Expression Aliases:\n", 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1954
          map { sprintf("    %15s ==> %s\n", $_, $ALIAS_MATCH{$_}) }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1955
              sort keys %ALIAS_MATCH,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1956
          "\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1957
        if keys %ALIAS_MATCH ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1958
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1959
    print "Script Directory Aliases:\n", 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1960
          map { sprintf("    %15s ==> %s\n", $_, $SCRIPT_ALIAS{$_}) }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1961
              sort keys %SCRIPT_ALIAS,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1962
          "\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1963
        if keys %SCRIPT_ALIAS ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1964
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1965
    print "Script Directory Regular Expression Aliases:\n", 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1966
          map { sprintf("    %15s ==> %s\n", $_, $SCRIPT_ALIAS_MATCH{$_}) }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1967
              sort keys %SCRIPT_ALIAS_MATCH,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1968
          "\n"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1969
        if keys %SCRIPT_ALIAS_MATCH ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1970
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1971
    if ($ENV{'http_proxy'}) {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1972
        print "HTTP Proxy:  $ENV{'http_proxy'}\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1973
        print "Except to hosts ending in:  $ENV{'no_proxy'}\n" 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1974
            if $ENV{'no_proxy'} ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1975
        print "\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1976
    } else {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1977
        print "Not using any HTTP Proxy.\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1978
    }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1979
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1980
    print "Maximum HTTP redirections allowed for a URL:  $MAX_REDIRECTS\n",
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1981
          "Maximum number of tries to get a single URL:  $MAX_ATTEMPTS\n\n";
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1982
          
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1983
    print '-' x 70, "\n\n" ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1984
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1985
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1986
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1987
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1988
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1989
# print configuration, in a style more for debugging (mostly from srm.conf)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1990
sub print_debug_config {
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1991
    print <<EOF ;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1992
DOCUMENT_ROOT=    [$DOCUMENT_ROOT]
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1993
USER_DIR=         [$USER_DIR]
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1994
DIRECTORY_INDEX=  [@DIRECTORY_INDEX]
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1995
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1996
CGI_EXTENSIONS=   [@CGI_EXTENSIONS]
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1997
SHTML_EXTENSIONS= [@SHTML_EXTENSIONS]
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1998
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  1999
EOF
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2000
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2001
    foreach (sort keys %ALIAS) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2002
        { print "ALIAS{$_}= [$ALIAS{$_}]\n" }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2003
    foreach (sort keys %ALIAS_MATCH) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2004
        { print "ALIAS_MATCH{$_}= [$ALIAS_MATCH{$_}]\n" }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2005
    foreach (sort keys %SCRIPT_ALIAS) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2006
        { print "SCRIPT_ALIAS{$_}= [$SCRIPT_ALIAS{$_}]\n" }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2007
    foreach (sort keys %SCRIPT_ALIAS_MATCH) 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2008
        { print "SCRIPT_ALIAS_MATCH{$_}= [$SCRIPT_ALIAS_MATCH{$_}]\n" }
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2009
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2010
}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2011
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2012
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2013
#----------------------------------------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2014
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2015
#   SPARSE DOCUMENTATION ON PROGRAM INTERNALS:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2016
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2017
#----------------------------------------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2018
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2019
#   URLs are read directly from the local filesystem if possible, or
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2020
#   downloaded or checked with HTTP GET and HEAD methods if not.  CGI
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2021
#   scripts are called through the HTTP server, even if local (since the
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2022
#   resource their URLs refer to are not files).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2023
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2024
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2025
#   Global variables:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2026
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2027
#     %url holds all data regarding all URLs, using the text URL as the key:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2028
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2029
#     %url{$URL}{qw( URL filename referer depth
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2030
#                    ishtml iscgi islocal dontfollow
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2031
#                    status location numredirects numtries lasttried
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2032
#                    ssierrs
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2033
#                    hasbeenloaded)}  <== only for testing
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2034
#         URL is same as hash key
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2035
#         ishtml means is type text/html; links MAY be extracted.  (Used in
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2036
#           main loop.)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2037
#         iscgi means is a CGI script; MUST NOT be read directly from 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2038
#           filesystem. (Used in verify_url() and load_url().)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2039
#         islocal means served by local server:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2040
#             a) MAY be read directly from filesystem
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2041
#             b) links MAY be extracted
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2042
#         dontfollow means links MUST NOT be extracted.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2043
#         status begins with a number, but may have text afterward.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2044
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2045
#         islocal= (URL=~ m#^http://$LOCAL_HOST/#i) , i.e. redundant.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2046
#         ishtml and iscgi are guesses, may be 0, 1, or '' for unset.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2047
#         iscgi only matters for local URLs, but may safely be set for 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2048
#           remote URLs.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2049
#         dontfollow is a hack to handle an obscure CGI-related situation.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2050
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2051
#     In addition to the standard HTTP status codes, 'status' may take one
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2052
#     of the following values:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2053
#         450 Can't read file: $!
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2054
#         451 SSI Error(s) (__ total)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2055
#         600 Can't create socket: $@
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2056
#         601 Connection timed out
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2057
#         602 Incomplete response (%s of %s bytes)
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2058
#         603 Incomplete chunked response
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2059
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2060
#     Handling of 602 and 603 is somewhat hackish-- overwrites real status
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2061
#       line with artificial one.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2062
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2063
#     $url->{'ssierrs'} is a list of errors relating to SSI includes.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2064
#       Each error has a list of filenames representing the include
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2065
#       chain.  Each element in the chain stores the absolute
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2066
#       pathname, the <!--#include--> tag which called it, and
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2067
#       possibly an error message (usually only on the last file in
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2068
#       the include-chain).  Thus, the entire structure is
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2069
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2070
#         $url->{'ssierrs'}[$errid][$levelsdeep]{qw( path tag errmsg )}
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2071
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2072
#       This is only a memory hog with many errors of deeply nested files.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2073
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2074
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2075
#     @urlstoget: canonicalized list of URLs left to get, in order
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2076
#         of appearance.  List elements are pointers to %url elements.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2077
#         This array may grow from &extract_urls(), or be rearranged when
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2078
#         a URL needs to be retried.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2079
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2080
#   Note that $url is normally a pointer, and $URL is normally a string.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2081
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2082
#   Files/paths are canonicalized by removing unneeded "./" and "../"
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2083
#     segments.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2084
#   Directories are canonicalized to always end in "/", including in URLs;
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2085
#     note that some system variables and local directory variables aren't so.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2086
#   URLs that point to "/" end in "/", e.g. "http://www.foo.com/".
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2087
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2088
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2089
#   This program has a lot of unused functionality for possible extension,
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2090
#   so don't get confused by routines that do more than they are used for.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2091
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2092
#----------------------------------------------------------------------
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2093
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2094
# To do:
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2095
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2096
#   * If you want a different output format (like in HTML), it should 
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2097
#       be easy enough to add.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2098
#   * This could be extended to check remote sites without much trouble.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2099
#   * Should probably support directories that don't have index.html.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2100
#   * Check that pointed-to fragments exist in target files
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2101
#   * keep track of servers and don't hit them too often
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2102
#   * support robots.txt
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2103
#   * add option to read list of URLs from file
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2104
#   * support HTTP Basic Authentication somehow.  Note that currently, this
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2105
#       reads local files even when they should be protected by .htaccess
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2106
#       (unless using -h option).
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2107
#   * Read list of local host names from gethostbyname() ?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2108
#   * check read permissions on files?
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2109
#   * In the event multiple URLs redirect to the same URL, this could be a
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2110
#       little more efficient if it saved the redirected loads/downloads.
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2111
#
fbe2fc30a177 added sunbroy2
haftmann
parents:
diff changeset
  2112
#----------------------------------------------------------------------