16292
|
1 |
#!/usr/local/bin/perl
|
|
2 |
#
|
|
3 |
# Checklinks 1.0.1
|
|
4 |
#
|
|
5 |
# Starting at one or more seed HTML files, recursively check the
|
|
6 |
# validity of all links on the site. Major features:
|
|
7 |
#
|
|
8 |
# * Local URLs are read from the filesystem when possible (much
|
|
9 |
# faster than going through HTTP server).
|
|
10 |
# * Basic server-side includes (aka SSI or SHTML) are checked.
|
|
11 |
# * Latest standards are supported-- HTML 4.0, HTTP 1.1, URIs
|
|
12 |
# according to RFC 2396.
|
|
13 |
# * Links are traversed breadth-first.
|
|
14 |
#
|
|
15 |
# To list command-line options, run "cl -?" or see &usage() below.
|
|
16 |
#
|
|
17 |
# TO CONFIGURE:
|
|
18 |
#
|
|
19 |
# 1) Set $LOCAL_HOST and $DOCUMENT_ROOT, just below. If you don't, the
|
|
20 |
# program will try to guess them in set_needed_globals(), but it's more
|
|
21 |
# reliable if you enter them here.
|
|
22 |
#
|
|
23 |
# 2) If needed, set any further server configuration below-- things like
|
|
24 |
# path aliases and so forth. If you have the srm.conf file, you can
|
|
25 |
# feed it into this script with "-c srm.conf"; otherwise, the default
|
|
26 |
# settings will probably work OK.
|
|
27 |
#
|
|
28 |
# You can set a few parameters with the undocumented "-D <name=value>"
|
|
29 |
# command-line option, e.g. "-D LOCAL_HOST=www.myhost.com".
|
|
30 |
#
|
|
31 |
# Further comments, including an overview of script internals, are at
|
|
32 |
# the end of this file.
|
|
33 |
#
|
|
34 |
# Copyright (C) 1998, 2000 by James Marshall, james@jmarshall.com
|
|
35 |
# see http://www.jmarshall.com/tools/cl/ for more info
|
|
36 |
#
|
|
37 |
#
|
|
38 |
# CHANGES IN 1.0.1:
|
|
39 |
#
|
|
40 |
# This is just a bug fix release. Fixes include:
|
|
41 |
# . Aliases are handled correctly now. Sorry 'bout that.
|
|
42 |
# . A redirect + relative URL no longer results in infinitely
|
|
43 |
# recursing URLs.
|
|
44 |
# . More HTML tags are searched for links.
|
|
45 |
# . Non-HTML files are no longer searched for links.
|
|
46 |
# . There were other minor bug fixes.
|
|
47 |
#
|
|
48 |
#----------------------------------------------------------------------
|
|
49 |
|
|
50 |
#use strict ;
|
|
51 |
|
|
52 |
# File-scoped lexicals, grouped by role.  Declared up front because the
# script runs under "use strict 'vars'" below.
my(
    # server configuration (hard-coded above or read from srm.conf via -c)
    $LOCAL_HOST, $DOCUMENT_ROOT, $USER_DIR, @DIRECTORY_INDEX,
    %ALIAS, %ALIAS_MATCH, %SCRIPT_ALIAS, %SCRIPT_ALIAS_MATCH, %UN_ALIAS,
    @SHTML_EXTENSIONS, @CGI_EXTENSIONS, @INCLUDE_PATTERNS, @EXCLUDE_PATTERNS,
    @INCLUDE_STATUS, @EXCLUDE_STATUS,

    # command-line flags and tunable limits
    $verbose_report, $max_depth, $file_check, $full_http_check,
    $MAX_REDIRECTS, $MAX_ATTEMPTS, $HTML_BY_NAME, $SUPPORT_NCSA_BUG,

    # derived from the environment/filesystem at startup
    @NO_PROXY, $DOC_ROOT_DEV, $DOC_ROOT_INODE, $DOC_ROOT_EXISTS, $CWD,

    # static lookup tables (tag->URL-attribute lists, entity decoding)
    %html_urls, %non_html_urls, %e_to_ch,

    # ~user home-directory caches
    %home_dir, %dir_to_user, %inode_to_user,

    # main data structures: all known URLs, and the fetch queue
    %url, @urlstoget,

    $debug, $CL_VERSION,
) ;
|
|
67 |
|
|
68 |
|
|
69 |
#----- User Configuration ---------------------------------------------

# Hostname of the local Web server ('localhost' or a real name).  URLs at
# this host are treated as local and traversed into; URLs elsewhere are
# only verified.  A remote host here will not work, and note that
# 'localhost' doesn't necessarily point to your local Web server.
# $LOCAL_HOST= 'localhost' ;
# $LOCAL_HOST= 'www.example.com' ;
$LOCAL_HOST='isabelle.in.tum.de';

# Root Web directory, i.e. the directory the server maps
# "http://$LOCAL_HOST" to (the DocumentRoot entry in srm.conf; see the -c
# option).  If you don't know it but don't need it -- e.g. you only check
# URLs whose path starts with ~ -- put a non-existent path here rather
# than leave it blank (a hack).
# $DOCUMENT_ROOT= '/home/www/htdocs' ;
$DOCUMENT_ROOT='/home/proj/isabelle';

#----- variables equivalent to srm.conf entries

# These mirror the equivalent entries in srm.conf, etc.; the -c
# <config-file> option reads the values from the file directly instead.

# $USER_DIR= 'public_html' ;
$USER_DIR='.html-data';
@DIRECTORY_INDEX= qw( index.html index.cgi index.shtml ) ;

# Used in &url_to_filename(), and possibly elsewhere.
# ALIAS_MATCH and SCRIPT_ALIAS_MATCH use Perl (not standard) regexps.
# If the order of multiple e.g. "Alias" directives matters, this may not work.
%ALIAS= () ;
%ALIAS_MATCH= () ;
%SCRIPT_ALIAS= () ;
%SCRIPT_ALIAS_MATCH= () ;

# File extensions interpreted as CGI scripts or server-parsed HTML.
# These are not single srm.conf settings, but combinations of AddHandler
# directives and possibly AddType directives.
@CGI_EXTENSIONS= qw( .cgi ) ;
@SHTML_EXTENSIONS= qw( .shtml ) ;

#----- end of variables equivalent to srm.conf entries

# Only include URLs that match at least one of these patterns; as a
# special case, an empty list includes all URLs (subject to
# @EXCLUDE_PATTERNS).  Extended or cleared with the -I option.
@INCLUDE_PATTERNS= () ;

# Exclude URLs matching any of these patterns, e.g. '\?' skips all URLs
# that query.  Extended or cleared with the -X option.
# @EXCLUDE_PATTERNS= qw( \? ) ;

# Only report URLs whose status codes start with one of these patterns;
# an empty list reports all URLs (subject to @EXCLUDE_STATUS).
# Extended or cleared with the -i option.
@INCLUDE_STATUS= () ;

# Don't report URLs whose status codes start with these patterns
# (default: 200, i.e. don't report successes).
# Extended or cleared with the -x option.
@EXCLUDE_STATUS= qw( 200 ) ;

# For 302 or 303 HTTP redirection, redirect no more than this many times.
$MAX_REDIRECTS= 5 ;

# If a connection times out, etc., attempt no more than this many times.
$MAX_ATTEMPTS= 5 ;

# If set, decide whether a file is HTML by its extension; if unset, use
# the -T (text file) test as the old version did -- which also traversed
# HTML-like links in source code, .txt files, etc.
$HTML_BY_NAME= 1 ;

# Accept malformed status lines like "HTTP 200 OK" from some old NCSA
# servers (including 1.5.2) that omit the HTTP version.
$SUPPORT_NCSA_BUG= 1 ;


#----- DO NOT CHANGE ANYTHING BELOW THIS LINE, unless you want to... ---

#----- Further Global Variable Initialization --------------------------

$CL_VERSION= '1.0.1' ;

# Honor either capitalization of the proxy environment variables.
$ENV{'http_proxy'}||= $ENV{'HTTP_PROXY'} ;
@NO_PROXY= split(/[\s,]+/, $ENV{'no_proxy'} || $ENV{'NO_PROXY'} ) ;


# Autoflush output; matters when output isn't going straight to a terminal.
$|= 1 ;

#----- End of Configuration --------------------------------------------
|
|
173 |
|
|
174 |
use strict 'vars' ;
use IO::Socket ;

&usage unless @ARGV ;

# Process command-line options.
&getopts ;

# Make any final needed adjustments to globals, after the hard-coded
# values above and any options have been processed.
&adjust_all_globals ;

# Seed the queue from the remaining arguments.  &add_url() sets $url{$_}
# and pushes onto @urlstoget, only if not already added.  A filename is
# only accepted if it can be accessed through a URL on this server.
foreach my $seed (@ARGV) {
    if ($seed=~ m#^http://#i) {
        &add_url($seed, '-', 0) ;
    } else {
        my($seed_url)= &filename_to_url($seed, $CWD) ;
        die "ERROR: $seed is not accessible through the Web server.\n"
            unless defined($seed_url) ;
        &add_url($seed_url, '-', 0) ;
    }
}


# Check the URLs breadth-first; @urlstoget may grow and rearrange.
while (@urlstoget) {
    my($url)= shift(@urlstoget) ;

    # Anything we won't traverse into just gets verified (HEAD request or
    # file test).  &verify_url() may discover the URL is HTML after all.
    if ( !$url->{'ishtml'} or !$url->{'islocal'} or $url->{'dontfollow'}
         or (length($max_depth) and $url->{'depth'} > $max_depth ) ) {
        &verify_url($url) ;            # may set ishtml=true
    }

    # Local HTML within the depth limit is fully loaded and scanned for
    # further links.  &load_url() may discover the URL is NOT HTML.
    if ( $url->{'ishtml'} and $url->{'islocal'} and !$url->{'dontfollow'}
         and (!length($max_depth) or $url->{'depth'} <= $max_depth ) ) {
        my($HTML)= &load_url($url) ;   # may set ishtml=false
        # 11-30-99 JSM: resolve relative URLs in redirected pages against
        # the redirect target, not the original URL.
        my($base_url)= $url->{'location'} || $url->{'URL'} ;
        &extract_urls($HTML, $base_url, $url->{'URL'}, $url->{'depth'}+1)
            if $url->{'ishtml'} ;      # big, calls &add_url()
    }

    # Requeue error responses another attempt may correct: 408, 503, 504,
    # and the homegrown codes 600, 601, 602, and 603.
    if ($url->{'status'}=~ /^(408|503|504|600|601|602|603)\b/ ) {
        push(@urlstoget, $url) if ( $url->{'numtries'} < $MAX_ATTEMPTS ) ;
    }

}

&make_report() ;

exit ;
|
|
235 |
|
|
236 |
|
|
237 |
|
|
238 |
#----- Process command-line options -----------------------------------
|
|
239 |
|
|
240 |
# Process any command-line options.
|
|
241 |
# Process any command-line options.  Consumes leading "-..." arguments
# from @ARGV, leaving the seed URLs/filenames behind; stops at the first
# non-option argument or at "--".  Sets the file-scoped option globals.
sub getopts {
    my($opt, $param) ;

    # Guard on @ARGV added: an option-only command line used to test a
    # regex against undef $ARGV[0] after the last shift.
    while (@ARGV and $ARGV[0]=~ /^-/) {
        $opt= shift(@ARGV) ;
        ($opt, $param)= $opt=~ /^-(.)(.*)/ ;

        # Turn on verbose reporting ("-v-" turns it off)
        if ($opt eq 'v') {
            $verbose_report= ($param ne '-') ;

        # User-specified patterns to include ('' to clear list)
        } elsif ($opt eq 'I') {
            $param= shift(@ARGV) unless length($param) ;
            if (length($param)) { push(@INCLUDE_PATTERNS, $param) }
            else                { @INCLUDE_PATTERNS= () }

        # User-specified patterns to exclude ('' to clear list)
        } elsif ($opt eq 'X') {
            $param= shift(@ARGV) unless length($param) ;
            if (length($param)) { push(@EXCLUDE_PATTERNS, $param) }
            else                { @EXCLUDE_PATTERNS= () }

        # User-specified response codes to report ('' to clear list)
        } elsif ($opt eq 'i') {
            $param= shift(@ARGV) unless length($param) ;
            if (length($param)) { push(@INCLUDE_STATUS, $param) }
            else                { @INCLUDE_STATUS= () }

        # User-specified response codes to ignore ('' to clear list)
        } elsif ($opt eq 'x') {
            $param= shift(@ARGV) unless length($param) ;
            if (length($param)) { push(@EXCLUDE_STATUS, $param) }
            else                { @EXCLUDE_STATUS= () }

        # Maximum traversal depth
        } elsif ($opt eq 'd') {
            $param= shift(@ARGV) unless length($param) ;
            $max_depth= $param ;

        # Make it a "file check"-- only read local files, do not use HTTP
        } elsif ($opt eq 'f') {
            $file_check= ($param ne '-') ;

        # Use HTTP for all URL's, even local files
        } elsif ($opt eq 'h') {
            $full_http_check= ($param ne '-') ;

        # Read configuration parameters from srm.conf-like file
        } elsif ($opt eq 'c') {
            $param= shift(@ARGV) unless length($param) ;
            &read_srm_conf($param) ;

        # Print current configuration parameters
        } elsif ($opt eq 'q') {
            &print_config ;
            exit ;   # jsm-- should we exit?

        # Allow certain parameters to be defined via the command line
        } elsif ($opt eq 'D') {
            $param= shift(@ARGV) unless length($param) ;
            # A bare "-D" followed by another option just enables
            # debugging; put the swallowed argument back and continue.
            $debug=1, unshift(@ARGV,$param), next if $param=~ /^-/ ;
            my($name,$value)= split(/=/, $param, 2) ;
            $value= 1 unless length($value) ;
            # Only whitelisted names may be set.  The string eval is
            # needed because a symbolic ref ($$name) can't reach the
            # file-scoped lexicals declared with my() above.
            if ($name=~ /^(LOCAL_HOST|DOCUMENT_ROOT|USER_DIR|DEBUG|debug)$/) {
                eval "\$$name= \$value" ;
            }

        } elsif ($opt eq '?') {
            &usage ;

        # End command-line option processing on "--"
        } elsif ($opt eq '-') {
            return ;

        } else {
            print STDERR
                "Illegal option-- '$opt'. Enter \"$0 -?\" for help.\n" ;
            exit ;
        }

    }

    # -f (filesystem only) and -h (HTTP only) contradict each other.
    if ($file_check and $full_http_check) {
        print STDERR "You cannot use both the -f and the -h options.\n" ;
        exit ;
    }

}
|
|
330 |
|
|
331 |
|
|
332 |
# Read appropriate values from the given file, typically srm.conf. If a
|
|
333 |
# directory is named, default to filename "srm.conf".
|
|
334 |
# Note that opening "-" will open STDIN.
|
|
335 |
# Read appropriate values from the given file, typically srm.conf.  If a
# directory is named, default to the filename "srm.conf" inside it; a
# filename of "-" reads STDIN.
# Clears and repopulates the srm.conf-equivalent globals:
#   $DOCUMENT_ROOT, $USER_DIR, @DIRECTORY_INDEX, %ALIAS, %ALIAS_MATCH,
#   %SCRIPT_ALIAS, %SCRIPT_ALIAS_MATCH, @CGI_EXTENSIONS, @SHTML_EXTENSIONS.
# Dies if the file can't be opened.
sub read_srm_conf {
    my($fname)= @_ ;

    # default to srm.conf if only a directory is named
    if (-d $fname) {
        $fname=~ s#/$## ;
        $fname.= "/srm.conf" ;
    }

    # Clear old values
    $DOCUMENT_ROOT= $USER_DIR= '' ;
    @DIRECTORY_INDEX= @CGI_EXTENSIONS= @SHTML_EXTENSIONS= () ;
    %ALIAS= %ALIAS_MATCH= %SCRIPT_ALIAS= %SCRIPT_ALIAS_MATCH= () ;

    # Three-arg open with a lexical handle.  The old two-arg
    # open(SRM, "<$fname") would honor mode characters embedded in
    # $fname; "-" still means STDIN in the three-arg form.
    open(my $srm, '<', $fname) || die "Can't open $fname: $!" ;
    while (<$srm>) {
        s/#.*// ;              # strip comments
        next unless /\S/ ;     # skip blank lines
        my($name, @param)= /(\S+)/g ;   # directive name + its arguments

        if ($name eq 'DocumentRoot') {
            $DOCUMENT_ROOT= $param[0] ;

        } elsif ($name eq 'UserDir') {
            $USER_DIR= $param[0] ;

        } elsif ($name eq 'DirectoryIndex') {
            @DIRECTORY_INDEX= @param ;

        } elsif ($name eq 'Alias') {
            $ALIAS{$param[0]}= $param[1] ;

        } elsif ($name eq 'AliasMatch') {
            $ALIAS_MATCH{$param[0]}= $param[1] ;

        } elsif ($name eq 'ScriptAlias') {
            $SCRIPT_ALIAS{$param[0]}= $param[1] ;

        } elsif ($name eq 'ScriptAliasMatch') {
            $SCRIPT_ALIAS_MATCH{$param[0]}= $param[1] ;

        # AddHandler maps an extension to a handler; we only care about
        # CGI scripts and server-parsed (SSI) HTML.
        } elsif ($name eq 'AddHandler') {
            if ($param[0] eq 'cgi-script') {
                push(@CGI_EXTENSIONS, $param[1]) ;
            } elsif ($param[0] eq 'server-parsed') {
                push(@SHTML_EXTENSIONS, $param[1]) ;
            }
        }

    }
    close($srm) ;
}
|
|
388 |
|
|
389 |
|
|
390 |
# Make any final settings to global variables, after the hard-coded values
|
|
391 |
# and command-line options have been processed.
|
|
392 |
# Most non-user-configurable globals are also set here.
|
|
393 |
# Make any final settings to global variables, after the hard-coded values
# and command-line options have been processed.  Most non-user-configurable
# globals are also initialized here.
sub adjust_all_globals {

    # Standardize $USER_DIR to never have a trailing slash.
    $USER_DIR=~ s#/$## ;

    # Without a configured $LOCAL_HOST, fall back to the host of the
    # first URL on the command line, or the string 'localhost' if that
    # URL contains no hostname.
    unless (length($LOCAL_HOST)) {
        $LOCAL_HOST= (&parse_url($ARGV[0]))[1] || 'localhost' ;
        print STDERR "LOCAL_HOST set to \"\L$LOCAL_HOST\E\"\n" ;
    }
    $LOCAL_HOST= lc($LOCAL_HOST) ;

    # Without a configured $DOCUMENT_ROOT, guess "$HOME/$USER_DIR",
    # locating the home directory via $HOME or the current username.
    unless (length($DOCUMENT_ROOT)) {
        my($home) ;
        unless ($home= $ENV{'HOME'}) {
            my($uname)= getpwuid($<) || $ENV{'USER'} || `whoami` || `id -un` ;
            chomp($uname) ;
            &read_home_dirs unless %home_dir ;   # only read when needed
            $home= $home_dir{$uname} ;
        }
        $DOCUMENT_ROOT= "$home/$USER_DIR" ;

        die "Could not determine DOCUMENT_ROOT; edit the $0 script to set it.\n"
            unless (-d $DOCUMENT_ROOT) ;

        print STDERR "DOCUMENT_ROOT set to \"$DOCUMENT_ROOT\"\n" ;
    }
    $DOCUMENT_ROOT=~ s#/$## ;

    # Reverse alias map, letting &filename_to_url() unalias as best as
    # possible.  (Use of &filename_to_url() can be avoided by the user;
    # see the note in that routine.)
    %UN_ALIAS= (reverse (%ALIAS, %SCRIPT_ALIAS) ) ;

    # Remembered for equivalence comparisons later, in &filename_to_url().
    ($DOC_ROOT_DEV, $DOC_ROOT_INODE)= stat("$DOCUMENT_ROOT/.") ;

    $DOC_ROOT_EXISTS= -e _ ;

    # Current working directory, from the shell variable if set, else `pwd`.
    $CWD= $ENV{'PWD'} || `pwd` || die "couldn't run pwd: $!" ;
    chomp($CWD) ;


    # Tag/attribute tables used by &extract_urls().  This is a complete
    # list of URL-type attributes defined in HTML 4.0, plus any others
    # found elsewhere (nonstandard or from earlier HTML); only a few were
    # commonly used as of early 1998.  The set in %html_urls could
    # possibly link to HTML resources, while the set in %non_html_urls
    # could not.  The %special(.*) sets, kept below for reference only,
    # are URL attributes that would require special handling.

    %html_urls= ( 'a'          => [ 'href' ],
                  'area'       => [ 'href' ],
                  'frame'      => [ 'src', 'longdesc' ],
                  'link'       => [ 'href', 'urn' ],
                  'img'        => [ 'longdesc', 'usemap' ],
                  'q'          => [ 'cite' ],
                  'blockquote' => [ 'cite' ],
                  'ins'        => [ 'cite' ],
                  'del'        => [ 'cite' ],
                  'object'     => [ 'usemap' ],
                  'input'      => [ 'usemap' ],
                  'iframe'     => [ 'src', 'longdesc' ],
                  'ilayer'     => [ 'src' ],
                  'layer'      => [ 'src' ],
                  'fig'        => [ 'imagemap' ],
                  'overlay'    => [ 'imagemap' ],
                  'meta'       => [ 'url' ],
                  'note'       => [ 'src' ],
                ) ;

    %non_html_urls= ( 'body'    => [ 'background' ],
                      'img'     => [ 'src', 'lowsrc', 'dynsrc' ],
                      'input'   => [ 'src' ],
                      'script'  => [ 'src', 'for' ],

                      'fig'     => [ 'src' ],
                      'overlay' => [ 'src' ],
                      'select'  => [ 'src' ],
                      'ul'      => [ 'src' ],
                      'h1'      => [ 'src' ],
                      'h2'      => [ 'src' ],
                      'h3'      => [ 'src' ],
                      'h4'      => [ 'src' ],
                      'h5'      => [ 'src' ],
                      'h6'      => [ 'src' ],
                      'hr'      => [ 'src' ],
                      'table'   => [ 'src' ],
                      'td'      => [ 'src' ],
                      'th'      => [ 'src' ],
                      'tr'      => [ 'src' ],

                      'bgsound' => [ 'src' ],
                      'embed'   => [ 'src' ],
                    ) ;

    # %special_urls= ( 'base' => [ 'href' ] ) ;
    #
    # %special_html_urls= ( 'object' => [ 'codebase', 'data' ] ) ;
    #
    # %special_non_html_urls=
    #     ( 'head'   => [ 'profile' ],
    #       'object' => [ 'codebase', 'archive', 'classid' ],
    #       'applet' => [ 'codebase', 'code', 'object', 'archive' ],
    #       'form'   => [ 'action', 'script' ]
    #     ) ;


    # Entity-reference-to-character map used in &HTMLunescape().  This
    # simplified version only supports quot/lt/gt/amp, which should be
    # enough for URL-type attributes.  See
    # http://www.w3.org/TR/REC-html40/sgml/entities.html for the full
    # entity list.
    %e_to_ch= ( 'quot' => '"',
                'lt'   => '<',
                'gt'   => '>',
                'amp'  => '&' ) ;

}
|
|
517 |
|
|
518 |
|
|
519 |
#----------------------------------------------------------------------
|
|
520 |
|
|
521 |
# Add the URL to our data structures; specifically, to %url and @urlstoget.
|
|
522 |
# Returns a pointer to the structure in %url, or undef if already defined
|
|
523 |
# or on error.
|
|
524 |
# Currently, this always receives the URL with the host name lowercase,
|
|
525 |
# either from &absolute_url() or from using $LOCAL_HOST.
|
|
526 |
# Record a URL in our data structures -- %url and @urlstoget -- and return
# a reference to the new record, or undef if the URL was already known,
# filtered out, or otherwise unusable.
# Currently, callers always supply the URL with the host name lowercased,
# either via &absolute_url() or by using $LOCAL_HOST.
sub add_url {
    my($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow)= @_ ;

    # User-level filtering: the URL must match some @INCLUDE_PATTERNS
    # entry (only enforced when that list is non-empty) and must match no
    # @EXCLUDE_PATTERNS entry.
    return undef if @INCLUDE_PATTERNS &&
                    !grep( $URL=~ /$_/, @INCLUDE_PATTERNS ) ;
    return undef if grep( $URL=~ /$_/, @EXCLUDE_PATTERNS ) ;

    # Canonicalize the URL so the same page isn't fetched multiple times.
    $URL= &canonicalize($URL) ;

    # for obscure case involving a <form action=___.cgi>-extracted URL being
    # overwritten by <a href=___.cgi> extraction (don't fret over this)
    $url{$URL}{'dontfollow'}&&= $dontfollow if $url{$URL} ;

    # Never add a record a second time, or we'd traverse infinitely.
    return undef if $url{$URL} ; # or add to @referers, for 301 correction...?

    # Only HTTP URLs are currently supported.
    return undef unless $URL=~ /^http:/i ;

    # Any self-referral here indicates a bug in the program.  It's happened.
    die "PROGRAM ERROR: $URL shows its first referer as itself.\n"
        if $referer eq $URL ;

    my(%rec) ;
    @rec{qw(URL referer depth ishtml iscgi dontfollow)}=
        ($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow) ;
    $rec{'islocal'}= ($URL=~ m#^http://\Q$LOCAL_HOST\E/#io) + 0 ; # make length>0
    if ($rec{'islocal'}) {
        @rec{'filename', 'location'}= &url_to_filename($URL) ;
        $rec{'iscgi'}= &is_cgi($rec{'filename'}, $URL) if $rec{'iscgi'} eq '' ;

        # 2-27-00 JSM: Detect ishtml by filename, not -T test.
        if ( $rec{'ishtml'} eq '' ) {
            $rec{'ishtml'}= $HTML_BY_NAME
                ? ( !$rec{'iscgi'} && -e $rec{'filename'} &&
                    $rec{'filename'}=~ /\.html?$/i ) + 0
                : (!$rec{'iscgi'} && -e $rec{'filename'} && -T _) + 0 ;
        }

    }

    # In a pure file check, skip any URL that would require HTTP.
    return undef if ($file_check and (!$rec{'islocal'} or $rec{'iscgi'}) ) ;

    push(@urlstoget, \%rec) ;
    $url{$URL}= \%rec ;   # also serves as the return value
}
|
|
582 |
|
|
583 |
|
|
584 |
# Guess if a file is a CGI script or not. Returns true if the (regular) file
|
|
585 |
# is executable, has one of @CGI_EXTENSIONS, or if the URL is in a
|
|
586 |
# ScriptAlias'ed directory.
|
|
587 |
# $fname must be absolute path, but $URL is optional (saves time if available).
|
|
588 |
# Note that URLs like "/path/script.cgi?a=b" are handled correctly-- the
|
|
589 |
# previously extracted filename is tested for CGI-ness, while the URL is
|
|
590 |
# checked for ScriptAlias matching (which is unaffected by final query
|
|
591 |
# strings or PATH_INFO).
|
|
592 |
# Guess whether a file is a CGI script.  Returns 1 if the (regular) file
# is executable, carries one of @CGI_EXTENSIONS, or if its URL falls in a
# ScriptAlias'ed directory; 0 otherwise.
# $fname must be an absolute path; $URL is optional (saves time if the
# caller already has it).
# URLs like "/path/script.cgi?a=b" are handled correctly: the extension
# test runs on the previously extracted filename, while the ScriptAlias
# test runs on the URL path, which is unaffected by query strings or
# PATH_INFO.
sub is_cgi {
    my($fname, $URL)= @_ ;

    return 1 if (-x $fname && ! -d _ ) ;   # should we really do this?
    for my $ext (@CGI_EXTENSIONS) {
        return 1 if $fname=~ /\Q$ext\E$/i ;
    }

    $URL= &filename_to_url($fname) unless length($URL) ;   # currently unused
    my($URLpath)= $URL=~ m#^http://[^/]*(.*)#i ;
    for my $prefix (keys %SCRIPT_ALIAS) {
        return 1 if $URLpath=~ /^\Q$prefix\E/ ;
    }
    for my $pattern (keys %SCRIPT_ALIAS_MATCH) {
        return 1 if $URLpath=~ /^$pattern/ ;
    }

    return 0 ;
}
|
|
604 |
|
|
605 |
|
|
606 |
# Put the URL in such a form that two URLs that point to the same resource
|
|
607 |
# have the same URL, to avoid superfluous retrievals.
|
|
608 |
# Host name is lowercased elsewhere-- this routine is only called from
|
|
609 |
# &add_url; see note there. To lowercase the host name here would be
|
|
610 |
# inefficient.
|
|
611 |
# Put the URL in a form where two URLs pointing at the same resource
# compare equal, to avoid superfluous retrievals.
# The host name is lowercased elsewhere -- this routine is only called
# from &add_url (see note there), so lowercasing it again here would be
# inefficient.
sub canonicalize {
    my($URL)= @_ ;

    $URL=~ s/#.*// ;   # drop any "#" fragment from the end of the URL

    return $URL ;
}
|
|
618 |
|
|
619 |
|
|
620 |
#----- File reading/downloading routines (includes networking) --------
|
|
621 |
|
|
622 |
# Verify that a URL exists, and set $url->{'status'} accordingly. Do
|
|
623 |
# this either by checking the local filesystem or by using the HTTP HEAD
|
|
624 |
# method for remote sites or CGI scripts.
|
|
625 |
# Set $url->{'ishtml'} accordingly if discovered from Content-Type:.
|
|
626 |
# This does not support various Redirect directives in srm.conf.
|
|
627 |
# Verify that a URL exists and set $url->{'status'} accordingly -- via the
# local filesystem when possible, otherwise via an HTTP HEAD request
# (remote sites, CGI scripts, or when $full_http_check is set).
# May set $url->{'ishtml'} if discovered from Content-Type:.
# This does not support various Redirect directives in srm.conf.
#
# NOTE: When a CGI script named in a <form action> (which implies
# dontfollow is set) is checked over HTTP, the response code may not be
# accurate, because no form data accompanies the request as it normally
# would.  In that case a cautionary note is appended to $url->{'status'},
# and an empty status is replaced with an explanatory note (maybe we
# should do that in &load_url() too?).
sub verify_url {
    my($url)= @_ ;

    print STDERR "verifying $url->{'URL'}\n" if $debug ;

    # Remote URL, or a forced full HTTP check: use HEAD.
    if (!$url->{'islocal'} or $full_http_check) {
        &load_url_using_HTTP($url, 'HEAD') ;
        $url->{'status'}= '[no status returned]'
            unless length($url->{'status'}) ;
        $url->{'status'}.= ' (NOTE: Form was not submitted normally)'
            if $url->{'dontfollow'} ;

    # Local non-CGI: a plain file-existence check suffices.
    } elsif (!$url->{'iscgi'}) {
        $url->{'status'}= (-e $url->{'filename'})
            ? "200 Local File Exists" : "404 File Not Found" ;

    # Local CGI that we're allowed to invoke: use HEAD.
    } elsif (!$url->{'dontfollow'}) {
        &load_url_using_HTTP($url, 'HEAD') ;

    # Local CGI with dontfollow set: just check for an executable file.
    } else {
        $url->{'status'}=
              (! -e $url->{'filename'}) ? "404 File Not Found"
            : (! -x $url->{'filename'}) ? "403 Local File Is Not Executable"
            :                             "200 Local Executable File Exists" ;
    }

}
|
|
688 |
|
|
689 |
|
|
690 |
|
|
691 |
# Load entire file/resource and return its contents, setting $url->{'status'}
|
|
692 |
# accordingly. Do this either by checking the local filesystem or by
|
|
693 |
# using the HTTP GET method for remote sites or CGI scripts.
|
|
694 |
# Set $url->{'ishtml'} accordingly if discovered from Content-Type:.
|
|
695 |
# This does not support various Redirect directives in srm.conf.
|
|
696 |
# Load an entire file/resource and return its contents, setting
# $url->{'status'} accordingly -- from the local filesystem when possible,
# otherwise (remote sites, CGI scripts, $full_http_check) via HTTP GET.
# May set $url->{'ishtml'} if discovered from Content-Type:.
# This does not support various Redirect directives in srm.conf.
sub load_url {
    my($url)= @_ ;
    my($HTML) ;

    print STDERR "loading $url->{'URL'}\n" if $debug ;

    # Local non-CGI file: read it (with SSI expansion) from the filesystem.
    if ($url->{'islocal'} and !$url->{'iscgi'} and !$full_http_check) {
        my($needs_cgi) ;
        ($HTML, $url->{'ssierrs'}, $needs_cgi)=
            &read_expanded_file($url->{'filename'}, $url->{'URL'}) ;
        $url->{'status'}=
              !defined($HTML)
                  ? sprintf("450 Can't read file: %s (%s)", $!, $!+0)
            : @{$url->{'ssierrs'}}
                  ? sprintf("451 SSI Error(s) (%s total)",
                            scalar @{$url->{'ssierrs'}})
            :       "200 Local File Read OK" ;

        # An SHTML file that included CGI calls flips iscgi on, so the URL
        # is refetched over HTTP below -- except during a pure file check,
        # where we keep whatever $HTML we could get.
        $url->{'iscgi'}= $needs_cgi unless $file_check ;
    }

    # Remote, CGI, or rereckoned above: download from the HTTP server.
    if (!$url->{'islocal'} or $url->{'iscgi'} or $full_http_check) {
        (undef, undef, $HTML)= &load_url_using_HTTP($url, 'GET') ;
    }

    # Note: set even when the URL is to be reloaded, e.g. after a 601
    # (timeout) response.
    $url->{'hasbeenloaded'}= 1 ;
    return $HTML ;
}
|
|
732 |
|
|
733 |
|
|
734 |
# Read a local file and return its contents. If a file is SSI (aka SHTML),
|
|
735 |
# expand any SSI <!--#include--> directives as needed, recursively
|
|
736 |
# including nested files.
|
|
737 |
# This is used for all local reads, SHTML or not, but the vast bulk of this
|
|
738 |
# routine is for SHTML files.
|
|
739 |
#
|
|
740 |
# If file is SHTML, this routine also returns a structure of error data,
|
|
741 |
# and a boolean saying if this file needs to be downloaded via HTTP
|
|
742 |
# for a complete check (e.g. includes CGI calls).
|
|
743 |
#
|
|
744 |
# $fname must be canonicalized absolute path, but $URL parameter is optional.
|
|
745 |
# %$parents contains all "include"-ancestors of the file, to prevent loops.
|
|
746 |
# If omitted, assumes no ancestors (and a fresh hash is started).
|
|
747 |
#
|
|
748 |
# This routine seems much bigger and more complex than it needs to be.
|
|
749 |
# It could be one third the size and much simpler if we didn't have to
|
|
750 |
# worry about full error reporting on nested includes.
|
|
751 |
#
|
|
752 |
# Note: This routine was made to mimic what Apache would return to a client.
|
|
753 |
# However, the result differs from Apache's in two slight ways, both
|
|
754 |
# involving nested SSI within <!--#include file="..." -->, and both
|
|
755 |
# apparent bugs in Apache 1.1 (may be fixed in later versions):
|
|
756 |
#
|
|
757 |
# 1) If a <file="..."> value contains no "/" (i.e. in current directory),
|
|
758 |
# then Apache always parses the included file as SHTML, regardless of
|
|
759 |
# extension. This routine checks @SHTML_EXTENSIONS for all included
|
|
760 |
# files.
|
|
761 |
# 2) If a <file="..."> value containing a "/" loads an SHTML file
|
|
762 |
# containing a <virtual="..."> tag with a relative path, the directive
|
|
763 |
# fails in Apache. This routine tries to guess the correct path/URL.
|
|
764 |
#
|
|
765 |
#
|
|
766 |
# Notes on this routine, and SHTML files in general:
|
|
767 |
#
|
|
768 |
# At first thought, it seems like we could load each included file
|
|
769 |
# only once, instead of once for every file that includes it.
|
|
770 |
# However, because of the fact that relative URLs are resolved
|
|
771 |
# relative to the top-level including file, the top-level file will
|
|
772 |
# need to be expanded every time. (It's legal (though of questionable
|
|
773 |
# wisdom) to include a file from e.g. both /a/index.shtml and
|
|
774 |
# /b/index.shtml, so links from the included file point to different
|
|
775 |
# URLs.)
|
|
776 |
#
|
|
777 |
# Note that while URLs in included files (e.g. <a href="...">) are
|
|
778 |
# resolved relative to the top-level including file, nested include tags
|
|
779 |
# are resolved relative to the direct includer.
|
|
780 |
#
|
|
781 |
# We could possibly be more efficient in time (but costly in memory)
|
|
782 |
# by storing the expanded contents and $errlist of each included file,
|
|
783 |
# since those will be constant (except $errlist's include-loop
|
|
784 |
# reporting might vary somewhat). There are probably other ways to
|
|
785 |
# eke out savings of time and memory, at the cost of complexity.
|
|
786 |
#
|
|
787 |
# The main loop here is inside of an s/// statement. Unusual, but it's an
|
|
788 |
# appropriate way to handle the recursion. Recursion is needed, since each
|
|
789 |
# included file may or may not be SHTML.
|
|
790 |
#
|
|
791 |
# $iscgi is set if a file includes "<!--#exec", or if it contains an
|
|
792 |
# <!--#include virtual="..." --> tag that points to a CGI file, or if
|
|
793 |
# any of its include-children sets $iscgi.
|
|
794 |
#
|
|
795 |
#
|
|
796 |
# Notes to help clarify data structures, if (God forbid) you have to modify
|
|
797 |
# this routine:
|
|
798 |
#
|
|
799 |
# Each error is a list of files in an "include chain", and $errlist is a
|
|
800 |
# list of errors. $errlist is associated with the current $HTML. Each
|
|
801 |
# error in $errlist is associated with some tag in $HTML, as iterated in
|
|
802 |
# the s/// loop. When this routine returns ($HTML, $errlist), the
|
|
803 |
# errors in $errlist should all have as their first element tags that
|
|
804 |
# were found in $HTML.
|
|
805 |
#
|
|
806 |
# Each error's final element (the "leaf") is associated with a tag that
|
|
807 |
# tried to load an invalid file. Each leaf will always be the complete
|
|
808 |
# set of errors for that tag (i.e. it has no children, since it couldn't
|
|
809 |
# load the file).
|
|
810 |
#
|
|
811 |
# If the file can be validly loaded, then we may have 0 or multiple errors
|
|
812 |
# associated with this file/tag (and returned from this routine). Each
|
|
813 |
# file's $errlist is an accumulation of all of its children's $errlist's.
|
|
814 |
#
|
|
815 |
# Errors that come from loading a child are associated with the child's
|
|
816 |
# $HTML and tags. Before adding them to the parent's (current) $errlist,
|
|
817 |
# they must have the CHILD's path/tag unshifted onto the front of the
|
|
818 |
# include chain; all errors are then added to the current $errlist.
|
|
819 |
# This ensures that:
|
|
820 |
# a) The errors are now all associated with the tag that loaded the child.
|
|
821 |
# b) The errors in the current $errlist are always associated with tags
|
|
822 |
# from the current $HTML.
|
|
823 |
#
|
|
824 |
|
|
825 |
# Read the local file $fname and return ($HTML, $errlist, $iscgi):
#   $HTML    -- file contents, with SSI <!--#include--> tags recursively
#               expanded if the filename matches @SHTML_EXTENSIONS;
#               undef if the file can't be read.
#   $errlist -- ref to a list of errors (each an "include chain" of
#               {path, tag, errmsg} hashes); undef for non-SHTML files.
#   $iscgi   -- true if this file or any include-descendant contains
#               <!--#exec--> or includes a CGI via virtual="...".
# $fname must be a canonicalized absolute path; $URL is optional and is
# computed from $fname only when needed.  %$parents holds all
# "include"-ancestors of this file, to detect include loops.
# See the long comment block above this routine for the full data-structure
# and error-propagation details.
sub read_expanded_file {
    my($fname, $URL, $parents)= @_ ;
    my($HTML, $errlist, $iscgi) ;
    my($isshtml) ;

    # Register this file as an ancestor for the duration of the call,
    # so nested includes of it are caught as loops.
    $parents->{$fname}= 1 ;

    $HTML= &read_file($fname) ;
    return(undef) unless defined($HTML) ;

    # SSI expansion only applies to files with an SHTML extension.
    foreach (@SHTML_EXTENSIONS) {
        $isshtml= 1, last if ($fname=~ /\Q$_\E$/i) ;
    }

    if ($isshtml) {
        $errlist= [] ;
        $iscgi= ($HTML=~ /<!--#exec\b/i) ;

        # Main loop: each <!--#include ...> tag is replaced (via /e) by the
        # expanded contents of the included file; errors fall through
        # GET_CHILD and leave $childHTML undef (tag replaced by nothing).
        $HTML=~ s{(<!--#include\b.*?>)} {
            do {
                my($childfname, $childURL) ;
                my($childHTML, $childerrlist, $childiscgi) ;
                my($tagst) = $1 ;
                my($tag)= &parse_tag($tagst) ;
                GET_CHILD: {
                    if (length($tag->{'attrib'}{'virtual'})) {
                        # virtual="..." is resolved as a URL relative to
                        # this file's own URL, then mapped back to a file.
                        $URL= &filename_to_url($fname) unless length($URL) ;
                        $childURL=
                            &absolute_url($tag->{'attrib'}{'virtual'}, $URL) ;
                        ($childfname)= &url_to_filename($childURL) ;

                        # If it's a CGI, don't follow it, but no error either
                        if (&is_cgi($childfname, $childURL)) {
                            $iscgi= 1 ;
                            last GET_CHILD ;
                        }

                    } elsif (length($tag->{'attrib'}{'file'})) {
                        # file="..." must be a relative path with no "../",
                        # per SSI rules; resolved against this file's dir.
                        $childfname= $tag->{'attrib'}{'file'} ;
                        if ($childfname=~ m#^(/|~)#) {
                            push(@$errlist, [ {'path' => $childfname,
                                'tag' => $tagst, 'errmsg' =>
                                'Absolute paths are not allowed in '
                                . '<!--#include file="..." -->.'}]);
                            last GET_CHILD ;
                        }
                        if ($childfname=~ m#\.\.(/|$)#) {
                            push(@$errlist, [ {'path' => $childfname,
                                'tag' => $tagst, 'errmsg' =>
                                'Paths can not contain "../" in '
                                . '<!--#include file="..." -->.'}]);
                            last GET_CHILD ;
                        }
                        $childfname= ($fname=~ m#(.*/)#)[0] . $childfname ;

                    } else {
                        push(@$errlist, [ {'path' => '',
                            'tag' => $tagst, 'errmsg' =>
                            'Tag must contain either the "file" or '
                            . '"virtual" attribute.'}]);
                        last GET_CHILD ;

                    }

                    # canonicalize filename for %$parents
                    1 while $childfname=~ s#/\.(/|$)#/# ;
                    1 while $childfname=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ;

                    # Guarantee that file exists, is regular, and is readable
                    unless (-e $childfname) {
                        push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
                                          'errmsg' => 'File not found'} ] ) ;
                        last GET_CHILD ;
                    }
                    unless (-f $childfname) {
                        push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
                                          'errmsg' => 'File is not a regular'
                                                      . ' file.' } ] ) ;
                        last GET_CHILD ;
                    }
                    unless (-r $childfname) {
                        push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
                                          'errmsg' => 'File is not readable by'
                                                      . ' current user.' } ] ) ;
                        last GET_CHILD ;
                    }

                    # Guard against include loops
                    if ($parents->{$childfname}) {
                        push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
                                          'errmsg' => 'An "include" loop exists'
                                                      . ' involving this file.' } ] ) ;
                        last GET_CHILD ;
                    }

                    # Get the included file, with any error data
                    ($childHTML, $childerrlist, $childiscgi)=
                        &read_expanded_file($childfname, $childURL, $parents) ;

                    # Log if there was any error reading the file
                    push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
                                      'errmsg' => "Can't read file: $!." } ] )
                        unless defined($childHTML) ;

                    # Add any errors to the current (parent) error list,
                    # prefixing each chain with the child's path and the tag
                    # that loaded it, so every error in $errlist is keyed to
                    # a tag that exists in the current $HTML.
                    foreach my $error (@$childerrlist) {
                        unshift(@$error,
                                { 'path' => $childfname, 'tag' => $tagst } ) ;
                    }
                    push(@$errlist, @$childerrlist) ;

                    # Parent is a CGI if any of its children is a CGI
                    $iscgi||= $childiscgi ;

                } # GET_CHILD

                $childHTML ;   # final value to replace in main s/// construct

            } # do {}

        }gie ;   # $HTML=~ s{} {}

    } # if ($isshtml)

    # This call is done; remove it from the ancestor set so sibling
    # includes of the same file are not misreported as loops.
    delete $parents->{$fname} ;

    return($HTML, $errlist, $iscgi) ;
}
|
|
955 |
|
|
956 |
|
|
957 |
|
|
958 |
# Returns the contents of the named file, or undef on error.
|
|
959 |
# Returns the entire contents of the named file as one string, or undef
# on error (file missing, unreadable, etc.).
sub read_file {
    my($fname)= @_ ;
    local($/) ;   # localized without a value, i.e. undef: slurp mode

    # Three-arg open with a lexical filehandle: the old two-arg
    # open(F, "<$fname") would let a filename beginning with ">", "+<",
    # or ending in "|" change the open mode (a classic Perl security
    # hole); the three-arg form treats $fname purely as a filename.
    open(my $fh, '<', $fname) || return undef ;
    my($ret)= <$fh> ;
    close($fh) ;

    return $ret ;
}
|
|
970 |
|
|
971 |
|
|
972 |
# Try to get the given URL with the given HTTP method, and return the
|
|
973 |
# status line, headers, and body.
|
|
974 |
# Set $url->{'status'} accordingly, and set $url->{'ishtml'} accordingly
|
|
975 |
# if Content-Type: header is returned.
|
|
976 |
# This is specific to this program, and calls the more general &get_url().
|
|
977 |
# This could be slightly more efficient if 302 or 303 was handled in the
|
|
978 |
# calling routine, where it could take advantage of a new URL being local.
|
|
979 |
# Fetch $url (a URL record hash) over HTTP with the given $method, and
# return ($status_line, $headers, $body) from &get_url().
# Side effects on the %$url record:
#   'status'       -- set from the response status line;
#   'location'     -- set to the absolutized Location: target on 301/302/303;
#   'numredirects' -- incremented on each followed redirect (302/303 only;
#                     a 301's location is recorded but not re-fetched here);
#   'numtries', 'lasttried', 'ishtml' -- bookkeeping, set below.
sub load_url_using_HTTP {
    my($url, $method)= @_ ;
    my($status_line, $headers, $body) ;

    # We should not get here if $file_check is set
    die "mistakenly called load_url_using_HTTP($url->{'URL'})" if $file_check ;

    GETFILE: {
        # Prefer a previously recorded redirect target over the original URL.
        ($status_line, $headers, $body)=
            &get_url( ($url->{'location'} || $url->{'URL'}), $method) ;

        # If HEAD failed (as on some servers), sigh and use GET.
        # (Retried whenever no status line came back at all, whatever
        # $method was.)
        ($status_line, $headers, $body)=
            &get_url( ($url->{'location'} || $url->{'URL'}), 'GET')
                unless length($status_line) ;

        ($url->{'status'})= $status_line=~ m#^HTTP/[\d.]+\s+(.*)# ;

        # 2-27-00 JSM: Allow old NCSA servers to not include the HTTP version.
        if ($SUPPORT_NCSA_BUG and $url->{'status'} eq '') {
            ($url->{'status'})= $status_line=~ m#^HTTP(?:/[\d.]+)?\s+(.*)# ;
        }

        # Redirect to new location if status is 302 or 303 (a 301 only
        # records its Location: for the caller; it is not followed here).
        if ($url->{'status'}=~ /^(301|302|303)\b/) {
            ($url->{'location'})= $headers=~ m#^Location:[ \t]+(\S+)#im ;
            last GETFILE unless length($url->{'location'}) ;
            # Resolve relative Location: values against the original URL
            # (prevents the infinite-recursion bug noted in the 1.0.1 notes).
            $url->{'location'}=
                &absolute_url($url->{'location'}, $url->{'URL'}) ;
            redo GETFILE
                if ($url->{'status'}=~ /^(302|303)\b/)
                    && (++$url->{'numredirects'} <= $MAX_REDIRECTS) ;
        }
    }

    $url->{'numtries'}++ ;
    $url->{'lasttried'}= time ;

    # If successful response included Content-Type:, set ishtml accordingly
    # ("+ 0" normalizes the eq result to a numeric 0/1).
    $url->{'ishtml'}= (lc($1) eq 'text/html') + 0
        if $url->{'status'}=~ /^2/
            and $headers=~ m#^content-type:[ \t]*(\S+)#im ;

    print STDERR "status: $status_line\n" if $debug ;

    return($status_line, $headers, $body) ;
}
|
|
1026 |
|
|
1027 |
|
|
1028 |
# Request the HTTP resource at the given absolute URL using the given method,
|
|
1029 |
# and return the response status line, headers, and body.
|
|
1030 |
# jsm-- in the future, this should support downloading to a file, in case
|
|
1031 |
# the download is too large to fit in memory.
|
|
1032 |
# Request the HTTP resource at absolute $URL with $method (default GET)
# over a raw TCP socket, speaking HTTP/1.1 with "Connection: close".
# Returns ($status_line, $headers, $body).  Synthetic status lines are
# returned for local failures:
#   600 -- couldn't create the socket;
#   601 -- no response within the 60-second select() timeout;
#   602 -- fewer body bytes than Content-Length: promised;
#   603 -- an incomplete chunk in a chunked response.
# Honors $ENV{'http_proxy'} (unless the host matches @NO_PROXY).
# jsm-- in the future, this should support downloading to a file, in case
# the download is too large to fit in memory.
sub get_url {
    my($URL, $method)= @_ ;
    my($host, $uri, $endhost) ;
    my($S, $rin) ;
    my($response, $status_line, $headers, $body, $status_code) ;
    my($content_length) ;
    $method= uc($method) ;
    $method= 'GET' unless length($method) ;

    ($host, $uri)= $URL=~ m#^http://([^/]*)(.*)$#i ;
    $uri= '/' unless length($uri) ;
    # $endhost keeps the real target host for the Host: header, even when
    # $host is later replaced by the proxy address.
    $endhost= $host ;

    # use an HTTP proxy if $ENV{'http_proxy'} is set
    USEPROXY: {
        # Unqualified (dotless) hostnames are assumed local: never proxied.
        last USEPROXY unless $host=~ /\./ ;
        if (length($ENV{'http_proxy'})) {
            foreach (@NO_PROXY) {
                last USEPROXY if $host=~ /$_$/i ;
            }
            ($host)= $ENV{'http_proxy'}=~ m#^(?:http://)?([^/]*)#i ;
            # A proxied request uses the full URL as the request-URI.
            $uri= $URL ;
        }
    }

    # Open socket
    $S= IO::Socket::INET->new(PeerAddr => $host,   # may contain :port
                              PeerPort => 80,      # default if none in PeerAddr
                              Proto => 'tcp') ;
    return("HTTP/1.1 600 Can't create socket: $@") unless defined($S) ;
    $S->autoflush() ;   # very important!!

    # Send HTTP 1.1 request (CRLF line endings spelled out as \015\012,
    # which is portable regardless of the platform's \n)
    print $S "$method $uri HTTP/1.1\015\012",
             "Host: $endhost\015\012",
             "Connection: close\015\012",
             "User-agent: CheckLinks/$CL_VERSION\015\012",
             "\015\012" ;

    # Wait for socket response with select(), up to 60 seconds
    vec($rin= '', fileno($S), 1)= 1 ;
    select($rin, undef, undef, 60)
        || return("HTTP/1.1 601 Connection timed out") ;

    # Read line-by-line on LF; CRs are stripped explicitly below.
    local($/)= "\012" ;

    # Handle "100 Continue" responses for HTTP 1.1: loop until non-1xx.
    do {
        $status_line= <$S> ;
        $status_line=~ s/\015?\012$// ;
        ($status_code)= $status_line=~ m#^HTTP/\d+\.\d+\s+(\d+)# ;

        $headers= '' ;
        while (<$S>) {
            last if /^\015?\012/ ;   # blank line ends the header block
            $headers.= $_ ;
        }
        # Unfold continuation lines (leading whitespace) into one line.
        $headers=~ s/\015?\012[ \t]+/ /g ;
    } until $status_code!~ /^1/ ;

    # Body length is determined by HTTP 1.1 spec, section 4.4: these
    # certain conditions implying no body, then chunked encoding,
    # then Content-length: header, then server closing connection.
    if ($method eq 'HEAD' or $status_code=~ /^(1|204\b|304\b)/) {
        $body= undef ;

    # else chunked encoding
    } elsif ($headers=~ /^transfer-encoding:[ \t]*chunked\b/im) {
        # 7-16-99: Old code was only saving last chunk.  Fix using
        # $this_chunk contributed by Mark Trotter.
        my($this_chunk, $chunk_size, $readsofar, $thisread) ;
        # Each chunk starts with its size in hex; a size of 0 ends the body.
        while ($chunk_size= hex(<$S>)) {
            $readsofar= 0 ;
            # read() may return short counts; loop until the chunk is full
            # or the connection gives out.
            while ($readsofar!=$chunk_size) {
                last unless $thisread=
                    read($S, $this_chunk, $chunk_size-$readsofar, $readsofar) ;
                $readsofar+= $thisread ;
            }
            return("HTTP/1.1 603 Incomplete chunked response", $headers, $body)
                if $readsofar!=$chunk_size ;
            $_= <$S> ;   # clear CRLF after chunk
            $body.= $this_chunk ;
        }

        # Read footers (trailing headers) if they exist
        while (<$S>) {
            last if /^\015?\012/ ;
            $headers.= $_ ;
        }
        $headers=~ s/\015?\012[ \t]+/ /g ;


    # else body length given in Content-length:
    } elsif (($content_length)= $headers=~ /^content-length:[ \t]*(\d+)/im) {
        my($readsofar, $thisread) ;
        while ($readsofar!=$content_length) {
            last unless $thisread=
                read($S, $body, $content_length-$readsofar, $readsofar) ;
            $readsofar+= $thisread ;
        }
        return(sprintf("HTTP/1.1 602 Incomplete response (%s of %s bytes)",
                       $readsofar+0, $content_length),
               $headers, $body)
            if $readsofar!=$content_length ;


    # else body is entire socket output
    } else {
        local($/)= undef ;
        $body= <$S> ;
    }

    close($S) ;

    return($status_line, $headers, $body) ;
}
|
|
1148 |
|
|
1149 |
|
|
1150 |
#----- URL-parsing routines -------------------------------------------
|
|
1151 |
|
|
1152 |
# The routines parse_url(), unparse_url(), and absolute_url() are based on
|
|
1153 |
# different sections in the Internet Draft "Uniform Resource Identifiers
|
|
1154 |
# (URI): Generic Syntax and Semantics", 11-18-97, by Berners-Lee,
|
|
1155 |
# Fielding, and Masinter, filename draft-fielding-uri-syntax-01.
|
|
1156 |
|
|
1157 |
# Parse a URL into its components, according to URI draft, sections 4.3, 4.4.
|
|
1158 |
# This regular expression is straight from Appendix B, modified to use Perl 5.
|
|
1159 |
# Returns scheme, site, path, query, and fragment. All but path may have
|
|
1160 |
# the undefined value.
|
|
1161 |
# Split a URL into (scheme, site, path, query, fragment) per the regex in
# Appendix B of the URI syntax draft, adapted to Perl 5.  $path is always
# defined (possibly ''); the other four components are undef when absent.
# The path is also URL-decoded here, so e.g. "%7E" and "~" compare equal.
sub parse_url {
    my($url)= @_ ;

    # One regex does all the splitting.  Every component is optional
    # except the path, whose capture always matches (possibly '').
    my($scheme, $site, $path, $query, $fragment)=
        $url=~ m{^(?: ([^:/?\#]+):)?
                  (?: // ([^/?\#]*))?
                  ([^?\#]*)
                  (?: \? ([^\#]*))?
                  (?: \# (.*))?
                 }x ;

    # Decode the path: '+' to space first, then %XX escapes, so an
    # escaped "%2B" survives as a literal '+'.  (If the decoded path is
    # later fed back into relative-URL resolution, it could in rare
    # cases be decoded a second time.)
    $path=~ tr/+/ / ;
    $path=~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge ;

    # In HTTP, the presence of a host implies a path beginning with '/',
    # so a URL like "http://www.somehost.com" gets the path '/'.
    $path= '/'
        if !length($path) && length($site) && lc($scheme) eq 'http' ;

    return($scheme, $site, $path, $query, $fragment) ;
}
|
|
1187 |
|
|
1188 |
|
|
1189 |
# Returns a full URL string, given its components
|
|
1190 |
# The full procedure is described in the URI draft, section 5.2, step 7.
|
|
1191 |
# Reassemble a URL string from its five components (scheme, site, path,
# query, fragment), per step 7 of section 5.2 of the URI draft.  Each
# component except the path contributes only when defined.
sub unparse_url {
    my($scheme, $site, $path, $query, $fragment)= @_ ;

    my(@piece) ;
    push(@piece, "$scheme:")   if defined($scheme) ;
    push(@piece, "//$site")    if defined($site) ;
    push(@piece, $path) ;                             # path is always present
    push(@piece, "?$query")    if defined($query) ;
    push(@piece, "#$fragment") if defined($fragment) ;

    return join('', @piece) ;
}
|
|
1203 |
|
|
1204 |
|
|
1205 |
# Returns a canonicalized absolute URL, given a relative URL and a base URL.
|
|
1206 |
# The full procedure is described in the URI draft, section 5.2.
|
|
1207 |
# Note that a relative URI of "#fragment" should be resolved to "the current
|
|
1208 |
# document", not to an absolute URL. This presents a quandary for this
|
|
1209 |
# routine: should it always return an absolute URL, thus violating the
|
|
1210 |
# spec, or should it not always return an absolute URL, thus requiring any
|
|
1211 |
# caller to check for this special case? This routine leaves that up to
|
|
1212 |
# the caller, with $return_rel_fragment-- if set, stick to the spec;
|
|
1213 |
# otherwise, always return an absolute URL. See section G.4 of the draft.
|
|
1214 |
# Note that the pathname reduction in steps 6.c-f messes up any PATH_INFO
|
|
1215 |
# that has ./ or ../ in it, which may be a bug in the spec.
|
|
1216 |
# Return a canonicalized absolute URL, resolving $relurl against $baseurl
# per section 5.2 of the URI draft (step numbers in the comments below).
# If $return_rel_fragment is true and $relurl is a bare "#fragment", the
# relative form is returned unchanged, per the spec; otherwise an absolute
# URL is always returned.  See the comment block above for the rationale.
sub absolute_url {
    my($relurl, $baseurl, $return_rel_fragment)= @_ ;
    my(@relurl, @baseurl) ;

    # parse_url() returns scheme, site, path, query, fragment
    # (indices 0..4 of @relurl/@baseurl below)
    @relurl= &parse_url($relurl) ;               # Step 1
    @baseurl= &parse_url($baseurl) ;

    COMBINE: {

        # Step 2: an empty URL (or bare fragment) refers to the base
        # document.  See note above about $return_rel_fragment.
        if ( $relurl[2] eq '' &&
             !defined($relurl[0]) &&
             !defined($relurl[1]) &&
             !defined($relurl[3]) ) {
            @relurl[0..3]= @baseurl[0..3] ;
            return $relurl if $return_rel_fragment ;   # see note above
            last COMBINE ;
        }

        last COMBINE if defined($relurl[0]) ;    # Step 3: already absolute
        $relurl[0]= $baseurl[0] ;                # inherit scheme

        last COMBINE if defined($relurl[1]) ;    # Step 4: has its own host
        $relurl[1]= $baseurl[1] ;                # inherit site

        last COMBINE if $relurl[2]=~ m#^/# ;     # Step 5: absolute path

        # Step 6-- resolve relative path against the base's directory
        my($path)= $baseurl[2]=~ m#^(.*/)# ;     # Step 6.a
        $relurl[2]= $path . $relurl[2] ;         # Step 6.b

    } # COMBINE

    # Put the remaining steps outside of the block to canonicalize the path.
    # Arguably, this is not allowed.  To avoid such arguments at the expense
    # of path canonicalization, put steps 6.c-f back in the COMBINE block.

    1 while $relurl[2]=~ s#(^|/)\./#$1# ;        # Step 6.c: drop "./" segments
    $relurl[2]=~ s#(^|/)\.$#$1# ;                # Step 6.d: drop trailing "."

    # Step 6.e: repeatedly collapse "segment/../", leaving literal "../"
    # segments alone; $oldpath stops the loop when no progress is made.
    my($oldpath) ;
    while ($relurl[2]=~ s#(([^/]+)/\.\./)# ($2 eq '..') ? $1 : '' #ge) {
        last if ($relurl[2] eq $oldpath) ;
        $oldpath= $relurl[2] ;
    }

    # Step 6.f: same collapse for a trailing "segment/.."
    $relurl[2]=~ s#(([^/]+)/\.\.$)# ($2 eq '..') ? $1 : '' #ge ;

    # Step 6.g: allow leading ".." segments to remain in path
    # Step 6.h: relurl[2] is already the buffer string

    # To canonicalize further, lowercase the hostname (is this valid for all
    # schemes?)
    $relurl[1]= lc($relurl[1]) if defined($relurl[1]) ;

    return &unparse_url(@relurl) ;               # Step 7
}
|
|
1277 |
|
|
1278 |
|
|
1279 |
|
|
1280 |
# Convert a local URL into a canonicalized absolute path, or undef if
|
|
1281 |
# not on this host or other error.
|
|
1282 |
# Result should only be used as filename.
|
|
1283 |
# Supports UserDir (e.g. public_html) for "/~username/path/file" URLs.
|
|
1284 |
# Supports Alias, AliasMatch, ScriptAlias, and ScriptAliasMatch from srm.conf
|
|
1285 |
# (but note use of Perl regex's instead of standard regex's).
|
|
1286 |
# Inserts index.html, etc. (from @DIRECTORY_INDEX) if result is a directory,
|
|
1287 |
# but just return directory name (ending in '/') if none of those exists.
|
|
1288 |
# Removes PATH_INFO, if any, from filename.
|
|
1289 |
# Directory names are always returned with trailing slash (which would not
|
|
1290 |
# be appropriate if PATH_INFO was to be retained).
|
|
1291 |
# While this routine makes some tests (e.g. if the file is a directory),
|
|
1292 |
# it does not verify that file at the resulting $filename exists.
|
|
1293 |
# Note that not all URLs point to files, so this routine is not always
|
|
1294 |
# appropriate. In this program, the result from this routine is only
|
|
1295 |
# used when we know the URL is not a CGI script (and is therefore a file),
|
|
1296 |
# except in &is_cgi() itself, which tests if a file is a CGI script.
|
|
1297 |
# If it weren't for &is_cgi(), we could ignore cases when the URL isn't
|
|
1298 |
# a file.
|
|
1299 |
# 12-1-99 JSM: Changed to also return "redirected" location, in case URL
|
|
1300 |
# is a directory but not ending in a slash, so relative URLs will resolve
|
|
1301 |
# correctly against the redirected URL.
|
|
1302 |
# Convert a local URL into ($path, $location):
#   $path     -- canonicalized absolute filename, or undef if the URL is
#                not on $LOCAL_HOST or is otherwise invalid;
#   $location -- "redirect" URL ("$URL/") when the URL names a directory
#                without a trailing slash, else undef.
# Handles %ALIAS / %ALIAS_MATCH / %SCRIPT_ALIAS / %SCRIPT_ALIAS_MATCH,
# "/~user/" URLs via $USER_DIR, PATH_INFO stripping, and @DIRECTORY_INDEX.
# See the comment block above this routine for full details.
sub url_to_filename {
    my($URL)= @_ ;
    my($URLpath, $path, $location, $docroot, $user) ;
    return undef unless $URL=~ m#^http://\Q$LOCAL_HOST\E/#io ;
    $URLpath= (&parse_url($URL))[2] ;
    die "couldn't get path from [$URL]" unless length($URLpath) ;

    # Note minor security hole: if this script is run setuid, then any
    # file on the system could be read by using an ALIAS to point to the
    # file.  Note also that if a $URLpath such as "/alias/dir/../.." is
    # passed to this routine, the alias will be substituted BEFORE the
    # ".." path segments are traversed.  A case like this is probably a
    # mistake in the URL anyway.

    # Make no more than one alias substitution-- is there a precedence order?
    # Note that %(.*)_MATCH use Perl regex's, not standard regex's; the
    # string-eval's let $1-type backreferences in the targets expand.
    # 3-29-99 JSM: These all alias to actual directory, not to a resulting
    # URL, so no further conversion should be done if one of these matches.
    # 3-29-99 JSM: Changed ALIAS_MATCH and SCRIPT_ALIAS_MATCH blocks to
    # allow $1-type substitution in targets; MUST TEST!
    ALIAS: {
        foreach (keys %ALIAS)
            { $path= $URLpath, last ALIAS
                  if $URLpath=~ s/^\Q$_\E/$ALIAS{$_}/ }
        foreach (keys %ALIAS_MATCH)
            { $path= $URLpath, last ALIAS
                  if eval "\$URLpath=~ s/^\$_/$ALIAS_MATCH{$_}/" }
        foreach (keys %SCRIPT_ALIAS)
            { $path= $URLpath, last ALIAS
                  if $URLpath=~ s/^\Q$_\E/$SCRIPT_ALIAS{$_}/ }
        foreach (keys %SCRIPT_ALIAS_MATCH)
            { $path= $URLpath, last ALIAS
                  if eval "\$URLpath=~ s/^\$_/$SCRIPT_ALIAS_MATCH{$_}/" }
    }


    # If $path has been set in above ALIAS block, no further conversion is
    # needed.
    if ($path eq '') {

        # Must check for ^/.. before PATH_INFO check, in case $URL's path
        # is e.g. '/../conf/access.conf'
        return undef if $URLpath=~ m#^/\.\.(/|$)# ;   # ^/.. is not allowed

        # Set $docroot and $path for this file, based on the URL (contains '~' ?)
        if (length($USER_DIR) and ($user,$path)= $URLpath=~ m#^/~([^/]+)(.*)# ) {
            &read_home_dirs unless %home_dir ;   # only read when needed
            return undef unless length($home_dir{$user}) ;
            $docroot= "$home_dir{$user}/$USER_DIR" ;
            $path= '/' unless length($path) ;
        } else {
            # If we have no $DOCUMENT_ROOT, we can't handle URLs without ~.
            return undef unless $DOC_ROOT_EXISTS ;
            $docroot= $DOCUMENT_ROOT ;
            $path= $URLpath ;
        }

        # Handle PATH_INFO: remove path segments until an existing file is
        # named.  Note that directories cannot have PATH_INFO after them.
        unless (-e "$docroot$path") {
            for (my($path2)= $path ; $path2=~ m#/# ; $path2=~ s#/[^/]*$##) {
                last if -d "$docroot$path2" ;
                $path= $path2, last if -e "$docroot$path2" ;
            }
        }

        # canonicalize path, and recheck for ^/.. (handles an obscure error,
        # when $URL's path is e.g. '/a/b/../../..'; but must canonicalize
        # after PATH_INFO check, in case path is e.g. '/a/b.cgi/../../..').
        1 while $path=~ s#/\.(/|$)#/# ;
        1 while $path=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ;
        return undef if $path=~ m#^/\.\.(/|$)# ;   # ^/.. is not allowed

        $path= "$docroot$path" ;

    }


    # Add "index.html", etc. if appropriate
    if (-d $path) {
        $path.= '/' unless $path=~ m#/$# ;
        # 12-1-99 JSM: set "redirected" location also, so relative URLs
        # resolve against the slash-terminated directory URL
        $location= "$URL/" unless $URL=~ m#/$# ;
        foreach (@DIRECTORY_INDEX) {
            $path.= $_, last if -f "$path$_" ;
        }
    }
    return ($path, $location) ;
}
|
|
1391 |
|
|
1392 |
|
|
1393 |
# Convert a local (possibly relative) pathname into a canonicalized URL.
|
|
1394 |
# If filename is relative and no $basepath is given, assume it's in the
|
|
1395 |
# current directory.
|
|
1396 |
# Supports UserDir (e.g. public_html) for "/~username/path/file" URLs.
|
|
1397 |
# Each path segment is checked to see if it's the same as $DOCUMENT_ROOT,
|
|
1398 |
# by comparing inodes. When a match is found, it's cut off the front,
|
|
1399 |
# and an absolute URL is constructed. If $DOCUMENT_ROOT is never matched,
|
|
1400 |
# then $USER_DIR is scanned for. If that doesn't match (i.e. the file
|
|
1401 |
# is not served to the Web), undef is returned.
|
|
1402 |
# Note that $DOC_ROOT_DEV and $DOC_ROOT_INODE are set at the start of the
|
|
1403 |
# program for efficiency, but are an integral part of this routine.
|
|
1404 |
# %ALIAS is supported using %UN_ALIAS, as best as possible. See next note
|
|
1405 |
# on avoiding use of this routine.
|
|
1406 |
# This is currently only used when parsing command-line filenames, and when
|
|
1407 |
# an <!--#include file="..." --> includes an <!--#include virtual="..." -->
|
|
1408 |
# (which may be an error anyway). Thus, it can be avoided if needed, such
|
|
1409 |
# as when complex aliasing makes results ambiguous.
|
|
1410 |
# jsm-- should this add/remove @DIRECTORY_INDEX, to avoid some duplication?
|
|
1411 |
# 3-29-99 JSM: Changed UNALIAS handling-- if it's unaliased, then no other
|
|
1412 |
# conversion is necessary.
|
|
1413 |
# Convert a local (possibly relative) pathname into a canonicalized URL on
# $LOCAL_HOST, or undef if the file is not served to the Web.
#   $path     -- filename; if relative, resolved against $basepath (or $CWD).
#   $basepath -- optional base for relative $path.
# Tries, in order: %UN_ALIAS reversal, the $DOCUMENT_ROOT tree (matched by
# device/inode so symlinked spellings still match), then "~user" home
# directories via $USER_DIR.  See the comment block above for caveats.
sub filename_to_url {
    my($path, $basepath)= @_ ;
    my($URLpath) ;
    unless ($path=~ m#^/#) {
        $basepath= $CWD unless length($basepath) ;
        $basepath.= '/' if -d $basepath && $basepath!~ m#/$# ;
        $basepath=~ s#/[^/]*$#/# ;
        $path= "$basepath$path" ;
    }

    # canonicalize filename by removing ./ and ../ where appropriate
    1 while $path=~ s#/\.(/|$)#/# ;
    1 while $path=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ;

    # canonicalize directory to include final /
    $path.= '/' if -d $path && $path!~ m#/$# ;

    # First, if path can be unaliased, return that.
    # (Added 3-29-99 by JSM.)
    foreach (keys %UN_ALIAS)
        { $URLpath= $path, last if $path=~ s/^\Q$_\E/$UN_ALIAS{$_}/ }
    return "http://\L$LOCAL_HOST\E$URLpath" if $URLpath ne '' ;

    # Then, check if file is under $DOCUMENT_ROOT tree, and convert if so.
    # Walk up the path one segment at a time, comparing each ancestor
    # directory to the document root by device/inode.
    if ($DOC_ROOT_EXISTS) {
        my($doc_root)= $path ;
        while ($doc_root=~ s#/[^/]*$##) {
            my($dev,$inode)= stat("$doc_root/.") ;
            if ( ($dev==$DOC_ROOT_DEV) && ($inode==$DOC_ROOT_INODE) ) {
                # \Q...\E quotes $doc_root: it is a literal path, and an
                # unquoted path with regex metacharacters (e.g. "c++" or
                # a dotted name matching too loosely) would corrupt the
                # prefix removal.
                $path=~ s/^\Q$doc_root\E// ;
#               foreach (keys %UN_ALIAS)
#                   { last if $path=~ s/^\Q$_\E/$UN_ALIAS{$_}/ }
                return "http://\L$LOCAL_HOST\E$path" ;
            }
        }
    }

    # Handle possible case of "~username/$USER_DIR/$path"
    # I don't think %ALIAS applies here, does it?
    # This misses some when $HOME/$USER_DIR points through a symbolic link,
    # and $CWD isn't set to match %dir_to_user.  Work around by avoiding
    # this routine, e.g. using only URLs on command line.
    if (length($USER_DIR)) {
        # \Q...\E here too: $USER_DIR is a literal directory name.
        if ($path=~ m#^(.*?)/\Q$USER_DIR\E(/.*)# ) {
            # First, see if path is in %dir_to_user
            &read_home_dirs unless %dir_to_user ;   # only read when needed
            return "http://\L$LOCAL_HOST\E/~$dir_to_user{$1}$2"
                if length($dir_to_user{$1}) ;

            # If not, then we must check inodes to equivalate directories
            &read_inode_to_user unless %inode_to_user ;   # only read when needed
            my($dev,$inode)= stat("$1/.") ;
            return "http://\L$LOCAL_HOST\E/~$inode_to_user{$dev}{$inode}$2"
                if length($inode_to_user{$dev}{$inode}) ;
        }
    }

    return undef ;
}
|
|
1472 |
|
|
1473 |
|
|
1474 |
|
|
1475 |
# Reads all users' home directory into %home_dir, from /etc/passwd.
|
|
1476 |
# Also creates %dir_to_user, which is faster than %inode_to_user (below).
|
|
1477 |
# Only used when $USER_DIR is used, for "/~username/path/file" URLs.
|
|
1478 |
# 2-27-00 JSM: Changed to use getpwent, instead of reading /etc/passwd.
|
|
1479 |
# Fill the global %home_dir (user => home directory) from the system
# password database, plus the reverse map %dir_to_user (home directory =>
# user).  When two accounts share a home directory, the first one seen
# keeps the %dir_to_user entry.  %dir_to_user is faster to consult than
# %inode_to_user (built below).
# Only used when $USER_DIR is set, for "/~username/path/file" URLs.
# 2-27-00 JSM: Changed to use getpwent, instead of reading /etc/passwd.
sub read_home_dirs {
    setpwent ;   # rewind, in case getpwent has already been used

    while ( my($user, $homedir)= (getpwent)[0,7] ) {
        $home_dir{$user}= $homedir ;

        # first account seen for a given home directory wins
        $dir_to_user{$homedir}= $user
            if $dir_to_user{$homedir} eq '' ;
    }

    endpwent ;   # clean way to end getpwent processing
}
|
|
1490 |
|
|
1491 |
|
|
1492 |
# Reads home directory inode information into %inode_to_user, from /etc/passwd.
|
|
1493 |
# Because this is time-consuming, it is only called if needed, and only once.
|
|
1494 |
# Only used when $USER_DIR is used, for "/~username/path/file" URLs.
|
|
1495 |
# On SPARCstation-10 with 3000 /etc/passwd records, this takes ~2 seconds.
|
|
1496 |
# 2-27-00 JSM: Changed to use already-existing %home_dir, instead of reading
|
|
1497 |
# /etc/passwd again.
|
|
1498 |
# Build the global %inode_to_user map, keyed by device and inode number
# of each user's home directory, so a filesystem path can be equated to
# a "~username" even through differing path spellings.  Populates
# %home_dir first (via &read_home_dirs) if that hasn't been done yet.
sub read_inode_to_user {
    &read_home_dirs unless %home_dir ;   # only read when needed

    foreach my $login (keys %home_dir) {
        # stat the directory itself ("/."), not a possible symlink to it.
        my($device, $ino)= stat("$home_dir{$login}/.") ;
        $inode_to_user{$device}{$ino}= $login
            if $inode_to_user{$device}{$ino} eq '' ;   # first login wins
    }
}
|
|
1509 |
|
|
1510 |
|
|
1511 |
|
|
1512 |
#----- Extracting URLs from HTML ------------------------------------
|
|
1513 |
|
|
1514 |
|
|
1515 |
# Parse an SGML tag, and return a hash structure with a "name" scalar and
|
|
1516 |
# an "attrib" hash.
|
|
1517 |
# Parses first tag in string, ignoring all surrounding text.
|
|
1518 |
# Results are canonicalized to lower case wherever case-insensitive.
|
|
1519 |
# Parse the first SGML/HTML tag found in the input string, ignoring any
# surrounding text, and return a hashref of the form
#     { 'name' => $lowercased_tag_name, 'attrib' => \%attributes }
# Attribute names are lowercased; values are HTML-unescaped.  Boolean
# (valueless) attributes get a value equal to their own lowercased name.
sub parse_tag {
    my($tag)= @_ ;                 # working copy; consumed as we parse
    my($tagname, %attrib) ;

    # Isolate the first tag: drop everything after the first ">", then
    # drop any pre-"<" text, the "<" itself, and leading whitespace.
    ($tag)= split(/>/, $tag) ;
    $tag=~ s/^([^<]*<)?\s*// ;

    # Split off the tag name; the remainder is the attribute string.
    ($tagname, $tag)= split(/\s+/, $tag, 2) ;

    # Repeatedly strip one name=value pair off the front.  The value may
    # be unquoted (whitespace-delimited) or quoted with " or '; $+ is the
    # highest-numbered capture that matched, i.e. the value sans quotes.
    while ($tag=~ s/\s*(\w+)\s*=\s*(([^"']\S*)|"([^"]*)"?|'([^']*)'?)//) {
        $attrib{lc($1)}= &HTMLunescape($+) ;
    }

    # Whatever survives is a list of valueless (boolean) attributes.
    $tag=~ s/^\s*|\s*$//g ;
    foreach my $bool (split(/\s+/, $tag)) {
        $attrib{lc($bool)}= lc($bool) ;   # boolean value equals its name
    }

    return { 'name'   => lc($tagname),
             'attrib' => \%attrib } ;
}
|
|
1543 |
|
|
1544 |
|
|
1545 |
# Unescape any HTML character references and return resulting string.
|
|
1546 |
# Support entity character references in %e_to_ch (which is incomplete),
|
|
1547 |
# plus "&#ddd;" and "&#xdd;" forms for values<256.
|
|
1548 |
# Note that not decoding a valid character is erroneous, in that a
|
|
1549 |
# subsequent re-escaping will not return the original string, because
|
|
1550 |
# of the ampersand. Nonetheless, that's preferable to losing the data.
|
|
1551 |
# Q: Is there an appropriate general way to represent an unescaped string?
|
|
1552 |
# Decode HTML character references in a string and return the result.
# Named entities come from the global %e_to_ch table (incomplete);
# decimal "&#ddd;" and hex "&#xdd;" forms are decoded for code points
# below 256.  Any reference we cannot decode is left exactly as it was.
sub HTMLunescape {
    my($str)= @_ ;

    # $1 is the whole reference (minus the "&"); $2/$3/$4 hold the named,
    # decimal, and hex variants respectively.  Whichever variant matched
    # decides the branch; $asis restores the original text on failure.
    $str=~ s/&(([a-zA-Z][a-zA-Z0-9.-]*);?|#([0-9]+);?|#[Xx]([0-9a-fA-F]+);?)/
               my($asis)= "&$1" ;
               length($2) ? ( defined($e_to_ch{$2}) ? $e_to_ch{$2}  : $asis )
             : length($3) ? ( $3 < 256              ? chr($3)       : $asis )
             : length($4) ? ( hex($4) < 256         ? chr(hex($4))  : $asis )
             :              $asis
           /ge ;

    return $str ;
}
|
|
1565 |
|
|
1566 |
|
|
1567 |
# Given a block of HTML, extracts all URLs referenced in it, and adds them
|
|
1568 |
# to our data structures to be downloaded or checked (i.e. calls
|
|
1569 |
# &add_url()).
|
|
1570 |
# Note that %html_urls and %non_html_urls are set at the start of the
|
|
1571 |
# program for efficiency, but are an integral part of this routine.
|
|
1572 |
# Currently, this extracts all <.*?> patterns, which may not be valid if
|
|
1573 |
# "<" or ">" characters are e.g. inside a <script> element.
|
|
1574 |
sub extract_urls {
    # $HTML    - block of HTML to scan (modified locally only)
    # $baseurl - URL used to resolve relative references; replaced by a
    #            <base href=...> tag if the document contains one
    # $referer - URL of the page these links came from (for reporting)
    # $depth   - traversal depth recorded for each discovered URL
    my($HTML, $baseurl, $referer, $depth)= @_ ;
    my(@tags) ;

    # Remove comments before extracting links, as pointed out by Tim Hunter.
    $HTML=~ s/<!--.*?--.*?>//gs ;

    # We must look for <base> tag before all the work, so we must parse
    # all tags first. :(  Therefore, we save this large array of
    # structures for efficiency, hoping we don't run out of memory.
    my($i)= -1 ;   # to start at array element 0

    foreach ($HTML=~ /(<.*?>)/gs) {
        $tags[++$i]= &parse_tag($_) ;
        # A <base href=...> overrides the caller-supplied base URL.
        # NOTE(review): the href is taken as-is; a *relative* <base href>
        # would not itself be resolved here -- confirm that is intended.
        $baseurl= $tags[$i]{'attrib'}{'href'}
            if ($tags[$i]{'name'} eq 'base')
                and (length($tags[$i]{'attrib'}{'href'})) ;
    }

    # For each tag, call &add_url() for each URL in the tag
    foreach my $tag (@tags) {
        next if $tag->{'name'}=~ m#^/# ;   # skip closing tags like </a>

        # Handle the "regular" tag-attributes, in %html_urls and %non_html_urls.
        # NOTE(review): the trailing 0 argument to &add_url() below appears
        # to flag the target as non-HTML (compare the hash names) -- confirm
        # against &add_url()'s parameter list.

        foreach (@{$html_urls{$tag->{'name'}}}) {
            &add_url(&absolute_url($tag->{'attrib'}{$_}, $baseurl),
                $referer, $depth)
                if length($tag->{'attrib'}{$_}) ;
        }

        foreach (@{$non_html_urls{$tag->{'name'}}}) {
            &add_url(&absolute_url($tag->{'attrib'}{$_}, $baseurl),
                $referer, $depth, 0)
                if length($tag->{'attrib'}{$_}) ;
        }

        # Now handle each tag-attribute that needs special attention

        if ($tag->{'name'} eq 'object') {
            # codebase is resolved against the page base, and is in turn
            # the base for the data/classid/archive attributes below.
            my($codebase)=
                &absolute_url($tag->{'attrib'}{'codebase'}, $baseurl) ;

            &add_url(&absolute_url($tag->{'attrib'}{'data'}, $codebase),
                $referer, $depth)
                if length($tag->{'attrib'}{'data'}) ;

            # There seems to be a contradiction between the HTML 4.0 spec
            # section 13.3.3 and RFC 1808 section 4 step 2b regarding
            # the URL resolution of e.g. classid="java:program.start".
            # For now, we'll stick with the RFC 1808 method.
            &add_url(&absolute_url($tag->{'attrib'}{'classid'}, $codebase),
                $referer, $depth, 0)
                if length($tag->{'attrib'}{'classid'}) ;

            # <object> tag's "archive" attribute is a space-separated list
            foreach (split(/\s+/, $tag->{'attrib'}{'archive'})) {
                &add_url(&absolute_url($_, $codebase), $referer, $depth, 0) ;
            }

        } elsif ($tag->{'name'} eq 'head') {
            # "profile" attribute is a space-separated list
            foreach (split(/\s+/, $tag->{'attrib'}{'profile'})) {
                &add_url(&absolute_url($_, $baseurl), $referer, $depth, 0) ;
            }

        } elsif ($tag->{'name'} eq 'applet') {
            my($codebase)=
                &absolute_url($tag->{'attrib'}{'codebase'}, $baseurl) ;

            &add_url(&absolute_url($tag->{'attrib'}{'code'}, $codebase),
                $referer, $depth, 0)
                if length($tag->{'attrib'}{'code'}) ;

            &add_url(&absolute_url($tag->{'attrib'}{'object'}, $codebase),
                $referer, $depth, 0)
                if length($tag->{'attrib'}{'object'}) ;

            # <applet> tag's "archive" attribute is a comma-separated list
            # (unlike <object>'s space-separated one above)
            foreach (split(/\s*,\s*/, $tag->{'attrib'}{'archive'})) {
                &add_url(&absolute_url($_, $codebase), $referer, $depth, 0) ;
            }

        # Check form script for existence only, but don't follow hyperlinks.
        # Handles the unlikely case when a CGI is referred to by both a
        # <form action=___.cgi> tag and an <a href=___.cgi> tag:  If a CGI
        # is only pointed to by <form action>, then don't follow the link
        # (i.e. set $url->{'dontfollow'} to verify, not load).  If a CGI
        # is called by at least one <a href> tag, then do follow the link.
        } elsif ($tag->{'name'} eq 'form') {
            &add_url(&absolute_url($tag->{'attrib'}{'action'}, $baseurl),
                $referer, $depth, undef, 1, 1)
                if length($tag->{'attrib'}{'action'}) ;

        } # if ($tag->{'name'} eq '...')

    } # foreach $tag (@tags)

} # &extract_urls()
|
|
1674 |
|
|
1675 |
|
|
1676 |
|
|
1677 |
#----- output routines ------------------------------------------------
|
|
1678 |
|
|
1679 |
# Generate final report
|
|
1680 |
# Dispatch to the verbose or the one-line-per-URL report generator,
# depending on the global $verbose_report flag ("-v" option).
sub make_report {
    $verbose_report ? &make_verbose_report : &make_short_report ;
}
|
|
1684 |
|
|
1685 |
|
|
1686 |
|
|
1687 |
# Generate a verbose report of all URLs that have been checked
|
|
1688 |
# Print the full ("-v") report to STDOUT: a run-description header, one
# detailed entry (status, redirect target, depth, referer chain) per URL
# passing the status filters, a server-side-include error section, and a
# footer.  Reads the global %url table plus the option globals; prints
# only -- no meaningful return value.
sub make_verbose_report {
    my($rootlist)= join("\n ", @ARGV) ;   # seed files/URLs for the header
    my($numurls)= scalar keys %url ;

    print <<EOH ;
======================================================================
Report of invalid links on $LOCAL_HOST,
recursively followed starting with the files/URLs:

$rootlist

EOH

    # Echo the URL include/exclude patterns that shaped the traversal.
    if (@INCLUDE_PATTERNS) {
        my($includelist)= join("\n ", @INCLUDE_PATTERNS) ;
        print <<EOH ;
Only including URLs matching at least one of the following patterns:
$includelist

EOH
    }

    if (@EXCLUDE_PATTERNS) {
        my($excludelist)= join("\n ", @EXCLUDE_PATTERNS) ;
        print <<EOH ;
Excluding URLs matching any of the following patterns:
$excludelist

EOH
    }

    print "Total URLs checked: $numurls\n\n" ;

    # Echo the status-code filters and the run modes in effect.
    print( "Only reporting response codes beginning with: ",
        join(", ", @INCLUDE_STATUS), "\n" )
        if @INCLUDE_STATUS ;

    print( "Excluding response codes beginning with: ",
        join(", ", @EXCLUDE_STATUS), "\n" )
        if @EXCLUDE_STATUS ;

    print "Maximum traversal depth: $max_depth\n" if length($max_depth) ;
    print "Only local files were checked; no CGI scripts were invoked.\n"
        if $file_check ;
    print "Local URLs were read from the filesystem where possible.\n"
        unless $full_http_check ;

    print "\n" ;

    # Only report status errors if there are any.
    # Using grep()s is slightly inefficient, but a *lot* cleaner.
    my($has_statuserrs) ;
    foreach my $URL (sort keys %url) {
        # Inside each grep, $_ is the status pattern under test.
        next if @INCLUDE_STATUS &&
            !grep($url{$URL}{'status'}=~ /^$_/, @INCLUDE_STATUS) ;
        next if grep($url{$URL}{'status'}=~ /^$_/, @EXCLUDE_STATUS) ;

        # Comma operator: set the flag, then leave the loop.
        $has_statuserrs=1, last ;
    }

    if ($has_statuserrs) {
        print <<EOH ;
======================================================================

RESULTS FROM ALL URLS WITH SELECTED RESPONSE STATUS CODES
------------------------------------------------------------

EOH

        foreach (sort keys %url) {
            my($u)= $url{$_} ;

            # Same include/exclude filtering as in the scan above.
            next if @INCLUDE_STATUS &&
                !grep($u->{'status'}=~ /^$_/, @INCLUDE_STATUS) ;
            next if grep($u->{'status'}=~ /^$_/, @EXCLUDE_STATUS) ;

            print "$u->{'URL'}\n", '-' x 50, "\n" ;
            print "Status: $u->{'status'}\n" ;
            print "Moved to: $u->{'location'}\n" if $u->{'location'} ;
            print "Depth: $u->{'depth'}\n" ;

            # Walk the referer chain upward, indenting one extra space per
            # level; length($url{$URL}) is true while $URL names a known
            # %url record (the hash-ref value stringifies non-empty).
            for (my($URL)= $u->{'referer'}, my($tab)= ' ' ;
                length($url{$URL}) ;
                $URL= $url{$URL}{'referer'}, $tab.= ' ' )
            {
                print "${tab}referred by $url{$URL}{'URL'}\n" ;
                die "PROGRAM ERROR: apparent infinite referer loop.\n"
                    if length($tab)>200 ;   # simple-minded sanity check
            }

            print "\n\n" ;
        } # URL

    }

    # Only report SSI errors if there are any
    my($has_ssierrs) ;
    foreach (sort keys %url) {
        $has_ssierrs=1, last if @{$url{$_}{'ssierrs'}} ;
    }

    if ($has_ssierrs) {
        print <<EOH ;
======================================================================

PROBLEMS WITH SERVER-SIDE INCLUDE (AKA SHTML) DIRECTIVES
------------------------------------------------------------

EOH
        foreach (sort keys %url) {
            my($u)= $url{$_} ;
            if (@{$u->{'ssierrs'}}) {
                print "$u->{'URL'}\n", '-' x 50, "\n" ;
                printf "Total %s SSI Errors:\n", $#{$u->{'ssierrs'}}+1 ;
                # Each error is a chain of included files; print the chain
                # with indentation increasing one space per include level.
                foreach my $i (0..$#{$u->{'ssierrs'}}) {
                    printf " %s) ", $i+1 ;
                    my($tab)= ' ' x (4+length($i+1)) ;
                    my($tab2) ;   # empty for the first file, so it shares
                                  # the numbered line's indentation
                    foreach my $level (@{$u->{'ssierrs'}[$i]}) {
                        print "${tab2}file: $level->{'path'}\n" ;
                        print "${tab}in tag: $level->{'tag'}\n" ;
                        print "${tab}error: $level->{'errmsg'}\n"
                            if $level->{'errmsg'} ;
                        $tab.= ' ' ;
                        $tab2= $tab ;
                    }
                }
                print "\n\n" ;
            }
        }
    }

    unless ($has_statuserrs or $has_ssierrs) {
        print <<EOH ;
======================================================================

<< NO ERRORS FOUND >>

EOH
    }

    print '=' x 70, "\n" ;

} # &make_verbose_report()
|
|
1837 |
|
|
1838 |
|
|
1839 |
# Generate a one-line-per-URL report
|
|
1840 |
# Print the default one-line-per-URL report: a total count header, then
# "URL<TAB>status" for every URL in %url that passes the status
# include/exclude filters.
sub make_short_report {
    my($total)= scalar keys %url ;

    print "Total $total URLs checked\n" ;

    foreach my $URL (sort keys %url) {
        my($status)= $url{$URL}{'status'} ;

        # Skip URLs filtered out by the status lists; inside each grep,
        # $_ is the status pattern being tested.
        next if @INCLUDE_STATUS
            and !grep($status=~ /^$_/, @INCLUDE_STATUS) ;
        next if grep($status=~ /^$_/, @EXCLUDE_STATUS) ;

        print "$url{$URL}{'URL'}\t$status\n" ;
    }
}
|
|
1852 |
|
|
1853 |
|
|
1854 |
|
|
1855 |
# Print a summary of usage and exit
|
|
1856 |
sub usage {
    # The heredoc interpolates $CL_VERSION and $0 (this script's name);
    # everything else is literal help text.
    print <<EOF ;

Checklinks $CL_VERSION

To recursively check all HTML links on the local site, enter:
$0 <options> [ start-file | start-URL ] ...

Options include:
-v Generate full (verbose) report, including full
referral information and detailed SSI error
reporting.
-I <include-pattern> Only check URLs matching <include-pattern>.
-X <exclude-pattern> Don't check URLs matching <exclude-pattern>.
-i <include-status> Only report URLs whose response code starts with
the pattern <include-status>.
-x <exclude-status> Don't report URLs whose response code starts with
the pattern <exclude-status>. Default is to
exclude "200" responses only (i.e. "-x 200").
-d <max-depth> Traverse links no deeper than <max-depth>.
-f "File mode"-- only read files from the filesystem;
do not go through the HTTP server at all. This
will skip all URLs that point to CGI scripts.
-h "HTTP mode"-- use HTTP to check ALL URLs, even
if they could be read from the filesystem.
Incompatible with "-f" option.
-c <config-file> Read appropriate configuration parameters from
<config-file>, typically srm.conf. Use '-' to
read from STDIN. If a directory is named, use
"srm.conf" in that directory.
-q Print current configuration parameters.
-? Print this help message.
-- End command-line option processing.

Don't stack options like "-vf"; use "-v -f" instead.

For -I, -X, -i, and -x:
Values are interpreted as Perl 5 regular expressions.
Use multiple options to build a list (e.g. "-I include1 -I include2").
Use a value of '' to clear a list (e.g. -x '' means "report all responses",
"-x '' -x 401" means "report all but 401 responses").
As a special case, an empty -I or -i list implies no include-restrictions.
If an item is in both the include and exclude list, it is excluded.
Note that -I and -X restrict which URLs are traversed into, so they may
prune large areas of your Web structure.

EOF
    exit ;   # usage is terminal: this sub never returns
}
|
|
1905 |
|
|
1906 |
|
|
1907 |
#----- debugging routines below ---------------------------------------
|
|
1908 |
|
|
1909 |
# Print current configuration settings
|
|
1910 |
# Print the current run-time options and installation parameters in a
# human-readable form (the "-q" option).  Reads only the global
# configuration variables; writes to STDOUT.
sub print_config {
    print "\n----- OPTIONS SPECIFIC TO THIS EXECUTION -----------------------------\n\n" ;

    print "Include only URLs containing one of the following patterns:\n",
          ( map { " $_\n" } @INCLUDE_PATTERNS ),
          "\n"
        if @INCLUDE_PATTERNS ;

    print "Exclude URLs containing one of the following patterns:\n",
          (map { " $_\n" } @EXCLUDE_PATTERNS ),
          "\n"
        if @EXCLUDE_PATTERNS ;

    print "Only report response codes beginning with: ",
          join(", ", @INCLUDE_STATUS), "\n"
        if @INCLUDE_STATUS ;

    print "Don't report response codes beginning with: ",
          join(", ", @EXCLUDE_STATUS), "\n"
        if @EXCLUDE_STATUS ;

    print "\nMaximum search depth: $max_depth\n" if length($max_depth) ;

    print <<EOS ;

----- INSTALLATION PARAMETERS ----------------------------------------

Local Host: $LOCAL_HOST
Document Root: $DOCUMENT_ROOT
User Web Directory: $USER_DIR
Default Filename(s): @DIRECTORY_INDEX

File extensions indicating a CGI program: @CGI_EXTENSIONS
File extensions indicating server-parsed HTML: @SHTML_EXTENSIONS

EOS

    # BUG FIX: previously the trailing "\n" sat inside the sort list
    # ("sort keys %ALIAS, "\n"" -- the comma binds after the named-unary
    # keys), so "\n" was sorted and fed through the map's sprintf,
    # printing a bogus '... ==> ' entry instead of a blank separator
    # line.  The parentheses keep "\n" a direct argument of print,
    # matching the pattern-list prints above.  Same fix in all four
    # alias sections below.
    print "Directory Aliases:\n",
          ( map { sprintf(" %15s ==> %s\n", $_, $ALIAS{$_}) }
            sort keys %ALIAS ),
          "\n"
        if keys %ALIAS ;

    print "Directory Regular Expression Aliases:\n",
          ( map { sprintf(" %15s ==> %s\n", $_, $ALIAS_MATCH{$_}) }
            sort keys %ALIAS_MATCH ),
          "\n"
        if keys %ALIAS_MATCH ;

    print "Script Directory Aliases:\n",
          ( map { sprintf(" %15s ==> %s\n", $_, $SCRIPT_ALIAS{$_}) }
            sort keys %SCRIPT_ALIAS ),
          "\n"
        if keys %SCRIPT_ALIAS ;

    print "Script Directory Regular Expression Aliases:\n",
          ( map { sprintf(" %15s ==> %s\n", $_, $SCRIPT_ALIAS_MATCH{$_}) }
            sort keys %SCRIPT_ALIAS_MATCH ),
          "\n"
        if keys %SCRIPT_ALIAS_MATCH ;

    if ($ENV{'http_proxy'}) {
        print "HTTP Proxy: $ENV{'http_proxy'}\n" ;
        print "Except to hosts ending in: $ENV{'no_proxy'}\n"
            if $ENV{'no_proxy'} ;
        print "\n" ;
    } else {
        print "Not using any HTTP Proxy.\n" ;
    }

    print "Maximum HTTP redirections allowed for a URL: $MAX_REDIRECTS\n",
          "Maximum number of tries to get a single URL: $MAX_ATTEMPTS\n\n";

    print '-' x 70, "\n\n" ;

}
|
|
1986 |
|
|
1987 |
|
|
1988 |
|
|
1989 |
# print configuration, in a style more for debugging (mostly from srm.conf)
|
|
1990 |
# Dump the main configuration globals (mostly derived from srm.conf)
# in a raw "NAME= [value]" debugging format, to STDOUT.
sub print_debug_config {
    print <<EOF ;
DOCUMENT_ROOT= [$DOCUMENT_ROOT]
USER_DIR= [$USER_DIR]
DIRECTORY_INDEX= [@DIRECTORY_INDEX]

CGI_EXTENSIONS= [@CGI_EXTENSIONS]
SHTML_EXTENSIONS= [@SHTML_EXTENSIONS]

EOF

    # One line per alias entry, each table in sorted key order.
    print "ALIAS{$_}= [$ALIAS{$_}]\n"
        foreach sort keys %ALIAS ;
    print "ALIAS_MATCH{$_}= [$ALIAS_MATCH{$_}]\n"
        foreach sort keys %ALIAS_MATCH ;
    print "SCRIPT_ALIAS{$_}= [$SCRIPT_ALIAS{$_}]\n"
        foreach sort keys %SCRIPT_ALIAS ;
    print "SCRIPT_ALIAS_MATCH{$_}= [$SCRIPT_ALIAS_MATCH{$_}]\n"
        foreach sort keys %SCRIPT_ALIAS_MATCH ;

}
|
|
2011 |
|
|
2012 |
|
|
2013 |
#----------------------------------------------------------------------
|
|
2014 |
#
|
|
2015 |
# SPARSE DOCUMENTATION ON PROGRAM INTERNALS:
|
|
2016 |
#
|
|
2017 |
#----------------------------------------------------------------------
|
|
2018 |
#
|
|
2019 |
# URLs are read directly from the local filesystem if possible, or
|
|
2020 |
# downloaded or checked with HTTP GET and HEAD methods if not. CGI
|
|
2021 |
# scripts are called through the HTTP server, even if local (since the
|
|
2022 |
# resource their URLs refer to are not files).
|
|
2023 |
#
|
|
2024 |
#
|
|
2025 |
# Global variables:
|
|
2026 |
#
|
|
2027 |
# %url holds all data regarding all URLs, using the text URL as the key:
|
|
2028 |
#
|
|
2029 |
# %url{$URL}{qw( URL filename referer depth
|
|
2030 |
# ishtml iscgi islocal dontfollow
|
|
2031 |
# status location numredirects numtries lasttried
|
|
2032 |
# ssierrs
|
|
2033 |
# hasbeenloaded)} <== only for testing
|
|
2034 |
# URL is same as hash key
|
|
2035 |
# ishtml means is type text/html; links MAY be extracted. (Used in
|
|
2036 |
# main loop.)
|
|
2037 |
# iscgi means is a CGI script; MUST NOT be read directly from
|
|
2038 |
# filesystem. (Used in verify_url() and load_url().)
|
|
2039 |
# islocal means served by local server:
|
|
2040 |
# a) MAY be read directly from filesystem
|
|
2041 |
# b) links MAY be extracted
|
|
2042 |
# dontfollow means links MUST NOT be extracted.
|
|
2043 |
# status begins with a number, but may have text afterward.
|
|
2044 |
#
|
|
2045 |
# islocal= (URL=~ m#^http://$LOCAL_HOST/#i) , i.e. redundant.
|
|
2046 |
# ishtml and iscgi are guesses, may be 0, 1, or '' for unset.
|
|
2047 |
# iscgi only matters for local URLs, but may safely be set for
|
|
2048 |
# remote URLs.
|
|
2049 |
# dontfollow is a hack to handle an obscure CGI-related situation.
|
|
2050 |
#
|
|
2051 |
# In addition to the standard HTTP status codes, 'status' may take one
|
|
2052 |
# of the following values:
|
|
2053 |
# 450 Can't read file: $!
|
|
2054 |
# 451 SSI Error(s) (__ total)
|
|
2055 |
# 600 Can't create socket: $@
|
|
2056 |
# 601 Connection timed out
|
|
2057 |
# 602 Incomplete response (%s of %s bytes)
|
|
2058 |
# 603 Incomplete chunked response
|
|
2059 |
#
|
|
2060 |
# Handling of 602 and 603 is somewhat hackish-- overwrites real status
|
|
2061 |
# line with artificial one.
|
|
2062 |
#
|
|
2063 |
# $url->{'ssierrs'} is a list of errors relating to SSI includes.
|
|
2064 |
# Each error has a list of filenames representing the include
|
|
2065 |
# chain. Each element in the chain stores the absolute
|
|
2066 |
# pathname, the <!--#include--> tag which called it, and
|
|
2067 |
# possibly an error message (usually only on the last file in
|
|
2068 |
# the include-chain). Thus, the entire structure is
|
|
2069 |
#
|
|
2070 |
# $url->{'ssierrs'}[$errid][$levelsdeep]{qw( path tag errmsg )}
|
|
2071 |
#
|
|
2072 |
# This is only a memory hog with many errors of deeply nested files.
|
|
2073 |
#
|
|
2074 |
#
|
|
2075 |
# @urlstoget: canonicalized list of URLs left to get, in order
|
|
2076 |
# of appearance. List elements are pointers to %url elements.
|
|
2077 |
# This array may grow from &extract_urls(), or be rearranged when
|
|
2078 |
# a URL needs to be retried.
|
|
2079 |
#
|
|
2080 |
# Note that $url is normally a pointer, and $URL is normally a string.
|
|
2081 |
#
|
|
2082 |
# Files/paths are canonicalized by removing unneeded "./" and "../"
|
|
2083 |
# segments.
|
|
2084 |
# Directories are canonicalized to always end in "/", including in URLs;
|
|
2085 |
# note that some system variables and local directory variables aren't so.
|
|
2086 |
# URLs that point to "/" end in "/", e.g. "http://www.foo.com/".
|
|
2087 |
#
|
|
2088 |
#
|
|
2089 |
# This program has a lot of unused functionality for possible extension,
|
|
2090 |
# so don't get confused by routines that do more than they are used for.
|
|
2091 |
#
|
|
2092 |
#----------------------------------------------------------------------
|
|
2093 |
#
|
|
2094 |
# To do:
|
|
2095 |
#
|
|
2096 |
# * If you want a different output format (like in HTML), it should
|
|
2097 |
# be easy enough to add.
|
|
2098 |
# * This could be extended to check remote sites without much trouble.
|
|
2099 |
# * Should probably support directories that don't have index.html.
|
|
2100 |
# * Check that pointed-to fragments exist in target files
|
|
2101 |
# * keep track of servers and don't hit them too often
|
|
2102 |
# * support robots.txt
|
|
2103 |
# * add option to read list of URLs from file
|
|
2104 |
# * support HTTP Basic Authentication somehow. Note that currently, this
|
|
2105 |
# reads local files even when they should be protected by .htaccess
|
|
2106 |
# (unless using -h option).
|
|
2107 |
# * Read list of local host names from gethostbyname() ?
|
|
2108 |
# * check read permissions on files?
|
|
2109 |
# * In the event multiple URLs redirect to the same URL, this could be a
|
|
2110 |
# little more efficient if it saved the redirected loads/downloads.
|
|
2111 |
#
|
|
2112 |
#----------------------------------------------------------------------
|