|
1 #!/usr/local/bin/perl |
|
2 # |
|
3 # Checklinks 1.0.1 |
|
4 # |
|
5 # Starting at one or more seed HTML files, recursively check the |
|
6 # validity of all links on the site. Major features: |
|
7 # |
|
8 # * Local URLs are read from the filesystem when possible (much |
|
9 # faster than going through HTTP server). |
|
10 # * Basic server-side includes (aka SSI or SHTML) are checked. |
|
11 # * Latest standards are supported-- HTML 4.0, HTTP 1.1, URIs |
|
12 # according to RFC 2396. |
|
13 # * Links are traversed breadth-first. |
|
14 # |
|
15 # To list command-line options, run "cl -?" or see &usage() below. |
|
16 # |
|
17 # TO CONFIGURE: |
|
18 # |
|
19 # 1) Set $LOCAL_HOST and $DOCUMENT_ROOT, just below. If you don't, the |
|
20 # program will try to guess them in set_needed_globals(), but it's more |
|
21 # reliable if you enter them here. |
|
22 # |
|
23 # 2) If needed, set any further server configuration below-- things like |
|
24 # path aliases and so forth. If you have the srm.conf file, you can |
|
25 # feed it into this script with "-c srm.conf"; otherwise, the default |
|
26 # settings will probably work OK. |
|
27 # |
|
28 # You can set a few parameters with the undocumented "-D <name=value>" |
|
29 # command-line option, e.g. "-D LOCAL_HOST=www.myhost.com". |
|
30 # |
|
31 # Further comments, including an overview of script internals, are at |
|
32 # the end of this file. |
|
33 # |
|
34 # Copyright (C) 1998, 2000 by James Marshall, james@jmarshall.com |
|
35 # see http://www.jmarshall.com/tools/cl/ for more info |
|
36 # |
|
37 # |
|
38 # CHANGES IN 1.0.1: |
|
39 # |
|
40 # This is just a bug fix release. Fixes include: |
|
41 # . Aliases are handled correctly now. Sorry 'bout that. |
|
42 # . A redirect + relative URL no longer results in infinitely |
|
43 # recursing URLs. |
|
44 # . More HTML tags are searched for links. |
|
45 # . Non-HTML files are no longer searched for links. |
|
46 # . There were other minor bug fixes. |
|
47 # |
|
48 #---------------------------------------------------------------------- |
|
49 |
|
#use strict ;

# File-scoped lexicals, grouped by role.

# Server configuration (srm.conf equivalents and related settings):
my( $LOCAL_HOST, $DOCUMENT_ROOT, $USER_DIR, @DIRECTORY_INDEX ) ;
my( %ALIAS, %ALIAS_MATCH, %SCRIPT_ALIAS, %SCRIPT_ALIAS_MATCH, %UN_ALIAS ) ;
my( @SHTML_EXTENSIONS, @CGI_EXTENSIONS ) ;

# User-selectable filters on which URLs to check and report:
my( @INCLUDE_PATTERNS, @EXCLUDE_PATTERNS, @INCLUDE_STATUS, @EXCLUDE_STATUS ) ;

# Run-time options and tuning knobs:
my( $verbose_report, $max_depth, $file_check, $full_http_check ) ;
my( $MAX_REDIRECTS, $MAX_ATTEMPTS, $HTML_BY_NAME, $SUPPORT_NCSA_BUG ) ;
my( @NO_PROXY, $DOC_ROOT_DEV, $DOC_ROOT_INODE, $DOC_ROOT_EXISTS, $CWD ) ;

# Static lookup tables (filled in by &adjust_all_globals):
my( %html_urls, %non_html_urls, %e_to_ch ) ;

# Caches used when resolving ~user home directories:
my( %home_dir, %dir_to_user, %inode_to_user ) ;

# The work queue: %url maps URL => record, @urlstoget is the BFS queue:
my( %url, @urlstoget ) ;

my( $debug, $CL_VERSION ) ;
|
67 |
|
68 |
|
#----- User Configuration ---------------------------------------------

# This should be 'localhost', or a hostname of the Web server.  URLs at
# this host are assumed to be local; URLs at any other host are not
# traversed into.  If this names a remote host, the program will not work.
# Note that 'localhost' doesn't necessarily point to your local Web server.

# $LOCAL_HOST= 'localhost' ;
# $LOCAL_HOST= 'www.example.com' ;
$LOCAL_HOST='isabelle.in.tum.de';

# This is your root Web directory, i.e. the directory the Web server
# serves when the URL "http://$LOCAL_HOST" is requested.  It's the
# DocumentRoot setting in srm.conf (and is read by the -c option).
# If you don't know your server's document root, but you don't need it
# because you're only checking URLs whose path starts with ~, put a
# non-existent path here rather than leave it blank (a hack).

# $DOCUMENT_ROOT= '/home/www/htdocs' ;
$DOCUMENT_ROOT='/home/proj/isabelle';

#----- variables equivalent to srm.conf entries

# These globals mirror the equivalent entries in srm.conf, etc.
# See the -c <config-file> command-line option to read values directly
# from srm.conf instead.

# $USER_DIR= 'public_html' ;
$USER_DIR='.html-data';
@DIRECTORY_INDEX= qw( index.html index.cgi index.shtml ) ;

# Used in &url_to_filename(), and possibly elsewhere.
# Note that ALIAS_MATCH and SCRIPT_ALIAS_MATCH use Perl (not standard)
# regexps.  If the order of multiple e.g. "Alias" directives is
# important, this may not work.
%ALIAS= () ;
%ALIAS_MATCH= () ;
%SCRIPT_ALIAS= () ;
%SCRIPT_ALIAS_MATCH= () ;

# File extensions to interpret as CGI scripts or server-parsed HTML
# files.  These are not specific settings in srm.conf, but combinations
# of AddHandler directives and possibly AddType directives.
@CGI_EXTENSIONS= qw( .cgi ) ;
@SHTML_EXTENSIONS= qw( .shtml ) ;

#----- end of variables equivalent to srm.conf entries

# Patterns a URL must match (at least one) to be included at all.  As a
# special case, an empty list includes all URLs, i.e. does not restrict
# URLs by name (except perhaps by @EXCLUDE_PATTERNS).
# This can be added to or cleared with the -I command-line option.
@INCLUDE_PATTERNS= () ;

# Patterns that cause matching URLs to be excluded, e.g. '\?' means
# ignore all URLs with a query.
# This can be added to or cleared with the -X command-line option.
# @EXCLUDE_PATTERNS= qw( \? ) ;

# Only report URLs whose status codes start with one of these patterns.
# As a special case, an empty list reports all URLs, i.e. does not
# restrict URLs by status code (except perhaps by @EXCLUDE_STATUS).
# This can be added to or cleared with the -i command-line option.
@INCLUDE_STATUS= () ;

# Don't report URLs whose status codes start with these patterns.
# Default is qw( 200 ).
# This can be added to or cleared with the -x command-line option.
@EXCLUDE_STATUS= qw( 200 ) ;

# For 302 or 303 HTTP redirection, redirect no more than this many times.
$MAX_REDIRECTS= 5 ;

# If a connection times out, etc., attempt no more than this many times.
$MAX_ATTEMPTS= 5 ;

# When set, decide whether a file is HTML by its extension.  When unset,
# fall back to the old behavior of traversing all HTML-like links in any
# text file (-T test) that isn't a CGI script -- which also picks up
# source code, .txt files, etc.
$HTML_BY_NAME= 1 ;

# Some old NCSA servers, including 1.5.2, don't report the HTTP version
# correctly in the status line; they return e.g. "HTTP 200 OK".  To
# allow this, leave this variable set.
$SUPPORT_NCSA_BUG= 1 ;


#----- DO NOT CHANGE ANYTHING BELOW THIS LINE, unless you want to... ---

#----- Further Global Variable Initialization --------------------------

$CL_VERSION= '1.0.1' ;

# Guard against undefined environment variables here, so that running
# under -w (or "use warnings") doesn't emit "uninitialized value" noise
# and %ENV never gets an undef stored into it.
$ENV{'http_proxy'}= $ENV{'HTTP_PROXY'}
    if !$ENV{'http_proxy'} && defined($ENV{'HTTP_PROXY'}) ;
@NO_PROXY= split(/[\s,]+/, $ENV{'no_proxy'} || $ENV{'NO_PROXY'} || '') ;


# If output's not going directly to a terminal, this ensures autoflushing.
$|= 1 ;

#----- End of Configuration --------------------------------------------
|
173 |
|
use strict 'vars' ;
use IO::Socket ;

&usage unless @ARGV ;

# Process command-line options.
&getopts ;

# Make any final needed adjustments to globals, after the hard-coded
# values above and any options have been processed.
&adjust_all_globals ;

# (Could default to "." when no seed filenames are given; decided
# against that, 3-6-98.)

# Seed the queue.  &add_url() records a URL in %url and @urlstoget (once
# only, with any other initialization).  A filename is only added if it
# can be reached through a URL on the local server.
foreach my $seed (@ARGV) {
    if ($seed=~ m#^http://#i) {
        &add_url($seed, '-', 0) ;
    } else {
        my($seed_url)= &filename_to_url($seed, $CWD) ;
        die "ERROR: $seed is not accessible through the Web server.\n"
            unless defined($seed_url) ;
        &add_url($seed_url, '-', 0) ;
    }
}


# Check the URLs, breadth-first; @urlstoget may grow and rearrange.
while (@urlstoget) {
    my($url)= shift(@urlstoget) ;

    # A URL is only followed when it's local HTML, within the depth
    # limit, and not marked dontfollow.  Anything else just gets
    # verified -- which may discover that it IS HTML after all, so the
    # followability test is re-evaluated below rather than cached.
    if ( !$url->{'ishtml'} or !$url->{'islocal'} or $url->{'dontfollow'}
         or (length($max_depth) and $url->{'depth'} > $max_depth ) ) {
        &verify_url($url) ;   # may set ishtml=true
    }
    if ( $url->{'ishtml'} and $url->{'islocal'} and !$url->{'dontfollow'}
         and (!length($max_depth) or $url->{'depth'} <= $max_depth ) ) {
        my($HTML)= &load_url($url) ;   # may set ishtml=false
        # Relative URLs in redirected pages resolve against the final
        # location, not the original URL.
        my($base_url)= $url->{'location'} || $url->{'URL'} ;
        &extract_urls($HTML, $base_url, $url->{'URL'}, $url->{'depth'}+1)
            if $url->{'ishtml'} ;   # big, calls &add_url()
    }

    # Responses that may be corrected by another attempt -- 408, 503,
    # 504, and the homegrown codes 600-603 -- go back into the queue,
    # up to $MAX_ATTEMPTS tries.
    if ($url->{'status'}=~ /^(408|503|504|600|601|602|603)\b/ ) {
        push(@urlstoget, $url) if ( $url->{'numtries'} < $MAX_ATTEMPTS ) ;
    }

}

&make_report() ;

exit ;
|
235 |
|
236 |
|
237 |
|
238 #----- Process command-line options ----------------------------------- |
|
239 |
|
# Process any command-line options, stopping at the first non-option
# argument (or at "--").  Remaining arguments are left in @ARGV.
sub getopts {
    my($flag, $arg) ;
    while ($ARGV[0]=~ /^-/) {
        my($raw)= shift(@ARGV) ;
        ($flag, $arg)= $raw=~ /^-(.)(.*)/ ;

        # -v: turn on verbose reporting ("-v-" turns it off)
        if ($flag eq 'v') {
            $verbose_report= ($arg ne '-') ;

        # -I: add a pattern URLs must match to be included ('' clears list)
        } elsif ($flag eq 'I') {
            $arg= shift(@ARGV) unless length($arg) ;
            if (length($arg)) { push(@INCLUDE_PATTERNS, $arg) }
            else              { @INCLUDE_PATTERNS= () }

        # -X: add a pattern of URLs to exclude ('' clears list)
        } elsif ($flag eq 'X') {
            $arg= shift(@ARGV) unless length($arg) ;
            if (length($arg)) { push(@EXCLUDE_PATTERNS, $arg) }
            else              { @EXCLUDE_PATTERNS= () }

        # -i: add a status-code prefix to report ('' clears list)
        } elsif ($flag eq 'i') {
            $arg= shift(@ARGV) unless length($arg) ;
            if (length($arg)) { push(@INCLUDE_STATUS, $arg) }
            else              { @INCLUDE_STATUS= () }

        # -x: add a status-code prefix to suppress ('' clears list)
        } elsif ($flag eq 'x') {
            $arg= shift(@ARGV) unless length($arg) ;
            if (length($arg)) { push(@EXCLUDE_STATUS, $arg) }
            else              { @EXCLUDE_STATUS= () }

        # -d: maximum traversal depth
        } elsif ($flag eq 'd') {
            $arg= shift(@ARGV) unless length($arg) ;
            $max_depth= $arg ;

        # -f: "file check"-- only read local files, never use HTTP
        } elsif ($flag eq 'f') {
            $file_check= ($arg ne '-') ;

        # -h: use HTTP for all URLs, even local files
        } elsif ($flag eq 'h') {
            $full_http_check= ($arg ne '-') ;

        # -c: read configuration parameters from an srm.conf-like file
        } elsif ($flag eq 'c') {
            $arg= shift(@ARGV) unless length($arg) ;
            &read_srm_conf($arg) ;

        # -q: print current configuration parameters and quit
        } elsif ($flag eq 'q') {
            &print_config ;
            exit ;   # jsm-- should we exit?

        # -D <name=value>: define selected parameters via the command
        # line; a bare -D just turns on debugging.
        } elsif ($flag eq 'D') {
            $arg= shift(@ARGV) unless length($arg) ;
            if ($arg=~ /^-/) {
                $debug= 1 ;
                unshift(@ARGV, $arg) ;   # it was really the next option
                next ;
            }
            my($name, $value)= split(/=/, $arg, 2) ;
            $value= 1 unless length($value) ;
            if ($name=~ /^(LOCAL_HOST|DOCUMENT_ROOT|USER_DIR|DEBUG|debug)$/) {
                # A string eval is required here: the targets are
                # file-scoped my() variables, which symbolic references
                # ($$name) cannot reach.
                eval "\$$name= \$value" ;
            }

        } elsif ($flag eq '?') {
            &usage ;

        # "--" ends command-line option processing
        } elsif ($flag eq '-') {
            return ;

        } else {
            print STDERR
                "Illegal option-- '$flag'. Enter \"$0 -?\" for help.\n" ;
            exit ;
        }

    }

    # -f and -h are contradictory.
    if ($file_check and $full_http_check) {
        print STDERR "You cannot use both the -f and the -h options.\n" ;
        exit ;
    }

}
|
330 |
|
331 |
|
# Read appropriate values from the given file, typically srm.conf.  If a
# directory is named, default to "<dir>/srm.conf".
# Clears the previous values of every setting it can read, then fills
# them from the file's directives.
# A filename of "-" reads STDIN (works in three-arg open too).
sub read_srm_conf {
    my($fname)= @_ ;

    # Default to srm.conf if only a directory is named.
    if (-d $fname) {
        $fname=~ s#/$## ;
        $fname.= "/srm.conf" ;
    }

    # Clear old values.
    $DOCUMENT_ROOT= $USER_DIR= '' ;
    @DIRECTORY_INDEX= @CGI_EXTENSIONS= @SHTML_EXTENSIONS= () ;
    %ALIAS= %ALIAS_MATCH= %SCRIPT_ALIAS= %SCRIPT_ALIAS_MATCH= () ;

    # Three-arg open with a lexical handle: the old two-arg
    # open(SRM, "<$fname") would let a crafted filename change the mode.
    open(my $srm_fh, '<', $fname) || die "Can't open $fname: $!" ;
    while (<$srm_fh>) {
        s/#.*// ;            # strip comments
        next unless /\S/ ;   # skip blank lines
        my($name, @param)= /(\S+)/g ;   # directive name, then its args

        if ($name eq 'DocumentRoot') {
            $DOCUMENT_ROOT= $param[0] ;

        } elsif ($name eq 'UserDir') {
            $USER_DIR= $param[0] ;

        } elsif ($name eq 'DirectoryIndex') {
            @DIRECTORY_INDEX= @param ;

        } elsif ($name eq 'Alias') {
            $ALIAS{$param[0]}= $param[1] ;

        } elsif ($name eq 'AliasMatch') {
            $ALIAS_MATCH{$param[0]}= $param[1] ;

        } elsif ($name eq 'ScriptAlias') {
            $SCRIPT_ALIAS{$param[0]}= $param[1] ;

        } elsif ($name eq 'ScriptAliasMatch') {
            $SCRIPT_ALIAS_MATCH{$param[0]}= $param[1] ;

        } elsif ($name eq 'AddHandler') {
            if ($param[0] eq 'cgi-script') {
                push(@CGI_EXTENSIONS, $param[1]) ;
            } elsif ($param[0] eq 'server-parsed') {
                push(@SHTML_EXTENSIONS, $param[1]) ;
            }
        }

    }
    close($srm_fh) ;
}
|
388 |
|
389 |
|
# Make any final settings to global variables, after the hard-coded
# values and command-line options have been processed.
# Most non-user-configurable globals are also set here.
sub adjust_all_globals {

    # Standardize $USER_DIR to never have a trailing slash.
    $USER_DIR=~ s#/$## ;

    # If no $LOCAL_HOST was set, take it from the first URL on the
    # command line, falling back to the string 'localhost'.
    if (!length($LOCAL_HOST)) {
        $LOCAL_HOST= (&parse_url($ARGV[0]))[1] || 'localhost' ;
        print STDERR "LOCAL_HOST set to \"\L$LOCAL_HOST\E\"\n" ;
    }
    $LOCAL_HOST= lc($LOCAL_HOST) ;

    # If no $DOCUMENT_ROOT, guess it from $HOME (or the current user's
    # home directory) plus $USER_DIR.
    if (!length($DOCUMENT_ROOT)) {
        my($home) ;
        if (!($home= $ENV{'HOME'})) {
            my($uname)= getpwuid($<) || $ENV{'USER'} || `whoami` || `id -un` ;
            chomp($uname) ;
            &read_home_dirs unless %home_dir ;   # only read when needed
            $home= $home_dir{$uname} ;
        }
        $DOCUMENT_ROOT= "$home/$USER_DIR" ;

        die "Could not determine DOCUMENT_ROOT; edit the $0 script to set it.\n"
            unless (-d $DOCUMENT_ROOT) ;

        print STDERR "DOCUMENT_ROOT set to \"$DOCUMENT_ROOT\"\n" ;
    }
    $DOCUMENT_ROOT=~ s#/$## ;

    # Lets &filename_to_url() unalias as best as possible.  Note that
    # use of &filename_to_url() can be avoided by the user; see the
    # note in that routine.
    %UN_ALIAS= (reverse (%ALIAS, %SCRIPT_ALIAS) ) ;

    # Saved to compare equivalency against later, in &filename_to_url().
    ($DOC_ROOT_DEV, $DOC_ROOT_INODE)= stat("$DOCUMENT_ROOT/.") ;

    $DOC_ROOT_EXISTS= -e _ ;

    # Current working directory: shell variable if set, else `pwd`.
    $CWD= $ENV{'PWD'} || `pwd` || die "couldn't run pwd: $!" ;
    chomp($CWD) ;


    # Tables used by &extract_urls(): tag => [URL-type attributes].
    # This covers the URL-type attributes defined in HTML 4.0, plus
    # assorted nonstandard and legacy ones; only a few are commonly
    # used, as of early 1998.  %html_urls holds those that could link
    # to HTML resources; %non_html_urls holds those that could not.

    %html_urls= (
        'a'          => [ 'href' ],
        'area'       => [ 'href' ],
        'frame'      => [ 'src', 'longdesc' ],
        'link'       => [ 'href', 'urn' ],
        'img'        => [ 'longdesc', 'usemap' ],
        'q'          => [ 'cite' ],
        'blockquote' => [ 'cite' ],
        'ins'        => [ 'cite' ],
        'del'        => [ 'cite' ],
        'object'     => [ 'usemap' ],
        'input'      => [ 'usemap' ],
        'iframe'     => [ 'src', 'longdesc' ],
        'ilayer'     => [ 'src' ],
        'layer'      => [ 'src' ],
        'fig'        => [ 'imagemap' ],
        'overlay'    => [ 'imagemap' ],
        'meta'       => [ 'url' ],
        'note'       => [ 'src' ],
    ) ;

    %non_html_urls= (
        'body'    => [ 'background' ],
        'img'     => [ 'src', 'lowsrc', 'dynsrc' ],
        'input'   => [ 'src' ],
        'script'  => [ 'src', 'for' ],

        'fig'     => [ 'src' ],
        'overlay' => [ 'src' ],
        'select'  => [ 'src' ],
        'ul'      => [ 'src' ],
        'h1'      => [ 'src' ],
        'h2'      => [ 'src' ],
        'h3'      => [ 'src' ],
        'h4'      => [ 'src' ],
        'h5'      => [ 'src' ],
        'h6'      => [ 'src' ],
        'hr'      => [ 'src' ],
        'table'   => [ 'src' ],
        'td'      => [ 'src' ],
        'th'      => [ 'src' ],
        'tr'      => [ 'src' ],

        'bgsound' => [ 'src' ],
        'embed'   => [ 'src' ],
    ) ;

    # For reference, URL attributes that would need special handling
    # (not implemented): <base href>, <object codebase/data/archive/
    # classid>, <head profile>, <applet codebase/code/object/archive>,
    # and <form action/script>.


    # Entity-reference-to-character translation used in &HTMLunescape().
    # This simplified version only supports &quot; &lt; &gt; &amp;, which
    # should be enough for URL-type attribute values.  See
    # http://www.w3.org/TR/REC-html40/sgml/entities.html for the full
    # entity list.
    %e_to_ch= ( 'quot' => '"',
                'lt'   => '<',
                'gt'   => '>',
                'amp'  => '&' ) ;

}
|
517 |
|
518 |
|
519 #---------------------------------------------------------------------- |
|
520 |
|
# Add the URL to our data structures; specifically, to %url and the
# @urlstoget queue.
# Returns a reference to the new record in %url, or undef if the URL was
# already added, filtered out by the include/exclude patterns, or is not
# an HTTP URL.
# Currently, this always receives the URL with the host name lowercase,
# either from &absolute_url() or from using $LOCAL_HOST.
sub add_url {
    my($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow)= @_ ;

    # User-specified restrictions: the URL must match @INCLUDE_PATTERNS
    # (only enforced when that list is non-empty) and must not match
    # @EXCLUDE_PATTERNS.
    return undef if @INCLUDE_PATTERNS &&
                    !grep( $URL=~ /$_/, @INCLUDE_PATTERNS ) ;
    return undef if grep( $URL=~ /$_/, @EXCLUDE_PATTERNS ) ;

    # Canonicalize URL, so we don't get a page multiple times.
    $URL= &canonicalize($URL) ;

    # Obscure case: a <form action=___.cgi>-extracted URL (dontfollow
    # set) may be overwritten by <a href=___.cgi> extraction, which may
    # clear dontfollow (don't fret over this).
    $url{$URL}{'dontfollow'}&&= $dontfollow if $url{$URL} ;

    # Don't add the record a second time!  Or will infinitely traverse.
    return undef if $url{$URL} ; # or add to @referers, for 301 correction...?

    # Only HTTP URLs are currently supported.
    return undef unless $URL=~ /^http:/i ;

    # Any self-referral here indicates a bug in the program.  It's happened.
    die "PROGRAM ERROR: $URL shows its first referer as itself.\n"
        if $referer eq $URL ;

    my(%u) ;
    @u{qw(URL referer depth ishtml iscgi dontfollow)}=
        ($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow) ;
    # The +0 forces a defined 0/1 flag (i.e. length > 0).
    $u{'islocal'}= ($URL=~ m#^http://\Q$LOCAL_HOST\E/#io) + 0 ;
    if ($u{'islocal'}) {
        @u{'filename', 'location'}= &url_to_filename($URL) ;
        $u{'iscgi'}= &is_cgi($u{'filename'}, $URL) if $u{'iscgi'} eq '' ;

        # 2-27-00 JSM: Detect ishtml by filename, not -T test (unless
        # $HTML_BY_NAME is unset, which restores the old behavior).
        if ( $u{'ishtml'} eq '' ) {
            $u{'ishtml'}= $HTML_BY_NAME
                ? ( !$u{'iscgi'} && -e $u{'filename'} &&
                    $u{'filename'}=~ /\.html?$/i ) + 0
                : (!$u{'iscgi'} && -e $u{'filename'} && -T _) + 0 ;
        }

    }

    # If we're only doing a file check, don't add URLs that require HTTP.
    return undef if ($file_check and (!$u{'islocal'} or $u{'iscgi'}) ) ;

    push(@urlstoget, \%u) ;
    $url{$URL}= \%u ;

    # Return the record explicitly, rather than relying on the value of
    # the hash assignment above (clearer and less fragile).
    return \%u ;
}
|
582 |
|
583 |
|
# Guess whether a file is a CGI script.  Returns true if the (regular)
# file is executable, has one of @CGI_EXTENSIONS, or if the URL is in a
# ScriptAlias'ed directory.
# $fname must be an absolute path; $URL is optional (saves time when
# already known).
# Note that URLs like "/path/script.cgi?a=b" are handled correctly: the
# previously extracted filename is tested for CGI-ness, while the URL is
# checked for ScriptAlias matching (which is unaffected by final query
# strings or PATH_INFO).
sub is_cgi {
    my($fname, $URL)= @_ ;

    # An executable non-directory counts as CGI.  (Should we really do this?)
    return 1 if (-x $fname && ! -d _ ) ;

    # Recognized CGI file extension?
    return 1 if grep { $fname=~ /\Q$_\E$/i } @CGI_EXTENSIONS ;

    $URL= &filename_to_url($fname) unless length($URL) ;   # currently unused
    my($URLpath)= $URL=~ m#^http://[^/]*(.*)#i ;

    # Inside a ScriptAlias'ed (or ScriptAliasMatch'ed) URL path?
    return 1 if grep { $URLpath=~ /^\Q$_\E/ } keys %SCRIPT_ALIAS ;
    return 1 if grep { $URLpath=~ /^$_/ } keys %SCRIPT_ALIAS_MATCH ;

    return 0 ;
}
|
604 |
|
605 |
|
# Put the URL into a canonical form, so that two URLs pointing to the
# same resource end up as the same string and we avoid superfluous
# retrievals.
# The host name is lowercased elsewhere-- this routine is only called
# from &add_url (see the note there), and lowercasing the host here
# would be inefficient.
sub canonicalize {
    my($clean_url)= @_ ;

    # Drop any "#fragment" suffix; fragments are client-side only and
    # never sent to the server.
    $clean_url=~ s/#.*// ;

    return $clean_url ;
}
|
618 |
|
619 |
|
620 #----- File reading/downloading routines (includes networking) -------- |
|
621 |
|
# Verify that a URL exists, and set $url->{'status'} accordingly.  Do
# this either by checking the local filesystem, or by using the HTTP
# HEAD method for remote sites, CGI scripts, or when $full_http_check
# is set.
# Sets $url->{'ishtml'} if that's discovered from Content-Type:.
# This does not support the various Redirect directives in srm.conf.
#
# NOTE: When checking a CGI script named in a <form action> (which
# implies dontfollow is set) via HTTP, the response code may not be
# accurate, because no form data is sent with the request as there
# normally would be.  In those cases a cautionary note is appended to
# $url->{'status'}, and an empty status is replaced with an explanatory
# note (maybe we should do that in load_url() too?).
sub verify_url {
    my($url)= @_ ;

    print STDERR "verifying $url->{'URL'}\n" if $debug ;

    # Remote URL, or a full HTTP check requested: use a HEAD request.
    if (!$url->{'islocal'} or $full_http_check) {
        &load_url_using_HTTP($url, 'HEAD') ;
        $url->{'status'}= '[no status returned]'
            unless length($url->{'status'}) ;
        $url->{'status'}.= ' (NOTE: Form was not submitted normally)'
            if $url->{'dontfollow'} ;

    # Local non-CGI URL: a plain file-existence check suffices.
    } elsif (!$url->{'iscgi'}) {
        $url->{'status'}= (-e $url->{'filename'})
            ? "200 Local File Exists" : "404 File Not Found" ;

    # Local CGI that we may follow: use a HEAD request.
    } elsif (!$url->{'dontfollow'}) {
        &load_url_using_HTTP($url, 'HEAD') ;

    # Local CGI with dontfollow set: just check for an executable file.
    } else {
        if (! -e $url->{'filename'}) {
            $url->{'status'}= "404 File Not Found" ;
        } elsif (! -x $url->{'filename'}) {
            $url->{'status'}= "403 Local File Is Not Executable" ;
        } else {
            $url->{'status'}= "200 Local Executable File Exists" ;
        }
    }

}
|
688 |
|
689 |
|
690 |
|
# Load an entire file/resource and return its contents, setting
# $url->{'status'} accordingly.  Do this either by reading the local
# filesystem or by using the HTTP GET method for remote sites or CGI
# scripts.
# Sets $url->{'ishtml'} if that's discovered from Content-Type:.
# This does not support the various Redirect directives in srm.conf.
sub load_url {
    my($url)= @_ ;
    my($HTML) ;

    print STDERR "loading $url->{'URL'}\n" if $debug ;

    # Local non-CGI file (and not forced through HTTP): read it straight
    # from the filesystem, expanding any SSI includes.
    if ($url->{'islocal'} and !$url->{'iscgi'} and !$full_http_check) {
        my($iscgi) ;
        ($HTML, $url->{'ssierrs'}, $iscgi)=
            &read_expanded_file($url->{'filename'}, $url->{'URL'}) ;

        if (!defined($HTML)) {
            $url->{'status'}=
                sprintf("450 Can't read file: %s (%s)", $!, $!+0) ;
        } elsif (@{$url->{'ssierrs'}}) {
            $url->{'status'}=
                sprintf("451 SSI Error(s) (%s total)",
                        scalar @{$url->{'ssierrs'}}) ;
        } else {
            $url->{'status'}= "200 Local File Read OK" ;
        }

        # $url->{'iscgi'} may be set if an SHTML file included CGI
        # calls.  Don't set it if we're doing a file check, in which
        # case we'll keep whatever $HTML we could get.
        $url->{'iscgi'}= $iscgi unless $file_check ;
    }

    # Otherwise (or if rereckoned above), download the resource from
    # its HTTP server.
    if (!$url->{'islocal'} or $url->{'iscgi'} or $full_http_check) {
        (undef, undef, $HTML)= &load_url_using_HTTP($url, 'GET') ;
    }

    # Note: this is set even when the URL is to be reloaded, as for a
    # 601 (timeout) response.
    $url->{'hasbeenloaded'}= 1 ;
    return $HTML ;
}
|
732 |
|
733 |
|
734 # Read a local file and return its contents. If a file is SSI (aka SHTML), |
|
735 # expand any SSI <!--#include--> directives as needed, recursively |
|
736 # including nested files. |
|
737 # This is used for all local reads, SHTML or not, but the vast bulk of this |
|
738 # routine is for SHTML files. |
|
739 # |
|
740 # If file is SHTML, this routine also returns a structure of error data, |
|
741 # and a boolean saying if this file needs to be downloaded via HTTP |
|
742 # for a complete check (e.g. includes CGI calls). |
|
743 # |
|
744 # $fname must be canonicalized absolute path, but $URL parameter is optional. |
|
745 # %$parents contains all "include"-ancestors of the file, to prevent loops. |
|
746 # If omitted, assumes no ancestors (and a fresh hash is started). |
|
747 # |
|
748 # This routine seems much bigger and more complex than it needs to be. |
|
749 # It could be one third the size and much simpler if we didn't have to |
|
750 # worry about full error reporting on nested includes. |
|
751 # |
|
752 # Note: This routine was made to mimic what Apache would return to a client. |
|
753 # However, the result differs from Apache's in two slight ways, both |
|
754 # involving nested SSI within <!--#include file="..." -->, and both |
|
755 # apparent bugs in Apache 1.1 (may be fixed in later versions): |
|
756 # |
|
757 # 1) If a <file="..."> value contains no "/" (i.e. in current directory), |
|
758 # then Apache always parses the included file as SHTML, regardless of |
|
759 # extension. This routine checks @SHTML_EXTENSIONS for all included |
|
760 # files. |
|
761 # 2) If a <file="..."> value containing a "/" loads an SHTML file |
|
762 # containing a <virtual="..."> tag with a relative path, the directive |
|
763 # fails in Apache. This routine tries to guess the correct path/URL. |
|
764 # |
|
765 # |
|
766 # Notes on this routine, and SHTML files in general: |
|
767 # |
|
768 # At first thought, it seems like we could load each included file |
|
769 # only once, instead of once for every file that includes it. |
|
770 # However, because of the fact that relative URLs are resolved |
|
771 # relative to the top-level including file, the top-level file will |
|
772 # need to be expanded every time. (It's legal (though of questionable |
|
773 # wisdom) to include a file from e.g. both /a/index.shtml and |
|
774 # /b/index.shtml, so links from the included file point to different |
|
775 # URLs.) |
|
776 # |
|
777 # Note that while URLs in included files (e.g. <a href="...">) are |
|
778 # resolved relative to the top-level including file, nested include tags |
|
779 # are resolved relative to the direct includer. |
|
780 # |
|
781 # We could possibly be more efficient in time (but costly in memory) |
|
782 # by storing the expanded contents and $errlist of each included file, |
|
783 # since those will be constant (except $errlist's include-loop |
|
784 # reporting might vary somewhat). There are probably other ways to |
|
785 # eke out savings of time and memory, at the cost of complexity.
|
786 # |
|
787 # The main loop here is inside of an s/// statement. Unusual, but it's an |
|
788 # appropriate way to handle the recursion. Recursion is needed, since each |
|
789 # included file may or may not be SHTML. |
|
790 # |
|
791 # $iscgi is set if a file includes "<!--#exec", or if it contains an |
|
792 # <!--#include virtual="..." --> tag that points to a CGI file, or if |
|
793 # any of its include-children sets $iscgi. |
|
794 # |
|
795 # |
|
796 # Notes to help clarify data structures, if (God forbid) you have to modify |
|
797 # this routine: |
|
798 # |
|
799 # Each error is a list of files in an "include chain", and $errlist is a |
|
800 # list of errors. $errlist is associated with the current $HTML. Each |
|
801 # error in $errlist is associated with some tag in $HTML, as iterated in |
|
802 # the s/// loop. When this routine returns ($HTML, $errlist), the |
|
803 # errors in $errlist should all have as their first element tags that |
|
804 # were found in $HTML. |
|
805 # |
|
806 # Each error's final element (the "leaf") is associated with a tag that |
|
807 # tried to load an invalid file. Each leaf will always be the complete |
|
808 # set of errors for that tag (i.e. it has no children, since it couldn't |
|
809 # load the file). |
|
810 # |
|
811 # If the file can be validly loaded, then we may have 0 or multiple errors |
|
812 # associated with this file/tag (and returned from this routine). Each |
|
813 # file's $errlist is an accumulation of all of its children's $errlist's. |
|
814 # |
|
815 # Errors that come from loading a child are associated with the child's |
|
816 # $HTML and tags. Before adding them to the parent's (current) $errlist, |
|
817 # they must have the CHILD's path/tag unshifted onto the front of the |
|
818 # include chain; all errors are then added to the current $errlist. |
|
819 # This ensures that: |
|
820 # a) The errors are now all associated with the tag that loaded the child. |
|
821 # b) The errors in the current $errlist are always associated with tags |
|
822 # from the current $HTML. |
|
823 # |
|
824 |
|
# Recursively expand server-side includes (SSI) in the file $fname.
#
# Parameters:
#   $fname   -- filesystem path of the file to read and expand.
#   $URL     -- URL of $fname, if already known; used to resolve
#               <!--#include virtual="..." --> paths.  Computed lazily
#               via &filename_to_url() when needed.
#   $parents -- hashref of canonicalized filenames currently on the
#               include chain; used to detect include loops.
#
# Returns ($HTML, $errlist, $iscgi):
#   $HTML    -- file contents with each <!--#include ...> tag replaced
#               by the recursively expanded included file; undef if
#               $fname could not be read.
#   $errlist -- listref of errors (each an "include chain" list; see the
#               long data-structure comment above); only set for SHTML
#               files, otherwise left undef.
#   $iscgi   -- true if this file or any include-descendant contains an
#               <!--#exec ...> tag or includes a CGI.
sub read_expanded_file {
    my($fname, $URL, $parents)= @_ ;
    my($HTML, $errlist, $iscgi) ;
    my($isshtml) ;

    # Mark this file as on the current include chain (for loop
    # detection); the mark is removed again just before returning.
    $parents->{$fname}= 1 ;

    $HTML= &read_file($fname) ;
    return(undef) unless defined($HTML) ;

    # Only files whose names end in an SHTML extension are parsed for
    # include tags; all others are returned verbatim.
    foreach (@SHTML_EXTENSIONS) {
        $isshtml= 1, last if ($fname=~ /\Q$_\E$/i) ;
    }

    if ($isshtml) {
        $errlist= [] ;
        $iscgi= ($HTML=~ /<!--#exec\b/i) ;

        # Main loop: each include tag is replaced, in place, by the
        # expanded contents of the file it names.  The do{} block's
        # final value ($childHTML) is the replacement text.
        $HTML=~ s{(<!--#include\b.*?>)} {
          do {
            my($childfname, $childURL) ;
            my($childHTML, $childerrlist, $childiscgi) ;
            my($tagst) = $1 ;
            my($tag)= &parse_tag($tagst) ;
            GET_CHILD: {
                # Leaving GET_CHILD early means $childHTML stays undef,
                # so the tag is replaced by the empty string.
                if (length($tag->{'attrib'}{'virtual'})) {
                    # virtual="...": resolve as a URL relative to this
                    # file's URL, then map back to a filename.
                    $URL= &filename_to_url($fname) unless length($URL) ;
                    $childURL=
                        &absolute_url($tag->{'attrib'}{'virtual'}, $URL) ;
                    ($childfname)= &url_to_filename($childURL) ;

                    # If it's a CGI, don't follow it, but no error either
                    if (&is_cgi($childfname, $childURL)) {
                        $iscgi= 1 ;
                        last GET_CHILD ;
                    }

                } elsif (length($tag->{'attrib'}{'file'})) {
                    # file="...": must be a relative path with no "..",
                    # resolved against the including file's directory.
                    $childfname= $tag->{'attrib'}{'file'} ;
                    if ($childfname=~ m#^(/|~)#) {
                        push(@$errlist, [ {'path' => $childfname,
                                           'tag' => $tagst, 'errmsg' =>
                                           'Absolute paths are not allowed in '
                                           . '<!--#include file="..." -->.'}]);
                        last GET_CHILD ;
                    }
                    if ($childfname=~ m#\.\.(/|$)#) {
                        push(@$errlist, [ {'path' => $childfname,
                                           'tag' => $tagst, 'errmsg' =>
                                           'Paths can not contain "../" in '
                                           . '<!--#include file="..." -->.'}]);
                        last GET_CHILD ;
                    }
                    # Prepend the includer's directory (everything up to
                    # the last "/" of $fname).
                    $childfname= ($fname=~ m#(.*/)#)[0] . $childfname ;

                } else {
                    push(@$errlist, [ {'path' => '',
                                       'tag' => $tagst, 'errmsg' =>
                                       'Tag must contain either the "file" or '
                                       . '"virtual" attribute.'}]);
                    last GET_CHILD ;

                }

                # canonicalize filename for %$parents
                # (collapse "/./" and "dir/../" segments so loop
                # detection isn't fooled by equivalent spellings)
                1 while $childfname=~ s#/\.(/|$)#/# ;
                1 while $childfname=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ;

                # Guarantee that file exists, is regular, and is readable
                unless (-e $childfname) {
                    push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
                                      'errmsg' => 'File not found'} ] ) ;
                    last GET_CHILD ;
                }
                unless (-f $childfname) {
                    push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
                                      'errmsg' => 'File is not a regular'
                                                  . ' file.' } ] ) ;
                    last GET_CHILD ;
                }
                unless (-r $childfname) {
                    push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
                                      'errmsg' => 'File is not readable by'
                                                  . ' current user.' } ] ) ;
                    last GET_CHILD ;
                }

                # Guard against include loops
                if ($parents->{$childfname}) {
                    push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
                                      'errmsg' => 'An "include" loop exists'
                                                  . ' involving this file.' } ] ) ;
                    last GET_CHILD ;
                }


                # Get the included file, with any error data
                ($childHTML, $childerrlist, $childiscgi)=
                    &read_expanded_file($childfname, $childURL, $parents) ;

                # Log if there was any error reading the file
                # NOTE(review): $! here may be stale if read_file's open
                # failed earlier in the call -- confirm before relying on
                # the message text.
                push(@$errlist, [{'path' => $childfname, 'tag' => $tagst,
                                  'errmsg' => "Can't read file: $!." } ] )
                    unless defined($childHTML) ;

                # Add any errors to the current (parent) error list,
                # prefixing each chain with the child's path/tag so every
                # error in $errlist refers to a tag in the current $HTML.
                foreach my $error (@$childerrlist) {
                    unshift(@$error,
                            { 'path' => $childfname, 'tag' => $tagst } ) ;
                }
                push(@$errlist, @$childerrlist) ;

                # Parent is a CGI if any of its children is a CGI
                $iscgi||= $childiscgi ;

            } # GET_CHILD


            $childHTML ;   # final value to replace in main s/// construct

          } # do {}

        }gie ;   # $HTML=~ s{} {}

    } # if ($isshtml)

    # This file is no longer on the active include chain.
    delete $parents->{$fname} ;

    return($HTML, $errlist, $iscgi) ;
}
|
955 |
|
956 |
|
957 |
|
# Returns the contents of the named file, or undef on error.
# The whole file is slurped in one read by locally undefining $/ .
sub read_file {
    my($fname)= @_ ;
    local($/) ;   # slurp mode, restored automatically on scope exit

    undef $/ ;
    # Three-arg open with a lexical filehandle: the two-arg form
    # (open(F, "<$fname")) would let a filename beginning with ">", "|",
    # etc. change the open mode or run a command.  The lexical handle
    # also closes automatically if an early return is ever added.
    open(my $fh, '<', $fname) || return undef ;
    my($ret)= <$fh> ;
    close($fh) ;

    return $ret ;
}
|
970 |
|
971 |
|
972 # Try to get the given URL with the given HTTP method, and return the |
|
973 # status line, headers, and body. |
|
974 # Set $url->{'status'} accordingly, and set $url->{'ishtml'} accordingly |
|
975 # if Content-Type: header is returned. |
|
976 # This is specific to this program, and calls the more general &get_url(). |
|
977 # This could be slightly more efficient if 302 or 303 was handled in the |
|
978 # calling routine, where it could take advantage of a new URL being local. |
|
# Fetch $url->{'URL'} (or its recorded redirect location) over HTTP with
# the given $method, following 302/303 redirects up to $MAX_REDIRECTS.
#
# Parameters:
#   $url    -- hashref describing the URL; this routine reads 'URL' and
#              'location', and updates 'status', 'location',
#              'numredirects', 'numtries', 'lasttried', and 'ishtml'.
#   $method -- HTTP method to try first (falls back to GET if the first
#              request yields no status line at all).
#
# Returns ($status_line, $headers, $body) as produced by &get_url().
sub load_url_using_HTTP {
    my($url, $method)= @_ ;
    my($status_line, $headers, $body) ;

    # We should not get here if $file_check is set
    die "mistakenly called load_url_using_HTTP($url->{'URL'})" if $file_check ;

    GETFILE: {
        ($status_line, $headers, $body)=
            &get_url( ($url->{'location'} || $url->{'URL'}), $method) ;

        # If HEAD failed (as on some servers), sigh and use GET
        ($status_line, $headers, $body)=
            &get_url( ($url->{'location'} || $url->{'URL'}), 'GET')
            unless length($status_line) ;

        # Status is everything after "HTTP/x.y " in the status line.
        ($url->{'status'})= $status_line=~ m#^HTTP/[\d.]+\s+(.*)# ;

        # 2-27-00 JSM: Allow old NCSA servers to not include the HTTP version.
        if ($SUPPORT_NCSA_BUG and $url->{'status'} eq '') {
            ($url->{'status'})= $status_line=~ m#^HTTP(?:/[\d.]+)?\s+(.*)# ;
        }

        # Redirect to new location if status is 302 or 303.
        # Note the Location: header is recorded for 301 as well, but the
        # redo below only re-fetches for 302/303.
        if ($url->{'status'}=~ /^(301|302|303)\b/) {
            ($url->{'location'})= $headers=~ m#^Location:[ \t]+(\S+)#im ;
            last GETFILE unless length($url->{'location'}) ;
            # Resolve a relative Location: against the original URL
            # (this is the 1.0.1 fix for infinitely recursing URLs).
            $url->{'location'}=
                &absolute_url($url->{'location'}, $url->{'URL'}) ;
            redo GETFILE
                if ($url->{'status'}=~ /^(302|303)\b/)
                    && (++$url->{'numredirects'} <= $MAX_REDIRECTS) ;
        }
    }

    $url->{'numtries'}++ ;
    $url->{'lasttried'}= time ;

    # If successful response included Content-Type:, set ishtml accordingly
    # ($1 is the captured content type; "+ 0" forces a numeric 0/1).
    $url->{'ishtml'}= (lc($1) eq 'text/html') + 0
        if $url->{'status'}=~ /^2/
            and $headers=~ m#^content-type:[ \t]*(\S+)#im ;

    print STDERR "status: $status_line\n" if $debug ;

    return($status_line, $headers, $body) ;
}
|
1026 |
|
1027 |
|
1028 # Request the HTTP resource at the given absolute URL using the given method, |
|
1029 # and return the response status line, headers, and body. |
|
1030 # jsm-- in the future, this should support downloading to a file, in case |
|
1031 # the download is too large to fit in memory. |
|
# Request the HTTP resource at the given absolute URL using the given
# method, over a raw TCP socket, and return the response.
#
# Parameters:
#   $URL    -- absolute http:// URL.
#   $method -- HTTP method; uppercased, defaults to GET if empty.
#
# Returns ($status_line, $headers, $body).  On local failures a
# synthesized status line with a private 6xx code is returned:
#   600 = can't create socket, 601 = select() timeout,
#   602 = short Content-length body, 603 = short chunked body.
#
# Honors $ENV{'http_proxy'} (minus hosts matching @NO_PROXY).
sub get_url {
    my($URL, $method)= @_ ;
    my($host, $uri, $endhost) ;
    my($S, $rin) ;
    my($response, $status_line, $headers, $body, $status_code) ;
    my($content_length) ;
    $method= uc($method) ;
    $method= 'GET' unless length($method) ;

    ($host, $uri)= $URL=~ m#^http://([^/]*)(.*)$#i ;
    $uri= '/' unless length($uri) ;
    # $endhost keeps the target host for the Host: header even when the
    # request is rerouted through a proxy below.
    $endhost= $host ;

    # use an HTTP proxy if $ENV{'http_proxy'} is set
    USEPROXY: {
        # Hosts without a "." (unqualified local names) never use a proxy.
        last USEPROXY unless $host=~ /\./ ;
        if (length($ENV{'http_proxy'})) {
            foreach (@NO_PROXY) {
                last USEPROXY if $host=~ /$_$/i ;
            }
            # Connect to the proxy instead, and send the full URL as the
            # request-URI (as HTTP proxies require).
            ($host)= $ENV{'http_proxy'}=~ m#^(?:http://)?([^/]*)#i ;
            $uri= $URL ;
        }
    }

    # Open socket
    $S= IO::Socket::INET->new(PeerAddr => $host,  # may contain :port
                              PeerPort => 80,     # default if none in PeerAddr
                              Proto => 'tcp') ;
    return("HTTP/1.1 600 Can't create socket: $@") unless defined($S) ;
    $S->autoflush() ;   # very important!!

    # Send HTTP 1.1 request
    print $S "$method $uri HTTP/1.1\015\012",
             "Host: $endhost\015\012",
             "Connection: close\015\012",
             "User-agent: CheckLinks/$CL_VERSION\015\012",
             "\015\012" ;

    # Wait for socket response with select() (60-second timeout)
    vec($rin= '', fileno($S), 1)= 1 ;
    select($rin, undef, undef, 60)
        || return("HTTP/1.1 601 Connection timed out") ;

    # Read line-by-line on bare LF; CRs are stripped explicitly below.
    local($/)= "\012" ;

    # Handle "100 Continue" responses for HTTP 1.1: loop until non-1xx.
    do {
        $status_line= <$S> ;
        $status_line=~ s/\015?\012$// ;
        ($status_code)= $status_line=~ m#^HTTP/\d+\.\d+\s+(\d+)# ;

        $headers= '' ;
        while (<$S>) {
            last if /^\015?\012/ ;   # blank line ends the header block
            $headers.= $_ ;
        }
        # Unfold continuation lines (leading whitespace) into one line.
        $headers=~ s/\015?\012[ \t]+/ /g ;
    } until $status_code!~ /^1/ ;

    # Body length is determined by HTTP 1.1 spec, section 4.4: these
    # certain conditions implying no body, then chunked encoding,
    # then Content-length: header, then server closing connection.
    if ($method eq 'HEAD' or $status_code=~ /^(1|204\b|304\b)/) {
        $body= undef ;

    # else chunked encoding
    } elsif ($headers=~ /^transfer-encoding:[ \t]*chunked\b/im) {
        # 7-16-99: Old code was only saving last chunk.  Fix using
        # $this_chunk contributed by Mark Trotter.
        my($this_chunk, $chunk_size, $readsofar, $thisread) ;
        # Each chunk starts with its size in hex; size 0 ends the body.
        while ($chunk_size= hex(<$S>)) {
            $readsofar= 0 ;
            # read() may return short, so loop until the chunk is full.
            while ($readsofar!=$chunk_size) {
                last unless $thisread=
                    read($S, $this_chunk, $chunk_size-$readsofar, $readsofar) ;
                $readsofar+= $thisread ;
            }
            return("HTTP/1.1 603 Incomplete chunked response", $headers, $body)
                if $readsofar!=$chunk_size ;
            $_= <$S> ;   # clear CRLF after chunk
            $body.= $this_chunk ;
        }

        # Read footers (trailers) if they exist; appended to $headers.
        while (<$S>) {
            last if /^\015?\012/ ;
            $headers.= $_ ;
        }
        $headers=~ s/\015?\012[ \t]+/ /g ;


    # else body length given in Content-length:
    } elsif (($content_length)= $headers=~ /^content-length:[ \t]*(\d+)/im) {
        my($readsofar, $thisread) ;
        while ($readsofar!=$content_length) {
            last unless $thisread=
                read($S, $body, $content_length-$readsofar, $readsofar) ;
            $readsofar+= $thisread ;
        }
        return(sprintf("HTTP/1.1 602 Incomplete response (%s of %s bytes)",
                       $readsofar+0, $content_length),
               $headers, $body)
            if $readsofar!=$content_length ;


    # else body is entire socket output (server closes the connection)
    } else {
        local($/)= undef ;
        $body= <$S> ;
    }

    close($S) ;

    return($status_line, $headers, $body) ;
}
|
1148 |
|
1149 |
|
1150 #----- URL-parsing routines ------------------------------------------- |
|
1151 |
|
1152 # The routines parse_url(), unparse_url(), and absolute_url() are based on |
|
1153 # different sections in the Internet Draft "Uniform Resource Identifiers |
|
1154 # (URI): Generic Syntax and Semantics", 11-18-97, by Berners-Lee, |
|
1155 # Fielding, and Masinter, filename draft-fielding-uri-syntax-01. |
|
1156 |
|
1157 # Parse a URL into its components, according to URI draft, sections 4.3, 4.4. |
|
1158 # This regular expression is straight from Appendix B, modified to use Perl 5. |
|
1159 # Returns scheme, site, path, query, and fragment. All but path may have |
|
1160 # the undefined value. |
|
# Split a URL into (scheme, site, path, query, fragment), per the URI
# draft's Appendix B regular expression (RFC 2396 lineage).  All
# components except the path may come back undefined.  The path is
# URL-decoded here, and an empty path becomes "/" for http URLs with a
# host, since HTTP treats those as equivalent.
sub parse_url {
    my($url)= @_ ;

    # One regex does the whole decomposition; each optional component is
    # captured by its own group.
    my @component= $url=~ m{^(?: ([^:/?\#]+):)?
                            (?: // ([^/?\#]*))?
                            ([^?\#]*)
                            (?: \? ([^\#]*))?
                            (?: \# (.*))?
                           }x ;
    my($scheme, $site, $path, $query, $fragment)= @component ;

    # Un-URL-encode the path, to equivalate things like %7E --> ~ .
    # Caveat: a path containing a literal "%" that is later fed back into
    # relative-URL resolution could get decoded a second time (rare).
    $path=~ tr/+/ / ;
    $path=~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge ;

    # In HTTP, a host with no path implies the path "/", so canonicalize
    # e.g. "http://www.somehost.com" to have path "/".
    $path= '/'
        if !length($path) && length($site) && lc($scheme) eq 'http' ;

    return($scheme, $site, $path, $query, $fragment) ;
}
|
1187 |
|
1188 |
|
1189 # Returns a full URL string, given its components |
|
1190 # The full procedure is described in the URI draft, section 5.2, step 7. |
|
# Reassemble a URL string from its five components, per the URI draft,
# section 5.2, step 7.  Undefined components are omitted entirely;
# defined-but-empty ones still contribute their delimiter (so an empty
# query yields a trailing "?", etc.).
sub unparse_url {
    my($scheme, $site, $path, $query, $fragment)= @_ ;

    my $full= '' ;
    $full.= $scheme . ':'   if defined($scheme) ;
    $full.= '//' . $site    if defined($site) ;
    $full.= $path ;
    $full.= '?' . $query    if defined($query) ;
    $full.= '#' . $fragment if defined($fragment) ;

    return $full ;
}
|
1203 |
|
1204 |
|
1205 # Returns a canonicalized absolute URL, given a relative URL and a base URL. |
|
1206 # The full procedure is described in the URI draft, section 5.2. |
|
1207 # Note that a relative URI of "#fragment" should be resolved to "the current |
|
1208 # document", not to an absolute URL. This presents a quandary for this |
|
1209 # routine: should it always return an absolute URL, thus violating the |
|
1210 # spec, or should it not always return an absolute URL, thus requiring any |
|
1211 # caller to check for this special case? This routine leaves that up to |
|
1212 # the caller, with $return_rel_fragment-- if set, stick to the spec; |
|
1213 # otherwise, always return an absolute URL. See section G.4 of the draft. |
|
1214 # Note that the pathname reduction in steps 6.c-f messes up any PATH_INFO |
|
1215 # that has ./ or ../ in it, which may be a bug in the spec. |
|
# Resolve $relurl against $baseurl and return a canonicalized absolute
# URL string (see the long comment above for the $return_rel_fragment
# special case, where the original "#fragment" string is returned as-is).
#
# Parameters:
#   $relurl              -- relative (or already absolute) URL.
#   $baseurl             -- base URL to resolve against.
#   $return_rel_fragment -- if true, a bare "#fragment" reference is
#                           returned unchanged per the spec.
#
# Step numbers refer to the URI draft (RFC 2396 lineage), section 5.2.
sub absolute_url {
    my($relurl, $baseurl, $return_rel_fragment)= @_ ;
    my(@relurl, @baseurl) ;

    # parse_url() returns scheme, site, path, query, fragment
    # (indices 0..4 of @relurl/@baseurl throughout).
    @relurl= &parse_url($relurl) ;   # Step 1
    @baseurl= &parse_url($baseurl) ;

    COMBINE: {

        # Step 2: a reference with no scheme/site/path/query is either
        # empty or fragment-only; inherit everything but the fragment.
        # See note above about $return_rel_fragment
        if ( $relurl[2] eq '' &&
             !defined($relurl[0]) &&
             !defined($relurl[1]) &&
             !defined($relurl[3]) ) {
            @relurl[0..3]= @baseurl[0..3] ;
            return $relurl if $return_rel_fragment ;   # see note above
            last COMBINE ;
        }

        last COMBINE if defined($relurl[0]) ;   # Step 3: already absolute
        $relurl[0]= $baseurl[0] ;

        last COMBINE if defined($relurl[1]) ;   # Step 4: has its own site
        $relurl[1]= $baseurl[1] ;

        last COMBINE if $relurl[2]=~ m#^/# ;    # Step 5: absolute path

        # Step 6-- resolve relative path
        # (base path up to and including its last "/")
        my($path)= $baseurl[2]=~ m#^(.*/)# ;    # Step 6.a
        $relurl[2]= $path . $relurl[2] ;        # Step 6.b

    } # COMBINE

    # Put the remaining steps outside of the block to canonicalize the path.
    # Arguably, this is not allowed.  To avoid such arguments at the expense of
    # path canonicalization, put steps 6.c-f back in the COMBINE block.

    1 while $relurl[2]=~ s#(^|/)\./#$1# ;   # Step 6.c: drop "./" segments
    $relurl[2]=~ s#(^|/)\.$#$1# ;           # Step 6.d: drop trailing "."

    # Step 6.e: collapse "segment/../" pairs, leaving literal "../.."
    # runs alone; the $oldpath check stops the loop once the s///ge pass
    # makes no further progress.
    my($oldpath) ;
    while ($relurl[2]=~ s#(([^/]+)/\.\./)# ($2 eq '..') ? $1 : '' #ge) {
        last if ($relurl[2] eq $oldpath) ;
        $oldpath= $relurl[2] ;
    }

    # Step 6.f: same collapse for a trailing "segment/.."
    $relurl[2]=~ s#(([^/]+)/\.\.$)# ($2 eq '..') ? $1 : '' #ge ;

    # Step 6.g: allow leading ".." segments to remain in path
    # Step 6.h: relurl[2] is already the buffer string

    # To canonicalize further, lowercase the hostname (is this valid for all
    # schemes?)
    $relurl[1]= lc($relurl[1]) if defined($relurl[1]) ;

    return &unparse_url(@relurl) ;   # Step 7
}
|
1277 |
|
1278 |
|
1279 |
|
1280 # Convert a local URL into a canonicalized absolute path, or undef if |
|
1281 # not on this host or other error. |
|
1282 # Result should only be used as filename. |
|
1283 # Supports UserDir (e.g. public_html) for "/~username/path/file" URLs. |
|
1284 # Supports Alias, AliasMatch, ScriptAlias, and ScriptAliasMatch from srm.conf |
|
1285 # (but note use of Perl regex's instead of standard regex's). |
|
1286 # Inserts index.html, etc. (from @DIRECTORY_INDEX) if result is a directory, |
|
1287 # but just return directory name (ending in '/') if none of those exists. |
|
1288 # Removes PATH_INFO, if any, from filename. |
|
1289 # Directory names are always returned with trailing slash (which would not |
|
1290 # be appropriate if PATH_INFO was to be retained). |
|
1291 # While this routines makes some tests (e.g. if the file is a directory), |
|
1292 # it does not verify that file at the resulting $filename exists. |
|
1293 # Note that not all URLs point to files, so this routine is not always |
|
1294 # appropriate. In this program, the result from this routine is only |
|
1295 # used when we know the URL is not a CGI script (and is therefore a file), |
|
1296 # except in &is_cgi() itself, which tests if a file is a CGI script. |
|
1297 # If it weren't for &is_cgi(), we could ignore cases when the URL isn't |
|
1298 # a file. |
|
1299 # 12-1-99 JSM: Changed to also return "redirected" location, in case URL |
|
1300 # is a directory but not ending in a slash, so relative URLs will resolve |
|
1301 # correctly against the redirected URL. |
|
# Convert a local URL into a canonicalized absolute filesystem path
# (see the long comment above for aliasing, UserDir, PATH_INFO, and
# directory-index behavior).
#
# Parameters:
#   $URL -- absolute URL; must be on $LOCAL_HOST.
#
# Returns ($path, $location):
#   $path     -- filesystem path, or undef if the URL isn't on this host
#                or can't be mapped.
#   $location -- "redirected" URL ("$URL/") when $URL names a directory
#                without a trailing slash; otherwise undef.
sub url_to_filename {
    my($URL)= @_ ;
    my($URLpath, $path, $location, $docroot, $user) ;
    return undef unless $URL=~ m#^http://\Q$LOCAL_HOST\E/#io ;
    $URLpath= (&parse_url($URL))[2] ;
    die "couldn't get path from [$URL]" unless length($URLpath) ;

    # Note minor security hole: if this script is run setuid, then any
    # file on the system could be read by using an ALIAS to point to the
    # file.  Note also that if a $URLpath such as "/alias/dir/../.." is
    # passed to this routine, the alias will be substituted BEFORE the
    # ".." path segments are traversed.  A case like this probably a
    # mistake in the URL anyway.

    # Make no more than one alias substitution-- is there a precedence order?
    # Note that %(.*)_MATCH use Perl regex's, not standard regex's.
    # 3-29-99 JSM: These all alias to actual directory, not to a resulting
    #   URL, so no further conversion should be done if one of these matches.
    # 3-29-99 JSM: Changed ALIAS_MATCH and SCRIPT_ALIAS_MATCH blocks to
    #   allow $1-type substitution in targets; MUST TEST!
    # (In each branch, setting $path marks that aliasing already produced
    # the final filesystem path, skipping the docroot logic below.)
    ALIAS: {
        foreach (keys %ALIAS)
            { $path= $URLpath, last ALIAS
                  if $URLpath=~ s/^\Q$_\E/$ALIAS{$_}/ }
        foreach (keys %ALIAS_MATCH)
            { $path= $URLpath, last ALIAS
                  if eval "\$URLpath=~ s/^\$_/$ALIAS_MATCH{$_}/" }
        foreach (keys %SCRIPT_ALIAS)
            { $path= $URLpath, last ALIAS
                  if $URLpath=~ s/^\Q$_\E/$SCRIPT_ALIAS{$_}/ }
        foreach (keys %SCRIPT_ALIAS_MATCH)
            { $path= $URLpath, last ALIAS
                  if eval "\$URLpath=~ s/^\$_/$SCRIPT_ALIAS_MATCH{$_}/" }
    }


    # If $path has been set in above ALIAS block, no further conversion is
    # needed.
    if ($path eq '') {

        # Must check for ^/.. before PATH_INFO check, in case $URL's path
        # is e.g. '/../conf/access.conf'
        return undef if $URLpath=~ m#^/\.\.(/|$)# ;   # ^/.. is not allowed

        # Set $docroot and $path for this file, based on the URL (contains '~' ?)
        if (length($USER_DIR) and ($user,$path)= $URLpath=~ m#^/~([^/]+)(.*)# ) {
            &read_home_dirs unless %home_dir ;   # only read when needed
            return undef unless length($home_dir{$user}) ;
            $docroot= "$home_dir{$user}/$USER_DIR" ;
            $path= '/' unless length($path) ;
        } else {
            # If we have no $DOCUMENT_ROOT, we can't handle URLs without ~.
            return undef unless $DOC_ROOT_EXISTS ;
            $docroot= $DOCUMENT_ROOT ;
            $path= $URLpath ;
        }

        # Handle PATH_INFO: remove path segments until an existing file is named.
        # Note that directories cannot have PATH_INFO after them.
        unless (-e "$docroot$path") {
            for (my($path2)= $path ; $path2=~ m#/# ; $path2=~ s#/[^/]*$##) {
                last if -d "$docroot$path2" ;
                $path= $path2, last if -e "$docroot$path2" ;
            }
        }

        # canonicalize path, and recheck for ^/.. (handles an obscure error,
        # when $URL's path is e.g. '/a/b/../../..'; but must canonicalize
        # after PATH_INFO check, in case path is e.g. '/a/b.cgi/../../..').
        1 while $path=~ s#/\.(/|$)#/# ;
        1 while $path=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ;
        return undef if $path=~ m#^/\.\.(/|$)# ;   # ^/.. is not allowed

        $path= "$docroot$path" ;

    }


    # Add "index.html", etc. if appropriate
    if (-d $path) {
        $path.= '/' unless $path=~ m#/$# ;
        # 12-1-99 JSM: set "redirected" location also
        $location= "$URL/" unless $URL=~ m#/$# ;
        foreach (@DIRECTORY_INDEX) {
            $path.= $_, last if -f "$path$_" ;
        }
    }
    return ($path, $location) ;
}
|
1391 |
|
1392 |
|
1393 # Convert a local (possibly relative) pathname into a canonicalized URL. |
|
1394 # If filename is relative and no $basepath is given, assume it's in the |
|
1395 # current directory. |
|
1396 # Supports UserDir (e.g. public_html) for "/~username/path/file" URLs. |
|
1397 # Each path segment is checked to see if it's the same as $DOCUMENT_ROOT, |
|
1398 # by comparing inodes. When a match is found, it's cut off the front, |
|
1399 # and an absolute URL is constructed. If $DOCUMENT_ROOT is never matched, |
|
1400 # then $USER_DIR is scanned for. If that doesn't match (i.e. the file |
|
1401 # is not served to the Web), undef is returned. |
|
1402 # Note that $DOC_ROOT_DEV and $DOC_ROOT_INODE are set at the start of the |
|
1403 # program for efficiency, but are an integral part of this routine. |
|
1404 # %ALIAS is supported using %UN_ALIAS, as best as possible. See next note |
|
1405 # on avoiding use of this routine. |
|
1406 # This is currently only used when parsing command-line filenames, and when |
|
1407 # an <!--#include file="..." --> includes an <!--#include virtual="..." --> |
|
1408 # (which may be an error anyway). Thus, it can be avoided if needed, such |
|
1409 # as when complex aliasing makes results ambiguous. |
|
1410 # jsm-- should this add/remove @DIRECTORY_INDEX, to avoid some duplication? |
|
1411 # 3-29-99 JSM: Changed UNALIAS handling-- if it's unaliased, then no other |
|
1412 # conversion is necessary. |
|
# Convert a local (possibly relative) pathname into a canonicalized URL
# on $LOCAL_HOST, or undef if the file is not served to the Web (see the
# long comment above for the unaliasing, docroot-inode, and UserDir
# strategies, and this routine's known limitations).
#
# Parameters:
#   $path     -- filename to convert.
#   $basepath -- directory (or file whose directory) a relative $path is
#                relative to; defaults to $CWD.
sub filename_to_url {
    my($path, $basepath)= @_ ;
    my($URLpath) ;
    # Make a relative $path absolute by prepending $basepath's directory.
    unless ($path=~ m#^/#) {
        $basepath= $CWD unless length($basepath) ;
        $basepath.= '/' if -d $basepath && $basepath!~ m#/$# ;
        $basepath=~ s#/[^/]*$#/# ;   # strip any filename component
        $path= "$basepath$path" ;
    }

    # canonicalize filename by removing ./ and ../ where appropriate
    1 while $path=~ s#/\.(/|$)#/# ;
    1 while $path=~ s#/(?!\.\./)[^/]+/\.\.(/|$)#/# ;

    # canonicalize directory to include final /
    $path.= '/' if -d $path && $path!~ m#/$# ;

    # First, if path can be unaliased, return that.
    # (Added 3-29-99 by JSM.)
    # (Note $URLpath receives $path AFTER the s/// has rewritten it.)
    foreach (keys %UN_ALIAS)
        { $URLpath= $path, last if $path=~ s/^\Q$_\E/$UN_ALIAS{$_}/ }
    return "http://\L$LOCAL_HOST\E$URLpath" if $URLpath ne '' ;

    # Then, check if file is under $DOCUMENT_ROOT tree, and convert if so.
    # Walk up the directory tree, comparing each ancestor's (dev, inode)
    # to $DOCUMENT_ROOT's; a match means $path is inside the docroot.
    if ($DOC_ROOT_EXISTS) {
        my($doc_root)= $path ;
        while ($doc_root=~ s#/[^/]*$##) {
            my($dev,$inode)= stat("$doc_root/.") ;
            if ( ($dev==$DOC_ROOT_DEV) && ($inode==$DOC_ROOT_INODE) ) {
                $path=~ s/^$doc_root// ;
#                foreach (keys %UN_ALIAS)
#                    { last if $path=~ s/^\Q$_\E/$UN_ALIAS{$_}/ }
                return "http://\L$LOCAL_HOST\E$path" ;
            }
        }
    }

    # Handle possible case of "~username/$USER_DIR/$path"
    # I don't think %ALIAS applies here, does it?
    # This misses some when $HOME/$USER_DIR points through a symbolic link,
    # and $CWD isn't set to match %dir_to_user.  Work around by avoiding
    # this routine, e.g. using only URLs on command line.
    if (length($USER_DIR)) {
        if ($path=~ m#^(.*?)/$USER_DIR(/.*)# ) {
            # First, see if path is in %dir_to_user
            &read_home_dirs unless %dir_to_user ;   # only read when needed
            return "http://\L$LOCAL_HOST\E/~$dir_to_user{$1}$2"
                if length($dir_to_user{$1}) ;

            # If not, then we must check inodes to equivalate directories
            &read_inode_to_user unless %inode_to_user ;   # only read when needed
            my($dev,$inode)= stat("$1/.") ;
            return "http://\L$LOCAL_HOST\E/~$inode_to_user{$dev}{$inode}$2"
                if length($inode_to_user{$dev}{$inode}) ;
        }
    }

    return undef ;
}
|
1472 |
|
1473 |
|
1474 |
|
1475 # Reads all users' home directory into %home_dir, from /etc/passwd. |
|
1476 # Also creates %dir_to_user, which is faster than %inode_to_user (below). |
|
1477 # Only used when $USER_DIR is used, for "/~username/path/file" URLs. |
|
1478 # 2-27-00 JSM: Changed to use getpwent, instead of reading /etc/passwd. |
|
# Load every user's home directory into %home_dir (user -> dir), and the
# reverse map %dir_to_user (dir -> user), from the passwd database via
# getpwent.  Only needed when $USER_DIR is in use, for
# "/~username/path/file" URLs.
# 2-27-00 JSM: Changed to use getpwent, instead of reading /etc/passwd.
sub read_home_dirs {
    setpwent ;   # rewind, in case getpwent has already been used
    # The (getpwent)[0,7] slice is empty at end-of-database, which ends
    # the loop (a list slice of an empty list is an empty list).
    while ( my($name, $dir)= (getpwent)[0,7] ) {
        $home_dir{$name}= $dir ;
        # first user seen for a shared home directory wins
        $dir_to_user{$dir}= $name
            if $dir_to_user{$dir} eq '' ;
    }
    endpwent ;   # clean way to end getpwent processing
}
|
1490 |
|
1491 |
|
1492 # Reads home directory inode information into %inode_to_user, from /etc/passwd. |
|
1493 # Because this is time-consuming, it is only called if needed, and only once. |
|
1494 # Only used when $USER_DIR is used, for "/~username/path/file" URLs. |
|
1495 # On SPARCstation-10 with 3000 /etc/passwd records, this takes ~2 seconds. |
|
1496 # 2-27-00 JSM: Changed to use already-existing %home_dir, instead of reading |
|
1497 # /etc/passwd again. |
|
# Build %inode_to_user: for each user's home directory, map its
# (device, inode) pair to the username, so directories can be
# "equivalated" even through differing path spellings.  Expensive, so it
# is only called when needed, and only once; relies on %home_dir, which
# is lazily loaded here if empty.
sub read_inode_to_user {
    &read_home_dirs unless %home_dir ;   # only read when needed

    while ( my($name, $dir)= each %home_dir ) {
        my($dev, $inode)= stat("$dir/.") ;
        # keep the first user found for each (device, inode) pair
        $inode_to_user{$dev}{$inode}= $name
            if $inode_to_user{$dev}{$inode} eq '' ;
    }
}
|
1509 |
|
1510 |
|
1511 |
|
1512 #----- Extracting URLs from HTML ------------------------------------ |
|
1513 |
|
1514 |
|
1515 # Parse an SGML tag, and return a hash structure with a "name" scalar and |
|
1516 # an "attrib" hash. |
|
1517 # Parses first tag in string, ignoring all surrounding text. |
|
1518 # Results are canonicalized to lower case wherever case-insensitive. |
|
# Parse the first SGML tag in a string, ignoring surrounding text, and
# return a hashref { 'name' => tag name, 'attrib' => hashref }.
# Tag and attribute names are lowercased; attribute values are
# HTML-unescaped.  Valueless (boolean) attributes get their own name as
# their value.
sub parse_tag {
    my($tag)= @_ ;   # will be mangled
    my($tagname,%attrib) ;

    # only parse first tag in string
    ($tag)= split(/>/, $tag) ;   # remove all after >
    $tag=~ s/^([^<]*<)?\s*// ;   # remove pre-<, <, and leading blanks
    ($tagname,$tag)= split(/\s+/, $tag, 2) ;   # split out tag name

    # Extract name/value (possibly quoted), lowercase name, set $attrib{}.
    # If quoted, is delimited by quotes; if not, delimited by whitespace.
    # Each match is destructively removed from $tag; $+ is the
    # highest-numbered group that matched, i.e. the value from whichever
    # alternative applied (unquoted, double-quoted, or single-quoted).
    $attrib{lc($1)}= &HTMLunescape($+)
        while ($tag=~ s/\s*(\w+)\s*=\s*(([^"']\S*)|"([^"]*)"?|'([^']*)'?)//) ;

    # now, get remaining non-valued (boolean) attributes
    $tag=~ s/^\s*|\s*$//g ;   # skip leading/trailing blanks
    foreach (split(/\s+/, $tag)) {
        $_= lc($_) ;
        $attrib{$_}= $_ ;   # booleans have values equal to their name
    }

    return { 'name' => lc($tagname),
             'attrib' => \%attrib } ;
}
|
1543 |
|
1544 |
|
1545 # Unescape any HTML character references and return resulting string. |
|
1546 # Support entity character references in %e_to_ch (which is incomplete), |
|
1547 # plus decimal "&#ddd;" and hex "&#xhh;" forms for values<256.
|
1548 # Note that not decoding a valid character is erroneous, in that a |
|
1549 # subsequent re-escaping will not return the original string, because |
|
1550 # of the ampersand. Nonetheless, that's preferable to losing the data. |
|
1551 # Q: Is there an appropriate general way to represent an unescaped string? |
|
# Decode HTML character references in a string and return the result.
# Handles named entities via %e_to_ch (which is incomplete), plus
# decimal "&#ddd;" and hex "&#xhh;" forms for values below 256; the
# trailing semicolon is optional.  Anything unrecognized is left intact
# (including its "&"), which is preferable to losing the data even
# though re-escaping would then not round-trip.
sub HTMLunescape {
    my($str)= @_ ;

    $str=~ s{&(([a-zA-Z][a-zA-Z0-9.-]*);?|#([0-9]+);?|#[Xx]([0-9a-fA-F]+);?)}{
        my($whole, $entity, $dec, $hex)= ($1, $2, $3, $4) ;
        my($decoded) ;
        if (length($entity)) {
            # named entity: decode only if we know it
            $decoded= defined($e_to_ch{$entity}) ? $e_to_ch{$entity}
                                                 : "&$whole" ;
        } elsif (length($dec)) {
            $decoded= ($dec < 256) ? chr($dec) : "&$whole" ;
        } elsif (length($hex)) {
            $decoded= (hex($hex) < 256) ? chr(hex($hex)) : "&$whole" ;
        } else {
            $decoded= "&$whole" ;
        }
        $decoded ;
    }ge ;

    return $str ;
}
|
1565 |
|
1566 |
|
# Given a block of HTML, extracts all URLs referenced in it, and adds them
# to our data structures to be downloaded or checked (i.e. calls
# &add_url()).
#
# Parameters:
#   $HTML    - the HTML text to scan (a private copy; caller's string is
#              not modified)
#   $baseurl - URL against which relative links are resolved; overridden
#              by a <base href=...> tag if one appears in the document
#   $referer - URL of the page containing these links (for reporting)
#   $depth   - traversal depth of the referring page
#
# Note that %html_urls and %non_html_urls are set at the start of the
# program for efficiency, but are an integral part of this routine:
# they map tag names to the list of attributes that hold URLs, split by
# whether the target may itself be HTML to traverse.
# Currently, this extracts all <.*?> patterns, which may not be valid if
# "<" or ">" characters are e.g. inside a <script> element.
sub extract_urls {
    my($HTML, $baseurl, $referer, $depth)= @_ ;
    my(@tags) ;

    # Remove comments before extracting links, as pointed out by Tim Hunter.
    $HTML=~ s/<!--.*?--.*?>//gs ;

    # We must look for <base> tag before all the work, so we must parse
    # all tags first. :(  Therefore, we save this large array of
    # structures for efficiency, hoping we don't run out of memory.
    my($i)= -1 ;   # to start at array element 0

    foreach ($HTML=~ /(<.*?>)/gs) {
        $tags[++$i]= &parse_tag($_) ;
        # A <base href=...> replaces the resolution base for the page.
        $baseurl= $tags[$i]{'attrib'}{'href'}
            if ($tags[$i]{'name'} eq 'base')
                and (length($tags[$i]{'attrib'}{'href'})) ;
    }

    # For each tag, call &add_url() for each URL in the tag
    foreach my $tag (@tags) {
        next if $tag->{'name'}=~ m#^/# ;   # skip closing tags like </a>

        # Handle the "regular" tag-attributes, in %html_urls and %non_html_urls

        # Attributes whose targets may be HTML (candidates for traversal).
        foreach (@{$html_urls{$tag->{'name'}}}) {
            &add_url(&absolute_url($tag->{'attrib'}{$_}, $baseurl),
                     $referer, $depth)
                if length($tag->{'attrib'}{$_}) ;
        }

        # Attributes whose targets are never HTML (images etc.); per the
        # %non_html_urls name, the trailing 0 appears to be an is-HTML
        # flag for &add_url() -- confirm against &add_url()'s signature.
        foreach (@{$non_html_urls{$tag->{'name'}}}) {
            &add_url(&absolute_url($tag->{'attrib'}{$_}, $baseurl),
                     $referer, $depth, 0)
                if length($tag->{'attrib'}{$_}) ;
        }

        # Now handle each tag-attribute that needs special attention

        if ($tag->{'name'} eq 'object') {
            # codebase is itself resolved against the page base, then
            # used as the base for data/classid/archive below.
            my($codebase)=
                &absolute_url($tag->{'attrib'}{'codebase'}, $baseurl) ;

            &add_url(&absolute_url($tag->{'attrib'}{'data'}, $codebase),
                     $referer, $depth)
                if length($tag->{'attrib'}{'data'}) ;

            # There seems to be a contradiction between the HTML 4.0 spec
            # section 13.3.3 and RFC 1808 section 4 step 2b regarding
            # the URL resolution of e.g. classid="java:program.start".
            # For now, we'll stick with the RFC 1808 method.
            &add_url(&absolute_url($tag->{'attrib'}{'classid'}, $codebase),
                     $referer, $depth, 0)
                if length($tag->{'attrib'}{'classid'}) ;

            # <object> tag's "archive" attribute is a space-separated list
            foreach (split(/\s+/, $tag->{'attrib'}{'archive'})) {
                &add_url(&absolute_url($_, $codebase), $referer, $depth, 0) ;
            }

        } elsif ($tag->{'name'} eq 'head') {
            # "profile" attribute is a space-separated list
            foreach (split(/\s+/, $tag->{'attrib'}{'profile'})) {
                &add_url(&absolute_url($_, $baseurl), $referer, $depth, 0) ;
            }

        } elsif ($tag->{'name'} eq 'applet') {
            # As with <object>, codebase sets the base for code/object/archive.
            my($codebase)=
                &absolute_url($tag->{'attrib'}{'codebase'}, $baseurl) ;

            &add_url(&absolute_url($tag->{'attrib'}{'code'}, $codebase),
                     $referer, $depth, 0)
                if length($tag->{'attrib'}{'code'}) ;

            &add_url(&absolute_url($tag->{'attrib'}{'object'}, $codebase),
                     $referer, $depth, 0)
                if length($tag->{'attrib'}{'object'}) ;

            # <applet> tag's "archive" attribute is a comma-separated list
            foreach (split(/\s*,\s*/, $tag->{'attrib'}{'archive'})) {
                &add_url(&absolute_url($_, $codebase), $referer, $depth, 0) ;
            }

        # Check form script for existence only, but don't follow hyperlinks.
        # Handles the unlikely case when a CGI is referred to by both a
        # <form action=___.cgi> tag and an <a href=___.cgi> tag:  If a CGI
        # is only pointed to by <form action>, then don't follow the link
        # (i.e. set $url->{'dontfollow'} to verify, not load).  If a CGI
        # is called by at least one <a href> tag, then do follow the link.
        } elsif ($tag->{'name'} eq 'form') {
            # The extra (undef, 1, 1) args presumably set the iscgi /
            # dontfollow behavior described above -- confirm against
            # &add_url()'s signature.
            &add_url(&absolute_url($tag->{'attrib'}{'action'}, $baseurl),
                     $referer, $depth, undef, 1, 1)
                if length($tag->{'attrib'}{'action'}) ;

        } # if ($tag->{'name'} eq '...')

    } # foreach $tag (@tags)

} # &extract_urls()
|
1674 |
|
1675 |
|
1676 |
|
1677 #----- output routines ------------------------------------------------ |
|
1678 |
|
# Generate the final report, dispatching on the -v (verbose) flag.
sub make_report {
    if ($verbose_report) {
        &make_verbose_report ;
    }
    else {
        &make_short_report ;
    }
}
|
1684 |
|
1685 |
|
1686 |
|
# Generate a verbose report of all URLs that have been checked.
# Writes to STDOUT: a header describing the run and its filters, then
# (only if non-empty) a section of URLs with selected response status
# codes -- each with its full chain of referring pages -- then (only if
# non-empty) a section of server-side-include errors, and finally a
# closing rule.  Reads the global %url table and the filter lists.
sub make_verbose_report {
    my($rootlist)= join("\n ", @ARGV) ;
    my($numurls)= scalar keys %url ;

    print <<EOH ;
======================================================================
Report of invalid links on $LOCAL_HOST,
recursively followed starting with the files/URLs:

$rootlist

EOH

    if (@INCLUDE_PATTERNS) {
        my($includelist)= join("\n ", @INCLUDE_PATTERNS) ;
        print <<EOH ;
Only including URLs matching at least one of the following patterns:
$includelist

EOH
    }

    if (@EXCLUDE_PATTERNS) {
        my($excludelist)= join("\n ", @EXCLUDE_PATTERNS) ;
        print <<EOH ;
Excluding URLs matching any of the following patterns:
$excludelist

EOH
    }

    print "Total URLs checked: $numurls\n\n" ;

    print( "Only reporting response codes beginning with: ",
           join(", ", @INCLUDE_STATUS), "\n" )
        if @INCLUDE_STATUS ;

    print( "Excluding response codes beginning with: ",
           join(", ", @EXCLUDE_STATUS), "\n" )
        if @EXCLUDE_STATUS ;

    print "Maximum traversal depth: $max_depth\n" if length($max_depth) ;
    print "Only local files were checked; no CGI scripts were invoked.\n"
        if $file_check ;
    print "Local URLs were read from the filesystem where possible.\n"
        unless $full_http_check ;

    print "\n" ;

    # Only report status errors if there are any.
    # Using grep()s is slightly inefficient, but a *lot* cleaner.
    my($has_statuserrs) ;
    foreach my $URL (sort keys %url) {
        # Same include/exclude filtering as the report loop below; if
        # any URL survives the filters, the section must be printed.
        next if @INCLUDE_STATUS &&
                !grep($url{$URL}{'status'}=~ /^$_/, @INCLUDE_STATUS) ;
        next if grep($url{$URL}{'status'}=~ /^$_/, @EXCLUDE_STATUS) ;

        $has_statuserrs=1, last ;
    }


    if ($has_statuserrs) {
        print <<EOH ;
======================================================================

RESULTS FROM ALL URLS WITH SELECTED RESPONSE STATUS CODES
------------------------------------------------------------

EOH

        foreach (sort keys %url) {
            my($u)= $url{$_} ;

            # Empty @INCLUDE_STATUS means "no include restriction".
            next if @INCLUDE_STATUS &&
                    !grep($u->{'status'}=~ /^$_/, @INCLUDE_STATUS) ;
            next if grep($u->{'status'}=~ /^$_/, @EXCLUDE_STATUS) ;

            print "$u->{'URL'}\n", '-' x 50, "\n" ;
            print "Status: $u->{'status'}\n" ;
            print "Moved to: $u->{'location'}\n" if $u->{'location'} ;
            print "Depth: $u->{'depth'}\n" ;

            # Walk up the chain of referring pages, indenting one more
            # space per level.  The loop ends when $URL is not a key of
            # %url (length() of the undef lookup is false) -- e.g. at a
            # seed URL with no referer.
            for (my($URL)= $u->{'referer'}, my($tab)= ' ' ;
                 length($url{$URL}) ;
                 $URL= $url{$URL}{'referer'}, $tab.= ' ' )
            {
                print "${tab}referred by $url{$URL}{'URL'}\n" ;
                die "PROGRAM ERROR: apparent infinite referer loop.\n"
                    if length($tab)>200 ;   # simple-minded sanity check
            }

            print "\n\n" ;
        } # URL

    }


    # Only report SSI errors if there are any
    my($has_ssierrs) ;
    foreach (sort keys %url) {
        $has_ssierrs=1, last if @{$url{$_}{'ssierrs'}} ;
    }


    if ($has_ssierrs) {
        print <<EOH ;
======================================================================

PROBLEMS WITH SERVER-SIDE INCLUDE (AKA SHTML) DIRECTIVES
------------------------------------------------------------

EOH
        foreach (sort keys %url) {
            my($u)= $url{$_} ;
            if (@{$u->{'ssierrs'}}) {
                print "$u->{'URL'}\n", '-' x 50, "\n" ;
                printf "Total %s SSI Errors:\n", $#{$u->{'ssierrs'}}+1 ;
                # Each error is a chain of included files; print the
                # chain with increasing indentation per include level.
                foreach my $i (0..$#{$u->{'ssierrs'}}) {
                    printf " %s) ", $i+1 ;
                    # Indent past the " N) " item number.
                    my($tab)= ' ' x (4+length($i+1)) ;
                    my($tab2) ;   # undef so the first "file:" follows the number
                    foreach my $level (@{$u->{'ssierrs'}[$i]}) {
                        print "${tab2}file: $level->{'path'}\n" ;
                        print "${tab}in tag: $level->{'tag'}\n" ;
                        # errmsg is usually set only on the last file in
                        # the include-chain.
                        print "${tab}error: $level->{'errmsg'}\n"
                            if $level->{'errmsg'} ;
                        $tab.= ' ' ;
                        $tab2= $tab ;
                    }
                }
                print "\n\n" ;
            }
        }
    }


    unless ($has_statuserrs or $has_ssierrs) {
        print <<EOH ;
======================================================================

<< NO ERRORS FOUND >>

EOH
    }

    print '=' x 70, "\n" ;

} # &make_verbose_report()
|
1837 |
|
1838 |
|
# Generate a one-line-per-URL report: a total count, then each URL that
# passes the -i/-x status filters on its own line as "URL<TAB>status".
sub make_short_report {
    my($numurls)= scalar keys %url ;

    print "Total $numurls URLs checked\n" ;

    foreach my $URL (sort keys %url) {
        my($status)= $url{$URL}{'status'} ;

        # Apply the include filter (an empty list means "include all").
        if (@INCLUDE_STATUS) {
            next unless grep($status=~ /^$_/, @INCLUDE_STATUS) ;
        }
        # Apply the exclude filter.
        next if grep($status=~ /^$_/, @EXCLUDE_STATUS) ;

        print "$url{$URL}{'URL'}\t$url{$URL}{'status'}\n" ;
    }
}
|
1852 |
|
1853 |
|
1854 |
|
# Print a summary of usage and exit.
# Called for the -? option; writes the help text to STDOUT and exits
# with status 0 -- this sub never returns.
sub usage {
    print <<EOF ;

Checklinks $CL_VERSION

To recursively check all HTML links on the local site, enter:
    $0 <options> [ start-file | start-URL ] ...

Options include:
    -v                    Generate full (verbose) report, including full
                          referral information and detailed SSI error
                          reporting.
    -I <include-pattern>  Only check URLs matching <include-pattern>.
    -X <exclude-pattern>  Don't check URLs matching <exclude-pattern>.
    -i <include-status>   Only report URLs whose response code starts with
                          the pattern <include-status>.
    -x <exclude-status>   Don't report URLs whose response code starts with
                          the pattern <exclude-status>.  Default is to
                          exclude "200" responses only (i.e. "-x 200").
    -d <max-depth>        Traverse links no deeper than <max-depth>.
    -f                    "File mode"-- only read files from the filesystem;
                          do not go through the HTTP server at all.  This
                          will skip all URLs that point to CGI scripts.
    -h                    "HTTP mode"-- use HTTP to check ALL URLs, even
                          if they could be read from the filesystem.
                          Incompatible with "-f" option.
    -c <config-file>      Read appropriate configuration parameters from
                          <config-file>, typically srm.conf.  Use '-' to
                          read from STDIN.  If a directory is named, use
                          "srm.conf" in that directory.
    -q                    Print current configuration parameters.
    -?                    Print this help message.
    --                    End command-line option processing.

Don't stack options like "-vf"; use "-v -f" instead.

For -I, -X, -i, and -x:
    Values are interpreted as Perl 5 regular expressions.
    Use multiple options to build a list (e.g. "-I include1 -I include2").
    Use a value of '' to clear a list (e.g. -x '' means "report all responses",
      "-x '' -x 401" means "report all but 401 responses").
    As a special case, an empty -I or -i list implies no include-restrictions.
    If an item is in both the include and exclude list, it is excluded.
    Note that -I and -X restrict which URLs are traversed into, so they may
      prune large areas of your Web structure.

EOF
    exit ;   # exit status 0: asking for help is not an error
}
|
1905 |
|
1906 |
|
1907 #----- debugging routines below --------------------------------------- |
|
1908 |
|
# Print current configuration settings in human-readable form (the -q
# command-line option).  Reads only global configuration variables.
#
# BUG FIX: in the four alias sections below, the original code wrote
#     sort keys %HASH,
#     "\n"
# which parsed as sort(keys(%HASH), "\n") -- the "\n" became part of the
# sorted key list and was run through the enclosing map/sprintf, printing
# a bogus alias entry instead of the intended trailing blank line.
# Parenthesizing the map/sort restores "\n" as an argument to print
# (matching the correctly parenthesized pattern-list sections above).
sub print_config {
    print "\n----- OPTIONS SPECIFIC TO THIS EXECUTION -----------------------------\n\n" ;

    print "Include only URLs containing one of the following patterns:\n",
          ( map { " $_\n" } @INCLUDE_PATTERNS ),
          "\n"
        if @INCLUDE_PATTERNS ;

    print "Exclude URLs containing one of the following patterns:\n",
          ( map { " $_\n" } @EXCLUDE_PATTERNS ),
          "\n"
        if @EXCLUDE_PATTERNS ;

    print "Only report response codes beginning with: ",
          join(", ", @INCLUDE_STATUS), "\n"
        if @INCLUDE_STATUS ;

    print "Don't report response codes beginning with: ",
          join(", ", @EXCLUDE_STATUS), "\n"
        if @EXCLUDE_STATUS ;

    print "\nMaximum search depth: $max_depth\n" if length($max_depth) ;

    print <<EOS ;

----- INSTALLATION PARAMETERS ----------------------------------------

Local Host: $LOCAL_HOST
Document Root: $DOCUMENT_ROOT
User Web Directory: $USER_DIR
Default Filename(s): @DIRECTORY_INDEX

File extensions indicating a CGI program: @CGI_EXTENSIONS
File extensions indicating server-parsed HTML: @SHTML_EXTENSIONS

EOS

    # Parentheses keep the trailing "\n" out of the sorted key list.
    print "Directory Aliases:\n",
          ( map { sprintf(" %15s ==> %s\n", $_, $ALIAS{$_}) }
                sort keys %ALIAS ),
          "\n"
        if keys %ALIAS ;

    print "Directory Regular Expression Aliases:\n",
          ( map { sprintf(" %15s ==> %s\n", $_, $ALIAS_MATCH{$_}) }
                sort keys %ALIAS_MATCH ),
          "\n"
        if keys %ALIAS_MATCH ;

    print "Script Directory Aliases:\n",
          ( map { sprintf(" %15s ==> %s\n", $_, $SCRIPT_ALIAS{$_}) }
                sort keys %SCRIPT_ALIAS ),
          "\n"
        if keys %SCRIPT_ALIAS ;

    print "Script Directory Regular Expression Aliases:\n",
          ( map { sprintf(" %15s ==> %s\n", $_, $SCRIPT_ALIAS_MATCH{$_}) }
                sort keys %SCRIPT_ALIAS_MATCH ),
          "\n"
        if keys %SCRIPT_ALIAS_MATCH ;

    if ($ENV{'http_proxy'}) {
        print "HTTP Proxy: $ENV{'http_proxy'}\n" ;
        print "Except to hosts ending in: $ENV{'no_proxy'}\n"
            if $ENV{'no_proxy'} ;
        print "\n" ;
    } else {
        print "Not using any HTTP Proxy.\n" ;
    }

    print "Maximum HTTP redirections allowed for a URL: $MAX_REDIRECTS\n",
          "Maximum number of tries to get a single URL: $MAX_ATTEMPTS\n\n";

    print '-' x 70, "\n\n" ;

}
|
1986 |
|
1987 |
|
1988 |
|
# Dump the configuration in a terse bracketed format for debugging
# (values come mostly from srm.conf).  Reads only global config variables.
sub print_debug_config {
    print <<EOF ;
DOCUMENT_ROOT= [$DOCUMENT_ROOT]
USER_DIR= [$USER_DIR]
DIRECTORY_INDEX= [@DIRECTORY_INDEX]

CGI_EXTENSIONS= [@CGI_EXTENSIONS]
SHTML_EXTENSIONS= [@SHTML_EXTENSIONS]

EOF

    # One line per alias, each hash in sorted-key order.
    print "ALIAS{$_}= [$ALIAS{$_}]\n"
        foreach sort keys %ALIAS ;
    print "ALIAS_MATCH{$_}= [$ALIAS_MATCH{$_}]\n"
        foreach sort keys %ALIAS_MATCH ;
    print "SCRIPT_ALIAS{$_}= [$SCRIPT_ALIAS{$_}]\n"
        foreach sort keys %SCRIPT_ALIAS ;
    print "SCRIPT_ALIAS_MATCH{$_}= [$SCRIPT_ALIAS_MATCH{$_}]\n"
        foreach sort keys %SCRIPT_ALIAS_MATCH ;

}
|
2011 |
|
2012 |
|
2013 #---------------------------------------------------------------------- |
|
2014 # |
|
2015 # SPARSE DOCUMENTATION ON PROGRAM INTERNALS: |
|
2016 # |
|
2017 #---------------------------------------------------------------------- |
|
2018 # |
|
2019 # URLs are read directly from the local filesystem if possible, or |
|
2020 # downloaded or checked with HTTP GET and HEAD methods if not. CGI |
|
2021 # scripts are called through the HTTP server, even if local (since the |
|
2022 # resource their URLs refer to are not files). |
|
2023 # |
|
2024 # |
|
2025 # Global variables: |
|
2026 # |
|
2027 # %url holds all data regarding all URLs, using the text URL as the key: |
|
2028 # |
|
2029 # %url{$URL}{qw( URL filename referer depth |
|
2030 # ishtml iscgi islocal dontfollow |
|
2031 # status location numredirects numtries lasttried |
|
2032 # ssierrs |
|
2033 # hasbeenloaded)} <== only for testing |
|
2034 # URL is same as hash key |
|
2035 # ishtml means is type text/html; links MAY be extracted. (Used in |
|
2036 # main loop.) |
|
2037 # iscgi means is a CGI script; MUST NOT be read directly from |
|
2038 # filesystem. (Used in verify_url() and load_url().) |
|
2039 # islocal means served by local server: |
|
2040 # a) MAY be read directly from filesystem |
|
2041 # b) links MAY be extracted |
|
2042 # dontfollow means links MUST NOT be extracted. |
|
2043 # status begins with a number, but may have text afterward. |
|
2044 # |
|
2045 # islocal= (URL=~ m#^http://$LOCAL_HOST/#i) , i.e. redundant. |
|
2046 # ishtml and iscgi are guesses, may be 0, 1, or '' for unset. |
|
2047 # iscgi only matters for local URLs, but may safely be set for |
|
2048 # remote URLs. |
|
2049 # dontfollow is a hack to handle an obscure CGI-related situation. |
|
2050 # |
|
2051 # In addition to the standard HTTP status codes, 'status' may take one |
|
2052 # of the following values: |
|
2053 # 450 Can't read file: $! |
|
2054 # 451 SSI Error(s) (__ total) |
|
2055 # 600 Can't create socket: $@ |
|
2056 # 601 Connection timed out |
|
2057 # 602 Incomplete response (%s of %s bytes) |
|
2058 # 603 Incomplete chunked response |
|
2059 # |
|
2060 # Handling of 602 and 603 is somewhat hackish-- overwrites real status |
|
2061 # line with artificial one. |
|
2062 # |
|
2063 # $url->{'ssierrs'} is a list of errors relating to SSI includes. |
|
2064 # Each error has a list of filenames representing the include |
|
2065 # chain. Each element in the chain stores the absolute |
|
2066 # pathname, the <!--#include--> tag which called it, and |
|
2067 # possibly an error message (usually only on the last file in |
|
2068 # the include-chain). Thus, the entire structure is |
|
2069 # |
|
2070 # $url->{'ssierrs'}[$errid][$levelsdeep]{qw( path tag errmsg )} |
|
2071 # |
|
2072 # This is only a memory hog with many errors of deeply nested files. |
|
2073 # |
|
2074 # |
|
2075 # @urlstoget: canonicalized list of URLs left to get, in order |
|
2076 # of appearance. List elements are pointers to %url elements. |
|
2077 # This array may grow from &extract_urls(), or be rearranged when |
|
2078 # a URL needs to be retried. |
|
2079 # |
|
2080 # Note that $url is normally a pointer, and $URL is normally a string. |
|
2081 # |
|
2082 # Files/paths are canonicalized by removing unneeded "./" and "../" |
|
2083 # segments. |
|
2084 # Directories are canonicalized to always end in "/", including in URLs; |
|
2085 # note that some system variables and local directory variables aren't so. |
|
2086 # URLs that point to "/" end in "/", e.g. "http://www.foo.com/". |
|
2087 # |
|
2088 # |
|
2089 # This program has a lot of unused functionality for possible extension, |
|
2090 # so don't get confused by routines that do more than they are used for. |
|
2091 # |
|
2092 #---------------------------------------------------------------------- |
|
2093 # |
|
2094 # To do: |
|
2095 # |
|
2096 # * If you want a different output format (like in HTML), it should |
|
2097 # be easy enough to add. |
|
2098 # * This could be extended to check remote sites without much trouble. |
|
2099 # * Should probably support directories that don't have index.html. |
|
2100 # * Check that pointed-to fragments exist in target files |
|
2101 # * keep track of servers and don't hit them too often |
|
2102 # * support robots.txt |
|
2103 # * add option to read list of URLs from file |
|
2104 # * support HTTP Basic Authentication somehow. Note that currently, this |
|
2105 # reads local files even when they should be protected by .htaccess |
|
2106 # (unless using -h option). |
|
2107 # * Read list of local host names from gethostbyname() ? |
|
2108 # * check read permissions on files? |
|
2109 # * In the event multiple URLs redirect to the same URL, this could be a |
|
2110 # little more efficient if it saved the redirected loads/downloads. |
|
2111 # |
|
2112 #---------------------------------------------------------------------- |