# Copyright (c) 1994 Regents of the University of California.
# All rights reserved.
# $Id: momhistory.pl,v 1.8 1994/08/10 10:18:29 fielding Exp $
# ---------------------------------------------------------------------------
# momhistory: A package for remembering the results of web traversals
#             for a World-Wide Web spider. This code is separated as
#             a package so that, if needed, this in-memory version can
#             be easily replaced by a database version without affecting
#             the spider's interface.
#
# This software has been developed by Roy Fielding as
# part of the Arcadia project at the University of California, Irvine.
#
# Redistribution and use in source and binary forms are permitted,
# subject to the restriction noted below, provided that the above
# copyright notice and this paragraph and the following paragraphs are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed in part by the University of
# California, Irvine. The name of the University may not be used to
# endorse or promote products derived from this software without
# specific prior written permission. THIS SOFTWARE IS PROVIDED ``AS IS''
# AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
# LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE.
#
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#
# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION
# (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF THE UNIVERSITY
# OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# If you have any suggestions, bug reports, fixes, or enhancements,
# send them to the author Roy Fielding at .
# ---------------------------------------------------------------------------
require "www.pl";
require "wwwerror.pl";
require "momconfig.pl";
require "momavoid.pl";

package momhistory;

# ==========================================================================
# Set things which should not normally be changed at installation time.

# Status of a seen node (yes, the order is very important)
#
# The values form an ordered progression.  The routines in this package
# compare them numerically: a stored status is only ever replaced by a
# larger one, "tested" means >= $S_tested_unknown, and "traversing" means
# >= $S_will_traverse.  Renumbering or reordering them changes behavior.

$S_not_seen = 0;          # Node has not yet been seen (also undefined)
$S_seen_not_tested = 1;   # Node has been seen but not yet tested
$S_avoided = 2;           # Node has been avoided (see momavoid.pl)
$S_will_leaf = 3;         # Node will be leafed (see momavoid.pl)
$S_will_test = 4;         # Node will be tested
$S_tested_unknown = 5;    # Node has been tested but has unknown leaf status
$S_leafed = 6;            # Node has been tested and determined to be a leaf
$S_will_traverse = 7;     # Node has been placed on the traversal queue
$S_traversed = 8;         # Node has been traversed

# ==========================================================================
# Figure out what to do given the following response codes. Note that many
# of these should never be generated by a normal GET or HEAD request.
$DO_continue = 1;        # continue processing
$DO_ok_stop  = 2;        # discontinue processing, no error
$DO_redirect = 3;        # redirected URL
$DO_broken   = 4;        # signal broken link

# Action table: response code (names defined in package wwwerror) -> one of
# the $DO_* actions above.  Stored as a flat (key, value, ...) list.
%WhatToDo = (
    $wwwerror'RC_unknown,                 $DO_broken,     # 000
    $wwwerror'RC_ok,                      $DO_continue,   # 200
    $wwwerror'RC_created,                 $DO_ok_stop,    # 201
    $wwwerror'RC_accepted,                $DO_ok_stop,    # 202
    $wwwerror'RC_partial,                 $DO_continue,   # 203
    $wwwerror'RC_no_response,             $DO_ok_stop,    # 204
    $wwwerror'RC_moved,                   $DO_redirect,   # 301
    $wwwerror'RC_found,                   $DO_redirect,   # 302
    $wwwerror'RC_method,                  $DO_ok_stop,    # 303
    $wwwerror'RC_not_modified,            $DO_ok_stop,    # 304
    $wwwerror'RC_bad_request,             $DO_broken,     # 400
    $wwwerror'RC_unauthorized,            $DO_ok_stop,    # 401
    $wwwerror'RC_payment_required,        $DO_ok_stop,    # 402
    $wwwerror'RC_forbidden,               $DO_broken,     # 403
    $wwwerror'RC_not_found,               $DO_broken,     # 404
    $wwwerror'RC_internal_error,          $DO_broken,     # 500
    $wwwerror'RC_not_implemented,         $DO_broken,     # 501
    $wwwerror'RC_bad_response,            $DO_broken,     # 502
    $wwwerror'RC_too_busy,                $DO_broken,     # 503
    $wwwerror'RC_bad_request_client,      $DO_broken,     # 600
    $wwwerror'RC_not_implemented_client,  $DO_ok_stop,    # 601
    $wwwerror'RC_connection_failed,       $DO_broken,     # 602
    $wwwerror'RC_timed_out,               $DO_broken,     # 603
);

# ==========================================================================
# Start with an empty history as soon as this file is loaded.
&init;

# ==========================================================================
# ==========================================================================
# init(): Initialize history information
#
# Takes no arguments.  Every package variable that holds history is reset
# to its empty value:
#
#   %Tops        record number of a task Top node -> Index URL
#   %Visited     URL visited -> record number (the index into @Vis*)
#   $VisNumber   number of URLs visited since process start
#   @VisURL      URL of node (maps record number -> URL visited)
#   @VisStatus   status of a seen node (one of the $S_* values)
#   @VisRespCode server response code from last access
#   @VisConType  MIME Content-type of response
#   @VisRedirect redirected URL (from a 302 Moved response)
#   @VisTitle    title text from headers or last traversal
#   @VisOwner    owner name from headers or last traversal
#   @VisReplyTo  Reply-To address from headers or last traversal
#   @VisLastMod  Last-modified date from headers
#   @VisExpires  Expires date from headers
#   @VisInTask   seen during the current task?
#   @VisLocal    URL considered to be local to this network?
#
# Note that the above is why Perl desperately needs arrays of arrays.
# If it had that, I could just use @VisMetainfo and let the other packages
# be free to use whatever is in the metainfo associative array.
#
sub init
{
    # Lookup tables and the record counter.
    %Tops = %Visited = ();
    $VisNumber = 0;

    # Per-record bookkeeping.
    @VisURL = @VisStatus = @VisRespCode = @VisInTask = @VisLocal = ();

    # Per-record metainformation taken from response headers.
    @VisConType = @VisRedirect = @VisTitle = @VisOwner = ();
    @VisReplyTo = @VisLastMod = @VisExpires = ();
}

# ==========================================================================
# clear(): Clear all history information
#
# Takes no arguments.  Unlike init(), the variables are undefined rather
# than assigned empty values, so $VisNumber is left undefined afterwards.
#
sub clear
{
    # Lookup tables and the record counter.
    undef(%Tops);
    undef(%Visited);
    undef($VisNumber);

    # Per-record bookkeeping.
    undef(@VisURL);
    undef(@VisStatus);
    undef(@VisRespCode);
    undef(@VisInTask);
    undef(@VisLocal);

    # Per-record metainformation.
    undef(@VisConType);
    undef(@VisRedirect);
    undef(@VisTitle);
    undef(@VisOwner);
    undef(@VisReplyTo);
    undef(@VisLastMod);
    undef(@VisExpires);
}

# ==========================================================================
# reset_traversal(): Reset the status of all visited nodes so that they
#                    are no longer considered traversed.
#
# Takes no arguments.  For every record that still has a URL:
#   - the "seen during the current task" flag is cleared;
#   - any status beyond $S_tested_unknown drops back to $S_tested_unknown;
#   - the pending states ($S_seen_not_tested, $S_will_leaf, $S_will_test)
#     drop back to $S_not_seen;
#   - every other status (e.g. $S_avoided) is left alone.
#
sub reset_traversal
{
    local($rec, $cstat);

    foreach $rec (1..$VisNumber)
    {
        next unless defined($VisURL[$rec]);    # no URL, e.g. after forget()

        $VisInTask[$rec] = 0;

        $cstat = $VisStatus[$rec];
        if ($cstat > $S_tested_unknown)
        {
            $VisStatus[$rec] = $S_tested_unknown;
        }
        elsif (($cstat == $S_seen_not_tested) ||
               ($cstat == $S_will_leaf)       ||
               ($cstat == $S_will_test))
        {
            $VisStatus[$rec] = $S_not_seen;
        }
    }
}

# ==========================================================================
# remember(): See if the passed-in absolute URL is in our history.
#             If so, update its status iff the new status is more advanced
#             and return the index to its history record.
#             If not, create a history record for it with the given status.
#
# Arguments:
#   $url    - absolute URL of the node
#   $status - one of the $S_* traversal status values
# Returns the record number (the index into the @Vis* arrays).
# Side effect: the record is flagged as seen during the current task.
#
sub remember
{
    local($url, $status) = @_;
    local($vidx, $cstat);

    $vidx = $Visited{$url};
    if (!$vidx)
    {
        $vidx = &new_record($url, $status);
    }
    else
    {
        # Known URL: the status may only move forward, never backward.
        $cstat = $VisStatus[$vidx];
        if (!defined($cstat) || ($cstat < $status))
        {
            $VisStatus[$vidx] = $status;
        }
    }
    $VisInTask[$vidx] = 1;
    return $vidx;
}

# ==========================================================================
# new_record(): Private helper shared by remember() and remember_tops().
#               Allocate the next record number for a URL that is not yet
#               in the history and fill in its initial fields.
#
# Arguments:
#   $url    - absolute URL of the node (must not already be in %Visited)
#   $status - initial $S_* status for the record
# Returns the new record number.  The response code starts at 0 and the
# record is marked local when the URL uses the file: scheme or its host
# part matches $momconfig'LocalNetwork.  @VisInTask is left to the caller.
#
sub new_record
{
    local($url, $status) = @_;
    local($vidx);

    $vidx = ++$VisNumber;
    $Visited{$url}      = $vidx;
    $VisURL[$vidx]      = $url;
    $VisStatus[$vidx]   = $status;
    $VisRespCode[$vidx] = 0;
    if (($url =~ m#^file:#) ||
        ($url =~ m#://[^/]*$momconfig'LocalNetwork#io))
    {
        $VisLocal[$vidx] = 1;
    }
    return $vidx;
}

# ==========================================================================
# remember_tops(): This is a special version of remember so that all of the
#                  task tops (the top URL for each task to be traversed) are
#                  placed in memory and the associated Index URL is saved so
#                  that cross-references can be made between indexes.
#
# Arguments (passed as typeglobs):
#   *topurl   - array of top URLs, indexed by task number starting at 1
#   *topindex - array of Index URLs, indexed the same way
#
# A top URL that is not yet in the history gets a fresh record with status
# $S_not_seen that is not counted as seen in the current task.  A top URL
# that is already in the history keeps its record untouched but is still
# registered in %Tops; previously such a URL was silently skipped, leaving
# isa_top() unaware of it.  When several tasks share a top URL, the first
# defined Index URL wins.
#
sub remember_tops
{
    local(*topurl, *topindex) = @_;
    local($task, $vidx, $url);

    foreach $task (1 .. $#topurl)
    {
        next unless ($url = $topurl[$task]);    # task has no top URL

        $vidx = $Visited{$url};
        if (!$vidx)                             # If not already in history
        {
            $vidx = &new_record($url, $S_not_seen);
            $VisInTask[$vidx] = 0;
        }
        if (!defined($Tops{$vidx}))             # If not already on Tops
        {
            $Tops{$vidx} = $topindex[$task];
        }
    }
}

# ==========================================================================
# forget(): Remove the passed-in absolute URL from our history.
#           Returns the old history record number (or 0 if already forgotten).
#           Note that this does not reclaim the record number.
#
# Argument:
#   $url - absolute URL to remove
#
# Every per-record slot is undefined and the URL is dropped from %Visited.
# The record's entry in %Tops is dropped as well, so isa_top() no longer
# reports a forgotten record as a task top.
#
sub forget
{
    local($url) = @_;
    local($vidx);

    $vidx = $Visited{$url};
    return 0 unless ($vidx);                 # Already forgotten

    undef $VisURL[$vidx];
    undef $VisStatus[$vidx];
    undef $VisRespCode[$vidx];
    undef $VisConType[$vidx];
    undef $VisRedirect[$vidx];
    undef $VisTitle[$vidx];
    undef $VisOwner[$vidx];
    undef $VisReplyTo[$vidx];
    undef $VisLastMod[$vidx];
    undef $VisExpires[$vidx];
    undef $VisInTask[$vidx];
    undef $VisLocal[$vidx];

    delete $Tops{$vidx};                     # no stale Top for a dead record
    delete $Visited{$url};

    return $vidx;
}

# ==========================================================================
# store(): Update the metainformation stored for the passed-in record number
#          along with its status and the response code from a recent WWW
#          request.  Returns 1 if okay, 0 if the record was not found.
#
# Arguments:
#   $vidx     - record number
#   $status   - new $S_* status; applied only if more advanced than the
#               current one
#   $respcode - response code of the request; always stored
#   *headers  - typeglob of an associative array of response headers keyed
#               by lower-case header name
#
# Only headers with a true value overwrite what is already stored, so an
# absent header leaves the earlier metainformation in place.
#
sub store
{
    local($vidx, $status, $respcode, *headers) = @_;
    local($nextbit, $cstat);

    return 0 unless defined($VisURL[$vidx]);

    $VisRespCode[$vidx] = $respcode;         # Latest response code

    $cstat = $VisStatus[$vidx];
    if (!defined($cstat) || ($cstat < $status))
    {
        # A node that was scheduled to be leafed becomes a leaf as soon as
        # it has been tested.
        if (($status == $S_tested_unknown) && ($cstat == $S_will_leaf))
        {
            $status = $S_leafed;
        }
        $VisStatus[$vidx] = $status;         # Traversal status
    }

    if ($nextbit = $headers{'content-type'})
    {
        $VisConType[$vidx] = $nextbit;       # MIME Content-type
    }
    if ($nextbit = ($headers{'uri'} || $headers{'location'}))
    {
        $VisRedirect[$vidx] = $nextbit;      # Redirected URL
    }
    if ($nextbit = $headers{'title'})
    {
        $VisTitle[$vidx] = $nextbit;         # Title text
    }
    if ($nextbit = $headers{'owner'})
    {
        $VisOwner[$vidx] = $nextbit;         # Owner name
    }
    if ($nextbit = $headers{'reply-to'})
    {
        $VisReplyTo[$vidx] = $nextbit;       # Reply-To address
    }
    if ($nextbit = $headers{'last-modified'})
    {
        $VisLastMod[$vidx] = $nextbit;       # Last-modified date
    }
    if ($nextbit = $headers{'expires'})
    {
        $VisExpires[$vidx] = $nextbit;       # Expires date
    }
    return 1;
}

# ==========================================================================
# recall(): Pass back the metainformation stored for the given record number
#           inside the %headers array and return the most
#           recent response code.
#
# Arguments:
#   $vidx    - record number
#   *headers - typeglob of the associative array to fill in
# Returns 0 when the record has no URL.  Only fields holding a true value
# are copied, so other keys of %headers are left as the caller had them.
#
sub recall
{
    local($vidx, *headers) = @_;

    return 0 unless defined($VisURL[$vidx]);

    $headers{'content-type'}  = $VisConType[$vidx]  if ($VisConType[$vidx]);
    $headers{'uri'}           = $VisRedirect[$vidx] if ($VisRedirect[$vidx]);
    $headers{'title'}         = $VisTitle[$vidx]    if ($VisTitle[$vidx]);
    $headers{'owner'}         = $VisOwner[$vidx]    if ($VisOwner[$vidx]);
    $headers{'reply-to'}      = $VisReplyTo[$vidx]  if ($VisReplyTo[$vidx]);
    $headers{'last-modified'} = $VisLastMod[$vidx]  if ($VisLastMod[$vidx]);
    $headers{'expires'}       = $VisExpires[$vidx]  if ($VisExpires[$vidx]);

    return $VisRespCode[$vidx];              # Latest response code
}

# ==========================================================================
# get_url(): Return the stored URL of the given record number.
#
sub get_url
{
    local($vidx) = @_;

    return $VisURL[$vidx];
}

# ==========================================================================
# get_status(): Return the current traversal status of the given node.
#               The node may be a record number or a URL.
#
# An argument made up only of digits is taken as a record number; anything
# else is looked up as a URL.  Nodes without a stored URL report
# $S_not_seen.
#
sub get_status
{
    local($node) = @_;
    local($vidx);

    $vidx = ($node =~ /^\d+$/) ? $node : $Visited{$node};

    return defined($VisURL[$vidx]) ? $VisStatus[$vidx] : $S_not_seen;
}

# ==========================================================================
# set_status(): Set the current traversal status of the given node.
#               The node must be a record number.
#               Returns true (1) if okay, else false (0) if record not found.
#
# The stored status only moves forward: a $status that is not greater than
# the current one is ignored, although the call still returns 1.
#
sub set_status
{
    local($vidx, $status) = @_;
    local($cstat);

    return 0 unless defined($VisURL[$vidx]);

    $cstat = $VisStatus[$vidx];
    $VisStatus[$vidx] = $status
        unless (defined($cstat) && ($cstat >= $status));

    return 1;
}

# ==========================================================================
# was_avoided(): Return true (1) if the given node has been avoided.
#                The passed value must be a record number.
#
# A record without a stored URL is reported as not avoided (0).
#
sub was_avoided
{
    local($vidx) = @_;

    return defined($VisURL[$vidx]) ? ($VisStatus[$vidx] == $S_avoided) : 0;
}

# ==========================================================================
# was_tested(): Return true (1) if the given node has been tested.
#               The passed value must be a record number.
#
# "Tested" covers $S_tested_unknown and every later status.
#
sub was_tested
{
    local($vidx) = @_;

    return defined($VisURL[$vidx])
           ? ($VisStatus[$vidx] >= $S_tested_unknown)
           : 0;
}

# ==========================================================================
# checked_trav(): Return true (1) if the given node will be or has already
#                 been checked for its traversal status.
#                 The passed value must be a record number.
#
# True only for statuses strictly beyond $S_tested_unknown.
#
sub checked_trav
{
    local($vidx) = @_;

    return defined($VisURL[$vidx])
           ? ($VisStatus[$vidx] > $S_tested_unknown)
           : 0;
}

# ==========================================================================
# traversing(): Return true (1) if the given node will be or has already
#               been traversed (or leafed) for this infostructure.
#               The passed value must be a record number.
#
# The test applied is "status >= $S_will_traverse".
#
sub traversing
{
    local($vidx) = @_;

    return defined($VisURL[$vidx])
           ? ($VisStatus[$vidx] >= $S_will_traverse)
           : 0;
}

# ==========================================================================
# isa_top(): Return the index URL if the passed-in node is an index Top.
#            Return undef otherwise.  The passed value must be a record number.
#
sub isa_top
{
    local($vidx) = @_;

    return $Tops{$vidx};
}

# ==========================================================================
# isa_local(): Return true if the passed-in node is considered local.
#              Return undef otherwise.  The passed value must be a record number.
#
sub isa_local
{
    local($vidx) = @_;

    return $VisLocal[$vidx];
}

# ==========================================================================
# must_avoid(): Return true (1) if the given node must be or has already
#               been avoided according to the rules described in momavoid.pl
#               The passed value must be a record number.
#
# The answer is taken from the stored status when the node was already
# classified; otherwise &momavoid'checkurl is asked and its verdict is
# recorded in the status:
#     0 -> no restrictions (status unchanged), returns 0
#     1 -> must leaf this URL ($S_leafed if already tested, else
#          $S_will_leaf), returns 0
#     anything else -> must avoid this URL ($S_avoided), returns 1
# A record with no URL must be avoided.
#
sub must_avoid
{
    local($vidx) = @_;
    local($cstat, $url, $chk);

    $cstat = $VisStatus[$vidx];
    return 1 if ($cstat == $S_avoided);          # Have we already avoided?
    return 0 if (($cstat > $S_avoided) &&        # Have we already checked?
                 ($cstat != $S_tested_unknown));

    $url = $VisURL[$vidx];                       # Get its URL or quit
    return 1 unless ($url);

    $chk = &momavoid'checkurl($url);             # Check Avoid/Leaf status
    return 0 unless ($chk);

    if ($chk != 1)
    {
        $VisStatus[$vidx] = $S_avoided;
        return 1;
    }

    $VisStatus[$vidx] = ($VisStatus[$vidx] == $S_tested_unknown)
                        ? $S_leafed
                        : $S_will_leaf;
    return 0;
}

# ==========================================================================
# nodes_used(): Run at the end of each infostructure to summarize the results
#               and get the number of unique and local nodes for each of the
#               following categories:
#
#     ($TotalNodes,   -- The total number of unique URLs seen
#      $NodesTest,    -- URLs that were tested
#      $NodesTrav,    -- URLs that were traversed
#      $NodesAvd,     -- URLs that were avoided
#      $NodesUnt,     -- URLs that were untestable
#      $TotalLocal,   -- URLs that are local to this network
#      $LocalTest,    -- local URLs that were tested
#      $LocalTrav,    -- local URLs that were traversed
#      $LocalAvd,     -- local URLs that were avoided
#      $LocalUnt,     -- local URLs that were untestable
#     ) = &momhistory'nodes_used;
#
# Only records flagged as seen during the current task are counted.  Each
# one falls into at most one of avoided / untestable / tested, checked in
# that order; "traversed" is a subset of "tested".
#
sub nodes_used
{
    local($vidx, $cstat, $is_loc,
          $seen,  $test,  $trav,  $avoided,  $untested,
          $lseen, $ltest, $ltrav, $lavoided, $luntested);

    $seen  = $test  = $trav  = $avoided  = $untested  = 0;
    $lseen = $ltest = $ltrav = $lavoided = $luntested = 0;

    foreach $vidx (1..$VisNumber)
    {
        next unless ($VisInTask[$vidx]);

        $is_loc = $VisLocal[$vidx];
        $cstat  = $VisStatus[$vidx];

        ++$seen;
        ++$lseen if ($is_loc);

        if ($cstat == $S_avoided)
        {
            ++$avoided;
            ++$lavoided if ($is_loc);
        }
        elsif ($VisRespCode[$vidx] == $wwwerror'RC_not_implemented_client)
        {
            ++$untested;
            ++$luntested if ($is_loc);
        }
        elsif ($cstat >= $S_tested_unknown)
        {
            ++$test;
            ++$ltest if ($is_loc);
            if ($cstat == $S_traversed)
            {
                ++$trav;
                ++$ltrav if ($is_loc);
            }
        }
    }

    return ($seen,  $test,  $trav,  $avoided,  $untested,
            $lseen, $ltest, $ltrav, $lavoided, $luntested);
}

# ==========================================================================
# nodes_processed(): Like nodes_used, except only run at the end of the
#         current process to get the number of unique and local nodes for
#         the entire process in each of the following categories:
#
#     ($ProcTotalNodes,   -- The total number of unique URLs seen
#      $ProcNodesTest,    -- URLs that were tested
#      $ProcNodesAvd,     -- URLs that were avoided
#      $ProcNodesUnt,     -- URLs that were untestable
#      $ProcNodesBroke,   -- URLs that were broken
#      $ProcNodesRedir,   -- URLs that were redirected
#      $ProcTotalLocal,   -- URLs that are local to this network
#      $ProcLocalTest,    -- local URLs that were tested
#      $ProcLocalAvd,     -- local URLs that were avoided
#      $ProcLocalUnt,     -- local URLs that were untestable
#      $ProcLocalBroke,   -- local URLs that were broken
#      $ProcLocalRedir    -- local URLs that were redirected
#     ) = &momhistory'nodes_processed;
#
# Every record that still has a URL is counted, whatever task it was seen
# in.  Broken and redirected are subsets of tested, decided by looking up
# the record's last response code in %WhatToDo.
#
sub nodes_processed
{
    local($vidx, $cstat, $is_loc, $action,
          $seen,  $test,  $avoided,  $untested,  $broke,  $red,
          $lseen, $ltest, $lavoided, $luntested, $lbroke, $lred);

    $seen  = $test  = $avoided  = $untested  = $broke  = $red  = 0;
    $lseen = $ltest = $lavoided = $luntested = $lbroke = $lred = 0;

    foreach $vidx (1..$VisNumber)
    {
        next unless ($VisURL[$vidx]);

        $is_loc = $VisLocal[$vidx];
        $cstat  = $VisStatus[$vidx];

        ++$seen;
        ++$lseen if ($is_loc);

        if ($cstat == $S_avoided)
        {
            ++$avoided;
            ++$lavoided if ($is_loc);
        }
        elsif ($VisRespCode[$vidx] == $wwwerror'RC_not_implemented_client)
        {
            ++$untested;
            ++$luntested if ($is_loc);
        }
        elsif ($cstat >= $S_tested_unknown)
        {
            ++$test;
            ++$ltest if ($is_loc);

            $action = $WhatToDo{$VisRespCode[$vidx]};
            if ($action == $DO_broken)
            {
                ++$broke;
                ++$lbroke if ($is_loc);
            }
            elsif ($action == $DO_redirect)
            {
                ++$red;
                ++$lred if ($is_loc);
            }
        }
    }

    return ($seen,  $test,  $avoided,  $untested,  $broke,  $red,
            $lseen, $ltest, $lavoided, $luntested, $lbroke, $lred);
}

# ==========================================================================

1;