# Copyright (c) 1994 Regents of the University of California.
# All rights reserved.
# $Id: momconfig.pl,v 1.3 1994/08/10 10:18:29 fielding Exp $
# ---------------------------------------------------------------------------
# momconfig: A package for setting the configuration options of a
#            World-Wide Web spider.  This package exists so that all
#            user-configurable defaults can be set in one package and then
#            used by all of the mom* packages.
#
# Before changing things here, the installer should first:
#
#    1) Read and follow the installation instructions in docs/INSTALL.txt
#    2) Set the three things that need to be set in "momspider"
#       a) The first line which specifies the perl interpreter;
#       b) The INClude path for libwww-perl library packages $WWWlib
#       c) The INClude path for MOMspider library packages $MOMlib
#
# Note that, except for LocalNetwork, it may not be necessary to change
# anything here if your operating system setup is similar to mine.
# Note also that this is real Perl code -- if you don't understand the
# syntax, take a look at the Perl manual (man perl) or at one of the many
# hypertext archives of Perl info.
#
# This software has been developed by Roy Fielding as
# part of the Arcadia project at the University of California, Irvine.
#
# Redistribution and use in source and binary forms are permitted,
# subject to the restriction noted below, provided that the above
# copyright notice and this paragraph and the following paragraphs are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed in part by the University of
# California, Irvine.  The name of the University may not be used to
# endorse or promote products derived from this software without
# specific prior written permission.
# THIS SOFTWARE IS PROVIDED ``AS IS''
# AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
# LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE.
#
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#
# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION
# (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF THE UNIVERSITY
# OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# If you have any suggestions, bug reports, fixes, or enhancements,
# send them to the author, Roy Fielding.
# ---------------------------------------------------------------------------
package momconfig;

# ==========================================================================
# These first four global variables are needed by later options, but
# should not normally be changed by the installer.
#
#   $MOMlib  - copied from the main package's $MOMlib.  Written with the
#              `::` package separator; the older `'` form is legacy Perl 4
#              syntax that was deprecated in Perl 5.38.
#   $HOMEdir - $ENV{HOME}, else $ENV{home}, else the current directory.
#   $PWDdir  - $ENV{PWD}, else $ENV{cwd}, else the current directory.
#   $TMPdir  - $ENV{TMPDIR}, else /tmp.
#
# `||` is used deliberately: an environment variable that is set but empty
# falls through to the next alternative.

$MOMlib  = $main::MOMlib;
$HOMEdir = ($ENV{'HOME'}   || $ENV{'home'} || '.');
$PWDdir  = ($ENV{'PWD'}    || $ENV{'cwd'}  || '.');
$TMPdir  = ($ENV{'TMPDIR'} || '/tmp');

# ==========================================================================
# Local Network should be the network domain which you consider to be local
# (i.e., network requests to sites in this domain do not create any external
# network costs to your organization).  YOU WILL WANT TO CHANGE THIS!!!

$LocalNetwork = '\.uci\.edu';      # Use backslash to escape any periods

# ==========================================================================
# sendmailCommand should point to the sendmail binary.
# The assumption is that this program accepts command-line arguments
# specifying addresses to which messages should be mailed, and accepts
# other headers and message text from stdin.

$sendmailCommand = "/usr/sbin/sendmail";

# ==========================================================================
# The following options allow MOMspider to decode traversable response
# content that has been encoded (so far, this only means compressed).
# This may never be used if your site does not compress any HTML files.

# The following association maps content-encodings to their decoder command.

%CEdecoder = (
    'x-compress' => 'uncompress',
    'x-gzip'     => 'gunzip',
);

# The following association maps content-encodings to the file extension
# expected by the decoder.

%CEextension = (
    'x-compress' => '.Z',
    'x-gzip'     => '.gz',
);

# The following sets the temporary filename [without .(Z|gz) suffix] for
# file decoding.  `$$` is the current process ID, so the name differs
# between processes.

$CEfile = "$TMPdir/mom$$-comp.html";    # Temporary file for decompression

# ==========================================================================
# Set the default location of the working Index file (also named after the
# current process ID).

$TempIndex = "$TMPdir/mom$$-index.html";

# Standard filename extension for HTML index files (and old version)

$Extension    = 'html';
$OldExtension = 'old.html';

# ==========================================================================
# Set the default location of the instruction file and allowed tasks.

$InstructFile = "$HOMEdir/.momspider-instruct";

# Edit the following lines to specify whether (1) or not (0) you want
# to allow the specified option.

%Allowed = (          # Allow MOMspider to traverse ...
    'Owner' => 1,     #   all links via TopURL owned by owner?
    'Tree'  => 1,     #   all links at or below TopURL?
    'Site'  => 1,     #   all links at TopURL's site?
);

# ==========================================================================
# Set the default location for the avoid and sites files.  These locations
# can be overridden in the instructions or on the command line.
# NOTE that the avoid and sites files must always be used in pairs
# since the contents of each file are dependent on the other.

# SystemAvoid should point to the systemwide avoidance file -- a structured
# list of URLs that all users of MOMspider must avoid (or leaf).

$SystemAvoid = "$MOMlib/system-avoid";

# SystemSites should point to the systemwide sites file -- a structured
# list of IP addresses, ports, and dates which indicate when the sites
# should next be checked for RobotsNotWanted restrictions.

$SystemSites = "$MOMlib/system-sites";

# The user AvoidFile and SitesFile are exactly the same but are intended
# to be written by whoever is running the spider.

$AvoidFile = "$HOMEdir/.momspider-avoid";    # Default user avoid file
$SitesFile = "$HOMEdir/.momspider-sites";    # Default user sites file

# The following is the standard name for the URL which defines, for
# any site, where Robots are not allowed.  See Martijn Koster's proposal
# for more info.

$RobotsURL = "/robots.txt";

# Set the default number of days between checks of a site's $RobotsURL.
# This can be overridden in the instructions.

$CheckInterval = 15;

# ==========================================================================
# Set things which control the traversal process.

$MaxDepth    = 20;   # Default maximum traversal depth.
                     # Can be overridden by the instructions or commandline.

$Timeout     = 30;   # The maximum number of seconds to wait for a response.
                     # Increase if you have an extremely slow net connection.

$MaxConsec   = 5;    # Max number of consecutive requests to any site
                     # before a long pause is required.  Don't change it.

$PauseTime   = 60;   # The number of seconds for a long pause.
                     # Increase if your server is very slow.

$BetweenTime = 15;   # Amount of time required between any two requests
                     # to the same site.  Increase if server is slow.

$BaseURL = "file://localhost$PWDdir/";
                     # The initial Base URL -- no need to change this.
# ==========================================================================
# DO NOT change the following unless you know exactly what you are doing
# AND have checked first with Roy Fielding.

# Copy the version from the main package.  Written with the `::` package
# separator; the older `'` form is legacy Perl 4 syntax that was deprecated
# in Perl 5.38.
$Version = $main::Version;

# The location for distribution information about MOMspider.
# NOTE(review): as written this holds only the interpolated $Version; the
# comment suggests a location prefix may be missing -- confirm against the
# original distribution before relying on it.

$DistInfo = "$Version";

# ==========================================================================

1;   # THIS LINE MUST BE LAST -- DO NOT CHANGE IT