#!/usr/bin/env python
# -*- python -*-
#
# Copyright Ars Digita, Inc., 2000
#
# elorenzo@arsdigita.com, 5-10-2000
#
#
# This file has several logical pieces.  It would be nice to split them
# up into separate files, but I want to keep the packaging of this script
# simple: cramming everything into one file means that someone can just
# download the whole thing and plop it into a 'bin' directory somewhere,
# and be ready to go.
#
# 1) Several small classes and functions shared throughout the code.
#
# 2) Procedures for reading and writing profile sample sets to and from
#    CSV files.
#
# 3) A large class named HTTPUserAgent responsible for handling much
#    of the bookkeeping involved in getting web pages (cookies, redirects,
#    connection timeouts).
#
# 4) A callable class named site_sampler that uses an HTTPUserAgent to
#    load pages from a site, and a procedure named profile_site that
#    uses a site_sampler instance to crawl around a site at random,
#    fetching pages, checking for errors and recording page load
#    times.
#
# 5) A procedure that takes a set of samples generated by the procedure
#    in part 4 (or read back in by the functions in part 2) and generates
#    statistics on it.
#
# 6) A set of relatively small and simple classes and functions used
#    as callable plug-ins that control aspects of how the procedures
#    in parts 4 and 5 work.
#
# 7) A chunk of code that parses the command-line arguments and, based on
#    them, either crawls a site using the procedure in part 4 (writing the
#    resulting samples to a log file) or reads samples back in from existing
#    log files, then generates and prints a report on them using the
#    procedure in part 5.
#

import string
import sys
import re
import random
import httplib
import urllib
import time
import threading
import exceptions
import urlparse
import math
import types
import htmllib
import formatter




######################################################################
# 1) A few small simple utility functions and classes used in a number
#    of places throughout the program.
#


def collect(collect_func, L):
    result = {}
    for x in L:
        key = collect_func(x)
        if result.has_key(key):
            result[key].append(x)
        else:
            result[key] = [x]
    return result
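
# A hedged usage sketch of collect(): it groups the elements of a list by
# whatever key collect_func returns for each one.  Grouping integers by
# parity, for instance:
#
#   >>> collect(lambda x: x % 2, [1, 2, 3, 4, 5])
#   {0: [2, 4], 1: [1, 3, 5]}
#
# (Dictionary key order is not guaranteed; only the grouping matters.)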


def sum(l):
    if len(l) == 0: return None
    return reduce(lambda x, y: x+y, l, 0)


def mean(l):
    if len(l) == 0: return None
    return sum(l) / len(l)


def mean_stddev(l):
    avg = mean(l)
    if len(l) == 0 or len(l) == 1: return (avg, None)
    sum_squared_diff = sum(map(lambda x, a=avg: (x-a)**2, l))
    return (avg, math.sqrt(sum_squared_diff / (len(l)-1)))
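
# A quick illustrative check of the statistics helpers above.  mean_stddev()
# returns the mean and the *sample* standard deviation (dividing by n-1),
# with None for the deviation when there are fewer than two points:
#
#   >>> mean_stddev([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
#   (5.0, 2.1380899...)          # i.e. sqrt(32/7)
#   >>> mean_stddev([3.0])
#   (3.0, None)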

class site_sample:
    """class site_sample(path_chain, form_data, prev_path, errcode,
                         errmsg, valid_html, duration, timestamp):

    A dumb data glob that just gives meaningful names to the scraps of data
    we get back from a fetch on a page, rather than requiring me to remember
    tuple indices.  These fields include:

      path_chain   - Chain of paths retrieved - first path is first requested
                     path, remainder are redirects
      form_data    - Form data passed with a POST (currently unused)
      prev_path    - Location of page on which the link to path was found
      errcode      - An error code.  Positive numbers are HTTP result codes,
                     negatives indicate some error that prevented a fetch
                     (see HTTPUserAgent.load_page).
      errmsg       - A text error message, may be unpopulated
      valid_html_p - A flag indicating whether or not the HTML returned by
                     the server passed validation.  1 if it passed, 0
                     if it failed, None if no validation performed.
      duration     - Total time required for the fetch, in seconds.  This
                     includes following all redirects, if any.
      timestamp    - Time at which the fetch started, expressed in seconds
                     since the Unix epoch (midnight UTC, January 1, 1970).

    """
    def __init__(self, path_chain, form_data, prev_path,
                 errcode, errmsg, valid, duration, timestamp):
        self.path_chain = path_chain
        self.form_data = form_data
        self.prev_path = prev_path
        self.errcode = errcode
        self.errmsg = errmsg
        self.valid_html_p = valid
        self.duration = duration
        self.timestamp = timestamp




######################################################################
# 2) Functions for reading and writing lists of site_sample objects
#    to and from CSV files.
#


def read_log_file(filename):
    result = []
    fin = open(filename)

    for l in fin.readlines():
        # Strip the trailing newline so the timestamp parses cleanly.
        [seq_num, path, form_data, redirects, prev_path, errcode, errmsg,
         valid_html_p, duration, timestamp] = string.split(string.strip(l), ',')
        # An empty redirect field means the fetch was not redirected.
        if redirects == '':
            path_chain = (path,)
        else:
            path_chain = (path,) + tuple(string.split(redirects, ';'))
        errcode = string.atoi(errcode)
        if valid_html_p == '':
            valid_html_p = None
        else:
            valid_html_p = string.atoi(valid_html_p)
        duration = string.atof(duration)
        timestamp = time.mktime(time.strptime(timestamp, '%Y-%m-%d %H:%M:%S'))
        result.append(site_sample(path_chain, form_data, prev_path, errcode,
                                  errmsg, valid_html_p, duration, timestamp))
    return result
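
# Illustrative only - a single log line written by write_log_sample() below
# might look like this (one line, comma-separated, hypothetical paths):
#
#   12,/register/user-login.tcl,,/pvt/home.tcl,/index.tcl,200,OK,,0.731000,2000-05-10 17:03:22
#
# read_log_file() turns that into a site_sample with a path_chain of
# ('/register/user-login.tcl', '/pvt/home.tcl'), an errcode of 200, no
# validation result, and a duration of 0.731 seconds.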


def write_log_sample(f, i, s):
    if s.prev_path == None: prev_path = ''
    else: prev_path = s.prev_path
    if s.errmsg == None: errmsg = ''
    else: errmsg = re.sub(',', ';', s.errmsg)
    if s.valid_html_p == None: valid = ''
    else: valid = '%d' % s.valid_html_p
    f.write('%d,%s,%s,%s,%s,%d,%s,%s,%f,%s\n' % \
            (i, s.path_chain[0],  s.form_data,
             string.join(s.path_chain[1:], ';'), prev_path, s.errcode, errmsg,
             valid,  s.duration, time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.gmtime(int(s.timestamp)))))
    

def write_log_file(filename, samples):
    fout = open(filename, 'w')

    i = 0
    for s in samples:
        write_log_sample(fout, i, s)
        i = i + 1
    fout.close()





######################################################################
# 3) Definitions related to the HTTPUserAgent which acts as a
#    sort of 'virtual browser' that handles much of the
#    annoying bookkeeping (cookies, redirects) associated with
#    fetching pages from a site.
#


class CookieDict:
    """class CookieDict()

    A 'dictionary' of cookies.  We don't worry about things like persistent
    or secure cookies, or multiple domains, because we don't need them for
    profiling.  Yet.
    """

    def __init__(self):
        self.paths = {}

    def set(self, path, name, value):
        if path == '' or path == None: path = '/'
        if self.paths.has_key(path):
            path_values = self.paths[path]
        else:
            path_values = {}
            self.paths[path] = path_values

        path_values[name] = value

    def get(self, path):
        result = []
        for junk, pv in filter(lambda x, p=path: p[0:len(x[0])] == x[0],
                               self.paths.items()):
            result.extend(pv.items())
        return result
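
# A hedged sketch of CookieDict behaviour: cookies live under the path they
# were set for, and get(path) returns every cookie whose path is a prefix of
# the path being requested (the cookie names here are hypothetical):
#
#   >>> jar = CookieDict()
#   >>> jar.set('/', 'ad_session_id', '123')
#   >>> jar.set('/store', 'cart', '456')
#   >>> jar.get('/store/checkout.tcl')
#   [('ad_session_id', '123'), ('cart', '456')]
#   >>> jar.get('/index.tcl')
#   [('ad_session_id', '123')]
#
# (The order of the returned pairs is not guaranteed.)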




class HTTPUserAgent:
    """class HTTPUserAgent(host, debug=0)

    Class to act as an HTTP user agent.  Is responsible for loading
    pages, following redirects and handling cookies.  A single instance
    of this class can only cope with a single host.  The debug parameter
    is a boolean indicating whether or not debug messages should be
    printed.  The business end of this class is the method load_page.
    """

    def __init__(self, host, debug=0):
        self.host = host
        # Groups: cookie name, cookie value and (optionally) the path
        # attribute, if the server sent one.
        self.cookie_re = re.compile('([^=]+)=([^;]*)(?:;.*?path=([^;]*))?')
        self.cookies = CookieDict()
        self.debug = debug
        self.conn = httplib.HTTP()
        self.conn.set_debuglevel(self.debug)

    # Internal function, needed so that we can do the actual fetch in
    # a separate thread.
    # conn is an httplib.HTTP object, path is the path to fetch
    def __fetch_page(self, cookie_str, path, result):
        # Make the request and read the response
        try:
            self.conn.connect(self.host)
            self.conn.putrequest('GET', path)
            if cookie_str <> '': self.conn.putheader('Cookie', cookie_str)
            self.conn.putheader('Host', str(self.host))
            self.conn.endheaders()
            errcode, errmsg, headers = self.conn.getreply()
            f = self.conn.getfile()
            html = f.read()
            f.close()
            result.append((errcode, errmsg, headers, html))
        except KeyboardInterrupt:
            result.append((-10, 'Ctrl-C pressed', None, None))
        except exceptions.Exception, x:
            if self.debug: print 'ERROR CONNECTING:', path, x
            result.append((-11, 'exception during fetch: ' + x.__str__(),
                           None, None))


    def load_page(self, url):
        """errcode, errmsg, url_chain, html = load_page(self, path)

        Loads a single specified page from the host specified when the
        user agent was instantiated.  Will follow redirects and only return
        the html associated with the last one, but will return all links
        in redirect chain in url_chain.  Pretty dumb about redirects:
        2 URLs that redirect to each other will make it loop infinitely.  The
        errcode is the HTTP result code, if an HTTP connection was made.  If
        no connection was made, or an off-host redirect was encountered or
        some other exception occured, the errcode will be negative:
            -1  : HTTP library returned error
            -11 : exception caught while making HTTP connection
            -12 : page load timed out (took more than 30 seconds)
            -13 : redirect to another host encountered
        """

        url_chain = []
        while 1:
            # Allocate storage for result
            result = []
            
            # Build a string of cookies to go with the request
            cookie_str = string.join(map(lambda c: c[0]+'='+c[1],
                                         self.cookies.get(url)),
                                     '; ')

            # The connection seems to hang sometimes, so we do the fetch in
            # a separate thread and give it 1 minute to complete.  If it takes
            # longer than that, we assume some sort of error has occurred.
            fetch_thread = threading.Thread(None, self.__fetch_page, None,
                                            (cookie_str, url, result))
            fetch_thread.setDaemon(1)
            fetch_thread.start()
            fetch_thread.join(60)
            if fetch_thread.isAlive():
                url_chain.append(url)
                return (-12, 'page load timed out', tuple(url_chain), None)
            
            errcode, errmsg, headers, html = result[0]

            if errcode == -10: raise KeyboardInterrupt

            # Process a Set-Cookie header, if any.  Gotta check errcode
            # before headers, because headers will be None if errcode <= 0.
            if errcode > 0 and headers.has_key('Set-Cookie'):
                cookie_match = self.cookie_re.match(headers['Set-Cookie'])
                if cookie_match <> None:
                    key,value,path = cookie_match.groups()
                    if self.debug: print 'SETTING COOKIE:', key, value, path
                    self.cookies.set(path,key,value)
                else:
                    if self.debug:
                        print 'BAD SET-COOKIE HEADER: "' + \
                              headers['Set-Cookie'] + '"'
            
            # Process a Location header, if any: split into host and URL
            if errcode > 0 and headers.has_key('Location'):
                location = headers['Location']
                scheme, hdr_host, path, param, query, frag = \
                        urlparse.urlparse(location)

                # A Location header without an http scheme and host is
                # illegal by the HTTP specs (which call for an absolute
                # URI), so we treat it as an error.
                if string.lower(scheme) not in ('http',) or not hdr_host:
                    if self.debug:
                        print 'BAD LOCATION HEADER:', url, location
                    url_chain.append(url)
                    return (-14, 'Bad Location Header',
                            tuple(url_chain) + (location,), None)

                # Reconstruct the URL minus host and scheme
                hdr_url = urlparse.urlunparse(('','',path,param,query,frag))
            else:
                location = None
                hdr_host = self.host
                hdr_url = url
                
            # If this wasn't a redirect, then the URL in the header
            # (if any) is the 'exact' location for the requested object,
            # and should be appended to the URL chain instead of whatever
            # got passed in as the url.  And since it's not a redirect,
            # we're done looping at this point, so we break...
            if errcode != 302:
                url_chain.append(hdr_url)
                break

            if location == None:
                return (-15, 'Redirect W/O Location', tuple(url_chain), None)

            # This was a redirect, so we need to go ahead and stuff the
            # old url into url_chain, because the one given by the Location
            # header is the location we're redirecting to...
            url_chain.append(url)

            if hdr_host <> self.host:
                return (-13, 'off-host redirect', tuple(url_chain), None)

            url = hdr_url

        return (errcode, errmsg, tuple(url_chain), html)
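
# Illustrative usage sketch (the host and paths are hypothetical, and the
# results assume the server answers with a plain 200):
#
#   >>> ua = HTTPUserAgent('www.example.com')
#   >>> errcode, errmsg, url_chain, html = ua.load_page('/')
#   >>> errcode, url_chain
#   (200, ('/',))
#
# Had '/' answered with a 302 to '/index.tcl', url_chain would instead be
# ('/', '/index.tcl') and html would hold the body of the final page.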




######################################################################
# 4) site_sampler and profile_site - a callable class and a control
#    function that use an HTTPUserAgent to crawl a site
#

class site_sampler:
    """class site_sampler(browser, outfile=None, validator=None):
    
    This class bundles together several actions that happen every time
    a page is loaded from the site into a single callable: The browser
    loads the page, the validator is run on the HTML, a site_sample is
    constructed from the data.  The sample is written to an output
    file (if any).  Calling an instance of the class returns a tuple
    containing the site_sample instance and the html of the page."""

    
    def __init__(self, browser, outfile=None, validator=None):
        self.browser = browser
        self.validator = validator
        self.outfile = outfile
        self.i = 0

    def __call__(self, path, prev_path):
        start_time = time.time()
        errcode, errmsg, path_chain, html = self.browser.load_page(path)
        duration = time.time() - start_time

        if self.validator:
            valid_html_p = self.validator(html)
        else:
            valid_html_p = None

        s = site_sample(path_chain, '', prev_path, errcode, errmsg,
                        valid_html_p, duration, int(start_time))
        
        if self.outfile: write_log_sample(self.outfile, self.i, s)

        self.i = self.i + 1

        return (s, html)
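
# Illustrative only - wiring a site_sampler up by hand, assuming a
# hypothetical host and a page that comes back with a 200 (the main program
# at the bottom of this file does essentially this):
#
#   >>> sampler = site_sampler(HTTPUserAgent('www.example.com'))
#   >>> sample, html = sampler('/index.tcl', None)
#   >>> sample.errcode
#   200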



def profile_site(start_path,
                 fetches,
                 sample_site, 
                 get_links,
                 reduce_path):

    """samples = profile_site(start_path
                              fetches,
                              sample_site,
                              get_links,
                              reduce_path):

    Crawls around a site, starting at the specified start_path,
    storing statistics about page load times.  The fetches parameter
    specifies how many pages are to be loaded in total.

    Pages are retrieved from the site using the sample_site callable.
    This callable should accept a URL to load (minus any scheme or
    host information, since that data should somehow already be known
    to the sample_site callable) and return a tuple containing a
    site_sample instance and the page's HTML, in that order.

    The get_links callable is used to parse the links from each page's
    HTML: the callable should accept the path of the page containing
    the HTML, and the HTML itself.  It should return a list of URLs,
    minus the scheme or host.

    reduce_path is a callable that is used to map each page's URL to
    some smaller set of distinct values.  The crawler will
    attempt to get a set of statistics that is evenly balanced among
    these values.  One obvious reduction function is to simply strip
    off any query string.

    This function returns a list of site_sample objects, each
    representing a fetch (or attempted fetch) from the site.  Note
    that in the event of a redirect or series of redirects, all links
    in the redirect chain will be combined into a single sample.
    """

    samples = []
    fetch_counts = {reduce_path(start_path):0}
    path = start_path
    prev_path = None
    try:
        for i in range(0, fetches):

            # Sample the site
            s, html = sample_site(path, prev_path)
            samples.append(s)
        
            # increment counter for profile balancing
            reduced_path = reduce_path(path)
            try:
                fetch_counts[reduced_path] = fetch_counts[reduced_path] + 1
            except KeyError:
                fetch_counts[reduced_path] = 1

            # check for error returns
            if s.errcode != 200:
                print i, s.path_chain, 'ERROR:', s.errcode, s.errmsg
                prev_path = None
                path = start_path
                continue

            # every 100th hit, return to start path (part of balancing -
            # help to ensure that we don't get trapped in some small
            # subset of the site)
            if (i % 100) == 0:
                print i, s.path_chain, 'RETURNING TO', start_path
                prev_path = None
                path = start_path
                continue

            # get links from just-fetched page
            links = get_links(s.path_chain[-1], html)
            if len(links) == 0:
                print i, s.path_chain, 'NO LINKS, RETURNING TO', start_path
                prev_path = None
                path = start_path
                continue

            # build list containing all links with minimum hits
            # (more balancing)
            min_w = -1
            for l in links:
                r = reduce_path(l)
                w = fetch_counts.get(r, 0)
                if w < min_w or min_w == -1:
                    min_w = w
                    new_path_list = [l]
                elif w == min_w:
                    new_path_list.append(l)

            # pick randomly from links with minimum hits
            new_path = new_path_list[random.randint(0,len(new_path_list)-1)]
            print i, s.path_chain, 'FOLLOWING LINK TO', new_path

            prev_path = path
            path = new_path

    except KeyboardInterrupt:
        pass

    return samples
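
# A minimal sketch of driving profile_site() directly, using the plug-ins
# defined in section 6 below; the host, start path and fetch count are
# hypothetical, and the main program builds the same pieces from the
# command line:
#
#   browser = HTTPUserAgent('www.example.com')
#   sampler = site_sampler(browser, outfile=None, validator=None)
#   links   = local_links('www.example.com')
#   samples = profile_site('/index.tcl', 500, sampler, links, strip_url_vars)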





######################################################################
# 5) A procedure that takes a set of samples generated by the procedure
#    in part 4 (or read back in by the functions in part 2) and generates
#    statistics on it.
#



def build_statistics(samples, group_func):
    """statistics = build_statistics(samples, group_func):

    Partitions the site_samples in the list samples into groups using
    the provided group_func, which is applied to the first entry in
    each sample's path chain.  (The first page fetched in the redirect
    chain, if any.)  Then computes statistics on each group, and returns
    a list of these statistics.

    The returned list contains a set of tuples of the form

      (<mean>, <sample count (successful fetches only)>, <standard deviation>,
       <error count>, <key (i.e. reduced URL)>)
    """
    results = []
    grouped_samples = {}
    for s in samples:
        rp = group_func(s.path_chain[0])
        if not grouped_samples.has_key(rp):
            grouped_samples[rp] = ([], [])
        if s.errcode == 200:
            grouped_samples[rp][1].append(s.duration)
        else:
            grouped_samples[rp][0].append(s.errcode)

    for key, (errors, time_list) in grouped_samples.items():
        mean, stddev = mean_stddev(time_list)
        results.append((mean, len(time_list), stddev, len(errors), key))

    return results
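
# Illustrative only: with strip_url_vars as the group_func, samples for
# '/foo.tcl?id=1' and '/foo.tcl?id=2' land in the same group, and the
# returned list might contain a (hypothetical) tuple like
#
#   (0.84, 37, 0.21, 2, '/foo.tcl')
#
# i.e. a mean of 0.84 seconds over 37 successful fetches, a sample standard
# deviation of 0.21 seconds, and 2 fetches that returned something other
# than HTTP 200.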





######################################################################
# 6) A set of relatively small and simple classes and functions used
#    as callable plug-ins that control aspects of how the procedures
#    in parts 4 and 5 work.



# Functions for path reduction

# Simple function that, given a URL, strips off any parameters, query
# string, or fragment at the end
def strip_url_vars(url):
    scheme, netloc, path, param, query, frag = urlparse.urlparse(url)
    return urlparse.urlunparse((scheme, netloc, path, '', '', ''))



# A more complex URL processor: this one strips URL variables, then applies
# the provided regexp.  If a match is found, all non-None groups in the
# match are concatenated together, separated by a '*'.
class strip_url_vars_and_group:
    "class strip_url_vars: Callable - give it a URL, it strips off URL vars."
    def __init__(self, group_re):
        self.group_re = re.compile(group_re)
    def __call__(self, url):
        url = strip_url_vars(url)
        group_match = self.group_re.match(url)
        if group_match <> None:
            url = string.join(filter(lambda x: x, group_match.groups()), '*')
        return url
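
# Illustrative only, echoing the example in the -help text below:
#
#   >>> reducer = strip_url_vars_and_group('(/store/)[^/]*(/.*|)')
#   >>> reducer('/store/hitime/tasting-group.html?tasting_group_id=51')
#   '/store/*/tasting-group.html'
#   >>> reducer('/about.html')
#   '/about.html'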




# An extended HTML parser that handles forms
# (UNTESTED AND CURRENTLY UNUSED)

def simplify_tag_attrs(attrs):
    attrs = collect(lambda a: a[0], attrs)
    for k, v in attrs.items():
        attrs[k] = v[-1][1]
    return attrs
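
# Illustrative only: sgmllib hands tag attributes to the parser as a list of
# (name, value) pairs; simplify_tag_attrs() collapses that into a dictionary,
# keeping the last value when an attribute name is repeated:
#
#   >>> simplify_tag_attrs([('type', 'text'), ('name', 'email'), ('name', 'em2')])
#   {'type': 'text', 'name': 'em2'}
#
# (Dictionary key order is not guaranteed.)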
    

class MyHTMLParser(htmllib.HTMLParser):
    def __init__(self):
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
        self.forms = []
        self.in_form_p = None
        self.in_select_p = None
        self.in_textarea_p = None

    def start_form(self, attrs):
        if self.in_form_p: self.end_form()

        attrs = simplify_tag_attrs(attrs)
        self.in_form_p = 1
        self.form_action = attrs.get('action', None)
        self.form_method = attrs.get('method', 'POST')
        self.form_enctype = attrs.get('enctype',
                                      'application/x-www-form-urlencoded')
        self.form_input_text = []
        self.form_input_password = []
        self.form_input_checkbox = []
        self.form_input_radio = {}
        self.form_input_submit = []
        self.form_input_image = []
        self.form_input_file = []
        self.form_input_hidden = []
        self.form_select = []
        self.form_textarea = []

    def do_input(self, attrs):
        if not self.in_form_p: return

        attrs = simplify_tag_attrs(attrs)
        type = string.lower(attrs.get('type', 'text'))

        name = attrs.get('name', None)
        value = attrs.get('value', None)
        # The 'a' default deliberately fails int(), so a missing or
        # malformed maxlength attribute ends up as None.
        try:
            maxlength = int(attrs.get('maxlength', 'a'))
            if maxlength <= 0: maxlength = None
        except ValueError:
            maxlength = None

        if type == 'text':
            if name:
                self.form_input_text.append((name, value, maxlength))

        elif type == 'password':
            if name:
                self.form_input_password.append((name, maxlength))

        elif type in ('checkbox', 'radio'):
            checked = attrs.has_key('checked')
            if name:
                if type == 'checkbox':
                    self.form_input_checkbox.append((name, value, checked))
                else:
                    if not self.form_input_radio.has_key(name):
                        self.form_input_radio[name] = []
                    self.form_input_radio[name].append((value, checked))

        elif type == 'submit':
            self.form_input_submit.append((name, value))

        elif type == 'image':
            self.form_input_image.append((name, value))

        elif type == 'file':
            accept = attrs.get('accept', None)
            if accept: accept = string.split(accept, ',')
            if name:
                self.form_input_file.append((name, maxlength, accept))

        elif type == 'hidden':
            if name:
                self.form_input_hidden.append((name, value))


                        
    def start_select(self, attrs):
        if self.in_select_p: self.end_select()

        attrs = simplify_tag_attrs(attrs)
        self.in_select_p = 1
        self.select_name = attrs.get('name', None)
        self.select_multiple = attrs.has_key('multiple')
        self.select_option = []

    def do_option(self, attrs):
        attrs = simplify_tag_attrs(attrs)
        value = attrs.get('value', None)
        checked = attrs.has_key('selected')
        self.select_option.append((value, checked))

    def end_select(self):
        if self.select_name:
            self.form_select.append((self.select_name,
                                     self.select_multiple,
                                     self.select_option))
        self.in_select_p = None
        del self.select_name
        del self.select_multiple
        del self.select_option

    def do_textarea(self, attrs):
        attrs = simplify_tag_attrs(attrs)
        name = attrs.get('name', None)
        rows = attrs.get('rows', None)
        cols = attrs.get('cols', None)
        if name:
            self.form_textarea.append((name, rows, cols))

    def end_form(self):
        if self.form_action:
            self.forms.append((self.form_action,
                               self.form_method,
                               self.form_enctype,
                               self.form_input_text,
                               self.form_input_password,
                               self.form_input_checkbox,
                               self.form_input_radio,
                               self.form_input_submit,
                               self.form_input_image,
                               self.form_input_file,
                               self.form_input_hidden,
                               self.form_select,
                               self.form_textarea))
        self.in_form_p = None
        del self.form_action
        del self.form_method
        del self.form_enctype
        del self.form_input_text
        del self.form_input_password
        del self.form_input_checkbox
        del self.form_input_radio
        del self.form_input_submit
        del self.form_input_image
        del self.form_input_file
        del self.form_input_hidden
        del self.form_select
        del self.form_textarea


            

# Classes for stripping links out of the HTML returned by a page


class local_links:
    """class local_links(host):

    Callable - given HTML, returns list of links local to host.
    """
    def __init__(self, host):
        self.host = host

    def __call__(self, root, html):
        parser = htmllib.HTMLParser(formatter.NullFormatter())
        parser.feed(html)
        parser.close()

        # Parse URLs into their component parts
        l = map(lambda a: urlparse.urlparse(a, 'http'), parser.anchorlist)

        # Filter out HREFs that aren't http or pointing at this host
        l = filter(lambda a, h=self.host: a[0] == 'http' and a[1] in (h, ''), l)

        # Convert URLs back to strings, dropping scheme, host, and fragment
        l = map(lambda a: urlparse.urlunparse(('','',a[2] or '/',a[3],a[4],'')),
                l)

        # Do a join to the root to handle relative URLs
        l =  map(lambda a, r=root: urlparse.urljoin(r, a), l)

        return l



class filtered_local_links(local_links):
    """class filtered_local_links(host, legal_url_regex):

    Callable, derived from local links.  Works like local_links, except it
    filters out paths that don't match the provided regex.
    """
    def __init__(self, host, illegal_url_regex):
        local_links.__init__(self, host)

        self.filter_re = re.compile(illegal_url_regex)
        
    def __call__(self, root, html):
        return filter(lambda x, f=self.filter_re.match: not f(x),
                      local_links.__call__(self, root, html))
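
# Illustrative only (the host and HTML are hypothetical):
#
#   >>> get_links = filtered_local_links('www.example.com', '/register')
#   >>> get_links('/index.tcl',
#   ...           '<a href="news.tcl">news</a>'
#   ...           '<a href="/register/logout.tcl">log out</a>'
#   ...           '<a href="http://other.host/x.html">elsewhere</a>')
#   ['/news.tcl']
#
# The relative href is joined against the page path '/index.tcl', the
# off-host link is dropped by local_links, and the /register link is dropped
# by the illegal-URL regex.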





################
# MAIN PROGRAM #
################

def display_usage_info():
    script_filename = re.match('.*?([^/]*)$', sys.argv[0]).group(1)
    print """
Usage:
    
  %(script)s -profile <host> <path>  <count> <log> [<logon>] [<illegal_urls>]
  %(script)s -report <grouping_regex> <filename> [<filename>+]
  %(script)s -help
""" % {'script':script_filename}


def process_logon_args(n):
    if len(sys.argv) < n+1: return (None, n+0, 0)
    if sys.argv[n] == '-acs':
        logon_url = '/register/user-login.tcl?' + \
                    urllib.urlencode({'email':sys.argv[n+1],
                                      'password_from_form':sys.argv[n+2]})
        return (logon_url, n+3, 1)
    elif sys.argv[n] == '-generic':
        return (sys.argv[n+1], n+2, 0)
    else:
        return (None, n+0, 0)
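
# Illustrative only - given a (hypothetical) command line ending in
#
#   ... crawl.log -acs joe@example.com secret /admin
#
# with n pointing at '-acs', process_logon_args() returns roughly
#
#   ('/register/user-login.tcl?email=joe%40example.com&password_from_form=secret',
#    n+3, 1)
#
# (the URL variable order may differ): the logon URL, the index of the next
# unconsumed argument (here the <illegal_urls> regex '/admin'), and a flag
# saying this is an ACS-style logon.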




# redirect output of print to stderr until we get to 'real' output
# (i.e. either a report or the help listing)
sys.stdout = sys.__stderr__

if len(sys.argv) < 2:
    display_usage_info()
    sys.exit()

if sys.argv[1] == '-profile':
    try:
        # grab the first 4 arguments
        host = sys.argv[2]
        start_path = sys.argv[3]
        fetches = string.atoi(sys.argv[4])
        outfile = sys.argv[5]

        logon_url, filter_arg, acs_logon = process_logon_args(6)

        if len(sys.argv) > filter_arg:
            if acs_logon:
                link_filter_re = '/register|' + sys.argv[filter_arg]
            else:
                link_filter_re = sys.argv[filter_arg]
            link_filter = filtered_local_links(host, link_filter_re)
        else:
            if acs_logon:
                link_filter = filtered_local_links(host, '/register')
            else:
                link_filter = local_links(host)

    except IndexError:
        display_usage_info()
        sys.exit()

    fout = open(outfile, 'w')

    samples = []
    sampler_func = site_sampler(HTTPUserAgent(host, 0), fout)
    
    if logon_url <> None:
        print 'LOGGING ON'
        fetches = fetches - 1
        s, html = sampler_func(logon_url, None)
        samples.append(s)
        print s.__dict__, html
        if s.errcode <> 200:
            print s.path_chain, 'LOGIN ERROR:', s.errcode, s.errmsg

    try:
        samples.extend(profile_site(start_path, fetches, sampler_func,
                                    link_filter, strip_url_vars))
    except KeyboardInterrupt:
        print 'User Interrupt - Generating Partial Statistics & Logfile'
        
    fout.close()

    # '(?!)' can never match anything, so in -profile mode samples end up
    # grouped purely by their variable-stripped URLs.
    group_re = '(?!)'

elif sys.argv[1] == '-report':
    try:
        group_re = sys.argv[2]
        sample_files = sys.argv[3:]
    except IndexError:
        display_usage_info()
        sys.exit()
        
    samples = []
    for filename in sample_files:
        samples.extend(read_log_file(filename))
    if len(samples) == 0:
        print 'No samples found in specified files.'
        sys.exit()

elif sys.argv[1] == '-help':
    sys.stdout = sys.__stdout__
    script_filename = re.match('.*?([^/]*)$', sys.argv[0]).group(1)
    print """
Usage:
    
  %(script)s -profile <host> <path> <count> <log> [<logon>] [<illegal_urls>]
  %(script)s -report <grouping_regex> <filename> [<filename>+]
  %(script)s -help

  -profile <host> <path> <count> <log> [<logon>] [<illegal_urls>]
  ===============================================================

  In this mode, the program crawls over the site at http://<host>/ for
  the number of hits specified in <count>.  It records a raw log of its
  activity in <log>.  It also prints a report containing some statistics
  to standard output.  <logon> can be used to specify logon information
  that will be used to log on as a particular user of the site to be
  profiled.  It takes one of two forms:

      -acs <email> <password>
          In this case, the program assumes it is dealing with a standard ACS
          site, and attempts to log on to the site by passing the specified
          <email> and <password> as URL variables to /register/user-login.tcl,
          then following redirects and setting cookies.

      -generic <url>
          Given a logon spec of this form, the program attempts to log on
          to the site by executing a 'GET' on the specified URL, which would
          presumably contain URL variables specifying logon information
          required by the site.

  Finally, <illegal_urls> is a Perl-style regular expression.  The crawler
  will avoid following links whose paths (minus hostname, etc.) match
  the provided regex.  This is useful for preventing the crawler from
  wasting time on pages that you aren't interested in testing.  And it's
  also useful for preventing the crawler from following links that will
  cause it to be logged out.  For this reason, if you specify a -acs logon,
  \"/register\" is automatically added to any regex you provide.


  -report <grouping_regex> <filename>+
  ====================================

  In this mode, the program imports all the profile log files specified in
  <filename>+ (these are files whose names were specified as <log> to
  -profile) and generates a set of statistics on the union of
  all data therein.  The samples are grouped by a string that is obtained by
  first stripping off any URL variables from the path, then attempting to
  match them against <grouping_regex>.  Paths that match this regex have
  the groups generated by the match concatenated together with '*' as
  a separator.  So, for example, if you were to run:

      %(script)s -report \"(/store/)[^/]*(/.*|)\" log.csv

  Then samples of these URLs:

      /store/hitime/tasting-group.html?tasting_group_id=51
      /store/moorebros/tasting-group.html
      /store/greenville/tasting-group.html?group_type=red

  Would all be grouped under the heading \"/store/*/tasting-group.html\"
  for the purpose of computing statistics.


  Notes On Log File Format
  ========================

  Finally, for those of you who want to manipulate the logfile data on your
  own, here's the format:  Each line in the file represents a single sample
  and is of the form:

  <seq>,<path>,<form>,<redirect>,<prev>,<cd>,<msg>,<valid>,<dur>,<timestamp>

      <seq>       - a simple sequence number recording the order in which
                    hits occurred
      <path>      - path of the fetch, minus \"http://<host>\"
      <form>      - form data, stored in URL-encoded form
      <redirect>  - a redirect chain, semicolon-separated
      <prev>      - page on which the link to <path> was found (maybe empty)
      <cd>        - a result code.  Positive numbers are HTTP return codes,
                    negatives indicate errors connecting:
                        -1, -11 : an error occurred connecting, no further info
                        -12     : timeout (> 60 seconds) fetching page
                        -13     : encountered redirect to another host
                        -14     : Bad Location header
                        -15     : redirect W/O Location header
      <msg>       - a text error message
      <valid>     - 0 if HTML failed some sort of validation, 1 if it
                    passed, empty string if no validation performed.  Currently
                    the only 'validation' done is a check for unevaluated
                    '<% %>' ADP tags.
      <dur>       - length of time it took to fetch page, including all links
                    in the redirect chain, if any
      <timestamp> - a timestamp telling when the fetch was initiated,
                    in the format \"YYYY-MM-DD HH:MI:SS\", with a 24-hour
                    clock.  Greenwich mean time is used.

""" % {'script':script_filename}
    sys.exit()

else:
    display_usage_info()
    sys.exit()



sys.stdout = sys.__stdout__

statistics = build_statistics(samples, strip_url_vars_and_group(group_re))
statistics.sort()
statistics.reverse()


print ' Mean      Hits   Std Dev    Errs   URL'
print '---------+------+----------+------+------------------------------------'
for mean,count,stddev,errs,path in statistics:
    if stddev == None:
        stddev_str = '  N/A   '
    else:
        stddev_str = '%8.4f' % (stddev,)
    if mean == None:
        mean_str = '  N/A   '
    else:
        mean_str = '%8.4f' % (mean,)
    print '%s | %4d | %s | %4d | %s' % \
          (mean_str, count, stddev_str, errs, path)

# The fetch threads are daemonic, so exiting here kills off any zombie
# connection threads still hanging around, and the program won't hang
# waiting for a server response that's never going to come.
sys.exit()