#!/usr/bin/env python # -*- python -*- # # Copyright Ars Digita, Inc., 2000 # # elorenzo@arsdigita.com, 5-10-2000 # # # This file has several logical pieces. It would be nice to split them # up into seperate files, but I want to keep the packaging of this script # simple: cramming everything into one file means that someone can just # download the whole thing and plop it into a 'bin' directory somewhere, # and be ready to go. # # 1) Several small classes and functions shared throughout the code. # # 2) Procedures for reading and writing profile sample sets to and from # CSV files. # # 3) A large class named HTTPUserAgent responsible for handling much # of the bookkeeping involved in getting web pages (cookies, redirects, # connection timeouts). # # 4) A callable class named site_sampler that uses an HTTPUserAgent to # load pages from a site, and a procedure named profile_site that # uses a site_sampler instance to crawl around a site at random, # fetching pages, checking for errors and recording page load # times. # # 5) A procedure that takes a set of samples generated by the procedures # part 4 or 5 and generates statistics on it. # # 6) A set of relatively small and simple classes and functions used # to define callable plug-ins that define aspects of how the # procedures in parts 4 and 5 work. # # 7) A chunk of code that parses user parameters and based on them executes # one of the procedures in part 4 or 5, then outputs the resulting # sample set to a file and generates and prints a report on it using # the procedure in section 5. # import string import sys import re import random import httplib import urllib import time import string import threading import exceptions import urlparse import math import types import htmllib import formatter ###################################################################### # 1) A few small simple utility functions and classes used in a # number of places through the program. # def collect(collect_func, L): result = {} for x in L: key = collect_func(x) if result.has_key(key): result[key].append(x) else: result[key] = [x] return result def sum(l): if len(l) == 0: return None return reduce(lambda x, y: x+y, l, 0) def mean(l): if len(l) == 0: return None return sum(l) / len(l) def mean_stddev(l): avg = mean(l) if len(l) == 0 or len(l) == 1: return (avg, None) sum_squared_diff = sum(map(lambda x, a=avg: (x-a)**2, l)) return (avg, math.sqrt(sum_squared_diff / (len(l)-1))) class site_sample: """class site_sample(path_chain, form_data, prev_path, errcode, errmsg, valid_html, duration, timestamp): A dumb data glob that just gives meaningful names to the scraps of data we get back from a fetch on a page, rather than requiring me to remember tuple indices. These fields include: path_chain - Chain of paths retrieved - first path is first requested path, remainder are redirects form_data - Form data passed with a POST (currently unused) prev_path - Location of page on which the link to path was found errcode - An error code. Positive numbers are HTTP result codes, negatives indicate some error that prevented a fetch (see HTTPUserAgent.load_page). errmsg - A text error message, may be unpopulated valid_html_p - A flag indicating whether or not the HTML returned by the server passed validation. 1 if it passed, 0 if it failed, None if no validation performed. duration - Total time required for the fetch, in seconds. This includes following all redirects, if any. timestamp - Time at which the fetch started, expressed in seconds since the epoch (midnight, January 1, 1970, I think). """ def __init__(self, path_chain, form_data, prev_path, errcode, errmsg, valid, duration, timestamp): self.path_chain = path_chain self.form_data = form_data self.prev_path = prev_path self.errcode = errcode self.errmsg = errmsg self.valid_html_p = valid self.duration = duration self.timestamp = timestamp ###################################################################### # 2) Functions for reading and writing lists of site_sample objects # to and from CSV files. # def read_log_file(filename): result = [] fin = open(filename) for l in fin.readlines(): [seq_num, path, form_data, redirects, prev_path, errcode, errmsg, valid_html_p, duration, timestamp] = string.split(l, ','); path_chain = (path,) + tuple(string.split(redirects, ';')) errcode = string.atoi(errcode) if valid_html_p == '': valid_html_p = None else: valid_html_p = string.atoi(valid_html_p) duration = string.atof(duration) timestamp = time.mktime(time.strptime(timestamp, '%Y-%m-%d %H:%M:%S')) result.append(site_sample(path_chain, form_data, prev_path, errcode, errmsg, valid_html_p, duration, timestamp)) return result def write_log_sample(f, i, s): if s.prev_path == None: prev_path = '' else: prev_path = s.prev_path if s.errmsg == None: errmsg = '' else: errmsg = re.sub(',', ';', s.errmsg) if s.valid_html_p == None: valid = '' else: valid = '%d' % s.valid_html_p f.write('%d,%s,%s,%s,%s,%d,%s,%s,%f,%s\n' % \ (i, s.path_chain[0], s.form_data, string.join(s.path_chain[1:], ';'), prev_path, s.errcode, errmsg, valid, s.duration, time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(int(s.timestamp))))) def write_log_file(filename, samples): fout = open(filename, 'w') i = 0 for s in samples: write_log_sample(fout, i, s) i = i + 1 fout.close() ###################################################################### # 3) Definitions related to the HTTPUserAgent which acts as a # sort of 'virtual browser' that handles much of the # annoying bookkeeping (cookies, redirects) associated with # fetching pages from a site. # class CookieDict: """class CookieDict() A 'dictionary' of cookies. We don't worry about things like persistent or secure cookies, or multiple domains, because we don't need them for profiling. Yet. """ def __init__(self): self.paths = {} def set(self, path, name, value): if path == '' or path == None: path = '/' if self.paths.has_key(path): path_values = self.paths[path] else: path_values = {} self.paths[path] = path_values path_values[name] = value def get(self, path): result = [] for junk, pv in filter(lambda x, p=path: p[0:len(x[0])] == x[0], self.paths.items()): result.extend(pv.items()) return result class HTTPUserAgent: """class HTTPUserAgent(host, debug=0) Class to act as an HTTP user agent. Is responsible for loading pages, following redirects and handling cookies. A single instance of this class can only cope with a single host. The debug parameter is a boolean indicating whether or not debug messages should be printed. The business end of this class is the method load_page. """ def __init__(self, host, debug=0): self.host = host self.cookie_re = re.compile('([^=]+)=([^;]*);.*(?:path=([^;]);|).*') self.cookies = CookieDict() self.debug = debug self.conn = httplib.HTTP() self.conn.set_debuglevel(self.debug) # Internal function, needed so that we can do the actual fetch in # a seperate thread. # conn is an httplib.HTTP object, path is the path to fetch def __fetch_page(self, cookie_str, path, result): # Make the request and read the response try: self.conn.connect(self.host) self.conn.putrequest('GET', path) if cookie_str <> '': self.conn.putheader('Cookie', cookie_str) self.conn.putheader('Host', str(self.host)) self.conn.endheaders() errcode, errmsg, headers = self.conn.getreply() f = self.conn.getfile() html = f.read() f.close() result.append((errcode, errmsg, headers, html)) except KeyboardInterrupt: result.append((-10, 'Ctl-C pressed', None, None)) except exceptions.Exception, x: if self.debug: print 'ERROR CONNECTING:', path, x result.append((-11, 'exception during fetch: ' + x.__str__(), None, None)) def load_page(self, url): """errcode, errmsg, url_chain, html = load_page(self, path) Loads a single specified page from the host specified when the user agent was instantiated. Will follow redirects and only return the html associated with the last one, but will return all links in redirect chain in url_chain. Pretty dumb about redirects: 2 URLs that redirect to each other will make it loop infinitely. The errcode is the HTTP result code, if an HTTP connection was made. If no connection was made, or an off-host redirect was encountered or some other exception occured, the errcode will be negative: -1 : HTTP library returned error -11 : exception caught while making HTTP connection -12 : page load timed out (took more than 30 seconds) -13 : redirect to another host encountered """ url_chain = [] while 1: # Allocate storage for result result = [] # Build a string of cookies to go with the request cookie_str = string.join(map(lambda c: c[0]+'='+c[1], self.cookies.get(url)), '; ') # The connection seems to hang sometimes, so we do the fetch in # a seperate thread and give it 1 minute to complete. Longer # than that, and we assume some sort of error has occured. fetch_thread = threading.Thread(None, self.__fetch_page, None, (cookie_str, url, result)) fetch_thread.setDaemon(1) fetch_thread.start() fetch_thread.join(60) if fetch_thread.isAlive(): url_chain.append(url) return (-12, 'page load timed out', tuple(url_chain), None) errcode, errmsg, headers, html = result[0] if errcode == -10: raise KeyboardInterrupt # Process a Set-Cookie header, if any. Gotta check errcode # before headers, because headers will be None if errcode <= 0. if errcode > 0 and headers.has_key('Set-Cookie'): cookie_match = self.cookie_re.match(headers['Set-Cookie']) if cookie_match <> None: key,value,path = cookie_match.groups() if self.debug: print 'SETTING COOKIE:', key, value, path self.cookies.set(path,key,value) else: if self.debug: print 'BAD SET-COOKIE HEADER: "' + \ headers['Set-Cookie'] + '"' # Process a Location header, if any: split into host and URL if errcode > 0 and headers.has_key('Location'): location = headers['Location'] scheme, hdr_host, path, param, query, frag = \ urlparse.urlparse(location) # We assume that no match means a URL without a scheme and # host given in location header (illegal by HTTP specs). if string.lower(scheme) not in ('http',) or not hdr_host: if self.debug: print 'BAD LOCATION HEADER:', url, location url_chain.append(url) return (-14, 'Bad Location Header', tuple(url_chain) + (location,), None) # Reconstruct the URL minus host and scheme hdr_url = urlparse.urlunparse(('','',path,param,query,frag)) else: location = None hdr_host = self.host hdr_url = url # If this wasn't a redirect, then the URL in the header # (if any) is the 'exact' location for the requested object, # and should be appended to the URL chain instead of whatever # got passed in as the url. And since it's not a redirect, # we're done looping at this point, so we break... if errcode != 302: url_chain.append(hdr_url) break if location == None: return (-15, 'Redirect W/O Location', tuple(url_chain), None) # This was a redirect, so we need to go ahead and stuff the # old url into url_chain, because the one given by the Location # header is the location we're redirecting to... url_chain.append(url) if hdr_host <> self.host: return (-13, 'off-host redirect', tuple(url_chain), None) url = hdr_url return (errcode, errmsg, tuple(url_chain), html) ###################################################################### # 4) site_sampler and profile_site - a callable class and a control # function that use an HTTPUserAgent to crawl a site # class site_sampler: """class site_sampler(browser, outfile=None, validator=None): This class bundles together several actions that happen every time a page is loaded from the site into a single callable: The browser loads the page, the validator is run on the HTML, a site_sample is constructed from the data. The sample is written to an output file (if any). Calling an instance of the class returns a tuple containing the site_sample instance and the html of the page.""" def __init__(self, browser, outfile=None, validator=None): self.browser = browser self.validator = validator self.outfile = outfile self.i = 0 def __call__(self, path, prev_path): start_time = time.time() errcode, errmsg, path_chain, html = self.browser.load_page(path) duration = time.time() - start_time if self.validator: valid_html_p = self.validator(html) else: valid_html_p = None s = site_sample(path_chain, '', prev_path, errcode, errmsg, valid_html_p, duration, int(start_time)) if self.outfile: write_log_sample(self.outfile, self.i, s) self.i = self.i + 1 return (s, html) def profile_site(start_path, fetches, sample_site, get_links, reduce_path): """samples = profile_site(start_path fetches, sample_site, get_links, reduce_path): Crawls around a site, starting at the specified start_path, storing statistics about page load times. The fetches parameter specifies how many pages are to be loaded in total. Pages are retrieved from the site using the sample_site callable. This callable should accept a URL to load (minus any scheme or host information, since that data should somehow already be known to the sample_site callable) and return a tuple containing the sample_site instance and the page's HTML, in that order. The get_links callable is used to parse the links from each page's HTML: the callable should accept the path of the page containing the HTML, and the HTML itself. It should return a list of URLs, minus the scheme or host. reduce_path is a callable that is used is used to map from each page's url to some other set of distinct values. The crawler will attempt to get a set of statistics that is evenly balanced among these values. One obvious reduction function is to simply strip off any query string. This function returns a list of site_sample objects, each representing a fetch (or attempted fetch) from the site. Note that in the event of a redirect or series of redirects, all links in the redirect chain will be combined into a single sample. """ samples = [] fetch_counts = {reduce_path(start_path):0} path = start_path prev_path = None try: for i in range(0, fetches): # Sample the site s, html = sample_site(path, prev_path) samples.append(s) # increment counter for profile balancing reduced_path = reduce_path(path) try: fetch_counts[reduced_path] = fetch_counts[reduced_path] + 1 except KeyError: fetch_counts[reduced_path] = 1 # check for error returns if s.errcode != 200: print i, s.path_chain, 'ERROR:', s.errcode, s.errmsg prev_path = None path = start_path continue # every 100th hit, return to start path (part of balancing - # help to ensure that we don't get trapped in some small # subset of the site) if (i % 100) == 0: print i, s.path_chain, 'RETURNING TO', start_path prev_path = None path = start_path continue # get links from just-fetched page links = get_links(s.path_chain[-1], html) if len(links) == 0: print i, s.path_chain, 'NO LINKS, RETURNING TO', start_path prev_path = None path = start_path continue # build list containing all links with minimum hits # (more balancing) min_w = -1 for l in links: r = reduce_path(l) w = fetch_counts.get(r, 0) if w < min_w or min_w == -1: min_w = w new_path_list = [l] elif w == min_w: new_path_list.append(l) # pick randomly from links with minimum hits new_path = new_path_list[random.randint(0,len(new_path_list)-1)] print i, s.path_chain, 'FOLLOWING LINK TO', new_path prev_path = path path = new_path except KeyboardInterrupt: pass return samples ###################################################################### # 5) A procedure that takes a set of samples generated by the procedures # part 4 or 5 and generates statistics on it. # def build_statistics(samples, group_func): """statistics = build_statistics(samples, group_func): Partitions the site_samples in the list samples into groups using the provided group_func, which is applied to the first entry in each sample's path chain. (The first page fetched in the redirect chain, if any.) Then computes statistics on each group, and returns a list of these statistics. The returned list contains a set of tuples of the form (, , , , ) """ results = [] grouped_samples = {} for s in samples: rp = group_func(s.path_chain[0]) if not grouped_samples.has_key(rp): grouped_samples[rp] = ([], []) if s.errcode == 200: grouped_samples[rp][1].append(s.duration) else: grouped_samples[rp][0].append(s.errcode) for key, (errors, time_list) in grouped_samples.items(): mean, stddev = mean_stddev(time_list) results.append((mean, len(time_list), stddev, len(errors), key)) return results ###################################################################### # 6) A set of relatively small and simple classes and functions used # to define callable plug-ins that define aspects of how the # procedures in parts 4 and 5 work. # Functions for path reduction # Simple function that, given a URL, strips off any parameters, query # string, or fragment at the end def strip_url_vars(url): scheme, netloc, path, param, query, frag = urlparse.urlparse(url) return urlparse.urlunparse((scheme, netloc, path, '', '', '')) # A more complex URL processor: this one strips URL variables, then applies # the provided regexp. If a match is found, all non-None groups in the # match are concatenated together, seperated by a *. class strip_url_vars_and_group: "class strip_url_vars: Callable - give it a URL, it strips off URL vars." def __init__(self, group_re): self.group_re = re.compile(group_re) def __call__(self, url): url = strip_url_vars(url) group_match = self.group_re.match(url) if group_match <> None: url = string.join(filter(lambda x: x, group_match.groups()), '*') return url # An extended HTML parser that handles forms # (UNTESTED AND CURRENTLY UNUSED) def simplify_tag_attrs(attrs): attrs = collect(lambda a: a[0], attrs) for k, v in attrs.items(): attrs[k] = v[-1][1] return attrs class MyHTMLParser(htmllib.HTMLParser): def __init__(self): HTMLParser.__init__(self, formatter.NullFormatter()) self.forms = [] self.in_form_p = None self.in_select_p = None self.in_textarea_p = None def start_form(self, attrs): if self.in_form_p: self.end_form() attrs = simplify_tag_attrs(attrs) self.in_form_p = 1 self.form_action = attrs.get('action', None) self.form_method = attrs.get('method', 'POST') self.form_enctype = attrs.get('enctype', 'application/x-www-form-urlencoded') self.form_input_text = [] self.form_input_password = [] self.form_input_checkbox = [] self.form_input_radio = {} self.form_input_submit = [] self.form_input_image = [] self.form_input_file = [] self.form_input_hidden = [] self.form_select = [] self.form_textarea = [] def do_input(self, attrs): if not in_form_p: return attrs = simplify_tag_attrs(attrs) type = string.lower(attrs['type']) name = attrs.get('name', None) value = attrs.get('value', None) try: maxlength = int(attrs.get('maxlength', 'a')) if maxlength <= 0: maxlength = None except ValueError: maxlength = None if type == 'text': if name: self.form_input_text.append((name, value, maxlength)) elif type == 'password': if name: self.form_input_password.append((name, maxlength)) elif type in ('checkbox', 'radio'): checked = attrs.has_key('checked') if name: if type == 'checkbox': self.form_input_checkbox.append((name, value, checked)) else: if not self.form_input_radio.has_key(name): self.form_input_radio[name] = [] self.form_input_radio[name].append((value, checked)) elif type == 'submit': self.form_input_submit.append((name, value)) elif type == 'image': self.form_input_image.append((name, value)) elif type == 'file': accept = attrs.get('accept', None) if accept: accept = string.split('accept', ',') if name: self.form_input_file.append((name, maxlength, accept)) elif type == 'hidden': if name: self.form_input_hidden.append((name, value)) def start_select(self, attrs): if self.in_select_p: self.end_select() attrs = simplify_tag_attrs(attrs) self.in_select_p = 1 self.select_name = attrs.get('name', None) self.select_multiple = attrs.has_key('multiple') self.select_option = [] def do_option(self, attrs): attrs = simplify_tag_attrs(attrs) value = attrs.get('value', None) checked = attrs.has_key('selected') self.select_option.append((value, checked)) def end_select(self): if self.select_name: self.form_select.append((self.select_name, self.select_multiple, self.select_option)) self.in_select_p = None del self.select_name del self.select_multiple del self.select_option def do_textarea(self, attrs): attrs = simplify_tag_attrs(attrs) name = attrs.get('name', None) rows = attrs.get('rows', None) cols = attrs.get('cols', None) if name: self.form_textarea.append((name, rows, cols)) def end_form(self): if self.form_action: self.forms.append((self.form_action, self.form_method, self.form_enctype, self.form_input_text, self.form_input_password, self.form_input_checkbox, self.form_input_radio, self.form_input_submit, self.form_input_image, self.form_input_file, self.form_input_hidden, self.form_select, self.form_textarea)) self.in_form_p = None del self.form_action del self.form_method del self.form_enctype del self.form_input_text del self.form_input_password del self.form_input_checkbox del self.form_input_radio del self.form_input_submit del self.form_input_image del self.form_input_file del self.form_input_hidden del self.form_select del self.form_textarea # Classes for stripping links out of the HTML returned by a page class local_links: """class local_links(host): Callable - given HTML, returns list of links local to host. """ def __init__(self, host): self.host = host def __call__(self, root, html): parser = htmllib.HTMLParser(formatter.NullFormatter()) parser.feed(html) parser.close() # Parse URLs into their component parts l = map(lambda a: urlparse.urlparse(a, 'http'), parser.anchorlist) # Filter out HREFs that aren't http or pointing at this host l = filter(lambda a, h=self.host: a[0] == 'http' and a[1] in (h, ''), l) # Convert URLs back to strings, dropping scheme, host, and fragment l = map(lambda a: urlparse.urlunparse(('','',a[2] or '/',a[3],a[4],'')), l) # Do a join to the root to handle relative URLs l = map(lambda a, r=root: urlparse.urljoin(r, a), l) return l class filtered_local_links(local_links): """class filtered_local_links(host, legal_url_regex): Callable, derived from local links. Works like local_links, except it filters out paths that don't match the provided regex. """ def __init__(self, host, illegal_url_regex): local_links.__init__(self, host) self.filter_re = re.compile(illegal_url_regex) def __call__(self, root, html): return filter(lambda x, f=self.filter_re.match: not f(x), local_links.__call__(self, root, html)) ################ # MAIN PROGRAM # ################ def display_usage_info(): script_filename = re.match('.*?([^/]*)$', sys.argv[0]).group(1) print """ Usage: %(script)s -profile [] [] %(script)s -report [+] %(script)s -help """ % {'script':script_filename} def process_logon_args(n): if len(sys.argv) < n+1: return (None, n+0, 0) if sys.argv[n] == '-acs': logon_url = '/register/user-login.tcl?' + \ urllib.urlencode({'email':sys.argv[n+1], 'password_from_form':sys.argv[n+2]}) return (logon_url, n+3, 1) elif sys.argv[n] == '-generic': return (sys.argv[n+1], n+2, 0) else: return (None, n+0, 0) # redirect output of print to stderr until we get to 'real' output # (i.e. either a report or the help listing) sys.stdout = sys.__stderr__ if len(sys.argv) < 2: display_usage_info() sys.exit() if sys.argv[1] == '-profile': try: # grab the first 4 arguments host = sys.argv[2] start_path = sys.argv[3] fetches = string.atoi(sys.argv[4]) outfile = sys.argv[5] logon_url, filter_arg, acs_logon = process_logon_args(6) if len(sys.argv) > filter_arg: if acs_logon: link_filter_re = '/register|' + sys.argv[filter_arg] else: link_filter_re = sys.argv[filter_arg] link_filter = filtered_local_links(host, link_filter_re) else: if acs_logon: link_filter = filtered_local_links(host, '/register') else: link_filter = local_links(host) except IndexError: display_usage_info() sys.exit() fout = open(outfile, 'w') samples = [] sampler_func = site_sampler(HTTPUserAgent(host, 0), fout) if logon_url <> None: print 'LOGGING ON' fetches = fetches - 1 s, html = sampler_func(logon_url, None) samples.append(s) print s.__dict__, html if s.errcode <> 200: print s.path_chain, 'LOGIN ERROR:', s.errcode, s.errmsg try: samples.extend(profile_site(start_path, fetches, sampler_func, link_filter, strip_url_vars)) except KeyboardInterrupt: print 'User Interrupt - Generating Partial Statistics & Logfile' fout.close() group_re = '(?!)' elif sys.argv[1] == '-report': try: group_re = sys.argv[2] sample_files = sys.argv[3:] except IndexError: display_usage_info() sys.exit() samples = [] for filename in sample_files: samples.extend(read_log_file(filename)) if len(samples) == 0: print 'No samples found in specified files.' sys.exit() elif sys.argv[1] == '-help': sys.stdout = sys.__stdout__ script_filename = re.match('.*?([^/]*)$', sys.argv[0]).group(1) print """ Usage: %(script)s -profile [] [] %(script)s -report [+] %(script)s -help -profile [] [] =============================================================== In this mode, the program crawls over the site at http:/// for the number of hits specified in . It records a raw log of its activity in . It also prints a report containing some statistics to standard output. can be used to specify logon information that will be used to log on as a particular user of the site to be profiled. It takes one of two forms: -acs In this case, the program assumes it is dealing with a standard ACS site, and attempts to log on to the site by passing the specified and as URL variables to /register/user-login.tcl, then following redirects and setting cookies. -generic Given a logon spec of this form, the program attempts to log on to the site by executing a 'GET' on the specified URL, which would presumably contain URL variables specifying logon information required by the site. Finally, is a Perl-style regular expression. The crawler will avoid following links whose paths (minus hostname, etc.) match the provided regex. This is useful for preventing the crawler from wasting time on pages that you aren't interested in testing. And it's also useful for preventing the crawler from following links that will cause it to be logged out. For this reason, if you specify a -acs logon, \"/register\" is automatically added to any regex you provide. -report + ==================================== In this mode, the program imports all the profile log files specified in + (these are files whose names were specified as to -profile or -loadtest and generates a set of statistics on the union of all data therein. The samples are grouped by a string that is obtained by first stripping off any URL variables from the path, then attempting to match them against . Paths that match this regex have the groups generated by the match concatenated together with '*' as a seperator. So, for example, if you were to run: %(script)s -report \"(/store/)[^/](/.*|)\" log.csv Then samples of these URLs: /store/hitime/tasting-group.html?tasting_group_id=51 /store/moorebros/tasting-group.html /store/greenville/tasting-group.html?group_type=red Would all be grouped under the heading \"/store/*/tasting-group.html\" for the purpose of computing statistics. Notes On Log File Format ======================== Finally, for those of you who want to manipulate the logfile data on your own, here's the format: Each line in the file represents a single sample and is of the form: ,,
,,,,,,, - a simple sequence number recording the order in which hits occured - path of the fetch, minus \"http://\" - form data, stored in URL-encoded form - a redirect chain, semicolon-seperated - page on which the link to was found (maybe empty) - a result code. Positive numbers are HTTP return codes, negatives indicate errors connecting: -1, -11 : an error occured connecting, no further info -12 : timeout (> 60 seconds) fetching page -13 : encountered redirect to another host -14 : Bad Location header -15 : redirect W/O Location header - a text error message - 0 if HTML failed some sort of validation, 1 if it passed, empty string if no validation performed. Currently the only 'validation' done is a check for unevaluated evaluated '<% %>' ADP tags. - length of time it took to fetch page, including all links in the redirect chain, if any - a timestamp telling when the fetch was initiated, in in the format \"YYYY-MM-DD HH:MI:SS\", with a 24-hour clock. Greenwich mean time is used. """ % {'script':script_filename} sys.exit() else: display_usage_info() sys.exit() sys.stdout = sys.__stdout__ statistics = build_statistics(samples, strip_url_vars_and_group(group_re)) statistics.sort() statistics.reverse() print ' Mean Hits Std Dev Errs URL' print '---------+------+----------+------+------------------------------------' for mean,count,stddev,errs,path in statistics: if stddev == None: stddev_str = ' N/A ' else: stddev_str = '%8.4f' % (stddev,) if mean == None: mean_str = ' N/A ' else: mean_str = '%8.4f' % (mean,) print '%s | %4d | %s | %4d | %s' % \ (mean_str, count, stddev_str, errs, path) # Hopefully this will kill off any zombie connection threads # hanging around, so that the program won't hang waiting for # a server response that's never going to happen. sys.exit()