# Revision 7614 (by gradha, 2006/12/02 18:13:42) Whitespace cleanup.
#!/usr/bin/env python
# -*- mode:Python; tab-width: 3 -*-
"""
This program obtains DJGPP's ZIP picker web page and parses it
to obtain a list of download mirrors. Then it updates the specified
template file with the new list, but only if the hash of the
retrieved data doesn't match the one previously stored there.

This script was written by Grzegorz Adam Hankiewicz and is Giftware:
you are free to do what you want with it without any restriction. I
do not accept responsibility for any effects, adverse or otherwise,
that this script may have on you, your computer, your sanity,
your dog, and anything else that you can think of. Use it at your
own risk.

$Id: update-mirror-list.py 7614 2006-12-02 18:13:42Z gradha $
"""


import getopt
import md5
import os
import re
import singleton
import sys
import urllib


# getopt specifications of the accepted command line switches.
short_arguments = "hvf:"
long_arguments = ["help", "version", "file="]
# Text printed by the -v/--version switch.
version = "update-mirror-list 0.2"
# HTML comments delimiting the automatically generated section of the
# target file. The start tag is left open on purpose: the md5 hash of
# the mirror data and the closing " -->" follow it in the written file.
start_comment_tag = r"<!-- begin automatically updated mirror section: "
end_comment_tag = r"<!-- end automatically updated mirror section -->"
# Web page which is scanned for the DJGPP download mirrors.
MAIN_URL = "http://www.delorie.com/djgpp/zip-picker.html"
# Matches one mirror entry at the start of a line: group 1 is the full
# URL, group 2 the protocol, group 3 the host name and group 4 whatever
# follows the tag on that line.
REGEXP = re.compile(r'<option value="((ftp|http)://([^/]+)/[^"]+)">(.+)')


def show_program_usage(argv_zero = "", exit_code = 0):
   """Prints the command line help to standard output and exits.

   argv_zero is the first entry of sys.argv; only its last path
   component is shown as the name of the program. exit_code is the
   status handed to sys.exit(), so this function never returns.
   """
   program = os.path.split(argv_zero)[1]
   # A single write() call replaces the print statements: it produces
   # the very same text and is spelled identically under every Python
   # version, while the print statement only exists in Python 2.
   sys.stdout.write(
      "Usage: %s -f source_file_with_mirror_information [-hv]\n\n"
      "-h, --help          Shows this help message\n"
      "-v, --version       Displays version and exits\n"
      "-f, --file=xxx      The file to be updated with the mirror list\n"
      "\n" % program)
   sys.exit(exit_code)


def process_arguments(arguments):
   """Processes the arguments used to invoke the program.

   arguments is the full argument vector (like sys.argv), its first
   entry being the name of the program. Returns the file name given
   through -f/--file. Positional arguments are ignored.

   The process exits through sys.exit() instead of returning when the
   help or the version are requested (status 0), and when the
   arguments can't be parsed or no file name was given (an error
   message and the usage are printed, status 1).
   """
   write = sys.stdout.write
   try:
      optlist = getopt.getopt(arguments[1:], short_arguments, long_arguments)[0]
   except getopt.error:
      # The exception is fetched through sys.exc_info() because the
      # "except X, name" form is a syntax error outside Python 2 and
      # this spelling works everywhere.
      msg = sys.exc_info()[1]
      # The space before the line break reproduces what the former
      # print statement used to output.
      write("Error parsing arguments:\n%s \n\n" % (msg,))
      show_program_usage(arguments[0], 1)

   file_name = ""
   for option, value in optlist:
      if option in ("--help", "-h"):
         show_program_usage(arguments[0], 0)
      elif option in ("--version", "-v"):
         write("%s\n" % version)
         sys.exit(0)
      elif option in ("--file", "-f"):
         file_name = value
      else:
         # Defensive branch for a switch without a handler above.
         write("Option '%s' not recognized\n\n" % option)
         show_program_usage(arguments[0], 1)

   if not file_name:
      write("You need to specify the file where I'll update the mirror list\n")
      show_program_usage(arguments[0], 1)

   return file_name


def load_and_split_source(file_name):
   """load_and_split_source(file_name) -> hash, 3 line lists tuple

   Loads the specified file and splits its lines around the marking
   comments which delimit the automatically generated text section.
   The returned tuple contains three line lists: header, body and
   footer. The header and footer are the static parts of the source
   file and they don't have to be modified. body is the bunch of
   lines found between the begin and end HTML comments, not including
   the lines of the comments themselves. hash is the 32 character md5
   hash retrieved from the comment which starts the body section.

   Both comments are only recognized at the very beginning of a line.
   Raises ValueError if the begin and end comments are not both found,
   in that order.
   """

   start_exp = re.compile(start_comment_tag +
      r"\s*(?P<hash>\w{32})")
   end_exp = re.compile(end_comment_tag)
   sections = ([], [], [])
   selected = 0
   stored_hash = "x" * 32

   source = open(file_name, "rt")
   try:
      for line in source:
         if selected == 0:
            # first look for the beginning comment and extract its hash;
            # the pattern only matches when the 32 characters are there
            found = start_exp.match(line)
            if found:
               stored_hash = found.group("hash")
               selected = 1
               continue
         elif selected == 1 and end_exp.match(line):
            # then look for the end comment
            selected = 2
            continue

         # meanwhile add the rest of the lines to the appropriate section
         sections[selected].append(line)
   finally:
      # release the handle even if reading fails half way
      source.close()

   if selected != 2:
      # A real exception class is needed here: raising a plain string
      # is itself a TypeError on Python 2.6 and later.
      raise ValueError(
         "Parsing error, didn't detect three sections in %s" % file_name)
   return stored_hash, sections


def get_djgpp_mirrors():
   """get_djgpp_mirrors() -> [(url, site, description), ...]

   Downloads MAIN_URL and builds one tuple for every line of the page
   which REGEXP matches. url is the address captured from the page
   with "v2tk/allegro/" appended to it, site is the protocol and the
   host name joined by a dot, and description is the text following
   the option tag on that line. The connection is closed before
   returning, also when reading fails.
   """
   page = urllib.urlopen(MAIN_URL)
   try:
      found = []
      # readline() hands back an empty string once the page is exhausted
      for text in iter(page.readline, ""):
         entry = REGEXP.match(text)
         if not entry:
            continue
         url = entry.group(1) + "v2tk/allegro/"
         site = ".".join((entry.group(2), entry.group(3)))
         found.append((url, site, entry.group(4)))
      return found
   finally:
      page.close()


def calculate_md5(data):
   """f([(a, b, c), ...]) -> string with md5 hash

   Given a list of tuples with three elements, concatenates all the
   elements in order and without separators, and returns the
   hexadecimal md5 hash of the resulting text. Text strings are
   encoded as UTF-8 before being hashed, byte strings are hashed as
   they are.
   """
   # hashlib replaces the deprecated md5 module and yields the same
   # digests. It is imported here so the function carries its own
   # dependency.
   import hashlib

   payload = "".join(["%s%s%s" % tuple(triplet) for triplet in data])
   if not isinstance(payload, bytes):
      # md5 works on bytes; an explicit encoding also copes with
      # characters outside of ASCII
      payload = payload.encode("utf-8")
   return hashlib.md5(payload).hexdigest()


def update_source_file(file_name, hash, mirrors, header, footer):
   """Updates file_name with the given data.

   Overwrites file_name with, in this order: the header lines, the
   begin comment carrying hash, the lines generated from mirrors by
   generate_html_code(), the end comment and the footer lines. header
   and footer are lists of lines which already include their line
   terminators. The file is closed even if writing fails.
   """

   # Generate the body before opening the file: opening for writing
   # truncates it, so a failure at this point leaves the file as it was.
   body = generate_html_code(mirrors)
   output = open(file_name, "wt")
   try:
      output.writelines(header)
      output.write("%s%s -->\n" % (start_comment_tag, hash))
      output.writelines(body)
      output.write("%s\n" % end_comment_tag)
      output.writelines(footer)
   finally:
      output.close()


def generate_html_code(mirrors):
   """generate_html_code([(url, site, description), ...]) -> list of strings

   Given a list of tuples in the form (url, site, description),
   returns the list of newline terminated strings which compose the
   generated section. First comes one "#!- <site> description" line
   per mirror (the defines/macros) followed by the include-data line.
   Then, the links are written as a table with one row per mirror,
   where the rows alternate between the "row1" and "row2" classes to
   use different colors, starting with "row1".

   The values are inserted verbatim, no HTML escaping is applied.
   """

   row_classes = ('class="row1"', 'class="row2"')

   lines = ["#!- <%s> %s\n" % (site, description)
      for dummy, site, description in mirrors]
   lines.append("<include-data=simtel/simtel:replace_default_values>\n")

   lines.append("<table width=\"100%\">\n")
   for index, (url, site, dummy) in enumerate(mirrors):
      # even rows use the first class, odd rows the second one
      lines.append('<tr %s><td><a href="%s"><%s></a><br></td></tr>\n' %
         (row_classes[index % 2], url, site))
   lines.append("</table>\n")
   return lines


def main(argv = None):
   """Entry point of the script.

   argv is the argument vector to process, sys.argv when omitted.
   Loads the file named on the command line, retrieves the current
   list of mirrors and rewrites the generated section of the file
   only when the hash of the retrieved list differs from the one
   stored in the file. The file is left untouched when no mirror at
   all could be retrieved.
   """
   if argv is None:
      argv = sys.argv
   file_name = process_arguments(argv)

   old_md5, section = load_and_split_source(file_name)

   mirrors = get_djgpp_mirrors()
   if not mirrors:
      # An empty result hashes differently from the stored list, so
      # without this check a page which could not be parsed would wipe
      # all the mirrors from the file.
      sys.stderr.write("No mirrors found at %s, leaving %s untouched\n" %
         (MAIN_URL, file_name))
      return

   new_md5 = calculate_md5(mirrors)

   if old_md5 != new_md5:
      sys.stdout.write("Updating\n")
      update_source_file(file_name, new_md5, mirrors, section[0], section[2])


if __name__ == "__main__":
   # Protect the entry point with a file lock: main is not called
   # directly but handed over to the singleton helper module.
   singleton.run_if_possible(main)