# Revision 7616 (by gradha, 2006/12/02 18:59:24) Updated binary files.
#!/usr/bin/env python
# -*- mode:Python; tab-width: 3 -*-

"""mirror.py, a small tool to mirror files.

This program was created to fulfill the following tasks for the Allegro web
page:

- Distribution of binary files over several mirrors
- Different mirrors should be able to mirror different sets of files
- All mirroring should be automatic

This script will open a configuration file which contains the data about
the files to mirror. Then, according to this information and to the files
which are found locally, it should know what to download/ignore/update.

This script was written by Grzegorz Adam Hankiewicz and is Giftware: you
are free to do what you want with it without any restriction. I do not
accept responsibility for any effects, adverse or otherwise, that this
script may have on you, your computer, your sanity, your dog, and
anything else that you can think of. Use it at your own risk.

The file doesn't use tabs, each indentation level is three space characters.
"""

import getopt
import md5
import os
import random
import re
import sys
import time
import urllib
import urlparse


# Option specification for getopt. A trailing ':' (short form) or '=' (long
# form) marks the switches which require a value; both forms of a switch
# have to agree, otherwise --local-conf=x is rejected while -l x works.
short_arguments = "hvc:o:l:ft:d:r"
long_arguments = ["help", "version", "configuration=", "output-dir=",
   "local-conf=", "fake", "timestamp=", "data=", "random-test"]
version = "Mirror.py 0.2"
# Tags describing the local mirror, filled by load_configuration().
mirror_type = []
# Local paths which main() removes once the mirroring has finished.
delete_list = []


def load_configuration(filename):
   """Reads a .ini file and stores its mirror_type tags.

   The raw value of the option mirror_type found in the section [global]
   is split on whitespace and the resulting words are appended to the
   global list mirror_type. A file without that section or option leaves
   the list untouched.
   """
   import ConfigParser
   parser = ConfigParser.ConfigParser()
   parser.read(filename)

   try:
      # The third parameter requests the raw value, without interpolation.
      tags = parser.get("global", "mirror_type", 1)
   except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
      return

   mirror_type.extend(tags.split())



def show_program_usage(argv_zero = "", exit_code = 0):
   """Simple function which explains the commandline switches and exits.

   argv_zero is the first entry in sys.argv, it's used to extract the name
   of the script being run from the commandline. exit_code is the code
   which will be returned to the OS.
   """
   print "Usage: %s -d data -o output-dir [-hvl file -c configuration]\n" % os.path.split(argv_zero)[1]
   print "-h, --help              Shows this help message"
   print "-v, --version           Displays version and exits"
   print "-c x, --configuration=x Use xxx as configuration file (.ini)"
   print "-d x, --data=x          Data about files to be mirrored"
   print "-o x, --output-dir=x    Where binary files will be placed"
   print "-l x, --local-conf=x    File containing behaviour preferences"
   print "-f, --fake              Fake process, do not remove/create files"
   print "-t x, --timestamp=x     Create/modify x if something changed"
   print "-r, --random-test       Downloads a random file to test script"
   print
   sys.exit(exit_code)



def load_mirror_data(data_file, base_dir = ""):
   """Loads the configuration file and returns a list of files to mirror.

   data_file is run as Python code through execfile(), so it has to come
   from a trusted source. Presumably it fills the local lists mirror_files
   and delete in place (TODO confirm against a real data file).

   Every entry of mirror_files is indexed from 0 to 5; main() unpacks the
   entries as (nick, url, size, hash, location, type). Optionally, if
   base_dir is not empty, it will prefix the location (index 4) of all
   entries. The paths found in delete are joined with base_dir and
   appended to the global delete_list.
   """
   mirror_files = []
   delete = []
   # NOTE(review): execfile() executes arbitrary code from data_file. The
   # names mirror_files and delete look like the interface offered to that
   # file, don't rename them without checking the existing data files.
   execfile(data_file)
   if base_dir:
      # Rebuild each entry, only the location (index 4) gets the prefix.
      for f in range(len(mirror_files)):
         mirror_files[f] = (mirror_files[f][0], mirror_files[f][1],
            mirror_files[f][2], mirror_files[f][3],
            os.path.join(base_dir, mirror_files[f][4]),
            mirror_files[f][5])
   # Unlike the entries above, these paths are joined even if base_dir is
   # empty; os.path.join is called with base_dir as first component.
   for f in delete:
      delete_list.append(os.path.join(base_dir, f))
   return mirror_files



def process_arguments(arguments):
   """Parses the commandline and returns the settings chosen by the user.

   arguments is a sys.argv style list, its first entry is the name of the
   script and the rest are the switches to parse.

   Returns the tuple data_file, output_directory, local_configuration,
   faking, timestamp, random_test:
   - data_file is the file with the information about the files to
     mirror and their parameters. Mandatory.
   - output_directory is where the files should be placed. Mandatory.
   - local_configuration is None or a file with custom parameters to
     control the mirroring.
   - faking is true if no modifications should take place.
   - timestamp is None or the filename to modify when there are changes
     to the mirrored files.
   - random_test is true if the script should skip the creation of local
     files and download a single random file, ignoring the server
     configuration. Used for unit testing.

   The help and version switches terminate the script at once. The
   configuration switch is handed over to load_configuration() as soon as
   it is found. Wrong or missing switches show the usage and exit with
   code 1.
   """
   def bail_out(exit_code):
      show_program_usage(arguments[0], exit_code)

   try:
      parsed, leftovers = getopt.getopt(arguments[1:], short_arguments,
         long_arguments)
   except getopt.error, msg:
      print "Error parsing arguments:\n", msg, "\n"
      bail_out(1)

   data_file = output_dir = local_conf = timestamp = None
   faking = random_test = None
   for flag, parameter in parsed:
      if flag in ("-h", "--help"):
         bail_out(0)
      elif flag in ("-v", "--version"):
         print version
         sys.exit(0)
      elif flag in ("-c", "--configuration"):
         load_configuration(parameter)
      elif flag in ("-d", "--data"):
         data_file = parameter
      elif flag in ("-o", "--output-dir"):
         output_dir = parameter
      elif flag in ("-l", "--local-conf"):
         local_conf = parameter
      elif flag in ("-t", "--timestamp"):
         timestamp = parameter
      elif flag in ("-f", "--fake"):
         faking = 1
      elif flag in ("-r", "--random-test"):
         random_test = 1
      else:
         print "Option '%s' not recognized\n" % flag
         bail_out(1)

   # Both switches are mandatory, complain about the first one missing.
   mandatory = (
      (data_file, "A data file with mirror information is required"),
      (output_dir, "You have to tell me where to put the binary files"),
      )
   for given, complaint in mandatory:
      if not given:
         print complaint
         bail_out(1)

   return data_file, output_dir, local_conf, faking, timestamp, random_test



def get_md5sum(data):
   """Returns the md5sum hex string according to the data provided.

   data is the byte string to digest. The hashlib module is preferred
   because the standalone md5 module is deprecated since Python 2.5; the
   old module is still used on interpreters which don't provide hashlib.
   """
   try:
      import hashlib
   except ImportError:
      return md5.new(data).hexdigest()
   return hashlib.md5(data).hexdigest()



def should_fetch_file(size, hash, location, type):
   """Process a mirror file entry.

   size and hash are the expected length in bytes and md5 hex digest of
   the file, location is the local path where it is stored and type is
   the sequence of option strings of the entry:
   - "all" disables the filtering by type.
   - "high", "medium" and "low" reject the file unless the same word is
     found in the global mirror_type.
   - "no-xxx" rejects the file if xxx is found in the global mirror_type.
   - "none" rejects the file.

   Returns true if the file should be fetched, depending on its type and
   the current configuration. A file rejected by its type is also added
   to the global delete_list, except for the option "none". A negative
   size means the real size is not known: the file is fetched only if it
   doesn't exist locally.
   """
   # first check type mirror
   if "all" not in type:
      for option in type:
         if option in ("high", "medium", "low") and option not in mirror_type:
            delete_list.append(location)
            return 0
         elif "no-" == option[:3] and option[3:] in mirror_type:
            delete_list.append(location)
            return 0
         elif "none" == option: # this option doesn't force local deletion
            return 0

   try: # verify existence and file size
      local_size = os.path.getsize(location)
   except OSError:
      return 1

   if size != local_size:
      if size < 0:
         return 0
      return 1

   # verify hash. The file has to be read in binary mode, otherwise the
   # newline translation of some platforms would alter the digest.
   local_file = open(location, "rb")
   try:
      md5sum = get_md5sum(local_file.read())
   finally:
      local_file.close()

   if md5sum != hash:
      return 1

   return 0



def get_sourceforge_refresh_url(url, regexp_match):
   """Scans the text found at url for a download address.

   The text is read line by line looking for the first http address which
   ends with regexp_match (taken literally, ignoring case). Returns that
   address or raises IOError if no line contains one.
   """
   pattern = re.compile(r'http://(?P<url>[^">]*%s)' % re.escape(regexp_match),
      re.IGNORECASE)

   page = urllib.urlopen(url)
   found = None
   # readline returns an empty string once the text is exhausted.
   for text in iter(page.readline, ""):
      found = pattern.search(text)
      if found:
         break
   page.close()

   if not found:
      raise IOError("Couldn't parse %s" % url)
   return "http://%s" % found.group("url")



def get_sourceforge_mirror_url(url):
   """Follows the first mirror redirection found in the text at url.

   The text is read line by line looking for the first link whose address
   carries a '?use_mirror=' parameter. That link, resolved against url, is
   handed over to get_sourceforge_refresh_url and its result is returned.
   Raises IOError if no line contains such a link.
   """
   pattern = re.compile(r'<a href=["\']?(?P<url>[^?><]*)\?use_mirror=(?P<mirror>[^\'">]*)',
      re.IGNORECASE)

   page = urllib.urlopen(url)
   found = None
   # readline returns an empty string once the text is exhausted.
   for text in iter(page.readline, ""):
      found = pattern.search(text)
      if found:
         break
   page.close()

   if not found:
      raise IOError("Couldn't parse %s" % url)

   target = found.group('url')
   redirection = urlparse.urljoin(url,
      "%s?use_mirror=%s" % (target, found.group('mirror')))
   return get_sourceforge_refresh_url(redirection, target)



def get_url(url):
   """Opens url and returns its binary data.

   The whole content is read into memory and returned as a single string.
   The connection is closed even if the transfer fails halfway.
   """
   # The special case for sourceforge redirections is currently disabled:
#   if url[-9:] == "?download":
#      # special case to handle sourceforge redirection
#      url = get_sourceforge_mirror_url(url)

   remote = urllib.urlopen(url)
   try:
      return remote.read()
   finally:
      remote.close()



def update_timestamp(filename):
   """Overwrites filename with a text line holding the current local time.

   The file is created if it doesn't exist. The time is written in the
   format of time.asctime(), without trailing newline.
   """
   stamp_file = open(filename, "wt")
   # asctime() without parameters formats the current local time.
   stamp_file.write(time.asctime())
   stamp_file.close()



def fetch_file(url, size, hash, dest_file, faking, timestamp):
   """Retrieves an url, verifies size/hash and creates local file"""

   if faking:
      print "Would fetch", url
      return 1

   print "Mirroring %0.1f KiB %s -> %s" % (size / 1024.0, url, dest_file)
   try:
      data = get_url(url)
   except IOError, msg:
      print "Couldn't get %s: %s" % (url, msg)
      return 0

   if size > 0 and len(data) != size:
      print "Incorrect size (%d bytes) of fetched %s" % (len(data), url)
      return 0

   checksum = get_md5sum(data)
   if size > 0 and checksum != hash:
      print "Corrupt file fetched from %s (%s != %s)" % (url, checksum, hash)
      return 0

   # Ok, create the file
   try: os.makedirs(os.path.split(dest_file)[0], 0775)
   except OSError: pass

   file = open(dest_file, "wb")
   file.write(data)
   file.close()
   os.chmod(dest_file, 0664)

   # Pending correct setting of group/permission
#   os.chown(dest_file, "gregorio", "disk")
   if size < 0:
      print "File %s, size %d, hash %s" % (url, len(data), get_md5sum(data))
   update_timestamp(timestamp)
   return 1



def main(argv):
   """Entry point of the script."""
   (data_file, output_dir, local_conf, faking, timestamp,
      random_test) = process_arguments(argv)

   # Don't delete files in random test mode.
   if random_test:
      (nick, url, size, hash, location, type) = random.choice(
         load_mirror_data(data_file, output_dir))

      if "bogus" in "url":
         print "Bogus url!"
         return

      location = os.tmpnam()

      try:
         tries = 0
         while not fetch_file(url, size, hash, location, faking, timestamp):
            tries += 1
            if tries == 3:
               print "Giving up."
               sys.exit(1)
         print "Successfully downloaded"
      finally:
         try: os.unlink(location)
         except: pass
      return

   # Loop through mirror file information
   for nick, url, size, hash, location, type in load_mirror_data(data_file, output_dir):
      if not should_fetch_file(size, hash, location, type):
         continue

      tries = 0
      while not fetch_file(url, size, hash, location, faking, timestamp):
         tries += 1
         if tries == 3:
            print "Giving up."
            sys.exit(1)

   for file in filter(os.path.isfile, delete_list):
      if faking:
         print "Would delete", file
      else:
         os.unlink(file)



# Run the mirroring only when executed as a script, not when imported.
if __name__ == "__main__":
   main(sys.argv)