#!/usr/bin/env python
# -*- mode:Python; tab-width: 3 -*-
"""mirror.py, a small tool to mirror files.
This program was created to fulfill the following tasks for the Allegro web
page:
- Distribution of binary files over several mirrors
- Different mirrors should be able to mirror different sets of files
- All mirroring should be automatic
This script will open a configuration file which contains the data about
the files to mirror. Then, according to this information and to the files
which are found locally, it should know what to download/ignore/update.
This script was written by Grzegorz Adam Hankiewicz and is Giftware: you
are free to do what you want with it without any restriction. I do not
accept responsibility for any effects, adverse or otherwise, that this
script may have on you, your computer, your sanity, your dog, and
anything else that you can think of. Use it at your own risk.
The file doesn't use tabs; each indentation level is three space characters.
"""
import getopt
import md5
import os
import random
import re
import sys
import time
import urllib
import urlparse
# Option tables handed to getopt.getopt() by process_arguments. A trailing
# ":" (short form) or "=" (long form) marks an option which takes a value.
short_arguments = "hvc:o:l:ft:d:r"
long_arguments = [
   "help", "version", "configuration=", "output-dir=",
   # Needs the "=" just like its short form "l:", otherwise getopt rejects
   # the documented "--local-conf=x" spelling.
   "local-conf=",
   "fake", "timestamp=", "data=", "random-test"]
# Text printed by the -v/--version switch.
version = "Mirror.py 0.2"
# Tags found in the mirror_type option of the loaded configuration files.
mirror_type = []
# Local paths which main() removes once the mirroring loop has finished.
delete_list = []
def load_configuration(filename):
   """Reads the .ini file `filename' and collects its mirror type tags.

   The whitespace separated words of the raw `mirror_type' option found in
   the `global' section are appended to the module level mirror_type list.
   When the section or the option is missing nothing is added.
   """
   import ConfigParser
   parser = ConfigParser.ConfigParser()
   parser.read(filename)
   try:
      # Third argument enables raw mode: the value is taken verbatim.
      raw_tags = parser.get("global", "mirror_type", 1)
   except (ConfigParser.NoOptionError, ConfigParser.NoSectionError):
      return
   mirror_type.extend(raw_tags.split())
def show_program_usage(argv_zero = "", exit_code = 0):
   """Prints the description of the commandline switches and exits.

   argv_zero is meant to be the first entry in sys.argv; only the last
   component of that path is shown as the name of the script. exit_code is
   passed to sys.exit() and therefore is the code returned to the OS.
   """
   script_name = os.path.basename(argv_zero)
   help_lines = (
      "Usage: %s -d data -o output-dir [-hvl file -c configuration]\n" % script_name,
      "-h, --help Shows this help message",
      "-v, --version Displays version and exits",
      "-c x, --configuration=x Use xxx as configuration file (.ini)",
      "-d x, --data=x Data about files to be mirrored",
      "-o x, --output-dir=x Where binary files will be placed",
      "-l x, --local-conf=x File containing behaviour preferences",
      "-f, --fake Fake process, do not remove/create files",
      "-t x, --timestamp=x Create/modify x if something changed",
      "-r, --random-test Downloads a random file to test script",
      "",   # trailing empty line
      )
   for text in help_lines:
      print(text)
   sys.exit(exit_code)
def load_mirror_data(data_file, base_dir = ""):
   """Runs the data file and returns the list of files to mirror.

   data_file is executed as Python code with two lists predefined, which
   it is expected to fill in or replace:
   - mirror_files: entries indexable from 0 to 5. main() unpacks them as
     (nick, url, size, hash, location, type).
   - delete: local paths which should be removed.

   If base_dir is not empty, it is prefixed to the location (index 4) of
   every entry, which are then returned as tuples of six elements, and to
   every path in delete. The delete paths are appended to the global
   delete_list.

   Returns the (possibly prefixed) mirror_files list.
   """
   # Explicit namespace: execfile() cannot reliably modify the locals of a
   # function, so a data file doing `mirror_files = [...]' instead of
   # mutating the list would otherwise be ignored without notice. The
   # function arguments stay visible as they were with the implicit scope.
   namespace = {"mirror_files": [], "delete": [],
      "data_file": data_file, "base_dir": base_dir}
   # WARNING: this runs arbitrary code with the privileges of the script,
   # never point it at a data file coming from an untrusted source.
   execfile(data_file, globals(), namespace)
   mirror_files = namespace["mirror_files"]
   delete = namespace["delete"]
   if base_dir:
      mirror_files = [
         tuple(entry[:4]) + (os.path.join(base_dir, entry[4]), entry[5])
         for entry in mirror_files]
      delete = [os.path.join(base_dir, path) for path in delete]
   delete_list.extend(delete)
   return mirror_files
def process_arguments(arguments):
"""Processes the arguments used to invoke the program.
Returns the tuple data_file, output_directory, local_configuration,
faking, timestamp, random_test.
configuration_file is the file which contains all the information about
the mirror files and their parameters.
output_directory is where the files should be placed.
local_configuration can be empty, otherwise points to a file with custom
parameters which are used to control the mirroring.
faking is a boolean indicating that no modifications should take place.
timestamp can be empty, otherwise is the filename to modify when
there are changes to the mirrored files.
random_test is a boolean. If true, the script should ignore
creating local files and try to download a file at random,
ignoring server configuration. Used for unit testing.
"""
try:
optlist, args = getopt.getopt(arguments[1:], short_arguments, long_arguments)
except getopt.error, msg:
print "Error parsing arguments:\n", msg, "\n"
show_program_usage(arguments[0], 1)
data_file = output_dir = local_conf = faking = timestamp = random_test = None
for option, value in optlist:
if option in ("--help", "-h"):
show_program_usage(arguments[0], 0)
elif option in ("--version", "-v"):
print version
sys.exit(0)
elif option in ("--data", "-d"):
data_file = value
elif option in ("--configuration", "-c"):
load_configuration(value)
elif option in ("--local-conf", "-l"):
local_conf = value
elif option in ("--output-dir", "-o"):
output_dir = value
elif option in ("--fake", "-f"):
faking = 1
elif option in ("--timestamp", "-t"):
timestamp = value
elif option in ("--random-test", "-r"):
random_test = 1
else:
print "Option '%s' not recognized\n" % option
show_program_usage(arguments[0], 1)
if not data_file:
print "A data file with mirror information is required"
show_program_usage(arguments[0], 1)
if not output_dir:
print "You have to tell me where to put the binary files"
show_program_usage(arguments[0], 1)
return data_file, output_dir, local_conf, faking, timestamp, random_test
def get_md5sum(data):
   """Returns the md5sum hex string according to the data provided.

   data is the byte string to hash. The result is a string of 32
   hexadecimal digits.
   """
   try:
      # hashlib replaces the md5 module, deprecated since Python 2.5.
      import hashlib
   except ImportError:
      # Older interpreter, only the legacy module is available.
      return md5.new(data).hexdigest()
   return hashlib.md5(data).hexdigest()
def should_fetch_file(size, hash, location, type):
   """Process a mirror file entry.

   size is the expected size of the file in bytes, hash its expected md5
   hex string, location the path of the local copy and type a sequence of
   tags which is compared against the global mirror_type list.

   Returns 1 if the file should be fetched and 0 if it should not:
   - Unless type contains "all", the entry is rejected when it carries a
     "high", "medium" or "low" tag missing from mirror_type, or a "no-xxx"
     tag whose xxx is present in mirror_type. In both cases location is
     also added to the global delete_list. A "none" tag rejects the entry
     without scheduling any deletion.
   - Otherwise the file is fetched when the local copy is missing, has a
     different size or a different md5. If size is negative, any existing
     local copy is kept without further checks.
   """
   # first check type mirror
   if "all" not in type:
      for option in type:
         if option in ("high", "medium", "low") and option not in mirror_type:
            delete_list.append(location)
            return 0
         elif "no-" == option[:3] and option[3:] in mirror_type:
            delete_list.append(location)
            return 0
         elif "none" == option: # this option doesn't force local deletion
            return 0
   # verify existence and file size
   try:
      local_size = os.path.getsize(location)
   except OSError:
      return 1
   if size != local_size:
      if size < 0:
         return 0
      return 1
   # verify hash. The file has to be read in binary mode, just like
   # fetch_file wrote it, or platforms which translate line endings would
   # hash different data.
   local_file = open(location, "rb")
   try:
      md5sum = get_md5sum(local_file.read())
   finally:
      local_file.close()
   if md5sum != hash:
      return 1
   return 0
def get_sourceforge_refresh_url(url, regexp_match):
   """Reads text data from url and returns the first download url found.

   The text is scanned line by line for the first http:// link which ends
   with regexp_match (taken literally, compared ignoring case), and that
   link is returned as a string starting with "http://". It is presumably
   meant to pick the target of the meta refresh tag of a SourceForge
   download page.

   Raises IOError if no line contains such a link.
   """
   exp = re.compile(r'http://(?P<url>[^">]*%s)' % re.escape(regexp_match),
      re.IGNORECASE)
   page = urllib.urlopen(url)
   try:   # the connection gets closed even if reading fails midway
      line = page.readline()
      while line:
         res = exp.search(line)
         if res:
            return "http://%s" % res.group("url")
         line = page.readline()
   finally:
      page.close()
   raise IOError("Couldn't parse %s" % url)
def get_sourceforge_mirror_url(url):
   """Reads text data from url, searching the first redirection which is in
   the form of a link with some '?use_mirror' text. If found, continues
   through that with get_sourceforge_refresh_url, otherwise raises IOError.

   The link is resolved relative to url before being followed, and the
   part of the link before '?use_mirror' is what the final download url
   has to end with.
   """
   exp = re.compile(r'<a href=["\']?(?P<url>[^?><]*)\?use_mirror=(?P<mirror>[^\'">]*)',
      re.IGNORECASE)
   res = None
   page = urllib.urlopen(url)
   try:   # the connection gets closed even if reading fails midway
      line = page.readline()
      while line:
         res = exp.search(line)
         if res:
            break
         line = page.readline()
   finally:
      page.close()
   if not res:
      raise IOError("Couldn't parse %s" % url)
   # The first page is already closed at this point, only one connection
   # is open at any time.
   mirror_page = urlparse.urljoin(url,
      "%s?use_mirror=%s" % (res.group('url'), res.group('mirror')))
   return get_sourceforge_refresh_url(mirror_page, res.group('url'))
def get_url(url):
   """Opens url and returns its binary data.

   Whatever urllib raises while opening or reading propagates to the
   caller; the connection is closed in either case.
   """
   # Disabled special case, kept for reference:
   # if url[-9:] == "?download":
   #    # special case to handle sourceforge redirection
   #    url = get_sourceforge_mirror_url(url)
   remote = urllib.urlopen(url)
   try:
      return remote.read()
   finally:
      remote.close()
def update_timestamp(filename):
   """Stores the current local date in filename.

   The file is created if needed and any previous content is replaced by
   the time.asctime() text, without a trailing newline.
   """
   now = time.asctime(time.localtime())
   stamp = open(filename, "wt")
   stamp.write("%s" % now)
   stamp.close()
def fetch_file(url, size, hash, dest_file, faking, timestamp):
"""Retrieves an url, verifies size/hash and creates local file"""
if faking:
print "Would fetch", url
return 1
print "Mirroring %0.1f KiB %s -> %s" % (size / 1024.0, url, dest_file)
try:
data = get_url(url)
except IOError, msg:
print "Couldn't get %s: %s" % (url, msg)
return 0
if size > 0 and len(data) != size:
print "Incorrect size (%d bytes) of fetched %s" % (len(data), url)
return 0
checksum = get_md5sum(data)
if size > 0 and checksum != hash:
print "Corrupt file fetched from %s (%s != %s)" % (url, checksum, hash)
return 0
# Ok, create the file
try: os.makedirs(os.path.split(dest_file)[0], 0775)
except OSError: pass
file = open(dest_file, "wb")
file.write(data)
file.close()
os.chmod(dest_file, 0664)
# Pending correct setting of group/permission
# os.chown(dest_file, "gregorio", "disk")
if size < 0:
print "File %s, size %d, hash %s" % (url, len(data), get_md5sum(data))
update_timestamp(timestamp)
return 1
def main(argv):
"""Entry point of the script."""
(data_file, output_dir, local_conf, faking, timestamp,
random_test) = process_arguments(argv)
# Don't delete files in random test mode.
if random_test:
(nick, url, size, hash, location, type) = random.choice(
load_mirror_data(data_file, output_dir))
if "bogus" in "url":
print "Bogus url!"
return
location = os.tmpnam()
try:
tries = 0
while not fetch_file(url, size, hash, location, faking, timestamp):
tries += 1
if tries == 3:
print "Giving up."
sys.exit(1)
print "Successfully downloaded"
finally:
try: os.unlink(location)
except: pass
return
# Loop through mirror file information
for nick, url, size, hash, location, type in load_mirror_data(data_file, output_dir):
if not should_fetch_file(size, hash, location, type):
continue
tries = 0
while not fetch_file(url, size, hash, location, faking, timestamp):
tries += 1
if tries == 3:
print "Giving up."
sys.exit(1)
for file in filter(os.path.isfile, delete_list):
if faking:
print "Would delete", file
else:
os.unlink(file)
# Run the mirroring only when executed as a script, not when imported.
if __name__ == "__main__":
   main(sys.argv)