Revision 7614 (by gradha, 2006/12/02 18:13:42) Whitespace cleanup.
#!/usr/bin/env python
# -*- mode:Python; tab-width: 4 -*-
"""
Analyses source code from the src directory and translations under
data. It then reports stale English entries (unused data) and
missing translations (untranslated but used data). It can also
report obsolete entries (translated but unused data).

Everything is done statically and output is generated on stdout,
so no locking is required. Run without arguments to get help.
"""


import getopt
import glob
import os
import re
import sys


# Matches one `<include-data...=filename>' tag inside a line of text.
INCLUDE_DATA_RE = re.compile(
    # Anything between the tag name and the `=' sign.
    r"<include-data(?P<cond>[^=]*)"
    # Shortest run of non-whitespace characters up to the closing `>'.
    r"=(?P<filename>\S+?)>"
    # Remainder of the line, so callers can search it for further tags.
    r"(?P<post>.*)")


def show_program_usage(exit_code=0):
    """f([exit_code]) -> no return.

    Prints the command line help to stdout and then terminates the
    process through sys.exit() using exit_code (zero by default).
    """
    program = sys.argv[0]
    usage_text = """Usage: %s -d checkout_dir

-h, --help          Shows this help message.
-v, --version       Displays version and exits.
-d x, --dir=x       Where `data' and `src' are located.
-m, --master        Shows stale entries in the master language.
-o, --obsolete      Shows also obsolete entries.
-e, --exclude=x     List of colon separated languages to exclude.
-i, --include=x     On top of not being excluded, a language must be included.

Example:
  %s -d . -m -e ko:pl
""" % (program, program)

    # Single argument in parentheses: same output whether print is a
    # statement or a function.
    print(usage_text)
    sys.exit(exit_code)



def process_arguments(arguments):
    """f([list of arguments]) -> (options)

    Processes the list of command line arguments and returns the
    active options as the tuple (directory, show_stale_master,
    show_obsolete, exclude, include). The two show_* flags are 0 or
    1, while exclude and include are dictionaries whose keys are the
    language codes given on the command line (the value is always 1).

    If the arguments can't be parsed, or no directory was specified,
    an error message is printed and show_program_usage(1) is called.
    The help and version switches terminate the program as soon as
    they are found in the argument list.
    """
    try:
        # Positional arguments are accepted by getopt but never used.
        options, unused_files = getopt.getopt(arguments,
            "hvd:moe:i:", ["help", "version", "dir=", "master",
                "obsolete", "exclude=", "include="])
    except getopt.error as msg:
        print("Error parsing arguments: %s" % msg)
        show_program_usage(1)

    directory = None
    show_stale_master = 0
    show_obsolete = 0
    exclude = []
    include = []

    for op, value in options:
        if op in ("--dir", "-d"):
            directory = value
        elif op in ("--version", "-v"):
            # No `version' global is defined in the visible part of
            # this file, so look it up defensively and fall back to a
            # placeholder instead of dying with a NameError.
            print(globals().get("version", "unknown version"))
            sys.exit(0)
        elif op in ("--help", "-h"):
            show_program_usage(0)
        elif op in ("--master", "-m"):
            show_stale_master = 1
        elif op in ("--obsolete", "-o"):
            show_obsolete = 1
        elif op in ("--exclude", "-e"):
            exclude = value.split(":")
        elif op in ("--include", "-i"):
            include = value.split(":")

    if not directory:
        print("You must specify the root directory.")
        show_program_usage(1)

    # Transform exclusion/inclusion list into dictionary for quick
    # membership tests.
    exclude = dict.fromkeys(exclude, 1)
    include = dict.fromkeys(include, 1)

    return directory, show_stale_master, show_obsolete, exclude, include


def read_file_data_entries(filename):
    """f(filename) -> {data_entries}

    Returns all the include-data entries from filename. The keys
    are the tag filenames after os.path.normpath(). The value is
    always zero, a dictionary is used to avoid key repetition.
    """
    entries = {}
    source = open(filename, "rt")
    try:
        for line in source:
            # A line may hold several tags. The regular expression
            # only reports the first one, so keep searching the text
            # that follows each match.
            match = INCLUDE_DATA_RE.search(line)
            while match:
                entries[os.path.normpath(match.group("filename"))] = 0
                match = INCLUDE_DATA_RE.search(match.group("post"))
    finally:
        # Release the handle even if reading fails half way.
        source.close()

    return entries


def read_src_dir(base_dir):
    """f(base_dir) -> {used_data_entries}

    Scans every regular file found directly inside base_dir/src
    and merges the include-data entries of all of them into a
    single dictionary. Values are always zero, the dictionary is
    only used to avoid key repetition.
    """
    used_entries = {}
    candidates = glob.glob(os.path.join(base_dir, "src", "*"))
    for path in candidates:
        # Subdirectories and other non regular files are ignored.
        if os.path.isfile(path):
            used_entries.update(read_file_data_entries(path))

    return used_entries


def read_data_file(data_dir, filename):
    """f(data_dir, filename) -> (langcode, [list_of_entries])

    Reads filename and extracts from it the language code, which
    is the last two characters of the name. Also returned, a list
    of data entries from the file: one `pattern:name' string for
    each line starting with `#-', where pattern is filename without
    the leading data_dir and without the three character language
    suffix, and name is the rest of the line with trailing
    whitespace removed.
    """
    entries = []
    langcode = filename[-2:]
    # Skip data_dir plus its path separator, drop the `.xx' suffix.
    pattern = filename[len(data_dir) + 1:-3]

    source = open(filename, "rt")
    try:
        for line in source:
            if line.startswith("#-"):
                entries.append("%s:%s" % (pattern, line[2:].rstrip()))
    finally:
        # Release the handle even if reading fails half way.
        source.close()

    return langcode, entries


def read_data_dir(base_dir):
    """f(base_dir) -> {data}

    Looks at the regular files matching base_dir/data/*/*.?? and
    returns a dictionary indexed by language code. Each value is
    the list of data entries read from all the files of that
    language (same path format as used by <include> tags).
    """
    data_root = os.path.join(base_dir, "data")
    per_language = {}
    for path in glob.glob(os.path.join(data_root, "*", "*.??")):
        if not os.path.isfile(path):
            continue
        code, found = read_data_file(data_root, path)
        # Start an empty list the first time a language shows up.
        per_language.setdefault(code, []).extend(found)

    return per_language


def main():
    """f() -> int

    Main entry point. Returns zero on success.

    Parses sys.argv, reads the entries used by the sources and the
    entries available for each language, and prints the report to
    stdout: optionally the entries used by the sources which are
    not found in the English data, then for every selected language
    the missing entries and, if requested, the obsolete ones.
    Languages are reported in alphabetical order.
    """
    directory, show_stale_master, show_obsolete, exclude, include = (
        process_arguments(sys.argv[1:]))

    # Dict for quick searching, list for sorted output.
    master_dict = read_src_dir(directory)
    master_list = sorted(master_dict)

    data = read_data_dir(directory)

    # Add stale entries to the English master for completeness.
    if show_stale_master:
        # The lookup is only built when requested, and through get()
        # so that a checkout without English files doesn't abort the
        # whole run with a KeyError.
        en_entries = set(data.get("en", []))
        for element in master_list:
            if element not in en_entries:
                print("Stale English %s" % element)

    # Finally, process all the languages, sorted so that the output
    # is the same from one run to the next.
    for lang in sorted(data):
        list_data = data[lang]

        # Skip uninteresting languages.
        if lang in exclude:
            continue

        if include and lang not in include:
            continue

        if lang != "en":
            # Build reverse lookup of what the language provides.
            available = set(list_data)

            # Create list of elements which are missing.
            missing = ["%s:%s" % (lang, element)
                for element in master_list if element not in available]

            # Show results.
            if missing:
                # master_list can't be empty here, missing comes from it.
                missing_percent = len(missing) / float(len(master_list)) * 100
                print("%d blocks missing out of %d (%.02f%%) for `%s':" % (
                    len(missing), len(master_list), missing_percent, lang))
                for element in missing:
                    print("    %s" % element)
                print("")
            else:
                print("No missing entries for `%s'.\n" % lang)

        # Show obsolete entries.
        if show_obsolete:
            # sorted() leaves the list stored in data untouched.
            obsolete = ["%s:%s" % (lang, element)
                for element in sorted(list_data)
                if element not in master_dict]
            if obsolete:
                print("%d entries obsolete in `%s':" % (len(obsolete), lang))
                for element in obsolete:
                    print("    %s" % element)
                print("")
            else:
                print("No obsolete entries in `%s'.\n" % lang)

    return 0


if __name__ == "__main__":
    # Hand the value returned by main() to the shell as exit status.
    sys.exit(main())