#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Filter a Git repository with a file whitelist.

This program is intended to be invoked by `git filter-branch
--prune-empty --tree-filter`, which runs a command in each revision
that it is mangling.  It deletes any files in the directory where it
is run, except the ones listed in the whitelist file provided to the
--delete-all-files-not-listed-in option.  So, for example, if that
file is empty, this command is roughly equivalent to
`rm -r * .[^.]*`, except that the .git directory is always kept.
This is dangerous behavior, so the option is mandatory; hopefully the
option name is sufficiently scary as to prevent accidents.

This is useful when you are extracting some subset of files out of an
existing repository to make a new project, but you want to retain the
history of those files, rather like the `git filter-branch
--subdirectory-filter` command.  Make sure you include any old names
of the files in the whitelist; for example, if a file is currently
called masamune.py but previously was called util.py, list both
masamune.py and util.py in the whitelist.

The file list contains one literal filename per line, as a canonical
relative pathname from the current directory; leading and trailing
whitespace are removed.  For example, a line that says “mudhoney.py”
will prevent ./mudhoney.py from being deleted, but not
netfind/mudhoney.py.  If you want to prevent netfind/mudhoney.py from
being deleted, you need a line that says either "netfind" or
“netfind/mudhoney.py”; you cannot do it with a line that says
“./netfind/mudhoney.py”, “netfind/docs/../mudhoney.py”, or “netfind/”,
all of which will produce errors.  Because leading and trailing
whitespace is removed, you cannot preserve files whose names contain
leading and/or trailing whitespace, and you should be ashamed of
wanting to.  Lines beginning with “#” are not treated as comments, but
if you don't have a file called “# This file got deleted in 2009”,
then a line saying “# This file got deleted in 2009” will have no
effect.  Just don't put “/../” in your “comments”.

"""
import argparse
import os


def main():
    """Parse the command line and delete everything not whitelisted.

    Walks the current directory.  Every non-directory entry (and every
    symlink that points at a directory) whose relative pathname is not
    in the whitelist, and which does not live under a whitelisted
    directory, is printed as ``rm <pathname>`` unless --quiet is given,
    and is unlinked only when --really is given.  The ``.git``
    directory is always treated as whitelisted.

    Raises:
        ShittyPathname: if a whitelist line starts with "./", contains
            "/../", or ends with "/".
    """
    formatter = argparse.RawDescriptionHelpFormatter
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=formatter)
    parser.add_argument('--delete-all-files-not-listed-in',
                        type=open, required=True,
                        help='name of a file listing the files to not delete, one per line')
    parser.add_argument('--really', action='store_true',
                        help='actually delete the files instead of just listing the ones to delete')
    parser.add_argument('--quiet', action='store_true',
                        help='don’t list the files being deleted')
    args = parser.parse_args()

    assert os.path.sep == '/', "Not prepared to deal with non-Unix firesystems"

    # argparse opened the whitelist file for us (type=open); make sure
    # it gets closed once it has been read.
    with args.delete_all_files_not_listed_in as listing:
        whitelist = _read_whitelist(listing)

    for pathname in _doomed_paths(whitelist):
        if not args.quiet:
            print("rm", pathname)
        if args.really:
            os.unlink(pathname)


def _read_whitelist(lines):
    """Return the set of pathnames to keep, built from an iterable of lines."""
    whitelist = {'.git'}
    for line in lines:
        line = line.strip()
        if not line:
            # A blank line must not whitelist '' -- the empty path is a
            # prefix of every directory, so it would protect everything.
            continue
        if line.startswith("./") or "/../" in line or line.endswith('/'):
            raise ShittyPathname(line)
        whitelist.add(line)
    return whitelist


def _doomed_paths(whitelist):
    """Yield the relative pathname of each entry under '.' to be deleted."""
    for dirpath, dirnames, filenames in os.walk('.'):
        if dirpath == '.':
            prefix = ''
        elif dirpath.startswith('./'):
            prefix = dirpath[2:]
        else:
            assert False, dirpath

        candidates = list(filenames)
        descend = []
        for name in dirnames:
            pathname = os.path.join(prefix, name)
            if pathname in whitelist:
                # A whitelisted directory protects its whole subtree,
                # so there is no point in walking into it.
                continue
            if os.path.islink(pathname):
                # os.walk reports symlinks to directories as
                # directories but never descends into them; they are
                # plain unlinkable entries, so treat them like files.
                candidates.append(name)
            else:
                descend.append(name)
        # Pruning in place tells os.walk (top-down) what to visit next.
        dirnames[:] = descend

        for name in candidates:
            pathname = os.path.join(prefix, name)
            if pathname not in whitelist:
                yield pathname


class ShittyPathname(Exception):
    """Raised for a whitelist line that is not a canonical relative path.

    That is, a line that starts with "./", contains "/../", or ends
    with "/".  The offending line is passed as the exception argument.
    """


# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
