#!/usr/bin/python
"""Generate plausible English words from bigram frequencies.
"""

import bisect
import random
import sys


def main(filename):
    for letter in generate(compute_letter_distributions(wordlist(filename))):
        sys.stdout.write(letter)

def generate(cdfs):
    last_letter = ' '
    while True:
        last_letter = cdfs[last_letter].choose(random.random())
        yield last_letter

def wordlist(filename):
    "Parse a file containing an integer frequency and a word on each line."
    for line in open(filename):
        freq, word = line.split()
        yield int(freq), word

def compute_letter_distributions(wordlist):
    "Returns a dictionary of Cdf objects."
    freqs = {}

    for freq, word in wordlist:
        assert len(word) > 0
        word = ' ' + word + ' '
        for ii in range(len(word)-1):
            last_letter, next_letter = word[ii:ii+2]
            if last_letter not in freqs:
                freqs[last_letter] = {}
            if next_letter not in freqs[last_letter]:
                freqs[last_letter][next_letter] = 0

            freqs[last_letter][next_letter] += freq

    return dict((last_letter, Cdf(freqs[last_letter]))
                for last_letter in freqs.keys())

class Cdf:
    "Generate a discrete distribution with the given relative frequencies."
    def __init__(self, freqs):
        "freqs is a mapping from items to their relative frequencies."
        self.cdf = []
        self.keys = []

        total = 0
        for key in sorted(freqs.keys()):
            assert freqs[key] > 0

            total += freqs[key]

            self.cdf.append(total)
            self.keys.append(key)

    def choose(self, variate):
        "variate is a uniformly distributed random variable on [0, 1]."
        assert 0 <= variate <= 1
        
        scaled = round(variate * self.cdf[-1])
        index = bisect.bisect_left(self.cdf, scaled)

        assert 0 <= index < len(self.cdf)
        assert self.cdf[index] >= scaled
        assert index == 0 or self.cdf[index-1] < scaled

        return self.keys[index]

def _test_cdf():
    xx = Cdf({'a': 2, 'b': 2, 'c': 6})
    ok(xx.choose(0), 'a')
    ok(xx.choose(0.1), 'a')
    ok(xx.choose(0.3), 'b')
    ok(xx.choose(0.5), 'c')
    ok(xx.choose(0.9), 'c')
    ok(xx.choose(1), 'c')

def ok(a, b): assert a == b, (a, b)

_test_cdf()
if __name__ == '__main__':
    main(sys.argv[1])

