#!/usr/bin/env python
#
# Generate the ngram-based phonetic map used by the phonetic password
# generator, and run the built-in self tests.

import argparse
import hashlib
import json
import math
import pprint
import random
import sys

import roman

# Vowels used to pad each ngram mapping up to five letters.
PHO_GEN_VOWELS = "aeiou"

# Built-in test vectors: hashing the seed (first element) with SHA256 and
# feeding the digest through phogen() must reproduce the phonetic
# passphrase (second element).
PHO_GEN_TEST = [
    [b"passgeny", "herang xiasem zitend qibele"],
    [b"phonetic", "lineum foneum zybale mangur"],
    [b"generator", "latole elitab ackina exprou"],
    [b"password", "nulize nomere fonici crednt"],
    [b"duck", "catabb rompor cricin prunsi"]
]

PYTHON_HEADER = """#
# Phonetic Map -- autogenerated, do not edit.
#
"""

C_HEADER = """/*
 * Phonetic Map -- autogenerated, do not edit.
 *
 * The list contains exactly 27*27 entries (all combinations of 2 characters
 * from the array [" ", "a"..."z"]) and is sorted alphabetically. This makes
 * the ngram lookup time constant, but it also means that the map contains
 * holes. The holes have a value of `map` set to NULL.
 */

struct phogen_entry {
    char *ngram; /* Ngram */
    char *map;   /* Character map */
};

struct phogen_entry phogen_map[] = {
"""

C_FOOTER = \
"""};
"""

# NOTE(review): g_ngram_table appears to be unused anywhere in this file;
# kept to avoid breaking external users, if any.
g_ngram_table = None
g_args = None


class HashInt:
    """A big integer built from a hash digest, consumed piecewise.

    The digest bytes are interpreted as one large integer; mod32()
    repeatedly divides it, handing out remainders until the available
    entropy is used up.
    """

    def __init__(self):
        self.hash = None       # remaining integer value of the digest
        self.bits = 0          # total number of bits loaded
        self.bits_used = 0     # approximate number of bits consumed so far

    def from_bytes(self, buf):
        """Load the big-endian integer representation of `buf`."""
        self.hash = int.from_bytes(buf, byteorder='big')
        self.bits = len(buf) * 8

    # Divide the HashInt by `idiv` and return the modulo
    def mod32(self, idiv):
        """Divide the integer by `idiv` and return the remainder.

        Returns None once the accumulated divisions would consume more
        bits than the loaded digest provides.
        """
        nbits = math.log(idiv) / math.log(2)
        if nbits + self.bits_used > self.bits:
            return None
        ret = self.hash % idiv
        self.hash //= idiv
        # Bug fix: the original never accounted for the consumed bits, so
        # the exhaustion check above could never trigger.
        self.bits_used += nbits
        return ret


def elog(*args):
    """Log to stderr, but only when --verbose was given."""
    if g_args.verbose:
        print(*args, file=sys.stderr)


# Read the words file line by line (each line should represent a single word).
# Filter out words that contain special characters or are roman numbers.
def phogen_word_list():
    """Yield normalized words from the input word list (g_args.input)."""
    with open(g_args.input) as f:
        for text in f:
            # All lower-case, strip whitespaces and new lines
            text = text.lower().strip()
            # Skip non-alphanumeric words
            if not text.isalpha():
                continue
            # The dictionary contains a variety of roman numbers. Avoid
            # those since they can generate weird combinations (ii, xx, ..)
            # NOTE(review): assumes roman.roman_to_int() returns 0 for
            # non-roman input -- confirm against the `roman` module in use.
            if roman.roman_to_int(text) != 0:
                elog("Ignoring roman number:", text)
                continue
            yield text


# This function takes the word list and splits each word into ngrams. A ngram
# in the context of this function is just a combination of two letters.
#
# The next ngram is computed by removing the first letter from the current
# ngram and appending the next letter in the word.
#
# For example, the word "ananas" produces the following ngrams:
#
#   "  " (starting ngram), next letter is 'a'
#   " a", next letter is 'n'
#   "an", next letter is 'a'
#   "na", next letter is 'n'
#   "an", next letter is 'a'
#   "na", next letter is 's'
#   "as", end
#
# Each ngram is inserted into a dictionary. The dictionary data is another
# dictionary that contains the next letter as key and the number of
# occurrences detected of the next letter.
#
# Taking "ananas" as example again, we get the following output:
#
# {
#     '  ': {'a': 1},
#     ' a': {'n': 1},
#     'an': {'a': 2},
#     'na': {'n': 1, 's': 1}
# }
#
# Note that the final ngram of a word ("as" above) is never inserted, since
# it has no next letter.
def phogen_freq():
    """Build the ngram -> {next_letter: frequency} table from the word list."""
    ngram_freq_list = {}
    # ngram_freq_list[ngram][next_letter] = frequency
    for word in phogen_word_list():
        # Start with an empty ngram (two spaces)
        ngram = "  "
        # Read the word character by character and generate new ngrams
        for letter in word:
            if ngram not in ngram_freq_list:
                ngram_freq_list[ngram] = {}
            # Take the current ngram and bump the next-character frequency
            freq = ngram_freq_list[ngram].get(letter, 0)
            ngram_freq_list[ngram][letter] = freq + 1
            # Compute the next ngram from the next character
            ngram = ngram[1:] + letter
    return ngram_freq_list


# Take the output of `phogen_freq` and condense it down:
# - For each ngram, take the letter:freq list, sort it by most common letter
#   and use the 5 topmost letters. The only exception is the initial ngram
#   "  ", which keeps all letter mappings instead of 5.
def phogen_map(ngram_list):
    """Condense the frequency table into the phonetic map.

    For each ngram keep (at most) the 5 most frequent next letters; the
    initial ngram "  " keeps every valid next letter. Letters whose
    resulting ngram does not exist in `ngram_list` are dropped, and short
    lists are padded with vowels up to 5 entries.
    """
    ngram_normal = {}
    for (ngram, data) in ngram_list.items():
        ndata = []
        # Scan the letters sorted by frequency, then by character value, so
        # letters with the same frequency have a predictable output. A
        # letter is only usable if the ngram it produces (drop the first
        # character, append the letter) exists in ngram_list; otherwise the
        # generator would walk into an unmapped ngram.
        for (letter, freq) in sorted(data.items(),
                                     key=lambda x: x[1] * 256 + ord(x[0]),
                                     reverse=True):
            new_ngram = ngram[1:] + letter
            if new_ngram not in ngram_list:
                elog("Ignoring invalid ngram mapping {} + {} -> {}".format(
                    ngram, letter, new_ngram))
                continue
            ndata.append(letter)
            # The initial ngram keeps all of its mappings; everything else
            # is capped at 5 letters.
            if ngram != "  " and len(ndata) >= len(PHO_GEN_VOWELS):
                break
        # If the list contains less than 5 letters, fill it with vowels.
        # NOTE(review): the padded vowels are not validated against
        # ngram_list here; phogen_verify() catches dangling mappings later.
        for v in PHO_GEN_VOWELS:
            if len(ndata) >= len(PHO_GEN_VOWELS):
                break
            if v not in ndata:
                ndata.append(v)
        ngram_normal[ngram] = ndata
    return ngram_normal


def phogen_verify(ngram_list):
    """Check that every mapping in `ngram_list` resolves to an existing ngram.

    Returns True when the map is closed (no dangling mappings), False
    otherwise.
    """
    for (ngram, letters) in ngram_list.items():
        for l in letters:
            tngram = ngram[1:] + l
            if tngram not in ngram_list:
                # Bug fix: the original referenced an undefined variable
                # (`nngram`) and never applied .format() to the message.
                print("Incomplete map, dangling mapping: {} + {} -> !{}"
                      .format(ngram, l, tngram))
                return False
    return True


def phogen_init():
    """Build and verify the global phonetic map. Returns True on success."""
    global g_phonetic_map
    elog("Generating frequency table ...")
    pho_freq = phogen_freq()
    elog("Generating phonetic map ...")
    pho_map = phogen_map(pho_freq)
    elog("Verifying mappings ...")
    if not phogen_verify(pho_map):
        return False
    g_phonetic_map = pho_map
    return True


def phogen_test():
    """Run the built-in self test against the hard-coded PHO_GEN_TEST vectors."""
    for w in PHO_GEN_TEST:
        # Generate a big integer from the SHA256 digest of the seed
        h = HashInt()
        h.from_bytes(hashlib.sha256(w[0]).digest())
        # Four 6-letter phonetic words per passphrase
        phopass = [phogen(h, 6) for _ in range(4)]
        wp = " ".join(phopass)
        if wp != w[1]:
            print("Internal test failed: '{}' != '{}'".format(wp, w[1]))
            return False
    return True


# Output a phonetic representation of `length`. The data is taken from the
# Hash in `h`
def phogen(h, length):
    """Return a phonetic word of `length` letters drawn from HashInt `h`."""
    word = ""
    ngram = "  "
    for _ in range(length):
        choices = g_phonetic_map[ngram]
        # Use the hash entropy to pick one of the mapped letters
        letter = choices[h.mod32(len(choices))]
        word += letter
        ngram = ngram[1:] + letter
    return word


def pho_parse_args():
    """Parse the command line into the global g_args."""
    global g_args
    parser = argparse.ArgumentParser(
        description="Generate mappings for the phonetic generator. Without arguments, just run the built-in tests.")
    parser.add_argument("-i", "--input", action="store",
                        help="Input file (word list)", default=None,
                        required=True)
    parser.add_argument("-j", "--json", action="store",
                        help="JSON output file", default=None)
    parser.add_argument("-p", "--python", action="store",
                        help="Python output file", default=None)
    parser.add_argument("-c", "--clang", action="store",
                        help="C output file", default=None)
    parser.add_argument("-t", "--test", action="store_true",
                        help="Output random passwords", default=False)
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Verbose", default=False)
    g_args = parser.parse_args()


def phogen_clang_dump(f):
    """Write the phonetic map as a C array (see C_HEADER) to file `f`.

    All 27*27 two-character ngrams are emitted in sorted order; ngrams
    without a mapping get a NULL map so C lookups can index directly.
    """
    print(C_HEADER, file=f, end="")
    # Generate the list of (" ", "a" ... "z")
    charmap = (" ", *(chr(x) for x in range(ord('a'), ord('z') + 1)))
    first = True
    for i in charmap:
        for j in charmap:
            # Close the previous entry with a comma before opening a new one
            if not first:
                print('    },', file=f)
            first = False
            ngram = i + j
            if ngram in g_phonetic_map:
                nmap = '"{}"'.format("".join(g_phonetic_map[ngram]))
            else:
                nmap = "NULL"
            print('    {', file=f)
            print('        .ngram = "{}",'.format(ngram), file=f)
            print('        .map = {}'.format(nmap), file=f)
    # Close the last entry without a trailing comma
    print('    }', file=f)
    print(C_FOOTER, file=f, end="")


def main():
    """Entry point: build the map, self test, then write requested outputs."""
    pho_parse_args()
    if not phogen_init():
        # Bug fix: the original wrote to `std.err` (a NameError).
        print("Error generating phonetic map", file=sys.stderr)
        sys.exit(1)
    if not phogen_test():
        sys.exit(1)
    if g_args.python:
        with open(g_args.python, "w") as f:
            print(PYTHON_HEADER, file=f, end="")
            print("g_phonetic_map = \\", file=f)
            pprint.pprint(g_phonetic_map, f, width=256, compact=False)
    if g_args.json:
        with open(g_args.json, "w") as f:
            json.dump(g_phonetic_map, f, indent=4, sort_keys=True)
            print(file=f)
    if g_args.clang:
        with open(g_args.clang, "w") as f:
            phogen_clang_dump(f)
    if g_args.test:
        # Emit 5 sample passphrases built from random entropy
        for _ in range(5):
            h = HashInt()
            h.from_bytes(random.randbytes(16))
            print(" ".join(phogen(h, 6) for _ in range(4)))


if __name__ == "__main__":
    main()