python: Add the Python implementation of the phonetic generator
Add the Python implementation of the phonetic table generator.
This commit is contained in:
327
python/phogen_map/phogen_map.py
Executable file
327
python/phogen_map/phogen_map.py
Executable file
@ -0,0 +1,327 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import pprint
|
||||||
|
import random
|
||||||
|
import roman
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
# Vowels used to pad a ngram's letter list up to its fixed size; the length
# of this string (5) also acts as the per-ngram mapping size limit.
PHO_GEN_VOWELS = "aeiou"

# Built-in self-test vectors: expanding SHA256(seed) through the phonetic
# map must yield the expected password string.
# NOTE(review): these expected strings depend on the word list the map is
# built from -- confirm they match the list shipped with the project.
PHO_GEN_TEST = [
    [ b"passgeny", "herang xiasem zitend qibele" ],
    [ b"phonetic", "lineum foneum zybale mangur" ],
    [ b"generator", "latole elitab ackina exprou" ],
    [ b"password", "nulize nomere fonici crednt" ],
    [ b"duck", "catabb rompor cricin prunsi" ] ]


# Header prepended to the generated Python output file (--python)
PYTHON_HEADER = """#
# Phonetic Map -- autogenerated, do not edit.
#
"""

# Header prepended to the generated C output file (--clang)
C_HEADER = """/*
 * Phonetic Map -- autogenerated, do not edit.
 *
 * The list contains exactly 27*27 entries (all combinations of 2 characters
 * from the array [" ", "a"..."z"]) and is sorted alphabetically. This makes
 * the ngram lookup time constant, but it also means that the map contains
 * holes. The holes have a value of `map` set to NULL.
 */

struct phogen_entry
{
char *ngram; /* Ngram */
char *map; /* Character map */
};

struct phogen_entry phogen_map[] =
{
"""

# Footer closing the generated C array
C_FOOTER = \
"""};
"""

# NOTE(review): g_ngram_table appears unused -- the generated map is stored
# in g_phonetic_map (assigned by phogen_init); confirm before removing.
g_ngram_table = None

# Parsed command-line arguments, set by pho_parse_args()
g_args = None
|
||||||
|
|
||||||
|
class HashInt:
    """A big integer built from a hash digest, consumed in variable-size
    chunks via repeated modulo/divide operations (mod32)."""

    def __init__(self):
        self.hash = None      # remaining integer value of the digest
        self.bits = 0         # total number of bits loaded by from_bytes()
        self.bits_used = 0    # (approximate) number of bits consumed so far

    def from_bytes(self, buf):
        """Load the byte string `buf` as a big-endian integer."""
        self.hash = int.from_bytes(buf, byteorder='big')
        self.bits = len(buf) * 8
        # Reset consumption accounting when new data is loaded
        self.bits_used = 0

    # Divide the HashInt by `idiv` and return the modulo
    def mod32(self, idiv):
        """Consume ~log2(idiv) bits of entropy: return hash % idiv and
        divide the hash by idiv.  Returns None once the loaded bits are
        exhausted."""
        nbits = math.log2(idiv)
        if nbits + self.bits_used > self.bits:
            return None

        ret = self.hash % idiv
        self.hash //= idiv
        # BUG FIX: bits_used was never advanced, so the exhaustion guard
        # above could never trigger and mod32() kept returning values drawn
        # from a depleted (all-zero) hash.
        self.bits_used += nbits
        return ret
|
||||||
|
|
||||||
|
def elog(*args):
    """Print a diagnostic line to stderr, but only in --verbose mode."""
    if not g_args.verbose:
        return
    print(*args, file=sys.stderr)
|
||||||
|
|
||||||
|
# Yield candidate words from the input file (one word per line).  Words
# containing non-alphabetic characters and roman numerals are skipped.
def phogen_word_list():
    with open(g_args.input) as f:
        for line in f:
            # Lower-case, strip surrounding whitespace and the newline
            word = line.strip().lower()

            # Only plain alphabetic words qualify
            if not word.isalpha():
                continue

            # The dictionary contains a variety of roman numbers; avoid
            # those since they can generate weird combinations (ii, xx, ..)
            if roman.roman_to_int(word) != 0:
                elog("Ignoring roman number:", word)
                continue

            yield word
|
||||||
|
|
||||||
|
# Split each word into ngrams (two-letter sliding windows) and count, for
# every ngram, how often each letter follows it.
#
# The next ngram is computed by removing the first letter from the current
# ngram and appending the next letter in the word.
#
# For example, the word "ananas" produces the following steps:
#
#   "  " (starting ngram), next letter is 'a'
#   " a", next letter is 'n'
#   "an", next letter is 'a'
#   "na", next letter is 'n'
#   "an", next letter is 'a'
#   "na", next letter is 's'
#   "as", end
#
# Each ngram maps to a dictionary with the next letter as key and the
# number of occurrences of that letter as value.  For "ananas":
#
#   {
#     '  ': {'a': 1},
#     ' a': {'n': 1},
#     'an': {'a': 2},
#     'na': {'n': 1, 's': 1},
#   }
#
# Note that a word's final ngram ("as" above) gets no entry of its own
# unless it also occurs in the middle of some word.
def phogen_freq(words=None):
    """Build and return the ngram -> {letter: frequency} table.

    words: iterable of lower-case alphabetic words; defaults to the words
           read from the --input file via phogen_word_list().
    """
    if words is None:
        words = phogen_word_list()

    ngram_freq_list = {}

    for word in words:
        # Start with an empty ngram (two spaces)
        ngram = "  "

        # Read the word character by character and generate new ngrams
        for letter in word:
            # Count how often `letter` follows `ngram`
            freq_map = ngram_freq_list.setdefault(ngram, {})
            freq_map[letter] = freq_map.get(letter, 0) + 1

            # Slide the window: drop the first letter, append the new one
            ngram = ngram[1:] + letter

    return ngram_freq_list
|
||||||
|
|
||||||
|
# Take the output of `phogen_freq` and condense it down:
# - For each ngram, keep at most len(vowels) (default 5) of the most
#   frequent next letters.  The only exception is the initial ngram "  ",
#   which may keep all of its valid letter mappings.
# - Ignore ngram + letter combinations that would produce new ngrams
#   without a proper mapping of their own.
# - If the list contains fewer than len(vowels) letters, pad it with vowels.
def phogen_map(ngram_list, vowels=None):
    """Normalize the frequency table into a ngram -> [letters] map.

    ngram_list: output of phogen_freq()
    vowels:     padding alphabet (its length is also the per-ngram size
                limit); defaults to PHO_GEN_VOWELS.
    """
    if vowels is None:
        vowels = PHO_GEN_VOWELS

    ngram_normal = {}
    for (ngram, data) in ngram_list.items():
        ndata = []

        # Scan the letters ordered by frequency (ties broken by character
        # value, so letters with the same frequency have a predictable
        # output).  Compute the successor ngram by removing the first
        # letter of the current ngram and appending the candidate letter;
        # if the successor has no entry in ngram_list the mapping would
        # dangle, so skip it.  Keep at most len(vowels) letters, except
        # for the initial ngram which may keep all of them.
        for (letter, freq) in sorted(data.items(), key=lambda x: x[1] * 256 + ord(x[0]), reverse=True):
            new_ngram = ngram[1:] + letter
            if not new_ngram in ngram_list:
                elog("Ignoring invalid ngram mapping {} + {} -> {}".format(ngram, letter, new_ngram))
                continue

            ndata.append(letter)
            if ngram != "  " and len(ndata) >= len(vowels):
                break

        # If the list contains fewer than len(vowels) letters, pad it
        # with vowels (skipping ones already present)
        for v in vowels:
            if len(ndata) >= len(vowels):
                break
            if v not in ndata:
                ndata.append(v)

        ngram_normal[ngram] = ndata

    return ngram_normal
|
||||||
|
|
||||||
|
# Take the output of `phogen_map` and verify that all mappings actually
# resolve to new ngrams
def phogen_verify(ngram_list):
    """Return True if every ngram + letter combination leads to a ngram
    that itself has a mapping; otherwise print a diagnostic and return
    False."""
    for (ngram, letters) in ngram_list.items():
        for l in letters:
            tngram = ngram[1:] + l
            if tngram not in ngram_list:
                # BUG FIX: the message previously passed the template and
                # its arguments straight to print() (never calling
                # .format) and referenced an undefined name `nngram`,
                # raising NameError instead of reporting the problem.
                print("Incomplete map, dangling mapping: {} + {} -> !{}"
                      .format(ngram, l, tngram))
                return False

    return True
|
||||||
|
|
||||||
|
def phogen_init():
    """Build the global phonetic map from the input word list.

    Returns True on success, False if the generated map is inconsistent.
    """
    global g_phonetic_map

    elog("Generating frequency table ...")
    frequencies = phogen_freq()
    elog("Generating phonetic map ...")
    candidate = phogen_map(frequencies)

    elog("Verifying mappings ...")
    if phogen_verify(candidate):
        g_phonetic_map = candidate
        return True

    return False
|
||||||
|
|
||||||
|
def phogen_test():
    """Run the built-in self test: each seed in PHO_GEN_TEST must expand
    to its expected four-word phonetic password."""
    for seed, expected in PHO_GEN_TEST:
        # Derive a big integer from the seed's SHA256 digest
        h = HashInt()
        h.from_bytes(hashlib.sha256(seed).digest())

        # Four words of six letters each, space separated
        generated = " ".join(phogen(h, 6) for _ in range(4))

        if generated != expected:
            print("Internal test failed: '{}' != '{}'".format(generated, expected))
            return False

    return True
|
||||||
|
|
||||||
|
# Output a phonetic representation of `length`. The data is taken from the
# Hash in `h`
def phogen(h, length):
    """Spell out `length` letters by repeatedly drawing an index from `h`
    and walking the global ngram map."""
    global g_phonetic_map

    letters = []
    ngram = "  "
    for _ in range(length):
        choices = g_phonetic_map[ngram]
        letter = choices[h.mod32(len(choices))]
        letters.append(letter)
        # Slide the ngram window forward by one letter
        ngram = ngram[1:] + letter

    return "".join(letters)
|
||||||
|
|
||||||
|
def pho_parse_args():
    """Parse the command line into the module-global g_args."""
    global g_args

    parser = argparse.ArgumentParser(description="Generate mappings for the phonetic generator. Without arguments, just run the built-in tests.")
    parser.add_argument("-i", "--input", action="store", help="Input file (word list)", default=None, required=True)
    parser.add_argument("-j", "--json", action="store", help="JSON output file", default=None)
    parser.add_argument("-p", "--python", action="store", help="Python output file", default=None)
    parser.add_argument("-c", "--clang", action="store", help="C output file", default=None)
    parser.add_argument("-t", "--test", action="store_true", help="Output random passwords", default=False)
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose", default=False)

    g_args = parser.parse_args()
|
||||||
|
|
||||||
|
# Write the phonetic map to file object `f` as a C array of
# struct phogen_entry.  One entry is emitted for every 2-character
# combination of [" ", "a".."z"] (27*27 entries) in alphabetical order;
# combinations without a mapping get a NULL map, so the C side can do
# constant-time indexed lookups.
def phogen_clang_dump(f):
    print(C_HEADER, file=f, end="")

    # Generate a list of (" ", "a" ... "z" )
    charmap = (" ", *(chr(x) for x in range(ord('a'), ord('z') +1)))

    # Each entry's closing brace is printed lazily -- by the following
    # iteration (with a trailing comma) or after the loops for the very
    # last entry (without one).
    first = True

    for i in charmap:
        for j in charmap:
            # Close the previous entry, if any
            if not first:
                print(' },', file=f)

            first = False

            ngram = i + j
            # Ngrams missing from the map become NULL "holes"
            if ngram in g_phonetic_map:
                nmap = '"{}"'.format("".join(g_phonetic_map[ngram]))
            else:
                nmap = "NULL"

            print(' {', file=f)
            print(' .ngram = "{}",'.format(ngram), file=f)
            print(' .map = {}'.format(nmap), file=f)

    # Close the final entry and the array
    print(' }', file=f)
    print(C_FOOTER, file=f, end="")
|
||||||
|
|
||||||
|
# Script entry: parse arguments, build and self-test the map, then emit
# the requested output formats.
pho_parse_args()

if not phogen_init():
    # BUG FIX: this previously wrote to `std.err`, an undefined name that
    # raised NameError instead of reporting the failure; the intended
    # stream is sys.stderr.
    print("Error generating phonetic map", file=sys.stderr)
    sys.exit(1)

if not phogen_test():
    sys.exit(1)

# Optional Python output: a module assigning the map to g_phonetic_map
if g_args.python:
    with open(g_args.python, "w") as f:
        print(PYTHON_HEADER, file=f, end="")
        print("g_phonetic_map = \\", file=f)
        pprint.pprint(g_phonetic_map, f, width=256, compact=False)

# Optional JSON output
if g_args.json:
    with open(g_args.json, "w") as f:
        json.dump(g_phonetic_map, f, indent=4, sort_keys=True)
        print(file=f)

# Optional C output
if g_args.clang:
    with open(g_args.clang, "w") as f:
        phogen_clang_dump(f)

# With --test, print a few passwords generated from random data
if g_args.test:
    for w in range(0, 5):
        h = HashInt()
        h.from_bytes(random.randbytes(16))
        phopass = []
        for j in range(0, 4):
            phopass.append(phogen(h, 6))

        print(" ".join(phopass))
|
||||||
45
python/phogen_map/roman.py
Normal file
45
python/phogen_map/roman.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
# Roman numeral helpers.
#
# The symbol table lists the subtractive pairs ("IV", "IX", ...) before the
# single letters they start with, so greedy prefix matching in
# roman_to_int() picks the two-character forms first.
roman_symbol = [
    [ "IV", 4 ],
    [ "IX", 9 ],
    [ "XL", 40 ],
    [ "XC", 90 ],
    [ "CD", 400 ],
    [ "CM", 900 ],
    [ "I", 1 ],
    [ "V", 5 ],
    [ "X", 10 ],
    [ "L", 50 ],
    [ "C", 100 ],
    [ "D", 500 ],
    [ "M", 1000]
]


# Convert an integer to its roman representation as string
def int_to_roman(i):
    """Spell the non-negative integer `i` as an upper-case roman numeral;
    0 yields the empty string."""
    parts = []
    remaining = i
    # Walk the symbols from largest to smallest value, emitting each as
    # many times as it fits into the remainder
    for symbol, value in sorted(roman_symbol, key=lambda e: e[1], reverse=True):
        count, rest = divmod(remaining, value)
        if count > 0:
            parts.append(symbol * count)
            remaining = rest

    return "".join(parts)


# Convert a roman number to its integer value; return 0 on error
def roman_to_int(s):
    """Parse the (case-insensitive) roman numeral `s`; return 0 if it is
    not a canonical roman number."""
    total = 0
    rest = s.upper()
    # Greedily strip known symbols off the front, accumulating their values
    while rest:
        match = next((entry for entry in roman_symbol if rest.startswith(entry[0])), None)
        if match is None:
            return 0

        total += match[1]
        rest = rest.removeprefix(match[0])

    # Round-trip check: rejects non-canonical spellings such as "IIII"
    if int_to_roman(total) != s.upper():
        return 0

    return total
|
||||||
Reference in New Issue
Block a user