python: Add the Python implementation of the phonetic generator

Add the Python implementation of the phonetic table generator.
This commit is contained in:
Mitja Horvat
2021-08-27 14:53:50 +02:00
committed by Mitja HORVAT
parent a18891eac9
commit 7acbc7de0c
2 changed files with 372 additions and 0 deletions

327
python/phogen_map/phogen_map.py Executable file
View File

@ -0,0 +1,327 @@
#!/usr/bin/env python
import hashlib
import json
import math
import pprint
import random
import roman
import sys
import argparse
PHO_GEN_VOWELS = "aeiou"
PHO_GEN_TEST = [
[ b"passgeny", "herang xiasem zitend qibele" ],
[ b"phonetic", "lineum foneum zybale mangur" ],
[ b"generator", "latole elitab ackina exprou" ],
[ b"password", "nulize nomere fonici crednt" ],
[ b"duck", "catabb rompor cricin prunsi" ] ]
PYTHON_HEADER = """#
# Phonetic Map -- autogenerated, do not edit.
#
"""
C_HEADER = """/*
* Phonetic Map -- autogenerated, do not edit.
*
* The list contains exactly 27*27 entries (all combinations of 2 characters
* from the array [" ", "a"..."z"]) and is sorted alphabetically. This makes
* the ngram lookup time constant, but it also means that the map contains
* holes. The holes have a value of `map` set to NULL.
*/
struct phogen_entry
{
char *ngram; /* Ngram */
char *map; /* Character map */
};
struct phogen_entry phogen_map[] =
{
"""
C_FOOTER = \
"""};
"""
g_ngram_table = None
g_args = None
class HashInt:
def __init__(self):
self.hash = None
self.bits = 0
self.bits_used = 0
def from_bytes(self, buf):
self.hash = int.from_bytes(buf, byteorder='big')
self.bits = len(buf) * 8
# Divide the HashInt by `idiv` and return the modulo
def mod32(self, idiv):
nbits = math.log(idiv) / math.log(2)
if nbits + self.bits_used > self.bits:
return None
ret = self.hash % idiv
self.hash //= idiv
return ret
def elog(*args):
if g_args.verbose:
print(*args, file=sys.stderr)
# Read the words file line by line (each line should represent a single word).
# Filter out words that contain special characters or are roman numbers.
def phogen_word_list():
with open(g_args.input) as f:
for text in f.readlines():
# All lower-case, strip whitespaces and new lines
text = text.lower().strip()
# Skip non-alphanumeric words
if not text.isalpha():
continue
# The dictionary contains a variatey of roman numbers. Avoid
# those since they can generate weird combinations (ii, xx, ..)
if roman.roman_to_int(text) != 0:
elog("Ignoring roman number:", text)
continue
yield text
# This function takes the word list and splits each word into ngrams. A ngram
# in the context of this function is just a combination of two letters.
#
# The next ngram is computed by removing the first letter from the current ngram
# and appending the next letter in the word.
#
# For example, the word "ananas" produces the following ngrams:
#
# " " (starting ngram), next letter is 'a'
# " a", next letter is 'n'
# "an", next letter is 'a'
# "na", next letter is 'n'
# "an", next letter is 'a'
# "na", next letter is 's'
# "as", end
#
# Each ngram is inserted into a dictionary. The dictionary data is another
# dictionary that cointains the next letter as key and the number of occurrences
# detected of the next letter.
#
# Taking "ananas" as example again, we get the following output:
#
# {
# ' ': {'a': 1},
# ' a': {'n': 1},
# 'an': {'a': 2},
# 'na': {'n': 1, 's': 1}}
# 'as' {}
# }
#
def phogen_freq():
ngram_freq_list = {}
# Generate a ngram table. This is a double dictionary where the mapping is
# as follows:
# ngram_freq_list[ngram][next_letter] = frequency
for word in phogen_word_list():
# Start with an empty ngram (two spaces)
ngram = " "
# Read the word character by character and generate new ngrams
for letter in word:
if not ngram in ngram_freq_list:
ngram_freq_list[ngram] = {}
# Take the current ngram and compute the next character frequency
freq = ngram_freq_list[ngram].get(letter, 0)
freq += 1
ngram_freq_list[ngram][letter] = freq
# Compute the next ngram from the next character
ngram = ngram[1:] + letter
return ngram_freq_list
# Take the output of `phogen_freq_list` and condense it down:
# - For each ngram, take the letter:freq list, sort it by most common letter
# and use 5 topmost letters. The only exception is the initial ngram " ",
# which can all letter mappings instead of 5.
# - Ignore ngram + letter combination that would produce new ngrams without a
# a proper mapping.
# - If the list contains 5 or less letters, fill it with vowels
def phogen_map(ngram_list):
# Scan the ngram list and normalize it:
# - If a ngram -> letters contains more than 5 combinations, use the 5
# most frequent combinations
# - If a ngram -> letters contains less than 5 combinations, fill it with
# vowels
ngram_normal = {}
for (ngram, data) in ngram_list.items():
ndata = []
# Scan the letters data in an order that is sorted by frequency (and
# character value, so letters with the same frequency have a predictable
# outout) of the letters. Compute the next ngram by removing the first
# letter from the current ngram and append the next letter. If the new
# ngram does not exists in ngram_list, consider it invalid and ignore it.
#
# If the mapping exists, add the current letter to the normalized list.
#
# Do this for a maximum of 5 letters.
for (letter, freq) in sorted(data.items(), key=lambda x: x[1] * 256 + ord(x[0]), reverse=True):
new_ngram = ngram[1:] + letter
if not new_ngram in ngram_list:
elog("Ignoring invalid ngram mapping {} + {} -> {}".format(ngram, letter, new_ngram))
continue
ndata.append(letter)
if ngram != " " and len(ndata) >= len(PHO_GEN_VOWELS): break
# If the list contains less than 5 letters, fill it with vowels
for v in PHO_GEN_VOWELS:
if len(ndata) >= len(PHO_GEN_VOWELS): break
if v not in ndata: ndata.append(v)
ngram_normal[ngram] = ndata
return ngram_normal
# Take the output of `phogen_map` and verify that all mappings actually
# resolve to new ngrams
def phogen_verify(ngram_list):
for (ngram, letters) in ngram_list.items():
for l in letters:
tngram = ngram[1:] + l
if tngram not in ngram_list:
print("Incomplete map, dangling mapping: {} + {} -> !{}",
ngram, l, nngram)
return False
return True
def phogen_init():
global g_phonetic_map
elog("Generating frequency table ...")
pho_freq = phogen_freq()
elog("Generating phonetic map ...")
pho_map = phogen_map(pho_freq)
elog("Verifying mappings ...")
if not phogen_verify(pho_map):
return False
g_phonetic_map = pho_map
return True
def phogen_test():
# Generate phonetic passwords from 5 hard-coded examples
for w in PHO_GEN_TEST:
# Generate a big integer from the SHA256 data
h = HashInt()
h.from_bytes(hashlib.sha256(w[0]).digest())
phopass = []
for j in range(0, 4):
phopass.append(phogen(h, 6))
wp = " ".join(phopass)
if wp != w[1]:
print("Internal test failed: '{}' != '{}'".format(wp, w[1]))
return False
#print(" ".join(phopass))
return True
# Output a phonetic representation of `length`. The data is taken from the
# Hash in `h`
def phogen(h, length):
global g_phonetic_map
word =""
ngram = " "
for i in range(length):
nl = len(g_phonetic_map[ngram])
ni = h.mod32(nl)
letter = g_phonetic_map[ngram][ni]
word += letter
ngram = ngram[1:] + letter
return word
def pho_parse_args():
global g_args
args = argparse.ArgumentParser(description="Generate mappings for the phonetic generator. Without arguments, just run the built-in tests.")
args.add_argument("-i", "--input", action="store", help="Input file (word list)", default=None, required=True)
args.add_argument("-j", "--json", action="store", help="JSON output file", default=None)
args.add_argument("-p", "--python", action="store", help="Python output file", default=None)
args.add_argument("-c", "--clang", action="store", help="C output file", default=None)
args.add_argument("-t", "--test", action="store_true", help="Output random passwords", default=False)
args.add_argument("-v", "--verbose", action="store_true", help="Verbose", default=False)
g_args = args.parse_args()
def phogen_clang_dump(f):
print(C_HEADER, file=f, end="")
# Generate a list of (" ", "a" ... "z" )
charmap = (" ", *(chr(x) for x in range(ord('a'), ord('z') +1)))
first = True
for i in charmap:
for j in charmap:
if not first:
print(' },', file=f)
first = False
ngram = i + j
if ngram in g_phonetic_map:
nmap = '"{}"'.format("".join(g_phonetic_map[ngram]))
else:
nmap = "NULL"
print(' {', file=f)
print(' .ngram = "{}",'.format(ngram), file=f)
print(' .map = {}'.format(nmap), file=f)
print(' }', file=f)
print(C_FOOTER, file=f, end="")
pho_parse_args()
if not phogen_init():
print("Error generating phonetic map", file=std.err)
sys.exit(1)
if not phogen_test():
sys.exit(1)
if g_args.python:
with open(g_args.python, "w") as f:
print(PYTHON_HEADER, file=f, end="")
print("g_phonetic_map = \\", file=f)
pprint.pprint(g_phonetic_map, f, width=256, compact=False)
if g_args.json:
with open(g_args.json, "w") as f:
json.dump(g_phonetic_map, f, indent=4, sort_keys=True)
print(file=f)
if g_args.clang:
with open(g_args.clang, "w") as f:
phogen_clang_dump(f)
if g_args.test:
for w in range(0, 5):
h = HashInt()
h.from_bytes(random.randbytes(16))
phopass = []
for j in range(0, 4):
phopass.append(phogen(h, 6))
print(" ".join(phopass))