diff --git a/python/phogen_map/phogen_map.py b/python/phogen_map/phogen_map.py
new file mode 100755
index 0000000..a99da80
--- /dev/null
+++ b/python/phogen_map/phogen_map.py
@@ -0,0 +1,327 @@
+#!/usr/bin/env python
+
+import hashlib
+import json
+import math
+import pprint
+import random
+import roman
+import sys
+import argparse
+
+PHO_GEN_VOWELS = "aeiou"
+
+PHO_GEN_TEST = [
+    [ b"passgeny", "herang xiasem zitend qibele" ],
+    [ b"phonetic", "lineum foneum zybale mangur" ],
+    [ b"generator", "latole elitab ackina exprou" ],
+    [ b"password", "nulize nomere fonici crednt" ],
+    [ b"duck", "catabb rompor cricin prunsi" ] ]
+
+
+PYTHON_HEADER = """#
+# Phonetic Map -- autogenerated, do not edit.
+#
+"""
+
+C_HEADER = """/*
+ * Phonetic Map -- autogenerated, do not edit.
+ *
+ * The list contains exactly 27*27 entries (all combinations of 2 characters
+ * from the array [" ", "a"..."z"]) and is sorted alphabetically. This makes
+ * the ngram lookup time constant, but it also means that the map contains
+ * holes. The holes have a value of `map` set to NULL.
+ */
+
+struct phogen_entry
+{
+    char *ngram;    /* Ngram */
+    char *map;      /* Character map */
+};
+
+struct phogen_entry phogen_map[] =
+{
+"""
+
+C_FOOTER = \
+"""};
+"""
+
+g_phonetic_map = None
+g_args = None
+
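+# Wraps a hash digest in a single big integer. The generator consumes the
+# digest by repeatedly dividing it and using the remainder as an index into
+# the phonetic map (see mod32() and phogen()).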
+class HashInt:
+    def __init__(self):
+        self.hash = None
+        self.bits = 0
+        self.bits_used = 0
+
+    def from_bytes(self, buf):
+        self.hash = int.from_bytes(buf, byteorder='big')
+        self.bits = len(buf) * 8
+
+    # Divide the HashInt by `idiv` and return the modulo. Returns None when
+    # the hash does not have enough unused bits left for the division.
+    def mod32(self, idiv):
+        nbits = math.log(idiv) / math.log(2)
+        if nbits + self.bits_used > self.bits:
+            return None
+
+        ret = self.hash % idiv
+        self.hash //= idiv
+        # Track how many bits of the hash have been consumed
+        self.bits_used += nbits
+        return ret
+
+def elog(*args):
+    if g_args.verbose:
+        print(*args, file=sys.stderr)
+
+# Read the words file line by line (each line should represent a single word).
+# Filter out words that contain special characters or are Roman numerals.
+def phogen_word_list():
+    with open(g_args.input) as f:
+        for text in f.readlines():
+            # All lower-case, strip whitespace and newlines
+            text = text.lower().strip()
+            # Skip non-alphabetic words
+            if not text.isalpha():
+                continue
+
+            # The dictionary contains a variety of Roman numerals. Avoid
+            # those since they can generate weird combinations (ii, xx, ..)
+            if roman.roman_to_int(text) != 0:
+                elog("Ignoring roman number:", text)
+                continue
+
+            yield text
+
+# This function takes the word list and splits each word into ngrams. An ngram
+# in the context of this function is just a combination of two letters.
+#
+# The next ngram is computed by removing the first letter from the current
+# ngram and appending the next letter in the word.
+#
+# For example, the word "ananas" produces the following ngrams:
+#
+#     "  " (starting ngram), next letter is 'a'
+#     " a", next letter is 'n'
+#     "an", next letter is 'a'
+#     "na", next letter is 'n'
+#     "an", next letter is 'a'
+#     "na", next letter is 's'
+#     "as", end
+#
+# Each ngram is inserted into a dictionary. The value is another dictionary
+# that maps each possible next letter to the number of times it was seen
+# after that ngram.
+#
+# Taking "ananas" as an example again, we get the following output:
+#
+# {
+#     '  ': {'a': 1},
+#     ' a': {'n': 1},
+#     'an': {'a': 2},
+#     'na': {'n': 1, 's': 1}
+# }
+#
+# Note that the trailing ngram "as" gets no entry of its own because it is
+# never followed by another letter.
+#
+def phogen_freq():
+    ngram_freq_list = {}
+
+    # Generate an ngram table. This is a double dictionary where the mapping
+    # is as follows:
+    #     ngram_freq_list[ngram][next_letter] = frequency
+    for word in phogen_word_list():
+        # Start with an empty ngram (two spaces)
+        ngram = "  "
+        # Read the word character by character and generate new ngrams
+        for letter in word:
+            if ngram not in ngram_freq_list:
+                ngram_freq_list[ngram] = {}
+
+            # Take the current ngram and count the next-character frequency
+            freq = ngram_freq_list[ngram].get(letter, 0)
+            freq += 1
+            ngram_freq_list[ngram][letter] = freq
+
+            # Compute the next ngram from the next character
+            ngram = ngram[1:] + letter
+
+    return ngram_freq_list
+
+# Take the output of `phogen_freq` and condense it down:
+# - For each ngram, take the letter:freq list, sort it by most common letter
+#   and keep the 5 topmost letters. The only exception is the initial ngram
+#   "  ", which keeps all letter mappings instead of just 5.
+# - Ignore ngram + letter combinations that would produce new ngrams without
+#   a proper mapping.
+# - If the list contains fewer than 5 letters, fill it up with vowels.
+def phogen_map(ngram_list):
+    # Scan the ngram list and normalize it:
+    # - If an ngram -> letters entry contains more than 5 combinations, use
+    #   the 5 most frequent combinations
+    # - If an ngram -> letters entry contains less than 5 combinations, fill
+    #   it with vowels
+    ngram_normal = {}
+    for (ngram, data) in ngram_list.items():
+        ndata = []
+        # Scan the letters in an order that is sorted by frequency (and
+        # character value, so letters with the same frequency have a
+        # predictable output). Compute the next ngram by removing the first
+        # letter from the current ngram and appending the next letter. If the
+        # new ngram does not exist in ngram_list, consider it invalid and
+        # ignore it.
+        #
+        # If the mapping exists, add the current letter to the normalized
+        # list.
+        #
+        # Do this for a maximum of 5 letters.
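+        #
+        # The sort key packs (frequency, letter) into a single integer: the
+        # frequency is weighted by 256 so it dominates, while ord(letter)
+        # breaks ties between letters of equal frequency.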
+        for (letter, freq) in sorted(data.items(), key=lambda x: x[1] * 256 + ord(x[0]), reverse=True):
+            new_ngram = ngram[1:] + letter
+            if new_ngram not in ngram_list:
+                elog("Ignoring invalid ngram mapping {} + {} -> {}".format(ngram, letter, new_ngram))
+                continue
+
+            ndata.append(letter)
+            if ngram != "  " and len(ndata) >= len(PHO_GEN_VOWELS): break
+
+        # If the list contains fewer than 5 letters, fill it with vowels
+        for v in PHO_GEN_VOWELS:
+            if len(ndata) >= len(PHO_GEN_VOWELS): break
+            if v not in ndata: ndata.append(v)
+
+        ngram_normal[ngram] = ndata
+
+    return ngram_normal
+
+# Take the output of `phogen_map` and verify that all mappings actually
+# resolve to existing ngrams
+def phogen_verify(ngram_list):
+    for (ngram, letters) in ngram_list.items():
+        for l in letters:
+            tngram = ngram[1:] + l
+            if tngram not in ngram_list:
+                print("Incomplete map, dangling mapping: {} + {} -> {}".format(
+                      ngram, l, tngram))
+                return False
+
+    return True
+
+def phogen_init():
+    global g_phonetic_map
+
+    elog("Generating frequency table ...")
+    pho_freq = phogen_freq()
+    elog("Generating phonetic map ...")
+    pho_map = phogen_map(pho_freq)
+
+    elog("Verifying mappings ...")
+    if not phogen_verify(pho_map):
+        return False
+
+    g_phonetic_map = pho_map
+    return True
+
+def phogen_test():
+    # Generate phonetic passwords from 5 hard-coded examples
+    for w in PHO_GEN_TEST:
+        # Generate a big integer from the SHA-256 digest
+        h = HashInt()
+        h.from_bytes(hashlib.sha256(w[0]).digest())
+        phopass = []
+        for j in range(0, 4):
+            phopass.append(phogen(h, 6))
+
+        wp = " ".join(phopass)
+        if wp != w[1]:
+            print("Internal test failed: '{}' != '{}'".format(wp, w[1]))
+            return False
+
+        #print(" ".join(phopass))
+    return True
+
+# Output a phonetic word of `length` characters. The data is taken from the
+# hash in `h`.
+def phogen(h, length):
+    global g_phonetic_map
+
+    word = ""
+    ngram = "  "
+    for i in range(length):
+        nl = len(g_phonetic_map[ngram])
+        ni = h.mod32(nl)
+        letter = g_phonetic_map[ngram][ni]
+        word += letter
+        ngram = ngram[1:] + letter
+
+    return word
+
+def pho_parse_args():
+    global g_args
+
+    args = argparse.ArgumentParser(description="Generate mappings for the phonetic generator. Without output arguments, just run the built-in tests.")
+    args.add_argument("-i", "--input", action="store", help="Input file (word list)", default=None, required=True)
+    args.add_argument("-j", "--json", action="store", help="JSON output file", default=None)
+    args.add_argument("-p", "--python", action="store", help="Python output file", default=None)
+    args.add_argument("-c", "--clang", action="store", help="C output file", default=None)
+    args.add_argument("-t", "--test", action="store_true", help="Output random passwords", default=False)
+    args.add_argument("-v", "--verbose", action="store_true", help="Verbose", default=False)
+    g_args = args.parse_args()
+
+# Dump the phonetic map as a C array (see C_HEADER). Every possible
+# 2-character ngram over [" ", "a".."z"] gets an entry so the C side can do a
+# constant-time lookup; ngrams without a mapping get .map = NULL.
+def phogen_clang_dump(f):
+    print(C_HEADER, file=f, end="")
+
+    # Generate a list of (" ", "a" ... "z")
+    charmap = (" ", *(chr(x) for x in range(ord('a'), ord('z') + 1)))
+
+    first = True
+
+    for i in charmap:
+        for j in charmap:
+            if not first:
+                print('    },', file=f)
+
+            first = False
+
+            ngram = i + j
+            if ngram in g_phonetic_map:
+                nmap = '"{}"'.format("".join(g_phonetic_map[ngram]))
+            else:
+                nmap = "NULL"
+
+            print('    {', file=f)
+            print('        .ngram = "{}",'.format(ngram), file=f)
+            print('        .map = {}'.format(nmap), file=f)
+
+    print('    }', file=f)
+    print(C_FOOTER, file=f, end="")
+
+pho_parse_args()
+
+if not phogen_init():
+    print("Error generating phonetic map", file=sys.stderr)
+    sys.exit(1)
+
+if not phogen_test():
+    sys.exit(1)
+
+if g_args.python:
+    with open(g_args.python, "w") as f:
+        print(PYTHON_HEADER, file=f, end="")
+        print("g_phonetic_map = \\", file=f)
+        pprint.pprint(g_phonetic_map, f, width=256, compact=False)
+
+if g_args.json:
+    with open(g_args.json, "w") as f:
+        json.dump(g_phonetic_map, f, indent=4, sort_keys=True)
+        print(file=f)
+
+if g_args.clang:
+    with open(g_args.clang, "w") as f:
+        phogen_clang_dump(f)
+
+if g_args.test:
+    for w in range(0, 5):
+        h = HashInt()
+        h.from_bytes(random.randbytes(16))
+        phopass = []
+        for j in range(0, 4):
+            phopass.append(phogen(h, 6))
+
+        print(" ".join(phopass))
diff --git a/python/phogen_map/roman.py b/python/phogen_map/roman.py
new file mode 100644
index 0000000..0ece533
--- /dev/null
+++ b/python/phogen_map/roman.py
@@ -0,0 +1,45 @@
+roman_symbol = [
+    [ "IV", 4 ],
+    [ "IX", 9 ],
+    [ "XL", 40 ],
+    [ "XC", 90 ],
+    [ "CD", 400 ],
+    [ "CM", 900 ],
+    [ "I", 1 ],
+    [ "V", 5 ],
+    [ "X", 10 ],
+    [ "L", 50 ],
+    [ "C", 100 ],
+    [ "D", 500 ],
+    [ "M", 1000 ]
+]
+
+# Convert an integer to its Roman-numeral representation as a string
+def int_to_roman(i):
+    s = ""
+    for rs in sorted(roman_symbol, key=lambda x: x[1], reverse=True):
+        count = i // rs[1]
+        s += rs[0] * count
+        if count > 0:
+            i %= rs[1] * count
+
+    return s
+
+# Convert a Roman numeral to its integer value; return 0 on error
+def roman_to_int(s):
+    i = 0
+    su = s.upper()
+    while len(su) > 0:
+        for rs in roman_symbol:
+            if su.startswith(rs[0]):
+                break
+        else:
+            return 0
+
+        i += rs[1]
+        su = su.removeprefix(rs[0])
+
+    if s.upper() != int_to_roman(i):
+        return 0
+
+    return i