#!/usr/bin/env python
#
# Generate the ngram-based phonetic map used by the phonetic password
# generator, and run the built-in self tests.

import argparse
import hashlib
import json
import math
import pprint
import random
import sys

import roman

# Vowels used to pad each ngram mapping up to five letters.
PHO_GEN_VOWELS = "aeiou"

# Built-in test vectors: hashing the seed (first element) with SHA256 and
# feeding the digest through phogen() must reproduce the phonetic
# passphrase (second element).
PHO_GEN_TEST = [
    [b"passgeny", "herang xiasem zitend qibele"],
    [b"phonetic", "lineum foneum zybale mangur"],
    [b"generator", "latole elitab ackina exprou"],
    [b"password", "nulize nomere fonici crednt"],
    [b"duck", "catabb rompor cricin prunsi"]
]

PYTHON_HEADER = """#
# Phonetic Map -- autogenerated, do not edit.
#
"""

C_HEADER = """/*
 * Phonetic Map -- autogenerated, do not edit.
 *
 * The list contains exactly 27*27 entries (all combinations of 2 characters
 * from the array [" ", "a"..."z"]) and is sorted alphabetically. This makes
 * the ngram lookup time constant, but it also means that the map contains
 * holes. The holes have a value of `map` set to NULL.
 */

struct phogen_entry {
    char *ngram; /* Ngram */
    char *map;   /* Character map */
};

struct phogen_entry phogen_map[] = {
"""

C_FOOTER = \
"""};
"""

# NOTE(review): g_ngram_table appears to be unused anywhere in this file;
# kept to avoid breaking external users, if any.
g_ngram_table = None
g_args = None


class HashInt:
    """A big integer built from a hash digest, consumed piecewise.

    The digest bytes are interpreted as one large integer; mod32()
    repeatedly divides it, handing out remainders until the available
    entropy is used up.
    """

    def __init__(self):
        self.hash = None       # remaining integer value of the digest
        self.bits = 0          # total number of bits loaded
        self.bits_used = 0     # approximate number of bits consumed so far

    def from_bytes(self, buf):
        """Load the big-endian integer representation of `buf`."""
        self.hash = int.from_bytes(buf, byteorder='big')
        self.bits = len(buf) * 8

    # Divide the HashInt by `idiv` and return the modulo
    def mod32(self, idiv):
        """Divide the integer by `idiv` and return the remainder.

        Returns None once the accumulated divisions would consume more
        bits than the loaded digest provides.
        """
        nbits = math.log(idiv) / math.log(2)
        if nbits + self.bits_used > self.bits:
            return None
        ret = self.hash % idiv
        self.hash //= idiv
        # Bug fix: the original never accounted for the consumed bits, so
        # the exhaustion check above could never trigger.
        self.bits_used += nbits
        return ret


def elog(*args):
    """Log to stderr, but only when --verbose was given."""
    if g_args.verbose:
        print(*args, file=sys.stderr)


# Read the words file line by line (each line should represent a single word).
# Filter out words that contain special characters or are roman numbers.
def phogen_word_list():
    """Yield normalized words from the input word list (g_args.input)."""
    with open(g_args.input) as f:
        for text in f:
            # All lower-case, strip whitespaces and new lines
            text = text.lower().strip()
            # Skip non-alphanumeric words
            if not text.isalpha():
                continue
            # The dictionary contains a variety of roman numbers. Avoid
            # those since they can generate weird combinations (ii, xx, ..)
            # NOTE(review): assumes roman.roman_to_int() returns 0 for
            # non-roman input -- confirm against the `roman` module in use.
            if roman.roman_to_int(text) != 0:
                elog("Ignoring roman number:", text)
                continue
            yield text


# This function takes the word list and splits each word into ngrams. A ngram
# in the context of this function is just a combination of two letters.
#
# The next ngram is computed by removing the first letter from the current
# ngram and appending the next letter in the word.
#
# For example, the word "ananas" produces the following ngrams:
#
#   "  " (starting ngram), next letter is 'a'
#   " a", next letter is 'n'
#   "an", next letter is 'a'
#   "na", next letter is 'n'
#   "an", next letter is 'a'
#   "na", next letter is 's'
#   "as", end
#
# Each ngram is inserted into a dictionary. The dictionary data is another
# dictionary that contains the next letter as key and the number of
# occurrences detected of the next letter.
#
# Taking "ananas" as example again, we get the following output:
#
# {
#     '  ': {'a': 1},
#     ' a': {'n': 1},
#     'an': {'a': 2},
#     'na': {'n': 1, 's': 1}
# }
#
# Note that the final ngram of a word ("as" above) is never inserted, since
# it has no next letter.
def phogen_freq():
    """Build the ngram -> {next_letter: frequency} table from the word list."""
    ngram_freq_list = {}
    # ngram_freq_list[ngram][next_letter] = frequency
    for word in phogen_word_list():
        # Start with an empty ngram (two spaces)
        ngram = "  "
        # Read the word character by character and generate new ngrams
        for letter in word:
            if ngram not in ngram_freq_list:
                ngram_freq_list[ngram] = {}
            # Take the current ngram and bump the next-character frequency
            freq = ngram_freq_list[ngram].get(letter, 0)
            ngram_freq_list[ngram][letter] = freq + 1
            # Compute the next ngram from the next character
            ngram = ngram[1:] + letter
    return ngram_freq_list


# Take the output of `phogen_freq` and condense it down:
# - For each ngram, take the letter:freq list, sort it by most common letter
#   and use the 5 topmost letters. The only exception is the initial ngram
#   "  ", which keeps all letter mappings instead of 5.
def phogen_map(ngram_list):
    """Condense the frequency table into the phonetic map.

    For each ngram keep (at most) the 5 most frequent next letters; the
    initial ngram "  " keeps every valid next letter. Letters whose
    resulting ngram does not exist in `ngram_list` are dropped, and short
    lists are padded with vowels up to 5 entries.
    """
    ngram_normal = {}
    for (ngram, data) in ngram_list.items():
        ndata = []
        # Scan the letters sorted by frequency, then by character value, so
        # letters with the same frequency have a predictable output. A
        # letter is only usable if the ngram it produces (drop the first
        # character, append the letter) exists in ngram_list; otherwise the
        # generator would walk into an unmapped ngram.
        for (letter, freq) in sorted(data.items(),
                                     key=lambda x: x[1] * 256 + ord(x[0]),
                                     reverse=True):
            new_ngram = ngram[1:] + letter
            if new_ngram not in ngram_list:
                elog("Ignoring invalid ngram mapping {} + {} -> {}".format(
                    ngram, letter, new_ngram))
                continue
            ndata.append(letter)
            # The initial ngram keeps all of its mappings; everything else
            # is capped at 5 letters.
            if ngram != "  " and len(ndata) >= len(PHO_GEN_VOWELS):
                break
        # If the list contains less than 5 letters, fill it with vowels.
        # NOTE(review): the padded vowels are not validated against
        # ngram_list here; phogen_verify() catches dangling mappings later.
        for v in PHO_GEN_VOWELS:
            if len(ndata) >= len(PHO_GEN_VOWELS):
                break
            if v not in ndata:
                ndata.append(v)
        ngram_normal[ngram] = ndata
    return ngram_normal


def phogen_verify(ngram_list):
    """Check that every mapping in `ngram_list` resolves to an existing ngram.

    Returns True when the map is closed (no dangling mappings), False
    otherwise.
    """
    for (ngram, letters) in ngram_list.items():
        for l in letters:
            tngram = ngram[1:] + l
            if tngram not in ngram_list:
                # Bug fix: the original referenced an undefined variable
                # (`nngram`) and never applied .format() to the message.
                print("Incomplete map, dangling mapping: {} + {} -> !{}"
                      .format(ngram, l, tngram))
                return False
    return True


def phogen_init():
    """Build and verify the global phonetic map. Returns True on success."""
    global g_phonetic_map
    elog("Generating frequency table ...")
    pho_freq = phogen_freq()
    elog("Generating phonetic map ...")
    pho_map = phogen_map(pho_freq)
    elog("Verifying mappings ...")
    if not phogen_verify(pho_map):
        return False
    g_phonetic_map = pho_map
    return True


def phogen_test():
    """Run the built-in self test against the hard-coded PHO_GEN_TEST vectors."""
    for w in PHO_GEN_TEST:
        # Generate a big integer from the SHA256 digest of the seed
        h = HashInt()
        h.from_bytes(hashlib.sha256(w[0]).digest())
        # Four 6-letter phonetic words per passphrase
        phopass = [phogen(h, 6) for _ in range(4)]
        wp = " ".join(phopass)
        if wp != w[1]:
            print("Internal test failed: '{}' != '{}'".format(wp, w[1]))
            return False
    return True


# Output a phonetic representation of `length`. The data is taken from the
# Hash in `h`
def phogen(h, length):
    """Return a phonetic word of `length` letters drawn from HashInt `h`."""
    word = ""
    ngram = "  "
    for _ in range(length):
        choices = g_phonetic_map[ngram]
        # Use the hash entropy to pick one of the mapped letters
        letter = choices[h.mod32(len(choices))]
        word += letter
        ngram = ngram[1:] + letter
    return word


def pho_parse_args():
    """Parse the command line into the global g_args."""
    global g_args
    parser = argparse.ArgumentParser(
        description="Generate mappings for the phonetic generator. Without arguments, just run the built-in tests.")
    parser.add_argument("-i", "--input", action="store",
                        help="Input file (word list)", default=None,
                        required=True)
    parser.add_argument("-j", "--json", action="store",
                        help="JSON output file", default=None)
    parser.add_argument("-p", "--python", action="store",
                        help="Python output file", default=None)
    parser.add_argument("-c", "--clang", action="store",
                        help="C output file", default=None)
    parser.add_argument("-t", "--test", action="store_true",
                        help="Output random passwords", default=False)
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Verbose", default=False)
    g_args = parser.parse_args()


def phogen_clang_dump(f):
    """Write the phonetic map as a C array (see C_HEADER) to file `f`.

    All 27*27 two-character ngrams are emitted in sorted order; ngrams
    without a mapping get a NULL map so C lookups can index directly.
    """
    print(C_HEADER, file=f, end="")
    # Generate the list of (" ", "a" ... "z")
    charmap = (" ", *(chr(x) for x in range(ord('a'), ord('z') + 1)))
    first = True
    for i in charmap:
        for j in charmap:
            # Close the previous entry with a comma before opening a new one
            if not first:
                print('    },', file=f)
            first = False
            ngram = i + j
            if ngram in g_phonetic_map:
                nmap = '"{}"'.format("".join(g_phonetic_map[ngram]))
            else:
                nmap = "NULL"
            print('    {', file=f)
            print('        .ngram = "{}",'.format(ngram), file=f)
            print('        .map = {}'.format(nmap), file=f)
    # Close the last entry without a trailing comma
    print('    }', file=f)
    print(C_FOOTER, file=f, end="")


def main():
    """Entry point: build the map, self test, then write requested outputs."""
    pho_parse_args()
    if not phogen_init():
        # Bug fix: the original wrote to `std.err` (a NameError).
        print("Error generating phonetic map", file=sys.stderr)
        sys.exit(1)
    if not phogen_test():
        sys.exit(1)
    if g_args.python:
        with open(g_args.python, "w") as f:
            print(PYTHON_HEADER, file=f, end="")
            print("g_phonetic_map = \\", file=f)
            pprint.pprint(g_phonetic_map, f, width=256, compact=False)
    if g_args.json:
        with open(g_args.json, "w") as f:
            json.dump(g_phonetic_map, f, indent=4, sort_keys=True)
            print(file=f)
    if g_args.clang:
        with open(g_args.clang, "w") as f:
            phogen_clang_dump(f)
    if g_args.test:
        # Emit 5 sample passphrases built from random entropy
        for _ in range(5):
            h = HashInt()
            h.from_bytes(random.randbytes(16))
            print(" ".join(phogen(h, 6) for _ in range(4)))


if __name__ == "__main__":
    main()