python: Add the Python implementation of the phonetic generator

Add the Python implementation of the phonetic table generator.
This commit is contained in:
Mitja Horvat
2021-08-27 14:53:50 +02:00
committed by Mitja HORVAT
parent a18891eac9
commit 7acbc7de0c
2 changed files with 372 additions and 0 deletions

327
python/phogen_map/phogen_map.py Executable file
View File

@ -0,0 +1,327 @@
#!/usr/bin/env python
import hashlib
import json
import math
import pprint
import random
import roman
import sys
import argparse
PHO_GEN_VOWELS = "aeiou"
PHO_GEN_TEST = [
[ b"passgeny", "herang xiasem zitend qibele" ],
[ b"phonetic", "lineum foneum zybale mangur" ],
[ b"generator", "latole elitab ackina exprou" ],
[ b"password", "nulize nomere fonici crednt" ],
[ b"duck", "catabb rompor cricin prunsi" ] ]
PYTHON_HEADER = """#
# Phonetic Map -- autogenerated, do not edit.
#
"""
C_HEADER = """/*
* Phonetic Map -- autogenerated, do not edit.
*
* The list contains exactly 27*27 entries (all combinations of 2 characters
* from the array [" ", "a"..."z"]) and is sorted alphabetically. This makes
* the ngram lookup time constant, but it also means that the map contains
* holes. The holes have a value of `map` set to NULL.
*/
struct phogen_entry
{
char *ngram; /* Ngram */
char *map; /* Character map */
};
struct phogen_entry phogen_map[] =
{
"""
C_FOOTER = \
"""};
"""
g_ngram_table = None
g_args = None
class HashInt:
def __init__(self):
self.hash = None
self.bits = 0
self.bits_used = 0
def from_bytes(self, buf):
self.hash = int.from_bytes(buf, byteorder='big')
self.bits = len(buf) * 8
# Divide the HashInt by `idiv` and return the modulo
def mod32(self, idiv):
nbits = math.log(idiv) / math.log(2)
if nbits + self.bits_used > self.bits:
return None
ret = self.hash % idiv
self.hash //= idiv
return ret
def elog(*args):
if g_args.verbose:
print(*args, file=sys.stderr)
# Read the words file line by line (each line should represent a single word).
# Filter out words that contain special characters or are roman numbers.
def phogen_word_list():
with open(g_args.input) as f:
for text in f.readlines():
# All lower-case, strip whitespaces and new lines
text = text.lower().strip()
# Skip non-alphanumeric words
if not text.isalpha():
continue
# The dictionary contains a variatey of roman numbers. Avoid
# those since they can generate weird combinations (ii, xx, ..)
if roman.roman_to_int(text) != 0:
elog("Ignoring roman number:", text)
continue
yield text
# This function takes the word list and splits each word into ngrams. A ngram
# in the context of this function is just a combination of two letters.
#
# The next ngram is computed by removing the first letter from the current ngram
# and appending the next letter in the word.
#
# For example, the word "ananas" produces the following ngrams:
#
# " " (starting ngram), next letter is 'a'
# " a", next letter is 'n'
# "an", next letter is 'a'
# "na", next letter is 'n'
# "an", next letter is 'a'
# "na", next letter is 's'
# "as", end
#
# Each ngram is inserted into a dictionary. The dictionary data is another
# dictionary that cointains the next letter as key and the number of occurrences
# detected of the next letter.
#
# Taking "ananas" as example again, we get the following output:
#
# {
# ' ': {'a': 1},
# ' a': {'n': 1},
# 'an': {'a': 2},
# 'na': {'n': 1, 's': 1}}
# 'as' {}
# }
#
def phogen_freq():
ngram_freq_list = {}
# Generate a ngram table. This is a double dictionary where the mapping is
# as follows:
# ngram_freq_list[ngram][next_letter] = frequency
for word in phogen_word_list():
# Start with an empty ngram (two spaces)
ngram = " "
# Read the word character by character and generate new ngrams
for letter in word:
if not ngram in ngram_freq_list:
ngram_freq_list[ngram] = {}
# Take the current ngram and compute the next character frequency
freq = ngram_freq_list[ngram].get(letter, 0)
freq += 1
ngram_freq_list[ngram][letter] = freq
# Compute the next ngram from the next character
ngram = ngram[1:] + letter
return ngram_freq_list
# Take the output of `phogen_freq_list` and condense it down:
# - For each ngram, take the letter:freq list, sort it by most common letter
# and use 5 topmost letters. The only exception is the initial ngram " ",
# which can all letter mappings instead of 5.
# - Ignore ngram + letter combination that would produce new ngrams without a
# a proper mapping.
# - If the list contains 5 or less letters, fill it with vowels
def phogen_map(ngram_list):
# Scan the ngram list and normalize it:
# - If a ngram -> letters contains more than 5 combinations, use the 5
# most frequent combinations
# - If a ngram -> letters contains less than 5 combinations, fill it with
# vowels
ngram_normal = {}
for (ngram, data) in ngram_list.items():
ndata = []
# Scan the letters data in an order that is sorted by frequency (and
# character value, so letters with the same frequency have a predictable
# outout) of the letters. Compute the next ngram by removing the first
# letter from the current ngram and append the next letter. If the new
# ngram does not exists in ngram_list, consider it invalid and ignore it.
#
# If the mapping exists, add the current letter to the normalized list.
#
# Do this for a maximum of 5 letters.
for (letter, freq) in sorted(data.items(), key=lambda x: x[1] * 256 + ord(x[0]), reverse=True):
new_ngram = ngram[1:] + letter
if not new_ngram in ngram_list:
elog("Ignoring invalid ngram mapping {} + {} -> {}".format(ngram, letter, new_ngram))
continue
ndata.append(letter)
if ngram != " " and len(ndata) >= len(PHO_GEN_VOWELS): break
# If the list contains less than 5 letters, fill it with vowels
for v in PHO_GEN_VOWELS:
if len(ndata) >= len(PHO_GEN_VOWELS): break
if v not in ndata: ndata.append(v)
ngram_normal[ngram] = ndata
return ngram_normal
# Take the output of `phogen_map` and verify that all mappings actually
# resolve to new ngrams
def phogen_verify(ngram_list):
for (ngram, letters) in ngram_list.items():
for l in letters:
tngram = ngram[1:] + l
if tngram not in ngram_list:
print("Incomplete map, dangling mapping: {} + {} -> !{}",
ngram, l, nngram)
return False
return True
def phogen_init():
global g_phonetic_map
elog("Generating frequency table ...")
pho_freq = phogen_freq()
elog("Generating phonetic map ...")
pho_map = phogen_map(pho_freq)
elog("Verifying mappings ...")
if not phogen_verify(pho_map):
return False
g_phonetic_map = pho_map
return True
def phogen_test():
# Generate phonetic passwords from 5 hard-coded examples
for w in PHO_GEN_TEST:
# Generate a big integer from the SHA256 data
h = HashInt()
h.from_bytes(hashlib.sha256(w[0]).digest())
phopass = []
for j in range(0, 4):
phopass.append(phogen(h, 6))
wp = " ".join(phopass)
if wp != w[1]:
print("Internal test failed: '{}' != '{}'".format(wp, w[1]))
return False
#print(" ".join(phopass))
return True
# Output a phonetic representation of `length`. The data is taken from the
# Hash in `h`
def phogen(h, length):
global g_phonetic_map
word =""
ngram = " "
for i in range(length):
nl = len(g_phonetic_map[ngram])
ni = h.mod32(nl)
letter = g_phonetic_map[ngram][ni]
word += letter
ngram = ngram[1:] + letter
return word
def pho_parse_args():
global g_args
args = argparse.ArgumentParser(description="Generate mappings for the phonetic generator. Without arguments, just run the built-in tests.")
args.add_argument("-i", "--input", action="store", help="Input file (word list)", default=None, required=True)
args.add_argument("-j", "--json", action="store", help="JSON output file", default=None)
args.add_argument("-p", "--python", action="store", help="Python output file", default=None)
args.add_argument("-c", "--clang", action="store", help="C output file", default=None)
args.add_argument("-t", "--test", action="store_true", help="Output random passwords", default=False)
args.add_argument("-v", "--verbose", action="store_true", help="Verbose", default=False)
g_args = args.parse_args()
def phogen_clang_dump(f):
print(C_HEADER, file=f, end="")
# Generate a list of (" ", "a" ... "z" )
charmap = (" ", *(chr(x) for x in range(ord('a'), ord('z') +1)))
first = True
for i in charmap:
for j in charmap:
if not first:
print(' },', file=f)
first = False
ngram = i + j
if ngram in g_phonetic_map:
nmap = '"{}"'.format("".join(g_phonetic_map[ngram]))
else:
nmap = "NULL"
print(' {', file=f)
print(' .ngram = "{}",'.format(ngram), file=f)
print(' .map = {}'.format(nmap), file=f)
print(' }', file=f)
print(C_FOOTER, file=f, end="")
pho_parse_args()
if not phogen_init():
print("Error generating phonetic map", file=std.err)
sys.exit(1)
if not phogen_test():
sys.exit(1)
if g_args.python:
with open(g_args.python, "w") as f:
print(PYTHON_HEADER, file=f, end="")
print("g_phonetic_map = \\", file=f)
pprint.pprint(g_phonetic_map, f, width=256, compact=False)
if g_args.json:
with open(g_args.json, "w") as f:
json.dump(g_phonetic_map, f, indent=4, sort_keys=True)
print(file=f)
if g_args.clang:
with open(g_args.clang, "w") as f:
phogen_clang_dump(f)
if g_args.test:
for w in range(0, 5):
h = HashInt()
h.from_bytes(random.randbytes(16))
phopass = []
for j in range(0, 4):
phopass.append(phogen(h, 6))
print(" ".join(phopass))