python: Add the Python implementation of the phonetic generator
Add the Python implementation of the phonetic table generator.
This commit is contained in:
327
python/phogen_map/phogen_map.py
Executable file
327
python/phogen_map/phogen_map.py
Executable file
@ -0,0 +1,327 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import pprint
|
||||||
|
import random
|
||||||
|
import roman
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
# Vowels used to pad a ngram's letter list up to its fixed size; the length
# of this string (5) also acts as the per-ngram mapping size limit.
PHO_GEN_VOWELS = "aeiou"

# Built-in self-test vectors: expanding SHA256(seed) through the phonetic
# map must yield the expected password string.
# NOTE(review): these expected strings depend on the word list the map is
# built from -- confirm they match the list shipped with the project.
PHO_GEN_TEST = [
    [ b"passgeny", "herang xiasem zitend qibele" ],
    [ b"phonetic", "lineum foneum zybale mangur" ],
    [ b"generator", "latole elitab ackina exprou" ],
    [ b"password", "nulize nomere fonici crednt" ],
    [ b"duck", "catabb rompor cricin prunsi" ] ]


# Header prepended to the generated Python output file (--python)
PYTHON_HEADER = """#
# Phonetic Map -- autogenerated, do not edit.
#
"""

# Header prepended to the generated C output file (--clang)
C_HEADER = """/*
 * Phonetic Map -- autogenerated, do not edit.
 *
 * The list contains exactly 27*27 entries (all combinations of 2 characters
 * from the array [" ", "a"..."z"]) and is sorted alphabetically. This makes
 * the ngram lookup time constant, but it also means that the map contains
 * holes. The holes have a value of `map` set to NULL.
 */

struct phogen_entry
{
char *ngram; /* Ngram */
char *map; /* Character map */
};

struct phogen_entry phogen_map[] =
{
"""

# Footer closing the generated C array
C_FOOTER = \
"""};
"""

# NOTE(review): g_ngram_table appears unused -- the generated map is stored
# in g_phonetic_map (assigned by phogen_init); confirm before removing.
g_ngram_table = None

# Parsed command-line arguments, set by pho_parse_args()
g_args = None
|
||||||
|
|
||||||
|
class HashInt:
    """A big integer built from a hash digest, consumed in variable-size
    chunks via repeated modulo/divide operations (mod32)."""

    def __init__(self):
        self.hash = None      # remaining integer value of the digest
        self.bits = 0         # total number of bits loaded by from_bytes()
        self.bits_used = 0    # (approximate) number of bits consumed so far

    def from_bytes(self, buf):
        """Load the byte string `buf` as a big-endian integer."""
        self.hash = int.from_bytes(buf, byteorder='big')
        self.bits = len(buf) * 8
        # Reset consumption accounting when new data is loaded
        self.bits_used = 0

    # Divide the HashInt by `idiv` and return the modulo
    def mod32(self, idiv):
        """Consume ~log2(idiv) bits of entropy: return hash % idiv and
        divide the hash by idiv.  Returns None once the loaded bits are
        exhausted."""
        nbits = math.log2(idiv)
        if nbits + self.bits_used > self.bits:
            return None

        ret = self.hash % idiv
        self.hash //= idiv
        # BUG FIX: bits_used was never advanced, so the exhaustion guard
        # above could never trigger and mod32() kept returning values drawn
        # from a depleted (all-zero) hash.
        self.bits_used += nbits
        return ret
|
||||||
|
|
||||||
|
def elog(*args):
    """Print a diagnostic line to stderr, but only in --verbose mode."""
    if not g_args.verbose:
        return
    print(*args, file=sys.stderr)
|
||||||
|
|
||||||
|
# Yield candidate words from the input file (one word per line).  Words
# containing non-alphabetic characters and roman numerals are skipped.
def phogen_word_list():
    with open(g_args.input) as f:
        for line in f:
            # Lower-case, strip surrounding whitespace and the newline
            word = line.strip().lower()

            # Only plain alphabetic words qualify
            if not word.isalpha():
                continue

            # The dictionary contains a variety of roman numbers; avoid
            # those since they can generate weird combinations (ii, xx, ..)
            if roman.roman_to_int(word) != 0:
                elog("Ignoring roman number:", word)
                continue

            yield word
|
||||||
|
|
||||||
|
# Split each word into ngrams (two-letter sliding windows) and count, for
# every ngram, how often each letter follows it.
#
# The next ngram is computed by removing the first letter from the current
# ngram and appending the next letter in the word.
#
# For example, the word "ananas" produces the following steps:
#
#   "  " (starting ngram), next letter is 'a'
#   " a", next letter is 'n'
#   "an", next letter is 'a'
#   "na", next letter is 'n'
#   "an", next letter is 'a'
#   "na", next letter is 's'
#   "as", end
#
# Each ngram maps to a dictionary with the next letter as key and the
# number of occurrences of that letter as value.  For "ananas":
#
#   {
#     '  ': {'a': 1},
#     ' a': {'n': 1},
#     'an': {'a': 2},
#     'na': {'n': 1, 's': 1},
#   }
#
# Note that a word's final ngram ("as" above) gets no entry of its own
# unless it also occurs in the middle of some word.
def phogen_freq(words=None):
    """Build and return the ngram -> {letter: frequency} table.

    words: iterable of lower-case alphabetic words; defaults to the words
           read from the --input file via phogen_word_list().
    """
    if words is None:
        words = phogen_word_list()

    ngram_freq_list = {}

    for word in words:
        # Start with an empty ngram (two spaces)
        ngram = "  "

        # Read the word character by character and generate new ngrams
        for letter in word:
            # Count how often `letter` follows `ngram`
            freq_map = ngram_freq_list.setdefault(ngram, {})
            freq_map[letter] = freq_map.get(letter, 0) + 1

            # Slide the window: drop the first letter, append the new one
            ngram = ngram[1:] + letter

    return ngram_freq_list
|
||||||
|
|
||||||
|
# Take the output of `phogen_freq` and condense it down:
# - For each ngram, keep at most len(vowels) (default 5) of the most
#   frequent next letters.  The only exception is the initial ngram "  ",
#   which may keep all of its valid letter mappings.
# - Ignore ngram + letter combinations that would produce new ngrams
#   without a proper mapping of their own.
# - If the list contains fewer than len(vowels) letters, pad it with vowels.
def phogen_map(ngram_list, vowels=None):
    """Normalize the frequency table into a ngram -> [letters] map.

    ngram_list: output of phogen_freq()
    vowels:     padding alphabet (its length is also the per-ngram size
                limit); defaults to PHO_GEN_VOWELS.
    """
    if vowels is None:
        vowels = PHO_GEN_VOWELS

    ngram_normal = {}
    for (ngram, data) in ngram_list.items():
        ndata = []

        # Scan the letters ordered by frequency (ties broken by character
        # value, so letters with the same frequency have a predictable
        # output).  Compute the successor ngram by removing the first
        # letter of the current ngram and appending the candidate letter;
        # if the successor has no entry in ngram_list the mapping would
        # dangle, so skip it.  Keep at most len(vowels) letters, except
        # for the initial ngram which may keep all of them.
        for (letter, freq) in sorted(data.items(), key=lambda x: x[1] * 256 + ord(x[0]), reverse=True):
            new_ngram = ngram[1:] + letter
            if not new_ngram in ngram_list:
                elog("Ignoring invalid ngram mapping {} + {} -> {}".format(ngram, letter, new_ngram))
                continue

            ndata.append(letter)
            if ngram != "  " and len(ndata) >= len(vowels):
                break

        # If the list contains fewer than len(vowels) letters, pad it
        # with vowels (skipping ones already present)
        for v in vowels:
            if len(ndata) >= len(vowels):
                break
            if v not in ndata:
                ndata.append(v)

        ngram_normal[ngram] = ndata

    return ngram_normal
|
||||||
|
|
||||||
|
# Take the output of `phogen_map` and verify that all mappings actually
# resolve to new ngrams
def phogen_verify(ngram_list):
    """Return True if every ngram + letter combination leads to a ngram
    that itself has a mapping; otherwise print a diagnostic and return
    False."""
    for (ngram, letters) in ngram_list.items():
        for l in letters:
            tngram = ngram[1:] + l
            if tngram not in ngram_list:
                # BUG FIX: the message previously passed the template and
                # its arguments straight to print() (never calling
                # .format) and referenced an undefined name `nngram`,
                # raising NameError instead of reporting the problem.
                print("Incomplete map, dangling mapping: {} + {} -> !{}"
                      .format(ngram, l, tngram))
                return False

    return True
|
||||||
|
|
||||||
|
def phogen_init():
    """Build the global phonetic map from the input word list.

    Returns True on success, False if the generated map is inconsistent.
    """
    global g_phonetic_map

    elog("Generating frequency table ...")
    frequencies = phogen_freq()
    elog("Generating phonetic map ...")
    candidate = phogen_map(frequencies)

    elog("Verifying mappings ...")
    if phogen_verify(candidate):
        g_phonetic_map = candidate
        return True

    return False
|
||||||
|
|
||||||
|
def phogen_test():
    """Run the built-in self test: each seed in PHO_GEN_TEST must expand
    to its expected four-word phonetic password."""
    for seed, expected in PHO_GEN_TEST:
        # Derive a big integer from the seed's SHA256 digest
        h = HashInt()
        h.from_bytes(hashlib.sha256(seed).digest())

        # Four words of six letters each, space separated
        generated = " ".join(phogen(h, 6) for _ in range(4))

        if generated != expected:
            print("Internal test failed: '{}' != '{}'".format(generated, expected))
            return False

    return True
|
||||||
|
|
||||||
|
# Output a phonetic representation of `length`. The data is taken from the
# Hash in `h`
def phogen(h, length):
    """Spell out `length` letters by repeatedly drawing an index from `h`
    and walking the global ngram map."""
    global g_phonetic_map

    letters = []
    ngram = "  "
    for _ in range(length):
        choices = g_phonetic_map[ngram]
        letter = choices[h.mod32(len(choices))]
        letters.append(letter)
        # Slide the ngram window forward by one letter
        ngram = ngram[1:] + letter

    return "".join(letters)
|
||||||
|
|
||||||
|
def pho_parse_args():
    """Parse the command line into the module-global g_args."""
    global g_args

    parser = argparse.ArgumentParser(description="Generate mappings for the phonetic generator. Without arguments, just run the built-in tests.")
    parser.add_argument("-i", "--input", action="store", help="Input file (word list)", default=None, required=True)
    parser.add_argument("-j", "--json", action="store", help="JSON output file", default=None)
    parser.add_argument("-p", "--python", action="store", help="Python output file", default=None)
    parser.add_argument("-c", "--clang", action="store", help="C output file", default=None)
    parser.add_argument("-t", "--test", action="store_true", help="Output random passwords", default=False)
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose", default=False)

    g_args = parser.parse_args()
|
||||||
|
|
||||||
|
# Write the phonetic map to file object `f` as a C array of
# struct phogen_entry.  One entry is emitted for every 2-character
# combination of [" ", "a".."z"] (27*27 entries) in alphabetical order;
# combinations without a mapping get a NULL map, so the C side can do
# constant-time indexed lookups.
def phogen_clang_dump(f):
    print(C_HEADER, file=f, end="")

    # Generate a list of (" ", "a" ... "z" )
    charmap = (" ", *(chr(x) for x in range(ord('a'), ord('z') +1)))

    # Each entry's closing brace is printed lazily -- by the following
    # iteration (with a trailing comma) or after the loops for the very
    # last entry (without one).
    first = True

    for i in charmap:
        for j in charmap:
            # Close the previous entry, if any
            if not first:
                print(' },', file=f)

            first = False

            ngram = i + j
            # Ngrams missing from the map become NULL "holes"
            if ngram in g_phonetic_map:
                nmap = '"{}"'.format("".join(g_phonetic_map[ngram]))
            else:
                nmap = "NULL"

            print(' {', file=f)
            print(' .ngram = "{}",'.format(ngram), file=f)
            print(' .map = {}'.format(nmap), file=f)

    # Close the final entry and the array
    print(' }', file=f)
    print(C_FOOTER, file=f, end="")
|
||||||
|
|
||||||
|
# Script entry: parse arguments, build and self-test the map, then emit
# the requested output formats.
pho_parse_args()

if not phogen_init():
    # BUG FIX: this previously wrote to `std.err`, an undefined name that
    # raised NameError instead of reporting the failure; the intended
    # stream is sys.stderr.
    print("Error generating phonetic map", file=sys.stderr)
    sys.exit(1)

if not phogen_test():
    sys.exit(1)

# Optional Python output: a module assigning the map to g_phonetic_map
if g_args.python:
    with open(g_args.python, "w") as f:
        print(PYTHON_HEADER, file=f, end="")
        print("g_phonetic_map = \\", file=f)
        pprint.pprint(g_phonetic_map, f, width=256, compact=False)

# Optional JSON output
if g_args.json:
    with open(g_args.json, "w") as f:
        json.dump(g_phonetic_map, f, indent=4, sort_keys=True)
        print(file=f)

# Optional C output
if g_args.clang:
    with open(g_args.clang, "w") as f:
        phogen_clang_dump(f)

# With --test, print a few passwords generated from random data
if g_args.test:
    for w in range(0, 5):
        h = HashInt()
        h.from_bytes(random.randbytes(16))
        phopass = []
        for j in range(0, 4):
            phopass.append(phogen(h, 6))

        print(" ".join(phopass))
|
||||||
45
python/phogen_map/roman.py
Normal file
45
python/phogen_map/roman.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
# Roman numeral helpers.
#
# The symbol table lists the subtractive pairs ("IV", "IX", ...) before the
# single letters they start with, so greedy prefix matching in
# roman_to_int() picks the two-character forms first.
roman_symbol = [
    [ "IV", 4 ],
    [ "IX", 9 ],
    [ "XL", 40 ],
    [ "XC", 90 ],
    [ "CD", 400 ],
    [ "CM", 900 ],
    [ "I", 1 ],
    [ "V", 5 ],
    [ "X", 10 ],
    [ "L", 50 ],
    [ "C", 100 ],
    [ "D", 500 ],
    [ "M", 1000]
]


# Convert an integer to its roman representation as string
def int_to_roman(i):
    """Spell the non-negative integer `i` as an upper-case roman numeral;
    0 yields the empty string."""
    parts = []
    remaining = i
    # Walk the symbols from largest to smallest value, emitting each as
    # many times as it fits into the remainder
    for symbol, value in sorted(roman_symbol, key=lambda e: e[1], reverse=True):
        count, rest = divmod(remaining, value)
        if count > 0:
            parts.append(symbol * count)
            remaining = rest

    return "".join(parts)


# Convert a roman number to its integer value; return 0 on error
def roman_to_int(s):
    """Parse the (case-insensitive) roman numeral `s`; return 0 if it is
    not a canonical roman number."""
    total = 0
    rest = s.upper()
    # Greedily strip known symbols off the front, accumulating their values
    while rest:
        match = next((entry for entry in roman_symbol if rest.startswith(entry[0])), None)
        if match is None:
            return 0

        total += match[1]
        rest = rest.removeprefix(match[0])

    # Round-trip check: rejects non-canonical spellings such as "IIII"
    if int_to_roman(total) != s.upper():
        return 0

    return total
|
||||||
Reference in New Issue
Block a user