phogen: Add the C implementation of the phonetic generator

This commit is contained in:
2021-09-14 23:52:59 +02:00
parent 93f9c75131
commit 50b0758ffc

View File

@ -0,0 +1,823 @@
/*
* ===========================================================================
* Generate the phonetic mapping; this is used for mapping n-grams (two
* letter combinations) to the letter that most likely follows them.
* ===========================================================================
*/
#include <arpa/inet.h> /* For htonl()/ntohl() */
#include <ctype.h>
#include <errno.h>
#include <getopt.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <openssl/sha.h>
/* List of vowels; its length is also the size of a clipped letter pool */
#define PHOGEN_VOWELS "aeiou"

enum
{
    /* Number of letters in the english alphabet */
    PHOGEN_ENGLISH_NUM = 'z' - 'a' + 1,

    /*
     * Number of distinct characters that may appear in an ngram: every
     * letter of the english alphabet plus the space character
     */
    PHOGEN_NGRAM_CHARS = PHOGEN_ENGLISH_NUM + 1,

    /* Number of all possible two-character ngrams (space included) */
    PHOGEN_NGRAM_NUM = PHOGEN_NGRAM_CHARS * PHOGEN_NGRAM_CHARS
};
/* Forward declaration; the definition further down strips trailing `what` characters from `str` in place */
static void phogen_rstrip(char *str, char *what);
/*
 * Buffer size, terminating NUL included, that fits the longest roman numeral
 * produced by roman_from_int(); that numeral is the one for 3888.
 */
#define ROMAN_MAX_LEN sizeof("MMMDCCCLXXXVIII")

/* One roman symbol (or subtractive pair) and the value it stands for */
struct roman_symbol
{
    char *rs_symbol;
    int rs_value;
};

/*
 * Symbol table ordered from the largest value to the smallest; both
 * conversion routines walk it in this order.
 */
static struct roman_symbol roman_symbol_list[] =
{
    { "M",  1000 },
    { "CM", 900 },
    { "D",  500 },
    { "CD", 400 },
    { "C",  100 },
    { "XC", 90 },
    { "L",  50 },
    { "XL", 40 },
    { "X",  10 },
    { "IX", 9 },
    { "V",  5 },
    { "IV", 4 },
    { "I",  1 }
};
/*
 * Long command line options understood by main(); each entry maps to the
 * short option character returned by getopt_long().
 */
static struct option phogen_map_long_options[] =
{
    { .name = "input",   .has_arg = required_argument, .flag = NULL, .val = 'i' },
    { .name = "json",    .has_arg = required_argument, .flag = NULL, .val = 'j' },
    { .name = "python",  .has_arg = required_argument, .flag = NULL, .val = 'p' },
    { .name = "clang",   .has_arg = required_argument, .flag = NULL, .val = 'c' },
    { .name = "test",    .has_arg = no_argument,       .flag = NULL, .val = 't' },
    { .name = "verbose", .has_arg = no_argument,       .flag = NULL, .val = 'v' },
    /* Terminator */
    { .name = NULL,      .has_arg = 0,                 .flag = NULL, .val = 0 }
};
/*
 * Test vectors used by phogen_test(): the SHA-256 digest of `input` is fed to
 * the generator and the four resulting words, joined by single spaces, are
 * compared with `output`.
 */
struct
{
    char *input;
    char *output;
} phogen_test_table[] =
{
    { .input = "passgeny",  .output = "herang xiasem zitend qibele" },
    { .input = "phonetic",  .output = "lineum foneum zybale mangur" },
    { .input = "generator", .output = "latole elitab ackina exprou" },
    { .input = "password",  .output = "nulize nomere fonici crednt" },
    { .input = "duck",      .output = "catabb rompor cricin prunsi" },
};
/* Comment banner that phogen_dump_python() writes before the map itself */
const char phogen_python_header[] =
    "#\n# Phonetic Map -- autogenerated, do not edit.\n#\n";
/*
 * Preamble that phogen_dump_clang() writes verbatim at the top of the
 * generated C file: a banner comment, the `struct phogen_entry` definition
 * and the opening of the `phogen_map[]` initializer. The dump function then
 * appends one entry per ngram and closes the initializer.
 */
const char phogen_clang_header[] =
"/*\n"
" * Phonetic Map -- autogenerated, do not edit.\n"
" *\n"
" * The list contains exactly 27*27 entries (all combinations of 2 characters\n"
" * from the array [\" \", \"a\"...\"z\"]) and is sorted alphabetically. This makes\n"
" * the ngram lookup time constant, but it also means that the map contains\n"
" * holes. The holes have a value of `map` set to NULL.\n"
" */\n"
"\n"
"struct phogen_entry\n"
"{\n"
" char *ngram; /* Ngram */\n"
" char *map; /* Character map */\n"
"};\n"
"\n"
"struct phogen_entry phogen_map[] =\n"
"{\n";
/*
 * One candidate "next letter" of an ngram together with the number of times
 * it was seen. A `fe_letter` of '\0' marks an unused slot.
 */
struct phogen_freq_entry
{
    char fe_letter;
    int fe_freq;
};

/* qsort() comparator for phogen_freq_entry rows, defined further below */
static int phogen_entry_cmp(const void *_a, const void *_b);

/*
 * Frequency table: one row per possible ngram, one slot per letter of the
 * alphabet
 */
static struct phogen_freq_entry phogen_freq_list[PHOGEN_NGRAM_NUM][PHOGEN_ENGLISH_NUM];

/* Path of the word list file, set from the --input option */
static char *g_word_list = NULL;

/* Verbosity level, incremented once per --verbose option */
static int g_verbose = 0;
/*
* Convert an integer to its roman representation as string
*
* This function returns NULL on error.
*/
char *roman_from_int(char (*roman)[ROMAN_MAX_LEN], int i)
{
char *proman;
int ii;
int ij;
if (i < 1 || i > 3999) return NULL;
proman = (char *)roman;
for (ii = 0; ii < sizeof(roman_symbol_list) / sizeof(roman_symbol_list[0]); ii++)
{
int n = i / roman_symbol_list[ii].rs_value;
for (ij = 0; ij < n; ij++)
{
strcpy(proman, roman_symbol_list[ii].rs_symbol);
proman += strlen(roman_symbol_list[ii].rs_symbol);
}
if (n != 0)
{
i %= (n * roman_symbol_list[ii].rs_value);
}
}
return (char *)roman;
}
/*
 * Convert a roman numeral to its integer value.
 *
 * The input is matched case-insensitively. The function returns 0 when
 * `roman` is empty, contains anything that is not a roman symbol, or is not
 * the canonical spelling of its value (the parsed value is converted back
 * with roman_from_int() and compared with the upper-cased input).
 */
int roman_to_int(const char *roman)
{
    const size_t symbol_count = sizeof(roman_symbol_list) / sizeof(roman_symbol_list[0]);
    char canonical[ROMAN_MAX_LEN];
    char upper[ROMAN_MAX_LEN];
    size_t len = strlen(roman);
    const char *cursor;
    int value = 0;
    size_t k;

    /*
     * roman_from_int() never produces ROMAN_MAX_LEN or more characters, so a
     * longer input cannot pass the round-trip check below. Rejecting it here
     * keeps the working copy in a fixed-size buffer.
     */
    if (len >= sizeof(upper))
    {
        return 0;
    }

    /*
     * Upper-case a private copy (terminator included). The cast is required:
     * passing a negative char value to toupper() is undefined behavior.
     */
    for (k = 0; k <= len; k++)
    {
        upper[k] = (char)toupper((unsigned char)roman[k]);
    }

    cursor = upper;
    while (*cursor != '\0')
    {
        size_t si;

        /* First table entry that prefixes the remaining text */
        for (si = 0; si < symbol_count; si++)
        {
            const char *symbol = roman_symbol_list[si].rs_symbol;

            if (strncmp(cursor, symbol, strlen(symbol)) == 0)
            {
                break;
            }
        }

        if (si == symbol_count)
        {
            return 0;
        }

        value += roman_symbol_list[si].rs_value;
        cursor += strlen(roman_symbol_list[si].rs_symbol);
    }

    /*
     * Verify that the input is a correct roman number by converting the
     * value back and comparing the spelling
     */
    if (roman_from_int(&canonical, value) == NULL || strcmp(canonical, upper) != 0)
    {
        return 0;
    }

    return value;
}
/*
 * Return the next raw line of the word list file named by `g_word_list`.
 *
 * The file is opened on the first call; the process exits with status 2 if
 * that fails. Every call returns a pointer to an internal static buffer that
 * is overwritten by the following call. The line terminator is not removed
 * and lines that do not fit the buffer are returned in several pieces.
 *
 * At end of file, or after a read error (reported on stderr), the file is
 * closed and NULL is returned; the next call opens the file again.
 */
char *phogen_word_list(void)
{
    static char line[1024];
    static FILE *fp = NULL;

    if (fp == NULL)
    {
        fp = fopen(g_word_list, "r");
    }

    if (fp == NULL)
    {
        fprintf(stderr, "Error opening file: %s\n", g_word_list);
        exit(2);
    }

    if (fgets(line, sizeof(line), fp) != NULL)
    {
        return line;
    }

    if (ferror(fp))
    {
        fprintf(stderr, "Error reading file : %s\n", g_word_list);
    }

    fclose(fp);
    fp = NULL;

    return NULL;
}
/*
 * Calculate the table index of a two-character ngram.
 *
 * Each character is turned into a digit (space is 0, 'a' is 1, and so on) and
 * the pair is read as a two-digit number in base PHOGEN_NGRAM_CHARS with the
 * first character as the most significant digit. The characters are not
 * validated.
 */
int phogen_ngram_to_index(const char *word)
{
    int high = (word[0] == ' ') ? 0 : word[0] - 'a' + 1;
    int low = (word[1] == ' ') ? 0 : word[1] - 'a' + 1;

    return high * PHOGEN_NGRAM_CHARS + low;
}
/*
 * Map a table index back to its ngram string (the inverse of
 * phogen_ngram_to_index()). The result is stored NUL-terminated in `*ngram`.
 */
void phogen_index_to_ngram(char (*ngram)[3], int index)
{
    const int digit[2] =
    {
        index / PHOGEN_NGRAM_CHARS,
        index % PHOGEN_NGRAM_CHARS
    };
    int k;

    for (k = 0; k < 2; k++)
    {
        /* Digit 0 stands for space, 1..26 for 'a'..'z' */
        if (digit[k] == 0)
        {
            (*ngram)[k] = ' ';
        }
        else
        {
            (*ngram)[k] = (char)('a' + digit[k] - 1);
        }
    }

    (*ngram)[2] = '\0';
}
/*
* Create a frequency map of `ngram -> next letter` mapping sorted by
* frequency. Take the 5 (number of vowels) most frequent letters. If the
* mapping contains less than 5 letters, fill it with vowels.
*/
void phogen_freq(void)
{
char *word;
int ni;
/* Take the word list and split it into ngrams. Build a frequency list
* of ngram -> next letter.
*/
while ((word = phogen_word_list()) != NULL)
{
char *pword;
char ngram[3] = " ";
phogen_rstrip(word, "\n\r");
if (roman_to_int(word) > 0)
{
if (g_verbose) fprintf(stderr, "Ignoring roman number: %s\n", word);
continue;
}
for (pword = word; *pword != '\0'; pword++)
{
*pword = tolower(*pword);
if (*pword < 'a' || *pword > 'z')
{
if (g_verbose) fprintf(stderr, "Ignoring invalid word: %s\n", word);
break;
}
}
if (*pword != '\0') continue;
for (pword = word; *pword != '\0'; pword++)
{
int ni = phogen_ngram_to_index(ngram);
int ci = *pword - 'a';
/* Update the frequency */
phogen_freq_list[ni][ci].fe_letter = *pword;
phogen_freq_list[ni][ci].fe_freq++;
/* Calculate next ngram */
ngram[0] = ngram[1];
ngram[1] = *pword;
}
}
/*
* Scan the list, sort letters by frequency and use 5 most common letters.
*/
for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++)
{
/* Sort the letters by frequency */
qsort(
phogen_freq_list[ni],
sizeof(phogen_freq_list[ni]) / sizeof(phogen_freq_list[ni][0]),
sizeof(struct phogen_freq_entry),
phogen_entry_cmp);
}
/*
* Filter out letters that do not produce a valid next-ngram mapping.
*
* If the list is shorter than 5 letters, fill it with vowels.
*/
for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++)
{
char ngram[3];
int ci;
/*
* Do not process " ", this is the _starting_ ngram and should contain
* all detected combinations
*/
if (ni == 0) continue;
/*
* Ignore ngrams that do not have any next mappings (never seen in
* the word list)
*/
if (phogen_freq_list[ni][0].fe_letter == '\0') continue;
/*
* From the current ngram and the list of letters, generate all possible
* next-ngrams and remove letters that produce invalid mappings.
*/
phogen_index_to_ngram(&ngram, ni);
ci = 0;
while (phogen_freq_list[ni][ci].fe_letter != '\0')
{
char next_ngram[3];
int next_ni;
if ((ni != 0) && (ci >= (sizeof(PHOGEN_VOWELS) - 1)))
{
break;
}
next_ngram[0] = ngram[1];
next_ngram[1] = phogen_freq_list[ni][ci].fe_letter;
next_ngram[2] = '\0';
next_ni = phogen_ngram_to_index(next_ngram);
if (phogen_freq_list[next_ni][0].fe_letter != '\0')
{
ci++;
continue;
}
if (g_verbose)
{
fprintf(stderr, "Ignoring valid ngram mapping %s + %c -> %s\n",
ngram,
phogen_freq_list[ni][ci].fe_letter,
next_ngram);
}
memmove(&phogen_freq_list[ni][ci], &phogen_freq_list[ni][ci + 1],
sizeof(phogen_freq_list[ni][0]) * (PHOGEN_ENGLISH_NUM - ci));
}
/*
* Clip the number of letters to 5 (number of vowels). If the list is
* shorter than 5, fill it with vowels
*/
for (ci = 0; ci < sizeof(PHOGEN_VOWELS) - 1; ci++)
{
int cci;
int vi;
if (phogen_freq_list[ni][ci].fe_letter != '\0')
{
continue;
}
/* Find a vowel that doesn't exist in the list yet */
for (vi = 0; vi < sizeof(PHOGEN_VOWELS) - 1; vi++)
{
for (cci = 0; cci <= ci; cci++)
{
if (PHOGEN_VOWELS[vi] == phogen_freq_list[ni][cci].fe_letter)
{
break;
}
}
/* Vowel not found, break out */
if (cci > ci)
{
break;
}
}
phogen_freq_list[ni][ci].fe_letter = PHOGEN_VOWELS[vi];
}
phogen_freq_list[ni][ci].fe_letter = '\0';
}
}
/*
 * Strip all characters in `what` from the end of string `str`
 *
 * Stripping continues up to and including the first character, so a string
 * made only of `what` characters becomes empty. An empty `str` is left as is.
 *
 * Note: This function modifies `str`
 */
void phogen_rstrip(char *str, char *what)
{
    size_t len = strlen(str);

    /*
     * str[len - 1] is never '\0' here, so strchr() cannot match the
     * terminator of `what`
     */
    while (len > 0 && strchr(what, str[len - 1]) != NULL)
    {
        str[--len] = '\0';
    }
}
/*
 * Take the buffer in `bigint` and treat it as a big-endian big number made of
 * 32-bit words. Divide it in place by the 32-bit divisor `base` (the buffer
 * holds the quotient afterwards) and return the 32-bit remainder.
 *
 * Only whole 32-bit words are processed; up to three trailing bytes of a
 * buffer whose size is not a multiple of four are ignored. A `base` of 0
 * leaves the buffer untouched and returns 0.
 */
uint32_t bigint_mod32(void *bigint, size_t bigintsz, uint32_t base)
{
    /* Byte-wise access needs no particular alignment of the buffer */
    unsigned char *p = bigint;
    size_t nwords = bigintsz / sizeof(uint32_t);
    uint32_t mod = 0;
    size_t wi;

    if (base == 0)
    {
        return 0;
    }

    /* Schoolbook long division, most significant word first */
    for (wi = 0; wi < nwords; wi++, p += sizeof(uint32_t))
    {
        uint32_t word = ((uint32_t)p[0] << 24) |
                        ((uint32_t)p[1] << 16) |
                        ((uint32_t)p[2] << 8) |
                        (uint32_t)p[3];
        uint64_t n = ((uint64_t)mod << 32) | word;
        /* mod < base, therefore the quotient always fits in 32 bits */
        uint32_t quot = (uint32_t)(n / base);

        p[0] = (unsigned char)(quot >> 24);
        p[1] = (unsigned char)(quot >> 16);
        p[2] = (unsigned char)(quot >> 8);
        p[3] = (unsigned char)quot;

        mod = (uint32_t)(n % base);
    }

    return mod;
}
/*
 * Take input buffer `in` and generate its phonetic representation
 * and store it to out.
 *
 * At most `outsz - 1` letters are written, followed by a terminating NUL;
 * nothing is written when `outsz` is 0. Each letter is picked from the pool
 * of the current ngram with bigint_mod32(), which divides `in` in place, so
 * the buffer is modified and repeated calls continue with what is left of it.
 * Generation stops early if the current ngram has no letters to choose from.
 */
void phogen(char *out, size_t outsz, void *in, size_t insz)
{
    /* Starting ngram: two spaces, spelled out so both really are spaces */
    char ngram[3] = { ' ', ' ', '\0' };
    size_t oi = 0;

    if (outsz == 0)
    {
        return;
    }

    while (oi + 1 < outsz)
    {
        int ni = phogen_ngram_to_index(ngram);
        int nsize;
        int nsel;
        char letter;

        /* Calculate the length of the letter pool */
        for (nsize = 0; nsize < PHOGEN_ENGLISH_NUM; nsize++)
        {
            if (phogen_freq_list[ni][nsize].fe_letter == '\0') break;
        }

        /* Dead end: an empty pool cannot be used as a divisor */
        if (nsize == 0)
        {
            break;
        }

        nsel = (int)bigint_mod32(in, insz, (uint32_t)nsize);
        letter = phogen_freq_list[ni][nsel].fe_letter;
        out[oi++] = letter;

        /* Generate next ngram */
        ngram[0] = ngram[1];
        ngram[1] = letter;
    }

    out[oi] = '\0';
}
/*
 * Run the built-in test vectors.
 *
 * For every entry of phogen_test_table the SHA-256 digest of the input is
 * turned into four six-letter words separated by single spaces and compared
 * with the expected output. Each mismatch is reported on stderr. Returns true
 * only if all vectors matched.
 */
bool phogen_test(void)
{
    const size_t vectors = sizeof(phogen_test_table) / sizeof(phogen_test_table[0]);
    bool all_ok = true;
    size_t ti;

    for (ti = 0; ti < vectors; ti++)
    {
        const char *input = phogen_test_table[ti].input;
        const char *expected = phogen_test_table[ti].output;
        unsigned char digest[SHA256_DIGEST_LENGTH];
        SHA256_CTX ctx;
        /* 4 words, each 6 characters long and 1 more character for spaces */
        char buf[(6 + 1) * 4];
        char *tail = buf;
        int wi;

        SHA256_Init(&ctx);
        SHA256_Update(&ctx, input, strlen(input));
        SHA256_Final(digest, &ctx);

        /* All four words draw from the same digest, which phogen() consumes */
        for (wi = 0; wi < 4; wi++)
        {
            phogen(tail, 7, digest, sizeof(digest));
            tail += strlen(tail);
            *tail++ = ' ';
        }

        /* Turn the separator written after the last word into the terminator */
        tail[-1] = '\0';

        if (strcmp(buf, expected) != 0)
        {
            fprintf(stderr, "Error, test failed: %s != %s\n", buf, expected);
            all_ok = false;
        }
    }

    return all_ok;
}
/*
 * qsort() comparator for struct phogen_freq_entry.
 *
 * Orders entries by descending frequency; entries with equal frequency are
 * ordered by descending letter.
 */
int phogen_entry_cmp(const void *_a, const void *_b)
{
    const struct phogen_freq_entry *lhs = _a;
    const struct phogen_freq_entry *rhs = _b;
    /* Operands are swapped on purpose to get the descending order */
    int diff = rhs->fe_freq - lhs->fe_freq;

    if (diff == 0)
    {
        diff = rhs->fe_letter - lhs->fe_letter;
    }

    return diff;
}
/*
 * Self-check of the index mapping functions: every table index must survive
 * a round trip through phogen_index_to_ngram() and phogen_ngram_to_index().
 *
 * Returns false on the first mismatch; the error text is printed only in
 * verbose mode.
 */
bool phogen_pre_test(void)
{
    int index = 0;

    while (index < PHOGEN_NGRAM_NUM)
    {
        char ngram[3];

        phogen_index_to_ngram(&ngram, index);
        if (phogen_ngram_to_index(ngram) != index)
        {
            if (g_verbose) fprintf(stderr, "Internal error, index mapping functions are broken.\n");
            return false;
        }

        index++;
    }

    return true;
}
/*
 * Dump the frequency table in JSON format
 *
 * The output is one object that maps every ngram with a non-empty letter
 * pool to an array of its letters. Ngrams without letters are left out; if
 * no ngram has letters an empty object is written.
 */
void phogen_dump_json(FILE *f)
{
    const int rows = (int)(sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0]));
    /* True once an array has been started and still needs to be closed */
    bool array_open = false;
    int ni;

    fprintf(f, "{\n");

    for (ni = 0; ni < rows; ni++)
    {
        char ngram[3];
        int ci;

        if (phogen_freq_list[ni][0].fe_letter == '\0') continue;

        /*
         * Close the previous array. This depends on whether anything was
         * written before, not on the row number, because row 0 may be empty.
         */
        if (array_open)
        {
            fprintf(f, "\n ],\n");
        }
        array_open = true;

        phogen_index_to_ngram(&ngram, ni);
        fprintf(f, " \"%s\": [\n", ngram);

        for (ci = 0; ci < PHOGEN_ENGLISH_NUM; ci++)
        {
            if (phogen_freq_list[ni][ci].fe_letter == '\0') break;

            if (ci != 0)
            {
                fprintf(f, ",\n");
            }

            fprintf(f, " \"%c\"", phogen_freq_list[ni][ci].fe_letter);
        }
    }

    if (array_open)
    {
        fprintf(f, "\n ]\n}\n");
    }
    else
    {
        fprintf(f, "}\n");
    }
}
/*
 * Dump the frequency table in PYTHON format
 *
 * The output is the header banner followed by an assignment of a dictionary
 * to `g_phonetic_map`; the dictionary maps every ngram with a non-empty
 * letter pool to a list of its letters. If no ngram has letters an empty
 * dictionary is written.
 */
void phogen_dump_python(FILE *f)
{
    const int rows = (int)(sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0]));
    /* True until the first dictionary entry has been written */
    bool first = true;
    int ni;

    fprintf(f, "%sg_phonetic_map = \\", phogen_python_header);

    for (ni = 0; ni < rows; ni++)
    {
        char ngram[3];
        int ci;

        if (phogen_freq_list[ni][0].fe_letter == '\0') continue;

        phogen_index_to_ngram(&ngram, ni);

        /*
         * The first entry opens the dictionary, every later one closes the
         * list of its predecessor. This depends on what was written so far,
         * not on the row number, because row 0 may be empty.
         */
        fprintf(f, "%s'%s': ", first ? "\n{" : "],\n ", ngram);
        first = false;

        for (ci = 0; ci < PHOGEN_ENGLISH_NUM; ci++)
        {
            if (phogen_freq_list[ni][ci].fe_letter == '\0') break;

            fprintf(f, "%s'%c'", (ci == 0) ? "[" : ", ", phogen_freq_list[ni][ci].fe_letter);
        }
    }

    if (first)
    {
        /* Nothing was written: emit an empty dictionary */
        fprintf(f, "\n{}\n");
    }
    else
    {
        fprintf(f, "]}\n");
    }
}
/*
* Dump the frequency table in C format
*/
void phogen_dump_clang(FILE *f)
{
int ni;
int ci;
fprintf(f, "%s", phogen_clang_header);
for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++)
{
char ngram[3];
phogen_index_to_ngram(&ngram, ni);
if (ni != 0)
{
fprintf(f, " },\n");
}
fprintf(f, " {\n");
fprintf(f, " .ngram = \"%s\",\n", ngram);
if (phogen_freq_list[ni][0].fe_letter == '\0')
{
fprintf(f, " .map = NULL\n");
}
else
{
fprintf(f, " .map = \"");
for (ci = 0; ci < PHOGEN_ENGLISH_NUM; ci++)
{
if (phogen_freq_list[ni][ci].fe_letter == '\0') break;
fprintf(f, "%c", phogen_freq_list[ni][ci].fe_letter);
}
fprintf(f, "\"\n");
}
}
fprintf(f, " }\n};\n");
}
/*
 * Print the command line usage text to standard output
 */
void help(void)
{
    static const char usage[] =
        "usage: phogen_map.py -i INPUT [-j JSON] [-p PYTHON] [-c CLANG] [-t] [-v]\n"
        "\n"
        "Generate mappings for the phonetic generator. Without arguments, just run the built-in tests.\n"
        "\n"
        "optional arguments:\n"
        " -i INPUT, --input INPUT\n"
        " Input file (word list)\n"
        " -j JSON, --json JSON JSON output file\n"
        " -p PYTHON, --python PYTHON\n"
        " Python output file\n"
        " -c CLANG, --clang CLANG\n"
        " C output file\n"
        " -t, --test Output random passwords\n"
        " -v, --verbose Verbose\n";

    fputs(usage, stdout);
}
int main(int argc, char *argv[])
{
int opt;
char *json_output = NULL;
char *clang_output = NULL;
char *python_output = NULL;
//bool test = false;
while ((opt = getopt_long(argc, argv, "i:j:p:c:tv", phogen_map_long_options, NULL)) != -1)
{
switch (opt)
{
case 'i':
g_word_list = optarg;
break;
case 'j':
json_output = optarg;
break;
case 'p':
python_output = optarg;
break;
case 'c':
clang_output = optarg;
break;
case 't':
break;
case 'v':
g_verbose++;
break;
default:
help();
return 127;
}
}
if (g_word_list == NULL)
{
fprintf(stderr, "An input parameter is required (--input or -i).\n");
return 127;
}
if (!phogen_pre_test())
{
printf("Basic tests failed.");
return 1;
}
/* Generate the frequency map */
phogen_freq();
/* Run basic tests */
phogen_test();
/* JSON output */
if (json_output != NULL)
{
FILE *f = fopen(json_output, "w");
if (f == NULL)
{
fprintf(stderr, "Error opening JSON output file %s: %s\n",
json_output,
strerror(errno));
return 1;
}
phogen_dump_json(f);
fclose(f);
}
/* PYTHON output */
if (python_output != NULL)
{
FILE *f = fopen(python_output, "w");
if (f == NULL)
{
fprintf(stderr, "Error opening PYTHON output file %s: %s\n",
python_output,
strerror(errno));
return 1;
}
phogen_dump_python(f);
fclose(f);
}
/* C output */
if (clang_output != NULL)
{
FILE *f = fopen(clang_output, "w");
if (f == NULL)
{
fprintf(stderr, "Error opening CLANG output file %s: %s\n",
python_output,
strerror(errno));
return 1;
}
phogen_dump_clang(f);
fclose(f);
}
return 0;
}