diff --git a/c/phogen/phogen_map/phogen_map.c b/c/phogen/phogen_map/phogen_map.c new file mode 100644 index 0000000..6b607e3 --- /dev/null +++ b/c/phogen/phogen_map/phogen_map.c @@ -0,0 +1,823 @@ +/* + * =========================================================================== + * Generate the phoentic mapping; this is used for mapping n-grams (two + * letter combinations to the letter that most likely follows it. + * =========================================================================== + */ + +#include /* For nothl() */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* List of vowels */ +#define PHOGEN_VOWELS "aeiou" + +/* Number of letters in the english alphabet */ +#define PHOGEN_ENGLISH_NUM ('z' - 'a' + 1) + +/* + * Number of valid characters in a ngram. This is the number of letters in the + * english alphabet with the addition of space (' ') + */ +#define PHOGEN_NGRAM_CHARS (PHOGEN_ENGLISH_NUM + 1) + +/* Number of all possible ngrams (two letter combination, including space) */ +#define PHOGEN_NGRAM_NUM (PHOGEN_NGRAM_CHARS * PHOGEN_NGRAM_CHARS) + +static void phogen_rstrip(char *str, char *what); + +/* The largest roman number (in terms of string length) is 3888 */ +#define ROMAN_MAX_LEN sizeof("MMMDCCCLXXXVIII") + +struct roman_symbol +{ + char *rs_symbol; + int rs_value; +}; + +static struct roman_symbol roman_symbol_list[] = +{ + { .rs_symbol = "M", .rs_value = 1000 }, + { .rs_symbol = "CM", .rs_value = 900 }, + { .rs_symbol = "D", .rs_value = 500 }, + { .rs_symbol = "CD", .rs_value = 400 }, + { .rs_symbol = "C", .rs_value = 100 }, + { .rs_symbol = "XC", .rs_value = 90 }, + { .rs_symbol = "L", .rs_value = 50 }, + { .rs_symbol = "XL", .rs_value = 40 }, + { .rs_symbol = "X", .rs_value = 10 }, + { .rs_symbol = "IX", .rs_value = 9 }, + { .rs_symbol = "V", .rs_value = 5 }, + { .rs_symbol = "IV", .rs_value = 4 }, + { .rs_symbol = "I", .rs_value = 1 } +}; + +static struct option phogen_map_long_options[] = +{ + { "input", required_argument, NULL, 'i' }, + { "json", required_argument, NULL, 'j' }, + { "python", required_argument, NULL, 'p' }, + { "clang", required_argument, NULL, 'c' }, + { "test", no_argument, NULL, 't' }, + { "verbose", no_argument, NULL, 'v' }, + { NULL, 0, NULL, 0 } +}; + +struct +{ + char *input; + char *output; +} phogen_test_table[] = +{ + { "passgeny", "herang xiasem zitend qibele" }, + { "phonetic", "lineum foneum zybale mangur" }, + { "generator", "latole elitab ackina exprou" }, + { "password", "nulize nomere fonici crednt" }, + { "duck", "catabb rompor cricin prunsi" }, +}; + +const char phogen_python_header[] = +"#\n" +"# Phonetic Map -- autogenerated, do not edit.\n" +"#\n"; + +const char phogen_clang_header[] = +"/*\n" +" * Phonetic Map -- autogenerated, do not edit.\n" +" *\n" +" * The list contains exactly 27*27 entries (all combinations of 2 characters\n" +" * from the array [\" \", \"a\"...\"z\"]) and is sorted alphabetically. This makes\n" +" * the ngram lookup time constant, but it also means that the map contains\n" +" * holes. The holes have a value of `map` set to NULL.\n" +" */\n" +"\n" +"struct phogen_entry\n" +"{\n" +" char *ngram; /* Ngram */\n" +" char *map; /* Character map */\n" +"};\n" +"\n" +"struct phogen_entry phogen_map[] =\n" +"{\n"; + +static char *g_word_list = NULL; +static int g_verbose = 0; + +struct phogen_freq_entry +{ + char fe_letter; + int fe_freq; +}; + +/* Table containing ngrams and next letter frequency */ +static struct phogen_freq_entry phogen_freq_list[PHOGEN_NGRAM_NUM][PHOGEN_ENGLISH_NUM]; + +static int phogen_entry_cmp(const void *_a, const void *_b); + +/* + * Convert an integer to its roman representation as string + * + * This function returns NULL on error. + */ +char *roman_from_int(char (*roman)[ROMAN_MAX_LEN], int i) +{ + char *proman; + int ii; + int ij; + + if (i < 1 || i > 3999) return NULL; + + proman = (char *)roman; + + for (ii = 0; ii < sizeof(roman_symbol_list) / sizeof(roman_symbol_list[0]); ii++) + { + int n = i / roman_symbol_list[ii].rs_value; + + for (ij = 0; ij < n; ij++) + { + strcpy(proman, roman_symbol_list[ii].rs_symbol); + proman += strlen(roman_symbol_list[ii].rs_symbol); + } + + if (n != 0) + { + i %= (n * roman_symbol_list[ii].rs_value); + } + } + + return (char *)roman; +} + +/* + * Convert a roman number to its integer value; return 0 on error + */ +int roman_to_int(const char *roman) +{ + char rs[ROMAN_MAX_LEN]; + char *proman; + int retval; + int ii; + + char uroman[strlen(roman) + 1]; + strcpy(uroman, roman); + + /* Convert all to upper */ + for (proman = uroman; *proman != '\0'; proman++) + { + *proman = toupper(*proman); + } + + proman = uroman; + retval = 0; + while (*proman != '\0') + { + for (ii = 0; ii < sizeof(roman_symbol_list) / sizeof(roman_symbol_list[0]); ii++) + { + if (strncmp(proman, + roman_symbol_list[ii].rs_symbol, + strlen(roman_symbol_list[ii].rs_symbol)) == 0) + { + break; + } + } + + if (ii >= sizeof(roman_symbol_list) / sizeof(roman_symbol_list[0])) + { + return 0; + } + + retval += roman_symbol_list[ii].rs_value; + proman += strlen(roman_symbol_list[ii].rs_symbol); + } + + /* + * Verify that the input string is a corret roman number by converting the + * value back to int + */ + if (roman_from_int(&rs, retval) == NULL || strcmp(rs, uroman) != 0) + { + return 0; + } + + return retval; +} + +/* + * Read the words file line by line (each line should represent a single word). + * Filter out words that contain special characters or are roman numbers. + */ +char *phogen_word_list(void) +{ + static char wl_buf[1024]; + static FILE *wl = NULL; + + if (wl == NULL) + { + wl = fopen(g_word_list, "r"); + if (wl == NULL) + { + fprintf(stderr, "Error opening file: %s\n", g_word_list); + exit(2); + } + } + + while (fgets(wl_buf, sizeof(wl_buf), wl) != NULL) + { + return wl_buf; + } + + if (ferror(wl)) + { + fprintf(stderr, "Error reading file : %s\n", g_word_list); + } + + fclose(wl); + wl = NULL; + return NULL; +} + +/* + * Calculate the ngram index + */ +int phogen_ngram_to_index(const char *word) +{ + int i; + + i = (word[0] == ' ' ? 0 : word[0] - 'a' + 1); + i *= PHOGEN_NGRAM_CHARS; + i += (word[1] == ' ' ? 0 : word[1] - 'a' + 1); + + return i; +} + +/* + * Map an index to its ngram string + */ +void phogen_index_to_ngram(char (*ngram)[3], int index) +{ + int n; + n = index / PHOGEN_NGRAM_CHARS; + (*ngram)[0] = (n == 0) ? ' ' : (char)('a' + n - 1); + n = index % PHOGEN_NGRAM_CHARS; + (*ngram)[1] = (n == 0) ? ' ' : (char)('a' + n - 1); + (*ngram)[2] = '\0'; +} + +/* + * Create a frequency map of `ngram -> next letter` mapping sorted by + * frequency. Take the 5 (number of vowels) most frequent letters. If the + * mapping contains less than 5 letters, fill it with vowels. + */ +void phogen_freq(void) +{ + char *word; + int ni; + + /* Take the word list and split it into ngrams. Build a frequency list + * of ngram -> next letter. + */ + while ((word = phogen_word_list()) != NULL) + { + char *pword; + + char ngram[3] = " "; + + phogen_rstrip(word, "\n\r"); + if (roman_to_int(word) > 0) + { + if (g_verbose) fprintf(stderr, "Ignoring roman number: %s\n", word); + continue; + } + + for (pword = word; *pword != '\0'; pword++) + { + *pword = tolower(*pword); + if (*pword < 'a' || *pword > 'z') + { + if (g_verbose) fprintf(stderr, "Ignoring invalid word: %s\n", word); + break; + } + } + if (*pword != '\0') continue; + + for (pword = word; *pword != '\0'; pword++) + { + int ni = phogen_ngram_to_index(ngram); + int ci = *pword - 'a'; + + /* Update the frequency */ + phogen_freq_list[ni][ci].fe_letter = *pword; + phogen_freq_list[ni][ci].fe_freq++; + + /* Calculate next ngram */ + ngram[0] = ngram[1]; + ngram[1] = *pword; + } + } + + /* + * Scan the list, sort letters by frequency and use 5 most common letters. + */ + for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++) + { + /* Sort the letters by frequency */ + qsort( + phogen_freq_list[ni], + sizeof(phogen_freq_list[ni]) / sizeof(phogen_freq_list[ni][0]), + sizeof(struct phogen_freq_entry), + phogen_entry_cmp); + } + + /* + * Filter out letters that do not produce a valid next-ngram mapping. + * + * If the list is shorter than 5 letters, fill it with vowels. + */ + for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++) + { + char ngram[3]; + int ci; + + /* + * Do not process " ", this is the _starting_ ngram and should contain + * all detected combinations + */ + if (ni == 0) continue; + + /* + * Ignore ngrams that do not have any next mappings (never seen in + * the word list) + */ + if (phogen_freq_list[ni][0].fe_letter == '\0') continue; + + /* + * From the current ngram and the list of letters, generate all possible + * next-ngrams and remove letters that produce invalid mappings. + */ + phogen_index_to_ngram(&ngram, ni); + ci = 0; + while (phogen_freq_list[ni][ci].fe_letter != '\0') + { + char next_ngram[3]; + int next_ni; + + if ((ni != 0) && (ci >= (sizeof(PHOGEN_VOWELS) - 1))) + { + break; + } + + next_ngram[0] = ngram[1]; + next_ngram[1] = phogen_freq_list[ni][ci].fe_letter; + next_ngram[2] = '\0'; + + next_ni = phogen_ngram_to_index(next_ngram); + if (phogen_freq_list[next_ni][0].fe_letter != '\0') + { + ci++; + continue; + } + + if (g_verbose) + { + fprintf(stderr, "Ignoring valid ngram mapping %s + %c -> %s\n", + ngram, + phogen_freq_list[ni][ci].fe_letter, + next_ngram); + } + + memmove(&phogen_freq_list[ni][ci], &phogen_freq_list[ni][ci + 1], + sizeof(phogen_freq_list[ni][0]) * (PHOGEN_ENGLISH_NUM - ci)); + } + + /* + * Clip the number of letters to 5 (number of vowels). If the list is + * shorter than 5, fill it with vowels + */ + + for (ci = 0; ci < sizeof(PHOGEN_VOWELS) - 1; ci++) + { + int cci; + int vi; + + if (phogen_freq_list[ni][ci].fe_letter != '\0') + { + continue; + } + + /* Find a vowel that doesn't exist in the list yet */ + for (vi = 0; vi < sizeof(PHOGEN_VOWELS) - 1; vi++) + { + for (cci = 0; cci <= ci; cci++) + { + if (PHOGEN_VOWELS[vi] == phogen_freq_list[ni][cci].fe_letter) + { + break; + } + } + /* Vowel not found, break out */ + if (cci > ci) + { + break; + } + } + phogen_freq_list[ni][ci].fe_letter = PHOGEN_VOWELS[vi]; + } + + phogen_freq_list[ni][ci].fe_letter = '\0'; + } +} + +/* + * Strip all characters in `what` from the end of string `str` + * + * Note: This function modifies `str` + */ +void phogen_rstrip(char *str, char *what) +{ + char *sl = str + strlen(str) - 1; + + while (sl > str) + { + if (strspn(sl, what) == 0) return; + *sl-- = '\0'; + } +} + + +/* + * Take the buffer in `bigin` and treat is as a big-endian big number. + * Perform a division using the 32-bit divisor in `base` and return the + * 32-bit modulo. + */ +uint32_t bigint_mod32(void *bigint, size_t bigintsz, uint32_t base) +{ + uint32_t *pi; + uint64_t n; + + uint32_t mod = 0; + + for (pi = (uint32_t *)bigint; + pi < (uint32_t *)(bigint + bigintsz); + pi++) + { + n = mod; + n <<= sizeof(uint32_t) * 8; + n |= htonl(*pi); + + *pi = ntohl(n / base); + mod = n % base; + } + + return mod; +} + +/* + * Take input buffer `in` and generate its phonetic representation + * and store it to out. + */ +void phogen(char *out, size_t outsz, void *in, size_t insz) +{ + int ii; + + /* Starting ngram */ + char ngram[3] = " "; + + for (ii = 0; ii < (outsz - 1); ii++) + { + int nsize; + int nsel; + + int ni = phogen_ngram_to_index(ngram); + + /* Calculate the length of the letter pool */ + for (nsize = 0; nsize < PHOGEN_ENGLISH_NUM; nsize++) + { + if (phogen_freq_list[ni][nsize].fe_letter == '\0') break; + } + nsel = bigint_mod32(in, insz, nsize); + + out[ii] = phogen_freq_list[ni][nsel].fe_letter; + + /* Generate next ngram */ + ngram[0] = ngram[1]; + ngram[1] = phogen_freq_list[ni][nsel].fe_letter; + } + + out[ii] = '\0'; +} + +/* + * Run basic tests + */ +bool phogen_test(void) +{ + unsigned char sha256[SHA256_DIGEST_LENGTH]; + SHA256_CTX sha256_ctx; + int ii; + int ij; + + bool retval = true; + + for (ii = 0; ii < sizeof(phogen_test_table) / sizeof(phogen_test_table[0]); ii++) + { + SHA256_Init(&sha256_ctx); + SHA256_Update(&sha256_ctx, phogen_test_table[ii].input, strlen(phogen_test_table[ii].input)); + SHA256_Final(sha256, &sha256_ctx); + + /* 4 words, each 6 characters long and 1 more character for spaces */ + char buf[(6 + 1) * 4]; + char *pbuf = buf; + for (ij = 0; ij < 4; ij++) + { + phogen(pbuf, 7, sha256, sizeof(sha256)); + pbuf += strlen(pbuf); + *pbuf++ = ' '; + } + *(--pbuf) = '\0'; + + if (strcmp(buf, phogen_test_table[ii].output) != 0) + { + fprintf(stderr, "Error, test failed: %s != %s\n", buf, phogen_test_table[ii].output); + retval = false; + } + } + + return retval; +} + +/* + * qsort() comparator + */ +int phogen_entry_cmp(const void *_a, const void *_b) +{ + const struct phogen_freq_entry *a = _a; + const struct phogen_freq_entry *b = _b; + + /* Reverse a/b below, so the sorting order is inverted */ + if (a->fe_freq != b->fe_freq) return (b->fe_freq - a->fe_freq); + if (a->fe_letter != b->fe_letter) return (b->fe_letter - a->fe_letter); + return 0; +} + +bool phogen_pre_test(void) +{ + char ngram[3]; + int ii; + int ni; + + for (ii = 0; ii < PHOGEN_NGRAM_NUM; ii++) + { + phogen_index_to_ngram(&ngram, ii); + ni = phogen_ngram_to_index(ngram); + + if (ii != ni) + { + if (g_verbose) fprintf(stderr, "Internal error, index mapping functions are broken.\n"); + return false; + } + } + + return true; +} + +/* + * Dump the frequency table in JSON format + */ +void phogen_dump_json(FILE *f) +{ + int ni; + int ci; + + fprintf(f, "{\n"); + for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++) + { + char ngram[3]; + + if (phogen_freq_list[ni][0].fe_letter == '\0') continue; + + if (ni != 0) + { + fprintf(f, "\n ],\n"); + } + + phogen_index_to_ngram(&ngram, ni); + fprintf(f, " \"%s\": [\n", ngram); + + for (ci = 0; ci < PHOGEN_ENGLISH_NUM; ci++) + { + if (phogen_freq_list[ni][ci].fe_letter == '\0') break; + if (ci != 0) + { + fprintf(f, ",\n"); + } + + fprintf(f, " \"%c\"", phogen_freq_list[ni][ci].fe_letter); + } + } + fprintf(f, "\n ]\n}\n"); +} + +/* + * Dump the frequency table in PYTHON format + */ +void phogen_dump_python(FILE *f) +{ + int ni; + int ci; + + fprintf(f, "%sg_phonetic_map = \\", phogen_python_header); + for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++) + { + char ngram[3]; + + if (phogen_freq_list[ni][0].fe_letter == '\0') continue; + + phogen_index_to_ngram(&ngram, ni); + fprintf(f, "%s'%s': ", (ni == 0) ? "\n{" : "],\n ", ngram); + + for (ci = 0; ci < PHOGEN_ENGLISH_NUM; ci++) + { + if (phogen_freq_list[ni][ci].fe_letter == '\0') break; + fprintf(f, "%s'%c'", (ci == 0) ? "[" : ", ", phogen_freq_list[ni][ci].fe_letter); + } + } + fprintf(f, "]}\n"); +} + +/* + * Dump the frequency table in C format + */ +void phogen_dump_clang(FILE *f) +{ + int ni; + int ci; + + fprintf(f, "%s", phogen_clang_header); + for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++) + { + char ngram[3]; + + phogen_index_to_ngram(&ngram, ni); + + if (ni != 0) + { + fprintf(f, " },\n"); + } + + fprintf(f, " {\n"); + fprintf(f, " .ngram = \"%s\",\n", ngram); + + if (phogen_freq_list[ni][0].fe_letter == '\0') + { + fprintf(f, " .map = NULL\n"); + } + else + { + fprintf(f, " .map = \""); + for (ci = 0; ci < PHOGEN_ENGLISH_NUM; ci++) + { + if (phogen_freq_list[ni][ci].fe_letter == '\0') break; + fprintf(f, "%c", phogen_freq_list[ni][ci].fe_letter); + } + fprintf(f, "\"\n"); + } + } + fprintf(f, " }\n};\n"); +} + + +void help(void) +{ + printf( + "usage: phogen_map.py -i INPUT [-j JSON] [-p PYTHON] [-c CLANG] [-t] [-v]\n" + "\n" + "Generate mappings for the phonetic generator. Without arguments, just run the built-in tests.\n" + "\n" + "optional arguments:\n" + " -i INPUT, --input INPUT\n" + " Input file (word list)\n" + " -j JSON, --json JSON JSON output file\n" + " -p PYTHON, --python PYTHON\n" + " Python output file\n" + " -c CLANG, --clang CLANG\n" + " C output file\n" + " -t, --test Output random passwords\n" + " -v, --verbose Verbose\n"); +} + +int main(int argc, char *argv[]) +{ + int opt; + + char *json_output = NULL; + char *clang_output = NULL; + char *python_output = NULL; + + //bool test = false; + + while ((opt = getopt_long(argc, argv, "i:j:p:c:tv", phogen_map_long_options, NULL)) != -1) + { + switch (opt) + { + case 'i': + g_word_list = optarg; + break; + + case 'j': + json_output = optarg; + break; + + case 'p': + python_output = optarg; + break; + + case 'c': + clang_output = optarg; + break; + + case 't': + break; + + case 'v': + g_verbose++; + break; + + default: + help(); + return 127; + } + } + + if (g_word_list == NULL) + { + fprintf(stderr, "An input parameter is required (--input or -i).\n"); + return 127; + } + + if (!phogen_pre_test()) + { + printf("Basic tests failed."); + return 1; + } + + /* Generate the frequency map */ + phogen_freq(); + /* Run basic tests */ + phogen_test(); + + /* JSON output */ + if (json_output != NULL) + { + FILE *f = fopen(json_output, "w"); + if (f == NULL) + { + fprintf(stderr, "Error opening JSON output file %s: %s\n", + json_output, + strerror(errno)); + return 1; + } + phogen_dump_json(f); + fclose(f); + } + + /* PYTHON output */ + if (python_output != NULL) + { + FILE *f = fopen(python_output, "w"); + if (f == NULL) + { + fprintf(stderr, "Error opening PYTHON output file %s: %s\n", + python_output, + strerror(errno)); + return 1; + } + phogen_dump_python(f); + fclose(f); + } + + /* C output */ + if (clang_output != NULL) + { + FILE *f = fopen(clang_output, "w"); + if (f == NULL) + { + fprintf(stderr, "Error opening CLANG output file %s: %s\n", + python_output, + strerror(errno)); + return 1; + } + phogen_dump_clang(f); + fclose(f); + } + + return 0; +}