/* * =========================================================================== * Generate the phoentic mapping; this is used for mapping n-grams (two * letter combinations to the letter that most likely follows it. * =========================================================================== */ #include /* For nothl() */ #include #include #include #include #include #include #include #include #include /* List of vowels */ #define PHOGEN_VOWELS "aeiou" /* Number of letters in the english alphabet */ #define PHOGEN_ENGLISH_NUM ('z' - 'a' + 1) /* * Number of valid characters in a ngram. This is the number of letters in the * english alphabet with the addition of space (' ') */ #define PHOGEN_NGRAM_CHARS (PHOGEN_ENGLISH_NUM + 1) /* Number of all possible ngrams (two letter combination, including space) */ #define PHOGEN_NGRAM_NUM (PHOGEN_NGRAM_CHARS * PHOGEN_NGRAM_CHARS) static void phogen_rstrip(char *str, char *what); /* The largest roman number (in terms of string length) is 3888 */ #define ROMAN_MAX_LEN sizeof("MMMDCCCLXXXVIII") struct roman_symbol { char *rs_symbol; int rs_value; }; static struct roman_symbol roman_symbol_list[] = { { .rs_symbol = "M", .rs_value = 1000 }, { .rs_symbol = "CM", .rs_value = 900 }, { .rs_symbol = "D", .rs_value = 500 }, { .rs_symbol = "CD", .rs_value = 400 }, { .rs_symbol = "C", .rs_value = 100 }, { .rs_symbol = "XC", .rs_value = 90 }, { .rs_symbol = "L", .rs_value = 50 }, { .rs_symbol = "XL", .rs_value = 40 }, { .rs_symbol = "X", .rs_value = 10 }, { .rs_symbol = "IX", .rs_value = 9 }, { .rs_symbol = "V", .rs_value = 5 }, { .rs_symbol = "IV", .rs_value = 4 }, { .rs_symbol = "I", .rs_value = 1 } }; static struct option phogen_map_long_options[] = { { "input", required_argument, NULL, 'i' }, { "json", required_argument, NULL, 'j' }, { "python", required_argument, NULL, 'p' }, { "clang", required_argument, NULL, 'c' }, { "test", no_argument, NULL, 't' }, { "verbose", no_argument, NULL, 'v' }, { NULL, 0, NULL, 0 } }; struct { char *input; char *output; } phogen_test_table[] = { { "passgeny", "herang xiasem zitend qibele" }, { "phonetic", "lineum foneum zybale mangur" }, { "generator", "latole elitab ackina exprou" }, { "password", "nulize nomere fonici crednt" }, { "duck", "catabb rompor cricin prunsi" }, }; const char phogen_python_header[] = "#\n" "# Phonetic Map -- autogenerated, do not edit.\n" "#\n"; const char phogen_clang_header[] = "/*\n" " * Phonetic Map -- autogenerated, do not edit.\n" " *\n" " * The list contains exactly 27*27 entries (all combinations of 2 characters\n" " * from the array [\" \", \"a\"...\"z\"]) and is sorted alphabetically. This makes\n" " * the ngram lookup time constant, but it also means that the map contains\n" " * holes. The holes have a value of `map` set to NULL.\n" " */\n" "\n" "struct phogen_entry\n" "{\n" " char *ngram; /* Ngram */\n" " char *map; /* Character map */\n" "};\n" "\n" "struct phogen_entry phogen_map[] =\n" "{\n"; static char *g_word_list = NULL; static int g_verbose = 0; struct phogen_freq_entry { char fe_letter; int fe_freq; }; /* Table containing ngrams and next letter frequency */ static struct phogen_freq_entry phogen_freq_list[PHOGEN_NGRAM_NUM][PHOGEN_ENGLISH_NUM]; static int phogen_entry_cmp(const void *_a, const void *_b); /* * Convert an integer to its roman representation as string * * This function returns NULL on error. */ char *roman_from_int(char (*roman)[ROMAN_MAX_LEN], int i) { char *proman; int ii; int ij; if (i < 1 || i > 3999) return NULL; proman = (char *)roman; for (ii = 0; ii < sizeof(roman_symbol_list) / sizeof(roman_symbol_list[0]); ii++) { int n = i / roman_symbol_list[ii].rs_value; for (ij = 0; ij < n; ij++) { strcpy(proman, roman_symbol_list[ii].rs_symbol); proman += strlen(roman_symbol_list[ii].rs_symbol); } if (n != 0) { i %= (n * roman_symbol_list[ii].rs_value); } } return (char *)roman; } /* * Convert a roman number to its integer value; return 0 on error */ int roman_to_int(const char *roman) { char rs[ROMAN_MAX_LEN]; char *proman; int retval; int ii; char uroman[strlen(roman) + 1]; strcpy(uroman, roman); /* Convert all to upper */ for (proman = uroman; *proman != '\0'; proman++) { *proman = toupper(*proman); } proman = uroman; retval = 0; while (*proman != '\0') { for (ii = 0; ii < sizeof(roman_symbol_list) / sizeof(roman_symbol_list[0]); ii++) { if (strncmp(proman, roman_symbol_list[ii].rs_symbol, strlen(roman_symbol_list[ii].rs_symbol)) == 0) { break; } } if (ii >= sizeof(roman_symbol_list) / sizeof(roman_symbol_list[0])) { return 0; } retval += roman_symbol_list[ii].rs_value; proman += strlen(roman_symbol_list[ii].rs_symbol); } /* * Verify that the input string is a corret roman number by converting the * value back to int */ if (roman_from_int(&rs, retval) == NULL || strcmp(rs, uroman) != 0) { return 0; } return retval; } /* * Read the words file line by line (each line should represent a single word). * Filter out words that contain special characters or are roman numbers. */ char *phogen_word_list(void) { static char wl_buf[1024]; static FILE *wl = NULL; if (wl == NULL) { wl = fopen(g_word_list, "r"); if (wl == NULL) { fprintf(stderr, "Error opening file: %s\n", g_word_list); exit(2); } } while (fgets(wl_buf, sizeof(wl_buf), wl) != NULL) { return wl_buf; } if (ferror(wl)) { fprintf(stderr, "Error reading file : %s\n", g_word_list); } fclose(wl); wl = NULL; return NULL; } /* * Calculate the ngram index */ int phogen_ngram_to_index(const char *word) { int i; i = (word[0] == ' ' ? 0 : word[0] - 'a' + 1); i *= PHOGEN_NGRAM_CHARS; i += (word[1] == ' ' ? 0 : word[1] - 'a' + 1); return i; } /* * Map an index to its ngram string */ void phogen_index_to_ngram(char (*ngram)[3], int index) { int n; n = index / PHOGEN_NGRAM_CHARS; (*ngram)[0] = (n == 0) ? ' ' : (char)('a' + n - 1); n = index % PHOGEN_NGRAM_CHARS; (*ngram)[1] = (n == 0) ? ' ' : (char)('a' + n - 1); (*ngram)[2] = '\0'; } /* * Create a frequency map of `ngram -> next letter` mapping sorted by * frequency. Take the 5 (number of vowels) most frequent letters. If the * mapping contains less than 5 letters, fill it with vowels. */ void phogen_freq(void) { char *word; int ni; /* Take the word list and split it into ngrams. Build a frequency list * of ngram -> next letter. */ while ((word = phogen_word_list()) != NULL) { char *pword; char ngram[3] = " "; phogen_rstrip(word, "\n\r"); if (roman_to_int(word) > 0) { if (g_verbose) fprintf(stderr, "Ignoring roman number: %s\n", word); continue; } for (pword = word; *pword != '\0'; pword++) { *pword = tolower(*pword); if (*pword < 'a' || *pword > 'z') { if (g_verbose) fprintf(stderr, "Ignoring invalid word: %s\n", word); break; } } if (*pword != '\0') continue; for (pword = word; *pword != '\0'; pword++) { int ni = phogen_ngram_to_index(ngram); int ci = *pword - 'a'; /* Update the frequency */ phogen_freq_list[ni][ci].fe_letter = *pword; phogen_freq_list[ni][ci].fe_freq++; /* Calculate next ngram */ ngram[0] = ngram[1]; ngram[1] = *pword; } } /* * Scan the list, sort letters by frequency and use 5 most common letters. */ for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++) { /* Sort the letters by frequency */ qsort( phogen_freq_list[ni], sizeof(phogen_freq_list[ni]) / sizeof(phogen_freq_list[ni][0]), sizeof(struct phogen_freq_entry), phogen_entry_cmp); } /* * Filter out letters that do not produce a valid next-ngram mapping. * * If the list is shorter than 5 letters, fill it with vowels. */ for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++) { char ngram[3]; int ci; /* * Do not process " ", this is the _starting_ ngram and should contain * all detected combinations */ if (ni == 0) continue; /* * Ignore ngrams that do not have any next mappings (never seen in * the word list) */ if (phogen_freq_list[ni][0].fe_letter == '\0') continue; /* * From the current ngram and the list of letters, generate all possible * next-ngrams and remove letters that produce invalid mappings. */ phogen_index_to_ngram(&ngram, ni); ci = 0; while (phogen_freq_list[ni][ci].fe_letter != '\0') { char next_ngram[3]; int next_ni; if ((ni != 0) && (ci >= (sizeof(PHOGEN_VOWELS) - 1))) { break; } next_ngram[0] = ngram[1]; next_ngram[1] = phogen_freq_list[ni][ci].fe_letter; next_ngram[2] = '\0'; next_ni = phogen_ngram_to_index(next_ngram); if (phogen_freq_list[next_ni][0].fe_letter != '\0') { ci++; continue; } if (g_verbose) { fprintf(stderr, "Ignoring valid ngram mapping %s + %c -> %s\n", ngram, phogen_freq_list[ni][ci].fe_letter, next_ngram); } memmove(&phogen_freq_list[ni][ci], &phogen_freq_list[ni][ci + 1], sizeof(phogen_freq_list[ni][0]) * (PHOGEN_ENGLISH_NUM - ci)); } /* * Clip the number of letters to 5 (number of vowels). If the list is * shorter than 5, fill it with vowels */ for (ci = 0; ci < sizeof(PHOGEN_VOWELS) - 1; ci++) { int cci; int vi; if (phogen_freq_list[ni][ci].fe_letter != '\0') { continue; } /* Find a vowel that doesn't exist in the list yet */ for (vi = 0; vi < sizeof(PHOGEN_VOWELS) - 1; vi++) { for (cci = 0; cci <= ci; cci++) { if (PHOGEN_VOWELS[vi] == phogen_freq_list[ni][cci].fe_letter) { break; } } /* Vowel not found, break out */ if (cci > ci) { break; } } phogen_freq_list[ni][ci].fe_letter = PHOGEN_VOWELS[vi]; } phogen_freq_list[ni][ci].fe_letter = '\0'; } } /* * Strip all characters in `what` from the end of string `str` * * Note: This function modifies `str` */ void phogen_rstrip(char *str, char *what) { char *sl = str + strlen(str) - 1; while (sl > str) { if (strspn(sl, what) == 0) return; *sl-- = '\0'; } } /* * Take the buffer in `bigin` and treat is as a big-endian big number. * Perform a division using the 32-bit divisor in `base` and return the * 32-bit modulo. */ uint32_t bigint_mod32(void *bigint, size_t bigintsz, uint32_t base) { uint32_t *pi; uint64_t n; uint32_t mod = 0; for (pi = (uint32_t *)bigint; pi < (uint32_t *)(bigint + bigintsz); pi++) { n = mod; n <<= sizeof(uint32_t) * 8; n |= htonl(*pi); *pi = ntohl(n / base); mod = n % base; } return mod; } /* * Take input buffer `in` and generate its phonetic representation * and store it to out. */ void phogen(char *out, size_t outsz, void *in, size_t insz) { int ii; /* Starting ngram */ char ngram[3] = " "; for (ii = 0; ii < (outsz - 1); ii++) { int nsize; int nsel; int ni = phogen_ngram_to_index(ngram); /* Calculate the length of the letter pool */ for (nsize = 0; nsize < PHOGEN_ENGLISH_NUM; nsize++) { if (phogen_freq_list[ni][nsize].fe_letter == '\0') break; } nsel = bigint_mod32(in, insz, nsize); out[ii] = phogen_freq_list[ni][nsel].fe_letter; /* Generate next ngram */ ngram[0] = ngram[1]; ngram[1] = phogen_freq_list[ni][nsel].fe_letter; } out[ii] = '\0'; } /* * Run basic tests */ bool phogen_test(void) { unsigned char sha256[SHA256_DIGEST_LENGTH]; SHA256_CTX sha256_ctx; int ii; int ij; bool retval = true; for (ii = 0; ii < sizeof(phogen_test_table) / sizeof(phogen_test_table[0]); ii++) { SHA256_Init(&sha256_ctx); SHA256_Update(&sha256_ctx, phogen_test_table[ii].input, strlen(phogen_test_table[ii].input)); SHA256_Final(sha256, &sha256_ctx); /* 4 words, each 6 characters long and 1 more character for spaces */ char buf[(6 + 1) * 4]; char *pbuf = buf; for (ij = 0; ij < 4; ij++) { phogen(pbuf, 7, sha256, sizeof(sha256)); pbuf += strlen(pbuf); *pbuf++ = ' '; } *(--pbuf) = '\0'; if (strcmp(buf, phogen_test_table[ii].output) != 0) { fprintf(stderr, "Error, test failed: %s != %s\n", buf, phogen_test_table[ii].output); retval = false; } } return retval; } /* * qsort() comparator */ int phogen_entry_cmp(const void *_a, const void *_b) { const struct phogen_freq_entry *a = _a; const struct phogen_freq_entry *b = _b; /* Reverse a/b below, so the sorting order is inverted */ if (a->fe_freq != b->fe_freq) return (b->fe_freq - a->fe_freq); if (a->fe_letter != b->fe_letter) return (b->fe_letter - a->fe_letter); return 0; } bool phogen_pre_test(void) { char ngram[3]; int ii; int ni; for (ii = 0; ii < PHOGEN_NGRAM_NUM; ii++) { phogen_index_to_ngram(&ngram, ii); ni = phogen_ngram_to_index(ngram); if (ii != ni) { if (g_verbose) fprintf(stderr, "Internal error, index mapping functions are broken.\n"); return false; } } return true; } /* * Dump the frequency table in JSON format */ void phogen_dump_json(FILE *f) { int ni; int ci; fprintf(f, "{\n"); for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++) { char ngram[3]; if (phogen_freq_list[ni][0].fe_letter == '\0') continue; if (ni != 0) { fprintf(f, "\n ],\n"); } phogen_index_to_ngram(&ngram, ni); fprintf(f, " \"%s\": [\n", ngram); for (ci = 0; ci < PHOGEN_ENGLISH_NUM; ci++) { if (phogen_freq_list[ni][ci].fe_letter == '\0') break; if (ci != 0) { fprintf(f, ",\n"); } fprintf(f, " \"%c\"", phogen_freq_list[ni][ci].fe_letter); } } fprintf(f, "\n ]\n}\n"); } /* * Dump the frequency table in PYTHON format */ void phogen_dump_python(FILE *f) { int ni; int ci; fprintf(f, "%sg_phonetic_map = \\", phogen_python_header); for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++) { char ngram[3]; if (phogen_freq_list[ni][0].fe_letter == '\0') continue; phogen_index_to_ngram(&ngram, ni); fprintf(f, "%s'%s': ", (ni == 0) ? "\n{" : "],\n ", ngram); for (ci = 0; ci < PHOGEN_ENGLISH_NUM; ci++) { if (phogen_freq_list[ni][ci].fe_letter == '\0') break; fprintf(f, "%s'%c'", (ci == 0) ? "[" : ", ", phogen_freq_list[ni][ci].fe_letter); } } fprintf(f, "]}\n"); } /* * Dump the frequency table in C format */ void phogen_dump_clang(FILE *f) { int ni; int ci; fprintf(f, "%s", phogen_clang_header); for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++) { char ngram[3]; phogen_index_to_ngram(&ngram, ni); if (ni != 0) { fprintf(f, " },\n"); } fprintf(f, " {\n"); fprintf(f, " .ngram = \"%s\",\n", ngram); if (phogen_freq_list[ni][0].fe_letter == '\0') { fprintf(f, " .map = NULL\n"); } else { fprintf(f, " .map = \""); for (ci = 0; ci < PHOGEN_ENGLISH_NUM; ci++) { if (phogen_freq_list[ni][ci].fe_letter == '\0') break; fprintf(f, "%c", phogen_freq_list[ni][ci].fe_letter); } fprintf(f, "\"\n"); } } fprintf(f, " }\n};\n"); } void help(void) { printf( "usage: phogen_map.py -i INPUT [-j JSON] [-p PYTHON] [-c CLANG] [-t] [-v]\n" "\n" "Generate mappings for the phonetic generator. Without arguments, just run the built-in tests.\n" "\n" "optional arguments:\n" " -i INPUT, --input INPUT\n" " Input file (word list)\n" " -j JSON, --json JSON JSON output file\n" " -p PYTHON, --python PYTHON\n" " Python output file\n" " -c CLANG, --clang CLANG\n" " C output file\n" " -t, --test Output random passwords\n" " -v, --verbose Verbose\n"); } int main(int argc, char *argv[]) { int opt; char *json_output = NULL; char *clang_output = NULL; char *python_output = NULL; //bool test = false; while ((opt = getopt_long(argc, argv, "i:j:p:c:tv", phogen_map_long_options, NULL)) != -1) { switch (opt) { case 'i': g_word_list = optarg; break; case 'j': json_output = optarg; break; case 'p': python_output = optarg; break; case 'c': clang_output = optarg; break; case 't': break; case 'v': g_verbose++; break; default: help(); return 127; } } if (g_word_list == NULL) { fprintf(stderr, "An input parameter is required (--input or -i).\n"); return 127; } if (!phogen_pre_test()) { printf("Basic tests failed."); return 1; } /* Generate the frequency map */ phogen_freq(); /* Run basic tests */ phogen_test(); /* JSON output */ if (json_output != NULL) { FILE *f = fopen(json_output, "w"); if (f == NULL) { fprintf(stderr, "Error opening JSON output file %s: %s\n", json_output, strerror(errno)); return 1; } phogen_dump_json(f); fclose(f); } /* PYTHON output */ if (python_output != NULL) { FILE *f = fopen(python_output, "w"); if (f == NULL) { fprintf(stderr, "Error opening PYTHON output file %s: %s\n", python_output, strerror(errno)); return 1; } phogen_dump_python(f); fclose(f); } /* C output */ if (clang_output != NULL) { FILE *f = fopen(clang_output, "w"); if (f == NULL) { fprintf(stderr, "Error opening CLANG output file %s: %s\n", python_output, strerror(errno)); return 1; } phogen_dump_clang(f); fclose(f); } return 0; }