/*
 * ===========================================================================
 *  Generate the phonetic mapping; this is used for mapping n-grams (two
 *  letter combinations) to the letters that most likely follow them.
 * ===========================================================================
 */

#include <arpa/inet.h> /* For htonl() and ntohl() */

#include <ctype.h>
#include <errno.h>
#include <getopt.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string.h>

#include <openssl/sha.h>

/*
 * List of vowels. Besides padding short letter lists, the length of this
 * string (sizeof(PHOGEN_VOWELS) - 1) is the number of next-letter candidates
 * kept per ngram.
 */
#define PHOGEN_VOWELS       "aeiou"

/* Number of letters in the English alphabet ('a'..'z') */
#define PHOGEN_ENGLISH_NUM  ('z' - 'a' + 1)

/*
 * Number of valid characters in an ngram. This is the number of letters in
 * the English alphabet with the addition of space (' '), which marks the
 * positions before the start of a word.
 */
#define PHOGEN_NGRAM_CHARS (PHOGEN_ENGLISH_NUM + 1)

/* Number of all possible ngrams (two-character combinations, including space) */
#define PHOGEN_NGRAM_NUM    (PHOGEN_NGRAM_CHARS * PHOGEN_NGRAM_CHARS)

static void phogen_rstrip(char *str, char *what);

/* The longest roman numeral (in terms of string length) is the one for 3888 */
#define ROMAN_MAX_LEN   sizeof("MMMDCCCLXXXVIII")

/*
 * One roman numeral building block: either a single letter or one of the
 * subtractive pairs (e.g. "CM").
 */
struct roman_symbol
{
    const char  *rs_symbol;     /* Numeral text; points to a string literal */
    int          rs_value;      /* Integer value of the symbol */
};

/*
 * Symbols sorted by descending value. Every subtractive pair is listed
 * before the single letter it starts with ("CM" before "C", "IX" before "I"),
 * so a first-match scan of this table picks the pair when it is present.
 *
 * The table is read-only, hence const.
 */
static const struct roman_symbol roman_symbol_list[] =
{
    {   .rs_symbol = "M",   .rs_value = 1000    },
    {   .rs_symbol = "CM",  .rs_value = 900     },
    {   .rs_symbol = "D",   .rs_value = 500     },
    {   .rs_symbol = "CD",  .rs_value = 400     },
    {   .rs_symbol = "C",   .rs_value = 100     },
    {   .rs_symbol = "XC",  .rs_value = 90      },
    {   .rs_symbol = "L",   .rs_value = 50      },
    {   .rs_symbol = "XL",  .rs_value = 40      },
    {   .rs_symbol = "X",   .rs_value = 10      },
    {   .rs_symbol = "IX",  .rs_value = 9       },
    {   .rs_symbol = "V",   .rs_value = 5       },
    {   .rs_symbol = "IV",  .rs_value = 4       },
    {   .rs_symbol = "I",   .rs_value = 1       }
};

/*
 * Long command line options accepted by getopt_long(); each one maps to the
 * short option character stored in `val`. The all-zero entry terminates the
 * list.
 */
static struct option phogen_map_long_options[] =
{
    { .name = "input",   .has_arg = required_argument, .flag = NULL, .val = 'i' },
    { .name = "json",    .has_arg = required_argument, .flag = NULL, .val = 'j' },
    { .name = "python",  .has_arg = required_argument, .flag = NULL, .val = 'p' },
    { .name = "clang",   .has_arg = required_argument, .flag = NULL, .val = 'c' },
    { .name = "test",    .has_arg = no_argument,       .flag = NULL, .val = 't' },
    { .name = "verbose", .has_arg = no_argument,       .flag = NULL, .val = 'v' },
    { .name = NULL,      .has_arg = 0,                 .flag = NULL, .val = 0   }
};

/*
 * Self-test vectors used by phogen_test(): `input` is hashed with SHA-256 and
 * the digest is turned into four 6-letter words, which must equal `output`.
 */
struct
{
    char *input;
    char *output;
} phogen_test_table[] =
{
    { .input = "passgeny",  .output = "herang xiasem zitend qibele" },
    { .input = "phonetic",  .output = "lineum foneum zybale mangur" },
    { .input = "generator", .output = "latole elitab ackina exprou" },
    { .input = "password",  .output = "nulize nomere fonici crednt" },
    { .input = "duck",      .output = "catabb rompor cricin prunsi" },
};

/* Comment banner written at the top of the generated Python file */
const char phogen_python_header[] =
    "#\n# Phonetic Map -- autogenerated, do not edit.\n#\n";

/*
 * Preamble written at the top of the generated C file: a banner comment, the
 * `struct phogen_entry` definition and the opening of the `phogen_map[]`
 * initializer. phogen_dump_clang() appends the entries and the closing "};".
 */
const char phogen_clang_header[] =
"/*\n"
" * Phonetic Map -- autogenerated, do not edit.\n"
" *\n"
" * The list contains exactly 27*27 entries (all combinations of 2 characters\n"
" * from the array  [\" \", \"a\"...\"z\"]) and is sorted alphabetically. This makes\n"
" * the ngram lookup time constant, but it also means that the map contains\n"
" * holes. The holes have a value of `map` set to NULL.\n"
" */\n"
"\n"
"struct phogen_entry\n"
"{\n"
"    char    *ngram;     /* Ngram */\n"
"    char    *map;       /* Character map */\n"
"};\n"
"\n"
"struct phogen_entry phogen_map[] =\n"
"{\n";

/* Path of the input word list; set from -i/--input in main() */
static char *g_word_list = NULL;
/* Verbosity level; incremented for every -v/--verbose on the command line */
static int g_verbose = 0;

/*
 * One `next letter` candidate of an ngram. An entry whose fe_letter is '\0'
 * is unused; the code that walks a row stops at the first such entry.
 */
struct phogen_freq_entry
{
    char    fe_letter;  /* Letter following the ngram, '\0' if unused */
    int     fe_freq;    /* How many times the letter followed the ngram */
};

/*
 * Table containing ngrams and next letter frequency. The first index is the
 * ngram index as computed by phogen_ngram_to_index(); each row holds up to
 * PHOGEN_ENGLISH_NUM candidate letters.
 */
static struct phogen_freq_entry phogen_freq_list[PHOGEN_NGRAM_NUM][PHOGEN_ENGLISH_NUM];

static int phogen_entry_cmp(const void *_a, const void *_b);

/*
 * Write the roman numeral for `i` into the buffer pointed to by `roman`.
 *
 * Only values in the range 1..3999 can be represented; for anything else the
 * buffer is left untouched and NULL is returned. On success the return value
 * points to the first character of the (NUL-terminated) buffer.
 */
char *roman_from_int(char (*roman)[ROMAN_MAX_LEN], int i)
{
    const size_t nsymbols = sizeof(roman_symbol_list) / sizeof(roman_symbol_list[0]);
    char *dst;
    size_t k;

    if (i < 1 || i > 3999) return NULL;

    dst = *roman;

    /* Greedy conversion: emit the largest symbol for as long as it fits */
    for (k = 0; k < nsymbols; k++)
    {
        const char *sym = roman_symbol_list[k].rs_symbol;
        const int value = roman_symbol_list[k].rs_value;
        const size_t symlen = strlen(sym);

        while (i >= value)
        {
            memcpy(dst, sym, symlen);
            dst += symlen;
            i -= value;
        }
    }

    *dst = '\0';

    return *roman;
}

/*
 * Convert a roman number to its integer value; return 0 on error
 *
 * The input is matched case-insensitively. It is accepted only if it is the
 * canonical spelling of its value, i.e. converting the parsed value back with
 * roman_from_int() reproduces the (upper-cased) input. NULL, the empty string
 * and anything that is not a roman number yield 0.
 */
int roman_to_int(const char *roman)
{
    const size_t nsymbols = sizeof(roman_symbol_list) / sizeof(roman_symbol_list[0]);
    char rs[ROMAN_MAX_LEN];
    char uroman[ROMAN_MAX_LEN];
    const char *proman;
    size_t len;
    size_t ii;
    int retval = 0;

    if (roman == NULL) return 0;

    /*
     * A canonical roman number never needs more than ROMAN_MAX_LEN - 1
     * characters, so longer input cannot pass the round-trip check below.
     * Rejecting it here keeps the working copy in a fixed-size buffer instead
     * of a stack array sized by the input.
     */
    len = strlen(roman);
    if (len >= sizeof(uroman)) return 0;

    /*
     * Convert all to upper. The cast is required: passing a negative char
     * (any non-ASCII byte where char is signed) to toupper() is undefined.
     */
    for (ii = 0; ii < len; ii++)
    {
        uroman[ii] = (char)toupper((unsigned char)roman[ii]);
    }
    uroman[len] = '\0';

    /* Consume the string symbol by symbol, summing up the symbol values */
    proman = uroman;
    while (*proman != '\0')
    {
        size_t symlen = 0;

        for (ii = 0; ii < nsymbols; ii++)
        {
            symlen = strlen(roman_symbol_list[ii].rs_symbol);
            if (strncmp(proman, roman_symbol_list[ii].rs_symbol, symlen) == 0)
            {
                break;
            }
        }

        /* No symbol matches at this position: not a roman number */
        if (ii >= nsymbols)
        {
            return 0;
        }

        retval += roman_symbol_list[ii].rs_value;
        proman += symlen;
    }

    /*
     * Verify that the input string is a correct roman number by converting
     * the value back to its roman representation; this rejects spellings such
     * as "IIII" as well as values outside of the supported range.
     */
    if (roman_from_int(&rs, retval) == NULL || strcmp(rs, uroman) != 0)
    {
        return 0;
    }

    return retval;
}

/*
 * Return the next line of the word list file (each line should represent a
 * single word), or NULL once the file is exhausted.
 *
 * The file named by `g_word_list` is opened on the first call; if that fails
 * the program exits with status 2. The returned pointer refers to a static
 * buffer that is overwritten by the next call and still contains the line
 * terminator. When NULL is returned the file has been closed, so a further
 * call starts over from the beginning of the file.
 */
char *phogen_word_list(void)
{
    static char line[1024];
    static FILE *fp = NULL;

    if (fp == NULL)
    {
        fp = fopen(g_word_list, "r");
        if (fp == NULL)
        {
            fprintf(stderr, "Error opening file: %s\n", g_word_list);
            exit(2);
        }
    }

    if (fgets(line, sizeof(line), fp) != NULL)
    {
        return line;
    }

    /* End of file or read error; only the latter is worth reporting */
    if (ferror(fp))
    {
        fprintf(stderr, "Error reading file : %s\n", g_word_list);
    }

    fclose(fp);
    fp = NULL;

    return NULL;
}

/*
 * Calculate the index of the ngram stored in the first two characters of
 * `word`.
 *
 * Each character is mapped to a digit (space -> 0, 'a' -> 1 ... 'z' -> 26)
 * and the two digits are combined as a base-PHOGEN_NGRAM_CHARS number. The
 * characters are not validated.
 */
int phogen_ngram_to_index(const char *word)
{
    const int first = (word[0] == ' ') ? 0 : word[0] - 'a' + 1;
    const int second = (word[1] == ' ') ? 0 : word[1] - 'a' + 1;

    return first * PHOGEN_NGRAM_CHARS + second;
}

/*
 * Map an index to its ngram string; this is the inverse of
 * phogen_ngram_to_index(). The result is NUL-terminated.
 */
void phogen_index_to_ngram(char (*ngram)[3], int index)
{
    const int digit[2] =
    {
        index / PHOGEN_NGRAM_CHARS,
        index % PHOGEN_NGRAM_CHARS
    };
    int pos;

    /* Digit 0 is the space, digits 1..26 are 'a'..'z' */
    for (pos = 0; pos < 2; pos++)
    {
        (*ngram)[pos] = (digit[pos] == 0) ? ' ' : (char)('a' + digit[pos] - 1);
    }

    (*ngram)[2] = '\0';
}

/*
 * Create a frequency map of `ngram -> next letter` mapping sorted by
 * frequency. Take the 5 (number of vowels) most frequent letters. If the
 * mapping contains less than 5 letters, fill it with vowels.
 */
void phogen_freq(void)
{
    char *word;
    int ni;

    /* Take the word list and split it into ngrams. Build a frequency list
     * of ngram -> next letter.
     */
    while ((word = phogen_word_list()) != NULL)
    {
        char *pword;

        char ngram[3] = "  ";

        phogen_rstrip(word, "\n\r");
        if (roman_to_int(word) > 0)
        {
            if (g_verbose) fprintf(stderr, "Ignoring roman number: %s\n", word);
            continue;
        }

        for (pword = word; *pword != '\0'; pword++)
        {
            *pword = tolower(*pword);
            if (*pword < 'a' || *pword > 'z')
            {
                if (g_verbose) fprintf(stderr, "Ignoring invalid word: %s\n", word);
                break;
            }
        }
        if (*pword != '\0') continue;

        for (pword = word; *pword != '\0'; pword++)
        {
            int ni = phogen_ngram_to_index(ngram);
            int ci = *pword - 'a';

            /* Update the frequency */
            phogen_freq_list[ni][ci].fe_letter = *pword;
            phogen_freq_list[ni][ci].fe_freq++;

            /* Calculate next ngram */
            ngram[0] = ngram[1];
            ngram[1] = *pword;
        }
    }

    /*
     * Scan the list, sort letters by frequency and use 5 most common letters.
     */
    for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++)
    {
        /* Sort the letters by frequency */
        qsort(
                phogen_freq_list[ni],
                sizeof(phogen_freq_list[ni]) / sizeof(phogen_freq_list[ni][0]),
                sizeof(struct phogen_freq_entry),
                phogen_entry_cmp);
    }

    /*
     * Filter out letters that do not produce a valid next-ngram mapping.
     *
     * If the list is shorter than 5 letters, fill it with vowels.
     */
    for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++)
    {
        char ngram[3];
        int ci;

        /*
         * Do not process "  ", this is the _starting_ ngram and should contain
         * all detected combinations
         */
        if (ni == 0) continue;

        /*
         * Ignore ngrams that do not have any next mappings (never seen in
         * the word list)
         */
        if (phogen_freq_list[ni][0].fe_letter == '\0') continue;

        /*
         * From the current ngram and the list of letters, generate all possible
         * next-ngrams and remove letters that produce invalid mappings.
         */
        phogen_index_to_ngram(&ngram, ni);
        ci = 0;
        while (phogen_freq_list[ni][ci].fe_letter != '\0')
        {
            char next_ngram[3];
            int next_ni;

            if ((ni != 0) && (ci >= (sizeof(PHOGEN_VOWELS) - 1)))
            {
                break;
            }

            next_ngram[0] = ngram[1];
            next_ngram[1] = phogen_freq_list[ni][ci].fe_letter;
            next_ngram[2] = '\0';

            next_ni = phogen_ngram_to_index(next_ngram);
            if (phogen_freq_list[next_ni][0].fe_letter != '\0')
            {
                ci++;
                continue;
            }

            if (g_verbose)
            {
                fprintf(stderr, "Ignoring valid ngram mapping %s + %c -> %s\n",
                        ngram,
                        phogen_freq_list[ni][ci].fe_letter,
                        next_ngram);
            }

            memmove(&phogen_freq_list[ni][ci], &phogen_freq_list[ni][ci + 1],
                    sizeof(phogen_freq_list[ni][0]) * (PHOGEN_ENGLISH_NUM - ci));
        }

        /*
         * Clip the number of letters to 5 (number of vowels). If the list is
         * shorter than 5, fill it with vowels
         */

        for (ci = 0; ci < sizeof(PHOGEN_VOWELS) - 1; ci++)
        {
            int cci;
            int vi;

            if (phogen_freq_list[ni][ci].fe_letter != '\0')
            {
                continue;
            }

            /* Find a vowel that doesn't exist in the list yet */
            for (vi = 0; vi < sizeof(PHOGEN_VOWELS) - 1; vi++)
            {
                for (cci = 0; cci <= ci; cci++)
                {
                    if (PHOGEN_VOWELS[vi] == phogen_freq_list[ni][cci].fe_letter)
                    {
                        break;
                    }
                }
                /* Vowel not found, break out */
                if (cci > ci)
                {
                    break;
                }
            }
            phogen_freq_list[ni][ci].fe_letter = PHOGEN_VOWELS[vi];
        }

        phogen_freq_list[ni][ci].fe_letter = '\0';
    }
}

/*
 * Strip all characters in `what` from the end of string `str`
 *
 * Trailing characters are removed for as long as they occur in `what`; a
 * string that consists only of such characters becomes the empty string.
 *
 * Note: This function modifies `str`
 */
void phogen_rstrip(char *str, char *what)
{
    size_t len = strlen(str);

    /*
     * Working with a length instead of a pointer to the last character
     * handles the empty string and lets the first character be stripped too.
     * str[len - 1] is never the terminator, so strchr() cannot match the
     * terminator of `what`.
     */
    while (len > 0 && strchr(what, str[len - 1]) != NULL)
    {
        str[--len] = '\0';
    }
}


/*
 * Take the buffer in `bigint` and treat it as a big-endian big number.
 * Perform a division using the 32-bit divisor in `base` and return the
 * 32-bit modulo.
 *
 * The buffer is modified in place: on return it holds the quotient (again as
 * a big-endian number of `bigintsz` bytes), so repeated calls extract
 * successive digits of the number.
 *
 * A `base` of 0 or a NULL buffer leaves the buffer untouched and returns 0.
 */
uint32_t bigint_mod32(void *bigint, size_t bigintsz, uint32_t base)
{
    unsigned char *digit = bigint;
    unsigned char *end;
    uint64_t n;

    uint32_t mod = 0;

    if (bigint == NULL || base == 0) return 0;

    /*
     * Long division, one byte at a time, most significant byte first. This
     * produces the same quotient and remainder as dividing in 32-bit words,
     * but needs no byte swapping, has no alignment requirements and works
     * for any buffer size.
     *
     * `mod` is always smaller than `base`, therefore n < base * 256: the
     * partial quotient fits into a byte and n fits into 64 bits.
     */
    for (end = digit + bigintsz; digit < end; digit++)
    {
        n = ((uint64_t)mod << 8) | *digit;

        *digit = (unsigned char)(n / base);
        mod = (uint32_t)(n % base);
    }

    return mod;
}

/*
 * Take input buffer `in` and generate its phonetic representation
 * and store it to out.
 */
void phogen(char *out, size_t outsz, void *in, size_t insz)
{
    int ii;

    /* Starting ngram */
    char ngram[3] = "  ";

    for (ii = 0; ii < (outsz - 1); ii++)
    {
        int nsize;
        int nsel;

        int ni = phogen_ngram_to_index(ngram);

        /* Calculate the length of the letter pool */
        for (nsize = 0; nsize < PHOGEN_ENGLISH_NUM; nsize++)
        {
            if (phogen_freq_list[ni][nsize].fe_letter == '\0') break;
        }
        nsel = bigint_mod32(in, insz, nsize);

        out[ii] = phogen_freq_list[ni][nsel].fe_letter;

        /* Generate next ngram */
        ngram[0] = ngram[1];
        ngram[1] = phogen_freq_list[ni][nsel].fe_letter;
    }

    out[ii] = '\0';
}

/*
 * Run basic tests
 *
 * Every input in phogen_test_table is hashed with SHA-256 and the digest is
 * converted to four 6-letter words separated by single spaces. A mismatch
 * with the expected output is reported on stderr. Returns true only if all
 * entries match.
 */
bool phogen_test(void)
{
    enum { WORD_LEN = 6, WORD_NUM = 4 };

    const size_t ntests = sizeof(phogen_test_table) / sizeof(phogen_test_table[0]);
    bool all_ok = true;
    size_t ti;

    for (ti = 0; ti < ntests; ti++)
    {
        unsigned char digest[SHA256_DIGEST_LENGTH];
        SHA256_CTX ctx;
        /* WORD_NUM words, each followed by a separator or the terminator */
        char phrase[(WORD_LEN + 1) * WORD_NUM];
        char *cursor = phrase;
        int wi;

        SHA256_Init(&ctx);
        SHA256_Update(&ctx, phogen_test_table[ti].input, strlen(phogen_test_table[ti].input));
        SHA256_Final(digest, &ctx);

        /* phogen() consumes the digest, so each word differs from the last */
        for (wi = 0; wi < WORD_NUM; wi++)
        {
            phogen(cursor, WORD_LEN + 1, digest, sizeof(digest));
            cursor += strlen(cursor);
            *cursor++ = ' ';
        }
        /* Turn the separator after the last word into the terminator */
        cursor[-1] = '\0';

        if (strcmp(phrase, phogen_test_table[ti].output) != 0)
        {
            fprintf(stderr, "Error, test failed: %s != %s\n", phrase, phogen_test_table[ti].output);
            all_ok = false;
        }
    }

    return all_ok;
}

/*
 * qsort() comparator for struct phogen_freq_entry
 *
 * Orders entries by descending frequency; entries with the same frequency
 * are ordered by descending letter.
 */
int phogen_entry_cmp(const void *_a, const void *_b)
{
    const struct phogen_freq_entry *lhs = _a;
    const struct phogen_freq_entry *rhs = _b;

    /* The operands are swapped (rhs - lhs), so the sorting order is inverted */
    int order = rhs->fe_freq - lhs->fe_freq;

    if (order == 0)
    {
        order = rhs->fe_letter - lhs->fe_letter;
    }

    return order;
}

/*
 * Sanity check of the index mapping functions: every ngram index must
 * survive the round trip index -> ngram -> index. Returns false on the first
 * mismatch (reported on stderr in verbose mode only).
 */
bool phogen_pre_test(void)
{
    int index;

    for (index = 0; index < PHOGEN_NGRAM_NUM; index++)
    {
        char ngram[3];

        phogen_index_to_ngram(&ngram, index);
        if (phogen_ngram_to_index(ngram) == index)
        {
            continue;
        }

        if (g_verbose) fprintf(stderr, "Internal error, index mapping functions are broken.\n");
        return false;
    }

    return true;
}

/*
 * Dump the frequency table in JSON format
 *
 * The output is a single object that maps every ngram with a non-empty
 * letter list to an array of one-letter strings. Ngrams without letters are
 * omitted; if there are none at all, an empty object is written.
 */
void phogen_dump_json(FILE *f)
{
    const int ngram_num = (int)(sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0]));
    /*
     * Tracks whether an entry has been written already. The array of the
     * previous entry is closed based on this rather than on the ngram index,
     * because the first populated ngram need not be the one at index 0.
     */
    bool first = true;
    int ni;
    int ci;

    fprintf(f, "{\n");
    for (ni = 0; ni < ngram_num; ni++)
    {
        char ngram[3];

        if (phogen_freq_list[ni][0].fe_letter == '\0') continue;

        if (!first)
        {
            fprintf(f, "\n    ],\n");
        }
        first = false;

        phogen_index_to_ngram(&ngram, ni);
        fprintf(f, "    \"%s\": [\n", ngram);

        for (ci = 0; ci < PHOGEN_ENGLISH_NUM; ci++)
        {
            if (phogen_freq_list[ni][ci].fe_letter == '\0') break;
            if (ci != 0)
            {
                fprintf(f, ",\n");
            }

            fprintf(f, "        \"%c\"", phogen_freq_list[ni][ci].fe_letter);
        }
    }

    /* Close the array of the last entry, if there was one */
    if (!first)
    {
        fprintf(f, "\n    ]\n");
    }
    fprintf(f, "}\n");
}

/*
 * Dump the frequency table in PYTHON format
 *
 * The output is the header comment followed by an assignment of a dict to
 * `g_phonetic_map`; the dict maps every ngram with a non-empty letter list
 * to a list of one-letter strings. Ngrams without letters are omitted; if
 * there are none at all, an empty dict is written.
 */
void phogen_dump_python(FILE *f)
{
    const int ngram_num = (int)(sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0]));
    /*
     * Tracks whether an entry has been written already. The dict is opened
     * based on this rather than on the ngram index, because the first
     * populated ngram need not be the one at index 0.
     */
    bool first = true;
    int ni;
    int ci;

    fprintf(f, "%sg_phonetic_map = \\", phogen_python_header);
    for (ni = 0; ni < ngram_num; ni++)
    {
        char ngram[3];

        if (phogen_freq_list[ni][0].fe_letter == '\0') continue;

        phogen_index_to_ngram(&ngram, ni);
        /* Open the dict before the first entry, close the previous list otherwise */
        fprintf(f, "%s'%s': ", first ? "\n{" : "],\n ", ngram);
        first = false;

        for (ci = 0; ci < PHOGEN_ENGLISH_NUM; ci++)
        {
            if (phogen_freq_list[ni][ci].fe_letter == '\0') break;
            fprintf(f, "%s'%c'", (ci == 0) ? "[" : ", ", phogen_freq_list[ni][ci].fe_letter);
        }
    }

    /* Without any entry there is neither a list nor a dict to close */
    fprintf(f, "%s", first ? "\n{}\n" : "]}\n");
}

/*
 * Dump the frequency table in C format
 */
void phogen_dump_clang(FILE *f)
{
    int ni;
    int ci;

    fprintf(f, "%s", phogen_clang_header);
    for (ni = 0; ni < (sizeof(phogen_freq_list) / sizeof(phogen_freq_list[0])); ni++)
    {
        char ngram[3];

        phogen_index_to_ngram(&ngram, ni);

        if (ni != 0)
        {
            fprintf(f, "    },\n");
        }

        fprintf(f, "    {\n");
        fprintf(f, "        .ngram = \"%s\",\n", ngram);

        if (phogen_freq_list[ni][0].fe_letter == '\0')
        {
            fprintf(f, "        .map = NULL\n");
        }
        else
        {
            fprintf(f, "        .map = \"");
            for (ci = 0; ci < PHOGEN_ENGLISH_NUM; ci++)
            {
                if (phogen_freq_list[ni][ci].fe_letter == '\0') break;
                fprintf(f, "%c", phogen_freq_list[ni][ci].fe_letter);
            }
            fprintf(f, "\"\n");
        }
    }
    fprintf(f, "    }\n};\n");
}


/*
 * Print the usage text to stdout.
 *
 * NOTE(review): the text refers to `phogen_map.py`, says that the tests run
 * when no arguments are given and describes -t as producing passwords, while
 * main() requires -i and accepts -t without acting on it -- confirm which
 * side is right.
 */
void help(void)
{
    static const char usage[] =
            "usage: phogen_map.py -i INPUT [-j JSON] [-p PYTHON] [-c CLANG] [-t] [-v]\n"
            "\n"
            "Generate mappings for the phonetic generator. Without arguments, just run the built-in tests.\n"
            "\n"
            "optional arguments:\n"
            "  -i INPUT, --input INPUT\n"
            "                        Input file (word list)\n"
            "  -j JSON, --json JSON  JSON output file\n"
            "  -p PYTHON, --python PYTHON\n"
            "                        Python output file\n"
            "  -c CLANG, --clang CLANG\n"
            "                        C output file\n"
            "  -t, --test            Output random passwords\n"
            "  -v, --verbose         Verbose\n";

    fputs(usage, stdout);
}

int main(int argc, char *argv[])
{
    int opt;

    char *json_output = NULL;
    char *clang_output = NULL;
    char *python_output = NULL;

    //bool test = false;

    while ((opt = getopt_long(argc, argv, "i:j:p:c:tv", phogen_map_long_options, NULL)) != -1)
    {
        switch (opt)
        {
            case 'i':
                g_word_list = optarg;
                break;

            case 'j':
                json_output = optarg;
                break;

            case 'p':
                python_output = optarg;
                break;

            case 'c':
                clang_output = optarg;
                break;

            case 't':
                break;

            case 'v':
                g_verbose++;
                break;

            default:
                help();
                return 127;
        }
    }

    if (g_word_list == NULL)
    {
        fprintf(stderr, "An input parameter is required (--input or -i).\n");
        return 127;
    }

    if (!phogen_pre_test())
    {
        printf("Basic tests failed.");
        return 1;
    }

    /* Generate the frequency map */
    phogen_freq();
    /* Run basic tests */
    phogen_test();

    /* JSON output */
    if (json_output != NULL)
    {
        FILE *f = fopen(json_output, "w");
        if (f == NULL)
        {
            fprintf(stderr, "Error opening JSON output file %s: %s\n",
                    json_output,
                    strerror(errno));
            return 1;
        }
        phogen_dump_json(f);
        fclose(f);
    }

    /* PYTHON output */
    if (python_output != NULL)
    {
        FILE *f = fopen(python_output, "w");
        if (f == NULL)
        {
            fprintf(stderr, "Error opening PYTHON output file %s: %s\n",
                    python_output,
                    strerror(errno));
            return 1;
        }
        phogen_dump_python(f);
        fclose(f);
    }

    /* C output */
    if (clang_output != NULL)
    {
        FILE *f = fopen(clang_output, "w");
        if (f == NULL)
        {
            fprintf(stderr, "Error opening CLANG output file %s: %s\n",
                    python_output,
                    strerror(errno));
            return 1;
        }
        phogen_dump_clang(f);
        fclose(f);
    }

    return 0;
}