c: Finish the phogen module

This commit implements the `phogen_encode` function, which takes a bhash as argument and generates a phonetic string.
2021-11-14 09:12:20 +01:00
parent ad0627b9c1
commit 560db14a85
4 changed files with 78 additions and 4 deletions
--- a/c/phogen/src/phogen.c
+++ b/c/phogen/src/phogen.c
@ -1,10 +1,78 @@
-#include <stdlib.h>
-
 /*
+ * =========================================================================
 * Phonetic generator -- used to convert binary data to semi-pronounceable
 * strings
+ * =========================================================================
 */
+#include <stdlib.h>
+#include <stdio.h>

 #include "phogen_map.h"
 #include "phogen.h"

+/* Number of letters in the English alphabet */
+#define PHOGEN_ENGLISH_NUM  ('z' - 'a' + 1)
+/* Number of valid characters in an n-gram */
+#define PHOGEN_NGRAM_CHARS (PHOGEN_ENGLISH_NUM + 1)
+
+static int phogen_ngram_to_index(const char *word);
+
+/*
+ * Create a phonetic string by consuming bits from the bhash `bh_in` and
+ * write it to `out`.
+ *
+ * Generation starts from the empty n-gram (two spaces). For every output
+ * position the `phogen_map` entry of the two previously emitted letters is
+ * looked up, its candidate letters are counted, and `bhash_mod32()` is
+ * asked for a value that is used as an index into those candidates.
+ *
+ *   out     destination buffer; receives `out_sz - 1` letters followed by a
+ *           terminating '\0'
+ *   out_sz  size of `out` in bytes, terminator included; 0 is a no-op
+ *   bh_in   bhash handed to `bhash_mod32()` for every letter
+ *
+ * Returns true on success (also when `out_sz` is 0). Returns false when
+ * `out` is NULL, when the current n-gram has no candidate letters, or when
+ * `bhash_mod32()` reports BHASH_MOD32_ERR. On the latter two failures a
+ * message is printed to stderr and `out` holds the NUL-terminated letters
+ * generated so far.
+ */
+bool phogen_encode(char *out, size_t out_sz, bhash_t *bh_in)
+{
+    size_t ii;
+    size_t nmap;
+    uint32_t nm;
+
+    char ngram[3] = "  ";
+
+    if (out_sz == 0) return true;
+    if (out == NULL) return false;
+
+    for (ii = 0; ii < (out_sz - 1); ii++)
+    {
+        int ni = phogen_ngram_to_index(ngram);
+
+        /*
+         * Calculate the number of letters in the current map; a '\0' ends
+         * the list early when it holds fewer than PHOGEN_ENGLISH_NUM entries
+         */
+        for (nmap = 0; nmap < PHOGEN_ENGLISH_NUM; nmap++)
+        {
+            if (phogen_map[ni].map[nmap] == '\0') break;
+        }
+
+        /* An empty map would hand a modulus of 0 to bhash_mod32() */
+        if (nmap == 0)
+        {
+            fprintf(stderr, "phogen error, no phonetic char follows the current ngram.\n");
+            out[ii] = '\0';
+            return false;
+        }
+
+        /* Get the index of the next phonetic letter */
+        nm = bhash_mod32(bh_in, nmap);
+        if (nm == BHASH_MOD32_ERR)
+        {
+            fprintf(stderr, "bhash error, unable to compute next phonetic char.\n");
+            /* Leave the caller with a valid, if truncated, string */
+            out[ii] = '\0';
+            return false;
+        }
+
+        out[ii] = phogen_map[ni].map[nm];
+
+        /* Calculate the next ngram: slide the two letter window by one */
+        ngram[0] = ngram[1];
+        ngram[1] = out[ii];
+    }
+
+    out[ii] = '\0';
+
+    return true;
+}
+
+/*
+ * Translate a two character ngram into its position in the phogen_map
+ * table. The table is fixed size and sorted, so the position can be
+ * computed directly instead of searched for.
+ *
+ * Each of the two characters becomes a digit: a space is 0, any other
+ * character `c` is `c - 'a' + 1`. The ngram is then read as a two digit
+ * number in base PHOGEN_NGRAM_CHARS with `word[0]` as the most significant
+ * digit.
+ *
+ * Only `word[0]` and `word[1]` are read. The function has internal linkage
+ * through its `static` forward declaration earlier in this file.
+ */
+int phogen_ngram_to_index(const char *word)
+{
+    int first = 0;
+    int second = 0;
+
+    if (word[0] != ' ')
+    {
+        first = word[0] - 'a' + 1;
+    }
+
+    if (word[1] != ' ')
+    {
+        second = word[1] - 'a' + 1;
+    }
+
+    return first * PHOGEN_NGRAM_CHARS + second;
+}
+