From 560db14a85c43d91eb6254ef588eea74bddacedf Mon Sep 17 00:00:00 2001 From: Mitja HORVAT Date: Sun, 14 Nov 2021 09:12:20 +0100 Subject: [PATCH] c: Finish the phogen module This commit implements the `phogen_encode` function, which takes a bhash as argument and generates a phonetic string. --- c/phogen/inc/phogen.h | 6 +++ c/phogen/meson.build | 3 +- c/phogen/phogen_map/phogen_map.c | 1 - c/phogen/src/phogen.c | 72 +++++++++++++++++++++++++++++++- 4 files changed, 78 insertions(+), 4 deletions(-) diff --git a/c/phogen/inc/phogen.h b/c/phogen/inc/phogen.h index 29b27bc..67ccf2c 100644 --- a/c/phogen/inc/phogen.h +++ b/c/phogen/inc/phogen.h @@ -1,4 +1,10 @@ #ifndef PHOGEN_H_INCLUDED #define PHOGEN_H_INCLUDED +#include + +#include "bhash.h" + +bool phogen_encode(char *out, size_t out_len, bhash_t *bh_in); + #endif /* PHOGEN_H_INCLUDED */ diff --git a/c/phogen/meson.build b/c/phogen/meson.build index 393d9fd..e8d24f8 100644 --- a/c/phogen/meson.build +++ b/c/phogen/meson.build @@ -13,6 +13,7 @@ phogen_inc = include_directories('inc') phogen_lib = static_library( 'phogen', [phogen_map_h, 'src/phogen.c'], - include_directories : phogen_inc) + include_directories : phogen_inc, + dependencies: bhash_dep) phogen_dep = declare_dependency(link_with : phogen_lib, include_directories : phogen_inc) diff --git a/c/phogen/phogen_map/phogen_map.c b/c/phogen/phogen_map/phogen_map.c index 6b607e3..db06558 100644 --- a/c/phogen/phogen_map/phogen_map.c +++ b/c/phogen/phogen_map/phogen_map.c @@ -452,7 +452,6 @@ void phogen_rstrip(char *str, char *what) } } - /* * Take the buffer in `bigin` and treat is as a big-endian big number. 
* Perform a division using the 32-bit divisor in `base` and return the diff --git a/c/phogen/src/phogen.c b/c/phogen/src/phogen.c index e25eb8a..fb8b529 100644 --- a/c/phogen/src/phogen.c +++ b/c/phogen/src/phogen.c @@ -1,10 +1,78 @@ -#include - /* + * ========================================================================= * Phonetic generator -- used to convert binary data to semi-pronouncable * strings + * ========================================================================= */ +#include +#include #include "phogen_map.h" #include "phogen.h" +/* Number of letters in the English alphabet */ +#define PHOGEN_ENGLISH_NUM ('z' - 'a' + 1) +/* Number of valid characters in an n-gram: the letters plus the blank (' ') placeholder */ +#define PHOGEN_NGRAM_CHARS (PHOGEN_ENGLISH_NUM + 1) + +static int phogen_ngram_to_index(const char *word); + +/* + * Create a phonetic string by consuming bits from the bhash `bh_in` and write it to `out`: `out_sz - 1` characters followed by a terminating NUL. + * Returns true on success (immediately, without touching `out`, when `out_sz` is 0); returns false after printing a message to stderr when bhash_mod32() yields BHASH_MOD32_ERR, in which case `out` is left without a terminating NUL. + */ +bool phogen_encode(char *out, size_t out_sz, bhash_t *bh_in) +{ + size_t ii; + size_t nmap; + uint32_t nm; + + char ngram[3] = " "; + + if (out_sz == 0) return true; + + for (ii = 0; ii < (out_sz - 1); ii++) + { + int ni = phogen_ngram_to_index(ngram); + + /* Calculate the number of letters in the current map: entries before the first NUL, capped at PHOGEN_ENGLISH_NUM */ + for (nmap = 0; nmap < PHOGEN_ENGLISH_NUM; nmap++) + { + if (phogen_map[ni].map[nmap] == '\0') break; + } + + /* Get the index of the next phonetic letter. NOTE(review): nmap is 0 here if the current map is empty -- confirm bhash_mod32() copes with a zero modulus */ + nm = bhash_mod32(bh_in, nmap); + if (nm == BHASH_MOD32_ERR) + { + fprintf(stderr, "bhash error, unable to compute next phoentic char."); + return false; + } + + out[ii] = phogen_map[ni].map[nm]; + + /* Shift the n-gram window: drop the oldest letter and append the one just emitted */ + ngram[0] = ngram[1]; + ngram[1] = out[ii]; + } + + out[ii] = '\0'; + + return true; +} + +/* + * The phogen_map table is fixed size and sorted, so the offset of an n-gram can be computed directly: a blank (' ') counts as 0 and any other character as its distance from 'a' plus 1. + * The result is (value of word[0]) * PHOGEN_NGRAM_CHARS + (value of word[1]). NOTE(review): characters other than ' ' and 'a'..'z' are not validated here -- confirm callers only pass those. + */ +int phogen_ngram_to_index(const char *word) +{ + int i; + + i = (word[0] == ' ' ? 
0 : word[0] - 'a' + 1); + i *= PHOGEN_NGRAM_CHARS; + i += (word[1] == ' ' ? 0 : word[1] - 'a' + 1); + + return i; +} +