c: Finish the phogen module
This commit implements the `phogen_encode` function, which takes a bhash as argument and generates a phonetic string.
This commit is contained in:
@ -1,10 +1,78 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
/*
|
||||
* =========================================================================
|
||||
 * Phonetic generator -- used to convert binary data to semi-pronounceable
|
||||
* strings
|
||||
* =========================================================================
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "phogen_map.h"
|
||||
#include "phogen.h"
|
||||
|
||||
/* Number of letters in the english alphabet */
|
||||
#define PHOGEN_ENGLISH_NUM ('z' - 'a' + 1)
|
||||
/* Number of valid characters in an n-gram (the letters plus the ' ' placeholder) */
|
||||
#define PHOGEN_NGRAM_CHARS (PHOGEN_ENGLISH_NUM + 1)
|
||||
|
||||
static int phogen_ngram_to_index(const char *word);
|
||||
|
||||
/*
|
||||
* Create a phonetic string of size `out_sz` by consuming bits from the bhash
|
||||
* `bh_in` and write it to out.
|
||||
*/
|
||||
bool phogen_encode(char *out, size_t out_sz, bhash_t *bh_in)
|
||||
{
|
||||
size_t ii;
|
||||
size_t nmap;
|
||||
uint32_t nm;
|
||||
|
||||
char ngram[3] = " ";
|
||||
|
||||
if (out_sz == 0) return true;
|
||||
|
||||
for (ii = 0; ii < (out_sz - 1); ii++)
|
||||
{
|
||||
int ni = phogen_ngram_to_index(ngram);
|
||||
|
||||
/* Calculte the number of letters in the current map */
|
||||
for (nmap = 0; nmap < PHOGEN_ENGLISH_NUM; nmap++)
|
||||
{
|
||||
if (phogen_map[ni].map[nmap] == '\0') break;
|
||||
}
|
||||
|
||||
/* Get the index of the next phonetic letter */
|
||||
nm = bhash_mod32(bh_in, nmap);
|
||||
if (nm == BHASH_MOD32_ERR)
|
||||
{
|
||||
fprintf(stderr, "bhash error, unable to compute next phoentic char.");
|
||||
return false;
|
||||
}
|
||||
|
||||
out[ii] = phogen_map[ni].map[nm];
|
||||
|
||||
/* Calculate the next ngram */
|
||||
ngram[0] = ngram[1];
|
||||
ngram[1] = out[ii];
|
||||
}
|
||||
|
||||
out[ii] = '\0';
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* The phogen_map table is fixed size and sorted. We can calculate the offset
|
||||
* of a ngram with the simple calculation below.
|
||||
*/
|
||||
int phogen_ngram_to_index(const char *word)
|
||||
{
|
||||
int i;
|
||||
|
||||
i = (word[0] == ' ' ? 0 : word[0] - 'a' + 1);
|
||||
i *= PHOGEN_NGRAM_CHARS;
|
||||
i += (word[1] == ' ' ? 0 : word[1] - 'a' + 1);
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user