/*
 * libpostal-sys 0.1.1
 *
 * Low-level wrappers for libpostal address normalization (with locks to support thread-safe initialization)
 * Documentation
 */
#ifndef TRANSLITERATE_H
#define TRANSLITERATE_H

#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>

#include "collections.h"
#include "constants.h"
#include "klib/khash.h"
#include "string_utils.h"
#include "trie.h"
#include "trie_search.h"
#include "unicode_scripts.h"

// Names of specific transliterators.
// NOTE(review): presumably valid values for the name argument of
// get_transliterator()/transliterate() — confirm against the data file.
#define LATIN_ASCII "latin-ascii"
#define LATIN_ASCII_SIMPLE "latin-ascii-simple"
#define HTML_ESCAPE "html-escape"

// Data file name and its default location. The path is formed by placing the
// three macros side by side, so it assumes LIBPOSTAL_TRANSLITERATION_DIR and
// PATH_SEPARATOR (defined elsewhere) both expand to string literals.
#define TRANSLITERATION_DATA_FILE "transliteration.dat"
#define DEFAULT_TRANSLITERATION_PATH LIBPOSTAL_TRANSLITERATION_DIR PATH_SEPARATOR TRANSLITERATION_DATA_FILE

// NOTE(review): presumably an upper bound on transliterator name length; it is
// not referenced anywhere else in this header.
#define MAX_TRANS_NAME_LEN 100

// Kind tag carried by every transliteration_step_t.
// The numeric values are spelled out so they stay fixed if enumerators are
// ever reordered; they match the values the compiler assigned implicitly.
typedef enum {
    STEP_RULESET = 0,
    STEP_TRANSFORM = 1,
    STEP_UNICODE_NORMALIZATION = 2
} step_type_t;

// A single step of a transliterator: a step kind plus a name.
// NOTE(review): what `name` identifies for each step kind is not visible in
// this header — confirm in the implementation.
typedef struct transliteration_step {
    step_type_t type;
    char *name;
} transliteration_step_t;

// Constructor/destructor pair for transliteration_step_t.
// NOTE(review): whether `name` is copied or borrowed by the constructor is
// decided in the implementation — confirm before freeing the argument.
transliteration_step_t *transliteration_step_new(char *name, step_type_t type);
void transliteration_step_destroy(transliteration_step_t *self);

// Declares step_array, a vector of transliteration_step_t pointers, passing
// transliteration_step_destroy as the per-element destructor.
VECTOR_INIT_FREE_DATA(step_array, transliteration_step_t *, transliteration_step_destroy)

// Describes one named transliterator.
typedef struct transliterator {
    // Name of the transliterator (str_transliterator maps strings to these records).
    char *name;
    // NOTE(review): presumably a 0/1 flag marking transliterators that are
    // only used by other transliterators — confirm.
    uint8_t internal;
    // NOTE(review): presumably the position of this transliterator's first
    // step in transliteration_table_t.steps — confirm.
    uint32_t steps_index;
    // NOTE(review): presumably the number of consecutive steps starting at
    // steps_index — confirm.
    size_t steps_length;
} transliterator_t;

// NOTE(review): presumably a limit related to capture groups; it is not
// referenced anywhere else in this header.
#define MAX_GROUP_LEN 5

// A captured span expressed as a start offset and a length.
// NOTE(review): units (bytes vs. code points) are not stated here — confirm.
typedef struct group_capture {
    size_t start;
    size_t len;
} group_capture_t;

// Declares group_capture_array, a vector of group_capture_t stored by value.
VECTOR_INIT(group_capture_array, group_capture_t)

// Replacement record for a transliteration rule.
typedef struct transliteration_replacement {
    // NOTE(review): presumably an index into
    // transliteration_table_t.replacement_strings — confirm.
    uint32_t string_index;
    // NOTE(review): presumably an index into
    // transliteration_table_t.revisit_strings — confirm.
    uint32_t revisit_index;
    // NOTE(review): presumably the number of entries in `groups` — confirm.
    size_t num_groups;
    // Capture groups associated with this replacement.
    group_capture_array *groups;
} transliteration_replacement_t;

// Creates a replacement record from the two indices and a group array.
// NOTE(review): ownership of `groups` after the call is not visible here —
// confirm whether the destructor frees it.
transliteration_replacement_t *transliteration_replacement_new(
    uint32_t string_index,
    uint32_t revisit_index,
    group_capture_array *groups
);

// Releases a record created by transliteration_replacement_new().
void transliteration_replacement_destroy(transliteration_replacement_t *self);

// Declares transliteration_replacement_array, a vector of replacement pointers,
// passing transliteration_replacement_destroy as the per-element destructor.
VECTOR_INIT_FREE_DATA(transliteration_replacement_array, transliteration_replacement_t *, transliteration_replacement_destroy)

// Declares the khash map type str_transliterator: C-string key -> transliterator_t *.
KHASH_MAP_INIT_STR(str_transliterator, transliterator_t *)

// Hash and equality callbacks for khash tables keyed by a (script, language)
// pair. Both macros evaluate their arguments more than once, so pass plain
// lvalues only.
//
// kh_script_lang_hash: XORs the script id with the string hash of the
// language; a NULL language contributes 0.
#define kh_script_lang_hash(key)  ((khint_t)(key).script ^ (((key).language == NULL) ? 0 : kh_str_hash_func((key).language)))
// kh_script_lang_equal: keys match when the scripts are equal and the
// languages are either both NULL or compare equal with strcmp(). The NULL
// test keeps this consistent with the hash above, which accepts a NULL
// language; passing NULL to strcmp() would be undefined behaviour.
#define kh_script_lang_equal(a, b)  (((a).script == (b).script) && \
    ((((a).language == NULL) || ((b).language == NULL)) \
        ? ((a).language == (b).language) \
        : (strcmp((a).language, (b).language) == 0)))

// Value stored per (script, language) key: a start position and a count.
// foreach_transliterator walks [transliterator_index,
// transliterator_index + num_transliterators) over the table's
// transliterator_names array.
typedef struct transliterator_index {
    size_t transliterator_index;
    size_t num_transliterators;
} transliterator_index_t;

// Empty range (start 0, count 0); iterating over it visits nothing.
#define NULL_TRANSLITERATOR_INDEX (transliterator_index_t) {0, 0}

// Declares the khash map type script_language_index:
// script_language_t key -> transliterator_index_t value, using the custom
// hash and equality macros kh_script_lang_hash / kh_script_lang_equal.
KHASH_INIT(script_language_index, script_language_t, transliterator_index_t, 1, kh_script_lang_hash, kh_script_lang_equal)

// Aggregate of all transliteration data; get_transliteration_table() returns
// a pointer to one of these.
typedef struct transliteration_table {
    // Lookup of transliterator records by name.
    khash_t(str_transliterator) *transliterators;

    // (script, language) -> range of entries in transliterator_names.
    khash_t(script_language_index) *script_languages;
    // Transliterator names, addressed by the ranges stored in script_languages.
    cstring_array *transliterator_names;

    // Steps of all transliterators.
    // NOTE(review): presumably addressed via transliterator_t.steps_index /
    // steps_length — confirm.
    step_array *steps;
    // NOTE(review): presumably holds the rule keys, with values referring to
    // entries in `replacements` — confirm in the implementation.
    trie_t *trie;

    // Replacement records plus the string pools their indices presumably refer to.
    transliteration_replacement_array *replacements;
    cstring_array *replacement_strings;
    cstring_array *revisit_strings;
} transliteration_table_t;

// Control characters are special: each marker below is defined as a one-byte
// string literal, its numeric value, and its byte length.
// NOTE(review): how each marker is interpreted by the rule engine is defined
// in the implementation, not in this header.
//
// The *_LEN macros use sizeof on the literal minus its terminating NUL rather
// than strlen(). The value and type (size_t) are unchanged, but the result is
// a compile-time constant and no longer requires <string.h> at the use site.
#define WORD_BOUNDARY_CHAR "\x01"
#define WORD_BOUNDARY_CODEPOINT 1
#define WORD_BOUNDARY_CHAR_LEN (sizeof(WORD_BOUNDARY_CHAR) - 1)
#define PRE_CONTEXT_CHAR "\x86"
#define PRE_CONTEXT_CODEPOINT 134
#define PRE_CONTEXT_CHAR_LEN (sizeof(PRE_CONTEXT_CHAR) - 1)
#define POST_CONTEXT_CHAR "\x87"
#define POST_CONTEXT_CODEPOINT 135
#define POST_CONTEXT_CHAR_LEN (sizeof(POST_CONTEXT_CHAR) - 1)
#define EMPTY_TRANSITION_CHAR "\x04"
#define EMPTY_TRANSITION_CODEPOINT 4
#define EMPTY_TRANSITION_CHAR_LEN (sizeof(EMPTY_TRANSITION_CHAR) - 1)
#define REPEAT_CHAR "\x05"
#define REPEAT_CODEPOINT 5
#define REPEAT_CHAR_LEN (sizeof(REPEAT_CHAR) - 1)
#define GROUP_INDICATOR_CHAR "\x1d"
#define GROUP_INDICATOR_CODEPOINT 29
#define GROUP_INDICATOR_CHAR_LEN (sizeof(GROUP_INDICATOR_CHAR) - 1)
#define BEGIN_SET_CHAR "\x0f"
#define BEGIN_SET_CODEPOINT 15
#define BEGIN_SET_CHAR_LEN (sizeof(BEGIN_SET_CHAR) - 1)
#define END_SET_CHAR "\x0e"
#define END_SET_CODEPOINT 14
#define END_SET_CHAR_LEN (sizeof(END_SET_CHAR) - 1)


/* Decimal ASCII values of punctuation characters, grouped by kind. */

/* '$' */
#define DOLLAR_CODEPOINT 36

/* '(' and ')' */
#define LPAREN_CODEPOINT 40
#define RPAREN_CODEPOINT 41

/* '*' and '+' */
#define STAR_CODEPOINT 42
#define PLUS_CODEPOINT 43

/* '[', backslash and ']' */
#define LSQUARE_CODEPOINT 91
#define BACKSLASH_CODEPOINT 92
#define RSQUARE_CODEPOINT 93

/* '{' and '}' */
#define LCURLY_CODEPOINT 123
#define RCURLY_CODEPOINT 125


// Primary API

// Returns the transliteration table.
// NOTE(review): presumably a module-wide instance created by
// transliteration_module_init()/transliteration_module_setup() — confirm
// whether it can be NULL before setup.
transliteration_table_t *get_transliteration_table(void);

// Constructor/destructor pair for transliterator_t; the constructor arguments
// correspond one-to-one to the struct's fields.
// NOTE(review): whether `name` is copied is decided in the implementation.
transliterator_t *transliterator_new(char *name, uint8_t internal, uint32_t steps_index, size_t steps_length);
void transliterator_destroy(transliterator_t *self);

// Registers a transliterator with the table; returns false on failure.
// NOTE(review): presumably the table takes ownership of `trans` on success — confirm.
bool transliteration_table_add_transliterator(transliterator_t *trans);

// Looks up a transliterator by name.
// NOTE(review): presumably returns NULL when the name is unknown — confirm.
transliterator_t *get_transliterator(char *name);
// Applies the transliterator called `trans_name` to `str`, with `len` given
// as the input length.
// NOTE(review): presumably returns a newly allocated string the caller must
// free, or NULL on error — confirm against the implementation.
char *transliterate(char *trans_name, char *str, size_t len);

// Associates a (script, language) key with a range of transliterator names;
// returns false on failure.
bool transliteration_table_add_script_language(script_language_t script_language, transliterator_index_t index);
// Returns the range registered for (script, language).
// NOTE(review): presumably NULL_TRANSLITERATOR_INDEX when nothing is
// registered — confirm.
transliterator_index_t get_transliterator_index_for_script_language(script_t script, char *language);

// Runs `code` once for each transliterator name registered for
// (script, language).
//
//   script, language    forwarded to get_transliterator_index_for_script_language()
//   transliterator_var  an assignable variable in the caller's scope; it is set
//                       to the current name before each execution of `code`
//   code                statement or block executed per name
//
// Iteration stops early if cstring_array_get_string() returns NULL. `code` is
// expanded inside the macro's own for loop, so `break` and `continue` written
// in `code` act on that loop. The expansion declares the locals __trans_table,
// __index and __i.
//
// NOTE(review): the expansion already ends in `while (0);`, so writing a
// further `;` after the invocation yields an extra empty statement and the
// macro cannot be used as the unbraced body of an `if` that has an `else`.
// Existing callers may rely on the built-in semicolon, so it is left as is.
#define foreach_transliterator(script, language, transliterator_var, code) do {                                                     \
        transliteration_table_t *__trans_table = get_transliteration_table();                                                       \
        transliterator_index_t __index = get_transliterator_index_for_script_language(script, language);                            \
        for (size_t __i = __index.transliterator_index; __i < __index.transliterator_index + __index.num_transliterators; __i++) {  \
            transliterator_var = cstring_array_get_string(__trans_table->transliterator_names, (uint32_t)__i);                                \
            if (transliterator_var == NULL) break;                                                                                  \
            code;                                                                                                                   \
        }                                                                                                                           \
    } while (0);

// Serialization of the transliteration table: to an already-open stream, or
// to a file identified by name. Both return false on failure.
// NOTE(review): FILE requires <stdio.h>, which this header does not include
// directly; it presumably arrives through one of the project headers above.
bool transliteration_table_write(FILE *file);
bool transliteration_table_save(char *filename);

// Module setup/teardown
// NOTE(review): presumably init() creates an empty table and setup() loads one
// from `filename` — confirm, including whether a NULL filename selects
// DEFAULT_TRANSLITERATION_PATH.
bool transliteration_module_init(void);
bool transliteration_module_setup(char *filename);
// Releases the module's resources.
void transliteration_module_teardown(void);

#endif