libpostal-sys 0.1.1

#include "address_parser.h"
#include "address_dictionary.h"
#include "features.h"
#include "ngrams.h"
#include "scanner.h"

#include "graph_builder.h"

#include "klib/ksort.h"
#include "log/log.h"

// File names of the individual pieces of a saved parser. They are joined onto
// the directory given to address_parser_save / address_parser_load.
// Exactly one of the two model files is used, depending on the model type.
#define ADDRESS_PARSER_MODEL_FILENAME "address_parser.dat"
#define ADDRESS_PARSER_MODEL_FILENAME_CRF "address_parser_crf.dat"
#define ADDRESS_PARSER_VOCAB_FILENAME "address_parser_vocab.trie"
#define ADDRESS_PARSER_PHRASE_FILENAME "address_parser_phrases.dat"
#define ADDRESS_PARSER_POSTAL_CODES_FILENAME "address_parser_postal_codes.dat"

// Placeholder strings. NOTE(review): presumably substituted for rare words and
// rare numeric tokens during feature extraction -- confirm against the
// feature code, which is outside the part of the file reviewed here.
#define UNKNOWN_WORD "UNKNOWN"
#define UNKNOWN_NUMERIC "UNKNOWN_NUMERIC"

// Default value of parser_options_t.rare_word_threshold.
#define DEFAULT_RARE_WORD_THRESHOLD 50

// Module-level parser instance: assigned by address_parser_load and handed
// out by get_address_parser. NULL until a load succeeds.
static address_parser_t *parser = NULL;

// Kind of match reported for a token by word_or_phrase_at_index.
typedef enum {
    ADDRESS_PARSER_NULL_PHRASE,        // plain word, no phrase matched
    ADDRESS_PARSER_DICTIONARY_PHRASE,  // token belongs to an address dictionary phrase
    ADDRESS_PARSER_COMPONENT_PHRASE,   // token belongs to a phrase from the parser's phrase trie
    ADDRESS_PARSER_PREFIX_PHRASE,      // word starts with a dictionary prefix
    ADDRESS_PARSER_SUFFIX_PHRASE       // word ends with a dictionary suffix
} address_parser_phrase_type_t;

// Options used by address_parser_new(): default rare-word threshold and
// feature printing switched off.
static parser_options_t PARSER_DEFAULT_OPTIONS = {
    .rare_word_threshold = DEFAULT_RARE_WORD_THRESHOLD,
    .print_features = false
};

/*
Allocate a zero-initialized parser and copy the given options into it.

Returns NULL if the allocation fails. The caller owns the returned parser
and releases it with address_parser_destroy.
*/
address_parser_t *address_parser_new_options(parser_options_t options) {
    // Named differently from the file-scope `parser` so the global is not shadowed.
    address_parser_t *new_parser = calloc(1, sizeof(address_parser_t));
    if (new_parser == NULL) return NULL;

    new_parser->options = options;
    return new_parser;
}

// Create a parser configured with PARSER_DEFAULT_OPTIONS.
address_parser_t *address_parser_new(void) {
    address_parser_t *created = address_parser_new_options(PARSER_DEFAULT_OPTIONS);
    return created;
}

// Return the module-level parser (NULL if none has been loaded).
address_parser_t *get_address_parser(void) {
    address_parser_t *current = parser;
    return current;
}

// Set the print_features option on the module-level parser.
// Returns false (and changes nothing) when no parser is loaded.
bool address_parser_print_features(bool print_features) {
    bool loaded = (parser != NULL);

    if (loaded) {
        parser->options.print_features = print_features;
    }

    return loaded;
}

/*
Write a parser to output_dir as four files: the model (averaged perceptron
or CRF, chosen by self->model_type), the vocabulary trie, the phrases file
(phrase trie followed by the phrase-type values) and the postal codes file
(postal code trie followed by the postal code context graph).

Returns true only if every file was written and closed successfully.
Returns false if self or output_dir is NULL, the model type is unknown,
a required member is NULL, or any write fails. Files written before the
failing step are left on disk.
*/
bool address_parser_save(address_parser_t *self, char *output_dir) {
    if (self == NULL || output_dir == NULL) return false;

    char *model_filename = NULL;
    if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
        model_filename = ADDRESS_PARSER_MODEL_FILENAME;
    } else if (self->model_type == ADDRESS_PARSER_TYPE_CRF) {
        model_filename = ADDRESS_PARSER_MODEL_FILENAME_CRF;
    } else {
        return false;
    }

    // Everything released at the exit label is declared before the first goto.
    bool saved = false;
    FILE *phrases_file = NULL;
    FILE *postal_codes_file = NULL;
    int close_status = 0;

    char_array *path = char_array_new_size(strlen(output_dir));
    if (path == NULL) return false;

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, model_filename);
    char *model_path = char_array_get_string(path);

    if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
        if (!averaged_perceptron_save(self->model.ap, model_path)) {
            log_info("Error in averaged_perceptron_save\n");
            goto exit_address_parser_save;
        }
    } else if (self->model_type == ADDRESS_PARSER_TYPE_CRF) {
        if (!crf_save(self->model.crf, model_path)) {
            log_info("Error in crf_save\n");
            goto exit_address_parser_save;
        }
    }

    char_array_clear(path);

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_VOCAB_FILENAME);
    char *vocab_path = char_array_get_string(path);

    if (!trie_save(self->vocab, vocab_path)) {
        goto exit_address_parser_save;
    }

    // Validate before fopen so an existing phrases file is not truncated
    // when there is nothing to write.
    if (self->phrases == NULL || self->phrase_types == NULL) {
        goto exit_address_parser_save;
    }

    char_array_clear(path);

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_PHRASE_FILENAME);
    char *phrases_path = char_array_get_string(path);

    // Binary mode: the loader opens this file with "rb".
    phrases_file = fopen(phrases_path, "wb+");
    if (phrases_file == NULL) {
        goto exit_address_parser_save;
    }

    if (!trie_write(self->phrases, phrases_file)) {
        goto exit_address_parser_save;
    }

    size_t num_phrase_types = self->phrase_types->n;
    if (!file_write_uint64(phrases_file, num_phrase_types)) {
        goto exit_address_parser_save;
    }

    for (size_t i = 0; i < num_phrase_types; i++) {
        address_parser_types_t phrase_type_value = self->phrase_types->a[i];
        if (!file_write_uint32(phrases_file, phrase_type_value.value)) {
            goto exit_address_parser_save;
        }
    }

    // fclose flushes buffered data, so a failure here means the file is incomplete.
    close_status = fclose(phrases_file);
    phrases_file = NULL;
    if (close_status != 0) {
        goto exit_address_parser_save;
    }

    if (self->postal_codes == NULL || self->postal_code_contexts == NULL) {
        goto exit_address_parser_save;
    }

    char_array_clear(path);

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_POSTAL_CODES_FILENAME);
    char *postal_codes_path = char_array_get_string(path);

    postal_codes_file = fopen(postal_codes_path, "wb+");
    if (postal_codes_file == NULL) {
        goto exit_address_parser_save;
    }

    if (!trie_write(self->postal_codes, postal_codes_file)) {
        goto exit_address_parser_save;
    }

    if (!graph_write(self->postal_code_contexts, postal_codes_file)) {
        goto exit_address_parser_save;
    }

    close_status = fclose(postal_codes_file);
    postal_codes_file = NULL;
    if (close_status != 0) {
        goto exit_address_parser_save;
    }

    saved = true;

exit_address_parser_save:
    // Single cleanup point: release whatever is still held on any path.
    if (phrases_file != NULL) {
        fclose(phrases_file);
    }
    if (postal_codes_file != NULL) {
        fclose(postal_codes_file);
    }
    char_array_destroy(path);

    return saved;
}

// True when the postal code context graph has an edge from postal_code_id to admin_id.
static bool postal_code_context_exists(address_parser_t *self, uint32_t postal_code_id, uint32_t admin_id) {
    bool has_context = graph_has_edge(self->postal_code_contexts, postal_code_id, admin_id);
    return has_context;
}

/*
Load the module-level parser from dir (LIBPOSTAL_ADDRESS_PARSER_DIR when
dir is NULL).

The averaged perceptron model file is tried first; the CRF model file is
tried only when the perceptron file does not exist. The vocabulary trie,
the phrases file, the postal codes file and a fresh parsing context are
then loaded/created.

Returns true on success. Returns false if a parser is already loaded, if
no model file is found, or if any piece fails to load. On failure the
module-level parser is destroyed and reset to NULL, so a later call can
retry and get_address_parser() never returns a freed pointer.
*/
bool address_parser_load(char *dir) {
    if (parser != NULL) return false;
    if (dir == NULL) {
        dir = LIBPOSTAL_ADDRESS_PARSER_DIR;
    }

    // Everything released at the exit label is declared before the first goto.
    FILE *phrases_file = NULL;
    FILE *postal_codes_file = NULL;
    uint32_array *phrase_type_values = NULL;

    char_array *path = char_array_new_size(strlen(dir));
    if (path == NULL) return false;

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_MODEL_FILENAME);
    char *model_path = char_array_get_string(path);

    if (file_exists(model_path)) {
        averaged_perceptron_t *ap_model = averaged_perceptron_load(model_path);
        if (ap_model == NULL) {
            char_array_destroy(path);
            log_error("Averaged perceptron model could not be loaded\n");
            return false;
        }

        parser = address_parser_new();
        if (parser == NULL) {
            averaged_perceptron_destroy(ap_model);
            char_array_destroy(path);
            return false;
        }
        parser->model_type = ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON;
        parser->model.ap = ap_model;
    } else {
        // No perceptron file: fall back to the CRF model file.
        char_array_clear(path);
        char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_MODEL_FILENAME_CRF);
        model_path = char_array_get_string(path);

        if (file_exists(model_path)) {
            crf_t *crf_model = crf_load(model_path);
            if (crf_model == NULL) {
                char_array_destroy(path);
                log_error("CRF model could not be loaded\n");
                return false;
            }

            parser = address_parser_new();
            if (parser == NULL) {
                crf_destroy(crf_model);
                char_array_destroy(path);
                return false;
            }
            parser->model_type = ADDRESS_PARSER_TYPE_CRF;
            parser->model.crf = crf_model;
        }
    }

    if (parser == NULL) {
        char_array_destroy(path);
        log_error("Could not find parser model file of known type\n");
        return false;
    }

    char_array_clear(path);

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_VOCAB_FILENAME);

    char *vocab_path = char_array_get_string(path);

    trie_t *vocab = trie_load(vocab_path);

    if (vocab == NULL) {
        goto exit_address_parser_created;
    }

    parser->vocab = vocab;

    char_array_clear(path);

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_PHRASE_FILENAME);
    char *phrases_path = char_array_get_string(path);

    phrases_file = fopen(phrases_path, "rb");
    if (phrases_file == NULL) {
        goto exit_address_parser_created;
    }

    parser->phrases = trie_read(phrases_file);
    if (parser->phrases == NULL) {
        goto exit_address_parser_created;
    }

    uint64_t num_phrase_types = 0;

    if (!file_read_uint64(phrases_file, &num_phrase_types)) {
        goto exit_address_parser_created;
    }

    parser->phrase_types = address_parser_types_array_new_size(num_phrase_types);
    if (parser->phrase_types == NULL) {
        goto exit_address_parser_created;
    }

    // The values are read in bulk into a scratch array, then wrapped one by one.
    phrase_type_values = uint32_array_new_size(num_phrase_types);
    if (phrase_type_values == NULL) {
        goto exit_address_parser_created;
    }

    if (!file_read_uint32_array(phrases_file, phrase_type_values->a, num_phrase_types)) {
        goto exit_address_parser_created;
    }
    phrase_type_values->n = num_phrase_types;

    for (size_t i = 0; i < phrase_type_values->n; i++) {
        uint32_t phrase_type_value = phrase_type_values->a[i];
        address_parser_types_t phrase_type = {.value = phrase_type_value};
        address_parser_types_array_push(parser->phrase_types, phrase_type);
    }

    uint32_array_destroy(phrase_type_values);
    phrase_type_values = NULL;

    fclose(phrases_file);
    phrases_file = NULL;

    char_array_clear(path);

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_POSTAL_CODES_FILENAME);

    char *postal_codes_path = char_array_get_string(path);

    postal_codes_file = fopen(postal_codes_path, "rb");
    if (postal_codes_file == NULL) {
        goto exit_address_parser_created;
    }

    parser->postal_codes = trie_read(postal_codes_file);
    if (parser->postal_codes == NULL) {
        goto exit_address_parser_created;
    }

    parser->postal_code_contexts = graph_read(postal_codes_file);

    if (parser->postal_code_contexts == NULL) {
        goto exit_address_parser_created;
    }

    fclose(postal_codes_file);
    postal_codes_file = NULL;

    parser->context = address_parser_context_new();
    if (parser->context == NULL) {
        goto exit_address_parser_created;
    }

    char_array_destroy(path);
    return true;

exit_address_parser_created:
    if (phrase_type_values != NULL) {
        uint32_array_destroy(phrase_type_values);
    }
    if (phrases_file != NULL) {
        fclose(phrases_file);
    }
    if (postal_codes_file != NULL) {
        fclose(postal_codes_file);
    }
    address_parser_destroy(parser);
    // Reset the global so it does not dangle and a later load is not refused.
    parser = NULL;
    char_array_destroy(path);
    return false;
}

/*
Release a parser and everything it holds: the model (according to
model_type), the context, the vocabulary, phrase and postal code tries,
the phrase types array and the postal code context graph.
Members that are NULL are skipped; a NULL parser is a no-op.
*/
void address_parser_destroy(address_parser_t *self) {
    if (self == NULL) return;

    switch (self->model_type) {
        case ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON:
            if (self->model.ap != NULL) {
                averaged_perceptron_destroy(self->model.ap);
            }
            break;
        case ADDRESS_PARSER_TYPE_CRF:
            if (self->model.crf != NULL) {
                crf_destroy(self->model.crf);
            }
            break;
        default:
            // Unknown model type: there is no model to release.
            break;
    }

    if (self->context != NULL) address_parser_context_destroy(self->context);

    if (self->vocab != NULL) trie_destroy(self->vocab);

    if (self->phrases != NULL) trie_destroy(self->phrases);

    if (self->phrase_types != NULL) address_parser_types_array_destroy(self->phrase_types);

    if (self->postal_codes != NULL) trie_destroy(self->postal_codes);

    if (self->postal_code_contexts != NULL) graph_destroy(self->postal_code_contexts);

    free(self);
}

// Look up word in the parser's vocabulary trie and return the value stored
// for it, or 0 when the word is not in the vocabulary.
static inline uint32_t word_vocab_frequency(address_parser_t *parser, char *word) {
    uint32_t count = 0;
    if (!trie_get_data(parser->vocab, word, &count)) {
        return 0;
    }
    return count;
}

// Normalize `token` (a span of `str`) with ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS;
// the result is delivered through `array` by normalize_token.
inline void address_parser_normalize_token(cstring_array *array, char *str, token_t token) {
    normalize_token(array, str, token, ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS);
}

// Same as address_parser_normalize_token but with
// ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS; address_parser_context_fill
// uses this form to build the strings searched for component and postal code phrases.
static inline void address_parser_normalize_phrase_token(cstring_array *array, char *str, token_t token) {
    normalize_token(array, str, token, ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS);
}

// Run normalize_string_latin over the whole NUL-terminated string with
// ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS and return its result unchanged.
// NOTE(review): the result is presumably heap-allocated and owned by the
// caller -- confirm against normalize_string_latin.
inline char *address_parser_normalize_string(char *str) {
    size_t str_len = strlen(str);
    char *normalized = normalize_string_latin(str, str_len, ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS);
    return normalized;
}


// Destroy one context member with the given destructor, skipping NULL members.
#define CONTEXT_DESTROY_FIELD(destroy_fn, field) \
    do {                                         \
        if ((field) != NULL) {                   \
            destroy_fn(field);                   \
        }                                        \
    } while (0)

/*
Release a parsing context together with every buffer and array it holds.
Members that are NULL are skipped, and a NULL context is a no-op.
The language and country pointers are not freed here.
*/
void address_parser_context_destroy(address_parser_context_t *self) {
    if (self == NULL) return;

    // Phrase string buffers
    CONTEXT_DESTROY_FIELD(char_array_destroy, self->phrase);
    CONTEXT_DESTROY_FIELD(char_array_destroy, self->context_phrase);
    CONTEXT_DESTROY_FIELD(char_array_destroy, self->long_context_phrase);

    CONTEXT_DESTROY_FIELD(char_array_destroy, self->component_phrase);
    CONTEXT_DESTROY_FIELD(char_array_destroy, self->context_component_phrase);
    CONTEXT_DESTROY_FIELD(char_array_destroy, self->long_context_component_phrase);

    CONTEXT_DESTROY_FIELD(char_array_destroy, self->prefix_phrase);
    CONTEXT_DESTROY_FIELD(char_array_destroy, self->context_prefix_phrase);
    CONTEXT_DESTROY_FIELD(char_array_destroy, self->long_context_prefix_phrase);

    CONTEXT_DESTROY_FIELD(char_array_destroy, self->suffix_phrase);
    CONTEXT_DESTROY_FIELD(char_array_destroy, self->context_suffix_phrase);
    CONTEXT_DESTROY_FIELD(char_array_destroy, self->long_context_suffix_phrase);

    // N-gram and sub-token scratch space
    CONTEXT_DESTROY_FIELD(cstring_array_destroy, self->ngrams);
    CONTEXT_DESTROY_FIELD(char_array_destroy, self->sub_token);
    CONTEXT_DESTROY_FIELD(token_array_destroy, self->sub_tokens);
    CONTEXT_DESTROY_FIELD(uint32_array_destroy, self->separators);

    // Normalized strings and their token spans
    CONTEXT_DESTROY_FIELD(cstring_array_destroy, self->normalized);
    CONTEXT_DESTROY_FIELD(token_array_destroy, self->normalized_tokens);
    CONTEXT_DESTROY_FIELD(cstring_array_destroy, self->normalized_admin);
    CONTEXT_DESTROY_FIELD(token_array_destroy, self->normalized_admin_tokens);

    // Feature buffers
    CONTEXT_DESTROY_FIELD(cstring_array_destroy, self->features);
    CONTEXT_DESTROY_FIELD(cstring_array_destroy, self->prev_tag_features);
    CONTEXT_DESTROY_FIELD(cstring_array_destroy, self->prev2_tag_features);

    CONTEXT_DESTROY_FIELD(tokenized_string_destroy, self->tokenized_str);

    // Phrase matches and per-token memberships
    CONTEXT_DESTROY_FIELD(phrase_array_destroy, self->address_dictionary_phrases);
    CONTEXT_DESTROY_FIELD(int64_array_destroy, self->address_phrase_memberships);

    CONTEXT_DESTROY_FIELD(phrase_array_destroy, self->component_phrases);
    CONTEXT_DESTROY_FIELD(int64_array_destroy, self->component_phrase_memberships);

    CONTEXT_DESTROY_FIELD(phrase_array_destroy, self->postal_code_phrases);
    CONTEXT_DESTROY_FIELD(int64_array_destroy, self->postal_code_phrase_memberships);

    CONTEXT_DESTROY_FIELD(phrase_array_destroy, self->prefix_phrases);
    CONTEXT_DESTROY_FIELD(phrase_array_destroy, self->suffix_phrases);

    free(self);
}

#undef CONTEXT_DESTROY_FIELD

/*
Allocate a parsing context and all of its buffers and arrays.

Returns the new context, or NULL if any allocation fails. On failure,
everything allocated so far is released through
address_parser_context_destroy.

The struct is allocated with calloc so that every member starts out NULL:
the failure path hands a partially built context to
address_parser_context_destroy, which decides what to free by testing each
member against NULL. With uninitialized memory it would free garbage pointers.
*/
address_parser_context_t *address_parser_context_new(void) {
    address_parser_context_t *context = calloc(1, sizeof(address_parser_context_t));

    if (context == NULL) return NULL;

    context->language = NULL;
    context->country = NULL;

    // Phrase string buffers
    context->phrase = char_array_new();
    if (context->phrase == NULL) goto exit_address_parser_context_allocated;

    context->context_phrase = char_array_new();
    if (context->context_phrase == NULL) goto exit_address_parser_context_allocated;

    context->long_context_phrase = char_array_new();
    if (context->long_context_phrase == NULL) goto exit_address_parser_context_allocated;

    context->component_phrase = char_array_new();
    if (context->component_phrase == NULL) goto exit_address_parser_context_allocated;

    context->context_component_phrase = char_array_new();
    if (context->context_component_phrase == NULL) goto exit_address_parser_context_allocated;

    context->long_context_component_phrase = char_array_new();
    if (context->long_context_component_phrase == NULL) goto exit_address_parser_context_allocated;

    context->prefix_phrase = char_array_new();
    if (context->prefix_phrase == NULL) goto exit_address_parser_context_allocated;

    context->context_prefix_phrase = char_array_new();
    if (context->context_prefix_phrase == NULL) goto exit_address_parser_context_allocated;

    context->long_context_prefix_phrase = char_array_new();
    if (context->long_context_prefix_phrase == NULL) goto exit_address_parser_context_allocated;

    context->suffix_phrase = char_array_new();
    if (context->suffix_phrase == NULL) goto exit_address_parser_context_allocated;

    context->context_suffix_phrase = char_array_new();
    if (context->context_suffix_phrase == NULL) goto exit_address_parser_context_allocated;

    context->long_context_suffix_phrase = char_array_new();
    if (context->long_context_suffix_phrase == NULL) goto exit_address_parser_context_allocated;

    // N-gram and sub-token scratch space
    context->ngrams = cstring_array_new();
    if (context->ngrams == NULL) goto exit_address_parser_context_allocated;

    context->sub_token = char_array_new();
    if (context->sub_token == NULL) goto exit_address_parser_context_allocated;

    context->sub_tokens = token_array_new();
    if (context->sub_tokens == NULL) goto exit_address_parser_context_allocated;

    context->separators = uint32_array_new();
    if (context->separators == NULL) goto exit_address_parser_context_allocated;

    // Normalized strings and their token spans
    context->normalized = cstring_array_new();
    if (context->normalized == NULL) goto exit_address_parser_context_allocated;

    context->normalized_tokens = token_array_new();
    if (context->normalized_tokens == NULL) goto exit_address_parser_context_allocated;

    context->normalized_admin = cstring_array_new();
    if (context->normalized_admin == NULL) goto exit_address_parser_context_allocated;

    context->normalized_admin_tokens = token_array_new();
    if (context->normalized_admin_tokens == NULL) goto exit_address_parser_context_allocated;

    // Feature buffers
    context->features = cstring_array_new();
    if (context->features == NULL) goto exit_address_parser_context_allocated;

    context->prev_tag_features = cstring_array_new();
    if (context->prev_tag_features == NULL) goto exit_address_parser_context_allocated;

    context->prev2_tag_features = cstring_array_new();
    if (context->prev2_tag_features == NULL) goto exit_address_parser_context_allocated;

    context->tokenized_str = tokenized_string_new();
    if (context->tokenized_str == NULL) goto exit_address_parser_context_allocated;

    // Phrase matches and per-token memberships
    context->address_dictionary_phrases = phrase_array_new();
    if (context->address_dictionary_phrases == NULL) goto exit_address_parser_context_allocated;

    context->address_phrase_memberships = int64_array_new();
    if (context->address_phrase_memberships == NULL) goto exit_address_parser_context_allocated;

    context->component_phrases = phrase_array_new();
    if (context->component_phrases == NULL) goto exit_address_parser_context_allocated;

    context->component_phrase_memberships = int64_array_new();
    if (context->component_phrase_memberships == NULL) goto exit_address_parser_context_allocated;

    context->postal_code_phrases = phrase_array_new();
    if (context->postal_code_phrases == NULL) goto exit_address_parser_context_allocated;

    context->postal_code_phrase_memberships = int64_array_new();
    if (context->postal_code_phrase_memberships == NULL) goto exit_address_parser_context_allocated;

    context->prefix_phrases = phrase_array_new();
    if (context->prefix_phrases == NULL) goto exit_address_parser_context_allocated;

    context->suffix_phrases = phrase_array_new();
    if (context->suffix_phrases == NULL) goto exit_address_parser_context_allocated;

    return context;

exit_address_parser_context_allocated:
    address_parser_context_destroy(context);
    return NULL;
}

// A component phrase is valid when at least one of the strings it spans
// (indices phrase.start .. phrase.start + phrase.len - 1 of `strings`) is
// not made up solely of digits. An empty phrase is therefore invalid.
bool is_valid_component_phrase(cstring_array *strings, phrase_t phrase) {
    for (uint32_t idx = phrase.start; idx < phrase.start + phrase.len; idx++) {
        char *token_str = cstring_array_get_string(strings, idx);
        bool all_digits = string_is_digit(token_str, strlen(token_str));
        if (!all_digits) {
            return true;
        }
    }
    return false;
}

/*
Populate `context` for one tokenized input string.

Steps, in order:
  1. Store the language and country pointers (not copied).
  2. Rebuild the two normalized views of the tokens (`normalized` and
     `normalized_admin`) together with the token spans pointing into them.
  3. Search the address dictionaries for phrases and record, per token,
     which phrase it belongs to.
  4. Search each un-normalized token for a dictionary prefix and suffix.
  5. Search the parser's phrase trie for component phrases, then drop the
     memberships of phrases that is_valid_component_phrase rejects.
  6. Search the parser's postal code trie for postal code phrases.

All result arrays in the context are cleared before being refilled, so the
same context can be reused across calls.
*/
void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country) {
    uint32_t token_index;
    char *word;
    phrase_t phrase;

    context->language = language;
    context->country = country;

    cstring_array *normalized = context->normalized;
    token_array *normalized_tokens = context->normalized_tokens;
    cstring_array_clear(normalized);
    token_array_clear(normalized_tokens);

    cstring_array *normalized_admin = context->normalized_admin;
    token_array *normalized_admin_tokens = context->normalized_admin_tokens;
    cstring_array_clear(normalized_admin);
    token_array_clear(normalized_admin_tokens);

    char *str = tokenized_str->str;
    token_array *tokens = tokenized_str->tokens;

    // For every input token, append its normalized form to both string arrays
    // and push a token whose offset/len describe where it landed in the
    // array's backing buffer (->str).
    cstring_array_foreach(tokenized_str->strings, token_index, word, {
        token_t token = tokens->a[token_index];

        size_t token_offset = normalized->str->n;
        address_parser_normalize_token(normalized, str, token);
        size_t token_len;
        // The length leaves out one trailing byte of what was appended
        // (presumably the NUL terminator -- confirm against normalize_token).
        if (normalized->str->n > token_offset) {
           token_len = normalized->str->n - 1 - token_offset;
        } else {
            token_len = 0;
        }
        token_t normalized_token;
        normalized_token.offset = token_offset;
        normalized_token.len = token_len;
        normalized_token.type = token.type;
        token_array_push(normalized_tokens, normalized_token);

        // Same bookkeeping for the admin-normalized form.
        size_t admin_token_offset = normalized_admin->str->n;
        address_parser_normalize_phrase_token(normalized_admin, str, token);
        size_t admin_token_len;
        if (normalized_admin->str->n > admin_token_offset) {
           admin_token_len = normalized_admin->str->n - 1 - admin_token_offset;
        } else {
            admin_token_len = 0;
        }
        token_t normalized_admin_token;
        normalized_admin_token.offset = admin_token_offset;
        normalized_admin_token.len = admin_token_len;
        normalized_admin_token.type = token.type;
        token_array_push(normalized_admin_tokens, normalized_admin_token);
    })

    // Raw backing buffers that the token offsets above index into.
    char *normalized_str = normalized->str->a;
    char *normalized_str_admin = normalized_admin->str->a;

    /*
    Address dictionary phrases
    --------------------------
    Recognizing phrases that occur in libpostal's dictionaries.

    Note: if the dictionaries are updated to try to improve the parser,
    we'll need to retrain. This can be done without rebuilding the
    training data (a long-running process which can take up to a week),
    but will require running address_parser_train, the main training script.
    */

    phrase_array_clear(context->address_dictionary_phrases);
    int64_array_clear(context->address_phrase_memberships);

    phrase_array *address_dictionary_phrases = context->address_dictionary_phrases;
    int64_array *address_phrase_memberships = context->address_phrase_memberships;

    size_t num_tokens = tokens->n;

    // The boolean result is not used; the phrases are delivered through the
    // array passed by address.
    bool have_address_phrases = search_address_dictionaries_tokens_with_phrases(normalized_str, normalized_tokens, NULL, &address_dictionary_phrases);
    token_phrase_memberships(address_dictionary_phrases, address_phrase_memberships, num_tokens);

    phrase_array_clear(context->prefix_phrases);
    phrase_array_clear(context->suffix_phrases);

    // One prefix entry and one suffix entry per token, so both arrays can be
    // indexed by token position. These searches use the un-normalized token text.
    for (size_t i = 0; i < num_tokens; i++) {
        token_t token = tokens->a[i];
        char *word_pre_norm = tokenized_string_get_token(tokenized_str, i);

        phrase_t prefix_phrase = search_address_dictionaries_prefix(word_pre_norm, token.len, NULL);
        phrase_array_push(context->prefix_phrases, prefix_phrase);

        phrase_t suffix_phrase = search_address_dictionaries_suffix(word_pre_norm, token.len, NULL);
        phrase_array_push(context->suffix_phrases, suffix_phrase);
    }

    /*
    Component phrases
    -----------------
    Precomputed phrases for cities, states, countries, etc. from the training data

    Note: if the training data has lots of mislabeled examples (e.g. Brooklyn as city
    instead of a city_district), this may cause the parser to get confused. It will
    penalize itself for getting the wrong answer when really the underlying data
    is simply ambiguous. In the OSM training data a lot of work has been done to
    ensure that there's little or no systematic mislabeling. As such, other data
    sets shouldn't be added willy-nilly unless the labels are consistent.
    */

    phrase_array_clear(context->component_phrases);
    int64_array_clear(context->component_phrase_memberships);

    phrase_array *component_phrases = context->component_phrases;
    int64_array *component_phrase_memberships = context->component_phrase_memberships;

    bool have_component_phrases = trie_search_tokens_with_phrases(parser->phrases, normalized_str_admin, normalized_admin_tokens, &component_phrases);
    token_phrase_memberships(component_phrases, component_phrase_memberships, num_tokens);

    // Phrases made up only of digit strings are not treated as component
    // phrases: their tokens are reset to "no phrase". The phrase itself stays
    // in component_phrases; only the memberships are cleared.
    for (size_t i = 0; i < component_phrases->n; i++) {
        phrase_t phrase = component_phrases->a[i];
        if (!is_valid_component_phrase(context->normalized_admin, phrase)) {
            for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
                component_phrase_memberships->a[j] = NULL_PHRASE_MEMBERSHIP;
            }
        }
    }

    // Postal code phrases: same search as above, against the postal code trie.
    phrase_array_clear(context->postal_code_phrases);
    int64_array_clear(context->postal_code_phrase_memberships);

    phrase_array *postal_code_phrases = context->postal_code_phrases;
    int64_array *postal_code_phrase_memberships = context->postal_code_phrase_memberships;

    bool have_postal_code_phrases = trie_search_tokens_with_phrases(parser->postal_codes, normalized_str_admin, normalized_admin_tokens, &postal_code_phrases);
    token_phrase_memberships(postal_code_phrases, postal_code_phrase_memberships, num_tokens);

}

/*
Return the phrase that token i belongs to, or NULL_PHRASE when there is none.

phrase_memberships maps a token position to an index into `phrases`
(NULL_PHRASE_MEMBERSHIP meaning "not part of any phrase").
NULL_PHRASE is also returned when either array is NULL, when i is past the
end of the memberships, or when the stored index does not refer to an
element of `phrases`.
*/
static inline phrase_t phrase_at_index(phrase_array *phrases, int64_array *phrase_memberships, uint32_t i) {
    // `i >= n` rather than `i > n - 1`: the latter wraps around when n == 0
    // and would let every index through.
    if (phrases == NULL || phrase_memberships == NULL || i >= phrase_memberships->n) {
        return NULL_PHRASE;
    }

    int64_t membership = phrase_memberships->a[i];
    if (membership == NULL_PHRASE_MEMBERSHIP) {
        return NULL_PHRASE;
    }

    // Never read outside the phrases array.
    if (membership < 0 || (size_t)membership >= phrases->n) {
        return NULL_PHRASE;
    }

    return phrases->a[membership];
}

// Copy the first prefix_phrase.len bytes of word into prefix_phrase_array
// (cleared first) and return that buffer's string. `len` is not used.
// The returned pointer refers to prefix_phrase_array's storage.
char *phrase_prefix(char *word, size_t len, phrase_t prefix_phrase, char_array *prefix_phrase_array) {
    char_array_clear(prefix_phrase_array);
    char_array_add_len(prefix_phrase_array, word, (size_t)prefix_phrase.len);
    return char_array_get_string(prefix_phrase_array);
}

// Copy the last suffix_phrase.len bytes of word (which is len bytes long)
// into suffix_phrase_array (cleared first) and return that buffer's string.
// The returned pointer refers to suffix_phrase_array's storage.
char *phrase_suffix(char *word, size_t len, phrase_t suffix_phrase, char_array *suffix_phrase_array) {
    size_t suffix_len = suffix_phrase.len;
    char *suffix_start = word + (len - suffix_len);

    char_array_clear(suffix_phrase_array);
    char_array_add_len(suffix_phrase_array, suffix_start, suffix_len);
    return char_array_get_string(suffix_phrase_array);
}

/*
Decide whether a dictionary phrase should be used as a phrase by the parser.

The phrase's data field is the index of its expansions in the address
dictionary. The phrase is valid when its components include at least one of
street, house number, name, category, near, unit, level, entrance,
staircase, postal code or PO box, and at least one of its expansions is not
in the toponym dictionary. Returns false (with a warning) when the
expansions cannot be found.
*/
bool is_valid_dictionary_phrase(phrase_t phrase) {
    uint32_t expansion_index = phrase.data;
    address_expansion_value_t *expansion_value = address_dictionary_get_expansions(expansion_index);

    if (expansion_value == NULL) {
        log_warn("expansion_value is NULL for index %u\n", expansion_index);
        return false;
    }

    uint32_t address_phrase_types = expansion_value->components;

    bool has_relevant_component = address_phrase_types & (LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_PO_BOX);
    if (!has_relevant_component) {
        return false;
    }

    size_t num_expansions = expansion_value->expansions->n;
    size_t idx = 0;
    while (idx < num_expansions) {
        address_expansion_t expansion = expansion_value->expansions->a[idx];
        if (!address_expansion_in_dictionary(expansion, DICTIONARY_TOPONYM)) {
            return true;
        }
        idx++;
    }

    return false;
}

// Result of word_or_phrase_at_index: the string to use for a token, the kind
// of match it came from, and the matched phrase (NULL_PHRASE for a plain word).
typedef struct address_parser_phrase {
    char *str;                          // phrase text or normalized word; points into context-owned storage
    address_parser_phrase_type_t type;  // which kind of match produced `str`
    phrase_t phrase;                    // the matching phrase, prefix or suffix
} address_parser_phrase_t;

// True for the phrase types whose string is a single word: no phrase,
// suffix phrase, or prefix phrase.
static inline bool is_plain_word_phrase_type(address_parser_phrase_type_t type) {
    switch (type) {
        case ADDRESS_PARSER_NULL_PHRASE:
        case ADDRESS_PARSER_SUFFIX_PHRASE:
        case ADDRESS_PARSER_PREFIX_PHRASE:
            return true;
        default:
            return false;
    }
}

/*
Describe token i as either a phrase or a plain word.

Candidates are tried in this order and the first match wins:
  1. the address dictionary phrase containing the token, if it passes
     is_valid_dictionary_phrase and is at least as long as the component
     phrase at the same position;
  2. the component phrase containing the token;
  3. a street suffix on the word (e.g. "straße");
  4. a prefix on the word, unless its components are exactly
     LIBPOSTAL_ADDRESS_ANY (elisions such as l', d');
  5. the plain normalized word.

For phrases the string is assembled into one of the context's phrase
buffers (the long_context_* buffer when long_context is true). For words it
points at the normalized token. Either way the returned string is owned by
the context.

`parser` and `tokenized` are not used by the current implementation.
*/
static address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser, tokenized_string_t *tokenized, address_parser_context_t *context, uint32_t i, bool long_context) {
    phrase_t phrase = phrase_at_index(context->address_dictionary_phrases, context->address_phrase_memberships, i);
    phrase_t component_phrase = phrase_at_index(context->component_phrases, context->component_phrase_memberships, i);

    if (phrase.len > 0 && is_valid_dictionary_phrase(phrase) && component_phrase.len <= phrase.len) {
        char *phrase_string = cstring_array_get_phrase(context->normalized, long_context ? context->long_context_phrase : context->context_phrase, phrase);

        return (address_parser_phrase_t){
            .str = phrase_string,
            .type = ADDRESS_PARSER_DICTIONARY_PHRASE,
            .phrase = phrase
        };
    }

    if (component_phrase.len > 0) {
        char *phrase_string = cstring_array_get_phrase(context->normalized_admin, long_context ? context->long_context_component_phrase : context->context_component_phrase, component_phrase);

        return (address_parser_phrase_t){
            .str = phrase_string,
            .type = ADDRESS_PARSER_COMPONENT_PHRASE,
            .phrase = component_phrase
        };
    }

    phrase_t prefix_phrase = context->prefix_phrases->a[i];
    phrase_t suffix_phrase = context->suffix_phrases->a[i];

    char *word = cstring_array_get_string(context->normalized, i);

    // Suffixes like straße, etc.
    if (suffix_phrase.len > 0) {
        address_expansion_value_t *suffix_value = address_dictionary_get_expansions(suffix_phrase.data);

        // The lookup can come back NULL; treat that as "no usable suffix".
        if (suffix_value != NULL && (suffix_value->components & LIBPOSTAL_ADDRESS_STREET)) {
            return (address_parser_phrase_t){
                .str = word,
                .type = ADDRESS_PARSER_SUFFIX_PHRASE,
                .phrase = suffix_phrase
            };
        }
    }

    // Prefixes like hinter, etc.
    if (prefix_phrase.len > 0) {
        address_expansion_value_t *prefix_value = address_dictionary_get_expansions(prefix_phrase.data);

        // Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category.
        // This is the same test as the former `components ^ LIBPOSTAL_ADDRESS_ANY`,
        // which is non-zero exactly when the two values differ.
        if (prefix_value != NULL && prefix_value->components != LIBPOSTAL_ADDRESS_ANY) {
            return (address_parser_phrase_t){
                .str = word,
                .type = ADDRESS_PARSER_PREFIX_PHRASE,
                .phrase = prefix_phrase
            };
        }
    }

    return (address_parser_phrase_t){
        .str = word,
        .type = ADDRESS_PARSER_NULL_PHRASE,
        .phrase = NULL_PHRASE
    };
}

/*
Scans a phrase membership array for the nearest slot that belongs to a phrase
(i.e. is not NULL_PHRASE_MEMBERSHIP), beginning at position start (inclusive).

direction == -1 walks towards index 0, direction == 1 walks towards the end
of the array; any other direction performs no scan.

Returns the index of the first matching slot, or -1 if the array is NULL,
the direction is unsupported, or no slot matches.
*/
static inline int64_t phrase_index(int64_array *phrase_memberships, size_t start, int8_t direction) {
    if (phrase_memberships == NULL) return -1;

    int64_t *slots = phrase_memberships->a;

    switch (direction) {
        case -1: {
            // Signed cursor so the scan can step below zero and terminate
            ssize_t pos = (ssize_t)start;
            while (pos >= 0) {
                if (slots[pos] != NULL_PHRASE_MEMBERSHIP) return (int64_t)pos;
                pos--;
            }
            break;
        }
        case 1: {
            size_t count = phrase_memberships->n;
            size_t pos = start;
            while (pos < count) {
                if (slots[pos] != NULL_PHRASE_MEMBERSHIP) return (int64_t)pos;
                pos++;
            }
            break;
        }
        default:
            break;
    }

    return -1;
}


/*
Finds the first numeric token at or after position start that is not part of
an address dictionary phrase or a component phrase.

A token counts as numeric when its type is NUMERIC or IDEOGRAPHIC_NUMBER.

Returns the token index, or -1 when tokenized/context/tokens is NULL, start is
past the end of the token array, or no such token exists.

NOTE(review): the previous version tested "type != NUMERIC && type !=
IDEOGRAPHIC_NUMBER", i.e. it returned the first NON-numeric token, which
contradicts the function name. Confirm no caller relied on that behavior.
*/
static inline int64_t next_numeric_token_index(tokenized_string_t *tokenized, address_parser_context_t *context, size_t start) {
    if (tokenized == NULL || context == NULL) return -1;

    token_array *tokens = tokenized->tokens;

    // ">=" instead of "start > n - 1": the subtraction wraps around when n == 0
    if (tokens == NULL || start >= tokens->n) return -1;

    for (size_t i = start; i < tokens->n; i++) {
        // Tokens that belong to a known phrase are never reported
        if (context->address_phrase_memberships->a[i] != NULL_PHRASE_MEMBERSHIP ||
            context->component_phrase_memberships->a[i] != NULL_PHRASE_MEMBERSHIP) {
            continue;
        }

        token_t token = tokens->a[i];
        if (token.type == NUMERIC || token.type == IDEOGRAPHIC_NUMBER) {
            return (int64_t)i;
        }
    }

    return -1;
}


/*
Emits phrase-type features for one candidate component.

phrase_types is the bit set of types attached to the phrase and component is
the type being tested:
- if phrase_types is exactly component, the "unambiguous" features are added
- otherwise, if the two share any bit, only the phrase type+phrase feature
  is added
- if they share nothing, no feature is added
*/
static inline void add_phrase_features(cstring_array *features, uint32_t phrase_types, uint32_t component, char *phrase_type, char *phrase_string) {
    bool exact_match = (phrase_types == component);

    if (!exact_match) {
        bool overlaps = (phrase_types & component) != 0;
        if (overlaps) {
            feature_array_add(features, 3, "phrase type+phrase", phrase_type, phrase_string);
        }
        return;
    }

    log_debug("phrase=%s, phrase_types=%d\n", phrase_string, phrase_types);
    feature_array_add(features, 2, "unambiguous phrase type", phrase_type);
    feature_array_add(features, 3, "unambiguous phrase type+phrase", phrase_type, phrase_string);
}

/*
Adds character n-gram features of size n for str to the features array.

Parameters:
- features: destination feature array
- feature_prefix: optional namespace prepended to every feature (may be NULL)
- ngrams: scratch array; cleared and refilled by this call
- str: NUL-terminated string to take the n-grams from
- n: n-gram size; must be at least 1 and smaller than strlen(str)
- prefix_len/suffix_len: number of leading/trailing bytes of str to leave out
  of the n-grams. A non-zero length is reported to add_ngrams as a known
  prefix/suffix through its last two arguments.

Returns true if n-grams were generated and added, false if an argument is
NULL, the sizes are inconsistent, or add_ngrams fails.
*/
static bool add_ngram_features(cstring_array *features, char *feature_prefix, cstring_array *ngrams, char *str, size_t n, size_t prefix_len, size_t suffix_len) {
    if (features == NULL || ngrams == NULL || str == NULL) return false;

    size_t len = strlen(str);

    // "n >= len" is the wrap-free form of "n > len - 1" (len - 1 wraps for an empty string)
    if (n == 0 || n >= len) return false;

    // The affixes must fit inside the string, otherwise the remaining length
    // computed below would wrap around to a huge value
    if (prefix_len > len || suffix_len > len - prefix_len) return false;

    char ngram_num_chars[INT64_MAX_STRING_SIZE];
    snprintf(ngram_num_chars, sizeof(ngram_num_chars), "%zu", n);

    bool known_prefix = prefix_len > 0;
    bool known_suffix = suffix_len > 0;

    cstring_array_clear(ngrams);
    if (!add_ngrams(ngrams, n, str + prefix_len, len - suffix_len - prefix_len, !known_prefix, !known_suffix)) {
        return false;
    }

    uint32_t idx;
    char *ngram;

    if (feature_prefix != NULL) {
        cstring_array_foreach(ngrams, idx, ngram, {
            feature_array_add(features, 4, feature_prefix, "ngrams", ngram_num_chars, ngram);
        })
    } else {
        cstring_array_foreach(ngrams, idx, ngram, {
            feature_array_add(features, 3, "ngrams", ngram_num_chars, ngram);
        })
    }

    return true;
}

/*
address_parser_features
-----------------------

This is a feature function similar to those found in MEMM and CRF models.

Follows the signature of a tagger_feature_function so it can be called
as a function pointer by the averaged perceptron or CRF model.

Parameters:

void *self: a pointer to the address_parser_t struct, which contains
word frequencies and perhaps other useful corpus-wide statistics.

void *ctx: a pointer to the address_parser_context_t struct containing:
- phrase dictionary memberships for all the tokens
- country (if known)
- language (if known)
- the output feature arrays (features, prev_tag_features, prev2_tag_features)

tokenized_string_t *tokenized: the sequence of tokens for parsing
uint32_t idx: the current token index

The predicted tags at idx - 1 and idx - 2 are not parameters of this function.
Features meant to be combined with them are written to the context's
prev_tag_features and prev2_tag_features arrays instead.

Returns false if self or ctx is NULL or a required lookup fails, true otherwise.

*/

bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t idx) {
    if (self == NULL || ctx == NULL || tokenized == NULL) return false;

    address_parser_t *parser = (address_parser_t *)self;
    address_parser_context_t *context = (address_parser_context_t *)ctx;

    cstring_array *features = context->features;
    cstring_array *prev_tag_features = context->prev_tag_features;
    cstring_array *prev2_tag_features = context->prev2_tag_features;

    phrase_array *address_dictionary_phrases = context->address_dictionary_phrases;
    int64_array *address_phrase_memberships = context->address_phrase_memberships;
    phrase_array *component_phrases = context->component_phrases;
    int64_array *component_phrase_memberships = context->component_phrase_memberships;
    phrase_array *postal_code_phrases = context->postal_code_phrases;
    int64_array *postal_code_phrase_memberships = context->postal_code_phrase_memberships;
    cstring_array *normalized = context->normalized;

    // All three output arrays are rebuilt from scratch for every token
    cstring_array_clear(features);
    cstring_array_clear(prev_tag_features);
    cstring_array_clear(prev2_tag_features);

    token_array *tokens = tokenized->tokens;

    token_t token = tokens->a[idx];

    // Indices of the neighbouring tokens. They are pushed outwards below when the
    // current token lies inside a dictionary, component or postal code phrase.
    ssize_t last_index = (ssize_t)idx - 1;
    ssize_t next_index = (ssize_t)idx + 1;

    // word_pre_norm comes from the tokenized input, word from the context's normalized strings
    char *word_pre_norm = tokenized_string_get_token(tokenized, idx);

    char *word = cstring_array_get_string(normalized, idx);
    if (word == NULL) {
        log_error("got NULL word at %d\n", idx);
        return false;
    }

    size_t word_len = strlen(word);

    log_debug("word=%s\n", word);

    phrase_t phrase = NULL_PHRASE;
    phrase_t component_phrase = NULL_PHRASE;

    char *phrase_string = NULL;
    char *component_phrase_string = NULL;

    int64_t address_phrase_index = address_phrase_memberships->a[idx];
    int64_t component_phrase_index = component_phrase_memberships->a[idx];

    if (address_phrase_index != NULL_PHRASE_MEMBERSHIP) {
        phrase = address_dictionary_phrases->a[address_phrase_index];
    }

    if (component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
        component_phrase = component_phrases->a[component_phrase_index];
    }

    char_array *phrase_tokens = context->phrase;
    char_array *component_phrase_tokens = context->component_phrase;

    uint32_t expansion_index;
    address_expansion_value_t *expansion_value;

    // Cleared as soon as a phrase feature replaces the single-word features
    bool add_word_feature = true;

    size_t num_tokens = tokenized->tokens->n;

    // Address dictionary phrases (used when at least as long as the component phrase)
    if (phrase.len > 0 && phrase.len >= component_phrase.len) {
        log_debug("phrase\n");

        last_index = (ssize_t)phrase.start - 1;
        next_index = (ssize_t)phrase.start + phrase.len;

        if(is_valid_dictionary_phrase(phrase)) {
            uint32_t expansion_index = phrase.data;
            address_expansion_value_t *expansion_value = address_dictionary_get_expansions(expansion_index);

            if (expansion_value == NULL) {
                log_warn("expansion_value is NULL for index %u\n", expansion_index);
                return false;
            }
            uint32_t address_phrase_types = expansion_value->components;

            phrase_string = cstring_array_get_phrase(context->normalized, phrase_tokens, phrase);

            add_word_feature = false;
            log_debug("phrase_string=%s\n", phrase_string);

            add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_STREET, "street", phrase_string);
            add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_NAME, "name", phrase_string);
            add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_CATEGORY, "category", phrase_string);
            add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_UNIT, "unit", phrase_string);
            add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_PO_BOX, "po_box", phrase_string);
            add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_LEVEL, "level", phrase_string);
            add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_ENTRANCE, "entrance", phrase_string);
            add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_STAIRCASE, "staircase", phrase_string);
            add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_HOUSE_NUMBER, "house_number", phrase_string);
            add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_POSTAL_CODE, "postal_code", phrase_string);
        }
    }

    // Component phrases (used when at least as long as the dictionary phrase)
    if (component_phrase.len > 0 && component_phrase.len >= phrase.len) {
        component_phrase = component_phrases->a[component_phrase_index];

        component_phrase_string = cstring_array_get_phrase(context->normalized_admin, component_phrase_tokens, component_phrase);

        // The phrase's data field indexes parser->phrase_types. Valid indices are
        // 0..n-1, so an index equal to n must be rejected as well.
        uint32_t phrase_types_index = component_phrase.data;
        if (phrase_types_index >= parser->phrase_types->n) {
            log_error("Invalid component_phrase_index: %u (parser->phrase_types->n=%zu)\n", phrase_types_index, parser->phrase_types->n);
            return false;
        }

        address_parser_types_t types = parser->phrase_types->a[phrase_types_index];

        uint32_t component_phrase_types = types.components;
        uint32_t most_common = types.most_common;

        // Widen the neighbour window so it also clears the component phrase
        if (last_index >= (ssize_t)component_phrase.start - 1) {
            last_index = (ssize_t)component_phrase.start - 1;
        }

        if (next_index < (ssize_t)component_phrase.start + component_phrase.len) {
            next_index = (ssize_t)component_phrase.start + component_phrase.len;
        }

        if (component_phrase_string != NULL && component_phrase_types > 0) {
            feature_array_add(features, 2, "phrase", component_phrase_string);
            add_word_feature = false;
        }

        if (component_phrase_types > 0) {
            add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_SUBURB, "suburb", component_phrase_string);
            add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY, "city", component_phrase_string);
            add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY_DISTRICT, "city_district", component_phrase_string);
            add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_ISLAND, "island", component_phrase_string);
            add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE_DISTRICT, "state_district", component_phrase_string);
            add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE, "state", component_phrase_string);
            add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY_REGION, "country_region", component_phrase_string);
            add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY, "country", component_phrase_string);
            add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_WORLD_REGION, "world_region", component_phrase_string);
        }

        // Only emitted when the type set differs from the single most common type
        if (component_phrase_types != most_common) {
            if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) {
                feature_array_add(features, 2, "commonly city", component_phrase_string);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) {
                feature_array_add(features, 2, "commonly country", component_phrase_string);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) {
                feature_array_add(features, 2, "commonly suburb", component_phrase_string);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) {
                feature_array_add(features, 2, "commonly city_district", component_phrase_string);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) {
                feature_array_add(features, 2, "commonly state", component_phrase_string);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) {
                feature_array_add(features, 2, "commonly country_region", component_phrase_string);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) {
                feature_array_add(features, 2, "commonly state_district", component_phrase_string);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_ISLAND) {
                feature_array_add(features, 2, "commonly island", component_phrase_string);
            }
        }
    }

    // Postal code phrases: check whether a neighbouring component phrase is a
    // known context for this postal code
    bool possible_postal_code = false;
    bool postal_code_have_admin = false;
    int64_t postal_code_phrase_index = postal_code_phrase_memberships->a[idx];
    phrase_t postal_code_phrase = NULL_PHRASE;

    if (postal_code_phrase_index != NULL_PHRASE_MEMBERSHIP) {
        postal_code_phrase = postal_code_phrases->a[postal_code_phrase_index];

        uint32_t postal_code_id = postal_code_phrase.data;

        possible_postal_code = true;

        if (last_index >= (ssize_t)postal_code_phrase.start - 1) {
            last_index = (ssize_t)postal_code_phrase.start - 1;
        }

        if (next_index < (ssize_t)postal_code_phrase.start + postal_code_phrase.len) {
            next_index = (ssize_t)postal_code_phrase.start + postal_code_phrase.len;
        }

        uint32_t admin_id;

        // Component phrase to the left of the postal code
        if (last_index >= 0) {
            int64_t last_component_phrase_index = component_phrase_memberships->a[last_index];
            if (last_component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
                phrase_t last_component_phrase = component_phrases->a[last_component_phrase_index];
                admin_id = last_component_phrase.data;

                if (postal_code_context_exists(parser, postal_code_id, admin_id)) {
                    postal_code_have_admin = true;
                }
            }
        }

        // Component phrase to the right, only consulted if the left one did not match
        if (!postal_code_have_admin && next_index < num_tokens) {
            int64_t next_component_phrase_index = component_phrase_memberships->a[next_index];
            if (next_component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
                phrase_t next_component_phrase = component_phrases->a[next_component_phrase_index];
                admin_id = next_component_phrase.data;
                if (postal_code_context_exists(parser, postal_code_id, admin_id)) {
                    postal_code_have_admin = true;
                }
            }
        }

    }

    if (possible_postal_code) {
        if (postal_code_have_admin) {
            feature_array_add(features, 1, "postcode have context");
            feature_array_add(features, 2, "postcode have context", word);
        } else {
            feature_array_add(features, 2, "postcode no context", word);
        }
    }

    uint32_t word_freq = word_vocab_frequency(parser, word);

    bool is_word = is_word_token(token.type);

    bool is_unknown_word = false;
    bool is_unknown = false;

    bool known_prefix = false;
    bool known_suffix = false;

    size_t prefix_len = 0;
    size_t suffix_len = 0;

    char *prefix = NULL;
    char *suffix = NULL;

    if (add_word_feature) {
        // Bias unit, acts as an intercept
        feature_array_add(features, 1, "bias");

        phrase_t prefix_phrase = context->prefix_phrases->a[idx];
        phrase_t suffix_phrase = context->suffix_phrases->a[idx];

        // Prefixes like hinter, etc.
        if (prefix_phrase.len > 0) {
            expansion_index = prefix_phrase.data;
            expansion_value = address_dictionary_get_expansions(expansion_index);

            // Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category
            // (the XOR is non-zero exactly when components != LIBPOSTAL_ADDRESS_ANY)
            if (expansion_value != NULL && (expansion_value->components ^ LIBPOSTAL_ADDRESS_ANY)) {
                known_prefix = true;
                char_array_clear(phrase_tokens);
                prefix_len = prefix_phrase.len;
                char_array_add_len(phrase_tokens, word_pre_norm, prefix_len);
                prefix = char_array_get_string(phrase_tokens);
                log_debug("got prefix: %s\n", prefix);
                feature_array_add(features, 2, "prefix", prefix);
            }
        }

        // Suffixes like straße, etc.
        if (suffix_phrase.len > 0) {
            expansion_index = suffix_phrase.data;
            expansion_value = address_dictionary_get_expansions(expansion_index);

            if (expansion_value != NULL && (expansion_value->components & LIBPOSTAL_ADDRESS_STREET)) {
                known_suffix = true;
                char_array_clear(context->suffix_phrase);
                suffix_len = suffix_phrase.len;
                // The suffix is cut from the end of the pre-normalization token
                size_t word_pre_norm_len = cstring_array_token_length(tokenized->strings, idx);
                size_t suffix_offset = word_pre_norm_len - suffix_len;
                char_array_add_len(context->suffix_phrase, word_pre_norm + suffix_offset, suffix_len);
                suffix = char_array_get_string(context->suffix_phrase);
                log_debug("got suffix: %s\n", suffix);
                feature_array_add(features, 2, "suffix", suffix);
            }
        }

        bool is_hyphenated = false;

        // For rare words and unknown words (so unknown words can benefit from statistics of known but super common words)
        if (word_freq <= parser->options.rare_word_threshold && is_word) {
            log_debug("rare word: %s\n", word);
            bool ngrams_added = false;
            size_t hyphenated_word_offset = 0;
            bool first_sub_token = true;
            bool last_sub_token = true;

            ssize_t next_hyphen_index;

            token_array_clear(context->sub_tokens);

            // Walk the word one hyphen-separated piece at a time. A word without
            // hyphens goes through the loop body exactly once as a whole.
            do {
                next_hyphen_index = string_next_hyphen_index(word + hyphenated_word_offset, word_len - hyphenated_word_offset);
                char *sub_word = word;
                size_t sub_word_len = word_len;

                if (next_hyphen_index >= 0) {
                    // Piece that ends at a hyphen
                    is_hyphenated = true;
                    char_array_clear(context->sub_token);
                    char_array_add_len(context->sub_token, word + hyphenated_word_offset, next_hyphen_index);
                    token_array_push(context->sub_tokens, (token_t){hyphenated_word_offset, next_hyphen_index, token.type});
                    sub_word = char_array_get_string(context->sub_token);
                    sub_word_len = context->sub_token->n;
                    last_sub_token = false;
                } else if (is_hyphenated) {
                    // Final piece after the last hyphen
                    char_array_clear(context->sub_token);
                    char_array_add_len(context->sub_token, word + hyphenated_word_offset, word_len - hyphenated_word_offset);
                    sub_word = char_array_get_string(context->sub_token);
                    sub_word_len = context->sub_token->n;
                    last_sub_token = true;
                }

                // Known affixes only apply to the first/last piece, and only if shorter than it
                bool add_prefix = first_sub_token && prefix_len < sub_word_len;
                bool add_suffix = last_sub_token && suffix_len < sub_word_len;

                uint32_t sub_word_freq = word_freq;
                if (is_hyphenated) {
                    sub_word_freq = word_vocab_frequency(parser, sub_word);
                    if (sub_word_freq > 0) {
                        feature_array_add(features, 2, "sub_word", sub_word);
                    }

                }

                if (sub_word_freq <= parser->options.rare_word_threshold) {
                    // prefix/suffix features from 3-6 characters
                    for (size_t ng = 3; ng <= 6; ng++) {
                        ngrams_added = add_ngram_features(features, is_hyphenated ? "sub_word" : "word", context->ngrams, sub_word, ng, add_prefix ? prefix_len : 0, add_suffix ? suffix_len : 0);
                    }
                }

                // Skip past the hyphen; adds 0 when no hyphen was found (index is -1)
                hyphenated_word_offset += next_hyphen_index + 1;
                first_sub_token = false;

                log_debug("next_hyphen_index=%zd\n", next_hyphen_index);
            } while(next_hyphen_index >= 0);

        }

        if (word_freq > 0) {
            // The individual word
            feature_array_add(features, 2, "word", word);
        } else {
            log_debug("word not in vocab: %s\n", word);

            // From here on the word is represented by a placeholder in context features
            is_unknown = true;
            word = (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) ? UNKNOWN_WORD : UNKNOWN_NUMERIC;

            if (is_word_token(token.type)) {
                is_unknown_word = true;
            }
        }

        if (idx == 0 && !is_unknown_word) {
            feature_array_add(features, 2, "first word", word);
            //feature_array_add(features, 3, "first word+next word", word, next_word);
        }

    } else if (component_phrase_string != NULL) {
        word = component_phrase_string;
    } else if (phrase_string != NULL) {
        word = phrase_string;
    }

    // NOTE(review): idx is unsigned, so for idx == 0 "idx - 1" wraps to UINT32_MAX
    // instead of -1. Where ssize_t is wider than uint32_t this test is therefore
    // false for the first token. Left as is because changing it would alter the
    // emitted feature set; confirm whether that is intended.
    if (last_index == idx - 1) {
        // Previous tag and current word
        feature_array_add(prev_tag_features, 2, "word", word);

        // Previous two tags and current word
        if (parser->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
            // In the CRF this is accounted for by the transition weights
            // so only need it for the averaged perceptron
            feature_array_add(prev_tag_features, 1, "trans");

            // Averaged perceptron uses two tags of history, CRF uses one
            feature_array_add(prev2_tag_features, 2, "word", word);
            feature_array_add(prev2_tag_features, 1, "trans");
        }
    }

    if (last_index >= 0) {
        address_parser_phrase_t prev_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, last_index, false);
        char *prev_word = prev_word_or_phrase.str;

        // Out-of-vocabulary plain words are replaced by the unknown placeholders
        if (is_plain_word_phrase_type(prev_word_or_phrase.type)) {
            uint32_t prev_word_freq = word_vocab_frequency(parser, prev_word);
            token_t prev_token = tokenized->tokens->a[last_index];
            bool prev_token_numeric = is_numeric_token(prev_token.type);
            if (prev_word_freq == 0) {
                prev_word = !prev_token_numeric ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
            }
        }

        // Previous word
        feature_array_add(features, 2, "prev word", prev_word);


        if (last_index == idx - 1) {
            feature_array_add(prev_tag_features, 2, "prev word", prev_word);
        }

        // Previous word and current word
        feature_array_add(features, 3, "prev word+word", prev_word, word);
    }

    if (next_index < num_tokens) {
        address_parser_phrase_t next_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, next_index, false);
        char *next_word = next_word_or_phrase.str;
        size_t next_word_len = 1;

        if (is_plain_word_phrase_type(next_word_or_phrase.type)) {
            uint32_t next_word_freq = word_vocab_frequency(parser, next_word);
            token_t next_token = tokenized->tokens->a[next_index];
            bool next_token_numeric = is_numeric_token(next_token.type);
            if (next_word_freq == 0) {
                next_word = !next_token_numeric ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
            }
        } else {
            next_word_len = next_word_or_phrase.phrase.len;
        }

        // Next word e.g. if the current word is unknown and the next word is "street"
        feature_array_add(features, 2, "next word", next_word);

        // Current word and next word
        feature_array_add(features, 3, "word+next word", word, next_word);

        // Prev tag, current word and next word
        //feature_array_add(features, 4, "prev tag+word+next word", prev || "START", word, next_word);

        // Venue names ("house") are almost always at the beginning of the string
        // and often contain out-of-vocabulary words. Consider a case like "Barboncino 781 Franklin Ave".
        // The features available to classify "Barboncino" are going to be unknown word features (n-grams),
        // next word features (unknown word where next word=DDD is just as likely to be a street)
        // and no previous tags of history since it's the first word. If the parser predicts the
        // first token correctly, it's going to have an easier time getting the rest of the sequence
        // correct (unknown word + prev tag was "house" is probably still part of the venue, etc.) so
        // we're only really worried about that first token.  This group of features, called
        // "long-context features" finds the relative position of the next numeric token as well
        // as the next street-level phrase (words like "ave", "street", etc.) in the right context.
        // In an English or French address, if we know there's a number somewhere to our right,
        // and that a word like "Ave" appears to the right of the number, it's very likely that
        // the current unknown word is part of a venue name. Similarly, if a venue-word like "Pizzeria"
        // occurred prior to the number, that would also be strong evidence that we're in a venue name.
        // Conversely, if we're in a Spanish address and a word like "Calle" comes before the first number
        // to our right, it's also likely that we're in a venue name, but we'd need to note that the
        // phrase we saw was "Calle" and not an English thoroughfare type.

        if (idx == 0 && add_word_feature && is_unknown_word) {
            bool seen_number = false;
            bool seen_phrase = false;
            for (uint32_t right_idx = idx + 1; right_idx < num_tokens; right_idx++) {
                token_t right_token = tokens->a[right_idx];

                // Only plain words, dictionary phrases and affix phrases are
                // considered; every other phrase type is skipped
                address_parser_phrase_t right_context_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, right_idx, true);
                address_parser_phrase_type_t right_context_phrase_type = right_context_word_or_phrase.type;
                if (right_context_phrase_type != ADDRESS_PARSER_NULL_PHRASE &&
                    right_context_phrase_type != ADDRESS_PARSER_DICTIONARY_PHRASE &&
                    right_context_phrase_type != ADDRESS_PARSER_SUFFIX_PHRASE &&
                    right_context_phrase_type != ADDRESS_PARSER_PREFIX_PHRASE) {
                    continue;
                }
                char *right_context_word = right_context_word_or_phrase.str;
                phrase_t right_context_phrase = right_context_word_or_phrase.phrase;

                uint32_t right_context_expansion_index;
                address_expansion_value_t *right_context_expansion_value;

                uint32_t right_context_components = 0;

                if (right_context_phrase.len > 0) {
                    right_context_expansion_index = right_context_phrase.data;
                    right_context_expansion_value = address_dictionary_get_expansions(right_context_expansion_index);
                    // A failed lookup leaves the component set empty, so no phrase feature matches
                    if (right_context_expansion_value != NULL) {
                        right_context_components = right_context_expansion_value->components;
                    }

                    char *right_affix_type = NULL;
                    char *right_context_affix = NULL;

                    char *relation_to_number = seen_number ? "after number" : "before number";

                    seen_phrase = true;

                    char *right_context_word_pre_norm;

                    if (right_context_phrase_type == ADDRESS_PARSER_SUFFIX_PHRASE) {
                        right_affix_type = "suffix";
                        right_context_word_pre_norm = tokenized_string_get_token(tokenized, right_idx);
                        right_context_affix = phrase_suffix(right_context_word, strlen(right_context_word_pre_norm), right_context_phrase, context->long_context_suffix_phrase);
                    } else if (right_context_word_or_phrase.type == ADDRESS_PARSER_PREFIX_PHRASE) {
                        right_affix_type = "prefix";
                        right_context_word_pre_norm = tokenized_string_get_token(tokenized, right_idx);
                        // NOTE(review): the prefix is written into the long_context_suffix_phrase buffer too — confirm this is intended
                        right_context_affix = phrase_prefix(right_context_word, strlen(right_context_word_pre_norm), right_context_phrase, context->long_context_suffix_phrase);
                    }

                    if (right_context_components & LIBPOSTAL_ADDRESS_STREET && !(right_context_components & LIBPOSTAL_ADDRESS_NAME)) {
                        // Street-only phrase: emit and stop scanning
                        feature_array_add(features, 2, "first word unknown+street phrase right", relation_to_number);
                        feature_array_add(features, 3, "first word unknown+street phrase right", relation_to_number, right_context_word);
                        if (right_context_affix != NULL && right_affix_type != NULL) {
                            feature_array_add(features, 4, "first word unknown+street affix right", relation_to_number, right_affix_type, right_context_affix);
                        }
                        break;
                    } else if (right_context_components & LIBPOSTAL_ADDRESS_NAME && !(right_context_components & LIBPOSTAL_ADDRESS_STREET)) {
                        // Venue-only phrase: emit and keep scanning (unless a number was already seen, see below)
                        feature_array_add(features, 2, "first word unknown+venue phrase right", relation_to_number);
                        feature_array_add(features, 3, "first word unknown+venue phrase right", relation_to_number, right_context_word);
                        if (right_context_affix != NULL && right_affix_type != NULL) {
                            feature_array_add(features, 4, "first word unknown+venue affix right", relation_to_number, right_affix_type, right_context_affix);
                        }
                    } else if (right_context_components & (LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET)) {
                        // Phrase that can be both a street and a venue: only used after a number
                        if (seen_number) {
                            feature_array_add(features, 1, "first word unknown+number+ambiguous phrase right");
                            feature_array_add(features, 2, "first word unknown+number+ambiguous phrase right", right_context_word);
                            if (right_context_affix != NULL && right_affix_type != NULL) {
                                feature_array_add(features, 3, "first word unknown+number+ambiguous affix right", right_affix_type, right_context_affix);
                            }
                            break;
                        } else {
                            continue;
                        }
                    }

                    if (seen_number) break;
                }

                if (is_numeric_token(right_token.type)) {
                    seen_number = true;
                    char *relation_to_phrase = seen_phrase ? "after phrase" : "before phrase";
                    feature_array_add(features, 2, "first word unknown+number right", relation_to_phrase);
                    feature_array_add(features, 3, "first word unknown+number right", relation_to_phrase, right_context_word);
                    if (seen_phrase) break;
                }
            }
        }
    }

    return true;

}

/*
 * Run sequence prediction over tokenized_str with whichever model the parser
 * was loaded with, writing one label per token into token_labels.
 *
 * Dispatches to the averaged perceptron tagger or the CRF tagger based on
 * self->model_type. Returns the tagger's result, or false (after logging)
 * when the model type is neither of the two known values.
 */
bool address_parser_predict(address_parser_t *self, address_parser_context_t *context, cstring_array *token_labels, tagger_feature_function feature_function, tokenized_string_t *tokenized_str) {
    bool print_features = self->options.print_features;

    if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
        // The perceptron tagger additionally takes the second-order previous-tag feature array
        return averaged_perceptron_tagger_predict(self->model.ap, self, context,
                                                  context->features,
                                                  context->prev_tag_features,
                                                  context->prev2_tag_features,
                                                  token_labels, feature_function,
                                                  tokenized_str, print_features);
    }

    if (self->model_type == ADDRESS_PARSER_TYPE_CRF) {
        return crf_tagger_predict(self->model.crf, self, context,
                                  context->features,
                                  context->prev_tag_features,
                                  token_labels, feature_function,
                                  tokenized_str, print_features);
    }

    log_error("Parser has unknown model type\n");
    return false;
}

/*
 * Allocate an empty parser response.
 *
 * The struct is zero-initialized so that its count is 0 and its pointer
 * fields are NULL until the caller fills them in; this keeps a partially
 * built response safe to inspect or release on an error path.
 *
 * Returns NULL if allocation fails.
 */
libpostal_address_parser_response_t *address_parser_response_new(void) {
    libpostal_address_parser_response_t *response = calloc(1, sizeof *response);
    return response;
}

libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country) {
    if (address == NULL) return NULL;

    address_parser_t *parser = get_address_parser();
    if (parser == NULL || parser->context == NULL) {
        log_error("parser is not setup, call libpostal_setup_address_parser()\n");
        return NULL;
    }

    address_parser_context_t *context = parser->context;

    char *normalized = address_parser_normalize_string(address);
    bool is_normalized = normalized != NULL;
    if (!is_normalized) {
        normalized = address;
    }

    token_array *tokens = tokenize(normalized);

    tokenized_string_t *tokenized_str = tokenized_string_new_from_str_size(normalized, strlen(normalized), tokens->n);

    for (size_t i = 0; i < tokens->n; i++) {
        token_t token = tokens->a[i];
        if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
            uint32_array_pop(context->separators);
            uint32_array_push(context->separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
            continue;
        } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
            continue;
        }

        tokenized_string_add_token(tokenized_str, (const char *)normalized, token.len, token.type, token.offset);
        uint32_array_push(context->separators, ADDRESS_SEPARATOR_NONE);
    }

    // This parser was trained without knowing language/country.
    // If at some point we build country-specific/language-specific
    // parsers, these parameters could be used to select a model.
    // The language parameter does technically control which dictionaries
    // are searched at the street level. It's possible with e.g. a phrase
    // like "de", which can be either the German country code or a stopword
    // in Spanish, that even in the case where it's being used as a country code,
    // it's possible that both the street-level and admin-level phrase features
    // may be working together as a kind of intercept. Depriving the model
    // of the street-level phrase features by passing in a known language
    // may change the decision threshold so explicitly ignore these
    // options until there's a use for them (country-specific or language-specific
    // parser models).

    language = NULL;
    country = NULL;
    address_parser_context_fill(context, parser, tokenized_str, language, country);

    libpostal_address_parser_response_t *response = NULL;

    // If the whole input string is a single known phrase at the SUBURB level or higher, bypass sequence prediction altogether
    phrase_t only_phrase = NULL_PHRASE;
    token_t token, prev_token;
    bool is_postal = false;
    if (context->component_phrases->n == 1) {
        only_phrase = context->component_phrases->a[0];
    } else if (context->postal_code_phrases->n == 1) {
        only_phrase = context->postal_code_phrases->a[0];
        is_postal = true;
    }

    if (only_phrase.start == 0 && only_phrase.len == tokenized_str->tokens->n && only_phrase.len > 0) {
        uint32_t most_common = 0;

        char *label = NULL;

        if (!is_postal) {
            uint32_t component_phrase_index = only_phrase.data;
            address_parser_types_t types = parser->phrase_types->a[component_phrase_index];
            most_common = types.most_common;

            if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) {
                label = strdup(ADDRESS_PARSER_LABEL_CITY);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) {
                label = strdup(ADDRESS_PARSER_LABEL_STATE);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) {
                label = strdup(ADDRESS_PARSER_LABEL_COUNTRY);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) {
                label = strdup(ADDRESS_PARSER_LABEL_STATE_DISTRICT);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) {
                label = strdup(ADDRESS_PARSER_LABEL_COUNTRY_REGION);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) {
                label = strdup(ADDRESS_PARSER_LABEL_SUBURB);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) {
                label = strdup(ADDRESS_PARSER_LABEL_CITY_DISTRICT);
            } else if (most_common == ADDRESS_PARSER_BOUNDARY_WORLD_REGION) {
                label = strdup(ADDRESS_PARSER_LABEL_WORLD_REGION);
            }
        } else {
            label = strdup(ADDRESS_PARSER_LABEL_POSTAL_CODE);
        }

        // Implicit: if most_common is not one of the above, ignore and parse regularly
        if (label != NULL) {
            char **single_label = malloc(sizeof(char *));
            single_label[0] = label;
            char **single_component = malloc(sizeof(char *));
            single_component[0] = strdup(normalized);

            response = address_parser_response_new();

            response->num_components = 1;
            response->labels = single_label;
            response->components = single_component;

            token_array_destroy(tokens);
            tokenized_string_destroy(tokenized_str);

            if (is_normalized) {
                free(normalized);
            }
            return response;
        }
    }

    cstring_array *token_labels = cstring_array_new_size(tokens->n);

    char *prev_label = NULL;

    bool prediction_success = address_parser_predict(parser, context, token_labels, &address_parser_features, tokenized_str);

    if (prediction_success) {
        response = address_parser_response_new();

        size_t num_strings = cstring_array_num_strings(tokenized_str->strings);

        cstring_array *labels = cstring_array_new_size(num_strings);
        cstring_array *components = cstring_array_new_size(strlen(address) + num_strings);

        token_t *tokens = tokenized_str->tokens->a;

        for (size_t i = 0; i < num_strings; i++) {
            char *str = tokenized_string_get_token(tokenized_str, i);

            char *label = cstring_array_get_string(token_labels, i);

            if (prev_label == NULL || strcmp(label, prev_label) != 0) {
                cstring_array_add_string(labels, label);
                cstring_array_start_token(components);

            }

            if (prev_label != NULL && strcmp(label, prev_label) == 0) {
                token = tokens[i];
                prev_token = tokens[i - 1];
                if (token.offset > prev_token.offset + prev_token.len) {
                    cstring_array_cat_string(components, " ");
                }
                cstring_array_cat_string(components, str);
            } else {
                cstring_array_append_string(components, str);
                cstring_array_terminate(components);
            }

            prev_label = label;
        }
        response->num_components = cstring_array_num_strings(components);
        response->components = cstring_array_to_strings(components);
        response->labels = cstring_array_to_strings(labels);

    } else {
        log_error("Error in prediction\n");
    }

    token_array_destroy(tokens);
    tokenized_string_destroy(tokenized_str);
    cstring_array_destroy(token_labels);

    if (is_normalized) {
        free(normalized);
    }

    return response;
}



/*
 * Ensure the module-level parser is available.
 *
 * If a parser is already set this is a no-op that returns true; otherwise
 * the result of address_parser_load(dir) is returned.
 */
bool address_parser_module_setup(char *dir) {
    if (parser != NULL) {
        return true;
    }

    return address_parser_load(dir);
}

/*
 * Destroy the module-level parser, if any, and reset the pointer to NULL so
 * a later address_parser_module_setup() call loads it again.
 */
void address_parser_module_teardown(void) {
    if (parser == NULL) {
        return;
    }

    address_parser_destroy(parser);
    parser = NULL;
}