#ifndef ADDRESS_PARSER_H
#define ADDRESS_PARSER_H
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include "libpostal.h"
#include "libpostal_config.h"
#include "averaged_perceptron.h"
#include "averaged_perceptron_tagger.h"
#include "collections.h"
#include "crf.h"
#include "graph.h"
#include "normalize.h"
#include "string_utils.h"
/*
 * Default path of "address_parser.dat" under LIBPOSTAL_ADDRESS_PARSER_DIR.
 * Built by adjacent string-literal concatenation, so it is deliberately left
 * unparenthesized (a parenthesized literal could no longer be concatenated
 * with further literals or used as a char-array initializer).
 */
#define DEFAULT_ADDRESS_PARSER_PATH LIBPOSTAL_ADDRESS_PARSER_DIR PATH_SEPARATOR "address_parser.dat"

/*
 * Option masks passed to the string/token normalizers.
 *
 * Every replacement list is fully parenthesized, and so is each constituent
 * flag, so the masks behave as single values wherever they are expanded
 * (e.g. `opts & ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS`), regardless of how
 * the underlying NORMALIZE_* flags themselves are spelled.
 */
#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS \
    ((NORMALIZE_STRING_COMPOSE) | (NORMALIZE_STRING_LOWERCASE) | (NORMALIZE_STRING_SIMPLE_LATIN_ASCII))
#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_LATIN \
    ((NORMALIZE_STRING_COMPOSE) | (NORMALIZE_STRING_LOWERCASE) | (NORMALIZE_STRING_LATIN_ASCII))
#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8 \
    ((NORMALIZE_STRING_COMPOSE) | (NORMALIZE_STRING_LOWERCASE) | (NORMALIZE_STRING_STRIP_ACCENTS))

#define ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS \
    ((NORMALIZE_TOKEN_DELETE_FINAL_PERIOD) | (NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS) | (NORMALIZE_TOKEN_REPLACE_DIGITS))

/* Token options with REPLACE_DIGITS toggled off (XOR clears it because the
 * base mask always has that bit set). */
#define ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS \
    ((ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS) ^ (NORMALIZE_TOKEN_REPLACE_DIGITS))

/* Admin token options plus SPLIT_ALPHA_FROM_NUMERIC. */
#define ADDRESS_PARSER_NORMALIZE_POSTAL_CODE_TOKEN_OPTIONS \
    ((ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS) | (NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC))
/*
 * Separator flags. NONE is zero; the other two are distinct single bits and
 * can be OR-ed together. The shift expressions are parenthesized so each
 * macro expands to one value in any surrounding expression.
 */
#define ADDRESS_SEPARATOR_NONE 0
#define ADDRESS_SEPARATOR_FIELD_INTERNAL (1 << 0)
#define ADDRESS_SEPARATOR_FIELD (1 << 1)
/*
 * Non-zero when token_type equals one of: COMMA, NEWLINE, HYPHEN, DASH,
 * BREAKING_DASH, SEMICOLON, PUNCT_OPEN, PUNCT_CLOSE.
 * The argument may be evaluated several times; pass a side-effect-free
 * expression.
 */
#define ADDRESS_PARSER_IS_SEPARATOR(token_type) \
    ((token_type) == COMMA         || \
     (token_type) == NEWLINE       || \
     (token_type) == HYPHEN        || \
     (token_type) == DASH          || \
     (token_type) == BREAKING_DASH || \
     (token_type) == SEMICOLON     || \
     (token_type) == PUNCT_OPEN    || \
     (token_type) == PUNCT_CLOSE)
/*
 * Non-zero when token_type equals INVALID_CHAR, PERIOD or COLON.
 *
 * All three comparisons use the macro argument. (Two of them previously read
 * `token.type`, which silently depended on the caller having a variable named
 * `token` in scope and ignored the argument actually passed.)
 * The argument may be evaluated up to three times; pass a side-effect-free
 * expression.
 */
#define ADDRESS_PARSER_IS_IGNORABLE(token_type) \
    ((token_type) == INVALID_CHAR || (token_type) == PERIOD || (token_type) == COLON)
/* String labels for the two separator kinds ("sep" and "fsep"). */
#define SEPARATOR_LABEL        "sep"
#define FIELD_SEPARATOR_LABEL  "fsep"
/*
 * Address component bit flags. NON_BOUNDARY is zero; every other flag is a
 * distinct single bit so flags can be OR-ed into one mask. The bit positions
 * are not contiguous (3, 4, 5, 7, 8, 9, 11, 13, 14) and must stay as they are.
 * The highest bit used is 14, so any combination fits in 16 bits.
 *
 * Each shift is parenthesized so the macro expands to a single value in any
 * surrounding expression (`FLAG + 1`, `!FLAG`, `FLAG * n`, ...).
 */
#define ADDRESS_COMPONENT_NON_BOUNDARY 0
#define ADDRESS_COMPONENT_SUBURB (1 << 3)
#define ADDRESS_COMPONENT_CITY_DISTRICT (1 << 4)
#define ADDRESS_COMPONENT_CITY (1 << 5)
#define ADDRESS_COMPONENT_ISLAND (1 << 7)
#define ADDRESS_COMPONENT_STATE_DISTRICT (1 << 8)
#define ADDRESS_COMPONENT_STATE (1 << 9)
#define ADDRESS_COMPONENT_COUNTRY_REGION (1 << 11)
#define ADDRESS_COMPONENT_COUNTRY (1 << 13)
#define ADDRESS_COMPONENT_WORLD_REGION (1 << 14)
/*
 * Dense, zero-based index for each boundary type. The last enumerator,
 * NUM_ADDRESS_PARSER_BOUNDARY_TYPES, is the count of the real entries.
 * Values are spelled out explicitly; they are the same ones the compiler
 * would assign implicitly.
 */
typedef enum {
    ADDRESS_PARSER_BOUNDARY_NONE           = 0,
    ADDRESS_PARSER_BOUNDARY_SUBURB         = 1,
    ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT  = 2,
    ADDRESS_PARSER_BOUNDARY_CITY           = 3,
    ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT = 4,
    ADDRESS_PARSER_BOUNDARY_ISLAND         = 5,
    ADDRESS_PARSER_BOUNDARY_STATE          = 6,
    ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION = 7,
    ADDRESS_PARSER_BOUNDARY_COUNTRY        = 8,
    ADDRESS_PARSER_BOUNDARY_WORLD_REGION   = 9,
    NUM_ADDRESS_PARSER_BOUNDARY_TYPES      = 10
} address_parser_boundary_components;
/*
 * Label strings for the address components.
 * Note that two macro names differ from their string value:
 * POSTAL_CODE -> "postcode" and TELEPHONE -> "phone".
 */

/* Venue / building-level labels */
#define ADDRESS_PARSER_LABEL_HOUSE           "house"
#define ADDRESS_PARSER_LABEL_HOUSE_NUMBER    "house_number"
#define ADDRESS_PARSER_LABEL_PO_BOX          "po_box"
#define ADDRESS_PARSER_LABEL_BUILDING        "building"
#define ADDRESS_PARSER_LABEL_ENTRANCE        "entrance"
#define ADDRESS_PARSER_LABEL_STAIRCASE       "staircase"
#define ADDRESS_PARSER_LABEL_LEVEL           "level"
#define ADDRESS_PARSER_LABEL_UNIT            "unit"

/* Street-level labels */
#define ADDRESS_PARSER_LABEL_ROAD            "road"
#define ADDRESS_PARSER_LABEL_METRO_STATION   "metro_station"

/* Administrative labels */
#define ADDRESS_PARSER_LABEL_SUBURB          "suburb"
#define ADDRESS_PARSER_LABEL_CITY_DISTRICT   "city_district"
#define ADDRESS_PARSER_LABEL_CITY            "city"
#define ADDRESS_PARSER_LABEL_STATE_DISTRICT  "state_district"
#define ADDRESS_PARSER_LABEL_ISLAND          "island"
#define ADDRESS_PARSER_LABEL_STATE           "state"
#define ADDRESS_PARSER_LABEL_POSTAL_CODE     "postcode"
#define ADDRESS_PARSER_LABEL_COUNTRY_REGION  "country_region"
#define ADDRESS_PARSER_LABEL_COUNTRY         "country"
#define ADDRESS_PARSER_LABEL_WORLD_REGION    "world_region"

/* Contact labels */
#define ADDRESS_PARSER_LABEL_WEBSITE         "website"
#define ADDRESS_PARSER_LABEL_TELEPHONE       "phone"
/*
 * Two 16-bit fields overlaid on a single 32-bit word, so the pair can also
 * be read or written as one uint32_t through `value`.
 *   components  - 16-bit field (presumably a mask of ADDRESS_COMPONENT_*
 *                 flags - TODO confirm against the users of this type)
 *   most_common - 16-bit field
 * Which half of `value` each bit-field occupies is implementation-defined.
 */
typedef union address_parser_types {
    uint32_t value;
    struct {
        uint32_t components:16;
        uint32_t most_common:16;
    };
} address_parser_types_t;
/* Instantiates the project's VECTOR_INIT macro with the name
 * address_parser_types_array and element type address_parser_types_t.
 * NOTE(review): presumably this generates the address_parser_types_array
 * type used by address_parser_t.phrase_types - confirm in the header that
 * defines VECTOR_INIT. */
VECTOR_INIT(address_parser_types_array, address_parser_types_t)
/*
 * Working state handed to the parser (see address_parser_context_new /
 * address_parser_context_fill / address_parser_context_destroy).
 *
 * NOTE(review): the group headings below are inferred from the field names
 * only; confirm against the implementation before relying on them.
 * Field order is unchanged from the original declaration.
 */
typedef struct address_parser_context {
    /* Language / country strings */
    char *language;
    char *country;

    /* Feature string arrays */
    cstring_array *features;
    cstring_array *prev_tag_features;
    cstring_array *prev2_tag_features;

    /* Phrase buffers: plain, context and long-context variants */
    char_array *phrase;
    char_array *context_phrase;
    char_array *long_context_phrase;

    /* Prefix phrase buffers */
    char_array *prefix_phrase;
    char_array *context_prefix_phrase;
    char_array *long_context_prefix_phrase;

    /* Suffix phrase buffers */
    char_array *suffix_phrase;
    char_array *context_suffix_phrase;
    char_array *long_context_suffix_phrase;

    /* Component phrase buffers */
    char_array *component_phrase;
    char_array *context_component_phrase;
    char_array *long_context_component_phrase;

    /* N-grams and sub-token storage */
    cstring_array *ngrams;
    char_array *sub_token;
    token_array *sub_tokens;

    /* Per-input arrays: separators and normalized strings/tokens */
    uint32_array *separators;
    cstring_array *normalized;
    token_array *normalized_tokens;
    cstring_array *normalized_admin;
    token_array *normalized_admin_tokens;

    /* Phrase matches and their membership arrays */
    phrase_array *address_dictionary_phrases;
    int64_array *address_phrase_memberships;
    phrase_array *component_phrases;
    int64_array *component_phrase_memberships;
    phrase_array *postal_code_phrases;
    int64_array *postal_code_phrase_memberships;
    phrase_array *prefix_phrases;
    phrase_array *suffix_phrases;

    /* The tokenized input string */
    tokenized_string_t *tokenized_str;
} address_parser_context_t;
/*
 * Two 32-bit fields, `postcode` and `admin`, overlaid on one 64-bit word so
 * the pair can also be handled as a single uint64_t through `value`.
 * Which half of `value` each bit-field occupies is implementation-defined.
 */
typedef union postal_code_context_value {
    uint64_t value;
    struct {
        uint64_t postcode:32;
        uint64_t admin:32;
    };
} postal_code_context_value_t;

/* Builds a postal_code_context_value_t compound literal from a
 * (postcode, admin) pair. */
#define POSTAL_CODE_CONTEXT(pc, ad) \
    ((postal_code_context_value_t){ .postcode = (pc), .admin = (ad) })
/*
 * Identifies which kind of model a parser holds: a greedy averaged
 * perceptron (0) or a CRF (1). Values are written out explicitly and match
 * what the compiler would assign implicitly.
 */
typedef enum address_parser_model_type {
    ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON = 0,
    ADDRESS_PARSER_TYPE_CRF                        = 1
} address_parser_model_type_t;
/*
 * Parser options, passed by value to address_parser_new_options().
 *   rare_word_threshold - 64-bit threshold (presumably a count below which a
 *                         word is treated as rare - TODO confirm)
 *   print_features      - boolean flag; see address_parser_print_features()
 */
typedef struct parser_options {
    uint64_t rare_word_threshold;
    bool     print_features;
} parser_options_t;
/*
 * The address parser object.
 *
 * `model` is a union holding either an averaged-perceptron pointer or a CRF
 * pointer; `model_type` is presumably the discriminator that says which
 * member is valid (verify in the implementation).
 * Field order is unchanged from the original declaration.
 */
typedef struct address_parser {
    parser_options_t options;
    size_t num_classes;

    /* Model kind and the model itself */
    address_parser_model_type_t model_type;
    union {
        averaged_perceptron_t *ap;
        crf_t *crf;
    } model;

    address_parser_context_t *context;

    /* Lookup structures */
    trie_t *vocab;
    trie_t *phrases;
    address_parser_types_array *phrase_types;
    trie_t *postal_codes;
    graph_t *postal_code_contexts;
} address_parser_t;
/*
 * Public address parser API.
 *
 * NOTE(review): the implementations are not visible from this header. The
 * summaries below are derived from names and signatures only; ownership of
 * returned pointers and the meaning of the bool results should be confirmed
 * in the corresponding .c file.
 *
 * address_parser_load() used to be declared twice in this list; the
 * redundant second declaration has been dropped.
 */

/* --- Parser object --- */

/* Returns a pointer to a parser; takes no arguments. */
address_parser_t *address_parser_new(void);
/* Same, but with caller-supplied options (passed by value). */
address_parser_t *address_parser_new_options(parser_options_t options);
/* Returns a parser pointer without creating arguments (presumably the
 * module-level instance set up by address_parser_module_setup). */
address_parser_t *get_address_parser(void);
/* Takes a directory path; returns a bool status. */
bool address_parser_load(char *dir);
/* Takes the desired print_features flag; returns a bool status. */
bool address_parser_print_features(bool print_features);
/* Parses `address`, given `language` and `country` strings, and returns a
 * libpostal_address_parser_response_t pointer. */
libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country);
/* Counterpart to address_parser_new / address_parser_new_options. */
void address_parser_destroy(address_parser_t *self);

/* --- Normalization --- */

/* Takes a string and returns a char pointer (presumably a newly allocated
 * normalized copy owned by the caller - TODO confirm). */
char *address_parser_normalize_string(char *str);
/* Takes a string and one of its tokens; output goes to `array`. */
void address_parser_normalize_token(cstring_array *array, char *str, token_t token);

/* --- Prediction --- */

/* Runs the parser over `tokenized_str` using `context` and
 * `feature_function`; labels are written to `token_labels`. Returns a bool
 * status. */
bool address_parser_predict(address_parser_t *self, address_parser_context_t *context, cstring_array *token_labels, tagger_feature_function feature_function, tokenized_string_t *tokenized_str);

/* --- Context --- */

/* Returns a pointer to a context; takes no arguments. */
address_parser_context_t *address_parser_context_new(void);
/* Counterpart to address_parser_context_new. */
void address_parser_context_destroy(address_parser_context_t *self);
/* Fills `context` for `tokenized_str` using `parser`, `language` and
 * `country`. */
void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country);

/* --- Feature function --- */

/* Feature callback for token index `i` of `str`. `self` and `ctx` are
 * untyped pointers (presumably address_parser_t * and
 * address_parser_context_t * - TODO confirm). Returns a bool status. */
bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i);

/* --- I/O --- */

/* Saves `self` under `output_dir`; returns a bool status. */
bool address_parser_save(address_parser_t *self, char *output_dir);

/* --- Module setup / teardown --- */

/* Module-level setup taking a directory path; returns a bool status. */
bool address_parser_module_setup(char *dir);
/* Module-level teardown; takes no arguments and returns nothing. */
void address_parser_module_teardown(void);
#endif