/*
 * libpostal-sys 0.1.1
 *
 * Low-level wrappers for libpostal address normalization (with locks to support thread-safe initialization).
 * Documentation
 */
#include "jaccard.h"

/**
 * Compute the Jaccard similarity of two string sets:
 * (number of keys present in both) / (number of keys present in either).
 *
 * @param s1 first string set (may be NULL)
 * @param s2 second string set (may be NULL)
 * @return the similarity ratio; 0.0 if either set is NULL or if both sets
 *         are empty (the union is empty, so the ratio is undefined and
 *         would otherwise evaluate to NaN).
 */
double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2) {
    if (s1 == NULL || s2 == NULL) return 0.0;

    size_t set_intersection = 0;
    size_t set_union = 0;

    khiter_t k;
    const char *key;

    // Classify every key of s1: shared with s2 (intersection) or unique to s1.
    kh_foreach_key(s1, key, {
        k = kh_get(str_set, s2, key);
        if (k != kh_end(s2)) {
            set_intersection++;
        } else {
            set_union++;
        }
    });

    // set_union contains all the keys that were in s1 but not s2
    // so just add all the keys in s2 to complete the union
    set_union += kh_size(s2);

    // Two empty sets give an empty union; avoid 0/0 (NaN) and report
    // no similarity instead.
    if (set_union == 0) return 0.0;

    return (double)set_intersection / (double)set_union;
}


double jaccard_similarity_string_arrays(size_t num_strings1, char **strings1, size_t num_strings2, char **strings2) {
    if (strings1 == NULL || strings2 == NULL || num_strings1 == 0 || num_strings2 == 0) return 0.0;

    khash_t(str_set) *string_set1 = kh_init(str_set);
    if (string_set1 == NULL) return 0.0;

    kh_resize(str_set, string_set1, num_strings1);
    int ret = 0;

    khiter_t k;

    for (size_t i = 0; i < num_strings1; i++) {
        char *str1 = strings1[i];
        k = kh_put(str_set, string_set1, str1, &ret);
        if (ret < 0) {
            kh_destroy(str_set, string_set1);
            return 0.0;
        }
    }

    khash_t(str_set) *string_set2 = kh_init(str_set);
    if (string_set2 == NULL) {
        kh_destroy(str_set, string_set1);
        return 0.0;
    }
    kh_resize(str_set, string_set2, num_strings2);
    for (size_t i = 0; i < num_strings2; i++) {
        char *str2 = strings2[i];
        k = kh_put(str_set, string_set2, str2, &ret);
        if (ret < 0) {
            kh_destroy(str_set, string_set1);
            kh_destroy(str_set, string_set2);
            return 0.0;
        }
    }

    double sim = jaccard_similarity(string_set1, string_set2);
    kh_destroy(str_set, string_set1);
    kh_destroy(str_set, string_set2);
    return sim;
}