libpostal-sys 0.1.1

Low-level wrappers for libpostal address normalization (with locks to support thread-safe initialization)
Documentation
/*
* Copyright (c) 2012-2013 Spotify AB
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>

#include "hashheader.h"
#include "hashiter.h"
#include "util.h"
#include "endiantools.h"
#include "sparkey.h"
#include "sparkey-internal.h"

#define MAGIC_VALUE_HASHREADER (0x75103df9)

sparkey_returncode sparkey_hash_open(sparkey_hashreader **reader_ref, const char *hash_filename, const char *log_filename) {
  RETHROW(correct_endian_platform());

  sparkey_returncode returncode;

  sparkey_hashreader *reader = malloc(sizeof(sparkey_hashreader));
  if (reader == NULL) {
    return SPARKEY_INTERNAL_ERROR;
  }

  TRY(sparkey_load_hashheader(&reader->header, hash_filename), free_reader);
  TRY(sparkey_logreader_open_noalloc(&reader->log, log_filename), free_reader);
  if (reader->header.file_identifier != reader->log.header.file_identifier) {
    returncode = SPARKEY_FILE_IDENTIFIER_MISMATCH;
    goto close_reader;
  }
  if (reader->header.data_end > reader->log.header.data_end) {
    returncode = SPARKEY_HASH_HEADER_CORRUPT;
    goto close_reader;
  }
  if (reader->header.max_key_len > reader->log.header.max_key_len) {
    returncode = SPARKEY_HASH_HEADER_CORRUPT;
    goto close_reader;
  }
  if (reader->header.max_value_len > reader->log.header.max_value_len) {
    returncode = SPARKEY_HASH_HEADER_CORRUPT;
    goto close_reader;
  }

  reader->fd = open(hash_filename, O_RDONLY);
  if (reader->fd < 0) {
    int e = errno;
    returncode = sparkey_open_returncode(e);
    goto close_reader;
  }

  reader->data_len = reader->header.header_size + reader->header.hash_capacity * (reader->header.hash_size + reader->header.address_size);

  struct stat s;
  stat(hash_filename, &s);
  if (reader->data_len > (uint64_t) s.st_size) {
    returncode = SPARKEY_HASH_TOO_SMALL;
    goto close_reader;
  }

  reader->data = mmap(NULL, reader->data_len, PROT_READ, MAP_SHARED, reader->fd, 0);
  if (reader->data == MAP_FAILED) {
    returncode = SPARKEY_MMAP_FAILED;
    goto close_reader;
  }

  *reader_ref = reader;
  reader->open_status = MAGIC_VALUE_HASHREADER;
  return SPARKEY_SUCCESS;

close_reader:
  sparkey_hash_close(&reader);
  return returncode;

free_reader:
  free(reader);
  return returncode;
}

void sparkey_hash_close(sparkey_hashreader **reader_ref) {
  if (reader_ref == NULL) {
    return;
  }
  sparkey_hashreader *reader = *reader_ref;
  if (reader == NULL) {
    return;
  }

  if (reader->open_status != MAGIC_VALUE_HASHREADER) {
    return;
  }
  sparkey_logreader_close_nodealloc(&reader->log);

  reader->open_status = 0;
  if (reader->data != NULL) {
    munmap(reader->data, reader->data_len);
    reader->data = NULL;
  }
  close(reader->fd);
  reader->fd = -1;
  free(reader);
  *reader_ref = NULL;
}

static sparkey_returncode assert_reader_open(sparkey_hashreader *reader) {
  if (reader->open_status != MAGIC_VALUE_HASHREADER) {
    return SPARKEY_HASH_CLOSED;
  }
  return SPARKEY_SUCCESS;
}

sparkey_returncode sparkey_hash_get(sparkey_hashreader *reader, const uint8_t *key, uint64_t keylen, sparkey_logiter *iter) {
  RETHROW(assert_reader_open(reader));
  uint64_t hash = reader->header.hash_algorithm.hash(key, keylen, reader->header.hash_seed);
  uint64_t wanted_slot = hash % reader->header.hash_capacity;

  int slot_size = reader->header.address_size + reader->header.hash_size;
  uint64_t pos = wanted_slot * slot_size;

  uint64_t displacement = 0;
  uint64_t slot = wanted_slot;

  uint8_t *hashtable = reader->data + reader->header.header_size;

  while (1) {
    uint64_t hash2 = reader->header.hash_algorithm.read_hash(hashtable, pos);
    uint64_t position2 = read_addr(hashtable, pos + reader->header.hash_size, reader->header.address_size);
    if (position2 == 0) {
      iter->state = SPARKEY_ITER_INVALID;
      return SPARKEY_SUCCESS;
    }
    int entry_index2 = (int) (position2) & reader->header.entry_block_bitmask;
    position2 >>= reader->header.entry_block_bits;
    if (hash == hash2) {
      RETHROW(sparkey_logiter_seek(iter, &reader->log, position2));
      RETHROW(sparkey_logiter_skip(iter, &reader->log, entry_index2));
      RETHROW(sparkey_logiter_next(iter, &reader->log));
      uint64_t keylen2 = iter->keylen;
      if (iter->type != SPARKEY_ENTRY_PUT) {
        iter->state = SPARKEY_ITER_INVALID;
        return SPARKEY_INTERNAL_ERROR;
      }
      if (keylen == keylen2) {
        uint64_t pos2 = 0;
        int equals = 1;
        while (pos2 < keylen) {
          uint8_t *buf2;
          uint64_t len2;
          RETHROW(sparkey_logiter_keychunk(iter, &reader->log, keylen, &buf2, &len2));
          if (memcmp(&key[pos2], buf2, len2) != 0) {
            equals = 0;
            break;
          }
          pos2 += len2;
        }
        if (equals) {
          return SPARKEY_SUCCESS;
        }
      }
    }
    uint64_t other_displacement = get_displacement(reader->header.hash_capacity, slot, hash2);
    if (displacement > other_displacement) {
      iter->state = SPARKEY_ITER_INVALID;
      return SPARKEY_SUCCESS;
    }
    pos += slot_size;
    displacement++;
    slot++;
    if (slot >= reader->header.hash_capacity) {
      pos = 0;
      slot = 0;
    }
  }
  iter->state = SPARKEY_ITER_INVALID;
  return SPARKEY_INTERNAL_ERROR;
}

sparkey_returncode sparkey_logiter_hashnext(sparkey_logiter *iter, sparkey_hashreader *reader) {
  RETHROW(assert_reader_open(reader));

  uint8_t *hashtable = reader->data + reader->header.header_size;
  int slot_size = reader->header.address_size + reader->header.hash_size;

  while (1) {
    RETHROW(sparkey_logiter_next(iter, &reader->log));
    if (iter->state != SPARKEY_ITER_ACTIVE) {
      return SPARKEY_SUCCESS;
    }
    if (iter->type != SPARKEY_ENTRY_PUT) {
      continue;
    }
    uint64_t position = (iter->entry_block_position << reader->header.entry_block_bits) | iter->entry_count;

    uint64_t key_hash = sparkey_iter_hash(&reader->header, iter, &reader->log);
    uint64_t wanted_slot = key_hash % reader->header.hash_capacity;

    uint64_t pos = wanted_slot * slot_size;

    uint64_t displacement = 0;
    uint64_t slot = wanted_slot;

    while (1) {
      uint64_t hash2 = reader->header.hash_algorithm.read_hash(hashtable, pos);
      uint64_t position2 = read_addr(hashtable, pos + reader->header.hash_size, reader->header.address_size);
      if (position2 == 0) {
        break;
      }
      if (position == position2) {
        // Found a match! Just reset the iterator
        RETHROW(sparkey_logiter_reset(iter, &reader->log));
        return SPARKEY_SUCCESS;
      }
      uint64_t other_displacement = get_displacement(reader->header.hash_capacity, slot, hash2);
      if (displacement > other_displacement) {
        break;
      }
      pos += slot_size;
      displacement++;
      slot++;
      if (slot >= reader->header.hash_capacity) {
        pos = 0;
        slot = 0;
      }
    }
  }
}

sparkey_logreader * sparkey_hash_getreader(sparkey_hashreader *reader) {
  return &reader->log;
}

uint64_t sparkey_hash_numentries(sparkey_hashreader *reader) {
  return reader->header.num_entries;
}

uint64_t sparkey_hash_numcollisions(sparkey_hashreader *reader) {
  return reader->header.hash_collisions;
}