use std::i32;
use smallvec::smallvec;
use crate::subword::{BucketIndexer, Indexer, IndicesScope, NGramVec, StrWithCharLen};
/// Indexer that assigns n-grams to hash buckets.
///
/// An n-gram's index is its fastText-style hash reduced modulo the number
/// of buckets stored here.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FastTextIndexer {
    /// Number of hash buckets.
    buckets: u32,
}
impl BucketIndexer for FastTextIndexer {
    /// Creates an indexer that distributes n-grams over `buckets` buckets.
    ///
    /// # Panics
    ///
    /// Panics when `buckets` exceeds `i32::MAX`.
    fn new(buckets: usize) -> Self {
        let limit = i32::MAX as usize;
        if buckets > limit {
            panic!(
                "The largest possible number of buckets is: {}",
                i32::MAX
            );
        }

        // The cast cannot truncate: the guard above bounds `buckets` by
        // `i32::MAX`, which fits in a `u32`.
        Self {
            buckets: buckets as u32,
        }
    }

    /// Returns the number of buckets this indexer was constructed with.
    fn buckets(&self) -> usize {
        self.buckets as usize
    }
}
impl Indexer for FastTextIndexer {
    /// Hashes `ngram` and returns the single bucket index it maps to.
    ///
    /// The index is the n-gram's hash modulo the bucket count, so it is
    /// always strictly smaller than [`upper_bound`](Self::upper_bound).
    ///
    /// # Panics
    ///
    /// Panics if the indexer holds zero buckets, because the remainder
    /// operation then divides by zero.
    fn index_ngram(&self, ngram: &StrWithCharLen) -> NGramVec {
        let hash = fasttext_hash(ngram.as_str());
        let bucket = hash % self.buckets;
        smallvec![u64::from(bucket)]
    }

    /// Returns the bucket count as a `u64`.
    fn upper_bound(&self) -> u64 {
        self.buckets.into()
    }

    /// Always reports `true` for this indexer.
    fn infallible() -> bool {
        true
    }

    /// Reports [`IndicesScope::Substrings`] as this indexer's scope.
    fn scope() -> IndicesScope {
        IndicesScope::Substrings
    }
}
/// Computes the 32-bit hash of `ngram` over its UTF-8 bytes.
///
/// The routine has the shape of FNV-1a (xor with the byte, then a wrapping
/// multiply by the prime), with one twist: every byte is first
/// reinterpreted as a signed 8-bit value and sign-extended to 32 bits.
/// Bytes `>= 0x80` therefore flip the upper 24 bits of the state as well,
/// so non-ASCII input hashes differently from textbook FNV-1a.
fn fasttext_hash(ngram: &str) -> u32 {
    const OFFSET_BASIS: u32 = 2_166_136_261;
    const PRIME: u32 = 16_777_619;

    ngram.bytes().fold(OFFSET_BASIS, |state, byte| {
        // `u8 -> i8` keeps the bit pattern; `i8 -> u32` sign-extends.
        let sign_extended = (byte as i8) as u32;
        (state ^ sign_extended).wrapping_mul(PRIME)
    })
}
#[cfg(test)]
mod tests {
    use std::collections::HashMap;

    use lazy_static::lazy_static;

    use super::FastTextIndexer;
    use crate::subword::{BucketIndexer, SubwordIndices};

    lazy_static! {
        // Expected indices, in ascending order, for n-gram lengths 3 through 6
        // with 2,000,000 buckets.
        static ref SUBWORD_TESTS: HashMap<&'static str, Vec<u64>> = vec![
            (
                "<Daniël>",
                vec![
                    69886, 84537, 338340, 441697, 448390, 468430, 504093, 573175, 749365, 804851,
                    811506, 991985, 1022467, 1105725, 1249224, 1418443, 1493412, 1880616
                ]
            ),
            (
                "<überspringen>",
                vec![
                    79599, 119685, 255527, 263610, 352266, 385524, 403356, 421853, 485366, 488156,
                    586161, 619228, 629649, 642367, 716781, 751724, 754367, 771707, 799583, 887882,
                    894109, 904527, 908492, 978563, 991164, 992241, 1142035, 1230973, 1278156,
                    1350653, 1414694, 1513262, 1533308, 1607098, 1607788, 1664269, 1712300,
                    1749574, 1793082, 1891605, 1934955, 1992797
                ]
            ),
        ]
        .into_iter()
        .collect();

        // Expected indices, in ascending order, when only n-grams of length 5
        // are extracted, again with 2,000,000 buckets.
        static ref SUBWORD_TESTS_5_5: HashMap<&'static str, Vec<u64>> = vec![
            ("<Daniël>", vec![441697, 749365, 1105725, 1880616]),
            (
                "<überspringen>",
                vec![
                    79599, 352266, 385524, 629649, 716781, 978563, 991164, 1230973, 1350653,
                    1992797
                ]
            )
        ]
        .into_iter()
        .collect();
    }

    /// Checks the indices produced for n-gram lengths 3 through 6.
    #[test]
    fn subword_indices_test() {
        let indexer = FastTextIndexer::new(2_000_000);

        for (word, expected) in SUBWORD_TESTS.iter() {
            // The produced order is not part of the check, so sort first.
            let mut actual: Vec<_> = word.subword_indices(3, 6, &indexer).collect();
            actual.sort_unstable();
            assert_eq!(expected, &actual);
        }
    }

    /// Checks the indices produced when minimum and maximum length are both 5.
    #[test]
    fn subword_indices_test_5_5() {
        let indexer = FastTextIndexer::new(2_000_000);

        for (word, expected) in SUBWORD_TESTS_5_5.iter() {
            // The produced order is not part of the check, so sort first.
            let mut actual: Vec<_> = word.subword_indices(5, 5, &indexer).collect();
            actual.sort_unstable();
            assert_eq!(expected, &actual);
        }
    }
}