use std::cmp;
use std::collections::{HashMap, VecDeque};
use std::fmt;
use std::hash::{Hash, Hasher};
use std::marker::PhantomData;
use std::ops::Deref;
use fnv::FnvHasher;
use crate::util::CollectWithCapacity;
pub trait Indexer {
fn index_ngram(&self, ngram: &StrWithCharLen) -> Option<u64>;
fn upper_bound(&self) -> u64;
fn infallible() -> bool;
}
pub trait BucketIndexer: Indexer {
fn new(buckets: usize) -> Self;
fn buckets(&self) -> usize;
}
pub struct HashIndexer<H> {
buckets_exp: usize,
mask: u64,
_phantom: PhantomData<H>,
}
impl<H> BucketIndexer for HashIndexer<H>
where
H: Default + Hasher,
{
fn new(buckets_exp: usize) -> Self {
assert!(
buckets_exp <= 64,
"The largest possible buckets exponent is 64."
);
let mask = if buckets_exp == 64 {
!0
} else {
(1 << buckets_exp) - 1
};
HashIndexer {
buckets_exp,
mask,
_phantom: PhantomData,
}
}
fn buckets(&self) -> usize {
self.buckets_exp as usize
}
}
impl<H> Clone for HashIndexer<H> {
fn clone(&self) -> Self {
HashIndexer {
buckets_exp: self.buckets_exp,
mask: self.mask,
_phantom: PhantomData,
}
}
}
impl<H> Copy for HashIndexer<H> {}
impl<H> fmt::Debug for HashIndexer<H> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "HashIndexer<impl Hasher> {{ mask: {} }}", self.mask)
}
}
impl<H> Eq for HashIndexer<H> {}
impl<H> Indexer for HashIndexer<H>
where
H: Default + Hasher,
{
fn index_ngram(&self, ngram: &StrWithCharLen) -> Option<u64> {
let mut hasher = H::default();
ngram.hash(&mut hasher);
Some(hasher.finish() & self.mask)
}
fn upper_bound(&self) -> u64 {
2u64.pow(self.buckets_exp as u32)
}
fn infallible() -> bool {
true
}
}
impl<H> PartialEq for HashIndexer<H> {
fn eq(&self, other: &Self) -> bool {
self.mask.eq(&other.mask)
}
}
pub type FinalfusionHashIndexer = HashIndexer<FnvHasher>;
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ExplicitIndexer {
ngrams: Vec<String>,
index: HashMap<String, u64>,
bound: usize,
}
impl ExplicitIndexer {
pub fn ngrams(&self) -> &[String] {
&self.ngrams
}
}
impl ExplicitIndexer {
pub fn new(ngrams: impl Into<Vec<String>>) -> Self {
let ngrams = ngrams.into();
let index = ngrams
.iter()
.cloned()
.enumerate()
.map(|(idx, ngram)| (ngram, idx as u64))
.collect::<HashMap<String, u64>>();
assert_eq!(
index.len(),
ngrams.len(),
"ngrams contained duplicate entries."
);
let bound = index.len();
ExplicitIndexer {
ngrams,
index,
bound,
}
}
pub fn new_with_indices(ngram_tuples: Vec<(String, u64)>) -> Self {
let mut old_to_new_indices = HashMap::new();
let mut index = HashMap::with_capacity(ngram_tuples.len());
let mut ngrams = Vec::with_capacity(ngram_tuples.len());
for (ngram, bucket) in ngram_tuples {
let cur_idx = old_to_new_indices.len();
let new_idx = *old_to_new_indices.entry(bucket).or_insert(cur_idx);
assert!(
index.insert(ngram.clone(), new_idx as u64).is_none(),
"ngrams contains duplicate entries."
);
ngrams.push(ngram);
}
let bound = old_to_new_indices.len();
ExplicitIndexer {
ngrams,
index,
bound,
}
}
}
impl Indexer for ExplicitIndexer {
fn index_ngram(&self, ngram: &StrWithCharLen) -> Option<u64> {
self.index.get(ngram.inner).cloned()
}
fn upper_bound(&self) -> u64 {
self.bound as u64
}
fn infallible() -> bool {
false
}
}
pub struct StrWithCharLen<'a> {
inner: &'a str,
char_len: usize,
}
impl<'a> From<&'a str> for StrWithCharLen<'a> {
fn from(s: &'a str) -> Self {
StrWithCharLen::new(s)
}
}
impl<'a> StrWithCharLen<'a> {
pub fn new(s: &'a str) -> Self {
let char_len = s.chars().count();
StrWithCharLen { inner: s, char_len }
}
pub fn as_str(&self) -> &str {
self.inner
}
pub fn char_len(&self) -> usize {
self.char_len
}
}
impl<'a> Deref for StrWithCharLen<'a> {
type Target = str;
fn deref(&self) -> &Self::Target {
self.inner
}
}
impl<'a> Hash for StrWithCharLen<'a> {
fn hash<H>(&self, hasher: &mut H)
where
H: Hasher,
{
hasher.write(&(self.char_len as u64).to_le_bytes());
self.inner
.chars()
.for_each(|ch| hasher.write(&(ch as u32).to_le_bytes()));
}
}
pub struct NGrams<'a> {
max_n: usize,
min_n: usize,
string: &'a str,
char_offsets: VecDeque<usize>,
ngram_len: usize,
}
impl<'a> NGrams<'a> {
pub fn new(string: &'a str, min_n: usize, max_n: usize) -> Self {
assert!(min_n != 0, "The minimum n-gram length cannot be zero.");
assert!(
min_n <= max_n,
"The maximum length should be equal to or greater than the minimum length."
);
let char_offsets = string
.char_indices()
.map(|(idx, _)| idx)
.collect_with_capacity::<VecDeque<_>>(string.len());
let ngram_len = cmp::min(max_n, char_offsets.len());
NGrams {
min_n,
max_n,
string,
char_offsets,
ngram_len,
}
}
}
impl<'a> Iterator for NGrams<'a> {
type Item = StrWithCharLen<'a>;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
if self.ngram_len < self.min_n {
self.char_offsets.pop_front();
if self.char_offsets.len() < self.min_n {
return None;
}
self.ngram_len = cmp::min(self.max_n, self.char_offsets.len());
}
let ngram = if self.ngram_len == self.char_offsets.len() {
&self.string[self.char_offsets[0]..]
} else {
&self.string[self.char_offsets[0]..self.char_offsets[self.ngram_len]]
};
let ngram_with_len = StrWithCharLen {
inner: ngram,
char_len: self.ngram_len,
};
self.ngram_len -= 1;
Some(ngram_with_len)
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let cap_approx = (self.max_n - self.min_n + 1) * self.char_offsets.len();
(cap_approx, Some(cap_approx))
}
}
pub trait SubwordIndices<'a, 'b, I>
where
I: Indexer + 'b,
{
type Iter: Iterator<Item = (&'a str, Option<u64>)>;
fn subword_indices(
&'a self,
min_n: usize,
max_n: usize,
indexer: &'b I,
) -> Box<dyn Iterator<Item = u64> + 'a>
where
'b: 'a,
{
Box::new(
self.subword_indices_with_ngrams(min_n, max_n, indexer)
.filter_map(|(_, idx)| idx),
)
}
fn subword_indices_with_ngrams(
&'a self,
min_n: usize,
max_n: usize,
indexer: &'b I,
) -> Self::Iter;
}
impl<'a, 'b, I> SubwordIndices<'a, 'b, I> for str
where
I: Indexer + 'b,
{
type Iter = NGramsIndicesIter<'a, 'b, I>;
fn subword_indices_with_ngrams(
&'a self,
min_n: usize,
max_n: usize,
indexer: &'b I,
) -> Self::Iter {
NGramsIndicesIter::new(self, min_n, max_n, indexer)
}
}
pub struct NGramsIndicesIter<'a, 'b, I> {
ngrams: NGrams<'a>,
indexer: &'b I,
}
impl<'a, 'b, I> NGramsIndicesIter<'a, 'b, I> {
pub fn new(string: &'a str, min_n: usize, max_n: usize, indexer: &'b I) -> Self {
let ngrams = NGrams::new(string, min_n, max_n);
NGramsIndicesIter { indexer, ngrams }
}
}
impl<'a, 'b, I> Iterator for NGramsIndicesIter<'a, 'b, I>
where
I: Indexer,
{
type Item = (&'a str, Option<u64>);
#[inline]
fn next(&mut self) -> Option<Self::Item> {
self.ngrams
.next()
.map(|ngram| (ngram.inner, self.indexer.index_ngram(&ngram)))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.ngrams.size_hint()
}
}
#[cfg(test)]
mod tests {
use lazy_static::lazy_static;
use maplit::hashmap;
use std::collections::HashMap;
use super::{BucketIndexer, FinalfusionHashIndexer, NGrams, SubwordIndices};
#[test]
fn ngrams_test() {
let mut hello_check: Vec<&str> = vec![
"h", "he", "hel", "e", "el", "ell", "l", "ll", "llö", "l", "lö", "lö ", "ö", "ö ",
"ö w", " ", " w", " wo", "w", "wo", "wor", "o", "or", "orl", "r", "rl", "rld", "l",
"ld", "d",
];
hello_check.sort();
let mut hello_ngrams: Vec<_> = NGrams::new("hellö world", 1, 3).map(|s| s.inner).collect();
hello_ngrams.sort();
assert_eq!(hello_check, hello_ngrams);
}
#[test]
fn ngrams_23_test() {
let mut hello_check: Vec<&str> = vec![
"he", "hel", "el", "ell", "ll", "llo", "lo", "lo ", "o ", "o w", " w", " wo", "wo",
"wor", "or", "orl", "rl", "rld", "ld",
];
hello_check.sort();
let mut hello_ngrams: Vec<_> = NGrams::new("hello world", 2, 3).map(|s| s.inner).collect();
hello_ngrams.sort();
assert_eq!(hello_check, hello_ngrams);
}
#[test]
fn short_ngram_test() {
let mut yep_check: Vec<&str> = vec!["ˈjə", "jəp", "ˈjəp"];
yep_check.sort();
let mut yep_ngrams: Vec<_> = NGrams::new("ˈjəp", 3, 6).map(|s| s.inner).collect();
yep_ngrams.sort();
assert_eq!(yep_check, yep_ngrams);
}
#[test]
fn empty_ngram_test() {
let check: &[&str] = &[];
assert_eq!(
NGrams::new("", 1, 3).map(|s| s.inner).collect::<Vec<_>>(),
check
);
}
#[test]
#[should_panic]
fn incorrect_min_n_test() {
NGrams::new("", 0, 3);
}
#[test]
#[should_panic]
fn incorrect_max_n_test() {
NGrams::new("", 2, 1);
}
lazy_static! {
static ref SUBWORD_TESTS_2: HashMap<&'static str, Vec<u64>> = hashmap! {
"<Daniël>" =>
vec![0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
"<hallo>" =>
vec![0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3],
};
}
lazy_static! {
static ref SUBWORD_TESTS_21: HashMap<&'static str, Vec<u64>> = hashmap! {
"<Daniël>" =>
vec![214157, 233912, 311961, 488897, 620206, 741276, 841219,
1167494, 1192256, 1489905, 1532271, 1644730, 1666166,
1679745, 1680294, 1693100, 2026735, 2065822],
"<hallo>" =>
vec![75867, 104120, 136555, 456131, 599360, 722393, 938007,
985859, 1006102, 1163391, 1218704, 1321513, 1505861,
1892376],
};
}
lazy_static! {
static ref NGRAMS_INDICES_TESTS_36: HashMap<&'static str, Vec<(&'static str, u64)>> = [
(
"<Daniël>",
vec![
("Dan", 214157),
("iël", 233912),
("Danië", 311961),
("iël>", 488897),
("niël>", 620206),
("anië", 741276),
("Dani", 841219),
("Daniël", 1167494),
("ani", 1192256),
("niël", 1489905),
("ël>", 1532271),
("nië", 1644730),
("<Dan", 1666166),
("aniël", 1679745),
("<Danië", 1680294),
("aniël>", 1693100),
("<Da", 2026735),
("<Dani", 2065822)
]
),
(
"<hallo>",
vec![
("lo>", 75867),
("<hal", 104120),
("hallo>", 136555),
("hal", 456131),
("allo>", 599360),
("llo", 722393),
("all", 938007),
("<ha", 985859),
("hallo", 1006102),
("allo", 1163391),
("llo>", 1218704),
("<hallo", 1321513),
("<hall", 1505861),
("hall", 1892376)
]
)
]
.iter()
.cloned()
.collect();
}
#[test]
fn subword_indices_4_test() {
let indexer = FinalfusionHashIndexer::new(2);
for (word, indices_check) in SUBWORD_TESTS_2.iter() {
let mut indices = word.subword_indices(3, 6, &indexer).collect::<Vec<_>>();
indices.sort();
assert_eq!(indices_check, &indices);
}
}
#[test]
fn subword_indices_2m_test() {
let indexer = FinalfusionHashIndexer::new(21);
for (word, indices_check) in SUBWORD_TESTS_21.iter() {
let mut indices = word.subword_indices(3, 6, &indexer).collect::<Vec<_>>();
indices.sort();
assert_eq!(indices_check, &indices);
}
}
#[test]
fn ngrams_indices_2m_test() {
let indexer = FinalfusionHashIndexer::new(21);
for (word, ngrams_indices_check) in NGRAMS_INDICES_TESTS_36.iter() {
let mut ngrams_indices_test = word
.subword_indices_with_ngrams(3, 6, &indexer)
.collect::<Vec<_>>();
ngrams_indices_test.sort_by_key(|ngrams_indices_pairs| ngrams_indices_pairs.1);
for (iter_check, iter_test) in ngrams_indices_check.into_iter().zip(ngrams_indices_test)
{
assert_eq!(iter_check.0, iter_test.0);
}
}
}
}