use std::cmp;
use std::collections::{HashMap, VecDeque};
use std::fmt;
use std::hash::{Hash, Hasher};
use std::marker::PhantomData;
use std::ops::Deref;
use fnv::FnvHasher;
use smallvec::{smallvec, SmallVec};
use crate::util::CollectWithCapacity;
pub type NGramVec = SmallVec<[u64; 4]>;
pub trait Indexer {
fn index_ngram(&self, ngram: &StrWithCharLen) -> NGramVec;
fn upper_bound(&self) -> u64;
fn infallible() -> bool;
fn scope() -> IndicesScope;
}
pub trait BucketIndexer: Indexer {
fn new(buckets: usize) -> Self;
fn buckets(&self) -> usize;
}
pub struct HashIndexer<H> {
buckets_exp: usize,
mask: u64,
_phantom: PhantomData<H>,
}
impl<H> BucketIndexer for HashIndexer<H>
where
H: Default + Hasher,
{
fn new(buckets_exp: usize) -> Self {
assert!(
buckets_exp <= 64,
"The largest possible buckets exponent is 64."
);
let mask = if buckets_exp == 64 {
!0
} else {
(1 << buckets_exp) - 1
};
HashIndexer {
buckets_exp,
mask,
_phantom: PhantomData,
}
}
fn buckets(&self) -> usize {
self.buckets_exp as usize
}
}
impl<H> Clone for HashIndexer<H> {
fn clone(&self) -> Self {
HashIndexer {
buckets_exp: self.buckets_exp,
mask: self.mask,
_phantom: PhantomData,
}
}
}
impl<H> Copy for HashIndexer<H> {}
impl<H> fmt::Debug for HashIndexer<H> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "HashIndexer<impl Hasher> {{ mask: {} }}", self.mask)
}
}
impl<H> Eq for HashIndexer<H> {}
impl<H> Indexer for HashIndexer<H>
where
H: Default + Hasher,
{
fn index_ngram(&self, ngram: &StrWithCharLen) -> NGramVec {
let mut hasher = H::default();
ngram.hash(&mut hasher);
smallvec![hasher.finish() & self.mask]
}
fn upper_bound(&self) -> u64 {
2u64.pow(self.buckets_exp as u32)
}
fn infallible() -> bool {
true
}
fn scope() -> IndicesScope {
IndicesScope::Substrings
}
}
impl<H> PartialEq for HashIndexer<H> {
fn eq(&self, other: &Self) -> bool {
self.mask.eq(&other.mask)
}
}
pub type FinalfusionHashIndexer = HashIndexer<FnvHasher>;
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ExplicitIndexer {
ngrams: Vec<String>,
index: HashMap<String, u64>,
bound: usize,
}
impl ExplicitIndexer {
pub fn ngrams(&self) -> &[String] {
&self.ngrams
}
}
impl ExplicitIndexer {
pub fn new(ngrams: impl Into<Vec<String>>) -> Self {
let ngrams = ngrams.into();
let index = ngrams
.iter()
.cloned()
.enumerate()
.map(|(idx, ngram)| (ngram, idx as u64))
.collect::<HashMap<String, u64>>();
assert_eq!(
index.len(),
ngrams.len(),
"ngrams contained duplicate entries."
);
let bound = index.len();
ExplicitIndexer {
ngrams,
index,
bound,
}
}
pub fn new_with_indices(
ngram_tuples: impl IntoIterator<Item = (String, u64)>,
) -> (Self, HashMap<u64, usize>) {
let ngram_tuples = ngram_tuples.into_iter();
let mut old_to_new_indices = HashMap::with_capacity(ngram_tuples.size_hint().0);
let mut index = HashMap::with_capacity(ngram_tuples.size_hint().0);
let mut ngrams = Vec::with_capacity(ngram_tuples.size_hint().0);
for (ngram, bucket) in ngram_tuples {
let cur_idx = old_to_new_indices.len();
let new_idx = *old_to_new_indices.entry(bucket).or_insert(cur_idx);
assert!(
index.insert(ngram.clone(), new_idx as u64).is_none(),
"ngrams contains duplicate entries."
);
ngrams.push(ngram);
}
let bound = old_to_new_indices.len();
(
ExplicitIndexer {
ngrams,
index,
bound,
},
old_to_new_indices,
)
}
}
impl Indexer for ExplicitIndexer {
fn index_ngram(&self, ngram: &StrWithCharLen) -> NGramVec {
match self.index.get(ngram.inner) {
Some(&idx) => smallvec![idx],
None => smallvec![],
}
}
fn upper_bound(&self) -> u64 {
self.bound as u64
}
fn infallible() -> bool {
false
}
fn scope() -> IndicesScope {
IndicesScope::Substrings
}
}
pub struct StrWithCharLen<'a> {
inner: &'a str,
char_len: usize,
}
impl<'a> From<&'a str> for StrWithCharLen<'a> {
fn from(s: &'a str) -> Self {
StrWithCharLen::new(s)
}
}
impl<'a> StrWithCharLen<'a> {
pub fn new(s: &'a str) -> Self {
let char_len = s.chars().count();
StrWithCharLen { inner: s, char_len }
}
pub fn as_str(&self) -> &str {
self.inner
}
pub fn char_len(&self) -> usize {
self.char_len
}
}
impl<'a> Deref for StrWithCharLen<'a> {
type Target = str;
fn deref(&self) -> &Self::Target {
self.inner
}
}
impl<'a> Hash for StrWithCharLen<'a> {
fn hash<H>(&self, hasher: &mut H)
where
H: Hasher,
{
hasher.write(&(self.char_len as u64).to_le_bytes());
self.inner
.chars()
.for_each(|ch| hasher.write(&(ch as u32).to_le_bytes()));
}
}
pub struct NGrams<'a> {
max_n: usize,
min_n: usize,
string: &'a str,
char_offsets: VecDeque<usize>,
ngram_len: usize,
}
impl<'a> NGrams<'a> {
pub fn new(string: &'a str, min_n: usize, max_n: usize) -> Self {
assert!(min_n != 0, "The minimum n-gram length cannot be zero.");
assert!(
min_n <= max_n,
"The maximum length should be equal to or greater than the minimum length."
);
let char_offsets = string
.char_indices()
.map(|(idx, _)| idx)
.collect_with_capacity::<VecDeque<_>>(string.len());
let ngram_len = cmp::min(max_n, char_offsets.len());
NGrams {
max_n,
min_n,
string,
char_offsets,
ngram_len,
}
}
}
impl<'a> Iterator for NGrams<'a> {
type Item = StrWithCharLen<'a>;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
if self.ngram_len < self.min_n {
self.char_offsets.pop_front();
if self.char_offsets.len() < self.min_n {
return None;
}
self.ngram_len = cmp::min(self.max_n, self.char_offsets.len());
}
let ngram = if self.ngram_len == self.char_offsets.len() {
&self.string[self.char_offsets[0]..]
} else {
&self.string[self.char_offsets[0]..self.char_offsets[self.ngram_len]]
};
let ngram_with_len = StrWithCharLen {
inner: ngram,
char_len: self.ngram_len,
};
self.ngram_len -= 1;
Some(ngram_with_len)
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let cap_approx = (self.max_n - self.min_n + 1) * self.char_offsets.len();
(cap_approx, Some(cap_approx))
}
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum IndicesScope {
Substrings,
StringAndSubstrings,
}
pub trait SubwordIndices<'a, 'b, I>
where
I: Indexer + 'b,
{
type Iter: Iterator<Item = (&'a str, NGramVec)>;
fn subword_indices(
&'a self,
min_n: usize,
max_n: usize,
indexer: &'b I,
) -> Box<dyn Iterator<Item = u64> + 'a>
where
'b: 'a,
{
Box::new(
self.subword_indices_with_ngrams(min_n, max_n, indexer)
.flat_map(|(_, indices)| indices),
)
}
fn subword_indices_with_ngrams(
&'a self,
min_n: usize,
max_n: usize,
indexer: &'b I,
) -> Self::Iter;
}
impl<'a, 'b, I> SubwordIndices<'a, 'b, I> for str
where
I: Indexer + 'b,
{
type Iter = NGramsIndicesIter<'a, 'b, I>;
fn subword_indices_with_ngrams(
&'a self,
min_n: usize,
max_n: usize,
indexer: &'b I,
) -> Self::Iter {
NGramsIndicesIter::new(self, min_n, max_n, indexer)
}
}
pub struct NGramsIndicesIter<'a, 'b, I> {
string: Option<&'a str>,
ngrams: NGrams<'a>,
indexer: &'b I,
}
impl<'a, 'b, I> NGramsIndicesIter<'a, 'b, I> {
pub fn new(string: &'a str, min_n: usize, max_n: usize, indexer: &'b I) -> Self
where
I: Indexer,
{
let ngrams = NGrams::new(string, min_n, max_n);
let string = match I::scope() {
IndicesScope::Substrings => None,
IndicesScope::StringAndSubstrings => Some(string),
};
NGramsIndicesIter {
ngrams,
indexer,
string,
}
}
}
impl<'a, 'b, I> Iterator for NGramsIndicesIter<'a, 'b, I>
where
I: Indexer,
{
type Item = (&'a str, NGramVec);
#[inline]
fn next(&mut self) -> Option<Self::Item> {
if let Some(string) = self.string.take() {
return Some((string, self.indexer.index_ngram(&string.into())));
}
self.ngrams
.next()
.map(|ngram| (ngram.inner, self.indexer.index_ngram(&ngram)))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.ngrams.size_hint()
}
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use lazy_static::lazy_static;
use maplit::hashmap;
use smallvec::smallvec;
use super::{BucketIndexer, FinalfusionHashIndexer, NGrams, SubwordIndices};
use crate::subword::NGramVec;
#[test]
fn ngrams_test() {
let mut hello_check: Vec<&str> = vec![
"h", "he", "hel", "e", "el", "ell", "l", "ll", "llö", "l", "lö", "lö ", "ö", "ö ",
"ö w", " ", " w", " wo", "w", "wo", "wor", "o", "or", "orl", "r", "rl", "rld", "l",
"ld", "d",
];
hello_check.sort_unstable();
let mut hello_ngrams: Vec<_> = NGrams::new("hellö world", 1, 3).map(|s| s.inner).collect();
hello_ngrams.sort_unstable();
assert_eq!(hello_check, hello_ngrams);
}
#[test]
fn ngrams_23_test() {
let mut hello_check: Vec<&str> = vec![
"he", "hel", "el", "ell", "ll", "llo", "lo", "lo ", "o ", "o w", " w", " wo", "wo",
"wor", "or", "orl", "rl", "rld", "ld",
];
hello_check.sort_unstable();
let mut hello_ngrams: Vec<_> = NGrams::new("hello world", 2, 3).map(|s| s.inner).collect();
hello_ngrams.sort_unstable();
assert_eq!(hello_check, hello_ngrams);
}
#[test]
fn short_ngram_test() {
let mut yep_check: Vec<&str> = vec!["ˈjə", "jəp", "ˈjəp"];
yep_check.sort_unstable();
let mut yep_ngrams: Vec<_> = NGrams::new("ˈjəp", 3, 6).map(|s| s.inner).collect();
yep_ngrams.sort_unstable();
assert_eq!(yep_check, yep_ngrams);
}
#[test]
fn empty_ngram_test() {
let check: &[&str] = &[];
assert_eq!(
NGrams::new("", 1, 3).map(|s| s.inner).collect::<Vec<_>>(),
check
);
}
#[test]
#[should_panic]
fn incorrect_min_n_test() {
NGrams::new("", 0, 3);
}
#[test]
#[should_panic]
fn incorrect_max_n_test() {
NGrams::new("", 2, 1);
}
lazy_static! {
static ref SUBWORD_TESTS_2: HashMap<&'static str, Vec<u64>> = hashmap! {
"<Daniël>" =>
vec![0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
"<hallo>" =>
vec![0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3],
};
}
lazy_static! {
static ref SUBWORD_TESTS_21: HashMap<&'static str, Vec<u64>> = hashmap! {
"<Daniël>" =>
vec![214157, 233912, 311961, 488897, 620206, 741276, 841219,
1167494, 1192256, 1489905, 1532271, 1644730, 1666166,
1679745, 1680294, 1693100, 2026735, 2065822],
"<hallo>" =>
vec![75867, 104120, 136555, 456131, 599360, 722393, 938007,
985859, 1006102, 1163391, 1218704, 1321513, 1505861,
1892376],
};
}
lazy_static! {
static ref NGRAMS_INDICES_TESTS_36: HashMap<&'static str, Vec<(&'static str, NGramVec)>> =
[
(
"<Daniël>",
vec![
("Dan", smallvec![214157]),
("iël", smallvec![233912]),
("Danië", smallvec![311961]),
("iël>", smallvec![488897]),
("niël>", smallvec![620206]),
("anië", smallvec![741276]),
("Dani", smallvec![841219]),
("Daniël", smallvec![1167494]),
("ani", smallvec![1192256]),
("niël", smallvec![1489905]),
("ël>", smallvec![1532271]),
("nië", smallvec![1644730]),
("<Dan", smallvec![1666166]),
("aniël", smallvec![1679745]),
("<Danië", smallvec![1680294]),
("aniël>", smallvec![1693100]),
("<Da", smallvec![2026735]),
("<Dani", smallvec![2065822])
]
),
(
"<hallo>",
vec![
("lo>", smallvec![75867]),
("<hal", smallvec![104120]),
("hallo>", smallvec![136555]),
("hal", smallvec![456131]),
("allo>", smallvec![599360]),
("llo", smallvec![722393]),
("all", smallvec![938007]),
("<ha", smallvec![985859]),
("hallo", smallvec![1006102]),
("allo", smallvec![1163391]),
("llo>", smallvec![1218704]),
("<hallo", smallvec![1321513]),
("<hall", smallvec![1505861]),
("hall", smallvec![1892376])
]
)
]
.iter()
.cloned()
.collect();
}
#[test]
fn subword_indices_4_test() {
let indexer = FinalfusionHashIndexer::new(2);
for (word, indices_check) in SUBWORD_TESTS_2.iter() {
let mut indices = word.subword_indices(3, 6, &indexer).collect::<Vec<_>>();
indices.sort_unstable();
assert_eq!(indices_check, &indices);
}
}
#[test]
fn subword_indices_2m_test() {
let indexer = FinalfusionHashIndexer::new(21);
for (word, indices_check) in SUBWORD_TESTS_21.iter() {
let mut indices = word.subword_indices(3, 6, &indexer).collect::<Vec<_>>();
indices.sort_unstable();
assert_eq!(indices_check, &indices);
}
}
#[test]
fn ngrams_indices_2m_test() {
let indexer = FinalfusionHashIndexer::new(21);
for (word, ngrams_indices_check) in NGRAMS_INDICES_TESTS_36.iter() {
let mut ngrams_indices_test = word
.subword_indices_with_ngrams(3, 6, &indexer)
.collect::<Vec<_>>();
ngrams_indices_test.sort_by_key(|ngrams_indices_pairs| ngrams_indices_pairs.1.clone());
for (iter_check, iter_test) in ngrams_indices_check.into_iter().zip(ngrams_indices_test)
{
assert_eq!(iter_check.0, iter_test.0);
}
}
}
}