use std::sync::Mutex;
use crate::errors::{FindSimdocError, Result};
use crate::feature::{FeatureConfig, FeatureExtractor};
use crate::lsh::minhash::MinHasher;
use all_pairs_hamming::chunked_join::ChunkedJoiner;
use rand::{RngCore, SeedableRng};
use rayon::prelude::*;
/// State for turning documents into sketches and searching those sketches
/// for similar pairs.
///
/// A value is created by `new`, filled by one of the `build_sketches*`
/// methods, and then queried with `search_similar_pairs`.
pub struct JaccardSearcher {
/// Feature configuration handed to `FeatureExtractor::new` when sketches are built.
config: FeatureConfig,
/// Hasher whose `iter` output over a document's extracted features forms its sketch.
hasher: MinHasher,
/// Joiner holding the built sketches; `None` until a `build_sketches*` call succeeds.
joiner: Option<ChunkedJoiner<u64>>,
/// Whether progress messages are printed to stderr while building sketches.
shows_progress: bool,
}
impl JaccardSearcher {
    /// Creates a searcher that holds no sketches yet.
    ///
    /// # Arguments
    ///
    /// * `window_size` - Forwarded unchanged to `FeatureConfig::new`.
    /// * `delimiter` - Forwarded unchanged to `FeatureConfig::new`.
    /// * `seed` - Master seed; when `None`, a random `u64` is drawn instead.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `FeatureConfig::new`.
    pub fn new(window_size: usize, delimiter: Option<char>, seed: Option<u64>) -> Result<Self> {
        let seed = seed.unwrap_or_else(rand::random::<u64>);
        // A single master seed derives the sub-seeds for the feature configuration
        // and the hasher. The draw order (config first, hasher second) must stay
        // fixed so that a given master seed keeps producing the same sub-seeds.
        let mut seeder = rand_xoshiro::SplitMix64::seed_from_u64(seed);
        let config = FeatureConfig::new(window_size, delimiter, seeder.next_u64())?;
        let hasher = MinHasher::new(seeder.next_u64());
        Ok(Self {
            config,
            hasher,
            joiner: None,
            shows_progress: false,
        })
    }

    /// Sets whether progress messages are printed to stderr (initially `false`).
    ///
    /// The flag is also passed on to the `ChunkedJoiner` created by the
    /// `build_sketches*` methods.
    pub const fn shows_progress(mut self, yes: bool) -> Self {
        self.shows_progress = yes;
        self
    }

    /// Builds sketches for `documents` sequentially and stores them in the
    /// searcher, replacing any sketches built earlier.
    ///
    /// # Arguments
    ///
    /// * `documents` - Documents to sketch, in the order they should be added.
    /// * `num_chunks` - Passed to `ChunkedJoiner::new`.
    ///
    /// # Errors
    ///
    /// Returns an input error as soon as an empty document is encountered.
    ///
    /// # Panics
    ///
    /// Panics if the joiner refuses a sketch produced by the hasher.
    pub fn build_sketches<I, D>(mut self, documents: I, num_chunks: usize) -> Result<Self>
    where
        I: IntoIterator<Item = D>,
        D: AsRef<str>,
    {
        let mut joiner = ChunkedJoiner::<u64>::new(num_chunks).shows_progress(self.shows_progress);
        let extractor = FeatureExtractor::new(&self.config);
        // One feature buffer is reused for every document.
        let mut feature = vec![];
        for (i, doc) in documents.into_iter().enumerate() {
            if self.shows_progress && (i + 1) % 10000 == 0 {
                eprintln!("Processed {} documents...", i + 1);
            }
            let doc = doc.as_ref();
            if doc.is_empty() {
                return Err(FindSimdocError::input("Input document must not be empty."));
            }
            extractor.extract(doc, &mut feature);
            joiner
                .add(self.hasher.iter(&feature))
                .expect("the joiner must accept a sketch produced by the hasher");
        }
        self.joiner = Some(joiner);
        Ok(self)
    }

    /// Builds sketches for `documents` in parallel and stores them in the
    /// searcher, replacing any sketches built earlier.
    ///
    /// Sketches are computed on the rayon thread pool and then added to the
    /// joiner in the original input order.
    ///
    /// # Arguments
    ///
    /// * `documents` - Documents to sketch.
    /// * `num_chunks` - Passed to `ChunkedJoiner::new`; also the number of hash
    ///   values taken from the hasher for each document.
    ///
    /// # Errors
    ///
    /// Returns an input error if any document is empty, matching
    /// `build_sketches`.
    ///
    /// # Panics
    ///
    /// Panics if the joiner refuses a sketch produced by the hasher, or if the
    /// progress counter's lock is poisoned.
    pub fn build_sketches_in_parallel<I, D>(
        mut self,
        documents: I,
        num_chunks: usize,
    ) -> Result<Self>
    where
        I: Iterator<Item = D> + Send,
        D: AsRef<str> + Send,
    {
        let extractor = FeatureExtractor::new(&self.config);
        // The increment and the progress print share one critical section, so
        // progress lines come out in counting order.
        // NOTE(review): presumably this is why a Mutex is used rather than an
        // atomic counter; confirm before changing it.
        #[allow(clippy::mutex_atomic)]
        let processed = Mutex::new(0usize);
        // Collecting into `Option<Vec<_>>` lets an empty document abort the whole
        // job with `None` instead of panicking inside a worker thread.
        let sketches: Option<Vec<_>> = documents
            .enumerate()
            .par_bridge()
            .map(|(i, doc)| {
                #[allow(clippy::mutex_atomic)]
                {
                    let mut cnt = processed.lock().unwrap();
                    *cnt += 1;
                    if self.shows_progress && *cnt % 10000 == 0 {
                        eprintln!("Processed {} documents...", *cnt);
                    }
                }
                let doc = doc.as_ref();
                if doc.is_empty() {
                    return None;
                }
                let mut feature = vec![];
                extractor.extract(doc, &mut feature);
                let sketch: Vec<_> = self.hasher.iter(&feature).take(num_chunks).collect();
                Some((i, sketch))
            })
            .collect();
        let mut sketches = sketches
            .ok_or_else(|| FindSimdocError::input("Input document must not be empty."))?;
        // `par_bridge` does not preserve input order, so restore it before the
        // sketches are added. The keys are unique positions, so an unstable sort
        // yields exactly the same order as a stable one.
        sketches.par_sort_unstable_by_key(|&(i, _)| i);
        let mut joiner = ChunkedJoiner::<u64>::new(num_chunks).shows_progress(self.shows_progress);
        for (_, sketch) in sketches {
            joiner
                .add(sketch)
                .expect("the joiner must accept a sketch produced by the hasher");
        }
        self.joiner = Some(joiner);
        Ok(self)
    }

    /// Searches the built sketches for similar pairs within `radius`.
    ///
    /// Returns an empty vector when no sketches have been built. Otherwise
    /// returns the joiner's result triples with the third component doubled.
    ///
    /// # Arguments
    ///
    /// * `radius` - Search radius; the joiner is queried with half of this value.
    pub fn search_similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)> {
        self.joiner.as_ref().map_or_else(Vec::new, |joiner| {
            // The joiner works at half the caller's scale: query with `radius / 2`
            // and double each reported distance on the way out.
            // NOTE(review): presumably this compensates for how the sketches
            // encode the distance; confirm against the hasher's documentation.
            let mut results = joiner.similar_pairs(radius / 2.);
            results.iter_mut().for_each(|(_, _, d)| *d *= 2.);
            results
        })
    }

    /// Returns the number of sketches held by the joiner, or `0` if none have
    /// been built.
    pub fn len(&self) -> usize {
        self.joiner
            .as_ref()
            .map_or(0, |joiner| joiner.num_sketches())
    }

    /// Returns `true` if `len` is zero.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the joiner's reported memory usage in bytes, or `0` if no
    /// sketches have been built.
    pub fn memory_in_bytes(&self) -> usize {
        self.joiner
            .as_ref()
            .map_or(0, |joiner| joiner.memory_in_bytes())
    }

    /// Returns the feature configuration created in `new`.
    pub const fn config(&self) -> &FeatureConfig {
        &self.config
    }
}