Skip to main content

sketchir/
lib.rs

1//! `sketchir`: sketching primitives for IR.
2//!
3//! This crate is intended for **index-only** similarity sketches used in:
4//! - near-duplicate detection (MinHash / shingles)
5//! - text fingerprinting (SimHash)
6//! - approximate similarity search (LSH-style candidate generation)
7//!
8//! Scope here is *primitives*: signatures, basic indexing, deterministic behavior.
9//! Higher-level workflows (crawl dedupe pipelines, content extraction, etc.) belong elsewhere.
10//!
11//! ## Dense-vector LSH: `LSHIndex` vs `DenseSimHashLSH`
12//!
13//! Both index dense `f32` vectors but differ in lifecycle and recall strategy:
14//!
15//! - [`LSHIndex`]: batch workflow (add, build, search). Multi-table random projection
16//!   with tunable `num_tables` / `num_functions` for recall control.
17//! - [`DenseSimHashLSH`]: incremental insertion (no build step). Single SimHash table
18//!   with Hamming-distance-1 neighbor probing for approximate recall.
19
20#![warn(missing_docs)]
21
22pub mod blocking;
23pub mod dense_simhash;
24pub mod lsh;
25pub mod minhash;
26pub mod simhash;
27
28pub use blocking::{BlockingConfig, MinHashTextLSH};
29pub use dense_simhash::DenseSimHashLSH;
30pub use lsh::{LSHIndex, MinHashLSH, SimHashLSH};
31pub use minhash::{MinHash, MinHashSignature};
32pub use simhash::{simhash_fingerprint, SimHashFingerprint};
33
34use std::fmt;
35use std::hash::Hasher;
36
37/// Errors for sketchir indexes and operations.
38#[derive(Debug)]
39#[non_exhaustive]
40pub enum Error {
41    /// A parameter is out of range or inconsistent.
42    InvalidParam(&'static str),
43    /// Dimension mismatch between expected and provided vectors.
44    DimensionMismatch {
45        /// Expected dimension.
46        expected: usize,
47        /// Actual provided dimension.
48        got: usize,
49    },
50    /// The index has no inserted items.
51    EmptyIndex,
52    /// The index has not been built yet.
53    NotBuilt,
54    /// Attempted to add after the index was built.
55    AddAfterBuild,
56    /// Input data contains non-finite values (NaN or infinity).
57    NonFiniteInput,
58}
59
60impl fmt::Display for Error {
61    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
62        match self {
63            Error::InvalidParam(msg) => write!(f, "invalid parameter: {msg}"),
64            Error::DimensionMismatch { expected, got } => {
65                write!(f, "dimension mismatch (expected {expected}, got {got})")
66            }
67            Error::EmptyIndex => f.write_str("empty index"),
68            Error::NotBuilt => f.write_str("index not built"),
69            Error::AddAfterBuild => f.write_str("cannot add after build"),
70            Error::NonFiniteInput => f.write_str("input contains non-finite values (NaN or Inf)"),
71        }
72    }
73}
74
75impl std::error::Error for Error {}
76
77/// A small stable 64-bit FNV-1a hasher.
78///
79/// This avoids relying on `std`'s `DefaultHasher` stability guarantees.
80pub(crate) struct Fnv1a64 {
81    state: u64,
82}
83
84impl Fnv1a64 {
85    pub(crate) fn new() -> Self {
86        // FNV offset basis
87        Self {
88            state: 0xcbf29ce484222325,
89        }
90    }
91}
92
93impl Hasher for Fnv1a64 {
94    fn finish(&self) -> u64 {
95        self.state
96    }
97
98    fn write(&mut self, bytes: &[u8]) {
99        // FNV-1a
100        const PRIME: u64 = 0x00000100000001B3;
101        for &b in bytes {
102            self.state ^= b as u64;
103            self.state = self.state.wrapping_mul(PRIME);
104        }
105    }
106}
107
108/// Deterministic 64-bit LCG (Knuth constants). Not cryptographic.
109pub(crate) fn lcg_next(state: &mut u64) -> u64 {
110    *state = state
111        .wrapping_mul(6364136223846793005)
112        .wrapping_add(1442695040888963407);
113    *state
114}
115
116/// LCG-based uniform f32 in [-1, 1].
117pub(crate) fn lcg_f32(state: &mut u64) -> f32 {
118    lcg_next(state);
119    let u = (*state >> 16) as u32;
120    (u as f32 / u32::MAX as f32) * 2.0 - 1.0
121}
122
123/// True if all values are finite (not NaN or infinity).
124pub(crate) fn all_finite(values: &[f32]) -> bool {
125    values.iter().all(|v| v.is_finite())
126}