use alloc::format;
use alloc::string::String;
use crate::canonical::Canonicalizer;
/// Canonical algorithm identifier strings.
///
/// The `Fingerprint` metadata and naming code in this file uses these values
/// as the `algorithm` tag and as the prefix of `Fingerprint::name`.
pub mod algo {
/// Identifier reported for `Fingerprint::MinHash` (a `MinHashSig<128>`).
pub const MINHASH_128: &str = "minhash-h128";
/// Generic MinHash identifier. Not referenced elsewhere in this file;
/// presumably meant for MinHash signatures of other widths — TODO confirm.
pub const MINHASH: &str = "minhash";
/// Identifier reported for `Fingerprint::SimHash` (a `SimHash64`).
pub const SIMHASH_64: &str = "simhash-b64";
/// Identifier reported for `Fingerprint::Tlsh`.
pub const TLSH: &str = "tlsh";
/// Identifier reported for `Fingerprint::Embedding`.
pub const EMBEDDING: &str = "embedding";
}
/// A computed fingerprint, tagged by the algorithm that produced it.
///
/// The enum only exists when at least one of the `minhash`, `simhash`,
/// `tlsh`, or `semantic` features is enabled, and each variant is compiled in
/// only under its own feature. It is `#[non_exhaustive]`, so matches written
/// in other crates need a wildcard arm.
// NOTE(review): the lint is silenced rather than boxing a variant; presumably
// the inline MinHash signature is the large one — confirm before changing.
#[allow(clippy::large_enum_variant)]
#[cfg(any(
feature = "minhash",
feature = "simhash",
feature = "tlsh",
feature = "semantic"
))]
#[derive(Clone, Debug, PartialEq)]
#[non_exhaustive]
pub enum Fingerprint {
/// MinHash signature, stored as `MinHashSig` with const parameter `128`.
#[cfg(feature = "minhash")]
#[cfg_attr(docsrs, doc(cfg(feature = "minhash")))]
MinHash(crate::classical::minhash::MinHashSig<128>),
/// SimHash fingerprint, stored as a `SimHash64`.
#[cfg(feature = "simhash")]
#[cfg_attr(docsrs, doc(cfg(feature = "simhash")))]
SimHash(crate::classical::simhash::SimHash64),
/// TLSH fingerprint, stored as a `TlshFingerprint`.
#[cfg(feature = "tlsh")]
#[cfg_attr(docsrs, doc(cfg(feature = "tlsh")))]
Tlsh(TlshFingerprint),
/// Semantic embedding; the metadata code in this file reads its `model_id`
/// and `vector` fields.
#[cfg(feature = "semantic")]
#[cfg_attr(docsrs, doc(cfg(feature = "semantic")))]
Embedding(crate::semantic::Embedding),
}
/// Payload of the `Fingerprint::Tlsh` variant (requires the `tlsh` feature).
#[cfg(feature = "tlsh")]
#[cfg_attr(docsrs, doc(cfg(feature = "tlsh")))]
#[derive(Clone, Debug, PartialEq)]
pub struct TlshFingerprint {
/// The TLSH digest as text. The field name suggests hex encoding, but this
/// file does not show how the value is produced — TODO confirm.
/// `Fingerprint::metadata` reports `hex.len()` as the `byte_size`.
pub hex: alloc::string::String,
}
/// Sentinel `config_hash` value meaning "not computed".
///
/// `Fingerprint::metadata` stores this value; `Fingerprint::metadata_with` and
/// `FingerprintMetadata::with_config_hash` replace it with a real hash.
// NOTE(review): `config_hash` returns a raw 64-bit xxh3 value, so nothing in
// this file prevents a genuine hash from being equal to this sentinel.
pub const UNCOMPUTED_CONFIG_HASH: u64 = 0;
/// Descriptive metadata about a fingerprint: which algorithm produced it,
/// under which configuration, and how large its payload is.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct FingerprintMetadata {
    /// Algorithm identifier; `Fingerprint::metadata` fills this from the
    /// constants in [`algo`].
    pub algorithm: &'static str,
    /// Hash of the producing configuration (see [`config_hash`]), or
    /// [`UNCOMPUTED_CONFIG_HASH`] when it has not been computed.
    pub config_hash: u64,
    /// Model identifier. `Fingerprint::metadata` only sets this for the
    /// embedding variant; every other variant reports `None`.
    pub model_id: Option<String>,
    /// Schema version of the fingerprint encoding.
    pub schema_version: u16,
    /// Payload size as reported by the fingerprint (for TLSH this is the
    /// length of the digest string).
    pub byte_size: usize,
}

impl FingerprintMetadata {
    /// Returns this metadata with `config_hash` replaced by `hash`.
    ///
    /// Every other field is carried over unchanged, which lets a caller
    /// attach the hash after the metadata has already been built.
    #[must_use]
    pub fn with_config_hash(self, hash: u64) -> Self {
        Self {
            config_hash: hash,
            ..self
        }
    }
}
#[cfg(any(
    feature = "minhash",
    feature = "simhash",
    feature = "tlsh",
    feature = "semantic"
))]
impl Fingerprint {
    /// Describes this fingerprint without computing a configuration hash.
    ///
    /// The returned `config_hash` is [`UNCOMPUTED_CONFIG_HASH`]; use
    /// [`Fingerprint::metadata_with`] or
    /// [`FingerprintMetadata::with_config_hash`] to supply a real one.
    #[must_use]
    pub fn metadata(&self) -> FingerprintMetadata {
        self.metadata_inner(UNCOMPUTED_CONFIG_HASH)
    }

    /// Describes this fingerprint and fills `config_hash` from the given
    /// canonicalizer, tokenizer name, and algorithm configuration string
    /// (hashed by the free function [`config_hash`]).
    #[must_use]
    pub fn metadata_with(
        &self,
        canonicalizer: &Canonicalizer,
        tokenizer_name: &str,
        algo_cfg: &str,
    ) -> FingerprintMetadata {
        self.metadata_inner(config_hash(canonicalizer, tokenizer_name, algo_cfg))
    }

    /// Builds the metadata record for this variant using an already-known
    /// `config_hash`.
    fn metadata_inner(&self, config_hash: u64) -> FingerprintMetadata {
        // Only these four values vary per variant; the struct is built once below.
        let (algorithm, model_id, schema_version, byte_size) = match self {
            #[cfg(feature = "minhash")]
            Self::MinHash(sig) => (
                algo::MINHASH_128,
                None,
                sig.schema,
                sig.as_bytes().len(),
            ),
            #[cfg(feature = "simhash")]
            Self::SimHash(sig) => (
                algo::SIMHASH_64,
                None,
                crate::classical::simhash::SCHEMA_VERSION,
                sig.as_bytes().len(),
            ),
            // TLSH reports the length of its digest string as the size.
            #[cfg(feature = "tlsh")]
            Self::Tlsh(tlsh) => (algo::TLSH, None, 1, tlsh.hex.len()),
            #[cfg(feature = "semantic")]
            Self::Embedding(emb) => (
                algo::EMBEDDING,
                emb.model_id.clone(),
                1,
                emb.vector.len() * core::mem::size_of::<f32>(),
            ),
        };

        FingerprintMetadata {
            algorithm,
            config_hash,
            model_id,
            schema_version,
            byte_size,
        }
    }

    /// Returns a name of the form `<algorithm>-v<schema>`.
    ///
    /// Embeddings that carry a model id are named
    /// `<algorithm>/<model_id>-v1` instead. The configuration hash is never
    /// part of the name.
    #[must_use]
    pub fn name(&self) -> String {
        match self {
            #[cfg(feature = "minhash")]
            Self::MinHash(sig) => {
                let prefix = algo::MINHASH_128;
                let version = sig.schema;
                format!("{prefix}-v{version}")
            }
            #[cfg(feature = "simhash")]
            Self::SimHash(_) => {
                let prefix = algo::SIMHASH_64;
                let version = crate::classical::simhash::SCHEMA_VERSION;
                format!("{prefix}-v{version}")
            }
            #[cfg(feature = "tlsh")]
            Self::Tlsh(_) => {
                let prefix = algo::TLSH;
                format!("{prefix}-v1")
            }
            #[cfg(feature = "semantic")]
            Self::Embedding(emb) => {
                let prefix = algo::EMBEDDING;
                if let Some(model) = &emb.model_id {
                    format!("{prefix}/{model}-v1")
                } else {
                    format!("{prefix}-v1")
                }
            }
        }
    }
}
/// Hashes a fingerprinting configuration into a single `u64`.
///
/// The canonicalizer's `config_string()`, `tokenizer_name`, and `algo_cfg`
/// are joined with `|` in that order, and the UTF-8 bytes of the result are
/// hashed with 64-bit xxh3.
///
/// The separator is not escaped, so two different inputs that contain `|`
/// can produce the same joined string and therefore the same hash.
#[must_use]
pub fn config_hash(canonicalizer: &Canonicalizer, tokenizer_name: &str, algo_cfg: &str) -> u64 {
    let canonical_cfg = canonicalizer.config_string();
    let joined = [canonical_cfg.as_str(), tokenizer_name, algo_cfg].join("|");
    xxhash_rust::xxh3::xxh3_64(joined.as_bytes())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Hashing the same configuration twice yields the same value.
    #[test]
    fn config_hash_is_deterministic() {
        let canon = Canonicalizer::default();
        let first = config_hash(&canon, "word-uax29", "minhash-h128-mmh3");
        let second = config_hash(&canon, "word-uax29", "minhash-h128-mmh3");
        assert_eq!(first, second);
    }

    /// Changing only the tokenizer name changes the hash.
    #[test]
    fn config_hash_differs_on_change() {
        let canon = Canonicalizer::default();
        let word_hash = config_hash(&canon, "word-uax29", "minhash-h128-mmh3");
        let grapheme_hash = config_hash(&canon, "grapheme-uax29", "minhash-h128-mmh3");
        assert_ne!(word_hash, grapheme_hash);
    }

    /// Plain `metadata()` on a MinHash fingerprint reports the expected
    /// algorithm, the sentinel hash, a plausible size, and schema 1.
    #[cfg(feature = "minhash")]
    #[test]
    fn minhash_metadata_round_trip() {
        let fingerprint =
            Fingerprint::MinHash(crate::classical::minhash::MinHashSig::<128>::empty());
        let meta = fingerprint.metadata();
        assert_eq!(meta.algorithm, algo::MINHASH_128);
        assert_eq!(meta.config_hash, UNCOMPUTED_CONFIG_HASH);
        assert!(meta.byte_size >= 128 * 8);
        assert_eq!(meta.schema_version, 1);
    }

    /// SimHash metadata reports its algorithm tag and an 8-byte payload.
    #[cfg(feature = "simhash")]
    #[test]
    fn simhash_metadata_round_trip() {
        use crate::classical::simhash::SimHash64;

        let meta = Fingerprint::SimHash(SimHash64::new(0xDEADBEEF)).metadata();
        assert_eq!(meta.algorithm, algo::SIMHASH_64);
        assert_eq!(meta.byte_size, 8);
    }

    /// `metadata_with` must agree with calling `config_hash` directly.
    #[cfg(feature = "minhash")]
    #[test]
    fn metadata_with_populates_config_hash() {
        let fingerprint =
            Fingerprint::MinHash(crate::classical::minhash::MinHashSig::<128>::empty());
        let canonicalizer = Canonicalizer::default();
        let meta = fingerprint.metadata_with(&canonicalizer, "word-uax29", "h128-mmh3");
        let want = config_hash(&canonicalizer, "word-uax29", "h128-mmh3");
        assert_eq!(meta.config_hash, want);
        assert_ne!(meta.config_hash, UNCOMPUTED_CONFIG_HASH);
    }

    /// A hash can be attached after the metadata has been built.
    #[cfg(feature = "minhash")]
    #[test]
    fn with_config_hash_attaches_lazily() {
        let fingerprint =
            Fingerprint::MinHash(crate::classical::minhash::MinHashSig::<128>::empty());
        let meta = fingerprint
            .metadata()
            .with_config_hash(0xDEAD_BEEF_CAFE_BABE);
        assert_eq!(meta.config_hash, 0xDEAD_BEEF_CAFE_BABE);
    }

    /// The MinHash name is `<algorithm>-v<schema>` with no config component.
    #[cfg(feature = "minhash")]
    #[test]
    fn minhash_name_matches_documented_format() {
        let signature = crate::classical::minhash::MinHashSig::<128>::empty();
        let name = Fingerprint::MinHash(signature).name();
        assert_eq!(name, "minhash-h128-v1");
        assert!(!name.contains("cfg="), "name() must not include cfg=");
    }

    /// The SimHash name does not depend on the fingerprint's value.
    #[cfg(feature = "simhash")]
    #[test]
    fn simhash_name_is_stable() {
        use crate::classical::simhash::SimHash64;

        let name = Fingerprint::SimHash(SimHash64::new(0)).name();
        assert_eq!(name, "simhash-b64-v1");
    }

    /// Pins the sentinel value so that changing it is a deliberate act.
    #[test]
    fn uncomputed_sentinel_is_zero() {
        assert_eq!(UNCOMPUTED_CONFIG_HASH, 0);
    }
}