use sha2::{Digest, Sha256};
use std::collections::HashSet;
use crate::stemmer;
/// Builds the list of blind-index hashes for `text`.
///
/// For every token produced by `tokenize`, two candidates are considered, in
/// this order:
///
/// 1. the SHA-256 hex digest of the token itself, and
/// 2. the SHA-256 hex digest of `"stem:<stem>"`, where `<stem>` is the result
///    of `stemmer::stem` for that token — only when the stem is at least two
///    bytes long.
///
/// A digest is emitted only the first time it is seen, so the returned vector
/// contains no duplicates and keeps first-occurrence order.
pub fn generate_blind_indices(text: &str) -> Vec<String> {
    let mut emitted: HashSet<String> = HashSet::new();
    let mut ordered: Vec<String> = Vec::new();

    // Appends `digest` to the output unless it has already been produced.
    let mut record = |digest: String| {
        if emitted.insert(digest.clone()) {
            ordered.push(digest);
        }
    };

    for word in tokenize(text) {
        record(sha256_hex(word.as_bytes()));

        let root = stemmer::stem(&word);
        if root.len() < 2 {
            // Stems shorter than two bytes get no index entry.
            continue;
        }
        // The "stem:" prefix keeps stem digests distinct from exact-token digests.
        let prefixed = format!("stem:{}", root);
        record(sha256_hex(prefixed.as_bytes()));
    }

    ordered
}
/// Splits `text` into lowercase alphanumeric tokens.
///
/// The input is lowercased first; every character that is not alphanumeric
/// (punctuation, symbols, whitespace, ...) then acts as a separator. Tokens
/// whose UTF-8 encoding is shorter than two bytes are discarded. Because the
/// length is measured in bytes, a single non-ASCII character can still form a
/// token.
fn tokenize(text: &str) -> Vec<String> {
    let lowered = text.to_lowercase();
    lowered
        // Consecutive separators yield empty pieces; the length filter drops them.
        .split(|ch: char| !ch.is_alphanumeric())
        .filter(|piece| piece.len() >= 2)
        .map(str::to_owned)
        .collect()
}
/// Returns the SHA-256 digest of `data`, encoded as a hex string via `hex::encode`.
fn sha256_hex(data: &[u8]) -> String {
    hex::encode(Sha256::digest(data))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses the shared crypto test-vector fixture that is embedded at compile time.
    fn load_vectors() -> serde_json::Value {
        serde_json::from_str(include_str!("../tests/fixtures/crypto_vectors.json")).unwrap()
    }

    /// Every `blind_indices.test_cases` entry must produce exactly the listed
    /// indices, in the listed order.
    #[test]
    fn test_blind_indices_parity() {
        let vectors = load_vectors();
        let cases = vectors["blind_indices"]["test_cases"].as_array().unwrap();

        for case in cases {
            let text = case["text"].as_str().unwrap();
            let mut expected: Vec<String> = Vec::new();
            for entry in case["indices"].as_array().unwrap() {
                expected.push(entry.as_str().unwrap().to_string());
            }

            let result = generate_blind_indices(text);
            assert_eq!(
                result, expected,
                "Blind indices mismatch for text: {:?}\n got: {:?}\n expected: {:?}",
                text, result, expected
            );
        }
    }

    /// Every `blind_indices.token_hash_mappings` key must hash to its recorded value.
    #[test]
    fn test_token_hash_mappings() {
        let vectors = load_vectors();
        let table = vectors["blind_indices"]["token_hash_mappings"]
            .as_object()
            .unwrap();

        for (token, expected_hash) in table {
            let hash = sha256_hex(token.as_bytes());
            assert_eq!(
                hash,
                expected_hash.as_str().unwrap(),
                "Token hash mismatch for '{}'",
                token
            );
        }
    }
}