use std::convert::Infallible;

use triblespace_core::id::ExclusiveId;
use triblespace_core::id_hex;
use triblespace_core::inline::{Inline, InlineEncoding};
use triblespace_core::macros::entity;
use triblespace_core::metadata::{self, MetaDescribe};
use triblespace_core::trible::Fragment;
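
/// Marker schema for word terms produced by [`hash_tokens`] and
/// [`code_tokens`]: each value is the Blake3 hash of a lowercased word or
/// code segment.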
pub enum WordHash {}
impl MetaDescribe for WordHash {
fn describe() -> Fragment {
let id = id_hex!("8868FA39C4CDA947DD4CAA1652C30D06");
entity! { ExclusiveId::force_ref(&id) @
metadata::name: "WordHash",
metadata::description: "Term schema for hash_tokens / code_tokens — Blake3 hash of a lowercased word or code segment.",
metadata::tag: metadata::KIND_INLINE_ENCODING,
}
}
}
impl InlineEncoding for WordHash {
type ValidationError = Infallible;
type Encoding = Self;
}
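
/// Marker schema for [`bigram_tokens`] terms: each value is the Blake3 hash
/// of two adjacent lowercased words joined by a NUL delimiter.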
pub enum BigramHash {}
impl MetaDescribe for BigramHash {
fn describe() -> Fragment {
let id = id_hex!("2EC1CAAD948B959D32023EF32D500148");
entity! { ExclusiveId::force_ref(&id) @
metadata::name: "BigramHash",
metadata::description: "Term schema for bigram_tokens — Blake3 hash of a pair of adjacent lowercased words, NUL-delimited.",
metadata::tag: metadata::KIND_INLINE_ENCODING,
}
}
}
impl InlineEncoding for BigramHash {
type ValidationError = Infallible;
type Encoding = Self;
}
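
/// Marker schema for [`ngram_tokens`] terms: each value is the Blake3 hash
/// of a character n-gram window, with the n size prefixed into the hash
/// input.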
pub enum NgramHash {}
impl MetaDescribe for NgramHash {
fn describe() -> Fragment {
let id = id_hex!("52472B53D201532D7FAA7D89AE80A6ED");
entity! { ExclusiveId::force_ref(&id) @
metadata::name: "NgramHash",
metadata::description: "Term schema for ngram_tokens — Blake3 hash of a character n-gram window, with the n-size prefixed into the hash input so different n values don't collide within the same schema.",
metadata::tag: metadata::KIND_INLINE_ENCODING,
}
}
}
impl InlineEncoding for NgramHash {
type ValidationError = Infallible;
type Encoding = Self;
}
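
/// Tokenizes prose into [`WordHash`] terms: words split on ASCII whitespace
/// are trimmed of surrounding punctuation, ASCII-lowercased, and
/// Blake3-hashed. Duplicates are preserved so callers can derive term
/// frequencies.
///
/// A minimal usage sketch (illustrative only; crate paths assumed):
///
/// ```ignore
/// let tokens = hash_tokens("Hello, world!");
/// assert_eq!(tokens, hash_tokens("hello world"));
/// assert_eq!(tokens.len(), 2);
/// ```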
pub fn hash_tokens(text: &str) -> Vec<Inline<WordHash>> {
normalize_words(text)
.map(|w| Inline::<WordHash>::new(*blake3::hash(w.as_bytes()).as_bytes()))
.collect()
}
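
/// Shared word normalization for [`hash_tokens`] and [`bigram_tokens`]:
/// splits on ASCII whitespace, trims surrounding ASCII punctuation, and
/// ASCII-lowercases each word, dropping tokens that contain no alphanumeric
/// character. Note the ASCII-only lowercasing, in contrast to the
/// Unicode-aware lowercasing in [`ngram_tokens`].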
fn normalize_words(text: &str) -> impl Iterator<Item = String> + '_ {
text.split_ascii_whitespace().filter_map(|raw| {
let trimmed = raw.trim_matches(|c: char| c.is_ascii_punctuation());
        // Drop separator-only tokens such as "---".
        if !trimmed.chars().any(|c| c.is_alphanumeric()) {
            return None;
        }
        Some(trimmed.to_ascii_lowercase())
})
}
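
/// Tokenizes prose into [`BigramHash`] terms: each adjacent pair of
/// normalized words is joined with a NUL delimiter and Blake3-hashed,
/// which lets a bag-of-terms index answer exact two-word phrase queries.
///
/// A minimal usage sketch (illustrative only; crate paths assumed):
///
/// ```ignore
/// let bigrams = bigram_tokens("the quick brown fox");
/// assert_eq!(bigrams.len(), 3); // "the quick", "quick brown", "brown fox"
/// ```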
pub fn bigram_tokens(text: &str) -> Vec<Inline<BigramHash>> {
let words: Vec<String> = normalize_words(text).collect();
if words.len() < 2 {
return Vec::new();
}
let mut out = Vec::with_capacity(words.len() - 1);
for pair in words.windows(2) {
        // Join the pair with a NUL delimiter so word boundaries stay
        // significant: "ab c" and "a bc" hash to different tokens.
        let mut buf = String::with_capacity(pair[0].len() + pair[1].len() + 1);
        buf.push_str(&pair[0]);
        buf.push('\0');
buf.push_str(&pair[1]);
out.push(Inline::<BigramHash>::new(*blake3::hash(buf.as_bytes()).as_bytes()));
}
out
}
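
/// Tokenizes source-code identifiers into [`WordHash`] terms, sharing the
/// term space with [`hash_tokens`]: `code_tokens("parseFooBar")` equals
/// `hash_tokens("parse foo bar")`. Identifiers split on non-alphanumeric
/// separators and on snake_case, camelCase, acronym, and letter/digit
/// boundaries before ASCII-lowercasing and Blake3-hashing.
///
/// A minimal sketch of the boundary rules (illustrative only):
///
/// ```ignore
/// // "parseHTMLResponse_v2" -> ["parse", "html", "response", "v", "2"]
/// assert_eq!(code_tokens("parseHTMLResponse_v2").len(), 5);
/// ```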
pub fn code_tokens(text: &str) -> Vec<Inline<WordHash>> {
    // Character classes used to detect identifier-segment boundaries.
    // Digits are ASCII-only; letters use the Unicode case classes.
    #[derive(Clone, Copy, PartialEq)]
    enum Kind {
        Lower,
        Upper,
        Digit,
        None,
    }

    fn kind(c: char) -> Kind {
        if c.is_ascii_digit() {
            Kind::Digit
        } else if c.is_uppercase() {
            Kind::Upper
        } else if c.is_lowercase() {
            Kind::Lower
        } else {
            Kind::None
        }
    }

    let mut segments: Vec<String> = Vec::new();
    let mut cur = String::new();
    let mut prev = Kind::None;
    for c in text.chars() {
        let k = kind(c);
        // Any non-alphanumeric character (`_`, `-`, `.`, whitespace, ...)
        // terminates the current segment.
        if !c.is_alphanumeric() {
            if !cur.is_empty() {
                segments.push(std::mem::take(&mut cur));
            }
            prev = Kind::None;
            continue;
        }
        let split_here = match (prev, k) {
            // camelCase and letter/digit transitions start a new segment.
            (Kind::Lower, Kind::Upper) => true,
            (Kind::Lower, Kind::Digit) => true,
            (Kind::Digit, Kind::Lower) => true,
            (Kind::Digit, Kind::Upper) => true,
            (Kind::Upper, Kind::Digit) => true,
            // An acronym run followed by a lowercase letter ("HTMLParser")
            // splits one character early: the final uppercase letter opens
            // the next segment ("HTML" + "Parser").
            (Kind::Upper, Kind::Lower) if cur.chars().count() >= 2 => {
                let popped = cur.pop().unwrap();
                segments.push(std::mem::take(&mut cur));
                cur.push(popped);
                false
            }
            _ => false,
        };
        if split_here && !cur.is_empty() {
            segments.push(std::mem::take(&mut cur));
        }
        cur.push(c);
        prev = k;
    }
    if !cur.is_empty() {
        segments.push(cur);
    }
    // Segments are only ever pushed when non-empty, so no filtering is
    // needed before hashing.
    segments
        .into_iter()
        .map(|s| {
            let lower = s.to_ascii_lowercase();
            Inline::<WordHash>::new(*blake3::hash(lower.as_bytes()).as_bytes())
        })
        .collect()
}
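
/// Tokenizes prose into [`NgramHash`] terms: character n-grams over
/// lowercased alphanumeric runs, never crossing punctuation or whitespace.
/// The n size is prefixed into every hash input so one schema can hold
/// n-grams of several sizes without collisions.
///
/// A minimal usage sketch (illustrative only; crate paths assumed):
///
/// ```ignore
/// let grams = ngram_tokens("foxes", 3); // "fox", "oxe", "xes"
/// assert_eq!(grams.len(), 3);
/// ```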
pub fn ngram_tokens(text: &str, n: usize) -> Vec<Inline<NgramHash>> {
    if n == 0 {
        return Vec::new();
    }
    // Lowercase alphanumeric runs (Unicode-aware here, unlike the ASCII-only
    // word normalization); everything else becomes a separator so n-grams
    // never span punctuation or whitespace.
    let mut normalized = String::with_capacity(text.len());
    for c in text.chars() {
        if c.is_alphanumeric() {
            for l in c.to_lowercase() {
                normalized.push(l);
            }
        } else {
            normalized.push(' ');
        }
    }
    // Prefix every window with the n value (NUL-delimited, mirroring the
    // bigram convention) so the same characters hashed at different n sizes
    // yield distinct tokens, as the `NgramHash` schema description promises.
    let prefix = format!("{n}\0");
    let mut out = Vec::new();
    for run in normalized.split_ascii_whitespace() {
        let chars: Vec<char> = run.chars().collect();
        if chars.len() < n {
            continue;
        }
        let mut gram = String::with_capacity(prefix.len() + n * 4);
        for window in chars.windows(n) {
            gram.clear();
            gram.push_str(&prefix);
            gram.extend(window.iter().copied());
            out.push(Inline::<NgramHash>::new(*blake3::hash(gram.as_bytes()).as_bytes()));
        }
    }
    out
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn splits_on_whitespace() {
let tokens = hash_tokens("one two three");
assert_eq!(tokens.len(), 3);
}
#[test]
fn case_insensitive() {
let a = hash_tokens("FOO");
let b = hash_tokens("foo");
assert_eq!(a, b);
}
#[test]
fn strips_punctuation() {
let a = hash_tokens("hello,");
let b = hash_tokens("hello");
assert_eq!(a, b);
}
#[test]
fn preserves_duplicates() {
let tokens = hash_tokens("foo bar foo");
assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0], tokens[2]);
}
#[test]
fn drops_empty_tokens() {
let tokens = hash_tokens("foo --- bar");
assert_eq!(tokens.len(), 2);
}
#[test]
fn stable_hash() {
let tokens = hash_tokens("hello");
let expected = *blake3::hash(b"hello").as_bytes();
assert_eq!(tokens[0].raw, expected);
}
#[test]
fn ngram_empty_n_returns_nothing() {
assert!(ngram_tokens("anything", 0).is_empty());
}
#[test]
fn ngram_skips_short_runs() {
assert!(ngram_tokens("hi", 3).is_empty());
}
#[test]
fn ngram_counts() {
assert_eq!(ngram_tokens("foxes", 3).len(), 3);
assert_eq!(ngram_tokens("foxes", 2).len(), 4);
}
#[test]
fn ngram_case_insensitive() {
let a = ngram_tokens("FOX", 3);
let b = ngram_tokens("fox", 3);
assert_eq!(a, b);
}
#[test]
fn ngram_does_not_cross_punctuation() {
let dashed = ngram_tokens("foo-bar", 3);
let spaced = ngram_tokens("foo bar", 3);
assert_eq!(dashed, spaced);
        assert_eq!(dashed.len(), 2);
    }
#[test]
fn ngram_size_namespaced() {
let bi = ngram_tokens("fo", 2);
let tri = ngram_tokens("foo", 3);
assert_eq!(bi.len(), 1);
assert_eq!(tri.len(), 1);
assert_ne!(bi[0], tri[0]);
}
#[test]
fn bigram_tokens_basic_count() {
assert_eq!(bigram_tokens("the quick brown fox").len(), 3);
assert_eq!(bigram_tokens("one two").len(), 1);
assert!(bigram_tokens("lonely").is_empty());
assert!(bigram_tokens("").is_empty());
}
#[test]
fn bigram_tokens_case_and_punctuation_normalized() {
let a = bigram_tokens("Hello, WORLD!");
let b = bigram_tokens("hello world");
assert_eq!(a, b);
}
#[test]
fn bigram_tokens_order_matters() {
let ab = bigram_tokens("foo bar");
let ba = bigram_tokens("bar foo");
assert_ne!(ab, ba);
}
#[test]
fn bigram_tokens_separated_from_hash_by_schema() {
let single = hash_tokens("foobar");
let bigram = bigram_tokens("foo bar");
assert_eq!(single.len(), 1);
assert_eq!(bigram.len(), 1);
assert_ne!(single[0].raw, bigram[0].raw);
}
#[test]
fn bigram_tokens_word_boundary_preserved() {
let ab_c = bigram_tokens("ab c");
let a_bc = bigram_tokens("a bc");
assert_eq!(ab_c.len(), 1);
assert_eq!(a_bc.len(), 1);
assert_ne!(ab_c[0], a_bc[0]);
}
#[test]
fn bigram_tokens_enables_phrase_match() {
use crate::bm25::BM25Builder;
use triblespace_core::id::Id;
use triblespace_core::inline::encodings::genid::GenId;
fn iid(byte: u8) -> Id {
Id::new([byte; 16]).unwrap()
}
let mut b: BM25Builder<GenId, BigramHash> = BM25Builder::new();
b.insert(iid(1), bigram_tokens("the quick brown fox"));
b.insert(iid(2), bigram_tokens("fox fight club"));
b.insert(iid(3), bigram_tokens("quick silver brown fox"));
let idx = b.build();
let phrase = bigram_tokens("quick brown");
assert_eq!(phrase.len(), 1);
let hits: Vec<_> = idx.query_term(&phrase[0]).collect();
assert_eq!(hits.len(), 1);
        // Reconstruct the expected GenId inline value: the 16-byte id sits
        // in bytes 16..32 of the 32-byte raw key.
        let mut key1 = [0u8; 32];
        key1[16..32].copy_from_slice(AsRef::<[u8; 16]>::as_ref(&iid(1)));
assert_eq!(hits[0].0.raw, key1);
}
#[test]
fn code_tokens_snake_case() {
let t = code_tokens("parse_http_response");
let expected = ["parse", "http", "response"]
.iter()
.map(|s| Inline::<WordHash>::new(*blake3::hash(s.as_bytes()).as_bytes()))
.collect::<Vec<_>>();
assert_eq!(t, expected);
}
#[test]
fn code_tokens_camel_case() {
let t = code_tokens("parseResponseBody");
let expected = ["parse", "response", "body"]
.iter()
.map(|s| Inline::<WordHash>::new(*blake3::hash(s.as_bytes()).as_bytes()))
.collect::<Vec<_>>();
assert_eq!(t, expected);
}
#[test]
fn code_tokens_acronym_boundary() {
let t = code_tokens("HTMLParser");
let expected = ["html", "parser"]
.iter()
.map(|s| Inline::<WordHash>::new(*blake3::hash(s.as_bytes()).as_bytes()))
.collect::<Vec<_>>();
assert_eq!(t, expected);
}
#[test]
fn code_tokens_digits_split() {
let t = code_tokens("parseV2Request");
let expected = ["parse", "v", "2", "request"]
.iter()
.map(|s| Inline::<WordHash>::new(*blake3::hash(s.as_bytes()).as_bytes()))
.collect::<Vec<_>>();
assert_eq!(t, expected);
}
#[test]
fn code_tokens_mixed_separators() {
let a = code_tokens("foo-bar.baz qux");
let b = code_tokens("foo bar baz qux");
assert_eq!(a, b);
assert_eq!(a.len(), 4);
}
#[test]
fn code_tokens_shares_terms_with_hash_tokens() {
let code = code_tokens("parseFooBar");
let text = hash_tokens("parse foo bar");
assert_eq!(code.len(), 3);
assert_eq!(text.len(), 3);
for (c, t) in code.iter().zip(text.iter()) {
assert_eq!(c, t);
}
}
#[test]
fn code_tokens_example_in_doc() {
let t = code_tokens("parseHTMLResponse_v2");
assert_eq!(t.len(), 5);
}
#[test]
fn ngram_shared_prefix_matches_extension() {
let short = ngram_tokens("fox", 3);
let long = ngram_tokens("foxes", 3);
assert!(long.contains(&short[0]));
}
}