// ski/text.rs

//! Tiny text utilities shared by the embedder, ranker, and skill parser.
//! Deterministic by design — embeddings persisted in the index must reproduce
//! byte-for-byte across runs and builds, so we use a fixed FNV hash, not the
//! std hasher (whose seed/impl is not a stability guarantee).

/// Splits `s` into lowercase ASCII tokens.
///
/// Every character that is not an ASCII letter or digit acts as a separator —
/// this includes non-ASCII letters, so `"café"` yields `"caf"`. Runs shorter
/// than two characters are dropped. Tokens are returned in input order and
/// duplicates are kept.
pub fn tokenize(s: &str) -> Vec<String> {
    s.split(|c: char| !c.is_ascii_alphanumeric())
        // Each piece is pure ASCII, so byte length equals character count.
        // This also discards the empty pieces produced by adjacent separators.
        .filter(|piece| piece.len() >= 2)
        .map(|piece| piece.to_ascii_lowercase())
        .collect()
}

/// Function words that carry no discriminative signal for phrase matching. Kept
/// deliberately small — just the high-frequency glue that would otherwise let a
/// trigger phrase fire on, or be padded to length by, unrelated prose. Domain
/// terms are never listed here.
///
/// All entries are lowercase ASCII, matching the form `tokenize` emits. The
/// single-letter entries (`"a"`, `"i"`) can only matter to callers that check
/// tokens not produced by `tokenize`, which already drops tokens shorter than
/// two characters.
const STOPWORDS: &[&str] = &[
    "the", "an", "of", "to", "for", "and", "or", "in", "on", "at", "is", "it", "be", "as", "by",
    "with", "from", "into", "me", "my", "we", "our", "you", "your", "this", "that", "these",
    "those", "use", "used", "when", "user", "users", "say", "says", "want", "wants", "ask", "asks",
    "do", "does", "not", "if", "so", "up", "out", "via", "are", "was", "will", "can", "a", "i",
];

38/// `tokenize`, minus stopwords — the discriminative tokens of a phrase. Used both
39/// to gate a candidate phrase by length and to match it against a prompt.
40pub fn content_tokens(s: &str) -> Vec<String> {
41    tokenize(s)
42        .into_iter()
43        .filter(|t| !STOPWORDS.contains(&t.as_str()))
44        .collect()
45}
46
/// Light, deterministic singular form of a (lowercase) token, so the surface-form
/// channels — keyword, phrase, BM25 — match across trivial inflection
/// ("spreadsheets" ↔ "spreadsheet", "dependencies" ↔ "dependency"). Not a real
/// stemmer: both the prompt side and the skill side are normalized through it at
/// match time, so it only has to map a word and its plural to the same string.
/// Applied at match time only — never inside the embedders — so persisted index
/// vectors are untouched.
///
/// Rules, checked in order:
/// 1. Tokens of three bytes or fewer, or not ending in `s`, are returned as is.
/// 2. Tokens ending in `ss`, `us` or `is` are returned as is ("class",
///    "status", "analysis").
/// 3. `…ies` on tokens longer than four bytes becomes `…y`.
/// 4. `…sses`, `…ches`, `…shes`, `…xes`, `…zzes` drop the trailing `es`.
/// 5. Anything else drops the trailing `s`.
///
/// Rule 4 deliberately does not cover a bare `…zes`: those tokens are mostly
/// `-ze` stems ("sizes", "optimizes"), and dropping `es` would leave "siz" /
/// "optimiz", which never equals the normalized singular "size" / "optimize".
pub fn norm_token(t: &str) -> String {
    // Endings that mark a non-plural word even though it ends in `s`.
    const KEEP_WHOLE: [&str; 3] = ["ss", "us", "is"];
    // Plural endings whose singular is recovered by dropping "es", not just "s".
    const ES_PLURALS: [&str; 5] = ["sses", "ches", "shes", "xes", "zzes"];

    if t.len() <= 3 || !t.ends_with('s') {
        return t.to_string();
    }
    if KEEP_WHOLE.iter().any(|&suffix| t.ends_with(suffix)) {
        return t.to_string();
    }
    // Four-byte "-ies" tokens ("pies", "ties") fall through to the plain
    // `s` rule below instead of becoming "py" / "ty".
    if t.len() > 4 {
        if let Some(stem) = t.strip_suffix("ies") {
            return format!("{}y", stem); // dependencies -> dependency
        }
    }
    let stem = if ES_PLURALS.iter().any(|&suffix| t.ends_with(suffix)) {
        t.strip_suffix("es") // branches -> branch, boxes -> box
    } else {
        t.strip_suffix('s') // charts -> chart, sizes -> size
    };
    // `t` is known to end in `s` here, so a stem is always found; falling back
    // to `t` keeps the function panic-free regardless.
    stem.unwrap_or(t).to_string()
}

77/// [`content_tokens`], each normalized through [`norm_token`] — the form the
78/// surface-matching channels (phrase, BM25) compare prompt and skill text in.
79pub fn match_tokens(s: &str) -> Vec<String> {
80    content_tokens(s).iter().map(|t| norm_token(t)).collect()
81}
82
/// FNV-1a 32-bit — stable token→bucket hash for the bag-of-words embedder.
///
/// Hashes the UTF-8 bytes of `s`: starting from the offset basis, each byte is
/// XORed into the state, which is then multiplied by the FNV prime with
/// wrapping arithmetic. The empty string hashes to the offset basis itself.
pub fn fnv1a_32(s: &str) -> u32 {
    const OFFSET_BASIS: u32 = 0x811c_9dc5;
    const PRIME: u32 = 0x0100_0193;

    s.bytes()
        .fold(OFFSET_BASIS, |hash, byte| (hash ^ u32::from(byte)).wrapping_mul(PRIME))
}

/// FNV-1a 64-bit — content hash for index cache invalidation (not security).
///
/// Starting from the offset basis, each input byte is XORed into the state,
/// which is then multiplied by the FNV prime with wrapping arithmetic. An
/// empty slice hashes to the offset basis itself.
pub fn fnv1a_64(bytes: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    bytes
        .iter()
        .fold(OFFSET_BASIS, |hash, &byte| (hash ^ u64::from(byte)).wrapping_mul(PRIME))
}

/// Unit tests for the tokenizer, stopword filter, plural normalizer and the
/// FNV hashes.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenize_splits_and_lowercases() {
        assert_eq!(
            tokenize("Set-up a NEW uv_project!"),
            ["set", "up", "new", "uv", "project"]
        );
    }

    #[test]
    fn tokenize_drops_single_chars() {
        assert_eq!(tokenize("a b cd e"), ["cd"]);
    }

    #[test]
    fn content_tokens_drops_stopwords() {
        // Function words carry no discriminative signal for phrase matching, so
        // they are excluded from the content-token set (and the length gate).
        assert_eq!(
            content_tokens("connect to the Neon database"),
            ["connect", "neon", "database"]
        );
        // "set" is not a stopword, so it survives while "it" and "up" are dropped.
        assert_eq!(content_tokens("set it up"), ["set"]);
        // A phrase that is *only* stopwords collapses to nothing.
        assert!(content_tokens("it is up to you").is_empty());
    }

    #[test]
    fn norm_token_singularizes_common_plurals() {
        assert_eq!(norm_token("spreadsheets"), "spreadsheet");
        assert_eq!(norm_token("charts"), "chart");
        assert_eq!(norm_token("dependencies"), "dependency");
        assert_eq!(norm_token("branches"), "branch");
        assert_eq!(norm_token("boxes"), "box");
        assert_eq!(norm_token("classes"), "class");
    }

    #[test]
    fn norm_token_leaves_non_plurals_alone() {
        // Short tokens and common non-plural s-endings must survive intact.
        for t in ["uv", "css", "class", "status", "analysis", "chart", "rust"] {
            assert_eq!(norm_token(t), t);
        }
    }

    #[test]
    fn match_tokens_normalizes_content_tokens() {
        assert_eq!(
            match_tokens("compute the formulas in these spreadsheets"),
            ["compute", "formula", "spreadsheet"]
        );
    }

    #[test]
    fn fnv_is_deterministic() {
        assert_eq!(fnv1a_32("commit"), fnv1a_32("commit"));
        assert_ne!(fnv1a_32("commit"), fnv1a_32("attribution"));
        assert_eq!(fnv1a_64(b"hello"), fnv1a_64(b"hello"));
    }

    #[test]
    fn fnv_matches_reference_vectors() {
        // Persisted embeddings and cache keys depend on the exact hash values,
        // so pin them against the published FNV-1a constants rather than only
        // checking that two calls in the same process agree.
        assert_eq!(fnv1a_32(""), 0x811c_9dc5);
        assert_eq!(fnv1a_32("a"), 0xe40c_292c);
        assert_eq!(fnv1a_64(b""), 0xcbf2_9ce4_8422_2325);
        assert_eq!(fnv1a_64(b"a"), 0xaf63_dc4c_8601_ec8c);
    }
}