Skip to main content

coding_agent_search/search/
canonicalize.rs

1//! Text canonicalization for consistent embedding input.
2//!
3//! Delegates to [`frankensearch::DefaultCanonicalizer`] for the full preprocessing
4//! pipeline (NFC normalization, markdown stripping, code block collapsing,
5//! whitespace normalization, low-signal filtering, and truncation).
6//!
//! On top of the shared canonicalization logic, this module adds a pure-ASCII
//! fast path, content hashing, and heuristics for classifying message noise.
8//!
9//! # Example
10//!
11//! ```ignore
12//! use crate::search::canonicalize::{canonicalize_for_embedding, content_hash};
13//!
14//! let raw = "**Hello** world!\n\n```rust\nfn main() {}\n```";
15//! let canonical = canonicalize_for_embedding(raw);
16//! let hash = content_hash(&canonical);
17//! ```
18
19use frankensearch::{Canonicalizer, DefaultCanonicalizer};
20use ring::digest::{self, SHA256};
21
/// Upper bound, in characters, on the text kept after canonicalization.
pub const MAX_EMBED_CHARS: usize = 2_000;

/// How many lines are kept from the start of a code block.
pub const CODE_HEAD_LINES: usize = 20;

/// How many lines are kept from the end of a code block.
pub const CODE_TAIL_LINES: usize = 10;
30
31thread_local! {
32    /// Per-thread cached canonicalizer. DefaultCanonicalizer is a stateless
33    /// POD (three `usize` fields), so the cost of `Default::default()` per
34    /// call was pure overhead; caching it also gives a clean injection point
35    /// for future input-length short-circuiting.
36    static CANONICALIZER: DefaultCanonicalizer = DefaultCanonicalizer::default();
37}
38
/// Low-signal replies. The fast path canonicalizes an input to the empty
/// string when, after whitespace collapse, it equals one of these entries
/// (ASCII case-insensitively).
///
/// Must stay in sync with frankensearch's `LOW_SIGNAL_CONTENT` constant. The
/// slow path delegates to the shared canonicalizer, so drift shows up as a
/// failure of the `canonicalize_fast_path_matches_slow_path_for_pure_ascii_inputs`
/// test for every entry that test exercises.
///
/// Entries are lowercase; the comparison does the case folding.
const LOW_SIGNAL_CONTENT: &[&str] = &[
    "ok",
    "done",
    "done.",
    "got it",
    "got it.",
    "understood",
    "understood.",
    "sure",
    "sure.",
    "yes",
    "no",
    "thanks",
    "thanks.",
    "thank you",
    "thank you.",
];
59
60/// Return `Some(canonical)` when `text` can be processed by the cheap
61/// whitespace-only fast path, `None` otherwise. The fast path matches the
62/// output of the full `DefaultCanonicalizer` pipeline exactly when the input
63/// is pure ASCII and contains no markdown discriminators.
64///
65/// For the dominant tool-output message shape (short plain-ASCII strings
66/// without inline markdown markers, headers, links, blockquotes, or list
67/// markers), this skips NFC normalization, markdown line-by-line stripping,
68/// and code-block collapse — the expensive parts of the slow path — and just
69/// does whitespace collapse + low-signal filter + truncation.
70fn canonicalize_fast_path(text: &str) -> Option<String> {
71    // Pure-ASCII check implies NFC is a no-op; any non-ASCII byte must
72    // flow through the full pipeline because NFC may re-encode composed
73    // characters.
74    if !text.is_ascii() {
75        return None;
76    }
77    // Any markdown discriminator byte forces the slow path. `]` is excluded
78    // because on its own it's harmless; `[` is the real link start token, so
79    // looking for `[` alone suffices.
80    if text
81        .bytes()
82        .any(|b| matches!(b, b'`' | b'*' | b'_' | b'#' | b'['))
83    {
84        return None;
85    }
86    if has_markdown_line_prefix(text) {
87        return None;
88    }
89
90    // Whitespace-collapsed string: split_whitespace + join(' ') produces the
91    // same output as the slow path's char-by-char collapse + trim.
92    // Pre-size the buffer from the input length — collapsed output is always
93    // <= input length for ASCII.
94    let mut collapsed = String::with_capacity(text.len());
95    let mut first = true;
96    for token in text.split_whitespace() {
97        if !first {
98            collapsed.push(' ');
99        }
100        collapsed.push_str(token);
101        first = false;
102    }
103
104    // Low-signal filter: case-insensitive ASCII match against the shared
105    // pattern list. `str::eq_ignore_ascii_case` walks both operands byte-by-
106    // byte and does the case-fold inline, so we avoid the `to_ascii_lowercase`
107    // allocation that the previous version paid on every ack-length input.
108    if !collapsed.is_empty() {
109        for pattern in LOW_SIGNAL_CONTENT {
110            if collapsed.eq_ignore_ascii_case(pattern) {
111                return Some(String::new());
112            }
113        }
114    }
115
116    // Truncate to MAX_EMBED_CHARS. Pure-ASCII inputs let us slice by byte
117    // index == char index.
118    if collapsed.len() > MAX_EMBED_CHARS {
119        collapsed.truncate(MAX_EMBED_CHARS);
120    }
121
122    Some(collapsed)
123}
124
125fn has_markdown_line_prefix(text: &str) -> bool {
126    text.lines().any(|line| {
127        let trimmed = line.trim_start();
128        trimmed.starts_with('>')
129            || trimmed.starts_with("- ")
130            || trimmed.starts_with("+ ")
131            || has_ordered_list_marker(trimmed)
132    })
133}
134
/// Report whether `line` begins with one or more ASCII digits immediately
/// followed by `". "` (for example `1. ` or `10. `).
///
/// Leading whitespace is not skipped here; callers pass an already trimmed
/// line.
fn has_ordered_list_marker(line: &str) -> bool {
    let raw = line.as_bytes();
    let digit_count = raw.iter().take_while(|b| b.is_ascii_digit()).count();

    // `digit_count` never exceeds `raw.len()`, so the slice is in bounds.
    digit_count > 0 && raw[digit_count..].starts_with(b". ")
}
145
146/// Canonicalize text for embedding.
147///
148/// Applies the full preprocessing pipeline to produce clean, consistent text
149/// suitable for embedding. The output is deterministic: the same visual input
150/// always produces the same output.
151///
152/// Hot-path: when the input is pure ASCII and contains no markdown
153/// discriminator bytes, a cheap whitespace-only fast path is used and the
154/// full `DefaultCanonicalizer` pipeline is skipped. The fast path is a
155/// superset-preserving refinement — for any input where it fires, its output
156/// is byte-identical to the slow path.
157pub fn canonicalize_for_embedding(text: &str) -> String {
158    if let Some(fast) = canonicalize_fast_path(text) {
159        return fast;
160    }
161    CANONICALIZER.with(|c| c.canonicalize(text))
162}
163
164/// Compute SHA256 content hash of text.
165///
166/// The hash is computed on the UTF-8 bytes of the input. For consistent
167/// hashing, always canonicalize text first.
168pub fn content_hash(text: &str) -> [u8; 32] {
169    let digest = digest::digest(&SHA256, text.as_bytes());
170    let mut hash = [0u8; 32];
171    hash.copy_from_slice(digest.as_ref());
172    hash
173}
174
175/// Compute SHA256 content hash as hex string.
176///
177/// Convenience wrapper around [`content_hash`] that returns a hex-encoded string.
178pub fn content_hash_hex(text: &str) -> String {
179    let hash = content_hash(text);
180    hex::encode(hash)
181}
182
/// Report whether `role` is present and, once surrounding whitespace is
/// trimmed, equals `expected` ignoring ASCII case.
fn role_is(role: Option<&str>, expected: &str) -> bool {
    match role {
        Some(actual) => actual.trim().eq_ignore_ascii_case(expected),
        None => false,
    }
}
186
/// Report whether `lower` is exactly one of the known short acknowledgements.
///
/// The comparison is exact and case-sensitive, so callers are expected to
/// pass text that is already trimmed and lowercased.
fn is_short_acknowledgement(lower: &str) -> bool {
    const SHORT_ACKS: &[&str] = &[
        "ok",
        "ok.",
        "okay",
        "okay.",
        "done",
        "done.",
        "done!",
        "got it",
        "got it.",
        "got it!",
        "ack",
        "ack.",
        "acknowledged",
        "acknowledged.",
        "confirmed",
        "confirmed.",
        "completed",
        "completed.",
        "complete",
        "complete.",
    ];

    SHORT_ACKS.iter().any(|ack| *ack == lower)
}
211
212/// Return true when text is a low-value acknowledgement/tool confirmation.
213///
214/// These messages add little search value and tend to dominate result sets with
215/// repeated "done/acknowledged/wrote file" noise.
216pub fn is_tool_acknowledgement(role: Option<&str>, text: &str) -> bool {
217    let trimmed = text.trim();
218    if trimmed.is_empty() {
219        return false;
220    }
221
222    if trimmed.len() > 200 {
223        return false;
224    }
225
226    let lower = trimmed.to_ascii_lowercase();
227    if is_short_acknowledgement(&lower) {
228        return true;
229    }
230
231    let toolish = role_is(role, "tool");
232    let short_tool_ack = lower == "no matches found"
233        || lower == "no changes made"
234        || lower == "no changes"
235        || lower == "already up to date"
236        || lower == "up to date"
237        || lower == "file written";
238    if short_tool_ack && (toolish || lower.contains("file") || lower.contains("match")) {
239        return true;
240    }
241
242    let prefixed_tool_ack = lower.starts_with("successfully wrote to ")
243        || lower.starts_with("successfully updated ")
244        || lower.starts_with("successfully created ")
245        || lower.starts_with("successfully deleted ")
246        || lower.starts_with("successfully saved ")
247        || lower.starts_with("successfully applied ")
248        || lower.starts_with("applied patch")
249        || lower.starts_with("patch applied");
250    prefixed_tool_ack && (toolish || lower.contains('/') || lower.contains("file"))
251}
252
/// Return true when content looks like an injected prompt/instructions block.
///
/// These messages stay in storage but are hidden from normal search results
/// unless the query is clearly asking for prompt/instruction content.
///
/// Matching is done on the trimmed, ASCII-lowercased text. It succeeds when
/// the text starts with a known prompt header, contains
/// "follow the agents.md instructions", or opens with "you are a"/"you are an"
/// while also naming an assistant/coding agent and using a directive word
/// (instructions, follow, must, rules).
pub fn is_system_prompt_text(text: &str) -> bool {
    const PROMPT_HEADERS: &[&str] = &[
        "# agents.md instructions for ",
        "agents.md instructions for ",
        "system prompt:",
        "developer prompt:",
        "developer message:",
        "system message:",
    ];
    const PERSONA_OPENERS: &[&str] = &["you are a ", "you are an "];
    const PERSONA_ROLES: &[&str] = &["assistant", "coding agent"];
    const DIRECTIVE_WORDS: &[&str] = &["instructions", "follow", "must", "rules"];

    let lower = text.trim().to_ascii_lowercase();
    if lower.is_empty() {
        return false;
    }

    let starts_with_any = |set: &[&str]| set.iter().any(|prefix| lower.starts_with(*prefix));
    let contains_any = |set: &[&str]| set.iter().any(|needle| lower.contains(*needle));

    if starts_with_any(PROMPT_HEADERS) || lower.contains("follow the agents.md instructions") {
        return true;
    }

    starts_with_any(PERSONA_OPENERS)
        && contains_any(PERSONA_ROLES)
        && contains_any(DIRECTIVE_WORDS)
}
278
/// Return true when a query explicitly asks for prompt/instructions content.
///
/// The query is trimmed and ASCII-lowercased. It qualifies when it contains
/// one of the prompt-related terms, or when it has a "you are" phrase (at the
/// start, or surrounded by spaces) together with "assistant" or
/// "coding agent".
pub fn query_requests_system_prompt(query: &str) -> bool {
    const PROMPT_TERMS: &[&str] = &[
        "system prompt",
        "developer prompt",
        "system message",
        "developer message",
        "system instructions",
        "developer instructions",
        "agents.md",
        "agents md",
        "claude.md",
        "claude md",
        "prompt text",
        "\"you are",
    ];

    let lower = query.trim().to_ascii_lowercase();
    if lower.is_empty() {
        return false;
    }

    if PROMPT_TERMS.iter().any(|term| lower.contains(*term)) {
        return true;
    }

    let has_persona_phrase = lower.starts_with("you are ") || lower.contains(" you are ");
    let names_agent = lower.contains("assistant") || lower.contains("coding agent");
    has_persona_phrase && names_agent
}
301
302/// Noise we can safely skip during indexing.
303pub fn is_hard_message_noise(role: Option<&str>, text: &str) -> bool {
304    text.trim().is_empty() || is_tool_acknowledgement(role, text)
305}
306
307/// Noise we should suppress from search results.
308pub fn is_search_noise_text(text: &str, query: &str) -> bool {
309    let trimmed = text.trim();
310    trimmed.is_empty()
311        || is_tool_acknowledgement(None, trimmed)
312        || (is_system_prompt_text(trimmed) && !query_requests_system_prompt(query))
313}
314
#[cfg(test)]
mod tests {
    //! Unit tests for canonicalization (fast path and shared pipeline),
    //! content hashing, and the message-noise heuristics.

    use super::*;

    /// Assert that the public entry point and the shared canonicalizer agree
    /// on `input`.
    fn assert_matches_slow_path(input: &str) {
        let slow = CANONICALIZER.with(|c| c.canonicalize(input));
        let combined = canonicalize_for_embedding(input);
        assert_eq!(
            combined, slow,
            "canonicalize_for_embedding({input:?}) diverged from slow path"
        );
    }

    #[test]
    fn canonicalize_fast_path_matches_slow_path_for_pure_ascii_inputs() {
        // Every input in this table must either (a) hit the fast path and
        // match the slow path byte-for-byte, or (b) correctly fall through
        // to the slow path because it contains a markdown discriminator or
        // non-ASCII bytes. If the fast path ever diverges, this test catches
        // it before it reaches production.
        let cases = &[
            // Pure-ASCII, no markdown — fast path eligible
            "hello world",
            "  hello   world  ",
            "hello\n\n\nworld\n",
            "line one\nline two\nline three",
            "Thanks!",
            "plain text with punctuation: comma, period. question?",
            "simple-hyphen and plus+signs",
            "parens (like this) are fine",
            // Low-signal acks — fast path must return ""
            "OK",
            "ok",
            "  Done.  ",
            "got it",
            "Thanks",
            "thank you.",
            // Markdown discriminators — fall through to slow path
            "**bold** text",
            "has `inline code`",
            "# A Header",
            "list [link](url)",
            "_italic_ too",
            "> quoted text",
            ">> nested quoted text",
            "1. First item\n2. Second item",
            "  - dash item\n  + plus item",
            // Non-ASCII — fall through (NFC must run)
            "café au lait",
            "caf\u{0065}\u{0301}",
            "emoji 👋 mix",
            // Empty / whitespace-only
            "",
            "   ",
            "\n\n\n",
        ];

        for input in cases {
            assert_matches_slow_path(input);
        }

        // Drift guard: every local low-signal entry, as written, uppercased,
        // and padded with whitespace, must be handled the same way by the
        // shared canonicalizer. A failure here means the local list and
        // frankensearch's list no longer agree.
        for pattern in LOW_SIGNAL_CONTENT {
            assert_matches_slow_path(pattern);
            assert_matches_slow_path(&pattern.to_ascii_uppercase());
            assert_matches_slow_path(&format!("  {pattern}\n"));
        }
    }

    #[test]
    fn canonicalize_fast_path_fires_only_for_plain_ascii() {
        // The parity test above would still pass if the fast path never
        // fired, so pin down which inputs it accepts and what it returns.
        let eligible = [
            ("hello world", "hello world"),
            ("  hello   world  ", "hello world"),
            ("hello\n\n\nworld\n", "hello world"),
            ("simple-hyphen and plus+signs", "simple-hyphen and plus+signs"),
            ("3.14159 is pi", "3.14159 is pi"),
            ("OK", ""),
            ("  Done.  ", ""),
            ("", ""),
            ("\n\n\n", ""),
        ];
        for (input, expected) in &eligible {
            assert_eq!(
                canonicalize_fast_path(input).as_deref(),
                Some(*expected),
                "fast path output mismatch for {input:?}"
            );
        }

        let ineligible = [
            "**bold** text",
            "has `inline code`",
            "# A Header",
            "list [link](url)",
            "_italic_ too",
            "> quoted text",
            "1. First item\n2. Second item",
            "  - dash item\n  + plus item",
            "café au lait",
        ];
        for input in &ineligible {
            assert!(
                canonicalize_fast_path(input).is_none(),
                "fast path must defer to the full pipeline for {input:?}"
            );
        }
    }

    #[test]
    fn canonicalize_fast_path_truncates_to_max_embed_chars() {
        let long_ascii: String = "a ".repeat(MAX_EMBED_CHARS);
        let out = canonicalize_for_embedding(&long_ascii);
        assert!(out.chars().count() <= MAX_EMBED_CHARS);
    }

    #[test]
    fn test_unicode_nfc_normalization() {
        let composed = "caf\u{00E9}";
        let decomposed = "cafe\u{0301}";
        assert_ne!(composed, decomposed);
        let canon_composed = canonicalize_for_embedding(composed);
        let canon_decomposed = canonicalize_for_embedding(decomposed);
        assert_eq!(canon_composed, canon_decomposed);
    }

    #[test]
    fn test_unicode_nfc_hash_stability() {
        let composed = "caf\u{00E9}";
        let decomposed = "cafe\u{0301}";
        let hash1 = content_hash(&canonicalize_for_embedding(composed));
        let hash2 = content_hash(&canonicalize_for_embedding(decomposed));
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_canonicalize_deterministic() {
        let text = "**Hello** _world_!\n\nThis is a [link](http://example.com).";
        let result1 = canonicalize_for_embedding(text);
        let result2 = canonicalize_for_embedding(text);
        assert_eq!(result1, result2);
    }

    #[test]
    fn test_strip_markdown_bold_italic() {
        let text = "**bold** and *italic* and __also bold__";
        let canonical = canonicalize_for_embedding(text);
        assert!(!canonical.contains("**"));
        assert!(!canonical.contains("__"));
        assert!(canonical.contains("bold"));
        assert!(canonical.contains("italic"));
    }

    #[test]
    fn test_strip_markdown_links() {
        let text = "Check out [this link](http://example.com) for more info.";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("this link"));
        assert!(!canonical.contains("http://example.com"));
    }

    #[test]
    fn test_strip_markdown_headers() {
        let text = "# Header 1\n## Header 2\n### Header 3";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("Header 1"));
        assert!(canonical.contains("Header 2"));
        assert!(canonical.contains("Header 3"));
    }

    #[test]
    fn test_code_block_short() {
        let text = "```rust\nfn main() {\n    println!(\"Hello\");\n}\n```";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("[code: rust]"));
        assert!(canonical.contains("fn main()"));
    }

    #[test]
    fn test_code_block_collapse_long() {
        let mut lines = Vec::new();
        for i in 0..50 {
            lines.push(format!("line {i}"));
        }
        let code = format!("```python\n{}\n```", lines.join("\n"));
        let canonical = canonicalize_for_embedding(&code);

        // First 20 and last 10 of the 50 lines survive; the middle is elided.
        assert!(canonical.contains("line 0"));
        assert!(canonical.contains("line 19"));
        assert!(canonical.contains("line 40"));
        assert!(canonical.contains("line 49"));
        assert!(canonical.contains("lines omitted"));
        assert!(!canonical.contains("line 25"));
    }

    #[test]
    fn test_whitespace_normalization() {
        let text = "hello    world\n\n\nwith   multiple   spaces";
        let canonical = canonicalize_for_embedding(text);
        assert!(!canonical.contains("  "));
        assert!(canonical.contains("hello"));
        assert!(canonical.contains("world"));
    }

    #[test]
    fn test_low_signal_filtered() {
        assert_eq!(canonicalize_for_embedding("OK"), "");
        assert_eq!(canonicalize_for_embedding("Done."), "");
        assert_eq!(canonicalize_for_embedding("Got it."), "");
        assert_eq!(canonicalize_for_embedding("Thanks!"), "Thanks!");
    }

    #[test]
    fn test_truncation() {
        let long_text: String = "a".repeat(5000);
        let canonical = canonicalize_for_embedding(&long_text);
        assert_eq!(canonical.chars().count(), 2000);
    }

    #[test]
    fn test_empty_input() {
        assert_eq!(canonicalize_for_embedding(""), "");
    }

    #[test]
    fn test_content_hash_deterministic() {
        let text = "Hello, world!";
        let hash1 = content_hash(text);
        let hash2 = content_hash(text);
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_content_hash_different_for_different_input() {
        let hash1 = content_hash("Hello");
        let hash2 = content_hash("World");
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_content_hash_hex() {
        let hex = content_hash_hex("test");
        assert_eq!(hex.len(), 64);
        assert!(hex.chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn test_is_tool_acknowledgement_detects_short_replies() {
        assert!(is_tool_acknowledgement(None, "OK"));
        assert!(is_tool_acknowledgement(None, "Acknowledged."));
        assert!(is_tool_acknowledgement(None, "Done!"));
        assert!(!is_tool_acknowledgement(None, "Thanks!"));
    }

    #[test]
    fn test_is_tool_acknowledgement_detects_tool_write_confirmations() {
        assert!(is_tool_acknowledgement(
            Some("tool"),
            "Successfully wrote to /tmp/output.rs"
        ));
        assert!(is_tool_acknowledgement(Some("tool"), "No matches found"));
        assert!(!is_tool_acknowledgement(
            Some("tool"),
            "Compilation failed with an auth refresh error"
        ));
    }

    #[test]
    fn test_is_system_prompt_text_detects_instruction_blocks() {
        assert!(is_system_prompt_text(
            "# AGENTS.md instructions for /repo\n\nFollow these rules carefully."
        ));
        assert!(is_system_prompt_text(
            "You are a coding assistant. You must follow the instructions exactly."
        ));
        assert!(!is_system_prompt_text(
            "You are looking at the auth module."
        ));
    }

    #[test]
    fn test_query_requests_system_prompt_matches_prompt_terms() {
        assert!(query_requests_system_prompt("AGENTS.md instructions"));
        assert!(query_requests_system_prompt("show me the system prompt"));
        assert!(query_requests_system_prompt("you are a coding assistant"));
        assert!(!query_requests_system_prompt("build instructions"));
        assert!(!query_requests_system_prompt("authentication failure"));
    }

    #[test]
    fn test_list_markers_stripped() {
        let text = "1. First item\n2. Second item\n10. Tenth item";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("First item"));
        assert!(canonical.contains("Second item"));
        assert!(canonical.contains("Tenth item"));
    }

    #[test]
    fn test_numbers_not_list_markers_preserved() {
        let text = "3.14159 is pi";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("3.14159"));
    }

    #[test]
    fn test_blockquote() {
        let text = "> This is a quote\n> spanning multiple lines";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("This is a quote"));
    }

    #[test]
    fn test_inline_code() {
        let text = "Use `fn main()` to start.";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("fn main()"));
        assert!(!canonical.contains('`'));
    }

    #[test]
    fn test_emoji_preserved() {
        let text = "Hello 👋 World 🌍";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains('👋'));
        assert!(canonical.contains('🌍'));
    }

    #[test]
    fn test_mixed_content() {
        let text = r#"# Welcome

**Bold** and *italic* text.

```rust
fn hello() {
    println!("Hello!");
}
```

See [docs](http://docs.rs) for more.
"#;
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("Welcome"));
        assert!(!canonical.contains("**"));
        assert!(canonical.contains("Bold"));
        assert!(canonical.contains("[code: rust]"));
        assert!(canonical.contains("docs"));
        assert!(!canonical.contains("http://docs.rs"));
    }

    #[test]
    fn test_unbalanced_link_preserves_content() {
        let text = "Check [link](url( unbalanced. Next sentence.";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("Next sentence"));
        assert!(canonical.contains("unbalanced"));
    }
}