coding_agent_search/search/
canonicalize.rs1use frankensearch::{Canonicalizer, DefaultCanonicalizer};
20use ring::digest::{self, SHA256};
21
/// Upper bound on the length of canonicalized text.
///
/// `canonicalize_fast_path` truncates its (ASCII-only) output to this many
/// bytes, which for ASCII equals the number of characters.
pub const MAX_EMBED_CHARS: usize = 2_000;

/// Number of leading lines associated with long code blocks.
///
/// NOTE(review): not referenced by the code visible in this file; presumably
/// mirrors how many head lines the full canonicalizer keeps when it collapses
/// a long fenced code block — confirm against the canonicalizer.
pub const CODE_HEAD_LINES: usize = 20;

/// Number of trailing lines associated with long code blocks.
///
/// NOTE(review): not referenced by the code visible in this file; presumably
/// the tail-side counterpart of `CODE_HEAD_LINES` — confirm.
pub const CODE_TAIL_LINES: usize = 10;
30
thread_local! {
    // One `DefaultCanonicalizer` per thread, created with
    // `DefaultCanonicalizer::default()`. `canonicalize_for_embedding` uses it
    // for every input the ASCII fast path declines.
    static CANONICALIZER: DefaultCanonicalizer = DefaultCanonicalizer::default();
}
38
// Short replies that carry no content worth embedding.
//
// `canonicalize_fast_path` compares the whitespace-collapsed input against
// these entries ASCII case-insensitively and returns an empty string on an
// exact match. Entries are lowercase; punctuation variants are listed
// explicitly, so e.g. "thanks!" is NOT in the table and is kept as content.
//
// NOTE(review): presumably this has to stay in sync with whatever the full
// canonicalizer filters, so both paths agree — confirm.
const LOW_SIGNAL_CONTENT: &[&str] = &[
    "ok",
    "done",
    "done.",
    "got it",
    "got it.",
    "understood",
    "understood.",
    "sure",
    "sure.",
    "yes",
    "no",
    "thanks",
    "thanks.",
    "thank you",
    "thank you.",
];
59
60fn canonicalize_fast_path(text: &str) -> Option<String> {
71 if !text.is_ascii() {
75 return None;
76 }
77 if text
81 .bytes()
82 .any(|b| matches!(b, b'`' | b'*' | b'_' | b'#' | b'['))
83 {
84 return None;
85 }
86 if has_markdown_line_prefix(text) {
87 return None;
88 }
89
90 let mut collapsed = String::with_capacity(text.len());
95 let mut first = true;
96 for token in text.split_whitespace() {
97 if !first {
98 collapsed.push(' ');
99 }
100 collapsed.push_str(token);
101 first = false;
102 }
103
104 if !collapsed.is_empty() {
109 for pattern in LOW_SIGNAL_CONTENT {
110 if collapsed.eq_ignore_ascii_case(pattern) {
111 return Some(String::new());
112 }
113 }
114 }
115
116 if collapsed.len() > MAX_EMBED_CHARS {
119 collapsed.truncate(MAX_EMBED_CHARS);
120 }
121
122 Some(collapsed)
123}
124
125fn has_markdown_line_prefix(text: &str) -> bool {
126 text.lines().any(|line| {
127 let trimmed = line.trim_start();
128 trimmed.starts_with('>')
129 || trimmed.starts_with("- ")
130 || trimmed.starts_with("+ ")
131 || has_ordered_list_marker(trimmed)
132 })
133}
134
/// Reports whether `line` begins with one or more ASCII digits immediately
/// followed by `". "` (for example `1. ` or `10. `).
///
/// No whitespace is skipped here; callers pass a line whose leading
/// whitespace has already been removed.
fn has_ordered_list_marker(line: &str) -> bool {
    let raw = line.as_bytes();
    let digit_count = raw.iter().take_while(|b| b.is_ascii_digit()).count();
    // `digit_count` never exceeds `raw.len()`, so the slice cannot panic.
    digit_count > 0 && raw[digit_count..].starts_with(b". ")
}
145
146pub fn canonicalize_for_embedding(text: &str) -> String {
158 if let Some(fast) = canonicalize_fast_path(text) {
159 return fast;
160 }
161 CANONICALIZER.with(|c| c.canonicalize(text))
162}
163
164pub fn content_hash(text: &str) -> [u8; 32] {
169 let digest = digest::digest(&SHA256, text.as_bytes());
170 let mut hash = [0u8; 32];
171 hash.copy_from_slice(digest.as_ref());
172 hash
173}
174
175pub fn content_hash_hex(text: &str) -> String {
179 let hash = content_hash(text);
180 hex::encode(hash)
181}
182
/// Reports whether `role` is present and, once surrounding whitespace is
/// trimmed, equals `expected` ignoring ASCII case. `expected` is compared
/// as given (it is not trimmed).
fn role_is(role: Option<&str>, expected: &str) -> bool {
    match role {
        Some(actual) => actual.trim().eq_ignore_ascii_case(expected),
        None => false,
    }
}
186
/// Reports whether `lower` is exactly one of the known short acknowledgement
/// phrases.
///
/// The comparison is exact and case-sensitive: the caller is expected to pass
/// text that is already trimmed and ASCII-lowercased.
fn is_short_acknowledgement(lower: &str) -> bool {
    const ACKNOWLEDGEMENTS: &[&str] = &[
        "ok",
        "ok.",
        "okay",
        "okay.",
        "done",
        "done.",
        "done!",
        "got it",
        "got it.",
        "got it!",
        "ack",
        "ack.",
        "acknowledged",
        "acknowledged.",
        "confirmed",
        "confirmed.",
        "completed",
        "completed.",
        "complete",
        "complete.",
    ];

    ACKNOWLEDGEMENTS.iter().any(|phrase| *phrase == lower)
}
211
212pub fn is_tool_acknowledgement(role: Option<&str>, text: &str) -> bool {
217 let trimmed = text.trim();
218 if trimmed.is_empty() {
219 return false;
220 }
221
222 if trimmed.len() > 200 {
223 return false;
224 }
225
226 let lower = trimmed.to_ascii_lowercase();
227 if is_short_acknowledgement(&lower) {
228 return true;
229 }
230
231 let toolish = role_is(role, "tool");
232 let short_tool_ack = lower == "no matches found"
233 || lower == "no changes made"
234 || lower == "no changes"
235 || lower == "already up to date"
236 || lower == "up to date"
237 || lower == "file written";
238 if short_tool_ack && (toolish || lower.contains("file") || lower.contains("match")) {
239 return true;
240 }
241
242 let prefixed_tool_ack = lower.starts_with("successfully wrote to ")
243 || lower.starts_with("successfully updated ")
244 || lower.starts_with("successfully created ")
245 || lower.starts_with("successfully deleted ")
246 || lower.starts_with("successfully saved ")
247 || lower.starts_with("successfully applied ")
248 || lower.starts_with("applied patch")
249 || lower.starts_with("patch applied");
250 prefixed_tool_ack && (toolish || lower.contains('/') || lower.contains("file"))
251}
252
/// Heuristically reports whether `text` looks like a system/developer prompt
/// or an AGENTS.md instruction block.
///
/// Matching is done on the trimmed, ASCII-lowercased text. It is a prompt if
/// it starts with one of the known headers, contains
/// "follow the agents.md instructions" anywhere, or opens with
/// "you are a "/"you are an " while also mentioning an assistant or coding
/// agent and at least one directive word ("instructions", "follow", "must",
/// "rules"). Empty or whitespace-only text is never a prompt.
pub fn is_system_prompt_text(text: &str) -> bool {
    const HEADER_PREFIXES: &[&str] = &[
        "# agents.md instructions for ",
        "agents.md instructions for ",
        "system prompt:",
        "developer prompt:",
        "developer message:",
        "system message:",
    ];
    const DIRECTIVE_WORDS: &[&str] = &["instructions", "follow", "must", "rules"];

    let lower = text.trim().to_ascii_lowercase();
    if lower.is_empty() {
        return false;
    }

    let has_header = HEADER_PREFIXES
        .iter()
        .any(|prefix| lower.starts_with(*prefix));
    if has_header || lower.contains("follow the agents.md instructions") {
        return true;
    }

    let opens_with_persona = lower.starts_with("you are a ") || lower.starts_with("you are an ");
    let names_agent = lower.contains("assistant") || lower.contains("coding agent");
    let gives_directives = DIRECTIVE_WORDS.iter().any(|word| lower.contains(*word));
    opens_with_persona && names_agent && gives_directives
}
278
/// Reports whether a search `query` is explicitly looking for system-prompt
/// style content.
///
/// The query is trimmed and ASCII-lowercased. It matches when it contains one
/// of the prompt-related terms below, or when it contains the phrase
/// "you are " (at the start or after a space) together with "assistant" or
/// "coding agent". Empty or whitespace-only queries never match.
pub fn query_requests_system_prompt(query: &str) -> bool {
    const PROMPT_TERMS: &[&str] = &[
        "system prompt",
        "developer prompt",
        "system message",
        "developer message",
        "system instructions",
        "developer instructions",
        "agents.md",
        "agents md",
        "claude.md",
        "claude md",
        "prompt text",
        "\"you are",
    ];

    let lower = query.trim().to_ascii_lowercase();
    if lower.is_empty() {
        return false;
    }

    if PROMPT_TERMS.iter().any(|term| lower.contains(*term)) {
        return true;
    }

    let quotes_persona = lower.starts_with("you are ") || lower.contains(" you are ");
    quotes_persona && (lower.contains("assistant") || lower.contains("coding agent"))
}
301
302pub fn is_hard_message_noise(role: Option<&str>, text: &str) -> bool {
304 text.trim().is_empty() || is_tool_acknowledgement(role, text)
305}
306
307pub fn is_search_noise_text(text: &str, query: &str) -> bool {
309 let trimmed = text.trim();
310 trimmed.is_empty()
311 || is_tool_acknowledgement(None, trimmed)
312 || (is_system_prompt_text(trimmed) && !query_requests_system_prompt(query))
313}
314
#[cfg(test)]
mod tests {
    use super::*;

    // The public entry point must return exactly what the full canonicalizer
    // returns, both for inputs the fast path handles and for inputs it
    // declines.
    #[test]
    fn canonicalize_fast_path_matches_slow_path_for_pure_ascii_inputs() {
        let cases = &[
            // Plain ASCII: handled by the fast path.
            "hello world",
            // Runs of spaces and surrounding padding must collapse.
            "  hello   world  ",
            "hello\n\n\nworld\n",
            "line one\nline two\nline three",
            "Thanks!",
            "plain text with punctuation: comma, period. question?",
            "simple-hyphen and plus+signs",
            "parens (like this) are fine",
            // Low-signal phrases in assorted case and padding.
            "OK",
            "ok",
            " Done. ",
            "got it",
            "Thanks",
            "thank you.",
            // Markdown syntax: the fast path declines these.
            "**bold** text",
            "has `inline code`",
            "# A Header",
            "list [link](url)",
            "_italic_ too",
            "> quoted text",
            ">> nested quoted text",
            "1. First item\n2. Second item",
            " - dash item\n + plus item",
            // Non-ASCII: the fast path declines these.
            "café au lait",
            "caf\u{0065}\u{0301}",
            "emoji 👋 mix",
            // Empty and whitespace-only input.
            "",
            " ",
            "\n\n\n",
        ];

        for input in cases {
            let slow = CANONICALIZER.with(|c| c.canonicalize(input));
            let combined = canonicalize_for_embedding(input);
            assert_eq!(
                combined, slow,
                "canonicalize_for_embedding({input:?}) diverged from slow path"
            );
        }
    }

    #[test]
    fn canonicalize_fast_path_truncates_to_max_embed_chars() {
        let long_ascii: String = "a ".repeat(MAX_EMBED_CHARS);
        let out = canonicalize_for_embedding(&long_ascii);
        assert!(out.chars().count() <= MAX_EMBED_CHARS);
    }

    #[test]
    fn test_unicode_nfc_normalization() {
        // Same text, once precomposed and once as base letter + combining mark.
        let composed = "caf\u{00E9}";
        let decomposed = "cafe\u{0301}";
        assert_ne!(composed, decomposed);
        let canon_composed = canonicalize_for_embedding(composed);
        let canon_decomposed = canonicalize_for_embedding(decomposed);
        assert_eq!(canon_composed, canon_decomposed);
    }

    #[test]
    fn test_unicode_nfc_hash_stability() {
        let composed = "caf\u{00E9}";
        let decomposed = "cafe\u{0301}";
        let hash1 = content_hash(&canonicalize_for_embedding(composed));
        let hash2 = content_hash(&canonicalize_for_embedding(decomposed));
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_canonicalize_deterministic() {
        let text = "**Hello** _world_!\n\nThis is a [link](http://example.com).";
        let result1 = canonicalize_for_embedding(text);
        let result2 = canonicalize_for_embedding(text);
        assert_eq!(result1, result2);
    }

    #[test]
    fn test_strip_markdown_bold_italic() {
        let text = "**bold** and *italic* and __also bold__";
        let canonical = canonicalize_for_embedding(text);
        assert!(!canonical.contains("**"));
        assert!(!canonical.contains("__"));
        assert!(canonical.contains("bold"));
        assert!(canonical.contains("italic"));
    }

    #[test]
    fn test_strip_markdown_links() {
        let text = "Check out [this link](http://example.com) for more info.";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("this link"));
        assert!(!canonical.contains("http://example.com"));
    }

    #[test]
    fn test_strip_markdown_headers() {
        let text = "# Header 1\n## Header 2\n### Header 3";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("Header 1"));
        assert!(canonical.contains("Header 2"));
        assert!(canonical.contains("Header 3"));
    }

    #[test]
    fn test_code_block_short() {
        let text = "```rust\nfn main() {\n println!(\"Hello\");\n}\n```";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("[code: rust]"));
        assert!(canonical.contains("fn main()"));
    }

    #[test]
    fn test_code_block_collapse_long() {
        let mut lines = Vec::new();
        for i in 0..50 {
            lines.push(format!("line {i}"));
        }
        let code = format!("```python\n{}\n```", lines.join("\n"));
        let canonical = canonicalize_for_embedding(&code);

        // The head (lines 0..=19) and tail (lines 40..=49) survive; the
        // middle is replaced by an omission marker.
        assert!(canonical.contains("line 0"));
        assert!(canonical.contains("line 19"));
        assert!(canonical.contains("line 40"));
        assert!(canonical.contains("line 49"));
        assert!(canonical.contains("lines omitted"));
        assert!(!canonical.contains("line 25"));
    }

    #[test]
    fn test_whitespace_normalization() {
        let text = "hello    world\n\n\nwith   multiple   spaces";
        let canonical = canonicalize_for_embedding(text);
        // Words stay separated by single spaces, so only a run of two or
        // more spaces indicates that collapsing failed.
        assert!(!canonical.contains("  "));
        assert!(canonical.contains("hello"));
        assert!(canonical.contains("world"));
    }

    #[test]
    fn test_low_signal_filtered() {
        assert_eq!(canonicalize_for_embedding("OK"), "");
        assert_eq!(canonicalize_for_embedding("Done."), "");
        assert_eq!(canonicalize_for_embedding("Got it."), "");
        // "thanks!" is not in the low-signal table, so it is kept.
        assert_eq!(canonicalize_for_embedding("Thanks!"), "Thanks!");
    }

    #[test]
    fn test_truncation() {
        let long_text: String = "a".repeat(5000);
        let canonical = canonicalize_for_embedding(&long_text);
        assert_eq!(canonical.chars().count(), 2000);
    }

    #[test]
    fn test_empty_input() {
        assert_eq!(canonicalize_for_embedding(""), "");
    }

    #[test]
    fn test_content_hash_deterministic() {
        let text = "Hello, world!";
        let hash1 = content_hash(text);
        let hash2 = content_hash(text);
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_content_hash_different_for_different_input() {
        let hash1 = content_hash("Hello");
        let hash2 = content_hash("World");
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_content_hash_hex() {
        let hex = content_hash_hex("test");
        assert_eq!(hex.len(), 64);
        assert!(hex.chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn test_is_tool_acknowledgement_detects_short_replies() {
        assert!(is_tool_acknowledgement(None, "OK"));
        assert!(is_tool_acknowledgement(None, "Acknowledged."));
        assert!(is_tool_acknowledgement(None, "Done!"));
        assert!(!is_tool_acknowledgement(None, "Thanks!"));
    }

    #[test]
    fn test_is_tool_acknowledgement_detects_tool_write_confirmations() {
        assert!(is_tool_acknowledgement(
            Some("tool"),
            "Successfully wrote to /tmp/output.rs"
        ));
        assert!(is_tool_acknowledgement(Some("tool"), "No matches found"));
        assert!(!is_tool_acknowledgement(
            Some("tool"),
            "Compilation failed with an auth refresh error"
        ));
    }

    #[test]
    fn test_is_system_prompt_text_detects_instruction_blocks() {
        assert!(is_system_prompt_text(
            "# AGENTS.md instructions for /repo\n\nFollow these rules carefully."
        ));
        assert!(is_system_prompt_text(
            "You are a coding assistant. You must follow the instructions exactly."
        ));
        assert!(!is_system_prompt_text(
            "You are looking at the auth module."
        ));
    }

    #[test]
    fn test_query_requests_system_prompt_matches_prompt_terms() {
        assert!(query_requests_system_prompt("AGENTS.md instructions"));
        assert!(query_requests_system_prompt("show me the system prompt"));
        assert!(query_requests_system_prompt("you are a coding assistant"));
        assert!(!query_requests_system_prompt("build instructions"));
        assert!(!query_requests_system_prompt("authentication failure"));
    }

    #[test]
    fn test_list_markers_stripped() {
        let text = "1. First item\n2. Second item\n10. Tenth item";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("First item"));
        assert!(canonical.contains("Second item"));
        assert!(canonical.contains("Tenth item"));
    }

    #[test]
    fn test_numbers_not_list_markers_preserved() {
        // "3." is followed by a digit, not a space, so it is not a list marker.
        let text = "3.14159 is pi";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("3.14159"));
    }

    #[test]
    fn test_blockquote() {
        let text = "> This is a quote\n> spanning multiple lines";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("This is a quote"));
    }

    #[test]
    fn test_inline_code() {
        let text = "Use `fn main()` to start.";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("fn main()"));
        assert!(!canonical.contains('`'));
    }

    #[test]
    fn test_emoji_preserved() {
        let text = "Hello 👋 World 🌍";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains('👋'));
        assert!(canonical.contains('🌍'));
    }

    #[test]
    fn test_mixed_content() {
        let text = r#"# Welcome

**Bold** and *italic* text.

```rust
fn hello() {
    println!("Hello!");
}
```

See [docs](http://docs.rs) for more.
"#;
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("Welcome"));
        assert!(!canonical.contains("**"));
        assert!(canonical.contains("Bold"));
        assert!(canonical.contains("[code: rust]"));
        assert!(canonical.contains("docs"));
        assert!(!canonical.contains("http://docs.rs"));
    }

    #[test]
    fn test_unbalanced_link_preserves_content() {
        // A link whose URL part never closes must not swallow the rest of
        // the text.
        let text = "Check [link](url( unbalanced. Next sentence.";
        let canonical = canonicalize_for_embedding(text);
        assert!(canonical.contains("Next sentence"));
        assert!(canonical.contains("unbalanced"));
    }
}