1use sha2::{Digest, Sha256};
31
32pub use crate::cloud::api_types::{Observation, ObservationScope};
33use crate::observability::privacy::strip_private_tagged_regions;
34
/// Borrowed description of a single tool invocation, as consumed by [`classify`].
///
/// Every field is either a borrow or a `Copy` scalar, so the struct is cheap to copy.
#[derive(Debug, Clone, Copy)]
pub struct ClassifyInput<'a> {
    /// Name of the tool that ran. [`classify`] only accepts `"Edit"`, `"MultiEdit"`
    /// and `"Write"`.
    pub tool: &'a str,
    /// Path of the touched file. [`classify`] returns `None` when it is absent.
    pub file_path: Option<&'a str>,
    /// Diff text of the change. Lines starting with `-` / `+` are read as removed /
    /// added lines.
    pub diff: Option<&'a str>,
    /// Text after the change, when available.
    pub new_text: Option<&'a str>,
    /// Text before the change. Its absence on a `"Write"` classifies the change as a
    /// new file.
    pub old_text: Option<&'a str>,
    /// Session identifier. [`classify`] substitutes an empty string when it is missing.
    pub session_id: Option<&'a str>,
    /// Timestamp in Unix milliseconds. [`classify`] falls back to the current time.
    pub ts_ms: Option<i64>,
}
60
/// Byte budget for the diff text kept in an observation's `diff_excerpt`.
///
/// Longer diffs are cut at or just below this many bytes and then get a truncation
/// marker appended, so a truncated excerpt is longer than this value by the marker.
pub const DIFF_EXCERPT_MAX_BYTES: usize = 1024;
65
/// Maximum length of an observation title, counted in `char`s (not bytes).
pub const TITLE_MAX_CHARS: usize = 120;
69
/// Maximum length of an observation narrative, counted in `char`s (not bytes).
pub const NARRATIVE_MAX_CHARS: usize = 500;
72
/// Path fragments that mark a file as sensitive.
///
/// [`is_privacy_denied`] lower-cases the path before matching, so every entry here
/// must be lower-case. Matching is plain substring containment anywhere in the path,
/// not an extension or file-name comparison.
const PRIVACY_DENY_SUBSTRINGS: &[&str] =
    &[".env", ".secrets", ".key", ".pem", "id_rsa", "credentials"];
77
78pub fn classify(input: &ClassifyInput<'_>) -> Option<Observation> {
82 if !matches!(input.tool, "Edit" | "MultiEdit" | "Write") {
84 return None;
85 }
86
87 let file_path = input.file_path?;
88 if is_privacy_denied(file_path) {
89 return None;
90 }
91
92 if input.diff.is_none() && input.new_text.is_none() {
95 return None;
96 }
97
98 let obs_type = determine_obs_type(input);
99 let title = build_title(input.tool, file_path, &obs_type);
100 let narrative = build_narrative(input);
101 let diff_excerpt = input
102 .diff
103 .map(strip_private_tagged_regions)
104 .map(|diff| truncate_diff_excerpt(&diff));
105
106 let session_id = input.session_id.unwrap_or("").to_owned();
107 let ts_ms = input.ts_ms.unwrap_or_else(now_unix_ms);
108 let content_hash =
109 compute_content_hash(&session_id, Some(file_path), &title, narrative.as_deref());
110
111 Some(Observation {
112 session_id,
113 ts_ms,
114 obs_type,
115 tool: input.tool.to_owned(),
116 file_path: Some(file_path.to_owned()),
117 scope: derive_scope(file_path),
118 title,
119 narrative,
120 diff_excerpt,
121 content_hash,
122 })
123}
124
125fn determine_obs_type(input: &ClassifyInput<'_>) -> String {
128 if input.tool == "Write" && input.old_text.is_none() {
133 return "feature".to_owned();
134 }
135
136 if let Some(old) = input.old_text
140 && removes_bug_marker(old, input.new_text.unwrap_or(""))
141 {
142 return "bugfix".to_owned();
143 }
144
145 if let Some(diff) = input.diff {
148 if diff_is_whitespace_only(diff) {
149 return "refactor".to_owned();
150 }
151 } else if let (Some(old), Some(new)) = (input.old_text, input.new_text)
152 && strip_ws(old) == strip_ws(new)
153 && old != new
154 {
155 return "refactor".to_owned();
156 }
157
158 "change".to_owned()
159}
160
161fn removes_bug_marker(old: &str, new: &str) -> bool {
171 const MARKERS: &[&str] = &["FIXME", "BUG", "TODO"];
172 for marker in MARKERS {
173 let before = count_word_occurrences(old, marker);
174 let after = count_word_occurrences(new, marker);
175 if before > after {
176 return true;
177 }
178 }
179 false
180}
181
182fn count_word_occurrences(haystack: &str, needle: &str) -> usize {
183 if needle.is_empty() {
184 return 0;
185 }
186 let bytes = haystack.as_bytes();
187 let nbytes = needle.as_bytes();
188 let mut count = 0;
189 let mut i = 0;
190 while i + nbytes.len() <= bytes.len() {
191 if &bytes[i..i + nbytes.len()] == nbytes {
192 let prev_ok = i == 0 || !is_word_byte(bytes[i - 1]);
193 let next_ok = i + nbytes.len() == bytes.len() || !is_word_byte(bytes[i + nbytes.len()]);
194 if prev_ok && next_ok {
195 count += 1;
196 i += nbytes.len();
197 continue;
198 }
199 }
200 i += 1;
201 }
202 count
203}
204
/// Returns `true` for bytes that can be part of an identifier-like word:
/// ASCII letters, ASCII digits and `_`.
const fn is_word_byte(b: u8) -> bool {
    matches!(b, b'_' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z')
}
208
209fn diff_is_whitespace_only(diff: &str) -> bool {
213 let mut removed = String::new();
214 let mut added = String::new();
215 let mut saw_change = false;
216 for line in diff.lines() {
217 if let Some(rest) = line.strip_prefix('-') {
218 saw_change = true;
219 removed.push_str(rest);
220 removed.push('\n');
221 } else if let Some(rest) = line.strip_prefix('+') {
222 saw_change = true;
223 added.push_str(rest);
224 added.push('\n');
225 }
226 }
227 if !saw_change {
228 return false;
229 }
230 strip_ws(&removed) == strip_ws(&added)
231}
232
/// Returns `s` with every Unicode whitespace character removed.
fn strip_ws(s: &str) -> String {
    // `split_whitespace` splits on the same characters `char::is_whitespace` accepts,
    // so concatenating the pieces drops exactly the whitespace.
    s.split_whitespace().collect()
}
239
240fn build_title(tool: &str, file_path: &str, obs_type: &str) -> String {
243 let hint = match obs_type {
244 "feature" => "new file",
245 "bugfix" => "remove bug marker",
246 "refactor" => "whitespace/rename",
247 _ => "edit",
248 };
249 let base = format!("{tool} {file_path}: {hint}");
250 truncate_chars(&base, TITLE_MAX_CHARS)
251}
252
253fn build_narrative(input: &ClassifyInput<'_>) -> Option<String> {
257 let diff = strip_private_tagged_regions(input.diff?);
258 let mut collected = String::new();
259 for line in diff.lines().take(6) {
260 if !collected.is_empty() {
261 collected.push('\n');
262 }
263 collected.push_str(line);
264 }
265 if collected.is_empty() {
266 return None;
267 }
268 Some(truncate_chars(&collected, NARRATIVE_MAX_CHARS))
269}
270
/// Shortens `s` to at most `max_chars` characters (Unicode scalar values, not bytes).
///
/// Strings that already fit are returned unchanged. Longer strings keep their first
/// `max_chars - 1` characters followed by `…`, so the result is exactly `max_chars`
/// characters long. With `max_chars == 0` nothing fits, not even the ellipsis, and
/// the result is empty.
fn truncate_chars(s: &str, max_chars: usize) -> String {
    // A char at index `max_chars` exists only when `s` exceeds the budget; probing for
    // it avoids counting the characters of the whole string.
    if s.chars().nth(max_chars).is_none() {
        return s.to_owned();
    }
    if max_chars == 0 {
        return String::new();
    }

    // One slot is reserved for the ellipsis. `s` has more than `max_chars` chars here,
    // so the char at index `max_chars - 1` always exists; `s.len()` is only a fallback.
    let cut = s
        .char_indices()
        .nth(max_chars - 1)
        .map_or(s.len(), |(offset, _)| offset);

    let mut out = String::with_capacity(cut + '…'.len_utf8());
    out.push_str(&s[..cut]);
    out.push('…');
    out
}
281
282fn truncate_diff_excerpt(diff: &str) -> String {
285 if diff.len() <= DIFF_EXCERPT_MAX_BYTES {
286 return diff.to_owned();
287 }
288 let mut end = DIFF_EXCERPT_MAX_BYTES;
291 while end > 0 && !diff.is_char_boundary(end) {
292 end -= 1;
293 }
294 let mut out = String::with_capacity(end + 16);
295 out.push_str(&diff[..end]);
296 out.push_str("\n…[truncated]");
297 out
298}
299
300pub(crate) fn compute_content_hash(
305 session_id: &str,
306 file_path: Option<&str>,
307 title: &str,
308 narrative: Option<&str>,
309) -> String {
310 let mut hasher = Sha256::new();
311 hasher.update(session_id.as_bytes());
312 hasher.update(b"|");
313 hasher.update(file_path.unwrap_or("").as_bytes());
314 hasher.update(b"|");
315 hasher.update(title.as_bytes());
316 hasher.update(b"|");
317 hasher.update(narrative.unwrap_or("").as_bytes());
318 let digest = hasher.finalize();
319 let mut hex = String::with_capacity(16);
320 for byte in digest.iter().take(8) {
321 hex.push_str(&format!("{byte:02x}"));
322 }
323 hex
324}
325
326pub fn is_privacy_denied(path: &str) -> bool {
330 let lower = path.to_ascii_lowercase();
331 PRIVACY_DENY_SUBSTRINGS
335 .iter()
336 .any(|needle| lower.contains(needle))
337}
338
339fn derive_scope(file_path: &str) -> Option<ObservationScope> {
340 let trimmed = file_path.trim_matches('/');
341 if trimmed.is_empty() {
342 return None;
343 }
344
345 let parts: Vec<&str> = trimmed.split('/').filter(|part| !part.is_empty()).collect();
346 if parts.is_empty() {
347 return None;
348 }
349
350 let display_name = parts.last().map(|part| (*part).to_owned());
351 let parent_path = if parts.len() > 1 {
352 Some(parts[..parts.len() - 1].join("/"))
353 } else {
354 None
355 };
356
357 Some(ObservationScope {
358 anchor_kind: "file".to_owned(),
359 anchor_key: parts.join("/"),
360 parent_path,
361 display_name,
362 })
363}
364
/// Returns the current wall-clock time as milliseconds since the Unix epoch.
///
/// Best effort: a clock set before the epoch yields `0`, and a value too large for
/// `i64` saturates at `i64::MAX` instead of wrapping.
fn now_unix_ms() -> i64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        // `as_millis` is a `u128`; convert with a checked cast rather than `as`.
        .map_or(0, |elapsed| {
            i64::try_from(elapsed.as_millis()).unwrap_or(i64::MAX)
        })
}
371
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a [`ClassifyInput`] with a fixed session id and timestamp so each test
    /// only spells out the fields it varies.
    fn input<'a>(
        tool: &'a str,
        file: &'a str,
        diff: Option<&'a str>,
        new_text: Option<&'a str>,
        old_text: Option<&'a str>,
    ) -> ClassifyInput<'a> {
        ClassifyInput {
            tool,
            file_path: Some(file),
            diff,
            new_text,
            old_text,
            session_id: Some("sess_test"),
            ts_ms: Some(1_714_000_000_000),
        }
    }

    /// A `Write` without previous text is a new file and must be typed `feature`.
    #[test]
    fn classify_write_new_file_returns_feature() {
        let inp = input(
            "Write",
            "src/new_mod.rs",
            Some("+fn hello() {}\n"),
            Some("fn hello() {}\n"),
            None,
        );
        let obs = classify(&inp).expect("some");
        assert_eq!(obs.obs_type, "feature");
        assert_eq!(obs.tool, "Write");
        assert_eq!(obs.file_path.as_deref(), Some("src/new_mod.rs"));
        assert!(
            obs.title.contains("Write"),
            "title missing tool: {}",
            obs.title
        );
    }

    /// Dropping a `FIXME` marker between old and new text is typed `bugfix`.
    #[test]
    fn classify_edit_removing_fixme_returns_bugfix() {
        let old = "// FIXME: panics on None\nfoo.unwrap();\n";
        let new = "if let Some(x) = foo { use_x(x); }\n";
        let diff =
            "-// FIXME: panics on None\n-foo.unwrap();\n+if let Some(x) = foo { use_x(x); }\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let obs = classify(&inp).expect("some");
        assert_eq!(obs.obs_type, "bugfix");
    }

    /// A diff whose two sides differ only in whitespace is typed `refactor`.
    #[test]
    fn classify_edit_whitespace_only_returns_refactor() {
        let old = "let x=1;let y=2;";
        let new = "let x = 1;\nlet y = 2;";
        let diff = "-let x=1;let y=2;\n+let x = 1;\n+let y = 2;\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let obs = classify(&inp).expect("some");
        assert_eq!(obs.obs_type, "refactor");
    }

    /// `DEBUG` contains `BUG` as a substring but not as a whole word, so removing it
    /// must not look like a bug-marker removal.
    #[test]
    fn removing_debug_line_does_not_count_as_bug_marker_removal() {
        let old = "// DEBUG: tracing\nlog::trace!(\"x={x}\");\n";
        let new = "// (debug line removed)\n";
        let diff = "-// DEBUG: tracing\n-log::trace!(\"x={x}\");\n+// (debug line removed)\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let obs = classify(&inp).expect("some");
        assert_ne!(
            obs.obs_type, "bugfix",
            "DEBUG → empty must not be classified as a bugfix"
        );
    }

    /// An ordinary content edit matches none of the special rules and is typed `change`.
    #[test]
    fn classify_edit_default_returns_change() {
        let old = "let x = 1;";
        let new = "let x = compute_answer();";
        let diff = "-let x = 1;\n+let x = compute_answer();\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let obs = classify(&inp).expect("some");
        assert_eq!(obs.obs_type, "change");
    }

    /// Paths containing `.env` are rejected before any observation is built.
    #[test]
    fn privacy_guard_blocks_env_files() {
        let inp = input(
            "Write",
            "src/app/.env.local",
            Some("+SECRET=abc\n"),
            Some("SECRET=abc\n"),
            None,
        );
        assert!(classify(&inp).is_none());
    }

    /// Counterpart to the deny test: a regular source path is still classified.
    #[test]
    fn privacy_guard_allows_normal_source_files() {
        let inp = input(
            "Write",
            "src/foo.rs",
            Some("+fn main() {}\n"),
            Some("fn main() {}\n"),
            None,
        );
        assert!(classify(&inp).is_some());
    }

    /// Every entry of the deny list is exercised with a representative path.
    #[test]
    fn privacy_guard_covers_pem_key_credentials() {
        for path in &[
            "config/.env",
            "app.secrets.json",
            "infra/prod.secrets.yaml",
            "keys/server.key",
            "certs/app.pem",
            "home/user/.ssh/id_rsa",
            "credentials.json",
        ] {
            assert!(is_privacy_denied(path), "expected deny for `{path}`");
        }
    }

    /// Content inside `<private>` tags must not reach either the narrative or the diff
    /// excerpt; both are expected to carry the redaction placeholder instead.
    #[test]
    fn private_tagged_regions_are_redacted_from_observation_payload() {
        let diff = "-safe\n+safe <private>token=abc</private>\n+done\n";
        let inp = input(
            "Edit",
            "src/foo.rs",
            Some(diff),
            Some("safe done\n"),
            Some("safe\n"),
        );

        let obs = classify(&inp).expect("some");

        assert!(
            obs.narrative
                .as_deref()
                .unwrap()
                .contains("[redacted private content]")
        );
        assert!(
            obs.diff_excerpt
                .as_deref()
                .unwrap()
                .contains("[redacted private content]")
        );
        assert!(!obs.narrative.as_deref().unwrap().contains("token=abc"));
        assert!(!obs.diff_excerpt.as_deref().unwrap().contains("token=abc"));
    }

    /// The hash is reproducible for identical input, 16 hex characters long, and
    /// changes when only the file path changes.
    #[test]
    fn content_hash_is_stable_and_file_sensitive() {
        let old = "let x = 1;";
        let new = "let x = compute_answer();";
        let diff = "-let x = 1;\n+let x = compute_answer();\n";
        let inp = input("Edit", "src/foo.rs", Some(diff), Some(new), Some(old));
        let a = classify(&inp).expect("some");
        let b = classify(&inp).expect("some");
        assert_eq!(a.content_hash, b.content_hash);
        assert_eq!(a.content_hash.len(), 16);

        let other = classify(&input("Edit", "b.rs", Some(diff), Some(new), Some(old))).unwrap();
        assert_ne!(a.content_hash, other.content_hash);
    }

    /// Tools other than `Edit` / `MultiEdit` / `Write` never produce an observation.
    #[test]
    fn non_edit_tool_returns_none() {
        let inp = input("Read", "src/foo.rs", None, None, None);
        assert!(classify(&inp).is_none());
    }

    /// Old text alone is not enough: without a diff or new text nothing is emitted.
    #[test]
    fn missing_diff_and_new_text_returns_none() {
        let inp = input("Edit", "src/foo.rs", None, None, Some("old"));
        assert!(classify(&inp).is_none());
    }

    /// The scope carries the full path as anchor key, the directory as parent path
    /// and the file name as display name.
    #[test]
    fn classify_emits_structured_scope_metadata() {
        let old = "let x = 1;";
        let new = "let x = compute_answer();";
        let diff = "-let x = 1;\n+let x = compute_answer();\n";
        let obs = classify(&input(
            "Edit",
            "src/auth/login/handler.rs",
            Some(diff),
            Some(new),
            Some(old),
        ))
        .expect("some");

        assert_eq!(
            obs.scope,
            Some(ObservationScope {
                anchor_kind: "file".to_owned(),
                anchor_key: "src/auth/login/handler.rs".to_owned(),
                parent_path: Some("src/auth/login".to_owned()),
                display_name: Some("handler.rs".to_owned()),
            })
        );
    }

    /// A JSON payload that includes the `scope` object deserializes into
    /// [`Observation`] with the scope preserved.
    #[test]
    fn wire_shape_accepts_optional_scope_metadata() {
        let payload = serde_json::json!({
            "session_id": "sess_new",
            "ts_ms": 2,
            "obs_type": "bugfix",
            "tool": "Edit",
            "file_path": "src/auth/login/handler.rs",
            "scope": {
                "anchor_kind": "file",
                "anchor_key": "src/auth/login/handler.rs",
                "parent_path": "src/auth/login",
                "display_name": "handler.rs"
            },
            "title": "Edit src/auth/login/handler.rs: remove bug marker",
            "narrative": "guard login retry state",
            "diff_excerpt": "-old\n+new",
            "content_hash": "def456"
        });

        let obs: Observation = serde_json::from_value(payload).expect("deserialize");
        assert_eq!(
            obs.scope.as_ref().map(|scope| scope.anchor_key.as_str()),
            Some("src/auth/login/handler.rs")
        );
    }

    /// Not an assertion-based test: prints one serialized sample per observation type.
    /// Ignored by default; run it manually when sample wire output is needed.
    #[test]
    #[ignore = "doc helper for sample wire output, run manually"]
    fn print_wire_samples() {
        let samples = [
            (
                "feature",
                input(
                    "Write",
                    "src/new_mod.rs",
                    Some("+fn hello() {}\n+pub fn world() {}\n"),
                    Some("fn hello() {}\npub fn world() {}\n"),
                    None,
                ),
            ),
            (
                "bugfix",
                input(
                    "Edit",
                    "src/foo.rs",
                    Some(
                        "-// FIXME: crash on None\n-foo.unwrap();\n+if let Some(x) = foo { use_x(x); }\n",
                    ),
                    Some("if let Some(x) = foo { use_x(x); }\n"),
                    Some("// FIXME: crash on None\nfoo.unwrap();\n"),
                ),
            ),
            (
                "refactor",
                input(
                    "Edit",
                    "src/foo.rs",
                    Some("-let x=1;let y=2;\n+let x = 1;\n+let y = 2;\n"),
                    Some("let x = 1;\nlet y = 2;"),
                    Some("let x=1;let y=2;"),
                ),
            ),
            (
                "change",
                input(
                    "Edit",
                    "src/foo.rs",
                    Some("-let x = 1;\n+let x = compute_answer();\n"),
                    Some("let x = compute_answer();"),
                    Some("let x = 1;"),
                ),
            ),
        ];
        for (label, inp) in samples {
            let obs = classify(&inp).expect("some");
            let json = serde_json::to_string_pretty(&obs).unwrap();
            println!("=== {label} ===\n{json}\n");
        }
    }

    /// A diff far above the byte budget is cut down and ends with the truncation
    /// marker; the slack of 32 bytes leaves room for that marker.
    #[test]
    fn diff_excerpt_truncates_large_diffs() {
        let big: String = (0..4096).map(|_| 'x').collect();
        let diff = format!("-{big}\n+{big}Y\n");
        let inp = input("Edit", "src/foo.rs", Some(&diff), Some("yYY"), Some("xxx"));
        let obs = classify(&inp).expect("some");
        let excerpt = obs.diff_excerpt.expect("excerpt present");
        assert!(
            excerpt.len() <= DIFF_EXCERPT_MAX_BYTES + 32,
            "excerpt too long: {}",
            excerpt.len()
        );
        assert!(excerpt.ends_with("[truncated]"));
    }
}