1use crate::telemetry::Shape;
12use serde_json::Value;
13
/// Result of classifying a response body with [`classify`].
///
/// The optional fields are only populated for the shapes they apply to; every
/// other shape leaves them as `None`.
#[derive(Debug, Clone)]
pub struct ClassifiedResponse {
    /// Overall structural shape detected for the content.
    pub shape: Shape,
    /// Length of the original content as reported by `str::len`, i.e. UTF-8
    /// bytes rather than Unicode scalar values.
    pub raw_chars: usize,
    /// Formats detected inside the text, or inside the string values of a
    /// JSON document.
    pub inner_formats: Vec<InnerFormat>,
    /// Column count of the detected Markdown table header.
    pub md_n_cols: Option<usize>,
    /// Number of table rows counted after the Markdown separator line.
    pub md_n_rows: Option<usize>,
    /// Element count when the content parsed as a JSON array.
    pub n_items: Option<usize>,
    /// Key-set similarity for a JSON array of objects (1.0 means every sampled
    /// object shares the first object's keys).
    pub key_stability: Option<f32>,
    /// Field count when the content parsed as a JSON object.
    pub n_fields: Option<usize>,
    /// Maximum nesting depth; set only for JSON objects that contain a nested
    /// object or array.
    pub depth_max: Option<usize>,
}
32
/// Secondary formats that can be found inside a response body or inside the
/// string values of a JSON document.
///
/// Not every variant is produced by the detectors visible in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InnerFormat {
    /// An `http://` or `https://` URL.
    Url,
    /// Timestamped, log-like text.
    Log,
    /// A run of hexadecimal digits such as a commit hash.
    Hash,
    /// A diff (hunk header or `diff --git` line).
    Diff,
    /// Markdown text.
    Markdown,
    /// A Markdown table.
    MarkdownTable,
    /// Markdown that also contains code.
    MarkdownWithCode,
    /// A fenced code block.
    CodeFence,
    /// XML or HTML markup.
    XmlHtml,
    /// YAML text.
    Yaml,
    /// A stack trace.
    StackTrace,
    /// A numbered list.
    NumberedList,
    /// JSON embedded inline in other text.
    InlineJson,
    /// Plain prose.
    Prose,
}
51
52impl InnerFormat {
53 pub fn as_tag(&self) -> &'static str {
54 match self {
55 Self::Url => "url",
56 Self::Log => "log",
57 Self::Hash => "hash",
58 Self::Diff => "diff",
59 Self::Markdown => "md",
60 Self::MarkdownTable => "md_table",
61 Self::MarkdownWithCode => "md_with_code",
62 Self::CodeFence => "code_fence",
63 Self::XmlHtml => "xml_html",
64 Self::Yaml => "yaml",
65 Self::StackTrace => "stack_trace",
66 Self::NumberedList => "numbered_list",
67 Self::InlineJson => "inline_json",
68 Self::Prose => "prose",
69 }
70 }
71}
72
/// Looks for the first Markdown table in `text`.
///
/// A table is a header line that starts and ends with `|` (surrounding
/// whitespace ignored), immediately followed by a separator line that starts
/// with `|` and consists only of `|`, `-`, `:` and whitespace.
///
/// Returns `Some((n_cols, n_rows))` for the first table found:
/// * `n_cols` — number of non-empty `|`-delimited cells in the header, after
///   trimming surrounding whitespace and the outer pipes.
/// * `n_rows` — number of lines after the separator that start with `|`,
///   counted until the first blank line. Non-blank lines that do not start
///   with `|` are skipped without ending the table.
///
/// Returns `None` when no header/separator pair exists.
fn has_md_table(text: &str) -> Option<(usize, usize)> {
    let lines: Vec<&str> = text.lines().collect();
    for (idx, raw) in lines.iter().enumerate() {
        // Trim once so indentation or trailing spaces cannot leak into the
        // cell split below and inflate the column count.
        let header = raw.trim();
        if !header.starts_with('|') || !header.ends_with('|') {
            continue;
        }
        // A candidate header on the last line has no separator, and no later
        // line exists that could start a table, so bail out entirely.
        let separator = lines.get(idx + 1)?.trim();
        let is_separator = separator.starts_with('|')
            && separator
                .chars()
                .all(|c| matches!(c, '|' | '-' | ':') || c.is_whitespace());
        if !is_separator {
            continue;
        }
        let n_cols = header
            .trim_matches('|')
            .split('|')
            .filter(|cell| !cell.is_empty())
            .count();
        // `idx + 1` is in bounds (checked above), so `idx + 2..` is a valid,
        // possibly empty, range.
        let n_rows = lines[idx + 2..]
            .iter()
            .take_while(|line| line.trim_start().starts_with('|') || !line.trim().is_empty())
            .filter(|line| line.trim_start().starts_with('|'))
            .count();
        return Some((n_cols, n_rows));
    }
    None
}
113
/// Reports whether `text` has at least two lines that begin with a
/// triple-backtick fence marker (leading whitespace ignored).
fn has_code_fence(text: &str) -> bool {
    let mut fence_lines = text
        .lines()
        .map(str::trim_start)
        .filter(|line| line.starts_with("```"));
    // Finding a second marker is sufficient; the rest need not be visited.
    fence_lines.nth(1).is_some()
}
120
/// Reports whether at least three lines of `text` look like `<digits>→…`.
///
/// A line qualifies when its leading run of spaces and ASCII digits contains
/// at least one digit and is immediately followed by `→`. Only the space
/// character is skipped; a tab or any other character ends the run.
fn has_numbered_list(text: &str) -> bool {
    let is_numbered = |line: &str| {
        // Byte offset of the first character that is neither a space nor a digit.
        let Some(end) = line.find(|c: char| c != ' ' && !c.is_ascii_digit()) else {
            // The whole line is spaces/digits (or empty): no arrow follows.
            return false;
        };
        let (gutter, rest) = line.split_at(end);
        rest.starts_with('→') && gutter.bytes().any(|b| b.is_ascii_digit())
    };
    text.lines().filter(|line| is_numbered(line)).count() >= 3
}
142
/// Reports whether at least three lines of `text` are bullet items, i.e.
/// start (after leading whitespace) with `- ` or `* `.
fn has_bullet_list(text: &str) -> bool {
    let is_bullet = |line: &str| {
        let mut chars = line.trim_start().chars();
        // Requiring a space right after the marker already rules out `--`.
        matches!((chars.next(), chars.next()), (Some('-' | '*'), Some(' ')))
    };
    text.lines().filter(|line| is_bullet(line)).count() >= 3
}
152
/// Counts occurrences of the URL scheme prefixes `http://` and `https://`
/// in `text`.
///
/// Only the scheme prefix is matched; the rest of the URL is not validated.
/// The two prefixes can never overlap each other (both contain `h` only at
/// their first byte), so summing the non-overlapping match counts is exact,
/// including when one prefix immediately follows another.
fn count_urls(text: &str) -> usize {
    text.matches("http://").count() + text.matches("https://").count()
}
169
/// Counts timestamp-like sequences in `text`.
///
/// A match needs a full 19-byte window (the length of `YYYY-MM-DD HH:MM:SS`),
/// but only the leading `YYYY-MM-DD[ T]HH:` part is validated; the remaining
/// five bytes of the window are not inspected. After a match the scan resumes
/// past the whole window, otherwise it advances one byte.
fn count_timestamps(text: &str) -> usize {
    // Template for the validated prefix: `d` = ASCII digit, `s` = space or
    // `T`, any other byte must match literally.
    const PREFIX: &[u8; 14] = b"dddd-dd-ddsdd:";
    const WINDOW: usize = 19;

    let bytes = text.as_bytes();
    let matches_at = |start: usize| {
        PREFIX
            .iter()
            .zip(&bytes[start..])
            .all(|(&pattern, &byte)| match pattern {
                b'd' => byte.is_ascii_digit(),
                b's' => byte == b' ' || byte == b'T',
                literal => byte == literal,
            })
    };

    let mut found = 0;
    let mut pos = 0;
    // The loop bound guarantees at least WINDOW bytes from `pos`, so the zip
    // above always compares all 14 template bytes.
    while pos + WINDOW <= bytes.len() {
        if matches_at(pos) {
            found += 1;
            pos += WINDOW;
        } else {
            pos += 1;
        }
    }
    found
}
199
/// Counts maximal runs of ASCII hex digits in `text` whose length is between
/// 7 and 40 characters inclusive.
///
/// Any such run counts, including plain decimal numbers of that length.
fn count_hashes(text: &str) -> usize {
    // Splitting on every non-hex character leaves exactly the maximal hex
    // runs (plus empty pieces, which the length filter discards). Runs are
    // pure ASCII, so byte length equals character count.
    text.split(|c: char| !c.is_ascii_hexdigit())
        .filter(|run| (7..=40).contains(&run.len()))
        .count()
}
219
/// Reports whether `text` looks like a diff: it either contains the literal
/// `diff --git`, or has a line that (after leading whitespace) starts with
/// `@@ ` and also contains ` @@`.
fn has_diff(text: &str) -> bool {
    if text.contains("diff --git") {
        return true;
    }
    text.lines()
        .map(str::trim_start)
        .any(|line| line.starts_with("@@ ") && line.contains(" @@"))
}
227
/// Reports whether `text` contains a stack-trace marker: the Python traceback
/// header, or a line beginning with ` at ` that follows a newline.
fn has_stack_trace(text: &str) -> bool {
    const MARKERS: [&str; 2] = ["Traceback (most recent call last):", "\n at "];
    MARKERS.iter().any(|&marker| text.contains(marker))
}
231
232fn classify_json(val: &Value) -> (Shape, JsonDetails) {
235 let mut details = JsonDetails::default();
236 match val {
237 Value::Array(items) => {
238 details.n_items = Some(items.len());
239 if items.is_empty() {
240 return (Shape::Empty, details);
241 }
242 let all_objects = items.iter().all(|v| v.is_object());
244 if all_objects {
245 details.key_stability = Some(compute_key_stability(items));
246 details.has_nested_values = items.iter().take(20).any(|v| {
247 if let Value::Object(m) = v {
248 m.values().any(|vv| vv.is_object() || vv.is_array())
249 } else {
250 false
251 }
252 });
253 (Shape::ArrayOfObjects, details)
254 } else if items
255 .iter()
256 .all(|v| v.is_string() || v.is_number() || v.is_boolean() || v.is_null())
257 {
258 (Shape::ArrayOfPrimitives, details)
259 } else {
260 (Shape::NestedObject, details) }
262 }
263 Value::Object(m) => {
264 details.n_fields = Some(m.len());
265 if m.is_empty() {
266 return (Shape::Empty, details);
267 }
268 let any_nested = m.values().any(|v| v.is_object() || v.is_array());
269 if any_nested {
270 details.depth_max = Some(json_depth(val));
271 (Shape::NestedObject, details)
272 } else {
273 (Shape::FlatObject, details)
274 }
275 }
276 _ => (Shape::Unknown, details),
277 }
278}
279
280fn json_depth(val: &Value) -> usize {
281 match val {
282 Value::Object(m) => 1 + m.values().map(json_depth).max().unwrap_or(0),
283 Value::Array(a) => 1 + a.iter().map(json_depth).max().unwrap_or(0),
284 _ => 0,
285 }
286}
287
288fn compute_key_stability(items: &[Value]) -> f32 {
289 use std::collections::HashSet;
290 let sets: Vec<HashSet<String>> = items
291 .iter()
292 .take(20)
293 .filter_map(|v| v.as_object().map(|o| o.keys().cloned().collect()))
294 .collect();
295 if sets.len() < 2 {
296 return 1.0;
297 }
298 let first = &sets[0];
299 let mut jac = Vec::with_capacity(sets.len() - 1);
300 for s in &sets[1..] {
301 let union: HashSet<_> = first.union(s).cloned().collect();
302 let inter: HashSet<_> = first.intersection(s).cloned().collect();
303 if union.is_empty() {
304 jac.push(1.0);
305 } else {
306 jac.push(inter.len() as f32 / union.len() as f32);
307 }
308 }
309
310 jac.iter().sum::<f32>() / jac.len() as f32
311}
312
/// Shape-specific details gathered by `classify_json`; every field defaults
/// to "not applicable".
#[derive(Default)]
struct JsonDetails {
    /// Element count of a JSON array.
    n_items: Option<usize>,
    /// Field count of a JSON object.
    n_fields: Option<usize>,
    /// Maximum nesting depth of a JSON object that contains nested values.
    depth_max: Option<usize>,
    /// Key-set similarity score for an array of objects.
    key_stability: Option<f32>,
    /// Whether any of the first 20 objects in an array of objects holds an
    /// object or array value.
    /// NOTE(review): written by `classify_json` but not read by any code
    /// visible in this file — confirm whether it is still needed.
    has_nested_values: bool,
}
321
322pub fn classify(content: &str) -> ClassifiedResponse {
327 let raw_chars = content.len();
328
329 let trimmed = content.trim_start();
331 if (trimmed.starts_with('{') || trimmed.starts_with('['))
332 && let Ok(val) = serde_json::from_str::<Value>(trimmed)
333 {
334 let (shape, details) = classify_json(&val);
335 let inner = scan_inner_formats_in_json(&val);
336 return ClassifiedResponse {
337 shape,
338 raw_chars,
339 inner_formats: inner,
340 md_n_cols: None,
341 md_n_rows: None,
342 n_items: details.n_items,
343 key_stability: details.key_stability,
344 n_fields: details.n_fields,
345 depth_max: details.depth_max,
346 };
347 }
348
349 if let Some((cols, rows)) = has_md_table(content) {
351 return ClassifiedResponse {
352 shape: Shape::MarkdownTable,
353 raw_chars,
354 inner_formats: text_inner_formats(content),
355 md_n_cols: Some(cols),
356 md_n_rows: Some(rows),
357 n_items: None,
358 key_stability: None,
359 n_fields: None,
360 depth_max: None,
361 };
362 }
363 if has_code_fence(content) {
364 return ClassifiedResponse {
365 shape: Shape::CodeBlock,
366 raw_chars,
367 inner_formats: text_inner_formats(content),
368 md_n_cols: None,
369 md_n_rows: None,
370 n_items: None,
371 key_stability: None,
372 n_fields: None,
373 depth_max: None,
374 };
375 }
376 if has_numbered_list(content) {
377 return ClassifiedResponse {
378 shape: Shape::NumberedList,
379 raw_chars,
380 inner_formats: vec![],
381 md_n_cols: None,
382 md_n_rows: None,
383 n_items: None,
384 key_stability: None,
385 n_fields: None,
386 depth_max: None,
387 };
388 }
389 if has_bullet_list(content) {
390 return ClassifiedResponse {
391 shape: Shape::BulletList,
392 raw_chars,
393 inner_formats: text_inner_formats(content),
394 md_n_cols: None,
395 md_n_rows: None,
396 n_items: None,
397 key_stability: None,
398 n_fields: None,
399 depth_max: None,
400 };
401 }
402 ClassifiedResponse {
403 shape: Shape::Prose,
404 raw_chars,
405 inner_formats: text_inner_formats(content),
406 md_n_cols: None,
407 md_n_rows: None,
408 n_items: None,
409 key_stability: None,
410 n_fields: None,
411 depth_max: None,
412 }
413}
414
415fn text_inner_formats(text: &str) -> Vec<InnerFormat> {
416 let mut out = Vec::new();
417 if count_urls(text) > 0 {
418 out.push(InnerFormat::Url);
419 }
420 if count_timestamps(text) > 0 {
421 out.push(InnerFormat::Log);
422 }
423 if count_hashes(text) > 0 {
424 out.push(InnerFormat::Hash);
425 }
426 if has_diff(text) {
427 out.push(InnerFormat::Diff);
428 }
429 if has_stack_trace(text) {
430 out.push(InnerFormat::StackTrace);
431 }
432 out
433}
434
435fn scan_inner_formats_in_json(val: &Value) -> Vec<InnerFormat> {
436 use std::collections::HashSet;
437 let mut seen: HashSet<&'static str> = HashSet::new();
438 walk_json_strings(val, &mut seen, 0);
439 let mut out = Vec::new();
440 for tag in [
441 "url",
442 "log",
443 "hash",
444 "diff",
445 "md",
446 "md_table",
447 "xml_html",
448 "yaml",
449 "stack_trace",
450 "numbered_list",
451 "prose",
452 ] {
453 if seen.contains(tag) {
454 out.push(match tag {
455 "url" => InnerFormat::Url,
456 "log" => InnerFormat::Log,
457 "hash" => InnerFormat::Hash,
458 "diff" => InnerFormat::Diff,
459 "md" => InnerFormat::Markdown,
460 "md_table" => InnerFormat::MarkdownTable,
461 "xml_html" => InnerFormat::XmlHtml,
462 "yaml" => InnerFormat::Yaml,
463 "stack_trace" => InnerFormat::StackTrace,
464 "numbered_list" => InnerFormat::NumberedList,
465 "prose" => InnerFormat::Prose,
466 _ => continue,
467 });
468 }
469 }
470 out
471}
472
473fn walk_json_strings(
474 val: &Value,
475 seen: &mut std::collections::HashSet<&'static str>,
476 depth: usize,
477) {
478 if depth > 5 {
479 return;
480 }
481 match val {
482 Value::String(s) => {
483 if s.len() < 8 {
484 return;
485 }
486 if count_urls(s) > 0 {
487 seen.insert("url");
488 }
489 if count_timestamps(s) > 0 {
490 seen.insert("log");
491 }
492 if count_hashes(s) > 0 {
493 seen.insert("hash");
494 }
495 if has_diff(s) {
496 seen.insert("diff");
497 }
498 if has_md_table(s).is_some() {
499 seen.insert("md_table");
500 }
501 if has_stack_trace(s) {
502 seen.insert("stack_trace");
503 }
504 }
505 Value::Array(items) => {
506 for v in items.iter().take(100) {
507 walk_json_strings(v, seen, depth + 1);
508 }
509 }
510 Value::Object(m) => {
511 for v in m.values().take(200) {
512 walk_json_strings(v, seen, depth + 1);
513 }
514 }
515 _ => {}
516 }
517}
518
// Unit tests for the classifier: JSON shapes first, then text shapes, then
// inner-format detection in both plain text and JSON string values.
#[cfg(test)]
mod tests {
    use super::*;

    // Two objects with identical keys: full key stability is expected.
    #[test]
    fn classifies_json_array_of_objects() {
        let text = r#"[{"id":1,"name":"a"},{"id":2,"name":"b"}]"#;
        let c = classify(text);
        assert_eq!(c.shape, Shape::ArrayOfObjects);
        assert_eq!(c.n_items, Some(2));
        assert!(c.key_stability.unwrap() > 0.99);
    }

    // An object whose values are all scalars is flat.
    #[test]
    fn classifies_flat_object() {
        let text = r#"{"a":1,"b":"text","c":true}"#;
        let c = classify(text);
        assert_eq!(c.shape, Shape::FlatObject);
        assert_eq!(c.n_fields, Some(3));
    }

    // Three nested objects give a depth of at least 3.
    #[test]
    fn classifies_nested_object() {
        let text = r#"{"a":{"b":{"c":1}}}"#;
        let c = classify(text);
        assert_eq!(c.shape, Shape::NestedObject);
        assert!(c.depth_max.unwrap() >= 3);
    }

    // Header + separator + two body rows.
    #[test]
    fn classifies_markdown_table() {
        let text = "| id | name |\n|----|------|\n| 1 | Alice |\n| 2 | Bob |\n";
        let c = classify(text);
        assert_eq!(c.shape, Shape::MarkdownTable);
        assert_eq!(c.md_n_cols, Some(2));
        assert_eq!(c.md_n_rows, Some(2));
    }

    // An opening and a closing fence are enough for CodeBlock.
    #[test]
    fn classifies_code_block() {
        let text = "Some docs.\n```python\ndef foo():\n return 1\n```\n";
        let c = classify(text);
        assert_eq!(c.shape, Shape::CodeBlock);
    }

    // Lines of the form `<number>→<content>`.
    #[test]
    fn classifies_numbered_list_file_read() {
        let text = " 1→use chrono::DateTime;\n 2→use serde::Deserialize;\n 3→pub struct X;\n";
        let c = classify(text);
        assert_eq!(c.shape, Shape::NumberedList);
    }

    // Prose stays Prose; the URL is reported as an inner format.
    #[test]
    fn classifies_prose_with_url() {
        let text = "Here is a URL: https://example.com/foo and some text.";
        let c = classify(text);
        assert_eq!(c.shape, Shape::Prose);
        assert!(c.inner_formats.contains(&InnerFormat::Url));
    }

    // Inner formats are detected inside JSON string values.
    #[test]
    fn detects_log_and_hash_in_json_strings() {
        let text = r#"{"commit":"abc1234def","time":"2026-04-24 18:30:00","url":"https://x.y/z"}"#;
        let c = classify(text);
        assert_eq!(c.shape, Shape::FlatObject);
        assert!(c.inner_formats.contains(&InnerFormat::Log));
        assert!(c.inner_formats.contains(&InnerFormat::Hash));
        assert!(c.inner_formats.contains(&InnerFormat::Url));
    }

    // A unified-diff hunk header is sufficient for Diff.
    #[test]
    fn detects_diff() {
        let text = "--- a/foo\n+++ b/foo\n@@ -1,3 +1,3 @@\n-old\n+new\n line\n";
        let c = classify(text);
        assert!(c.inner_formats.contains(&InnerFormat::Diff));
    }

    #[test]
    fn empty_array_is_empty() {
        let c = classify("[]");
        assert_eq!(c.shape, Shape::Empty);
    }

    #[test]
    fn empty_object_is_empty() {
        let c = classify("{}");
        assert_eq!(c.shape, Shape::Empty);
    }

    #[test]
    fn classifies_array_of_primitives() {
        let c = classify("[1, 2, 3, 4, 5]");
        assert_eq!(c.shape, Shape::ArrayOfPrimitives);
        assert_eq!(c.n_items, Some(5));
    }

    // A mix of scalars and objects is neither all-objects nor all-primitives.
    #[test]
    fn classifies_heterogeneous_array_as_nested() {
        let c = classify(r#"[1, "two", {"three": 3}]"#);
        assert_eq!(c.shape, Shape::NestedObject);
    }

    // Three bullet lines are required; the leading "Items:" line is ignored.
    #[test]
    fn classifies_bullet_list() {
        let text = "Items:\n- one\n- two\n- three\n";
        let c = classify(text);
        assert_eq!(c.shape, Shape::BulletList);
    }

    #[test]
    fn classifies_plain_prose_fallback() {
        let c = classify("Just one sentence, no structure.");
        assert_eq!(c.shape, Shape::Prose);
    }

    // The Python traceback header is one of the stack-trace markers.
    #[test]
    fn detects_python_traceback_in_prose() {
        let text = "Traceback (most recent call last):\n File \"x.py\", line 1, in <module>\n raise ValueError(\"bad\")\nValueError: bad\n";
        let c = classify(text);
        assert!(c.inner_formats.contains(&InnerFormat::StackTrace));
    }

    // A line starting with " at " after a newline is the other marker.
    #[test]
    fn detects_js_style_stack_trace() {
        let text =
            "Error occurred\n at Object.<anonymous> (/foo.js:1:1)\n at Module._compile\n";
        let c = classify(text);
        assert!(c.inner_formats.contains(&InnerFormat::StackTrace));
    }

    // The `diff --git` header alone is also sufficient for Diff.
    #[test]
    fn detects_git_diff_header() {
        let text = "diff --git a/x b/x\n--- a/x\n+++ b/x\n@@ -1 +1 @@\n-a\n+b\n";
        let c = classify(text);
        assert!(c.inner_formats.contains(&InnerFormat::Diff));
    }

    // NOTE: despite the test name, the object holds only scalar values, so
    // the expected shape is FlatObject; the diff lives inside a string value.
    #[test]
    fn classifies_nested_object_with_diff_inside() {
        let text = r#"{"mr_id":42,"diffs":"@@ -1,3 +1,3 @@\n-old\n+new"}"#;
        let c = classify(text);
        assert_eq!(c.shape, Shape::FlatObject);
        assert!(c.inner_formats.contains(&InnerFormat::Diff));
    }

    // The JSON string decodes to real newlines, so the table detector sees
    // separate header and separator lines.
    #[test]
    fn detects_md_table_inside_json_string() {
        let text = r#"{"body":"| a | b |\n|---|---|\n| 1 | 2 |\n"}"#;
        let c = classify(text);
        assert!(c.inner_formats.contains(&InnerFormat::MarkdownTable));
    }

    // Every variant must map to a non-empty tag.
    #[test]
    fn inner_format_as_tag_covers_all_variants() {
        let variants = [
            InnerFormat::Url,
            InnerFormat::Log,
            InnerFormat::Hash,
            InnerFormat::Diff,
            InnerFormat::Markdown,
            InnerFormat::MarkdownTable,
            InnerFormat::MarkdownWithCode,
            InnerFormat::CodeFence,
            InnerFormat::XmlHtml,
            InnerFormat::Yaml,
            InnerFormat::StackTrace,
            InnerFormat::NumberedList,
            InnerFormat::InlineJson,
            InnerFormat::Prose,
        ];
        for v in &variants {
            assert!(!v.as_tag().is_empty(), "missing tag for {v:?}");
        }
    }

    // Disjoint key sets share nothing, so the stability score is 0.
    #[test]
    fn array_of_objects_key_stability_detects_drift() {
        let text = r#"[{"a":1,"b":2}, {"c":3,"d":4}]"#;
        let c = classify(text);
        assert_eq!(c.shape, Shape::ArrayOfObjects);
        assert!(
            c.key_stability.unwrap() < 0.1,
            "expected low stability, got {:?}",
            c.key_stability
        );
    }

    // Starts with `{` but does not parse, so a text shape must be returned.
    #[test]
    fn malformed_json_falls_through_to_text_classifier() {
        let text = "{ malformed, not json at all";
        let c = classify(text);
        assert!(matches!(
            c.shape,
            Shape::Prose | Shape::BulletList | Shape::CodeBlock | Shape::MarkdownTable
        ));
    }

    // NOTE: despite the test name, this exercises nested objects: the URL
    // string sits five levels below the root, the deepest level the string
    // walker still inspects.
    #[test]
    fn json_inside_string_opens_up_recursion() {
        let deep = r#"{"a":{"b":{"c":{"d":{"e":"https://nested.example/path/here"}}}}}"#;
        let c = classify(deep);
        assert_eq!(c.shape, Shape::NestedObject);
        assert!(c.inner_formats.contains(&InnerFormat::Url));
    }
}