Skip to main content

devboy_format_pipeline/
shape.rs

//! Structural shape classifier for tool responses.
//!
//! Ports the classifier from `docs/research/scripts/extract_paper2_format_events.py`
//! into Rust. Keep the two in sync so offline analyses and online decisions
//! share the same taxonomy.
//!
//! Shape is detected by a two-step rule:
//! 1. If the content (ignoring leading whitespace) starts with `{` or `[` and
//!    parses as JSON, classify by JSON structure.
//! 2. Otherwise, scan the text in priority order for a markdown table, a fenced
//!    code block, a numbered list, or a bullet list; anything else is prose.
10
11use crate::telemetry::Shape;
12use serde_json::Value;
13
/// Result of classifying one response.
///
/// `shape` and `raw_chars` are always filled in. Each `Option` field is only
/// populated for the shape named in its comment and is `None` otherwise.
#[derive(Debug, Clone)]
pub struct ClassifiedResponse {
    /// Structural shape assigned to the response.
    pub shape: Shape,
    /// Length of the raw response body as measured by `classify`.
    pub raw_chars: usize,
    /// Inner formats sniffed inside the body (or inside JSON string leaves).
    pub inner_formats: Vec<InnerFormat>,
    /// For markdown-table: number of columns detected in the header row.
    pub md_n_cols: Option<usize>,
    /// For markdown-table: number of data rows counted after the separator row.
    pub md_n_rows: Option<usize>,
    /// For array_of_objects: number of elements.
    pub n_items: Option<usize>,
    /// For array_of_objects: mean jaccard of key sets across items (0–1).
    pub key_stability: Option<f32>,
    /// For flat_object / nested_object: number of top-level keys.
    pub n_fields: Option<usize>,
    /// For nested_object: max depth of JSON nesting.
    pub depth_max: Option<usize>,
}
32
/// Categories of inner formats we sniff inside string leaves and prose.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InnerFormat {
    /// An `http://` or `https://` link.
    Url,
    /// Log-like content (detected via timestamps).
    Log,
    /// A hex hash such as a git commit id.
    Hash,
    /// Unified-diff content.
    Diff,
    /// Generic markdown.
    Markdown,
    /// A markdown table.
    MarkdownTable,
    /// Markdown that embeds code.
    MarkdownWithCode,
    /// A fenced code block.
    CodeFence,
    /// XML or HTML markup.
    XmlHtml,
    /// YAML content.
    Yaml,
    /// A stack trace.
    StackTrace,
    /// Line-numbered content.
    NumberedList,
    /// JSON embedded in text.
    InlineJson,
    /// Free-form prose.
    Prose,
}

impl InnerFormat {
    /// Returns the short, stable string tag for this format.
    pub fn as_tag(&self) -> &'static str {
        match *self {
            InnerFormat::Url => "url",
            InnerFormat::Log => "log",
            InnerFormat::Hash => "hash",
            InnerFormat::Diff => "diff",
            InnerFormat::Markdown => "md",
            InnerFormat::MarkdownTable => "md_table",
            InnerFormat::MarkdownWithCode => "md_with_code",
            InnerFormat::CodeFence => "code_fence",
            InnerFormat::XmlHtml => "xml_html",
            InnerFormat::Yaml => "yaml",
            InnerFormat::StackTrace => "stack_trace",
            InnerFormat::NumberedList => "numbered_list",
            InnerFormat::InlineJson => "inline_json",
            InnerFormat::Prose => "prose",
        }
    }
}
72
73// ─── TEXT PATTERN DETECTORS ──────────────────────────────────────────────
74
/// Finds the first markdown table in `text`.
///
/// A table is a header line that starts and ends with `|` (surrounding
/// whitespace ignored) immediately followed by a separator line such as
/// `|---|:--:|`, i.e. a line that starts with `|`, contains at least one `-`,
/// and otherwise consists only of `|`, `-`, `:` and whitespace.
///
/// Returns `Some((n_cols, n_rows))` for the first table found, where `n_cols`
/// is the number of non-empty header cells and `n_rows` is the number of
/// `|`-prefixed lines after the separator up to the first blank line. Lines in
/// that span that are neither blank nor `|`-prefixed are skipped, not counted.
/// Returns `None` when no table is present.
fn has_md_table(text: &str) -> Option<(usize, usize)> {
    // Minimum valid markdown table: `|a|b|` header + `|---|---|` separator.
    let lines: Vec<&str> = text.lines().collect();
    for (i, raw_header) in lines.iter().enumerate() {
        // Trim first so an indented table is measured the same as a flush-left one.
        let header = raw_header.trim();
        if !(header.starts_with('|') && header.ends_with('|')) {
            continue;
        }
        // A candidate header on the very last line cannot have a separator,
        // and no later line can start a table either.
        let separator = lines.get(i + 1)?.trim();
        let is_separator = separator.starts_with('|')
            && separator.contains('-')
            && separator
                .chars()
                .all(|c| c == '|' || c == '-' || c == ':' || c.is_whitespace());
        if !is_separator {
            continue;
        }
        // Count columns by pipe segments once the outer pipes are removed.
        let n_cols = header
            .trim_matches('|')
            .split('|')
            .filter(|cell| !cell.is_empty())
            .count();
        // Count data rows: `|`-prefixed lines until the first blank line.
        let n_rows = lines[i + 2..]
            .iter()
            .map(|row| row.trim())
            .take_while(|row| !row.is_empty())
            .filter(|row| row.starts_with('|'))
            .count();
        return Some((n_cols, n_rows));
    }
    None
}
113
/// Returns `true` when at least two lines begin (after leading whitespace)
/// with a triple-backtick fence marker.
fn has_code_fence(text: &str) -> bool {
    let mut fences = 0usize;
    for line in text.lines() {
        if line.trim_start().starts_with("```") {
            fences += 1;
            if fences == 2 {
                return true;
            }
        }
    }
    false
}
120
/// Returns `true` when at least three lines carry a line-number prefix such
/// as `   1→` or `  42→` (the Read tool's format).
///
/// A line qualifies when its first character that is neither a space nor an
/// ASCII digit is `→`, and at least one digit precedes that arrow.
fn has_numbered_list(text: &str) -> bool {
    let has_line_number_prefix = |line: &str| -> bool {
        match line.find(|c: char| c != ' ' && !c.is_ascii_digit()) {
            Some(end) => {
                let (prefix, rest) = line.split_at(end);
                rest.starts_with('→') && prefix.bytes().any(|b| b.is_ascii_digit())
            }
            // Only spaces/digits (or nothing at all): no arrow, no match.
            None => false,
        }
    };

    let mut numbered = 0usize;
    for line in text.lines() {
        if has_line_number_prefix(line) {
            numbered += 1;
        }
    }
    numbered >= 3
}
142
/// Returns `true` when at least three lines start (after leading whitespace)
/// with a `- ` or `* ` bullet marker.
fn has_bullet_list(text: &str) -> bool {
    let mut bullets = 0usize;
    for line in text.lines() {
        let body = line.trim_start();
        let has_marker = body.starts_with("- ") || body.starts_with("* ");
        if has_marker && !body.starts_with("--") {
            bullets += 1;
        }
    }
    bullets >= 3
}
152
/// Very rough URL detection — counts occurrences of `http://` and `https://`.
///
/// Only the scheme prefix is matched; nothing after it is validated. The two
/// prefixes cannot overlap each other, so summing the separate counts never
/// double-counts a URL.
fn count_urls(text: &str) -> usize {
    text.matches("http://").count() + text.matches("https://").count()
}
169
/// Counts timestamp-like spans such as `2026-04-24 18:29:00` or
/// `2026-04-24T18:29:00`.
///
/// The scan slides a 19-byte window over the text. A window matches when it
/// begins with `YYYY-MM-DD`, then a space or `T`, then `HH:`; the remaining
/// five bytes of the window are not inspected. After a match the scan resumes
/// past the whole window, so matches never overlap.
fn count_timestamps(text: &str) -> usize {
    const WINDOW: usize = 19;
    const DIGIT_OFFSETS: [usize; 10] = [0, 1, 2, 3, 5, 6, 8, 9, 11, 12];

    let starts_timestamp = |w: &[u8]| -> bool {
        DIGIT_OFFSETS.iter().all(|&k| w[k].is_ascii_digit())
            && w[4] == b'-'
            && w[7] == b'-'
            && (w[10] == b' ' || w[10] == b'T')
            && w[13] == b':'
    };

    let bytes = text.as_bytes();
    let mut found = 0;
    let mut pos = 0;
    // `get` yields `None` as soon as fewer than WINDOW bytes remain.
    while let Some(window) = bytes.get(pos..pos + WINDOW) {
        if starts_timestamp(window) {
            found += 1;
            pos += WINDOW;
        } else {
            pos += 1;
        }
    }
    found
}
199
/// Git-style hex hashes (7–40 hex chars, word-bounded).
///
/// The text is split into words (maximal runs of alphanumerics and `_`); a
/// word counts as a hash when it is 7 to 40 characters long and every
/// character is an ASCII hex digit. Requiring the whole word to be hex keeps
/// ordinary words that merely start with hex letters (e.g. `feedback`) from
/// being reported as hashes.
fn count_hashes(text: &str) -> usize {
    text.split(|c: char| !(c.is_alphanumeric() || c == '_'))
        .filter(|word| {
            // All-ASCII is implied by the hex check, so byte length == char count.
            (7..=40).contains(&word.len()) && word.bytes().all(|b| b.is_ascii_hexdigit())
        })
        .count()
}
219
/// Detects unified-diff content: either a `diff --git` header anywhere in the
/// text, or a hunk-marker line such as `@@ -1,5 +1,6 @@`.
fn has_diff(text: &str) -> bool {
    if text.contains("diff --git") {
        return true;
    }
    text.lines()
        .map(str::trim_start)
        .any(|line| line.starts_with("@@ ") && line.contains(" @@"))
}
227
/// Detects stack traces: the Python traceback banner, or a line that starts
/// with four spaces followed by `at ` (as in JavaScript-style frames).
fn has_stack_trace(text: &str) -> bool {
    const MARKERS: [&str; 2] = ["Traceback (most recent call last):", "\n    at "];
    MARKERS.iter().any(|marker| text.contains(*marker))
}
231
232// ─── JSON CLASSIFIER ────────────────────────────────────────────────────
233
234fn classify_json(val: &Value) -> (Shape, JsonDetails) {
235    let mut details = JsonDetails::default();
236    match val {
237        Value::Array(items) => {
238            details.n_items = Some(items.len());
239            if items.is_empty() {
240                return (Shape::Empty, details);
241            }
242            // Determine majority variant.
243            let all_objects = items.iter().all(|v| v.is_object());
244            if all_objects {
245                details.key_stability = Some(compute_key_stability(items));
246                details.has_nested_values = items.iter().take(20).any(|v| {
247                    if let Value::Object(m) = v {
248                        m.values().any(|vv| vv.is_object() || vv.is_array())
249                    } else {
250                        false
251                    }
252                });
253                (Shape::ArrayOfObjects, details)
254            } else if items
255                .iter()
256                .all(|v| v.is_string() || v.is_number() || v.is_boolean() || v.is_null())
257            {
258                (Shape::ArrayOfPrimitives, details)
259            } else {
260                (Shape::NestedObject, details) // heterogeneous array → treat as nested
261            }
262        }
263        Value::Object(m) => {
264            details.n_fields = Some(m.len());
265            if m.is_empty() {
266                return (Shape::Empty, details);
267            }
268            let any_nested = m.values().any(|v| v.is_object() || v.is_array());
269            if any_nested {
270                details.depth_max = Some(json_depth(val));
271                (Shape::NestedObject, details)
272            } else {
273                (Shape::FlatObject, details)
274            }
275        }
276        _ => (Shape::Unknown, details),
277    }
278}
279
280fn json_depth(val: &Value) -> usize {
281    match val {
282        Value::Object(m) => 1 + m.values().map(json_depth).max().unwrap_or(0),
283        Value::Array(a) => 1 + a.iter().map(json_depth).max().unwrap_or(0),
284        _ => 0,
285    }
286}
287
288fn compute_key_stability(items: &[Value]) -> f32 {
289    use std::collections::HashSet;
290    let sets: Vec<HashSet<String>> = items
291        .iter()
292        .take(20)
293        .filter_map(|v| v.as_object().map(|o| o.keys().cloned().collect()))
294        .collect();
295    if sets.len() < 2 {
296        return 1.0;
297    }
298    let first = &sets[0];
299    let mut jac = Vec::with_capacity(sets.len() - 1);
300    for s in &sets[1..] {
301        let union: HashSet<_> = first.union(s).cloned().collect();
302        let inter: HashSet<_> = first.intersection(s).cloned().collect();
303        if union.is_empty() {
304            jac.push(1.0);
305        } else {
306            jac.push(inter.len() as f32 / union.len() as f32);
307        }
308    }
309
310    jac.iter().sum::<f32>() / jac.len() as f32
311}
312
/// Structure-specific measurements gathered by `classify_json`.
///
/// Every field defaults to `None` / `false` and is only set on the branch
/// that applies to the classified value.
#[derive(Default)]
struct JsonDetails {
    /// Element count; set whenever the top-level value is an array.
    n_items: Option<usize>,
    /// Top-level key count; set whenever the top-level value is an object.
    n_fields: Option<usize>,
    /// Nesting depth; set only for a top-level object that contains a nested
    /// object or array.
    depth_max: Option<usize>,
    /// Key-set similarity; set only for an array whose items are all objects.
    key_stability: Option<f32>,
    /// For an array of objects: whether any of the first 20 items holds an
    /// object or array value.
    has_nested_values: bool,
}
321
322// ─── PUBLIC ENTRY ───────────────────────────────────────────────────────
323
324/// Classify a response body. Never panics; always returns a Shape
325/// (Unknown if nothing matches).
326pub fn classify(content: &str) -> ClassifiedResponse {
327    let raw_chars = content.len();
328
329    // Try JSON first.
330    let trimmed = content.trim_start();
331    if (trimmed.starts_with('{') || trimmed.starts_with('['))
332        && let Ok(val) = serde_json::from_str::<Value>(trimmed)
333    {
334        let (shape, details) = classify_json(&val);
335        let inner = scan_inner_formats_in_json(&val);
336        return ClassifiedResponse {
337            shape,
338            raw_chars,
339            inner_formats: inner,
340            md_n_cols: None,
341            md_n_rows: None,
342            n_items: details.n_items,
343            key_stability: details.key_stability,
344            n_fields: details.n_fields,
345            depth_max: details.depth_max,
346        };
347    }
348
349    // Text-level classification.
350    if let Some((cols, rows)) = has_md_table(content) {
351        return ClassifiedResponse {
352            shape: Shape::MarkdownTable,
353            raw_chars,
354            inner_formats: text_inner_formats(content),
355            md_n_cols: Some(cols),
356            md_n_rows: Some(rows),
357            n_items: None,
358            key_stability: None,
359            n_fields: None,
360            depth_max: None,
361        };
362    }
363    if has_code_fence(content) {
364        return ClassifiedResponse {
365            shape: Shape::CodeBlock,
366            raw_chars,
367            inner_formats: text_inner_formats(content),
368            md_n_cols: None,
369            md_n_rows: None,
370            n_items: None,
371            key_stability: None,
372            n_fields: None,
373            depth_max: None,
374        };
375    }
376    if has_numbered_list(content) {
377        return ClassifiedResponse {
378            shape: Shape::NumberedList,
379            raw_chars,
380            inner_formats: vec![],
381            md_n_cols: None,
382            md_n_rows: None,
383            n_items: None,
384            key_stability: None,
385            n_fields: None,
386            depth_max: None,
387        };
388    }
389    if has_bullet_list(content) {
390        return ClassifiedResponse {
391            shape: Shape::BulletList,
392            raw_chars,
393            inner_formats: text_inner_formats(content),
394            md_n_cols: None,
395            md_n_rows: None,
396            n_items: None,
397            key_stability: None,
398            n_fields: None,
399            depth_max: None,
400        };
401    }
402    ClassifiedResponse {
403        shape: Shape::Prose,
404        raw_chars,
405        inner_formats: text_inner_formats(content),
406        md_n_cols: None,
407        md_n_rows: None,
408        n_items: None,
409        key_stability: None,
410        n_fields: None,
411        depth_max: None,
412    }
413}
414
415fn text_inner_formats(text: &str) -> Vec<InnerFormat> {
416    let mut out = Vec::new();
417    if count_urls(text) > 0 {
418        out.push(InnerFormat::Url);
419    }
420    if count_timestamps(text) > 0 {
421        out.push(InnerFormat::Log);
422    }
423    if count_hashes(text) > 0 {
424        out.push(InnerFormat::Hash);
425    }
426    if has_diff(text) {
427        out.push(InnerFormat::Diff);
428    }
429    if has_stack_trace(text) {
430        out.push(InnerFormat::StackTrace);
431    }
432    out
433}
434
435fn scan_inner_formats_in_json(val: &Value) -> Vec<InnerFormat> {
436    use std::collections::HashSet;
437    let mut seen: HashSet<&'static str> = HashSet::new();
438    walk_json_strings(val, &mut seen, 0);
439    let mut out = Vec::new();
440    for tag in [
441        "url",
442        "log",
443        "hash",
444        "diff",
445        "md",
446        "md_table",
447        "xml_html",
448        "yaml",
449        "stack_trace",
450        "numbered_list",
451        "prose",
452    ] {
453        if seen.contains(tag) {
454            out.push(match tag {
455                "url" => InnerFormat::Url,
456                "log" => InnerFormat::Log,
457                "hash" => InnerFormat::Hash,
458                "diff" => InnerFormat::Diff,
459                "md" => InnerFormat::Markdown,
460                "md_table" => InnerFormat::MarkdownTable,
461                "xml_html" => InnerFormat::XmlHtml,
462                "yaml" => InnerFormat::Yaml,
463                "stack_trace" => InnerFormat::StackTrace,
464                "numbered_list" => InnerFormat::NumberedList,
465                "prose" => InnerFormat::Prose,
466                _ => continue,
467            });
468        }
469    }
470    out
471}
472
473fn walk_json_strings(
474    val: &Value,
475    seen: &mut std::collections::HashSet<&'static str>,
476    depth: usize,
477) {
478    if depth > 5 {
479        return;
480    }
481    match val {
482        Value::String(s) => {
483            if s.len() < 8 {
484                return;
485            }
486            if count_urls(s) > 0 {
487                seen.insert("url");
488            }
489            if count_timestamps(s) > 0 {
490                seen.insert("log");
491            }
492            if count_hashes(s) > 0 {
493                seen.insert("hash");
494            }
495            if has_diff(s) {
496                seen.insert("diff");
497            }
498            if has_md_table(s).is_some() {
499                seen.insert("md_table");
500            }
501            if has_stack_trace(s) {
502                seen.insert("stack_trace");
503            }
504        }
505        Value::Array(items) => {
506            for v in items.iter().take(100) {
507                walk_json_strings(v, seen, depth + 1);
508            }
509        }
510        Value::Object(m) => {
511            for v in m.values().take(200) {
512                walk_json_strings(v, seen, depth + 1);
513            }
514        }
515        _ => {}
516    }
517}
518
519// ─── TESTS ─────────────────────────────────────────────────────────────
520
/// Unit tests for the shape classifier: one test per JSON shape, per text
/// shape, and per inner-format detector, plus edge cases.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn classifies_json_array_of_objects() {
        let text = r#"[{"id":1,"name":"a"},{"id":2,"name":"b"}]"#;
        let c = classify(text);
        assert_eq!(c.shape, Shape::ArrayOfObjects);
        assert_eq!(c.n_items, Some(2));
        assert!(c.key_stability.unwrap() > 0.99);
    }

    #[test]
    fn classifies_flat_object() {
        let text = r#"{"a":1,"b":"text","c":true}"#;
        let c = classify(text);
        assert_eq!(c.shape, Shape::FlatObject);
        assert_eq!(c.n_fields, Some(3));
    }

    #[test]
    fn classifies_nested_object() {
        let text = r#"{"a":{"b":{"c":1}}}"#;
        let c = classify(text);
        assert_eq!(c.shape, Shape::NestedObject);
        // Three objects are nested inside each other; the scalar leaf adds 0.
        assert_eq!(c.depth_max, Some(3));
    }

    #[test]
    fn classifies_markdown_table() {
        let text = "| id | name |\n|----|------|\n| 1 | Alice |\n| 2 | Bob |\n";
        let c = classify(text);
        assert_eq!(c.shape, Shape::MarkdownTable);
        assert_eq!(c.md_n_cols, Some(2));
        assert_eq!(c.md_n_rows, Some(2));
    }

    #[test]
    fn classifies_code_block() {
        let text = "Some docs.\n```python\ndef foo():\n    return 1\n```\n";
        let c = classify(text);
        assert_eq!(c.shape, Shape::CodeBlock);
    }

    #[test]
    fn classifies_numbered_list_file_read() {
        let text = "   1→use chrono::DateTime;\n   2→use serde::Deserialize;\n   3→pub struct X;\n";
        let c = classify(text);
        assert_eq!(c.shape, Shape::NumberedList);
    }

    #[test]
    fn classifies_prose_with_url() {
        let text = "Here is a URL: https://example.com/foo and some text.";
        let c = classify(text);
        assert_eq!(c.shape, Shape::Prose);
        assert!(c.inner_formats.contains(&InnerFormat::Url));
    }

    #[test]
    fn detects_log_and_hash_in_json_strings() {
        let text = r#"{"commit":"abc1234def","time":"2026-04-24 18:30:00","url":"https://x.y/z"}"#;
        let c = classify(text);
        assert_eq!(c.shape, Shape::FlatObject);
        assert!(c.inner_formats.contains(&InnerFormat::Log));
        assert!(c.inner_formats.contains(&InnerFormat::Hash));
        assert!(c.inner_formats.contains(&InnerFormat::Url));
    }

    #[test]
    fn detects_diff() {
        let text = "--- a/foo\n+++ b/foo\n@@ -1,3 +1,3 @@\n-old\n+new\n line\n";
        let c = classify(text);
        assert!(c.inner_formats.contains(&InnerFormat::Diff));
    }

    #[test]
    fn empty_array_is_empty() {
        let c = classify("[]");
        assert_eq!(c.shape, Shape::Empty);
    }

    #[test]
    fn empty_object_is_empty() {
        let c = classify("{}");
        assert_eq!(c.shape, Shape::Empty);
    }

    #[test]
    fn classifies_array_of_primitives() {
        let c = classify("[1, 2, 3, 4, 5]");
        assert_eq!(c.shape, Shape::ArrayOfPrimitives);
        assert_eq!(c.n_items, Some(5));
    }

    #[test]
    fn classifies_heterogeneous_array_as_nested() {
        // Mixed types → fallthrough to NestedObject treatment.
        let c = classify(r#"[1, "two", {"three": 3}]"#);
        assert_eq!(c.shape, Shape::NestedObject);
    }

    #[test]
    fn classifies_bullet_list() {
        let text = "Items:\n- one\n- two\n- three\n";
        let c = classify(text);
        assert_eq!(c.shape, Shape::BulletList);
    }

    #[test]
    fn classifies_plain_prose_fallback() {
        let c = classify("Just one sentence, no structure.");
        assert_eq!(c.shape, Shape::Prose);
    }

    #[test]
    fn detects_python_traceback_in_prose() {
        let text = "Traceback (most recent call last):\n  File \"x.py\", line 1, in <module>\n    raise ValueError(\"bad\")\nValueError: bad\n";
        let c = classify(text);
        assert!(c.inner_formats.contains(&InnerFormat::StackTrace));
    }

    #[test]
    fn detects_js_style_stack_trace() {
        let text =
            "Error occurred\n    at Object.<anonymous> (/foo.js:1:1)\n    at Module._compile\n";
        let c = classify(text);
        assert!(c.inner_formats.contains(&InnerFormat::StackTrace));
    }

    #[test]
    fn detects_git_diff_header() {
        let text = "diff --git a/x b/x\n--- a/x\n+++ b/x\n@@ -1 +1 @@\n-a\n+b\n";
        let c = classify(text);
        assert!(c.inner_formats.contains(&InnerFormat::Diff));
    }

    #[test]
    fn classifies_flat_object_with_diff_inside() {
        // The diff lives in a string value, so the object itself stays flat.
        let text = r#"{"mr_id":42,"diffs":"@@ -1,3 +1,3 @@\n-old\n+new"}"#;
        let c = classify(text);
        assert_eq!(c.shape, Shape::FlatObject);
        assert!(c.inner_formats.contains(&InnerFormat::Diff));
    }

    #[test]
    fn detects_md_table_inside_json_string() {
        let text = r#"{"body":"| a | b |\n|---|---|\n| 1 | 2 |\n"}"#;
        let c = classify(text);
        assert!(c.inner_formats.contains(&InnerFormat::MarkdownTable));
    }

    #[test]
    fn inner_format_as_tag_covers_all_variants() {
        // Guards against a new InnerFormat variant forgetting a tag, and
        // against two variants sharing one.
        use std::collections::HashSet;

        let variants = [
            InnerFormat::Url,
            InnerFormat::Log,
            InnerFormat::Hash,
            InnerFormat::Diff,
            InnerFormat::Markdown,
            InnerFormat::MarkdownTable,
            InnerFormat::MarkdownWithCode,
            InnerFormat::CodeFence,
            InnerFormat::XmlHtml,
            InnerFormat::Yaml,
            InnerFormat::StackTrace,
            InnerFormat::NumberedList,
            InnerFormat::InlineJson,
            InnerFormat::Prose,
        ];
        let mut tags = HashSet::new();
        for v in &variants {
            assert!(!v.as_tag().is_empty(), "missing tag for {v:?}");
            assert!(tags.insert(v.as_tag()), "duplicate tag for {v:?}");
        }
    }

    #[test]
    fn array_of_objects_key_stability_detects_drift() {
        // Two objects with zero key overlap → stability close to 0.
        let text = r#"[{"a":1,"b":2}, {"c":3,"d":4}]"#;
        let c = classify(text);
        assert_eq!(c.shape, Shape::ArrayOfObjects);
        assert!(
            c.key_stability.unwrap() < 0.1,
            "expected low stability, got {:?}",
            c.key_stability
        );
    }

    #[test]
    fn malformed_json_falls_through_to_text_classifier() {
        let text = "{ malformed, not json at all";
        let c = classify(text);
        // Parsing fails and no text pattern matches, so this is plain prose.
        assert_eq!(c.shape, Shape::Prose);
    }

    #[test]
    fn finds_url_in_deeply_nested_json() {
        // A URL five levels down sits exactly at the depth limit of
        // walk_json_strings and must still be found.
        let deep = r#"{"a":{"b":{"c":{"d":{"e":"https://nested.example/path/here"}}}}}"#;
        let c = classify(deep);
        assert_eq!(c.shape, Shape::NestedObject);
        assert!(c.inner_formats.contains(&InnerFormat::Url));
    }
}