Skip to main content

hematite/agent/
parser.rs

// ── src/agent/parser.rs: XML-ish Parser for Swarm Workloads ─────────────────────
//! Resilient parser that maps XML-like tags to structured data without regex overhead.
//! Designed to handle LLM output quirks (trailing commas, broken escapes) gracefully.
4
/// A single unit of work extracted from a `<worker_task ...>` block.
///
/// Derives `PartialEq`/`Eq` so parsed tasks can be compared directly
/// (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WorkerTask {
    /// Unique identifier for the worker task (e.g., "w-001", "worker-alpha")
    pub id: String,

    /// Target file path or directory where the work should be applied
    pub target: String,

    /// The instruction/payload describing what work to perform on the target
    pub instruction: String,
}
16
/// A patch region produced by a worker: a line range plus its replacement content.
///
/// Derives `PartialEq`/`Eq` so hunks can be compared directly
/// (for example with `assert_eq!`).
// Not every field is read by current callers, hence the lint allowance.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Hunk {
    /// Starting line number (1-indexed) of the patch region
    pub start_line: usize,

    /// Ending line number (1-indexed, inclusive) of the patch region
    pub end_line: usize,

    /// The actual content/patch to apply within the specified line range
    pub content: String,

    /// Identifier of the worker that generated this hunk (for attribution)
    pub worker_id: String,
}

impl Hunk {
    /// Returns a sort key for ordering hunks by position.
    ///
    /// Both components are wrapped in [`std::cmp::Reverse`], so an ascending sort
    /// on this key places hunks with higher line numbers first (useful for
    /// bottom-up processing). Hunks sharing a `start_line` are ordered by
    /// descending `end_line`.
    ///
    /// # Returns
    /// A tuple of `(Reverse(start_line), Reverse(end_line))`.
    // Kept available even while no caller sorts hunks yet.
    #[allow(dead_code)]
    pub fn sort_key(&self) -> (std::cmp::Reverse<usize>, std::cmp::Reverse<usize>) {
        use std::cmp::Reverse;

        (Reverse(self.start_line), Reverse(self.end_line))
    }
}
47
48/// Parses master specification XML content into a vector of [`WorkerTask`] items.
49///
50/// This function splits the input by `<worker_task` tags and extracts:
51/// - `id`: Unique task identifier from `id="..."` attribute
52/// - `target`: Target file/path from `target="..."` attribute  
53/// - `instruction`: The payload content between opening tag and `</worker_task>`
54///
55/// # Arguments
56/// * `xml_content` — Raw XML-ish string containing worker task definitions
57///
58/// # Returns
59/// A `Vec<WorkerTask>` with all parsed tasks (skips malformed blocks)
60///
61/// # Example
62/// ```ignore
63/// let xml = r#"<worker_task id="w-001" target="src/main.rs">
64///     // Do something
65/// </worker_task>"#;
66/// let tasks = parse_master_spec(xml);
67/// assert_eq!(tasks.len(), 1);
68/// ```
69pub fn parse_master_spec(xml_content: &str) -> Vec<WorkerTask> {
70    let mut tasks = Vec::new();
71    let iter = xml_content.split("<worker_task");
72
73    // Skip the first block because the payload physically starts after `<worker_task`
74    for block in iter.skip(1) {
75        let Some(tag_end) = block.find('>') else {
76            continue;
77        };
78        let tag_attrs = &block[..tag_end];
79
80        // Parse ID dynamically
81        let id_start = tag_attrs.find("id=\"").map(|i| i + 4).unwrap_or(0);
82        let id_end = tag_attrs[id_start..].find('"').unwrap_or(0) + id_start;
83        let id = &tag_attrs[id_start..id_end];
84
85        // Parse Target physically
86        let target_start = tag_attrs.find("target=\"").map(|i| i + 8).unwrap_or(0);
87        let target_end = tag_attrs[target_start..].find('"').unwrap_or(0) + target_start;
88        let target = &tag_attrs[target_start..target_end];
89
90        // Retrieve instruction payload bounds
91        let content_block = &block[tag_end + 1..];
92        let Some(content_end) = content_block.find("</worker_task>") else {
93            continue;
94        };
95        let instruction = content_block[..content_end].trim();
96
97        tasks.push(WorkerTask {
98            id: id.to_string(),
99            target: target.to_string(),
100            instruction: instruction.to_string(),
101        });
102    }
103
104    tasks
105}
106
107/// Parses scratchpad diff content from `.hematite_scratch` files into [`Hunk`] objects.
108///
109/// This function scans raw XML-ish content for `<patch>` tags and extracts:
110/// - `start`: Starting line number (from `start="..."` attribute)
111/// - `end`: Ending line number (from `end="..."` attribute)
112/// - `content`: The patch content between `<patch>` and `</patch>`
113///
114/// # Arguments
115/// * `raw_content` — Raw string from scratchpad file containing patch tags
116/// * `worker_id` — Identifier of the worker processing this content
117///
118/// # Returns
119/// A `Vec<Hunk>` with all parsed patches (stops at malformed or unclosed tags)
120///
121/// # Example
122/// ```ignore
123/// let xml = r#"<patch start="10" end="20">
124///     // Some diff content
125/// </patch>"#;
126/// let hunks = parse_scratchpad_diffs(xml, "worker-1".to_string());
127/// assert_eq!(hunks[0].start_line, 10);
128/// ```
129#[allow(dead_code)]
130pub fn parse_scratchpad_diffs(raw_content: &str, worker_id: String) -> Vec<Hunk> {
131    let mut hunks = Vec::new();
132    let mut current_pos = 0;
133
134    fn parse_attr(attr_str: &str, key: &str) -> Option<usize> {
135        let key_match = format!("{}=\"", key);
136        let start = attr_str.find(&key_match)? + key_match.len();
137        let end = attr_str[start..].find('"')? + start;
138        attr_str[start..end].parse().ok()
139    }
140
141    while let Some(start_idx) = raw_content[current_pos..].find("<patch") {
142        let absolute_start = current_pos + start_idx;
143        let Some(tag_end) = raw_content[absolute_start..].find('>') else {
144            break;
145        };
146        let attr_str = &raw_content[absolute_start..absolute_start + tag_end];
147
148        let start_line = parse_attr(attr_str, "start").unwrap_or(0);
149        let end_line = parse_attr(attr_str, "end").unwrap_or(0);
150
151        let body_start = absolute_start + tag_end + 1;
152        if let Some(end_idx) = raw_content[body_start..].find("</patch>") {
153            let content = raw_content[body_start..body_start + end_idx]
154                .trim()
155                .to_string();
156            hunks.push(Hunk {
157                start_line,
158                end_line,
159                content,
160                worker_id: worker_id.clone(),
161            });
162            current_pos = body_start + end_idx + 8;
163        } else {
164            break;
165        }
166    }
167    hunks
168}