Skip to main content

hematite/agent/
parser.rs

1// ── src/agent/parser.rs: XML-ish Parser for Swarm Workloads ─────────────────────
//! Resilient parser that maps XML-like tags to structured data without regex overhead.
//! Tolerates LLM output quirks: text around the tags is ignored, and malformed or
//! unterminated elements are dropped instead of raising an error.
4
/// One unit of work for a single worker, as declared by a
/// `<worker_task id="..." target="...">...</worker_task>` element.
///
/// Derives `PartialEq`/`Eq` so parsed tasks can be compared directly
/// (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WorkerTask {
    /// Unique identifier for the worker task (e.g., "w-001", "worker-alpha")
    pub id: String,

    /// Target file path or directory where the work should be applied
    pub target: String,

    /// The instruction/payload describing what work to perform on the target
    pub instruction: String,
}
16
/// A patch for an inclusive, 1-indexed line range, attributed to the worker that produced it.
///
/// Derives `PartialEq`/`Eq` so hunks can be compared directly (for example with `assert_eq!`).
// NOTE(review): the `dead_code` allow presumably exists because nothing consumes hunks yet —
// confirm, and drop it once the type is used.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Hunk {
    /// Starting line number (1-indexed) of the patch region
    pub start_line: usize,

    /// Ending line number (1-indexed, inclusive) of the patch region
    pub end_line: usize,

    /// The actual content/patch to apply within the specified line range
    pub content: String,

    /// Identifier of the worker that generated this hunk (for attribution)
    pub worker_id: String,
}
32
33impl Hunk {
34    #[allow(dead_code)]
35    /// Returns a sort key for ordering hunks by position.
36    /// Uses reverse ordering so higher line numbers come first (useful for bottom-up processing).
37    ///
38    /// # Returns
39    /// A tuple of `(Reverse(start_line), Reverse(end_line))` for stable multi-key sorting.
40    pub fn sort_key(&self) -> (std::cmp::Reverse<usize>, std::cmp::Reverse<usize>) {
41        (
42            std::cmp::Reverse(self.start_line),
43            std::cmp::Reverse(self.end_line),
44        )
45    }
46}
47
48/// Parses master specification XML content into a vector of [`WorkerTask`] items.
49///
50/// This function splits the input by `<worker_task` tags and extracts:
51/// - `id`: Unique task identifier from `id="..."` attribute
52/// - `target`: Target file/path from `target="..."` attribute  
53/// - `instruction`: The payload content between opening tag and `</worker_task>`
54///
55/// # Arguments
56/// * `xml_content` — Raw XML-ish string containing worker task definitions
57///
58/// # Returns
59/// A `Vec<WorkerTask>` with all parsed tasks (skips malformed blocks)
60///
61/// # Example
62/// ```ignore
63/// let xml = r#"<worker_task id="w-001" target="src/main.rs">
64///     // Do something
65/// </worker_task>"#;
66/// let tasks = parse_master_spec(xml);
67/// assert_eq!(tasks.len(), 1);
68/// ```
69pub fn parse_master_spec(xml_content: &str) -> Vec<WorkerTask> {
70    let mut tasks = Vec::new();
71    let iter = xml_content.split("<worker_task");
72
73    // Skip the first block because the payload physically starts after `<worker_task`
74    for block in iter.skip(1) {
75        let Some(tag_end) = block.find('>') else {
76            continue;
77        };
78        let tag_attrs = &block[..tag_end];
79
80        // Parse ID — skip block if attribute is absent or unclosed
81        let Some(id_attr_pos) = tag_attrs.find("id=\"") else {
82            continue;
83        };
84        let id_start = id_attr_pos + 4;
85        let Some(id_end_rel) = tag_attrs[id_start..].find('"') else {
86            continue;
87        };
88        let id = &tag_attrs[id_start..id_start + id_end_rel];
89
90        // Parse Target — skip block if attribute is absent or unclosed
91        let Some(target_attr_pos) = tag_attrs.find("target=\"") else {
92            continue;
93        };
94        let target_start = target_attr_pos + 8;
95        let Some(target_end_rel) = tag_attrs[target_start..].find('"') else {
96            continue;
97        };
98        let target = &tag_attrs[target_start..target_start + target_end_rel];
99
100        // Retrieve instruction payload bounds
101        let content_block = &block[tag_end + 1..];
102        let Some(content_end) = content_block.find("</worker_task>") else {
103            continue;
104        };
105        let instruction = content_block[..content_end].trim();
106
107        tasks.push(WorkerTask {
108            id: id.to_string(),
109            target: target.to_string(),
110            instruction: instruction.to_string(),
111        });
112    }
113
114    tasks
115}
116
117/// Parses scratchpad diff content from `.hematite_scratch` files into [`Hunk`] objects.
118///
119/// This function scans raw XML-ish content for `<patch>` tags and extracts:
120/// - `start`: Starting line number (from `start="..."` attribute)
121/// - `end`: Ending line number (from `end="..."` attribute)
122/// - `content`: The patch content between `<patch>` and `</patch>`
123///
124/// # Arguments
125/// * `raw_content` — Raw string from scratchpad file containing patch tags
126/// * `worker_id` — Identifier of the worker processing this content
127///
128/// # Returns
129/// A `Vec<Hunk>` with all parsed patches (stops at malformed or unclosed tags)
130///
131/// # Example
132/// ```ignore
133/// let xml = r#"<patch start="10" end="20">
134///     // Some diff content
135/// </patch>"#;
136/// let hunks = parse_scratchpad_diffs(xml, "worker-1".to_string());
137/// assert_eq!(hunks[0].start_line, 10);
138/// ```
139#[allow(dead_code)]
140pub fn parse_scratchpad_diffs(raw_content: &str, worker_id: String) -> Vec<Hunk> {
141    let mut hunks = Vec::new();
142    let mut current_pos = 0;
143
144    fn parse_attr(attr_str: &str, key: &str) -> Option<usize> {
145        let klen = key.len();
146        let bytes = attr_str.as_bytes();
147        let mut pos = 0;
148        while let Some(rel) = attr_str[pos..].find(key) {
149            let abs = pos + rel;
150            let after = abs + klen;
151            if bytes.get(after).copied() == Some(b'=')
152                && bytes.get(after + 1).copied() == Some(b'"')
153            {
154                let val_start = after + 2;
155                let end = attr_str[val_start..].find('"')? + val_start;
156                return attr_str[val_start..end].parse().ok();
157            }
158            pos = abs + 1;
159        }
160        None
161    }
162
163    while let Some(start_idx) = raw_content[current_pos..].find("<patch") {
164        let absolute_start = current_pos + start_idx;
165        let Some(tag_end) = raw_content[absolute_start..].find('>') else {
166            break;
167        };
168        let attr_str = &raw_content[absolute_start..absolute_start + tag_end];
169
170        let start_line = parse_attr(attr_str, "start").unwrap_or(0);
171        let end_line = parse_attr(attr_str, "end").unwrap_or(0);
172
173        let body_start = absolute_start + tag_end + 1;
174        if let Some(end_idx) = raw_content[body_start..].find("</patch>") {
175            let content = raw_content[body_start..body_start + end_idx]
176                .trim()
177                .to_string();
178            hunks.push(Hunk {
179                start_line,
180                end_line,
181                content,
182                worker_id: worker_id.clone(),
183            });
184            current_pos = body_start + end_idx + 8;
185        } else {
186            break;
187        }
188    }
189    hunks
190}