hematite/agent/parser.rs
// ── src/agent/parser.rs: XML-ish Parser for Swarm Workloads ─────────────────────
//! Resilient parser that maps XML-like tags to structured data without regex overhead.
//! Designed to handle LLM output quirks (trailing commas, broken escapes) gracefully.
4
/// One unit of work extracted from a `<worker_task ...>` element.
///
/// Every field is an owned copy of a slice of the parsed input, so a task can
/// outlive the text it was parsed from. `PartialEq`/`Eq` are derived so parsed
/// tasks can be compared directly (for example in assertions).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WorkerTask {
    /// Unique identifier for the worker task (e.g., "w-001", "worker-alpha"),
    /// taken from the `id="..."` attribute.
    pub id: String,

    /// Target file path or directory where the work should be applied,
    /// taken from the `target="..."` attribute.
    pub target: String,

    /// The instruction/payload describing what work to perform on the target:
    /// the whitespace-trimmed text between the opening tag and `</worker_task>`.
    pub instruction: String,
}
16
/// A line-ranged patch extracted from a `<patch ...>` element.
///
/// `PartialEq`/`Eq` are derived so hunks can be compared directly (for example
/// in assertions).
// NOTE(review): dead_code is silenced here, presumably because not every field
// is read by the current callers yet — confirm and drop the allow once it is.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Hunk {
    /// Starting line number (1-indexed) of the patch region.
    /// `parse_scratchpad_diffs` stores `0` here when the `start` attribute is
    /// missing or is not a valid unsigned integer.
    pub start_line: usize,

    /// Ending line number (1-indexed, inclusive) of the patch region.
    /// `parse_scratchpad_diffs` stores `0` here when the `end` attribute is
    /// missing or is not a valid unsigned integer.
    pub end_line: usize,

    /// The actual content/patch to apply within the specified line range
    /// (the whitespace-trimmed text between the opening tag and `</patch>`).
    pub content: String,

    /// Identifier of the worker that generated this hunk (for attribution).
    pub worker_id: String,
}
32
33impl Hunk {
34 #[allow(dead_code)]
35 /// Returns a sort key for ordering hunks by position.
36 /// Uses reverse ordering so higher line numbers come first (useful for bottom-up processing).
37 ///
38 /// # Returns
39 /// A tuple of `(Reverse(start_line), Reverse(end_line))` for stable multi-key sorting.
40 pub fn sort_key(&self) -> (std::cmp::Reverse<usize>, std::cmp::Reverse<usize>) {
41 (
42 std::cmp::Reverse(self.start_line),
43 std::cmp::Reverse(self.end_line),
44 )
45 }
46}
47
48/// Parses master specification XML content into a vector of [`WorkerTask`] items.
49///
50/// This function splits the input by `<worker_task` tags and extracts:
51/// - `id`: Unique task identifier from `id="..."` attribute
52/// - `target`: Target file/path from `target="..."` attribute
53/// - `instruction`: The payload content between opening tag and `</worker_task>`
54///
55/// # Arguments
56/// * `xml_content` — Raw XML-ish string containing worker task definitions
57///
58/// # Returns
59/// A `Vec<WorkerTask>` with all parsed tasks (skips malformed blocks)
60///
61/// # Example
62/// ```ignore
63/// let xml = r#"<worker_task id="w-001" target="src/main.rs">
64/// // Do something
65/// </worker_task>"#;
66/// let tasks = parse_master_spec(xml);
67/// assert_eq!(tasks.len(), 1);
68/// ```
69pub fn parse_master_spec(xml_content: &str) -> Vec<WorkerTask> {
70 let mut tasks = Vec::new();
71 let iter = xml_content.split("<worker_task");
72
73 // Skip the first block because the payload physically starts after `<worker_task`
74 for block in iter.skip(1) {
75 let Some(tag_end) = block.find('>') else {
76 continue;
77 };
78 let tag_attrs = &block[..tag_end];
79
80 // Parse ID — skip block if attribute is absent or unclosed
81 let Some(id_attr_pos) = tag_attrs.find("id=\"") else {
82 continue;
83 };
84 let id_start = id_attr_pos + 4;
85 let Some(id_end_rel) = tag_attrs[id_start..].find('"') else {
86 continue;
87 };
88 let id = &tag_attrs[id_start..id_start + id_end_rel];
89
90 // Parse Target — skip block if attribute is absent or unclosed
91 let Some(target_attr_pos) = tag_attrs.find("target=\"") else {
92 continue;
93 };
94 let target_start = target_attr_pos + 8;
95 let Some(target_end_rel) = tag_attrs[target_start..].find('"') else {
96 continue;
97 };
98 let target = &tag_attrs[target_start..target_start + target_end_rel];
99
100 // Retrieve instruction payload bounds
101 let content_block = &block[tag_end + 1..];
102 let Some(content_end) = content_block.find("</worker_task>") else {
103 continue;
104 };
105 let instruction = content_block[..content_end].trim();
106
107 tasks.push(WorkerTask {
108 id: id.to_string(),
109 target: target.to_string(),
110 instruction: instruction.to_string(),
111 });
112 }
113
114 tasks
115}
116
117/// Parses scratchpad diff content from `.hematite_scratch` files into [`Hunk`] objects.
118///
119/// This function scans raw XML-ish content for `<patch>` tags and extracts:
120/// - `start`: Starting line number (from `start="..."` attribute)
121/// - `end`: Ending line number (from `end="..."` attribute)
122/// - `content`: The patch content between `<patch>` and `</patch>`
123///
124/// # Arguments
125/// * `raw_content` — Raw string from scratchpad file containing patch tags
126/// * `worker_id` — Identifier of the worker processing this content
127///
128/// # Returns
129/// A `Vec<Hunk>` with all parsed patches (stops at malformed or unclosed tags)
130///
131/// # Example
132/// ```ignore
133/// let xml = r#"<patch start="10" end="20">
134/// // Some diff content
135/// </patch>"#;
136/// let hunks = parse_scratchpad_diffs(xml, "worker-1".to_string());
137/// assert_eq!(hunks[0].start_line, 10);
138/// ```
139#[allow(dead_code)]
140pub fn parse_scratchpad_diffs(raw_content: &str, worker_id: String) -> Vec<Hunk> {
141 let mut hunks = Vec::new();
142 let mut current_pos = 0;
143
144 fn parse_attr(attr_str: &str, key: &str) -> Option<usize> {
145 let klen = key.len();
146 let bytes = attr_str.as_bytes();
147 let mut pos = 0;
148 while let Some(rel) = attr_str[pos..].find(key) {
149 let abs = pos + rel;
150 let after = abs + klen;
151 if bytes.get(after).copied() == Some(b'=')
152 && bytes.get(after + 1).copied() == Some(b'"')
153 {
154 let val_start = after + 2;
155 let end = attr_str[val_start..].find('"')? + val_start;
156 return attr_str[val_start..end].parse().ok();
157 }
158 pos = abs + 1;
159 }
160 None
161 }
162
163 while let Some(start_idx) = raw_content[current_pos..].find("<patch") {
164 let absolute_start = current_pos + start_idx;
165 let Some(tag_end) = raw_content[absolute_start..].find('>') else {
166 break;
167 };
168 let attr_str = &raw_content[absolute_start..absolute_start + tag_end];
169
170 let start_line = parse_attr(attr_str, "start").unwrap_or(0);
171 let end_line = parse_attr(attr_str, "end").unwrap_or(0);
172
173 let body_start = absolute_start + tag_end + 1;
174 if let Some(end_idx) = raw_content[body_start..].find("</patch>") {
175 let content = raw_content[body_start..body_start + end_idx]
176 .trim()
177 .to_string();
178 hunks.push(Hunk {
179 start_line,
180 end_line,
181 content,
182 worker_id: worker_id.clone(),
183 });
184 current_pos = body_start + end_idx + 8;
185 } else {
186 break;
187 }
188 }
189 hunks
190}