hematite/agent/parser.rs
1// ── src/agent/parser.rs: XML-ish Parser for Swarm Workloads ─────────────────────
//! Resilient parser that maps XML-like tags to structured data without regex overhead.
//! Designed to handle LLM output quirks (trailing commas, broken escapes) gracefully.
4
/// A single unit of work parsed out of a `<worker_task ...>` element.
///
/// Derives `PartialEq`/`Eq` so parsed tasks can be compared directly
/// (e.g. with `assert_eq!` or when de-duplicating).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WorkerTask {
    /// Unique identifier for the worker task (e.g., "w-001", "worker-alpha")
    pub id: String,

    /// Target file path or directory where the work should be applied
    pub target: String,

    /// The instruction/payload describing what work to perform on the target
    pub instruction: String,
}
16
/// A patch region parsed out of a `<patch ...>` element, tagged with the worker it came from.
///
/// Derives `PartialEq`/`Eq` so parsed hunks can be compared directly
/// (e.g. with `assert_eq!` or when de-duplicating).
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Hunk {
    /// Starting line number (1-indexed) of the patch region
    pub start_line: usize,

    /// Ending line number (1-indexed, inclusive) of the patch region
    pub end_line: usize,

    /// The actual content/patch to apply within the specified line range
    pub content: String,

    /// Identifier of the worker that generated this hunk (for attribution)
    pub worker_id: String,
}
32
33impl Hunk {
34 #[allow(dead_code)]
35 /// Returns a sort key for ordering hunks by position.
36 /// Uses reverse ordering so higher line numbers come first (useful for bottom-up processing).
37 ///
38 /// # Returns
39 /// A tuple of `(Reverse(start_line), Reverse(end_line))` for stable multi-key sorting.
40 pub fn sort_key(&self) -> (std::cmp::Reverse<usize>, std::cmp::Reverse<usize>) {
41 (
42 std::cmp::Reverse(self.start_line),
43 std::cmp::Reverse(self.end_line),
44 )
45 }
46}
47
48/// Parses master specification XML content into a vector of [`WorkerTask`] items.
49///
50/// This function splits the input by `<worker_task` tags and extracts:
51/// - `id`: Unique task identifier from `id="..."` attribute
52/// - `target`: Target file/path from `target="..."` attribute
53/// - `instruction`: The payload content between opening tag and `</worker_task>`
54///
55/// # Arguments
56/// * `xml_content` — Raw XML-ish string containing worker task definitions
57///
58/// # Returns
59/// A `Vec<WorkerTask>` with all parsed tasks (skips malformed blocks)
60///
61/// # Example
62/// ```ignore
63/// let xml = r#"<worker_task id="w-001" target="src/main.rs">
64/// // Do something
65/// </worker_task>"#;
66/// let tasks = parse_master_spec(xml);
67/// assert_eq!(tasks.len(), 1);
68/// ```
69pub fn parse_master_spec(xml_content: &str) -> Vec<WorkerTask> {
70 let mut tasks = Vec::new();
71 let iter = xml_content.split("<worker_task");
72
73 // Skip the first block because the payload physically starts after `<worker_task`
74 for block in iter.skip(1) {
75 let Some(tag_end) = block.find('>') else {
76 continue;
77 };
78 let tag_attrs = &block[..tag_end];
79
80 // Parse ID dynamically
81 let id_start = tag_attrs.find("id=\"").map(|i| i + 4).unwrap_or(0);
82 let id_end = tag_attrs[id_start..].find('"').unwrap_or(0) + id_start;
83 let id = &tag_attrs[id_start..id_end];
84
85 // Parse Target physically
86 let target_start = tag_attrs.find("target=\"").map(|i| i + 8).unwrap_or(0);
87 let target_end = tag_attrs[target_start..].find('"').unwrap_or(0) + target_start;
88 let target = &tag_attrs[target_start..target_end];
89
90 // Retrieve instruction payload bounds
91 let content_block = &block[tag_end + 1..];
92 let Some(content_end) = content_block.find("</worker_task>") else {
93 continue;
94 };
95 let instruction = content_block[..content_end].trim();
96
97 tasks.push(WorkerTask {
98 id: id.to_string(),
99 target: target.to_string(),
100 instruction: instruction.to_string(),
101 });
102 }
103
104 tasks
105}
106
107/// Parses scratchpad diff content from `.hematite_scratch` files into [`Hunk`] objects.
108///
109/// This function scans raw XML-ish content for `<patch>` tags and extracts:
110/// - `start`: Starting line number (from `start="..."` attribute)
111/// - `end`: Ending line number (from `end="..."` attribute)
112/// - `content`: The patch content between `<patch>` and `</patch>`
113///
114/// # Arguments
115/// * `raw_content` — Raw string from scratchpad file containing patch tags
116/// * `worker_id` — Identifier of the worker processing this content
117///
118/// # Returns
119/// A `Vec<Hunk>` with all parsed patches (stops at malformed or unclosed tags)
120///
121/// # Example
122/// ```ignore
123/// let xml = r#"<patch start="10" end="20">
124/// // Some diff content
125/// </patch>"#;
126/// let hunks = parse_scratchpad_diffs(xml, "worker-1".to_string());
127/// assert_eq!(hunks[0].start_line, 10);
128/// ```
129#[allow(dead_code)]
130pub fn parse_scratchpad_diffs(raw_content: &str, worker_id: String) -> Vec<Hunk> {
131 let mut hunks = Vec::new();
132 let mut current_pos = 0;
133
134 fn parse_attr(attr_str: &str, key: &str) -> Option<usize> {
135 let key_match = format!("{}=\"", key);
136 let start = attr_str.find(&key_match)? + key_match.len();
137 let end = attr_str[start..].find('"')? + start;
138 attr_str[start..end].parse().ok()
139 }
140
141 while let Some(start_idx) = raw_content[current_pos..].find("<patch") {
142 let absolute_start = current_pos + start_idx;
143 let Some(tag_end) = raw_content[absolute_start..].find('>') else {
144 break;
145 };
146 let attr_str = &raw_content[absolute_start..absolute_start + tag_end];
147
148 let start_line = parse_attr(attr_str, "start").unwrap_or(0);
149 let end_line = parse_attr(attr_str, "end").unwrap_or(0);
150
151 let body_start = absolute_start + tag_end + 1;
152 if let Some(end_idx) = raw_content[body_start..].find("</patch>") {
153 let content = raw_content[body_start..body_start + end_idx]
154 .trim()
155 .to_string();
156 hunks.push(Hunk {
157 start_line,
158 end_line,
159 content,
160 worker_id: worker_id.clone(),
161 });
162 current_pos = body_start + end_idx + 8;
163 } else {
164 break;
165 }
166 }
167 hunks
168}