// aico/diffing/parser.rs

1use crate::diffing::diff_utils::generate_diff;
2use crate::diffing::patching::create_patched_content;
3use crate::models::{StreamYieldItem, UnparsedBlock};
4use regex::Regex;
5use std::collections::HashMap;
6use std::path::Path;
7use std::sync::LazyLock;
8
/// Incremental parser for an LLM output stream that interleaves markdown
/// text, `File:` headers, and SEARCH/REPLACE patch blocks.
pub struct StreamParser<'a> {
    /// Raw stream text that has been fed but not yet consumed.
    buffer: String,
    /// Path of the file section currently being parsed, if any.
    current_file: Option<String>,
    /// Queue for items found during parsing that are waiting to be yielded.
    yield_queue: std::collections::VecDeque<StreamYieldItem>,
    /// Baseline contents provided by the session.
    baseline: &'a HashMap<String, String>,
    /// Overlay of files modified during this stream.
    overlay: HashMap<String, String>,
    /// Maps filenames to their content pre-modification in this stream.
    discovered_baseline: HashMap<String, String>,
}
21
22impl<'a> StreamParser<'a> {
23    pub fn get_pending_content(&self) -> String {
24        self.buffer.clone()
25    }
26
27    pub fn new(original_contents: &'a HashMap<String, String>) -> Self {
28        Self {
29            buffer: String::new(),
30            current_file: None,
31            yield_queue: std::collections::VecDeque::new(),
32            baseline: original_contents,
33            overlay: HashMap::new(),
34            discovered_baseline: HashMap::new(),
35        }
36    }
37
38    /// Feeds a new chunk of text into the parser.
39    /// Use the Iterator implementation (next()) to retrieve yielded items.
40    pub fn feed(&mut self, chunk: &str) {
41        self.buffer.push_str(chunk);
42    }
43
44    /// Convenience method to feed content and return resolved yields in one go.
45    pub fn parse_and_resolve(&mut self, chunk: &str, session_root: &Path) -> Vec<StreamYieldItem> {
46        self.feed(chunk);
47        let raw_yields: Vec<_> = self.by_ref().collect();
48        self.process_yields(raw_yields, session_root)
49    }
50
51    /// Centralized finalization logic to resolve any remaining buffer content,
52    /// process patches, and build the final diff and structured display items.
53    pub fn final_resolve(
54        &mut self,
55        session_root: &Path,
56    ) -> (String, Vec<crate::models::DisplayItem>, Vec<String>) {
57        // 1. Drain any items currently in the iterator/buffer
58        let (_, raw_yields, _) = self.finish("");
59
60        // 2. Resolve Patch items into DiffBlocks (and update overlay/discovered_baseline)
61        let processed = self.process_yields(raw_yields, session_root);
62
63        // 3. Collect final state
64        let warnings = self.collect_warnings(&processed);
65        let diff = self.build_final_unified_diff();
66        let display_items = processed
67            .into_iter()
68            .filter_map(|y| y.to_display_item(true))
69            .collect();
70
71        (diff, display_items, warnings)
72    }
73}
74
/// Matches one `File: <path>` header line (optionally indented) anywhere in
/// the buffer; `line` captures the whole line including its terminator,
/// `path` captures just the path text.
static FILE_HEADER_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^(?P<line>[ \t]*File:[ \t]*(?P<path>.*?)\r?\n)").unwrap());
77
impl<'a> Iterator for StreamParser<'a> {
    type Item = StreamYieldItem;

    /// Pulls the next fully-parsed item out of the buffer, or returns `None`
    /// when the remaining bytes are incomplete and more input is needed.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // 1. First, drain the pre-parsed queue
            if let Some(item) = self.yield_queue.pop_front() {
                return Some(item);
            }

            if self.buffer.is_empty() {
                return None;
            }

            // 2. If we are currently "inside" a file's content section
            if let Some(llm_file_path) = self.current_file.clone() {
                // File content runs up to the next `File:` header, or to the
                // end of the buffer if no header is visible yet.
                let next_header_idx = FILE_HEADER_RE
                    .find(&self.buffer)
                    .map(|m| m.start())
                    .unwrap_or(self.buffer.len());

                if next_header_idx > 0 {
                    let (chunk_items, consumed_bytes) =
                        self.process_file_chunk(&llm_file_path, &self.buffer[..next_header_idx]);
                    self.buffer.drain(..consumed_bytes);

                    if !chunk_items.is_empty() {
                        self.yield_queue.extend(chunk_items);
                        continue;
                    }

                    // Progress was made but nothing was yielded; re-scan.
                    if consumed_bytes > 0 {
                        continue;
                    }

                    // If waiting for data within a file block, do not fall through to Text parsing.
                    if next_header_idx == self.buffer.len() {
                        return None;
                    }
                }

                // A new header begins in the buffer; close the current file
                // section and let the header branch below consume it.
                if next_header_idx < self.buffer.len() {
                    self.current_file = None;
                    continue;
                }
            }

            // 3. Look for Global File Headers
            if let Some(caps) = FILE_HEADER_RE.captures(&self.buffer) {
                let mat = caps.get(0).unwrap();
                // Emit any plain text that precedes the header first.
                if mat.start() > 0 {
                    let text = self.buffer[..mat.start()].to_string();
                    self.buffer.drain(..mat.start());
                    return Some(StreamYieldItem::Text(text));
                }

                // Strip markdown decorations (bold/backticks) around the path.
                let path_str = caps
                    .name("path")
                    .unwrap()
                    .as_str()
                    .trim()
                    .trim_matches(|c| c == '*' || c == '`')
                    .to_string();
                self.current_file = Some(path_str.clone());
                self.buffer.drain(..mat.end());
                return Some(StreamYieldItem::FileHeader(crate::models::FileHeader {
                    llm_file_path: path_str,
                }));
            }

            // 4. Handle remaining buffer as Markdown Text
            let text = &self.buffer;
            let mut stable_len = text.len();

            // Hold back any suffix that might be the beginning of a header or
            // patch marker still arriving in a later chunk.
            if self.is_incomplete(text) {
                if let Some(m) = FILE_HEADER_RE.find(text) {
                    stable_len = m.start();
                } else if let Some(search_idx) = text.find("<<<<<<< SEARCH") {
                    // Hold back from the start of the line carrying the marker.
                    stable_len = text[..search_idx].rfind('\n').map(|i| i + 1).unwrap_or(0);
                } else if let Some(last_newline) = text.rfind('\n')) {
                    let last_line = &text[last_newline + 1..];
                    if self.is_incomplete(last_line) {
                        stable_len = last_newline + 1;
                    }
                } else {
                    stable_len = 0;
                }
            }

            if stable_len > 0 {
                let text_yield = self.buffer[..stable_len].to_string();
                self.buffer.drain(..stable_len);
                return Some(StreamYieldItem::Text(text_yield));
            }

            return None;
        }
    }
}
177
178impl<'a> StreamParser<'a> {
    /// Heuristically decides whether `text` ends in (or contains) a construct
    /// that is still streaming in, meaning the caller should wait for more
    /// input instead of flushing the tail as plain text.
    fn is_incomplete(&self, text: &str) -> bool {
        // Check if we are inside an unclosed SEARCH block
        if let Some(idx) = text.find("<<<<<<< SEARCH") {
            // The marker only counts when preceded on its line by
            // whitespace-only indentation.
            let line_start = text[..idx].rfind('\n').map(|i| i + 1).unwrap_or(0);
            let indent = &text[line_start..idx];
            if indent.chars().all(|c| c.is_whitespace()) && !text.contains(">>>>>>> REPLACE") {
                return true;
            }
        }

        // Check for partial tokens at the end of the buffer
        if let Some(last_line) = text.split('\n').next_back() {
            let trimmed = last_line.trim_start();
            if !trimmed.is_empty() {
                // Partial "File:" header?
                // Note: We deliberately don't check for \r here, forcing a wait for \n
                if "File:".starts_with(trimmed) && trimmed.len() < "File:".len() {
                    return true;
                }
                // A complete "File:" prefix still needs its trailing newline
                // before FILE_HEADER_RE can match it.
                if trimmed.starts_with("File:") && !text.ends_with('\n') {
                    return true;
                }

                // Partial markers?
                for marker in ["<<<<<<< SEARCH", "=======", ">>>>>>> REPLACE"] {
                    if marker.starts_with(trimmed) && marker.len() > trimmed.len() {
                        return true;
                    }
                }
            }
        }
        false
    }
212
    /// Splits one file-section chunk into `Text` and `Patch` items.
    ///
    /// Only bytes that are definitely resolved are consumed: a SEARCH block
    /// whose `=======` or `>>>>>>> REPLACE` marker has not arrived yet is left
    /// unconsumed (the returned length stops at the block start) so a later
    /// chunk can complete it. All three markers of a block must share the
    /// same leading indentation.
    fn process_file_chunk(&self, llm_path: &str, chunk: &str) -> (Vec<StreamYieldItem>, usize) {
        let mut items = Vec::new();
        let mut cursor = 0;
        let search_pattern = "<<<<<<< SEARCH";
        let sep_pattern = "=======";
        let replace_pattern = ">>>>>>> REPLACE";

        while cursor < chunk.len() {
            let search_idx = match chunk[cursor..].find(search_pattern) {
                Some(i) => cursor + i,
                None => break,
            };

            // Capture indentation from the start of the line up to the marker
            let line_start = chunk[..search_idx].rfind('\n').map(|i| i + 1).unwrap_or(0);
            let indent_slice = &chunk[line_start..search_idx];

            // Verify indent consists only of whitespace
            if !indent_slice.chars().all(|c| c.is_whitespace()) {
                // If it's not a marker at the start of a line, skip it
                items.push(StreamYieldItem::Text(
                    chunk[cursor..search_idx + 1].to_string(),
                ));
                cursor = search_idx + 1;
                continue;
            }

            let block_search_start = search_idx + search_pattern.len();
            let block_search_start_content =
                block_search_start + consume_line_ending(&chunk[block_search_start..]);

            let (sep_line_start, sep_line_end) =
                match find_marker_with_indent(chunk, sep_pattern, block_search_start, indent_slice)
                {
                    Some(pair) => pair,
                    None => {
                        // Separator not seen yet: flush text before the block
                        // and stop consuming at the block start.
                        if search_idx > cursor {
                            items
                                .push(StreamYieldItem::Text(chunk[cursor..search_idx].to_string()));
                        }
                        return (items, search_idx);
                    }
                };

            let block_replace_start_content =
                sep_line_end + consume_line_ending(&chunk[sep_line_end..]);

            let (replace_line_start, _replace_line_end) =
                match find_marker_with_indent(chunk, replace_pattern, sep_line_end, indent_slice) {
                    Some(pair) => pair,
                    None => {
                        // Closing marker not seen yet: same partial handling.
                        if search_idx > cursor {
                            items
                                .push(StreamYieldItem::Text(chunk[cursor..search_idx].to_string()));
                        }
                        return (items, search_idx);
                    }
                };

            // Text preceding a complete block is emitted verbatim.
            if search_idx > cursor {
                items.push(StreamYieldItem::Text(chunk[cursor..search_idx].to_string()));
            }

            let final_end = replace_line_start + indent_slice.len() + replace_pattern.len();

            let search_content = &chunk[block_search_start_content..sep_line_start];
            let replace_content = &chunk[block_replace_start_content..replace_line_start];

            items.push(StreamYieldItem::Patch(crate::models::AIPatch {
                llm_file_path: llm_path.to_string(),
                search_content: search_content.to_string(),
                replace_content: replace_content.to_string(),
                indent: indent_slice.to_string(),
                raw_block: chunk[search_idx..final_end].to_string(),
            }));

            cursor = final_end;
        }

        // Trailing text after the last block is only flushed once it cannot
        // be the prefix of another marker still streaming in.
        if cursor < chunk.len() {
            let tail = &chunk[cursor..];
            if !self.is_incomplete(tail) {
                items.push(StreamYieldItem::Text(tail.to_string()));
                cursor = chunk.len();
            }
        }

        (items, cursor)
    }
302
303    pub fn handle_patch(
304        &mut self,
305        patch: &crate::models::AIPatch,
306        _root: &Path,
307    ) -> (Option<StreamYieldItem>, Vec<String>) {
308        let mut warnings = Vec::new();
309
310        let resolution = self.resolve_path(&patch.llm_file_path, _root, &patch.search_content);
311
312        if let Some(w) = resolution.0 {
313            warnings.push(w.clone());
314        }
315
316        if let Some((path, fallback)) = resolution.1 {
317            if let Some(fb) = fallback {
318                self.overlay
319                    .entry(path.clone())
320                    .or_insert_with(|| fb.clone());
321                self.discovered_baseline.entry(path.clone()).or_insert(fb);
322            }
323
324            let original = self
325                .overlay
326                .get(&path)
327                .map(|s| s.as_str())
328                .or_else(|| self.baseline.get(&path).map(|s| s.as_str()))
329                .unwrap_or("");
330
331            if let Some(new_content) =
332                create_patched_content(original, &patch.search_content, &patch.replace_content)
333            {
334                let diff = generate_diff(&path, Some(original), Some(&new_content));
335                self.overlay.insert(path.clone(), new_content.clone());
336                (
337                    Some(StreamYieldItem::DiffBlock(
338                        crate::models::ProcessedDiffBlock {
339                            llm_file_path: patch.llm_file_path.clone(),
340                            unified_diff: diff,
341                        },
342                    )),
343                    warnings,
344                )
345            } else {
346                warnings.push(format!(
347                    "The SEARCH block from the AI could not be found in '{}'. Patch skipped.",
348                    path
349                ));
350
351                (
352                    Some(StreamYieldItem::Unparsed(crate::models::UnparsedBlock {
353                        text: patch.raw_block.clone(),
354                    })),
355                    warnings,
356                )
357            }
358        } else {
359            warnings.push(format!(
360                "File '{}' from the AI does not match any file in context. Patch skipped.",
361                patch.llm_file_path
362            ));
363
364            (
365                Some(StreamYieldItem::Unparsed(crate::models::UnparsedBlock {
366                    text: patch.raw_block.clone(),
367                })),
368                warnings,
369            )
370        }
371    }
372
373    pub fn finish(&mut self, last_chunk: &str) -> (String, Vec<StreamYieldItem>, Vec<String>) {
374        // Process any final tokens received.
375        self.feed(last_chunk);
376
377        // Force flush if we are stuck waiting for a newline at EOF for a complete block
378        if self.is_incomplete(&self.buffer)
379            && self.buffer.contains("<<<<<<< SEARCH")
380            && self.buffer.contains(">>>>>>> REPLACE")
381        {
382            self.buffer.push('\n');
383        }
384
385        let mut items: Vec<_> = self.by_ref().collect();
386
387        // Anything remaining in the buffer is now considered a trailing segment.
388        if !self.buffer.is_empty() {
389            let looks_like_marker = self.is_incomplete(&self.buffer);
390
391            if looks_like_marker {
392                items.push(StreamYieldItem::Unparsed(UnparsedBlock {
393                    text: self.buffer.clone(),
394                }));
395            } else {
396                items.push(StreamYieldItem::Text(self.buffer.clone()));
397            }
398            self.buffer.clear();
399        }
400
401        let diff = self.build_final_unified_diff();
402
403        let warnings = self.collect_warnings(&items);
404
405        (diff, items, warnings)
406    }
407
408    pub fn collect_warnings(&self, items: &[StreamYieldItem]) -> Vec<String> {
409        items
410            .iter()
411            .filter_map(|i| match i {
412                StreamYieldItem::Warning(w) => Some(w.text.clone()),
413                _ => None,
414            })
415            .collect()
416    }
417
418    /// Processes a list of raw yields, resolving any Patch items into DiffBlocks or Warnings.
419    pub fn process_yields(
420        &mut self,
421        items: Vec<StreamYieldItem>,
422        session_root: &Path,
423    ) -> Vec<StreamYieldItem> {
424        let mut processed = Vec::with_capacity(items.len());
425        for item in items {
426            if let StreamYieldItem::Patch(ref patch) = item {
427                let (resolved, warnings) = self.handle_patch(patch, session_root);
428                for w in warnings {
429                    processed.push(StreamYieldItem::Warning(crate::models::WarningMessage {
430                        text: w,
431                    }));
432                }
433                if let Some(res) = resolved {
434                    processed.push(res);
435                }
436            } else {
437                processed.push(item);
438            }
439        }
440        processed
441    }
442
443    pub fn build_final_unified_diff(&self) -> String {
444        let mut diffs = String::new();
445        let keys: std::collections::BTreeSet<_> = self
446            .discovered_baseline
447            .keys()
448            .chain(self.overlay.keys())
449            .collect();
450
451        for k in keys {
452            let old = self
453                .discovered_baseline
454                .get(k)
455                .map(|s| s.as_str())
456                .or_else(|| self.baseline.get(k).map(|s| s.as_str()));
457            let new = self.overlay.get(k).map(|s| s.as_str());
458
459            if old != new {
460                let d = generate_diff(k, old, new);
461                diffs.push_str(&d);
462            }
463        }
464        diffs
465    }
466
    /// Maps an LLM-supplied path onto a known file.
    ///
    /// Returns `(optional_warning, optional_resolution)` where a resolution
    /// is `(path, optional_disk_content)`; the content is `Some` only when
    /// the file was read from disk rather than found in the session context.
    fn resolve_path(
        &self,
        llm_path: &str,
        root: &Path,
        search_block: &str,
    ) -> (Option<String>, Option<(String, Option<String>)>) {
        // Already known to this stream or to the session baseline.
        if self.overlay.contains_key(llm_path) || self.baseline.contains_key(llm_path) {
            return (None, Some((llm_path.to_string(), None)));
        }
        // An empty SEARCH block means "create the file"; accept the path
        // as-is without requiring it to exist anywhere.
        if search_block.trim().is_empty() {
            return (None, Some((llm_path.to_string(), None)));
        }
        // Fallback: read an unknown file from disk, but only when it
        // canonicalizes to a location inside the session root (prevents
        // path traversal outside the session).
        let abs_path = root.join(llm_path);
        if abs_path.exists()
            && let Ok(canon) = abs_path.canonicalize()
            && let Ok(root_canon) = root.canonicalize()
            && canon.starts_with(root_canon)
            && let Ok(content) = std::fs::read_to_string(&abs_path)
        {
            let msg = format!(
                "File '{}' was not in the session context but was found on disk.",
                llm_path
            );
            return (Some(msg), Some((llm_path.to_string(), Some(content))));
        }
        (None, None)
    }
494}
495
/// Returns how many bytes at the start of `s` form a single line ending:
/// 2 for `\r\n`, 1 for a bare `\n`, 0 otherwise (including a lone `\r`).
fn consume_line_ending(s: &str) -> usize {
    match s.as_bytes() {
        [b'\r', b'\n', ..] => 2,
        [b'\n', ..] => 1,
        _ => 0,
    }
}
505
/// Scans `chunk` from `start_pos` for `marker` occurring at the start of a
/// line with exactly `expected_indent` before it, followed only by
/// non-newline whitespace up to the line terminator.
///
/// Returns `(line_start, line_end)` byte offsets of the matching marker
/// line, where `line_end` points at the terminating `\n` (or `chunk.len()`
/// when the marker line is unterminated).
fn find_marker_with_indent(
    chunk: &str,
    marker: &str,
    start_pos: usize,
    expected_indent: &str,
) -> Option<(usize, usize)> {
    let mut pos = start_pos;
    while let Some(rel) = chunk[pos..].find(marker) {
        let hit = pos + rel;
        let line_start = match chunk[..hit].rfind('\n') {
            Some(nl) => nl + 1,
            None => 0,
        };

        if &chunk[line_start..hit] == expected_indent {
            let tail_start = hit + marker.len();
            let line_end = chunk[tail_start..]
                .find('\n')
                .map_or(chunk.len(), |nl| tail_start + nl);
            // Only spaces/tabs/`\r` may follow the marker; allowing `\r`
            // keeps CRLF lines valid while rejecting markers embedded in
            // other text.
            let tail_is_blank = chunk[tail_start..line_end]
                .chars()
                .all(|c| c.is_whitespace() && c != '\n');
            if tail_is_blank {
                return Some((line_start, line_end));
            }
        }
        pos = hit + marker.len();
    }
    None
}