Skip to main content

aico/diffing/
parser.rs

1use crate::diffing::diff_utils::generate_diff;
2use crate::diffing::patching::create_patched_content;
3use crate::models::{StreamYieldItem, UnparsedBlock};
4use regex::Regex;
5use std::collections::HashMap;
6use std::path::Path;
7use std::sync::LazyLock;
8
/// Incremental parser for an LLM response stream that interleaves markdown
/// text, `File:` headers, and SEARCH/REPLACE patch blocks.
///
/// Feed chunks via `feed`/`feed_complete` and drain parsed items through the
/// `Iterator` implementation; `finish`/`final_resolve` flush the stream.
pub struct StreamParser<'a> {
    /// Raw stream text that has been fed but not yet consumed by the parser.
    buffer: String,
    /// Path from the most recent `File:` header while inside that file's section.
    current_file: Option<String>,
    /// Queue for items found during parsing that are waiting to be yielded.
    yield_queue: std::collections::VecDeque<StreamYieldItem>,
    /// Baseline contents provided by the session.
    baseline: &'a HashMap<String, String>,
    /// Overlay of files modified during this stream.
    overlay: HashMap<String, String>,
    /// Maps filenames to their content pre-modification in this stream.
    discovered_baseline: HashMap<String, String>,
    /// Tracks if the last yielded character was a newline.
    /// Used to enforce line-start anchors for headers.
    last_char_was_newline: bool,
}
24
25impl<'a> StreamParser<'a> {
26    pub fn get_pending_content(&self) -> String {
27        self.buffer.clone()
28    }
29
30    pub fn is_pending_displayable(&self) -> bool {
31        let pending = &self.buffer;
32        if pending.is_empty() {
33            return false;
34        }
35
36        let tail_is_at_line_start = if pending.rfind('\n').is_some() {
37            true
38        } else {
39            self.last_char_was_newline
40        };
41
42        if !tail_is_at_line_start {
43            return true;
44        }
45
46        let last_line = pending.split('\n').next_back().unwrap_or("");
47        let trimmed = last_line.trim_start();
48
49        // 1. GLOBAL CHECK: File Headers
50        // Block if the tail looks like the start of a "File:" line.
51        if !trimmed.is_empty()
52            && ("File:".starts_with(trimmed)
53                || (trimmed.starts_with("File:") && !pending.ends_with('\n')))
54        {
55            return false;
56        }
57
58        // 2. CONTEXT CHECK: Diff Markers
59        // We only care about diff markers if we are actively inside a file context.
60        if self.current_file.is_some() {
61            // A. Body Check: Are we buffering a block?
62            // If the buffer contains the start marker, we are inside a block (or waiting for it to close).
63            // We must hold back everything until the parser consumes it.
64            if pending.contains("<<<<<<< SEARCH") {
65                return false;
66            }
67
68            // B. Tail Check: Is a block starting right now?
69            // We ONLY need to check for the start marker.
70            // (We don't check for ======= or >>>>>>> because if we see those WITHOUT
71            // the start marker in the body check above, they are just text).
72            if !trimmed.is_empty() && "<<<<<<< SEARCH".starts_with(trimmed) {
73                return false;
74            }
75        }
76
77        true
78    }
79
80    pub fn new(original_contents: &'a HashMap<String, String>) -> Self {
81        Self {
82            buffer: String::new(),
83            current_file: None,
84            yield_queue: std::collections::VecDeque::new(),
85            baseline: original_contents,
86            overlay: HashMap::new(),
87            discovered_baseline: HashMap::new(),
88            // Start of stream is treated as start of a line
89            last_char_was_newline: true,
90        }
91    }
92
93    /// Feeds a new chunk of text into the parser.
94    /// Use the Iterator implementation (next()) to retrieve yielded items.
95    pub fn feed(&mut self, chunk: &str) {
96        self.buffer.push_str(chunk);
97    }
98
99    /// Feeds content ensuring a trailing newline for correct parsing of final blocks.
100    pub fn feed_complete(&mut self, content: &str) {
101        self.feed(content);
102        if !content.ends_with('\n') {
103            self.feed("\n");
104        }
105    }
106
107    /// Convenience method to feed content and return resolved yields in one go.
108    pub fn parse_and_resolve(&mut self, chunk: &str, session_root: &Path) -> Vec<StreamYieldItem> {
109        self.feed(chunk);
110        let raw_yields: Vec<_> = self.by_ref().collect();
111        self.process_yields(raw_yields, session_root)
112    }
113
114    /// Centralized finalization logic to resolve any remaining buffer content,
115    /// process patches, and build the final diff and structured display items.
116    pub fn final_resolve(
117        &mut self,
118        session_root: &Path,
119    ) -> (String, Vec<crate::models::DisplayItem>, Vec<String>) {
120        // 1. Drain any items currently in the iterator/buffer
121        let (_, raw_yields, _) = self.finish("");
122
123        // 2. Resolve Patch items into DiffBlocks (and update overlay/discovered_baseline)
124        let processed = self.process_yields(raw_yields, session_root);
125
126        // 3. Collect final state
127        let warnings = self.collect_warnings(&processed);
128        let diff = self.build_final_unified_diff();
129        let display_items = processed
130            .into_iter()
131            .filter_map(|y| y.to_display_item(true))
132            .collect();
133
134        (diff, display_items, warnings)
135    }
136
137    fn update_newline_state(&mut self, item: &StreamYieldItem) {
138        match item {
139            StreamYieldItem::Text(s) => self.last_char_was_newline = s.ends_with('\n'),
140            StreamYieldItem::Unparsed(u) => self.last_char_was_newline = u.text.ends_with('\n'),
141            StreamYieldItem::FileHeader(_) => self.last_char_was_newline = true, // Headers end with \n
142            StreamYieldItem::Patch(p) => self.last_char_was_newline = p.raw_block.ends_with('\n'),
143            StreamYieldItem::DiffBlock(d) => {
144                self.last_char_was_newline = d.unified_diff.ends_with('\n')
145            }
146            StreamYieldItem::Warning(_) => {} // Metadata doesn't affect flow
147            StreamYieldItem::IncompleteBlock(b) => self.last_char_was_newline = b.ends_with('\n'),
148        }
149    }
150
151    fn check_header_match(&self, m: regex::Match, start_of_buffer_is_start_of_line: bool) -> bool {
152        // A match is valid if it starts at index > 0 (meaning a previous newline exists at m.start()-1)
153        // OR if it starts at 0 and the parser state says we are at the start of a line.
154        m.start() > 0 || start_of_buffer_is_start_of_line
155    }
156}
157
/// Matches one `File: <path>` line in multiline mode, capturing the whole line
/// (`line`) and the path portion (`path`); tolerates surrounding spaces/tabs
/// and a CRLF terminator. Compiled once lazily.
static FILE_HEADER_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^(?P<line>[ \t]*File:[ \t]*(?P<path>.*?)\r?\n)").unwrap());
160
impl<'a> Iterator for StreamParser<'a> {
    type Item = StreamYieldItem;

    /// Yields the next fully-parsed item, or `None` when more stream data is
    /// required (or the buffer is fully drained).
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // 1. First, drain the pre-parsed queue
            if let Some(item) = self.yield_queue.pop_front() {
                self.update_newline_state(&item);
                return Some(item);
            }

            if self.buffer.is_empty() {
                return None;
            }

            // 2. If we are currently "inside" a file's content section
            if let Some(llm_file_path) = self.current_file.clone() {
                // Find next potential header to switch context
                let mut next_header_idx = self.buffer.len();
                for m in FILE_HEADER_RE.find_iter(&self.buffer) {
                    // Use helper to check validity against current line state
                    if self.check_header_match(m, self.last_char_was_newline) {
                        next_header_idx = m.start();
                        break;
                    }
                }

                // Process content UP TO that header (or end of buffer)
                let chunk_limit = next_header_idx;
                if chunk_limit > 0 || (chunk_limit == 0 && self.last_char_was_newline) {
                    let (chunk_items, consumed_bytes) =
                        self.process_file_chunk(&llm_file_path, &self.buffer[..chunk_limit]);

                    if consumed_bytes > 0 {
                        self.buffer.drain(..consumed_bytes);
                    }

                    if !chunk_items.is_empty() {
                        // Queue the parsed items; the top of the loop yields them.
                        self.yield_queue.extend(chunk_items);
                        continue;
                    }

                    // If we made no progress but have a limit
                    if consumed_bytes == 0 {
                        // If the limit was a header, we are done with this file.
                        if next_header_idx < self.buffer.len() {
                            self.current_file = None;
                            continue;
                        } else {
                            // Valid content but incomplete block at the end. Wait for more data.
                            return None;
                        }
                    }
                } else {
                    // Start of buffer is a header.
                    self.current_file = None;
                    continue;
                }
            }

            // 3. Look for Global File Headers
            // We only look for headers if we are at a clean line start (managed by logic inside loop)
            if let Some(caps) = FILE_HEADER_RE.captures(&self.buffer) {
                let mat = caps.get(0).unwrap();
                if self.check_header_match(mat, self.last_char_was_newline) {
                    if mat.start() > 0 {
                        // Emit any plain text preceding the header first.
                        let text = self.buffer[..mat.start()].to_string();
                        self.buffer.drain(..mat.start());
                        let item = StreamYieldItem::Text(text);
                        self.update_newline_state(&item);
                        return Some(item);
                    }

                    // Strip surrounding whitespace plus markdown bold/backtick
                    // decoration from the captured path.
                    let path_str = caps
                        .name("path")
                        .unwrap()
                        .as_str()
                        .trim()
                        .trim_matches(|c| c == '*' || c == '`')
                        .to_string();
                    self.current_file = Some(path_str.clone());
                    self.buffer.drain(..mat.end());
                    let item = StreamYieldItem::FileHeader(crate::models::FileHeader {
                        llm_file_path: path_str,
                    });
                    self.update_newline_state(&item);
                    return Some(item);
                }
                // Invalid match (e.g. " File:" mid-line). Treat as text.
            }

            // 4. Handle remaining buffer as Markdown Text
            let text = &self.buffer;
            let mut limit = text.len();

            // Always truncate at the first valid header found inside the buffer
            for m in FILE_HEADER_RE.find_iter(text) {
                if self.check_header_match(m, self.last_char_was_newline) {
                    limit = m.start();
                    break;
                }
            }

            // Always truncate at the first valid diff marker found inside the buffer
            if let Some(search_idx) = text[..limit].find("<<<<<<< SEARCH") {
                // Only honor the marker when it sits at the start of a line.
                let ls = text[..search_idx].rfind('\n').map(|i| i + 1).unwrap_or(0);
                if ls > 0 || self.last_char_was_newline {
                    limit = limit.min(ls);
                }
            }

            // If the remaining part is incomplete, we must truncate further
            // (hold back the possibly-partial last line until more data arrives).
            if self.is_incomplete(&text[..limit]) {
                if let Some(last_newline) = text[..limit].rfind('\n') {
                    limit = last_newline + 1;
                } else {
                    limit = 0;
                }
            }

            if limit > 0 {
                let text_yield = self.buffer[..limit].to_string();
                self.buffer.drain(..limit);
                let item = StreamYieldItem::Text(text_yield);
                self.update_newline_state(&item);
                return Some(item);
            }

            return None;
        }
    }
}
293
294impl<'a> StreamParser<'a> {
    /// Returns `true` when `text` ends in something that could still grow
    /// into a marker (partial `File:` header or diff marker) or contains an
    /// unclosed SEARCH block — meaning the caller must hold the tail back
    /// and wait for more stream data before emitting it as plain content.
    fn is_incomplete(&self, text: &str) -> bool {
        // We need to determine the last line content
        let last_line = match text.rfind('\n') {
            Some(idx) => &text[idx + 1..],
            None => {
                // If we are mid-stream and haven't seen a newline, we can't start a line-based marker
                if !self.last_char_was_newline {
                    return false;
                }
                text
            }
        };

        // 1. Whitespace Gate:
        // If the tail is pure whitespace, it might be indentation for a marker.
        // We must wait for non-whitespace or newline.
        if !last_line.is_empty() && last_line.chars().all(|c| c.is_whitespace()) {
            return true;
        }

        let trimmed = last_line.trim_start();

        // 2. File Header Partial Check
        // Either the tail is a prefix of "File:" (keyword still streaming in)
        // or a full "File:" line not yet terminated by a newline.
        if !trimmed.is_empty()
            && ("File:".starts_with(trimmed)
                || (trimmed.starts_with("File:") && !text.ends_with('\n')))
        {
            return true;
        }

        // 3. Diff Marker Partial Check
        // If we are anticipating a marker in the current file context, any prefix resemblance blocks.
        if self.current_file.is_some() && !trimmed.is_empty() {
            if "<<<<<<< SEARCH".starts_with(trimmed) {
                return true;
            }
            if "=======".starts_with(trimmed) {
                return true;
            }
            if ">>>>>>> REPLACE".starts_with(trimmed) {
                return true;
            }
        }

        // 4. Unclosed Block Check
        // A line-anchored "<<<<<<< SEARCH" with no matching ">>>>>>> REPLACE"
        // means a block is still streaming in.
        if self.current_file.is_some()
            && let Some(idx) = text.find("<<<<<<< SEARCH")
        {
            let line_start = text[..idx].rfind('\n').map(|i| i + 1).unwrap_or(0);
            if line_start == 0 && !text.contains('\n') && !self.last_char_was_newline {
                // Invalid mid-line marker
            } else {
                let indent = &text[line_start..idx];
                if indent.chars().all(|c| c.is_whitespace()) && !text.contains(">>>>>>> REPLACE") {
                    return true;
                }
            }
        }

        false
    }
356
    /// Scans `chunk` (content belonging to `llm_path`) for SEARCH/REPLACE
    /// blocks, producing `Text` items for surrounding prose and a `Patch`
    /// item for each complete block.
    ///
    /// Returns the items plus the number of bytes actually consumed; an
    /// incomplete trailing block (or a tail that `is_incomplete`) is left
    /// unconsumed so the caller can retry once more data arrives.
    fn process_file_chunk(&self, llm_path: &str, chunk: &str) -> (Vec<StreamYieldItem>, usize) {
        let mut items = Vec::new();
        let mut cursor = 0;
        let search_pattern = "<<<<<<< SEARCH";
        let sep_pattern = "=======";
        let replace_pattern = ">>>>>>> REPLACE";

        while cursor < chunk.len() {
            let search_idx = match chunk[cursor..].find(search_pattern) {
                Some(i) => cursor + i,
                None => break,
            };

            // Capture indentation from the start of the line up to the marker
            let line_start = chunk[..search_idx].rfind('\n').map(|i| i + 1).unwrap_or(0);
            let indent_slice = &chunk[line_start..search_idx];

            // Verify indent consists only of whitespace
            if !indent_slice.chars().all(|c| c.is_whitespace()) {
                // If it's not a marker at the start of a line, skip it
                items.push(StreamYieldItem::Text(
                    chunk[cursor..search_idx + 1].to_string(),
                ));
                cursor = search_idx + 1;
                continue;
            }

            let block_search_start = search_idx + search_pattern.len();
            let block_search_start_content =
                block_search_start + consume_line_ending(&chunk[block_search_start..]);

            // Locate the ======= separator at the same indentation. If it is
            // not present yet, the block is incomplete: emit any preceding
            // text and report how far we safely consumed.
            let (sep_line_start, sep_line_end) =
                match find_marker_with_indent(chunk, sep_pattern, block_search_start, indent_slice)
                {
                    Some(pair) => pair,
                    None => {
                        let backtrack_pos = line_start.max(cursor);
                        if backtrack_pos > cursor {
                            items.push(StreamYieldItem::Text(
                                chunk[cursor..backtrack_pos].to_string(),
                            ));
                        }
                        return (items, backtrack_pos);
                    }
                };

            let block_replace_start_content =
                sep_line_end + consume_line_ending(&chunk[sep_line_end..]);

            // Same incomplete-block handling for the closing marker.
            let (replace_line_start, _replace_line_end) =
                match find_marker_with_indent(chunk, replace_pattern, sep_line_end, indent_slice) {
                    Some(pair) => pair,
                    None => {
                        let backtrack_pos = line_start.max(cursor);
                        if backtrack_pos > cursor {
                            items.push(StreamYieldItem::Text(
                                chunk[cursor..backtrack_pos].to_string(),
                            ));
                        }
                        return (items, backtrack_pos);
                    }
                };

            // Emit any plain text that preceded this block.
            if search_idx > cursor {
                items.push(StreamYieldItem::Text(chunk[cursor..search_idx].to_string()));
            }

            let final_end = replace_line_start + indent_slice.len() + replace_pattern.len();

            // Trim a trailing \r so CRLF block content compares equal to LF files.
            let mut search_content = &chunk[block_search_start_content..sep_line_start];
            if search_content.ends_with('\r') {
                search_content = &search_content[..search_content.len() - 1];
            }

            let mut replace_content = &chunk[block_replace_start_content..replace_line_start];
            if replace_content.ends_with('\r') {
                replace_content = &replace_content[..replace_content.len() - 1];
            }

            items.push(StreamYieldItem::Patch(crate::models::AIPatch {
                llm_file_path: llm_path.to_string(),
                search_content: search_content.to_string(),
                replace_content: replace_content.to_string(),
                indent: indent_slice.to_string(),
                raw_block: chunk[search_idx..final_end].to_string(),
            }));

            cursor = final_end;
        }

        // Tail after the last complete block: emit as text up to the last safe
        // newline, holding back anything that still looks incomplete.
        if cursor < chunk.len() {
            let tail = &chunk[cursor..];
            let mut tail_limit = tail.len();

            if self.is_incomplete(tail) {
                if let Some(last_newline) = tail.rfind('\n') {
                    tail_limit = last_newline + 1;
                } else {
                    tail_limit = 0;
                }
            }

            if tail_limit > 0 {
                items.push(StreamYieldItem::Text(tail[..tail_limit].to_string()));
                cursor += tail_limit;
            }
        }

        (items, cursor)
    }
467
468    pub fn handle_patch(
469        &mut self,
470        patch: &crate::models::AIPatch,
471        _root: &Path,
472    ) -> (Option<StreamYieldItem>, Vec<String>) {
473        let mut warnings = Vec::new();
474
475        let resolution = self.resolve_path(&patch.llm_file_path, _root, &patch.search_content);
476
477        if let Some(w) = resolution.0 {
478            warnings.push(w.clone());
479        }
480
481        if let Some((path, fallback)) = resolution.1 {
482            if let Some(fb) = fallback {
483                self.overlay
484                    .entry(path.clone())
485                    .or_insert_with(|| fb.clone());
486                self.discovered_baseline.entry(path.clone()).or_insert(fb);
487            }
488
489            let original = self
490                .overlay
491                .get(&path)
492                .map(|s| s.as_str())
493                .or_else(|| self.baseline.get(&path).map(|s| s.as_str()))
494                .unwrap_or("");
495
496            let mut applied = None;
497
498            // Attempt 1: Exact match
499            if let Some(res) =
500                create_patched_content(original, &patch.search_content, &patch.replace_content)
501            {
502                applied = Some(res);
503            }
504            // Attempt 2: Strip \r if patch has it (LLM output \r\n) but file might have \n
505            else if patch.search_content.contains('\r') {
506                let search_normalized = patch.search_content.replace('\r', "");
507                if let Some(res) =
508                    create_patched_content(original, &search_normalized, &patch.replace_content)
509                {
510                    applied = Some(res);
511                }
512            }
513
514            if let Some(new_content) = applied {
515                let diff = generate_diff(&path, Some(original), Some(&new_content));
516                self.overlay.insert(path.clone(), new_content.clone());
517                (
518                    Some(StreamYieldItem::DiffBlock(
519                        crate::models::ProcessedDiffBlock {
520                            llm_file_path: patch.llm_file_path.clone(),
521                            unified_diff: diff,
522                        },
523                    )),
524                    warnings,
525                )
526            } else {
527                warnings.push(format!(
528                    "The SEARCH block from the AI could not be found in '{}'. Patch skipped.",
529                    path
530                ));
531
532                (
533                    Some(StreamYieldItem::Unparsed(crate::models::UnparsedBlock {
534                        text: patch.raw_block.clone(),
535                    })),
536                    warnings,
537                )
538            }
539        } else {
540            warnings.push(format!(
541                "File '{}' from the AI does not match any file in context. Patch skipped.",
542                patch.llm_file_path
543            ));
544
545            (
546                Some(StreamYieldItem::Unparsed(crate::models::UnparsedBlock {
547                    text: patch.raw_block.clone(),
548                })),
549                warnings,
550            )
551        }
552    }
553
    /// Flushes the stream: feeds `last_chunk`, drains every remaining item,
    /// and classifies any leftover buffer as either an `Unparsed` block (when
    /// it still looks like a partial marker) or plain `Text`.
    ///
    /// Returns the cumulative unified diff, the drained items, and warnings.
    pub fn finish(&mut self, last_chunk: &str) -> (String, Vec<StreamYieldItem>, Vec<String>) {
        // Process any final tokens received.
        self.feed(last_chunk);

        let mut items: Vec<_> = self.by_ref().collect();

        // Force flush if we are stuck waiting for a newline at EOF for a complete block
        if self.is_incomplete(&self.buffer)
            && self.buffer.contains("<<<<<<< SEARCH")
            && self.buffer.contains(">>>>>>> REPLACE")
        {
            self.buffer.push('\n');
            items.extend(self.by_ref());
        }

        // Anything remaining in the buffer is now considered a trailing segment.
        if !self.buffer.is_empty() {
            let looks_like_marker = self.is_incomplete(&self.buffer);

            if looks_like_marker {
                items.push(StreamYieldItem::Unparsed(UnparsedBlock {
                    text: self.buffer.clone(),
                }));
            } else {
                items.push(StreamYieldItem::Text(self.buffer.clone()));
            }
            self.buffer.clear();
        }

        let diff = self.build_final_unified_diff();

        let warnings = self.collect_warnings(&items);

        (diff, items, warnings)
    }
589
590    pub fn collect_warnings(&self, items: &[StreamYieldItem]) -> Vec<String> {
591        items
592            .iter()
593            .filter_map(|i| match i {
594                StreamYieldItem::Warning(w) => Some(w.text.clone()),
595                _ => None,
596            })
597            .collect()
598    }
599
600    /// Processes a list of raw yields, resolving any Patch items into DiffBlocks or Warnings.
601    pub fn process_yields(
602        &mut self,
603        items: Vec<StreamYieldItem>,
604        session_root: &Path,
605    ) -> Vec<StreamYieldItem> {
606        let mut processed = Vec::with_capacity(items.len());
607        for item in items {
608            if let StreamYieldItem::Patch(ref patch) = item {
609                let (resolved, warnings) = self.handle_patch(patch, session_root);
610                for w in warnings {
611                    processed.push(StreamYieldItem::Warning(crate::models::WarningMessage {
612                        text: w,
613                    }));
614                }
615                if let Some(res) = resolved {
616                    processed.push(res);
617                }
618            } else {
619                processed.push(item);
620            }
621        }
622        processed
623    }
624
625    pub fn build_final_unified_diff(&self) -> String {
626        let mut diffs = String::new();
627        let keys: std::collections::BTreeSet<_> = self
628            .discovered_baseline
629            .keys()
630            .chain(self.overlay.keys())
631            .collect();
632
633        for k in keys {
634            let old = self
635                .discovered_baseline
636                .get(k)
637                .map(|s| s.as_str())
638                .or_else(|| self.baseline.get(k).map(|s| s.as_str()));
639            let new = self.overlay.get(k).map(|s| s.as_str());
640
641            if old != new {
642                let d = generate_diff(k, old, new);
643                diffs.push_str(&d);
644            }
645        }
646        diffs
647    }
648
    /// Maps an LLM-supplied path onto a known session file.
    ///
    /// Returns `(warning, target)` where `target` is `Some((path, content))`
    /// when the patch may proceed; `content` is `Some` only when the file was
    /// discovered on disk outside the session context (carrying its current
    /// contents). `(None, None)` means the path could not be resolved.
    fn resolve_path(
        &self,
        llm_path: &str,
        root: &Path,
        search_block: &str,
    ) -> (Option<String>, Option<(String, Option<String>)>) {
        // Already known to the session (streamed overlay or baseline).
        if self.overlay.contains_key(llm_path) || self.baseline.contains_key(llm_path) {
            return (None, Some((llm_path.to_string(), None)));
        }
        // Empty SEARCH block: accept the path as-is.
        // NOTE(review): presumably this supports new-file creation — confirm.
        if search_block.trim().is_empty() {
            return (None, Some((llm_path.to_string(), None)));
        }
        // Fall back to disk, but only within the session root (canonicalized
        // to defeat `..` traversal) and only if readable as UTF-8 text.
        let abs_path = root.join(llm_path);
        if abs_path.exists()
            && let Ok(canon) = abs_path.canonicalize()
            && let Ok(root_canon) = root.canonicalize()
            && canon.starts_with(root_canon)
            && let Ok(content) = std::fs::read_to_string(&abs_path)
        {
            let msg = format!(
                "File '{}' was not in the session context but was found on disk.",
                llm_path
            );
            return (Some(msg), Some((llm_path.to_string(), Some(content))));
        }
        (None, None)
    }
676}
677
/// Returns the byte length of a leading line ending in `s`: 2 for "\r\n",
/// 1 for a bare "\n", and 0 when `s` does not start with a line break
/// (a lone "\r" is not treated as a line ending).
fn consume_line_ending(s: &str) -> usize {
    match s.as_bytes() {
        [b'\r', b'\n', ..] => 2,
        [b'\n', ..] => 1,
        _ => 0,
    }
}
687
/// Locates `marker` at or after `start_pos` in `chunk`, accepting it only
/// when it sits on its own line with exactly `expected_indent` before it and
/// nothing but non-newline whitespace (e.g. a stray `\r` from CRLF input)
/// after it.
///
/// Returns `(line_start, line_end)` of the marker line, where `line_end` is
/// the index of the terminating `\n` (or `chunk.len()` if unterminated).
fn find_marker_with_indent(
    chunk: &str,
    marker: &str,
    start_pos: usize,
    expected_indent: &str,
) -> Option<(usize, usize)> {
    let mut from = start_pos;
    while let Some(rel) = chunk[from..].find(marker) {
        let hit = from + rel;
        let line_start = match chunk[..hit].rfind('\n') {
            Some(nl) => nl + 1,
            None => 0,
        };
        let marker_end = hit + marker.len();
        if &chunk[line_start..hit] == expected_indent {
            let line_end = chunk[marker_end..]
                .find('\n')
                .map_or(chunk.len(), |off| marker_end + off);
            // Only \n terminates the line; any other whitespace may trail.
            let trailing_ok = chunk[marker_end..line_end]
                .chars()
                .all(|c| c.is_whitespace() && c != '\n');
            if trailing_ok {
                return Some((line_start, line_end));
            }
        }
        from = marker_end;
    }
    None
}