// aico/diffing/parser.rs
1use crate::diffing::diff_utils::generate_diff;
2use crate::diffing::patching::create_patched_content;
3use crate::models::{StreamYieldItem, UnparsedBlock};
4use regex::Regex;
5use std::collections::HashMap;
6use std::path::Path;
7use std::sync::LazyLock;
8
/// Incremental parser for an LLM output stream that interleaves markdown
/// text, `File:` headers, and `<<<<<<< SEARCH` / `>>>>>>> REPLACE` blocks.
///
/// Feed chunks via `feed`/`feed_complete` and drain results through the
/// `Iterator` implementation; patches are applied against `baseline` plus
/// the in-stream `overlay`.
pub struct StreamParser<'a> {
    /// Raw stream text that has been fed but not yet consumed.
    buffer: String,
    /// Path from the most recent `File:` header, when inside a file section.
    current_file: Option<String>,
    /// Queue for items found during parsing that are waiting to be yielded.
    yield_queue: std::collections::VecDeque<StreamYieldItem>,
    /// Baseline contents provided by the session.
    baseline: &'a HashMap<String, String>,
    /// Overlay of files modified during this stream.
    overlay: HashMap<String, String>,
    /// Maps filenames to their content pre-modification in this stream.
    discovered_baseline: HashMap<String, String>,
    /// Tracks if the last yielded character was a newline.
    /// Used to enforce line-start anchors for headers.
    last_char_was_newline: bool,
}
24
impl<'a> StreamParser<'a> {
    /// Returns a copy of the raw stream text fed but not yet consumed.
    pub fn get_pending_content(&self) -> String {
        self.buffer.clone()
    }

    /// Returns `true` when the buffered tail is safe to display, i.e. it can
    /// no longer turn into a `File:` header or (inside a file context) a
    /// `<<<<<<< SEARCH` marker the parser would need to hold back.
    pub fn is_pending_displayable(&self) -> bool {
        let pending = &self.buffer;
        if pending.is_empty() {
            return false;
        }

        // The tail is "at a line start" if the buffer itself contains a
        // newline, or the last yielded character was a newline.
        let tail_is_at_line_start = if pending.rfind('\n').is_some() {
            true
        } else {
            self.last_char_was_newline
        };

        // Mid-line tails can never begin a line-anchored marker.
        if !tail_is_at_line_start {
            return true;
        }

        let last_line = pending.split('\n').next_back().unwrap_or("");
        let trimmed = last_line.trim_start();

        // 1. GLOBAL CHECK: File Headers
        // Block if the tail looks like the start of a "File:" line.
        if !trimmed.is_empty()
            && ("File:".starts_with(trimmed)
                || (trimmed.starts_with("File:") && !pending.ends_with('\n')))
        {
            return false;
        }

        // 2. CONTEXT CHECK: Diff Markers
        // We only care about diff markers if we are actively inside a file context.
        if self.current_file.is_some() {
            // A. Body Check: Are we buffering a block?
            // If the buffer contains the start marker, we are inside a block (or waiting for it to close).
            // We must hold back everything until the parser consumes it.
            if pending.contains("<<<<<<< SEARCH") {
                return false;
            }

            // B. Tail Check: Is a block starting right now?
            // We ONLY need to check for the start marker.
            // (We don't check for ======= or >>>>>>> because if we see those WITHOUT
            // the start marker in the body check above, they are just text).
            if !trimmed.is_empty() && "<<<<<<< SEARCH".starts_with(trimmed) {
                return false;
            }
        }

        true
    }

    /// Creates a parser over the session's baseline file contents.
    pub fn new(original_contents: &'a HashMap<String, String>) -> Self {
        Self {
            buffer: String::new(),
            current_file: None,
            yield_queue: std::collections::VecDeque::new(),
            baseline: original_contents,
            overlay: HashMap::new(),
            discovered_baseline: HashMap::new(),
            // Start of stream is treated as start of a line
            last_char_was_newline: true,
        }
    }

    /// Feeds a new chunk of text into the parser.
    /// Use the Iterator implementation (next()) to retrieve yielded items.
    pub fn feed(&mut self, chunk: &str) {
        self.buffer.push_str(chunk);
    }

    /// Feeds content ensuring a trailing newline for correct parsing of final blocks.
    pub fn feed_complete(&mut self, content: &str) {
        self.feed(content);
        if !content.ends_with('\n') {
            self.feed("\n");
        }
    }

    /// Convenience method to feed content and return resolved yields in one go.
    pub fn parse_and_resolve(&mut self, chunk: &str, session_root: &Path) -> Vec<StreamYieldItem> {
        self.feed(chunk);
        let raw_yields: Vec<_> = self.by_ref().collect();
        self.process_yields(raw_yields, session_root)
    }

    /// Centralized finalization logic to resolve any remaining buffer content,
    /// process patches, and build the final diff and structured display items.
    pub fn final_resolve(
        &mut self,
        session_root: &Path,
    ) -> (String, Vec<crate::models::DisplayItem>, Vec<String>) {
        // 1. Drain any items currently in the iterator/buffer
        let (_, raw_yields, _) = self.finish("");

        // 2. Resolve Patch items into DiffBlocks (and update overlay/discovered_baseline)
        let processed = self.process_yields(raw_yields, session_root);

        // 3. Collect final state
        let warnings = self.collect_warnings(&processed);
        let diff = self.build_final_unified_diff();
        let display_items = processed
            .into_iter()
            .filter_map(|y| y.to_display_item(true))
            .collect();

        (diff, display_items, warnings)
    }

    /// Records whether `item`'s trailing character is a newline so later
    /// line-anchored matching (headers/markers) stays correct.
    fn update_newline_state(&mut self, item: &StreamYieldItem) {
        match item {
            StreamYieldItem::Text(s) => self.last_char_was_newline = s.ends_with('\n'),
            StreamYieldItem::Unparsed(u) => self.last_char_was_newline = u.text.ends_with('\n'),
            StreamYieldItem::FileHeader(_) => self.last_char_was_newline = true, // Headers end with \n
            StreamYieldItem::Patch(p) => self.last_char_was_newline = p.raw_block.ends_with('\n'),
            StreamYieldItem::DiffBlock(d) => {
                self.last_char_was_newline = d.unified_diff.ends_with('\n')
            }
            StreamYieldItem::Warning(_) => {} // Metadata doesn't affect flow
            StreamYieldItem::IncompleteBlock(b) => self.last_char_was_newline = b.ends_with('\n'),
        }
    }

    /// Returns `true` if a header regex match is genuinely anchored at a line
    /// start, given the parser's knowledge of the buffer's leading edge.
    fn check_header_match(&self, m: regex::Match, start_of_buffer_is_start_of_line: bool) -> bool {
        // A match is valid if it starts at index > 0 (meaning a previous newline exists at m.start()-1)
        // OR if it starts at 0 and the parser state says we are at the start of a line.
        m.start() > 0 || start_of_buffer_is_start_of_line
    }
}
157
/// Matches a `File: <path>` header line, allowing leading spaces/tabs and a
/// CRLF terminator. `(?m)^` anchors at line starts within the buffer;
/// `check_header_match` additionally validates a match at offset 0.
static FILE_HEADER_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^(?P<line>[ \t]*File:[ \t]*(?P<path>.*?)\r?\n)").unwrap());
160
impl<'a> Iterator for StreamParser<'a> {
    type Item = StreamYieldItem;

    /// Pulls the next parsed item out of the buffered stream, or returns
    /// `None` when more input is required. Each loop pass tries, in order:
    /// queued items, in-file content, file headers, then plain text.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // 1. First, drain the pre-parsed queue
            if let Some(item) = self.yield_queue.pop_front() {
                self.update_newline_state(&item);
                return Some(item);
            }

            if self.buffer.is_empty() {
                return None;
            }

            // 2. If we are currently "inside" a file's content section
            if let Some(llm_file_path) = self.current_file.clone() {
                // Find next potential header to switch context
                let mut next_header_idx = self.buffer.len();
                for m in FILE_HEADER_RE.find_iter(&self.buffer) {
                    // Use helper to check validity against current line state
                    if self.check_header_match(m, self.last_char_was_newline) {
                        next_header_idx = m.start();
                        break;
                    }
                }

                // Process content UP TO that header (or end of buffer)
                let chunk_limit = next_header_idx;
                if chunk_limit > 0 || (chunk_limit == 0 && self.last_char_was_newline) {
                    let (chunk_items, consumed_bytes) =
                        self.process_file_chunk(&llm_file_path, &self.buffer[..chunk_limit]);

                    if consumed_bytes > 0 {
                        self.buffer.drain(..consumed_bytes);
                    }

                    if !chunk_items.is_empty() {
                        // Queue the items; the next loop pass yields them.
                        self.yield_queue.extend(chunk_items);
                        continue;
                    }

                    // If we made no progress but have a limit
                    if consumed_bytes == 0 {
                        // If the limit was a header, we are done with this file.
                        if next_header_idx < self.buffer.len() {
                            self.current_file = None;
                            continue;
                        } else {
                            // Valid content but incomplete block at the end. Wait for more data.
                            return None;
                        }
                    }
                } else {
                    // Start of buffer is a header.
                    self.current_file = None;
                    continue;
                }
            }

            // 3. Look for Global File Headers
            // We only look for headers if we are at a clean line start (managed by logic inside loop)
            if let Some(caps) = FILE_HEADER_RE.captures(&self.buffer) {
                let mat = caps.get(0).unwrap();
                if self.check_header_match(mat, self.last_char_was_newline) {
                    // Emit any text preceding the header first.
                    if mat.start() > 0 {
                        let text = self.buffer[..mat.start()].to_string();
                        self.buffer.drain(..mat.start());
                        let item = StreamYieldItem::Text(text);
                        self.update_newline_state(&item);
                        return Some(item);
                    }

                    // Strip surrounding markdown decoration (`*`, backticks)
                    // the model sometimes wraps paths in.
                    let path_str = caps
                        .name("path")
                        .unwrap()
                        .as_str()
                        .trim()
                        .trim_matches(|c| c == '*' || c == '`')
                        .to_string();
                    self.current_file = Some(path_str.clone());
                    self.buffer.drain(..mat.end());
                    let item = StreamYieldItem::FileHeader(crate::models::FileHeader {
                        llm_file_path: path_str,
                    });
                    self.update_newline_state(&item);
                    return Some(item);
                }
                // Invalid match (e.g. " File:" mid-line). Treat as text.
            }

            // 4. Handle remaining buffer as Markdown Text
            let text = &self.buffer;
            let mut limit = text.len();

            // Always truncate at the first valid header found inside the buffer
            for m in FILE_HEADER_RE.find_iter(text) {
                if self.check_header_match(m, self.last_char_was_newline) {
                    limit = m.start();
                    break;
                }
            }

            // Always truncate at the first valid diff marker found inside the buffer
            if let Some(search_idx) = text[..limit].find("<<<<<<< SEARCH") {
                // Back up to the start of the marker's line (to keep indent).
                let ls = text[..search_idx].rfind('\n').map(|i| i + 1).unwrap_or(0);
                if ls > 0 || self.last_char_was_newline {
                    limit = limit.min(ls);
                }
            }

            // If the remaining part is incomplete, we must truncate further
            if self.is_incomplete(&text[..limit]) {
                // Hold back only the last (possibly-partial) line.
                if let Some(last_newline) = text[..limit].rfind('\n') {
                    limit = last_newline + 1;
                } else {
                    limit = 0;
                }
            }

            if limit > 0 {
                let text_yield = self.buffer[..limit].to_string();
                self.buffer.drain(..limit);
                let item = StreamYieldItem::Text(text_yield);
                self.update_newline_state(&item);
                return Some(item);
            }

            return None;
        }
    }
}
293
impl<'a> StreamParser<'a> {
    /// Heuristic: does `text` end in something that could still grow into a
    /// `File:` header or a diff marker? If so the tail must be withheld.
    fn is_incomplete(&self, text: &str) -> bool {
        // We need to determine the last line content
        let last_line = match text.rfind('\n') {
            Some(idx) => &text[idx + 1..],
            None => {
                // If we are mid-stream and haven't seen a newline, we can't start a line-based marker
                if !self.last_char_was_newline {
                    return false;
                }
                text
            }
        };

        // 1. Whitespace Gate:
        // If the tail is pure whitespace, it might be indentation for a marker.
        // We must wait for non-whitespace or newline.
        if !last_line.is_empty() && last_line.chars().all(|c| c.is_whitespace()) {
            return true;
        }

        let trimmed = last_line.trim_start();

        // 2. File Header Partial Check
        if !trimmed.is_empty()
            && ("File:".starts_with(trimmed)
                || (trimmed.starts_with("File:") && !text.ends_with('\n')))
        {
            return true;
        }

        // 3. Diff Marker Partial Check
        // If we are anticipating a marker in the current file context, any prefix resemblance blocks.
        if self.current_file.is_some() && !trimmed.is_empty() {
            if "<<<<<<< SEARCH".starts_with(trimmed) {
                return true;
            }
            if "=======".starts_with(trimmed) {
                return true;
            }
            if ">>>>>>> REPLACE".starts_with(trimmed) {
                return true;
            }
        }

        // 4. Unclosed Block Check
        if self.current_file.is_some()
            && let Some(idx) = text.find("<<<<<<< SEARCH")
        {
            let line_start = text[..idx].rfind('\n').map(|i| i + 1).unwrap_or(0);
            if line_start == 0 && !text.contains('\n') && !self.last_char_was_newline {
                // Invalid mid-line marker
            } else {
                // A line-anchored SEARCH without its closing REPLACE means the
                // block is still streaming in.
                let indent = &text[line_start..idx];
                if indent.chars().all(|c| c.is_whitespace()) && !text.contains(">>>>>>> REPLACE") {
                    return true;
                }
            }
        }

        false
    }

    /// Scans `chunk` (content belonging to `llm_path`) for complete
    /// SEARCH/=======/REPLACE blocks, emitting `Text` and `Patch` items.
    /// Returns the items plus the number of bytes actually consumed; bytes
    /// past the consumed count belong to an incomplete block and stay buffered.
    fn process_file_chunk(&self, llm_path: &str, chunk: &str) -> (Vec<StreamYieldItem>, usize) {
        let mut items = Vec::new();
        let mut cursor = 0;
        let search_pattern = "<<<<<<< SEARCH";
        let sep_pattern = "=======";
        let replace_pattern = ">>>>>>> REPLACE";

        while cursor < chunk.len() {
            let search_idx = match chunk[cursor..].find(search_pattern) {
                Some(i) => cursor + i,
                None => break,
            };

            // Capture indentation from the start of the line up to the marker
            let line_start = chunk[..search_idx].rfind('\n').map(|i| i + 1).unwrap_or(0);
            let indent_slice = &chunk[line_start..search_idx];

            // Verify indent consists only of whitespace
            if !indent_slice.chars().all(|c| c.is_whitespace()) {
                // If it's not a marker at the start of a line, skip it
                // (advance one byte past the '<' so the scan can continue).
                items.push(StreamYieldItem::Text(
                    chunk[cursor..search_idx + 1].to_string(),
                ));
                cursor = search_idx + 1;
                continue;
            }

            let block_search_start = search_idx + search_pattern.len();
            let block_search_start_content =
                block_search_start + consume_line_ending(&chunk[block_search_start..]);

            // Missing separator: the block is incomplete. Backtrack to the
            // marker's line start so the remainder stays buffered.
            let (sep_line_start, sep_line_end) =
                match find_marker_with_indent(chunk, sep_pattern, block_search_start, indent_slice)
                {
                    Some(pair) => pair,
                    None => {
                        let backtrack_pos = line_start.max(cursor);
                        if backtrack_pos > cursor {
                            items.push(StreamYieldItem::Text(
                                chunk[cursor..backtrack_pos].to_string(),
                            ));
                        }
                        return (items, backtrack_pos);
                    }
                };

            let block_replace_start_content =
                sep_line_end + consume_line_ending(&chunk[sep_line_end..]);

            // Missing REPLACE terminator: same backtracking treatment.
            let (replace_line_start, _replace_line_end) =
                match find_marker_with_indent(chunk, replace_pattern, sep_line_end, indent_slice) {
                    Some(pair) => pair,
                    None => {
                        let backtrack_pos = line_start.max(cursor);
                        if backtrack_pos > cursor {
                            items.push(StreamYieldItem::Text(
                                chunk[cursor..backtrack_pos].to_string(),
                            ));
                        }
                        return (items, backtrack_pos);
                    }
                };

            // Emit any plain text that preceded the block.
            if search_idx > cursor {
                items.push(StreamYieldItem::Text(chunk[cursor..search_idx].to_string()));
            }

            let final_end = replace_line_start + indent_slice.len() + replace_pattern.len();

            // Trim a trailing '\r' so CRLF input yields the same content as LF.
            let mut search_content = &chunk[block_search_start_content..sep_line_start];
            if search_content.ends_with('\r') {
                search_content = &search_content[..search_content.len() - 1];
            }

            let mut replace_content = &chunk[block_replace_start_content..replace_line_start];
            if replace_content.ends_with('\r') {
                replace_content = &replace_content[..replace_content.len() - 1];
            }

            items.push(StreamYieldItem::Patch(crate::models::AIPatch {
                llm_file_path: llm_path.to_string(),
                search_content: search_content.to_string(),
                replace_content: replace_content.to_string(),
                indent: indent_slice.to_string(),
                raw_block: chunk[search_idx..final_end].to_string(),
            }));

            cursor = final_end;
        }

        // Emit the trailing text only if it cannot still become a marker.
        if cursor < chunk.len() {
            let tail = &chunk[cursor..];
            if !self.is_incomplete(tail) {
                items.push(StreamYieldItem::Text(tail.to_string()));
                cursor = chunk.len();
            }
        }

        (items, cursor)
    }

    /// Resolves a parsed patch against the known file contents and applies it.
    /// Returns the resulting item (a `DiffBlock` on success, the raw block as
    /// `Unparsed` on failure) plus any warnings produced along the way.
    ///
    /// NOTE(review): `_root` is actually used (passed to `resolve_path`); the
    /// leading underscore in its name is misleading.
    pub fn handle_patch(
        &mut self,
        patch: &crate::models::AIPatch,
        _root: &Path,
    ) -> (Option<StreamYieldItem>, Vec<String>) {
        let mut warnings = Vec::new();

        let resolution = self.resolve_path(&patch.llm_file_path, _root, &patch.search_content);

        if let Some(w) = resolution.0 {
            warnings.push(w.clone());
        }

        if let Some((path, fallback)) = resolution.1 {
            // File was discovered on disk: seed overlay and baseline snapshot.
            if let Some(fb) = fallback {
                self.overlay
                    .entry(path.clone())
                    .or_insert_with(|| fb.clone());
                self.discovered_baseline.entry(path.clone()).or_insert(fb);
            }

            // Latest overlay content wins over the session baseline.
            let original = self
                .overlay
                .get(&path)
                .map(|s| s.as_str())
                .or_else(|| self.baseline.get(&path).map(|s| s.as_str()))
                .unwrap_or("");

            let mut applied = None;

            // Attempt 1: Exact match
            if let Some(res) =
                create_patched_content(original, &patch.search_content, &patch.replace_content)
            {
                applied = Some(res);
            }
            // Attempt 2: Strip \r if patch has it (LLM output \r\n) but file might have \n
            else if patch.search_content.contains('\r') {
                let search_normalized = patch.search_content.replace('\r', "");
                if let Some(res) =
                    create_patched_content(original, &search_normalized, &patch.replace_content)
                {
                    applied = Some(res);
                }
            }

            if let Some(new_content) = applied {
                let diff = generate_diff(&path, Some(original), Some(&new_content));
                self.overlay.insert(path.clone(), new_content.clone());
                (
                    Some(StreamYieldItem::DiffBlock(
                        crate::models::ProcessedDiffBlock {
                            llm_file_path: patch.llm_file_path.clone(),
                            unified_diff: diff,
                        },
                    )),
                    warnings,
                )
            } else {
                warnings.push(format!(
                    "The SEARCH block from the AI could not be found in '{}'. Patch skipped.",
                    path
                ));

                (
                    Some(StreamYieldItem::Unparsed(crate::models::UnparsedBlock {
                        text: patch.raw_block.clone(),
                    })),
                    warnings,
                )
            }
        } else {
            warnings.push(format!(
                "File '{}' from the AI does not match any file in context. Patch skipped.",
                patch.llm_file_path
            ));

            (
                Some(StreamYieldItem::Unparsed(crate::models::UnparsedBlock {
                    text: patch.raw_block.clone(),
                })),
                warnings,
            )
        }
    }

    /// Ends the stream: feeds the last chunk, drains all remaining items, and
    /// converts whatever is left in the buffer into a trailing `Text` or
    /// `Unparsed` item. Returns (final unified diff, items, warnings).
    pub fn finish(&mut self, last_chunk: &str) -> (String, Vec<StreamYieldItem>, Vec<String>) {
        // Process any final tokens received.
        self.feed(last_chunk);

        // Force flush if we are stuck waiting for a newline at EOF for a complete block
        if self.is_incomplete(&self.buffer)
            && self.buffer.contains("<<<<<<< SEARCH")
            && self.buffer.contains(">>>>>>> REPLACE")
        {
            self.buffer.push('\n');
        }

        let mut items: Vec<_> = self.by_ref().collect();

        // Anything remaining in the buffer is now considered a trailing segment.
        if !self.buffer.is_empty() {
            let looks_like_marker = self.is_incomplete(&self.buffer);

            if looks_like_marker {
                items.push(StreamYieldItem::Unparsed(UnparsedBlock {
                    text: self.buffer.clone(),
                }));
            } else {
                items.push(StreamYieldItem::Text(self.buffer.clone()));
            }
            self.buffer.clear();
        }

        let diff = self.build_final_unified_diff();

        let warnings = self.collect_warnings(&items);

        (diff, items, warnings)
    }

    /// Extracts the text of every `Warning` item from `items`.
    pub fn collect_warnings(&self, items: &[StreamYieldItem]) -> Vec<String> {
        items
            .iter()
            .filter_map(|i| match i {
                StreamYieldItem::Warning(w) => Some(w.text.clone()),
                _ => None,
            })
            .collect()
    }

    /// Processes a list of raw yields, resolving any Patch items into DiffBlocks or Warnings.
    pub fn process_yields(
        &mut self,
        items: Vec<StreamYieldItem>,
        session_root: &Path,
    ) -> Vec<StreamYieldItem> {
        let mut processed = Vec::with_capacity(items.len());
        for item in items {
            if let StreamYieldItem::Patch(ref patch) = item {
                let (resolved, warnings) = self.handle_patch(patch, session_root);
                // Warnings are emitted before the resolved item they describe.
                for w in warnings {
                    processed.push(StreamYieldItem::Warning(crate::models::WarningMessage {
                        text: w,
                    }));
                }
                if let Some(res) = resolved {
                    processed.push(res);
                }
            } else {
                processed.push(item);
            }
        }
        processed
    }

    /// Builds one concatenated unified diff covering every file touched this
    /// stream. Keys are sorted (BTreeSet) so output order is deterministic.
    pub fn build_final_unified_diff(&self) -> String {
        let mut diffs = String::new();
        let keys: std::collections::BTreeSet<_> = self
            .discovered_baseline
            .keys()
            .chain(self.overlay.keys())
            .collect();

        for k in keys {
            // Prefer the pre-modification snapshot; fall back to the session baseline.
            let old = self
                .discovered_baseline
                .get(k)
                .map(|s| s.as_str())
                .or_else(|| self.baseline.get(k).map(|s| s.as_str()));
            let new = self.overlay.get(k).map(|s| s.as_str());

            if old != new {
                let d = generate_diff(k, old, new);
                diffs.push_str(&d);
            }
        }
        diffs
    }

    /// Maps an LLM-provided path to a known file. Returns
    /// `(optional warning, optional (path, optional on-disk content))`:
    /// known paths resolve directly; unknown paths with a non-empty search
    /// block may be loaded from disk if they exist inside `root`.
    fn resolve_path(
        &self,
        llm_path: &str,
        root: &Path,
        search_block: &str,
    ) -> (Option<String>, Option<(String, Option<String>)>) {
        if self.overlay.contains_key(llm_path) || self.baseline.contains_key(llm_path) {
            return (None, Some((llm_path.to_string(), None)));
        }
        // An empty search block (whole-file replace) is accepted without a disk lookup.
        if search_block.trim().is_empty() {
            return (None, Some((llm_path.to_string(), None)));
        }
        // Canonicalize to ensure the path cannot escape the session root.
        let abs_path = root.join(llm_path);
        if abs_path.exists()
            && let Ok(canon) = abs_path.canonicalize()
            && let Ok(root_canon) = root.canonicalize()
            && canon.starts_with(root_canon)
            && let Ok(content) = std::fs::read_to_string(&abs_path)
        {
            let msg = format!(
                "File '{}' was not in the session context but was found on disk.",
                llm_path
            );
            return (Some(msg), Some((llm_path.to_string(), Some(content))));
        }
        (None, None)
    }
}
666
/// Number of bytes taken up by a line ending at the start of `s`:
/// 2 for "\r\n", 1 for "\n", 0 otherwise (including a lone '\r').
fn consume_line_ending(s: &str) -> usize {
    match s.as_bytes() {
        [b'\r', b'\n', ..] => 2,
        [b'\n', ..] => 1,
        _ => 0,
    }
}
676
/// Scans `chunk` from `start_pos` for a line of the exact form
/// `<expected_indent><marker>[optional trailing whitespace]`.
///
/// Returns `Some((line_start, line_end))` where `line_start` is the byte
/// offset of the matching line's first character and `line_end` the offset of
/// its terminating '\n' (or `chunk.len()` if the line is unterminated).
/// Returns `None` when no such line exists.
fn find_marker_with_indent(
    chunk: &str,
    marker: &str,
    start_pos: usize,
    expected_indent: &str,
) -> Option<(usize, usize)> {
    let mut from = start_pos;
    while let Some(rel) = chunk[from..].find(marker) {
        let hit = from + rel;
        let marker_end = hit + marker.len();
        // Walk back to the beginning of the line containing the hit.
        let line_begin = match chunk[..hit].rfind('\n') {
            Some(nl) => nl + 1,
            None => 0,
        };
        if chunk[line_begin..hit] == *expected_indent {
            // The line ends at the next '\n' past the marker, or at EOF.
            let line_close = chunk[marker_end..]
                .find('\n')
                .map_or(chunk.len(), |off| marker_end + off);
            // Anything after the marker may only be whitespace (e.g. a '\r'
            // on CRLF lines); '\n' is excluded so it stays the terminator.
            let tail_is_blank = chunk[marker_end..line_close]
                .chars()
                .all(|c| c.is_whitespace() && c != '\n');
            if tail_is_blank {
                return Some((line_begin, line_close));
            }
        }
        from = marker_end;
    }
    None
}