Skip to main content

semantic/merge_driver/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Function-level three-way merge driver.
3//!
4//! Decomposes a parseable source file into AST-defined items, merges each item
5//! independently, and falls back to `heddle-merge::text_hunk_merge` on items
6//! that can't be resolved structurally — and on the entire file when the
7//! parser declines.
8//!
9//! See `docs/design/semantic-merge-function-level.md` for the contract.
10
11use std::path::Path;
12
13use merge::{ConflictMarkers, MergeOutcome, text_hunk_merge_with_markers};
14
15use crate::parser::{Language, ParsedFile};
16
17mod items;
18mod language_rules;
19mod reconstruct;
20
21#[cfg(test)]
22mod tests;
23
24use items::segment_file;
25use reconstruct::reconstruct_merged_file;
26
27/// Three-way merge of `base`, `ours`, `theirs` using AST-defined item boundaries
28/// when the parser accepts all three sides, falling back to
29/// [`text_hunk_merge_with_markers`] otherwise.
30///
31/// The `path` is used for language detection only; it does NOT need to exist
32/// on disk.
33pub fn semantic_three_way_merge(
34    base: &[u8],
35    ours: &[u8],
36    theirs: &[u8],
37    path: &Path,
38    markers: ConflictMarkers<'_>,
39) -> MergeOutcome {
40    if base == ours && base == theirs {
41        return MergeOutcome::Clean(base.to_vec());
42    }
43    if base == ours {
44        return MergeOutcome::Clean(theirs.to_vec());
45    }
46    if base == theirs {
47        return MergeOutcome::Clean(ours.to_vec());
48    }
49    if ours == theirs {
50        return MergeOutcome::Clean(ours.to_vec());
51    }
52
53    let language = Language::from_path(path);
54    if matches!(language, Language::Unknown) {
55        return text_hunk_merge_with_markers(base, ours, theirs, markers);
56    }
57
58    let (Ok(base_text), Ok(ours_text), Ok(theirs_text)) = (
59        std::str::from_utf8(base),
60        std::str::from_utf8(ours),
61        std::str::from_utf8(theirs),
62    ) else {
63        return text_hunk_merge_with_markers(base, ours, theirs, markers);
64    };
65
66    let (Some(base_parsed), Some(ours_parsed), Some(theirs_parsed)) = (
67        ParsedFile::parse(base_text, language),
68        ParsedFile::parse(ours_text, language),
69        ParsedFile::parse(theirs_text, language),
70    ) else {
71        return text_hunk_merge_with_markers(base, ours, theirs, markers);
72    };
73
74    let mut base_segments = segment_file(&base_parsed);
75    let mut ours_segments = segment_file(&ours_parsed);
76    let mut theirs_segments = segment_file(&theirs_parsed);
77
78    // Rekey `use` items so declarations whose expanded leaf sets intersect
79    // on ANY path collide for cross-side matching (heddle#468; Codex r2 on
80    // PR #477). Must run before the empty-base add/add guard below and
81    // before reconstruction, both of which key off `Item`/`ItemKey`.
82    items::canonicalize_use_keys(&mut base_segments, &mut ours_segments, &mut theirs_segments);
83
84    // When a side has zero parseable items but the others do, the
85    // per-item alignment has nothing to anchor on for that side and
86    // its contiguous content can't be cleanly split across the other
87    // sides' per-item segments — the surrounding preamble/postamble
88    // merges either drop the side's edits (Codex r2 P1 #3) or
89    // double-emit its bridging content. text_hunk_merge handles the
90    // full-file alignment without those artifacts, so route this
91    // shape through it.
92    //
93    // EXCEPTION: empty base with both sides adding items that share
94    // keys (add/add). text_hunk_merge concatenates both insertions
95    // at the same anchor and silently produces duplicate definitions;
96    // `resolve_item`'s add/add arm is the only path that surfaces this
97    // as a conflict. Drop through to the reconstruct path in that case
98    // so the conflict is reported (Codex r3 P1 #1).
99    let counts = [
100        base_segments.items.len(),
101        ours_segments.items.len(),
102        theirs_segments.items.len(),
103    ];
104    if counts.contains(&0) && counts.iter().any(|&c| c > 0) {
105        let addadd_in_empty_base = base_segments.items.is_empty() && {
106            let ours_keys: std::collections::BTreeSet<_> =
107                ours_segments.items.iter().map(|i| &i.key).collect();
108            theirs_segments
109                .items
110                .iter()
111                .any(|i| ours_keys.contains(&i.key))
112        };
113        if !addadd_in_empty_base {
114            return text_hunk_merge_with_markers(base, ours, theirs, markers);
115        }
116    }
117
118    let outcome = reconstruct_merged_file(
119        base_text,
120        ours_text,
121        theirs_text,
122        &base_segments,
123        &ours_segments,
124        &theirs_segments,
125        markers,
126    );
127
128    // Input-grounded safety net (heddle#490 P3 floor). The tree model makes
129    // silent structural collapse impossible *by construction*, but a cheap
130    // conservation check against the INPUTS — not the merge's own resolved
131    // metadata — is kept as defense-in-depth: if a CLEAN merge ever fails to
132    // re-parse or invents an item/nesting no input had, route to the textual
133    // path instead of emitting the corruption.
134    //
135    // The floor also guards CONFLICT outputs (heddle#490 r6). The clean-only
136    // check could not see a malformed body that ships *alongside* a real
137    // conflict: a divergent container header plus an empty-base both-sides-add
138    // emitted a duplicated opening `{` (so `{`/`}` no longer balance) while the
139    // outcome was `Conflicts`, and a conflict skipped the clean floor — the
140    // malformed markers shipped silently. A conflict the user resolves must
141    // still be well-formed: resolving the markers to EITHER side must yield a
142    // file that re-parses. If a resolution is unparseable (an unbalanced /
143    // duplicated delimiter), route to the textual fallback, whose markers are
144    // well-formed by construction. Part 1 (structural body delimiter) makes
145    // this hold by construction; this guard keeps a future weave regression
146    // from re-shipping the class silently.
147    match &outcome {
148        MergeOutcome::Clean(output) => {
149            if !conserves_inputs(output, language, &base_parsed, &ours_parsed, &theirs_parsed) {
150                return text_hunk_merge_with_markers(base, ours, theirs, markers);
151            }
152        }
153        MergeOutcome::Conflicts {
154            merged_bytes_with_markers,
155            ..
156        } => {
157            if !conflict_well_formed(merged_bytes_with_markers, language) {
158                return text_hunk_merge_with_markers(base, ours, theirs, markers);
159            }
160        }
161        MergeOutcome::Binary | MergeOutcome::DeleteVsModify => {}
162    }
163    outcome
164}
165
166/// Whether a conflict output is structurally well-formed: resolving its markers
167/// to EITHER side independently yields a file that re-parses.
168///
169/// Reuses the clean floor's re-parse signal ([`ParsedFile::parse`] returns
170/// `None` on a tree with errors) so both floors close the SAME class with the
171/// same mechanism. The duplicate-delimiter corruption (heddle#490 r6) leaves a
172/// resolved side with an unbalanced `{`/`}`, which fails to parse and is caught
173/// here. If the markers themselves are malformed (unbalanced
174/// `<<<<<<< / ======= / >>>>>>>` nesting) the resolver returns `None`, which is
175/// likewise treated as not-well-formed.
176fn conflict_well_formed(output: &[u8], language: Language) -> bool {
177    let Ok(text) = std::str::from_utf8(output) else {
178        return false;
179    };
180    let Some((ours, theirs)) = resolve_conflict_sides(text) else {
181        return false;
182    };
183    ParsedFile::parse(ours.as_str(), language).is_some()
184        && ParsedFile::parse(theirs.as_str(), language).is_some()
185}
186
187/// Resolve conflict-marked `text` into its two independent sides: the
188/// take-ours resolution (drop the `theirs` hunks + markers) and the take-theirs
189/// resolution (drop the `ours` hunks + markers). Mirrors the marker shape
190/// emitted by [`merge::markers`] / `reconstruct::emit_addadd_conflict`:
191/// `<<<<<<< <label>` … `=======` … `>>>>>>> <label>`.
192///
193/// Returns `None` when the markers are malformed — a `=======` outside an open
194/// `<<<<<<<`, a `>>>>>>>` outside an open `=======`, a nested `<<<<<<<`, or an
195/// unterminated block at end of input — so a structurally broken conflict is
196/// itself surfaced as not-well-formed.
197fn resolve_conflict_sides(text: &str) -> Option<(String, String)> {
198    enum State {
199        Normal,
200        Ours,
201        Theirs,
202    }
203    let mut ours = String::new();
204    let mut theirs = String::new();
205    let mut state = State::Normal;
206    for line in text.split_inclusive('\n') {
207        let marker = conflict_marker(line);
208        if matches!(marker, Some(ConflictMarker::Start)) {
209            match state {
210                State::Normal => state = State::Ours,
211                _ => return None,
212            }
213        } else if matches!(marker, Some(ConflictMarker::Separator)) {
214            match state {
215                State::Ours => state = State::Theirs,
216                _ => return None,
217            }
218        } else if matches!(marker, Some(ConflictMarker::End)) {
219            match state {
220                State::Theirs => state = State::Normal,
221                _ => return None,
222            }
223        } else {
224            match state {
225                State::Normal => {
226                    ours.push_str(line);
227                    theirs.push_str(line);
228                }
229                State::Ours => ours.push_str(line),
230                State::Theirs => theirs.push_str(line),
231            }
232        }
233    }
234    match state {
235        State::Normal => Some((ours, theirs)),
236        _ => None,
237    }
238}
239
240#[derive(Clone, Copy, Debug, Eq, PartialEq)]
241enum ConflictMarker {
242    Start,
243    Separator,
244    End,
245}
246
247fn conflict_marker(line: &str) -> Option<ConflictMarker> {
248    let body = line.strip_suffix('\n').unwrap_or(line);
249    let body = body.strip_suffix('\r').unwrap_or(body).trim_start();
250
251    if marker_body_matches(body, "<<<<<<<") {
252        Some(ConflictMarker::Start)
253    } else if marker_body_matches(body, "=======") {
254        Some(ConflictMarker::Separator)
255    } else if marker_body_matches(body, ">>>>>>>") {
256        Some(ConflictMarker::End)
257    } else {
258        None
259    }
260}
261
262fn marker_body_matches(body: &str, marker: &str) -> bool {
263    let Some(rest) = body.strip_prefix(marker) else {
264        return false;
265    };
266    rest.is_empty() || rest.starts_with(' ')
267}
268
269/// Whether a clean `output` conserves the structure of its inputs. Re-parses
270/// the output and checks, against the three inputs (re-segmented raw so `use`
271/// keys compare on the same footing as the output's):
272///
273/// 1. **Re-parse** — a clean merge that yields an unparseable file is a
274///    corruption (catches a collapse's unbalanced delimiters).
275/// 2. **Item-identity subset** — every `(scope, kind, name)` in the output
276///    must appear in some input; the merge may not invent an item or move one
277///    into a scope no contributing side gave it (catches mis-nesting / a child
278///    escaping its container).
279///
280/// Both checks are deletion-safe (a subset relation, not equality), so a
281/// legitimate clean merge with deletions passes; and edit-safe (they key on
282/// item identity, not line text), so a within-line edit that recombines bytes
283/// passes. The line-duplication class the harness pins (Bug 1's doubled
284/// postamble) is excluded by construction in the tree model and covered by the
285/// ported conformance tests, so it needs no production line-budget check.
286fn conserves_inputs(
287    output: &[u8],
288    language: Language,
289    base_parsed: &ParsedFile,
290    ours_parsed: &ParsedFile,
291    theirs_parsed: &ParsedFile,
292) -> bool {
293    use std::collections::BTreeSet;
294
295    let Ok(out_text) = std::str::from_utf8(output) else {
296        return false;
297    };
298    let Some(out_parsed) = ParsedFile::parse(out_text, language) else {
299        return false;
300    };
301
302    type Identity = (Vec<String>, items::ItemKind, String);
303    let collect = |seg: &items::FileSegments, set: &mut BTreeSet<Identity>| {
304        items::visit_items(&seg.items, &mut |i| {
305            set.insert((i.key.scope.clone(), i.key.kind, i.key.name.clone()));
306        });
307    };
308
309    let mut allowed: BTreeSet<Identity> = BTreeSet::new();
310    for parsed in [base_parsed, ours_parsed, theirs_parsed] {
311        collect(&segment_file(parsed), &mut allowed);
312    }
313    let mut got: BTreeSet<Identity> = BTreeSet::new();
314    collect(&segment_file(&out_parsed), &mut got);
315    got.is_subset(&allowed)
316}
317
318/// Strategy a merge call should use for content reconciliation.
319#[derive(Clone, Copy, Debug, PartialEq, Eq)]
320pub enum MergeStrategy {
321    /// Always use `heddle-merge::text_hunk_merge` on the whole file.
322    HunkOnly,
323    /// Try AST-defined item decomposition first; fall through to
324    /// `text_hunk_merge` for unparseable / unknown-language files.
325    Semantic,
326}
327
328/// Single entry point that dispatches on [`MergeStrategy`]. Provided so call
329/// sites that already thread a strategy enum don't have to branch themselves.
330pub fn three_way_merge(
331    base: &[u8],
332    ours: &[u8],
333    theirs: &[u8],
334    path: &Path,
335    markers: ConflictMarkers<'_>,
336    strategy: MergeStrategy,
337) -> MergeOutcome {
338    match strategy {
339        MergeStrategy::HunkOnly => text_hunk_merge_with_markers(base, ours, theirs, markers),
340        MergeStrategy::Semantic => semantic_three_way_merge(base, ours, theirs, path, markers),
341    }
342}
343
344pub use merge::{ConflictMarkers as MergeConflictMarkers, MergeOutcome as MergeDriverOutcome};