semantic/merge_driver/mod.rs
1// SPDX-License-Identifier: Apache-2.0
2//! Function-level three-way merge driver.
3//!
4//! Decomposes a parseable source file into AST-defined items, merges each item
5//! independently, and falls back to `heddle-merge::text_hunk_merge` on items
6//! that can't be resolved structurally — and on the entire file when the
7//! parser declines.
8//!
9//! See `docs/design/semantic-merge-function-level.md` for the contract.
10
11use std::path::Path;
12
13use merge::{ConflictMarkers, MergeOutcome, text_hunk_merge_with_markers};
14
15use crate::parser::{Language, ParsedFile};
16
17mod items;
18mod language_rules;
19mod reconstruct;
20
21#[cfg(test)]
22mod tests;
23
24use items::segment_file;
25use reconstruct::reconstruct_merged_file;
26
27/// Three-way merge of `base`, `ours`, `theirs` using AST-defined item boundaries
28/// when the parser accepts all three sides, falling back to
29/// [`text_hunk_merge_with_markers`] otherwise.
30///
31/// The `path` is used for language detection only; it does NOT need to exist
32/// on disk.
33pub fn semantic_three_way_merge(
34 base: &[u8],
35 ours: &[u8],
36 theirs: &[u8],
37 path: &Path,
38 markers: ConflictMarkers<'_>,
39) -> MergeOutcome {
40 if base == ours && base == theirs {
41 return MergeOutcome::Clean(base.to_vec());
42 }
43 if base == ours {
44 return MergeOutcome::Clean(theirs.to_vec());
45 }
46 if base == theirs {
47 return MergeOutcome::Clean(ours.to_vec());
48 }
49 if ours == theirs {
50 return MergeOutcome::Clean(ours.to_vec());
51 }
52
53 let language = Language::from_path(path);
54 if matches!(language, Language::Unknown) {
55 return text_hunk_merge_with_markers(base, ours, theirs, markers);
56 }
57
58 let (Ok(base_text), Ok(ours_text), Ok(theirs_text)) = (
59 std::str::from_utf8(base),
60 std::str::from_utf8(ours),
61 std::str::from_utf8(theirs),
62 ) else {
63 return text_hunk_merge_with_markers(base, ours, theirs, markers);
64 };
65
66 let (Some(base_parsed), Some(ours_parsed), Some(theirs_parsed)) = (
67 ParsedFile::parse(base_text, language),
68 ParsedFile::parse(ours_text, language),
69 ParsedFile::parse(theirs_text, language),
70 ) else {
71 return text_hunk_merge_with_markers(base, ours, theirs, markers);
72 };
73
74 let mut base_segments = segment_file(&base_parsed);
75 let mut ours_segments = segment_file(&ours_parsed);
76 let mut theirs_segments = segment_file(&theirs_parsed);
77
78 // Rekey `use` items so declarations whose expanded leaf sets intersect
79 // on ANY path collide for cross-side matching (heddle#468; Codex r2 on
80 // PR #477). Must run before the empty-base add/add guard below and
81 // before reconstruction, both of which key off `Item`/`ItemKey`.
82 items::canonicalize_use_keys(&mut base_segments, &mut ours_segments, &mut theirs_segments);
83
84 // When a side has zero parseable items but the others do, the
85 // per-item alignment has nothing to anchor on for that side and
86 // its contiguous content can't be cleanly split across the other
87 // sides' per-item segments — the surrounding preamble/postamble
88 // merges either drop the side's edits (Codex r2 P1 #3) or
89 // double-emit its bridging content. text_hunk_merge handles the
90 // full-file alignment without those artifacts, so route this
91 // shape through it.
92 //
93 // EXCEPTION: empty base with both sides adding items that share
94 // keys (add/add). text_hunk_merge concatenates both insertions
95 // at the same anchor and silently produces duplicate definitions;
96 // `resolve_item`'s add/add arm is the only path that surfaces this
97 // as a conflict. Drop through to the reconstruct path in that case
98 // so the conflict is reported (Codex r3 P1 #1).
99 let counts = [
100 base_segments.items.len(),
101 ours_segments.items.len(),
102 theirs_segments.items.len(),
103 ];
104 if counts.contains(&0) && counts.iter().any(|&c| c > 0) {
105 let addadd_in_empty_base = base_segments.items.is_empty() && {
106 let ours_keys: std::collections::BTreeSet<_> =
107 ours_segments.items.iter().map(|i| &i.key).collect();
108 theirs_segments
109 .items
110 .iter()
111 .any(|i| ours_keys.contains(&i.key))
112 };
113 if !addadd_in_empty_base {
114 return text_hunk_merge_with_markers(base, ours, theirs, markers);
115 }
116 }
117
118 let outcome = reconstruct_merged_file(
119 base_text,
120 ours_text,
121 theirs_text,
122 &base_segments,
123 &ours_segments,
124 &theirs_segments,
125 markers,
126 );
127
128 // Input-grounded safety net (heddle#490 P3 floor). The tree model makes
129 // silent structural collapse impossible *by construction*, but a cheap
130 // conservation check against the INPUTS — not the merge's own resolved
131 // metadata — is kept as defense-in-depth: if a CLEAN merge ever fails to
132 // re-parse or invents an item/nesting no input had, route to the textual
133 // path instead of emitting the corruption.
134 //
135 // The floor also guards CONFLICT outputs (heddle#490 r6). The clean-only
136 // check could not see a malformed body that ships *alongside* a real
137 // conflict: a divergent container header plus an empty-base both-sides-add
138 // emitted a duplicated opening `{` (so `{`/`}` no longer balance) while the
139 // outcome was `Conflicts`, and a conflict skipped the clean floor — the
140 // malformed markers shipped silently. A conflict the user resolves must
141 // still be well-formed: resolving the markers to EITHER side must yield a
142 // file that re-parses. If a resolution is unparseable (an unbalanced /
143 // duplicated delimiter), route to the textual fallback, whose markers are
144 // well-formed by construction. Part 1 (structural body delimiter) makes
145 // this hold by construction; this guard keeps a future weave regression
146 // from re-shipping the class silently.
147 match &outcome {
148 MergeOutcome::Clean(output) => {
149 if !conserves_inputs(output, language, &base_parsed, &ours_parsed, &theirs_parsed) {
150 return text_hunk_merge_with_markers(base, ours, theirs, markers);
151 }
152 }
153 MergeOutcome::Conflicts {
154 merged_bytes_with_markers,
155 ..
156 } => {
157 if !conflict_well_formed(merged_bytes_with_markers, language) {
158 return text_hunk_merge_with_markers(base, ours, theirs, markers);
159 }
160 }
161 MergeOutcome::Binary | MergeOutcome::DeleteVsModify => {}
162 }
163 outcome
164}
165
166/// Whether a conflict output is structurally well-formed: resolving its markers
167/// to EITHER side independently yields a file that re-parses.
168///
169/// Reuses the clean floor's re-parse signal ([`ParsedFile::parse`] returns
170/// `None` on a tree with errors) so both floors close the SAME class with the
171/// same mechanism. The duplicate-delimiter corruption (heddle#490 r6) leaves a
172/// resolved side with an unbalanced `{`/`}`, which fails to parse and is caught
173/// here. If the markers themselves are malformed (unbalanced
174/// `<<<<<<< / ======= / >>>>>>>` nesting) the resolver returns `None`, which is
175/// likewise treated as not-well-formed.
176fn conflict_well_formed(output: &[u8], language: Language) -> bool {
177 let Ok(text) = std::str::from_utf8(output) else {
178 return false;
179 };
180 let Some((ours, theirs)) = resolve_conflict_sides(text) else {
181 return false;
182 };
183 ParsedFile::parse(ours.as_str(), language).is_some()
184 && ParsedFile::parse(theirs.as_str(), language).is_some()
185}
186
187/// Resolve conflict-marked `text` into its two independent sides: the
188/// take-ours resolution (drop the `theirs` hunks + markers) and the take-theirs
189/// resolution (drop the `ours` hunks + markers). Mirrors the marker shape
190/// emitted by [`merge::markers`] / `reconstruct::emit_addadd_conflict`:
191/// `<<<<<<< <label>` … `=======` … `>>>>>>> <label>`.
192///
193/// Returns `None` when the markers are malformed — a `=======` outside an open
194/// `<<<<<<<`, a `>>>>>>>` outside an open `=======`, a nested `<<<<<<<`, or an
195/// unterminated block at end of input — so a structurally broken conflict is
196/// itself surfaced as not-well-formed.
197fn resolve_conflict_sides(text: &str) -> Option<(String, String)> {
198 enum State {
199 Normal,
200 Ours,
201 Theirs,
202 }
203 let mut ours = String::new();
204 let mut theirs = String::new();
205 let mut state = State::Normal;
206 for line in text.split_inclusive('\n') {
207 let marker = conflict_marker(line);
208 if matches!(marker, Some(ConflictMarker::Start)) {
209 match state {
210 State::Normal => state = State::Ours,
211 _ => return None,
212 }
213 } else if matches!(marker, Some(ConflictMarker::Separator)) {
214 match state {
215 State::Ours => state = State::Theirs,
216 _ => return None,
217 }
218 } else if matches!(marker, Some(ConflictMarker::End)) {
219 match state {
220 State::Theirs => state = State::Normal,
221 _ => return None,
222 }
223 } else {
224 match state {
225 State::Normal => {
226 ours.push_str(line);
227 theirs.push_str(line);
228 }
229 State::Ours => ours.push_str(line),
230 State::Theirs => theirs.push_str(line),
231 }
232 }
233 }
234 match state {
235 State::Normal => Some((ours, theirs)),
236 _ => None,
237 }
238}
239
240#[derive(Clone, Copy, Debug, Eq, PartialEq)]
241enum ConflictMarker {
242 Start,
243 Separator,
244 End,
245}
246
247fn conflict_marker(line: &str) -> Option<ConflictMarker> {
248 let body = line.strip_suffix('\n').unwrap_or(line);
249 let body = body.strip_suffix('\r').unwrap_or(body).trim_start();
250
251 if marker_body_matches(body, "<<<<<<<") {
252 Some(ConflictMarker::Start)
253 } else if marker_body_matches(body, "=======") {
254 Some(ConflictMarker::Separator)
255 } else if marker_body_matches(body, ">>>>>>>") {
256 Some(ConflictMarker::End)
257 } else {
258 None
259 }
260}
261
262fn marker_body_matches(body: &str, marker: &str) -> bool {
263 let Some(rest) = body.strip_prefix(marker) else {
264 return false;
265 };
266 rest.is_empty() || rest.starts_with(' ')
267}
268
269/// Whether a clean `output` conserves the structure of its inputs. Re-parses
270/// the output and checks, against the three inputs (re-segmented raw so `use`
271/// keys compare on the same footing as the output's):
272///
273/// 1. **Re-parse** — a clean merge that yields an unparseable file is a
274/// corruption (catches a collapse's unbalanced delimiters).
275/// 2. **Item-identity subset** — every `(scope, kind, name)` in the output
276/// must appear in some input; the merge may not invent an item or move one
277/// into a scope no contributing side gave it (catches mis-nesting / a child
278/// escaping its container).
279///
280/// Both checks are deletion-safe (a subset relation, not equality), so a
281/// legitimate clean merge with deletions passes; and edit-safe (they key on
282/// item identity, not line text), so a within-line edit that recombines bytes
283/// passes. The line-duplication class the harness pins (Bug 1's doubled
284/// postamble) is excluded by construction in the tree model and covered by the
285/// ported conformance tests, so it needs no production line-budget check.
286fn conserves_inputs(
287 output: &[u8],
288 language: Language,
289 base_parsed: &ParsedFile,
290 ours_parsed: &ParsedFile,
291 theirs_parsed: &ParsedFile,
292) -> bool {
293 use std::collections::BTreeSet;
294
295 let Ok(out_text) = std::str::from_utf8(output) else {
296 return false;
297 };
298 let Some(out_parsed) = ParsedFile::parse(out_text, language) else {
299 return false;
300 };
301
302 type Identity = (Vec<String>, items::ItemKind, String);
303 let collect = |seg: &items::FileSegments, set: &mut BTreeSet<Identity>| {
304 items::visit_items(&seg.items, &mut |i| {
305 set.insert((i.key.scope.clone(), i.key.kind, i.key.name.clone()));
306 });
307 };
308
309 let mut allowed: BTreeSet<Identity> = BTreeSet::new();
310 for parsed in [base_parsed, ours_parsed, theirs_parsed] {
311 collect(&segment_file(parsed), &mut allowed);
312 }
313 let mut got: BTreeSet<Identity> = BTreeSet::new();
314 collect(&segment_file(&out_parsed), &mut got);
315 got.is_subset(&allowed)
316}
317
318/// Strategy a merge call should use for content reconciliation.
319#[derive(Clone, Copy, Debug, PartialEq, Eq)]
320pub enum MergeStrategy {
321 /// Always use `heddle-merge::text_hunk_merge` on the whole file.
322 HunkOnly,
323 /// Try AST-defined item decomposition first; fall through to
324 /// `text_hunk_merge` for unparseable / unknown-language files.
325 Semantic,
326}
327
328/// Single entry point that dispatches on [`MergeStrategy`]. Provided so call
329/// sites that already thread a strategy enum don't have to branch themselves.
330pub fn three_way_merge(
331 base: &[u8],
332 ours: &[u8],
333 theirs: &[u8],
334 path: &Path,
335 markers: ConflictMarkers<'_>,
336 strategy: MergeStrategy,
337) -> MergeOutcome {
338 match strategy {
339 MergeStrategy::HunkOnly => text_hunk_merge_with_markers(base, ours, theirs, markers),
340 MergeStrategy::Semantic => semantic_three_way_merge(base, ours, theirs, path, markers),
341 }
342}
343
344pub use merge::{ConflictMarkers as MergeConflictMarkers, MergeOutcome as MergeDriverOutcome};