lex_core/lex/includes.rs
1//! Include resolution for Lex documents.
2//!
3//! This module turns `:: lex.include src="..." ::` annotations into spliced
4//! content from the referenced files. It is *opt-in*: callers that want the
5//! unresolved tree (the formatter, tree-sitter parity, editor tooling that
6//! displays include statements as authored) skip this pass entirely. The
7//! parser itself never touches the filesystem — all I/O goes through the
8//! injected [`Loader`] trait.
9//!
10//! See `comms/specs/proposals/includes.lex` for the full design.
11//!
12//! # Status
13//!
//! This module is being built up across PRs 3–7:
15//!
16//! - PR 3: skeleton — trait, config, errors, stub.
17//! - PR 4: single-pass splice + container-policy validation +
18//! doc-title/doc-annotation conversion + origin stamping + root-escape
19//! check.
20//! - PR 5: recursive resolution into included files + cycle detection
21//! (chain stack) + depth limit. Each loaded file gets walked in its OWN
22//! directory, so relative paths inside an included file resolve from
23//! that file's directory, not the entry's.
24//! - PR 6: origin-aware reference helpers. [`resolve_file_reference`]
25//! resolves a `ReferenceType::File` target from the authoring file's
26//! directory using `Range.origin_path`.
27//! `Document::find_annotation_by_label_in_origin` scopes footnote
28//! lookups to the file the reference was authored in.
29//! - PR 7 (this PR): [`FsLoader`] — production loader that reads from the
30//! filesystem with `std::fs::read_to_string`. CLI wires the resolver
31//! into `lex convert` and `lex inspect` (default-on, opt-out via
32//! `--no-includes`); `lex format` never expands.
33//!
34//! # Layering
35//!
36//! Of all of lex-core, only [`FsLoader`] references `std::fs`. The
37//! resolver itself does no I/O — it always goes through the [`Loader`]
38//! trait. Callers can swap loaders to keep the resolver sandboxed:
39//!
40//! - The LSP wraps [`FsLoader`] with file-watch invalidation (PR 8).
41//! - WASM builds provide a JS-backed loader instead of [`FsLoader`].
42//! - Tests use [`MemoryLoader`] (gated behind `test-support`).
43//!
44//! For tests, lex-core itself ships [`MemoryLoader`] gated behind the
45//! `test-support` cargo feature. It is not intended for production use.
46
47use crate::lex::assembling::AttachAnnotations;
48use crate::lex::ast::elements::container::GeneralContainer;
49use crate::lex::ast::elements::content_item::ContentItem;
50use crate::lex::ast::elements::paragraph::Paragraph;
51use crate::lex::ast::elements::session::Session;
52use crate::lex::ast::range::Range;
53use crate::lex::ast::Document;
54use crate::lex::transforms::Runnable;
55use std::path::{Path, PathBuf};
56use std::sync::Arc;
57
/// Settings that govern a single include-resolution run.
#[derive(Debug, Clone)]
pub struct ResolveConfig {
    /// Resolution root: every include target must stay underneath this
    /// directory, otherwise resolution fails with
    /// [`IncludeError::RootEscape`].
    ///
    /// Supply an **absolute**, already-normalized path. The escape check is
    /// a lexical prefix comparison, and lexical normalization keeps a
    /// leading `..` in place instead of collapsing it, so a relative or
    /// unnormalized root makes that comparison unreliable. Callers (CLI,
    /// LSP) should canonicalize the root before building a `ResolveConfig`.
    pub root: PathBuf,
    /// Upper bound on include nesting; the default is 8 (see
    /// [`ResolveConfig::DEFAULT_MAX_DEPTH`]). Running into the limit is
    /// reported as an error — content is never silently truncated.
    pub max_depth: usize,
}

impl ResolveConfig {
    /// Nesting limit used when the caller does not pick one. Deep enough
    /// for a layered layout (aggregator → per-chapter → per-section) while
    /// keeping the resolver's worst-case work bounded.
    pub const DEFAULT_MAX_DEPTH: usize = 8;

    /// Build a config rooted at `root` with
    /// [`ResolveConfig::DEFAULT_MAX_DEPTH`] as the nesting limit.
    pub fn with_root(root: PathBuf) -> Self {
        let max_depth = Self::DEFAULT_MAX_DEPTH;
        Self { root, max_depth }
    }
}
89
90/// A pluggable source-text loader.
91///
92/// Implementations decide where bytes come from (filesystem, in-memory map,
93/// virtual filesystem, content-addressed store, …). lex-core never references
94/// `std::fs` directly through this trait; that keeps the resolver pure and
95/// usable in WASM, sandboxes, and unit tests.
96pub trait Loader {
97 /// Load the source text for `path`. The path is the canonical absolute
98 /// path the resolver decided on after applying the rules in §4 of the
99 /// proposal.
100 fn load(&self, path: &Path) -> Result<String, LoadError>;
101}
102
/// Failure modes a [`Loader`] may report.
#[derive(Debug, Clone)]
pub enum LoadError {
    /// Nothing exists at `path` as far as the loader can tell.
    NotFound { path: PathBuf },
    /// The backing store failed while reading `path`; `message` carries the
    /// underlying I/O error (or its virtual-filesystem counterpart) as text.
    Io { path: PathBuf, message: String },
}

impl std::fmt::Display for LoadError {
    /// Write a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Io { path, message } => f.write_fmt(format_args!(
                "io error reading {}: {message}",
                path.display()
            )),
            Self::NotFound { path } => {
                f.write_fmt(format_args!("include not found: {}", path.display()))
            }
        }
    }
}

impl std::error::Error for LoadError {}
124
125/// Errors the include resolver can produce.
126#[derive(Debug, Clone)]
127pub enum IncludeError {
128 /// An include chain looped back on itself. `chain` is the resolution
129 /// stack at the moment the duplicate `path` was about to be pushed,
130 /// in source-order (entry first, deepest last). `include_site` is the
131 /// range of the offending `lex.include` annotation in its host file —
132 /// useful for diagnostics that highlight the exact line.
133 Cycle {
134 include_site: Range,
135 path: PathBuf,
136 chain: Vec<PathBuf>,
137 },
138 /// The include depth exceeded [`ResolveConfig::max_depth`]. `chain`
139 /// shows the resolution stack at the moment of failure, in source
140 /// order. `include_site` is the range of the offending
141 /// `lex.include` annotation in its host file.
142 DepthExceeded {
143 include_site: Range,
144 limit: usize,
145 chain: Vec<PathBuf>,
146 },
147 /// A path resolved outside the configured [`ResolveConfig::root`].
148 RootEscape { path: PathBuf, root: PathBuf },
149 /// The loader could not find or read the included file. `include_site`
150 /// is the range of the offending `lex.include` annotation in its host
151 /// file, so editors can squiggle the line that asked for the missing
152 /// file rather than the document head.
153 NotFound { include_site: Range, path: PathBuf },
154 /// The loader returned text that the parser rejected.
155 ParseFailed { path: PathBuf, message: String },
156 /// The included file's content is not legal in the include site's
157 /// parent container.
158 ///
159 /// Today this only occurs when an included file has top-level Sessions
160 /// and the include site is inside a `GeneralContainer` (Definition,
161 /// ListItem, or another Annotation's body). The `violation` field
162 /// names the offending content kind (e.g. `"Sessions"`) so future
163 /// container/policy combinations can reuse this variant without a
164 /// breaking change.
165 ContainerPolicy {
166 include_site: Range,
167 container: &'static str,
168 file: PathBuf,
169 violation: &'static str,
170 },
171 /// Loader propagated a non-`NotFound` I/O error.
172 LoaderIo { path: PathBuf, message: String },
173 /// `lex.include` annotation was missing the mandatory `src=` parameter.
174 MissingSrc { include_site: Range },
175}
176
177impl std::fmt::Display for IncludeError {
178 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
179 match self {
180 IncludeError::Cycle { path, chain, .. } => {
181 let chain_display: Vec<String> =
182 chain.iter().map(|p| p.display().to_string()).collect();
183 write!(
184 f,
185 "include cycle: {} (chain: {})",
186 path.display(),
187 chain_display.join(" -> ")
188 )
189 }
190 IncludeError::DepthExceeded { limit, chain, .. } => {
191 let chain_display: Vec<String> =
192 chain.iter().map(|p| p.display().to_string()).collect();
193 write!(
194 f,
195 "include depth exceeded limit of {limit} (chain: {})",
196 chain_display.join(" -> ")
197 )
198 }
199 IncludeError::RootEscape { path, root } => write!(
200 f,
201 "include path {} escapes resolution root {}",
202 path.display(),
203 root.display()
204 ),
205 IncludeError::NotFound { path, .. } => {
206 write!(f, "include not found: {}", path.display())
207 }
208 IncludeError::ParseFailed { path, message } => {
209 write!(f, "failed to parse {}: {message}", path.display())
210 }
211 IncludeError::ContainerPolicy {
212 container,
213 file,
214 violation,
215 ..
216 } => write!(
217 f,
218 "included file {} contains {} but include site is inside {} \
219 (which does not allow {})",
220 file.display(),
221 violation,
222 container,
223 violation
224 ),
225 IncludeError::LoaderIo { path, message } => {
226 write!(f, "loader error reading {}: {message}", path.display())
227 }
228 IncludeError::MissingSrc { .. } => {
229 write!(f, "lex.include annotation missing required src= parameter")
230 }
231 }
232 }
233}
234
235impl std::error::Error for IncludeError {}
236
237// No `From<LoadError>` impl: `IncludeError::NotFound` carries the include
238// site (the `lex.include` annotation's range), which a loader doesn't know
239// about. Callers map `LoadError` explicitly at the call site, where the
240// site is available.
241
/// The kind of container an include site lives in. It drives the policy
/// check applied at splice time; the only rule so far is that Sessions may
/// not be spliced into a `GeneralContainer`.
#[derive(Debug, Clone, Copy)]
enum ContainerKind {
    /// `Document.root.children` or `Session.children` — any content is fine.
    Session,
    /// `Definition.children`, a `GeneralContainer`.
    Definition,
    /// `Annotation.children`, a `GeneralContainer`.
    AnnotationBody,
    /// `ListItem.children`, a `GeneralContainer`.
    ListItem,
}

impl ContainerKind {
    /// Human-readable label used in [`IncludeError::ContainerPolicy`].
    fn name(self) -> &'static str {
        match self {
            Self::ListItem => "ListItem",
            Self::AnnotationBody => "Annotation body",
            Self::Definition => "Definition",
            Self::Session => "Session",
        }
    }

    /// Whether Sessions may be spliced into this kind of container. Only
    /// session-level containers accept them.
    fn allows_sessions(self) -> bool {
        match self {
            Self::Session => true,
            Self::Definition | Self::AnnotationBody | Self::ListItem => false,
        }
    }
}
270
271/// Resolve `:: lex.include ::` annotations starting from `source`, recursively.
272///
273/// `source_path` identifies the entry-point file. It is used to (a) resolve
274/// relative include paths against the entry file's directory, (b) stamp
275/// `Range.origin_path` on every node so downstream code (file-ref resolution,
276/// diagnostics, LSP goto) can report locations against the authoring file,
277/// and (c) seed the cycle-detection chain so an include cycle that loops
278/// back to the entry is caught. When `None`, relative paths resolve against
279/// `config.root`, origin stamping is skipped on the entry, and the chain
280/// starts empty.
281///
282/// # Pre/post-attachment
283///
284/// Internally this re-parses each source (entry + every loaded file) *without*
285/// annotation attachment so `lex.include` annotations are visible as standalone
286/// children where the splice can replace them in-place. After all splices,
287/// [`AttachAnnotations`] runs once on the merged tree, which lands the include
288/// annotation on the first spliced node by the standard "attach to next
289/// sibling" rule. This matches the textual paste mental model from the proposal.
290///
291/// # Recursion
292///
293/// Each loaded file is fully resolved (its own includes replaced) *before*
294/// being spliced into the host. The recursion uses each file's own directory
295/// as `host_dir`, so a relative path inside an included file resolves from
296/// that file's location — not the entry's. An active-chain stack of
297/// canonicalized paths gates against cycles; the depth counter gates against
298/// pathological nesting (default 8, configurable via [`ResolveConfig::max_depth`]).
299pub fn resolve_from_source(
300 source: &str,
301 source_path: Option<PathBuf>,
302 config: &ResolveConfig,
303 loader: &dyn Loader,
304) -> Result<Document, IncludeError> {
305 let entry_origin = source_path.as_ref().map(|p| Arc::new(p.clone()));
306 let host_dir = source_path
307 .as_ref()
308 .and_then(|p| p.parent().map(Path::to_path_buf))
309 .unwrap_or_else(|| config.root.clone());
310
311 let mut doc = parse_no_attach(source).map_err(|message| IncludeError::ParseFailed {
312 path: source_path.clone().unwrap_or_default(),
313 message,
314 })?;
315
316 if let Some(origin) = entry_origin.as_ref() {
317 stamp_doc(&mut doc, origin);
318 }
319
320 // Seed the chain with the lexically-normalized entry path (when known)
321 // so an include that loops back to the entry is detected as a cycle.
322 // Normalization here is essential — `target_path` values produced by
323 // `resolve_path` are also lexically normalized, so an unnormalized
324 // entry would never compare equal to its normalized self.
325 let mut chain: Vec<PathBuf> = source_path
326 .as_ref()
327 .map(|p| vec![lexical_normalize(p)])
328 .unwrap_or_default();
329 let mut state = ResolverState {
330 config,
331 loader,
332 chain: &mut chain,
333 depth: 0,
334 };
335
336 splice_in_session_container(doc.root.children.as_mut_vec(), &host_dir, &mut state)?;
337
338 let doc = AttachAnnotations::new()
339 .run(doc)
340 .map_err(|e| IncludeError::ParseFailed {
341 path: source_path.unwrap_or_default(),
342 message: format!("annotation attachment failed: {e}"),
343 })?;
344
345 Ok(doc)
346}
347
348// ============================================================================
349// Splicing
350// ============================================================================
351
352/// Per-resolution state threaded through the recursive walker. Keeps the
353/// signatures of the splice/process functions short and ensures
354/// `chain`/`depth` are updated in lock-step (push/pop, +1/back-out) at
355/// each include site.
356struct ResolverState<'a> {
357 config: &'a ResolveConfig,
358 loader: &'a dyn Loader,
359 /// Active resolution stack: lexically-normalized absolute paths
360 /// currently being resolved. Pushed when we begin loading a file and
361 /// popped when its tree is fully resolved. A push that finds the
362 /// path already on the stack is a cycle.
363 ///
364 /// Normalization (not filesystem canonicalization) is what's used
365 /// here: the resolver never touches `std::fs`, so symlink resolution
366 /// is out. Two paths that lexically refer to the same file (after
367 /// `.`/`..` collapse) compare equal; two paths reaching the same
368 /// inode via different routes do not. For real-FS use cases this is
369 /// fine because `FsLoader` will canonicalize on load before the
370 /// chain comparison sees the path.
371 chain: &'a mut Vec<PathBuf>,
372 /// Number of include hops from the entry point. Each recursion into a
373 /// loaded file increments by 1. Hitting `config.max_depth` is an error.
374 depth: usize,
375}
376
377fn splice_in_session_container(
378 children: &mut Vec<ContentItem>,
379 host_dir: &Path,
380 state: &mut ResolverState<'_>,
381) -> Result<(), IncludeError> {
382 // Post-order: recurse into nested containers first, splice this
383 // container's includes second. The recurse step walks the *original*
384 // tree; the splice step inserts already-fully-resolved content
385 // (recursion happens inside `process_includes`), which is therefore
386 // never re-walked.
387 recurse_into_children(children, host_dir, state)?;
388 process_includes(children, host_dir, state, ContainerKind::Session)
389}
390
391fn splice_in_general_container(
392 container: &mut GeneralContainer,
393 host_dir: &Path,
394 state: &mut ResolverState<'_>,
395 kind: ContainerKind,
396) -> Result<(), IncludeError> {
397 recurse_into_children(container.as_mut_vec(), host_dir, state)?;
398 process_includes(container.as_mut_vec(), host_dir, state, kind)
399}
400
401// Allow &mut Vec because `splice` needs Vec-specific operations.
402#[allow(clippy::ptr_arg)]
403fn process_includes(
404 children: &mut Vec<ContentItem>,
405 host_dir: &Path,
406 state: &mut ResolverState<'_>,
407 kind: ContainerKind,
408) -> Result<(), IncludeError> {
409 // Collect indices of standalone include annotations in this container.
410 let include_indices: Vec<usize> = children
411 .iter()
412 .enumerate()
413 .filter_map(|(i, item)| match item {
414 ContentItem::Annotation(a) if a.is_include() => Some(i),
415 _ => None,
416 })
417 .collect();
418
419 // Process in reverse order so earlier indices stay valid.
420 for i in include_indices.into_iter().rev() {
421 let annotation = match &children[i] {
422 ContentItem::Annotation(a) => a.clone(),
423 _ => unreachable!("index came from include filter"),
424 };
425
426 let splice_items = resolve_one_include(&annotation, host_dir, state, kind)?;
427
428 // Replace the include annotation with the splice content.
429 // The annotation itself stays in the children list immediately
430 // before the splice, so the post-resolution AttachAnnotations
431 // pass moves it onto the first spliced node by the standard
432 // "attach to next sibling" rule.
433 let mut replacement = Vec::with_capacity(splice_items.len() + 1);
434 replacement.push(ContentItem::Annotation(annotation));
435 replacement.extend(splice_items);
436 children.splice(i..=i, replacement);
437 }
438
439 Ok(())
440}
441
442/// Resolve a single include annotation: path → load → parse → recurse →
443/// stamp → policy-check → splice list.
444///
445/// The recursion happens *here*: after parsing the loaded file, we walk
446/// its tree with the loaded file's own directory as `host_dir`, with the
447/// loaded file pushed onto `state.chain` and `state.depth` bumped by 1.
448/// When this call returns, the splice list is fully resolved and ready to
449/// be inserted into the host container.
450fn resolve_one_include(
451 annotation: &crate::lex::ast::elements::annotation::Annotation,
452 host_dir: &Path,
453 state: &mut ResolverState<'_>,
454 parent_kind: ContainerKind,
455) -> Result<Vec<ContentItem>, IncludeError> {
456 let src = annotation
457 .include_src()
458 .ok_or_else(|| IncludeError::MissingSrc {
459 include_site: annotation.location.clone(),
460 })?;
461
462 let target_path = resolve_path(&src, host_dir, &state.config.root)?;
463
464 // Cycle check before load — keep loader free of duplicate work.
465 if state.chain.iter().any(|p| p == &target_path) {
466 return Err(IncludeError::Cycle {
467 include_site: annotation.location.clone(),
468 path: target_path,
469 chain: state.chain.clone(),
470 });
471 }
472
473 // Depth check before recursing into the loaded file. A site that sits
474 // exactly at `max_depth` is fine; a site that would push us *past* it
475 // is the failure case.
476 if state.depth >= state.config.max_depth {
477 return Err(IncludeError::DepthExceeded {
478 include_site: annotation.location.clone(),
479 limit: state.config.max_depth,
480 chain: state.chain.clone(),
481 });
482 }
483
484 let target_source = state.loader.load(&target_path).map_err(|e| match e {
485 LoadError::NotFound { path } => IncludeError::NotFound {
486 include_site: annotation.location.clone(),
487 path,
488 },
489 LoadError::Io { path, message } => IncludeError::LoaderIo { path, message },
490 })?;
491
492 let mut included =
493 parse_no_attach(&target_source).map_err(|message| IncludeError::ParseFailed {
494 path: target_path.clone(),
495 message,
496 })?;
497
498 let target_origin = Arc::new(target_path.clone());
499 stamp_doc(&mut included, &target_origin);
500
501 // Recursively resolve includes inside the loaded file. The host_dir
502 // for that walk is the loaded file's own parent; the chain gains
503 // this path and depth bumps by 1 — both are popped/restored on the
504 // way back so siblings see the same state we got.
505 let included_dir = target_path
506 .parent()
507 .map(Path::to_path_buf)
508 .unwrap_or_else(|| state.config.root.clone());
509
510 state.chain.push(target_path.clone());
511 let saved_depth = state.depth;
512 state.depth = saved_depth + 1;
513 let recurse_result =
514 splice_in_session_container(included.root.children.as_mut_vec(), &included_dir, state);
515 state.depth = saved_depth;
516 state.chain.pop();
517 recurse_result?;
518
519 let splice_items = prepare_splice_list(included);
520 validate_against_kind(
521 &splice_items,
522 parent_kind,
523 &annotation.location,
524 &target_path,
525 )?;
526
527 Ok(splice_items)
528}
529
530#[allow(clippy::ptr_arg)]
531fn recurse_into_children(
532 children: &mut Vec<ContentItem>,
533 host_dir: &Path,
534 state: &mut ResolverState<'_>,
535) -> Result<(), IncludeError> {
536 for item in children.iter_mut() {
537 match item {
538 ContentItem::Session(s) => {
539 splice_in_session_container(s.children.as_mut_vec(), host_dir, state)?;
540 }
541 ContentItem::Definition(d) => {
542 splice_in_general_container(
543 &mut d.children,
544 host_dir,
545 state,
546 ContainerKind::Definition,
547 )?;
548 }
549 ContentItem::Annotation(a) if !a.is_include() => {
550 splice_in_general_container(
551 &mut a.children,
552 host_dir,
553 state,
554 ContainerKind::AnnotationBody,
555 )?;
556 }
557 ContentItem::List(l) => {
558 for li in l.items.as_mut_vec().iter_mut() {
559 if let ContentItem::ListItem(item) = li {
560 splice_in_general_container(
561 &mut item.children,
562 host_dir,
563 state,
564 ContainerKind::ListItem,
565 )?;
566 }
567 }
568 }
569 _ => {}
570 }
571 }
572 Ok(())
573}
574
575fn prepare_splice_list(mut included: Document) -> Vec<ContentItem> {
576 let mut items: Vec<ContentItem> = Vec::new();
577
578 // Document title → Paragraph, prepended.
579 // Equivalent to what a textual paste would parse (an unindented line
580 // becomes a paragraph in the host's context). Per the revised
581 // spec §5.2 this is "do nothing" semantics — converting matches what
582 // the parser would do if the included source were inlined and reparsed.
583 if let Some(title) = included.title {
584 let location = title.location.clone();
585 let para = Paragraph::from_line(title.as_str().to_string()).at(location);
586 items.push(ContentItem::Paragraph(para));
587 }
588
589 // Document-level annotations → regular annotations, prepended.
590 for ann in included.annotations {
591 items.push(ContentItem::Annotation(ann));
592 }
593
594 // Body of the included document.
595 items.append(included.root.children.as_mut_vec());
596
597 items
598}
599
600fn validate_against_kind(
601 items: &[ContentItem],
602 kind: ContainerKind,
603 site: &Range,
604 file: &Path,
605) -> Result<(), IncludeError> {
606 if kind.allows_sessions() {
607 return Ok(());
608 }
609 if items.iter().any(|i| matches!(i, ContentItem::Session(_))) {
610 return Err(IncludeError::ContainerPolicy {
611 include_site: site.clone(),
612 container: kind.name(),
613 file: file.to_path_buf(),
614 violation: "Sessions",
615 });
616 }
617 Ok(())
618}
619
620// ============================================================================
621// Path resolution
622// ============================================================================
623
624/// Resolve a file-reference target string the same way the include
625/// resolver resolves include paths.
626///
627/// Use this when consuming `ReferenceType::File { target }` (or any other
628/// node-attached path) so that relative paths resolve from the *authoring*
629/// file's directory, not from wherever the merged document happens to be
630/// rooted. Pass `ref_origin` as the [`Range::origin_path`] of the inline's
631/// containing node (or `None` if the node was never stamped — in that case
632/// the path is treated as if authored at the root).
633///
634/// Behaviour matches the include resolver:
635/// - Root-absolute targets (leading `/`) resolve under `root`.
636/// - Other targets resolve relative to `ref_origin`'s parent (or `root`
637/// when `ref_origin` is `None`).
638/// - The result is lexically normalized and checked against `root` —
639/// paths that escape it return `RootEscape`.
640///
641/// This is a sister to the resolver's internal `resolve_path` and shares
642/// the same lexical-normalization caveat: it does not touch the filesystem.
643pub fn resolve_file_reference(
644 target: &str,
645 ref_origin: Option<&Path>,
646 root: &Path,
647) -> Result<PathBuf, IncludeError> {
648 let host_dir: PathBuf = ref_origin
649 .and_then(|p| p.parent())
650 .map(Path::to_path_buf)
651 .unwrap_or_else(|| root.to_path_buf());
652 resolve_path(target, &host_dir, root)
653}
654
655fn resolve_path(src: &str, host_dir: &Path, root: &Path) -> Result<PathBuf, IncludeError> {
656 let candidate = if let Some(rel) = src.strip_prefix('/') {
657 // Root-absolute: leading slash means "from the resolution root".
658 root.join(rel)
659 } else {
660 // Relative: from the host file's directory.
661 host_dir.join(src)
662 };
663 let normalized = lexical_normalize(&candidate);
664 let canonical_root = lexical_normalize(root);
665 if !normalized.starts_with(&canonical_root) {
666 return Err(IncludeError::RootEscape {
667 path: normalized,
668 root: canonical_root,
669 });
670 }
671 Ok(normalized)
672}
673
/// Normalize a path purely lexically: drop `.` components and collapse
/// `name/..` pairs, without ever consulting the filesystem.
///
/// `std::fs::canonicalize` is not an option because it needs the path to
/// exist, which rules out in-memory loaders. A lexical form is enough for
/// the resolver: it only needs a stable identity for cycle detection and a
/// uniform shape for the root-escape prefix check.
///
/// A `..` removes the previous component only when that component is a real
/// name (`Component::Normal`). If the output so far is empty, or ends in a
/// root marker or another `..`, the `..` is *kept*. That is deliberate:
/// calling `PathBuf::pop` unconditionally would strip an earlier `..` and
/// let `../../etc/passwd` shrink to `etc/passwd`, slipping past the
/// root-escape check. Keeping every unmatched `..` leaves such a path
/// visibly outside the root so the check can reject it.
fn lexical_normalize(p: &Path) -> PathBuf {
    use std::path::Component;

    p.components().fold(PathBuf::new(), |mut normalized, part| {
        match part {
            Component::CurDir => {}
            Component::ParentDir => {
                let ends_in_name = matches!(
                    normalized.components().next_back(),
                    Some(Component::Normal(_))
                );
                if ends_in_name {
                    normalized.pop();
                } else {
                    normalized.push("..");
                }
            }
            Component::Prefix(_) | Component::RootDir | Component::Normal(_) => {
                normalized.push(part.as_os_str());
            }
        }
        normalized
    })
}
715
716// ============================================================================
717// Origin stamping
718// ============================================================================
719//
720// Walk every node in a Document and set `Range.origin_path` on each
721// `.location` field. The walk only stamps the *block-level* `.location`
722// fields here; finer-grained inline ranges land in PR 6 when file-ref
723// resolution starts consulting them.
724
725fn stamp_doc(doc: &mut Document, origin: &Arc<PathBuf>) {
726 if let Some(title) = doc.title.as_mut() {
727 title.location.origin_path = Some(Arc::clone(origin));
728 }
729 for ann in doc.annotations.iter_mut() {
730 stamp_annotation(ann, origin);
731 }
732 stamp_session(&mut doc.root, origin);
733}
734
735fn stamp_session(s: &mut Session, origin: &Arc<PathBuf>) {
736 s.location.origin_path = Some(Arc::clone(origin));
737 if let Some(loc) = s.title.location.as_mut() {
738 loc.origin_path = Some(Arc::clone(origin));
739 }
740 for ann in s.annotations.iter_mut() {
741 stamp_annotation(ann, origin);
742 }
743 for item in s.children.as_mut_vec().iter_mut() {
744 stamp_item(item, origin);
745 }
746}
747
748fn stamp_annotation(
749 a: &mut crate::lex::ast::elements::annotation::Annotation,
750 origin: &Arc<PathBuf>,
751) {
752 a.location.origin_path = Some(Arc::clone(origin));
753 a.data.location.origin_path = Some(Arc::clone(origin));
754 for item in a.children.as_mut_vec().iter_mut() {
755 stamp_item(item, origin);
756 }
757}
758
759fn stamp_item(item: &mut ContentItem, origin: &Arc<PathBuf>) {
760 match item {
761 ContentItem::Session(s) => stamp_session(s, origin),
762 ContentItem::Annotation(a) => stamp_annotation(a, origin),
763 ContentItem::Paragraph(p) => {
764 p.location.origin_path = Some(Arc::clone(origin));
765 for ann in p.annotations.iter_mut() {
766 stamp_annotation(ann, origin);
767 }
768 for line in p.lines.iter_mut() {
769 stamp_item(line, origin);
770 }
771 }
772 ContentItem::List(l) => {
773 l.location.origin_path = Some(Arc::clone(origin));
774 for li in l.items.as_mut_vec().iter_mut() {
775 stamp_item(li, origin);
776 }
777 }
778 ContentItem::ListItem(li) => {
779 li.location.origin_path = Some(Arc::clone(origin));
780 for ann in li.annotations.iter_mut() {
781 stamp_annotation(ann, origin);
782 }
783 for child in li.children.as_mut_vec().iter_mut() {
784 stamp_item(child, origin);
785 }
786 }
787 ContentItem::Definition(d) => {
788 d.location.origin_path = Some(Arc::clone(origin));
789 for ann in d.annotations.iter_mut() {
790 stamp_annotation(ann, origin);
791 }
792 for child in d.children.as_mut_vec().iter_mut() {
793 stamp_item(child, origin);
794 }
795 }
796 ContentItem::VerbatimBlock(v) => {
797 v.location.origin_path = Some(Arc::clone(origin));
798 }
799 ContentItem::VerbatimLine(vl) => {
800 vl.location.origin_path = Some(Arc::clone(origin));
801 }
802 ContentItem::Table(t) => {
803 t.location.origin_path = Some(Arc::clone(origin));
804 }
805 ContentItem::TextLine(tl) => {
806 tl.location.origin_path = Some(Arc::clone(origin));
807 }
808 ContentItem::BlankLineGroup(b) => {
809 b.location.origin_path = Some(Arc::clone(origin));
810 }
811 }
812}
813
814// ============================================================================
815// Parser glue
816// ============================================================================
817
818/// Parse `source` into a Document but skip the annotation-attachment stage,
819/// so include annotations are findable in container children lists.
820fn parse_no_attach(source: &str) -> Result<Document, String> {
821 crate::lex::testing::parse_without_annotation_attachment(source)
822}
823
824// ============================================================================
825// Filesystem-backed loader
826// ============================================================================
827
/// [`Loader`] that reads include text from the filesystem via
/// `std::fs::read_to_string`.
///
/// This is the production loader. The resolver itself never references
/// `std::fs`; `FsLoader` is the single place that does, kept behind the
/// [`Loader`] trait so the rest of the crate stays sandbox- and
/// WASM-friendly.
///
/// `FsLoader` carries no state: build one at the start of a resolution and
/// share (or copy) it freely. I/O failures are reported as
/// [`LoadError::NotFound`] when the file does not exist and as
/// [`LoadError::Io`] for anything else.
///
/// The common traits are derived so the loader can be logged, stored in
/// `Debug`-deriving structs, and passed by value without ceremony.
#[derive(Debug, Default, Clone, Copy)]
pub struct FsLoader;

impl FsLoader {
    /// Create a filesystem loader. Equivalent to [`FsLoader::default`].
    pub fn new() -> Self {
        Self
    }
}
853
854impl Loader for FsLoader {
855 fn load(&self, path: &Path) -> Result<String, LoadError> {
856 std::fs::read_to_string(path).map_err(|e| match e.kind() {
857 std::io::ErrorKind::NotFound => LoadError::NotFound {
858 path: path.to_path_buf(),
859 },
860 _ => LoadError::Io {
861 path: path.to_path_buf(),
862 message: e.to_string(),
863 },
864 })
865 }
866}
867
868// ============================================================================
869// Test fixtures (test-support feature + cfg(test))
870// ============================================================================
871
/// In-memory [`Loader`] backed by a `HashMap<PathBuf, String>`.
///
/// Available in test builds and behind the `test-support` feature. Lookups
/// are exact-key: a file is found only under the same path it was
/// registered with.
#[cfg(any(test, feature = "test-support"))]
#[derive(Debug, Clone, Default)]
pub struct MemoryLoader {
    /// Registered files, keyed by the path they are served under.
    files: std::collections::HashMap<PathBuf, String>,
}

#[cfg(any(test, feature = "test-support"))]
impl MemoryLoader {
    /// Create an empty loader. Add files with [`MemoryLoader::insert`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Register a file at `path` with the given source text, replacing any
    /// file previously registered under the same path. Returns `self` so
    /// calls can be chained.
    pub fn insert<P: Into<PathBuf>, S: Into<String>>(&mut self, path: P, contents: S) -> &mut Self {
        self.files.insert(path.into(), contents.into());
        self
    }

    /// Convenience constructor: build a loader from any iterator of
    /// `(path, contents)` pairs. When a path occurs more than once, the
    /// last pair wins.
    pub fn from_pairs<I, P, S>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (P, S)>,
        P: Into<PathBuf>,
        S: Into<String>,
    {
        Self {
            files: pairs
                .into_iter()
                .map(|(path, contents)| (path.into(), contents.into()))
                .collect(),
        }
    }
}
915
#[cfg(any(test, feature = "test-support"))]
impl Loader for MemoryLoader {
    /// Return a copy of the text registered under exactly `path`.
    ///
    /// # Errors
    ///
    /// [`LoadError::NotFound`] when no file was registered under `path`.
    fn load(&self, path: &Path) -> Result<String, LoadError> {
        match self.files.get(path) {
            Some(text) => Ok(text.clone()),
            None => Err(LoadError::NotFound {
                path: path.to_path_buf(),
            }),
        }
    }
}
927
928// ============================================================================
929// Tests
930// ============================================================================
931
932#[cfg(test)]
933mod tests;