lex_core/lex/includes.rs
1//! Include resolution for Lex documents.
2//!
3//! This module turns `:: lex.include src="..." ::` annotations into spliced
4//! content from the referenced files. It is *opt-in*: callers that want the
5//! unresolved tree (the formatter, tree-sitter parity, editor tooling that
6//! displays include statements as authored) skip this pass entirely. The
7//! parser itself never touches the filesystem — all I/O goes through the
8//! injected [`Loader`] trait.
9//!
10//! See `comms/specs/proposals/includes.lex` for the full design.
11//!
12//! # Status
13//!
//! This module is being built up across PRs 3–7:
15//!
16//! - PR 3: skeleton — trait, config, errors, stub.
17//! - PR 4: single-pass splice + container-policy validation +
18//! doc-title/doc-annotation conversion + origin stamping + root-escape
19//! check.
20//! - PR 5: recursive resolution into included files + cycle detection
21//! (chain stack) + depth limit. Each loaded file gets walked in its OWN
22//! directory, so relative paths inside an included file resolve from
23//! that file's directory, not the entry's.
24//! - PR 6: origin-aware reference helpers. [`resolve_file_reference`]
25//! resolves a `ReferenceType::File` target from the authoring file's
26//! directory using `Range.origin_path`.
27//! `Document::find_annotation_by_label_in_origin` scopes footnote
28//! lookups to the file the reference was authored in.
29//! - PR 7 (this PR): [`FsLoader`] — production loader that reads from the
30//! filesystem with `std::fs::read_to_string`. CLI wires the resolver
31//! into `lex convert` and `lex inspect` (default-on, opt-out via
32//! `--no-includes`); `lex format` never expands.
33//!
34//! # Layering
35//!
36//! Of all of lex-core, only [`FsLoader`] references `std::fs`. The
37//! resolver itself does no I/O — it always goes through the [`Loader`]
38//! trait. Callers can swap loaders to keep the resolver sandboxed:
39//!
40//! - The LSP wraps [`FsLoader`] with file-watch invalidation (PR 8).
41//! - WASM builds provide a JS-backed loader instead of [`FsLoader`].
42//! - Tests use [`MemoryLoader`] (gated behind `test-support`).
43//!
44//! For tests, lex-core itself ships [`MemoryLoader`] gated behind the
45//! `test-support` cargo feature. It is not intended for production use.
46
47use crate::lex::assembling::AttachAnnotations;
48use crate::lex::ast::elements::container::GeneralContainer;
49use crate::lex::ast::elements::content_item::ContentItem;
50use crate::lex::ast::elements::paragraph::Paragraph;
51use crate::lex::ast::elements::session::Session;
52use crate::lex::ast::range::Range;
53use crate::lex::ast::Document;
54use crate::lex::transforms::Runnable;
55use std::path::{Path, PathBuf};
56use std::sync::Arc;
57
/// Settings that steer the include resolution pass.
#[derive(Debug, Clone)]
pub struct ResolveConfig {
    /// Directory every include path must resolve under. An include whose
    /// normalized path falls outside this root is reported as
    /// [`IncludeError::RootEscape`].
    ///
    /// Must be an **absolute** path. Lexical normalization treats `.`
    /// and `..` against an empty buffer as no-ops, so a relative or
    /// unnormalized root weakens the root-escape prefix check. Callers
    /// (CLI, LSP) should canonicalize the root before building a
    /// `ResolveConfig`.
    pub root: PathBuf,
    /// Upper bound on include nesting. Defaults to
    /// [`ResolveConfig::DEFAULT_MAX_DEPTH`]. Reaching the bound is an
    /// error, never a silent truncation.
    pub max_depth: usize,
    /// Upper bound on the total number of `lex.include` annotations
    /// resolved across the whole tree. Defaults to
    /// [`ResolveConfig::DEFAULT_MAX_TOTAL_INCLUDES`].
    ///
    /// `max_depth` limits how long an include chain may get, but not how
    /// wide a document may fan out: a file with a hundred thousand
    /// top-level includes stays at depth 1 yet can still exhaust memory in
    /// the resolver / LSP / CI. Reaching this bound is an error, never a
    /// silent truncation.
    pub max_total_includes: usize,
}

impl ResolveConfig {
    /// Default nesting bound. Leaves room for any reasonable atomization
    /// strategy (aggregator → per-chapter → per-section) while keeping the
    /// resolver's worst-case work predictable.
    pub const DEFAULT_MAX_DEPTH: usize = 8;

    /// Default bound on the total include count (DoS guard). Roomy enough
    /// for a book-length document made of many small fragments, tight
    /// enough to contain adversarial fan-out.
    pub const DEFAULT_MAX_TOTAL_INCLUDES: usize = 1000;

    /// Build a config for `root`, leaving both limits at their defaults.
    pub fn with_root(root: PathBuf) -> Self {
        let max_depth = Self::DEFAULT_MAX_DEPTH;
        let max_total_includes = Self::DEFAULT_MAX_TOTAL_INCLUDES;
        Self {
            root,
            max_depth,
            max_total_includes,
        }
    }
}
105
/// A pluggable source-text loader.
///
/// Implementations decide where bytes come from (filesystem, in-memory map,
/// virtual filesystem, content-addressed store, …). The resolver never
/// references `std::fs` itself; it performs all I/O through this trait,
/// which keeps it pure and usable in WASM, sandboxes, and unit tests.
pub trait Loader {
    /// Load the source text for `path` and return both the contents and a
    /// canonical identity for the loaded resource. The path is what the
    /// resolver decided on after applying the rules in §4 of the proposal.
    ///
    /// `LoadedFile::canonical_path` is the loader's authoritative identity
    /// for the resource. For [`FsLoader`] this is the filesystem-canonical
    /// path (symlinks resolved, case-folded if the underlying FS is
    /// case-insensitive); for [`MemoryLoader`] it's the lookup key (since
    /// memory loaders have no symlinks). The resolver uses this for cycle
    /// detection and for stamping `Range.origin_path` on the loaded tree.
    ///
    /// # Errors
    ///
    /// Returns a [`LoadError`] when the resource cannot be produced: it is
    /// missing, lies outside the loader's boundary, exceeds the loader's
    /// size limit, or an I/O-level failure occurred.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError>;
}
125
/// Result of a successful [`Loader::load`].
#[derive(Debug, Clone)]
pub struct LoadedFile {
    /// The file's source text. The resolver parses this text to build the
    /// tree that gets spliced at the include site.
    pub source: String,
    /// The loader's authoritative identity for the resource. See
    /// [`Loader::load`] for how loaders decide this.
    ///
    /// The resolver compares this value against the active include chain
    /// to detect cycles, stamps it as the origin of the loaded tree, and
    /// uses its parent directory as the base for relative includes found
    /// inside the loaded file.
    pub canonical_path: PathBuf,
}
135
/// Failure modes a [`Loader`] can report.
#[derive(Debug, Clone)]
pub enum LoadError {
    /// Nothing exists at the requested path.
    NotFound { path: PathBuf },
    /// The resource exists but lies outside the loader's allowed boundary.
    /// The lexical resolver already normalizes `..` in the requested path,
    /// but a loader backed by a real filesystem has to check again after
    /// canonicalization: a symlink can escape a boundary that a lexically
    /// correct path cannot.
    OutsideRoot { path: PathBuf, root: PathBuf },
    /// The resource exists but is larger than the loader's configured
    /// limit. `size` and `limit` are byte counts. The resolver turns this
    /// into [`IncludeError::FileTooLarge`], adding the offending
    /// annotation's site.
    TooLarge {
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// An underlying I/O failure (or the virtual-filesystem equivalent).
    Io { path: PathBuf, message: String },
}

impl std::fmt::Display for LoadError {
    /// Render a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotFound { path } => {
                let shown = path.display();
                write!(f, "include not found: {}", shown)
            }
            Self::OutsideRoot { path, root } => {
                let shown = path.display();
                let boundary = root.display();
                write!(
                    f,
                    "include path {} resolves outside loader root {}",
                    shown, boundary
                )
            }
            Self::TooLarge { path, size, limit } => {
                let shown = path.display();
                write!(
                    f,
                    "include file {} is {size} bytes, exceeds limit of {limit} bytes",
                    shown
                )
            }
            Self::Io { path, message } => {
                let shown = path.display();
                write!(f, "io error reading {}: {message}", shown)
            }
        }
    }
}

impl std::error::Error for LoadError {}
182
/// Errors the include resolver can produce.
#[derive(Debug, Clone)]
pub enum IncludeError {
    /// An include chain looped back on itself. `chain` is the resolution
    /// stack at the moment the duplicate `path` was about to be pushed,
    /// in source-order (entry first, deepest last). `include_site` is the
    /// range of the offending `lex.include` annotation in its host file —
    /// useful for diagnostics that highlight the exact line.
    Cycle {
        include_site: Range,
        path: PathBuf,
        chain: Vec<PathBuf>,
    },
    /// The include depth exceeded [`ResolveConfig::max_depth`]. `chain`
    /// shows the resolution stack at the moment of failure, in source
    /// order. `include_site` is the range of the offending
    /// `lex.include` annotation in its host file.
    DepthExceeded {
        include_site: Range,
        limit: usize,
        chain: Vec<PathBuf>,
    },
    /// The total number of includes resolved across the document
    /// exceeded [`ResolveConfig::max_total_includes`]. Bounds adversarial
    /// fan-out (which `max_depth` alone does not). `include_site` is the
    /// `lex.include` annotation that pushed the count past the limit.
    TotalIncludesExceeded { include_site: Range, limit: usize },
    /// The included file's size exceeded the loader's configured limit.
    /// Surfaced by loaders that read from a real filesystem (FsLoader)
    /// to bound memory allocation per include. `include_site` is the
    /// offending annotation; `size` and `limit` are in bytes.
    FileTooLarge {
        include_site: Range,
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// A path resolved outside the configured [`ResolveConfig::root`].
    ///
    /// Produced both by the resolver's own lexical prefix check and when a
    /// loader reports [`LoadError::OutsideRoot`]; in the latter case `path`
    /// and `root` are the values the loader supplied.
    RootEscape { path: PathBuf, root: PathBuf },
    /// The include `src` was a platform-absolute filesystem path
    /// (e.g. Windows `C:\foo`, `\\server\share`, `\foo`). The spec
    /// forbids absolute filesystem paths from entering the
    /// resolution pipeline; the *root-absolute* form (leading `/`
    /// resolved against the includes root) is the only spec-allowed
    /// way to write a path that doesn't start from the host's
    /// directory. On Unix the only thing that's `Path::is_absolute()`
    /// is a leading `/`, which is consumed by the root-absolute
    /// branch first; this variant therefore only fires in practice
    /// for Windows-shaped absolute paths.
    AbsolutePath { path: PathBuf },
    /// The loader could not find the included file (it reported
    /// [`LoadError::NotFound`]). Other load failures surface as
    /// [`IncludeError::LoaderIo`], [`IncludeError::FileTooLarge`] or
    /// [`IncludeError::RootEscape`]. `include_site` is the range of the
    /// offending `lex.include` annotation in its host file, so editors can
    /// squiggle the line that asked for the missing file rather than the
    /// document head.
    NotFound { include_site: Range, path: PathBuf },
    /// The loader returned text that the parser rejected. Also used by
    /// `resolve_from_source` when the entry source fails to parse or when
    /// the final annotation-attachment pass fails; `path` is then the entry
    /// path (empty when none was given).
    ParseFailed { path: PathBuf, message: String },
    /// The included file's content is not legal in the include site's
    /// parent container.
    ///
    /// Today this only occurs when an included file has top-level Sessions
    /// and the include site is inside a `GeneralContainer` (Definition,
    /// ListItem, or another Annotation's body). The `violation` field
    /// names the offending content kind (e.g. `"Sessions"`) so future
    /// container/policy combinations can reuse this variant without a
    /// breaking change.
    ContainerPolicy {
        include_site: Range,
        container: &'static str,
        file: PathBuf,
        violation: &'static str,
    },
    /// Loader propagated a non-`NotFound` I/O error
    /// ([`LoadError::Io`]); `path` and `message` are passed through
    /// unchanged.
    LoaderIo { path: PathBuf, message: String },
    /// `lex.include` annotation was missing the mandatory `src=` parameter.
    MissingSrc { include_site: Range },
}
260
261impl std::fmt::Display for IncludeError {
262 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
263 match self {
264 IncludeError::Cycle { path, chain, .. } => {
265 let chain_display: Vec<String> =
266 chain.iter().map(|p| p.display().to_string()).collect();
267 write!(
268 f,
269 "include cycle: {} (chain: {})",
270 path.display(),
271 chain_display.join(" -> ")
272 )
273 }
274 IncludeError::DepthExceeded { limit, chain, .. } => {
275 let chain_display: Vec<String> =
276 chain.iter().map(|p| p.display().to_string()).collect();
277 write!(
278 f,
279 "include depth exceeded limit of {limit} (chain: {})",
280 chain_display.join(" -> ")
281 )
282 }
283 IncludeError::TotalIncludesExceeded { limit, .. } => {
284 write!(f, "total include count exceeded limit of {limit}")
285 }
286 IncludeError::FileTooLarge {
287 path, size, limit, ..
288 } => {
289 write!(
290 f,
291 "included file {} is {size} bytes, exceeds limit of {limit} bytes",
292 path.display()
293 )
294 }
295 IncludeError::RootEscape { path, root } => write!(
296 f,
297 "include path {} escapes resolution root {}",
298 path.display(),
299 root.display()
300 ),
301 IncludeError::AbsolutePath { path } => write!(
302 f,
303 "include src {} is a platform-absolute path; \
304 the spec forbids absolute filesystem paths — use a relative path \
305 (chapters/01.lex) or a root-absolute path (/shared/01.lex)",
306 path.display()
307 ),
308 IncludeError::NotFound { path, .. } => {
309 write!(f, "include not found: {}", path.display())
310 }
311 IncludeError::ParseFailed { path, message } => {
312 write!(f, "failed to parse {}: {message}", path.display())
313 }
314 IncludeError::ContainerPolicy {
315 container,
316 file,
317 violation,
318 ..
319 } => write!(
320 f,
321 "included file {} contains {} but include site is inside {} \
322 (which does not allow {})",
323 file.display(),
324 violation,
325 container,
326 violation
327 ),
328 IncludeError::LoaderIo { path, message } => {
329 write!(f, "loader error reading {}: {message}", path.display())
330 }
331 IncludeError::MissingSrc { .. } => {
332 write!(f, "lex.include annotation missing required src= parameter")
333 }
334 }
335 }
336}
337
338impl std::error::Error for IncludeError {}
339
340// No `From<LoadError>` impl: `IncludeError::NotFound` carries the include
341// site (the `lex.include` annotation's range), which a loader doesn't know
342// about. Callers map `LoadError` explicitly at the call site, where the
343// site is available.
344
/// The kind of container an include site lives in. Drives the splice-time
/// policy check (currently just "no Sessions inside a `GeneralContainer`").
#[derive(Debug, Clone, Copy)]
enum ContainerKind {
    /// `Document.root.children` or `Session.children` — anything goes.
    Session,
    /// `Definition.children` — a `GeneralContainer`.
    Definition,
    /// `Annotation.children` — a `GeneralContainer`.
    AnnotationBody,
    /// `ListItem.children` — a `GeneralContainer`.
    ListItem,
}

impl ContainerKind {
    /// Human-readable label used in policy-violation errors.
    fn name(self) -> &'static str {
        match self {
            Self::Session => "Session",
            Self::Definition => "Definition",
            Self::AnnotationBody => "Annotation body",
            Self::ListItem => "ListItem",
        }
    }

    /// Whether spliced content may contain Sessions in this container.
    /// Only the session-like container accepts them.
    fn allows_sessions(self) -> bool {
        match self {
            Self::Session => true,
            Self::Definition | Self::AnnotationBody | Self::ListItem => false,
        }
    }
}
373
374/// Resolve `:: lex.include ::` annotations starting from `source`, recursively.
375///
376/// `source_path` identifies the entry-point file. It is used to (a) resolve
377/// relative include paths against the entry file's directory, (b) stamp
378/// `Range.origin_path` on every node so downstream code (file-ref resolution,
379/// diagnostics, LSP goto) can report locations against the authoring file,
380/// and (c) seed the cycle-detection chain so an include cycle that loops
381/// back to the entry is caught. When `None`, relative paths resolve against
382/// `config.root`, origin stamping is skipped on the entry, and the chain
383/// starts empty.
384///
385/// # Pre/post-attachment
386///
387/// Internally this re-parses each source (entry + every loaded file) *without*
388/// annotation attachment so `lex.include` annotations are visible as standalone
389/// children where the splice can replace them in-place. After all splices,
390/// [`AttachAnnotations`] runs once on the merged tree, which lands the include
391/// annotation on the first spliced node by the standard "attach to next
392/// sibling" rule. This matches the textual paste mental model from the proposal.
393///
394/// # Recursion
395///
396/// Each loaded file is fully resolved (its own includes replaced) *before*
397/// being spliced into the host. The recursion uses each file's own directory
398/// as `host_dir`, so a relative path inside an included file resolves from
399/// that file's location — not the entry's. An active-chain stack of
400/// canonicalized paths gates against cycles; the depth counter gates against
401/// pathological nesting (default 8, configurable via [`ResolveConfig::max_depth`]).
402pub fn resolve_from_source(
403 source: &str,
404 source_path: Option<PathBuf>,
405 config: &ResolveConfig,
406 loader: &dyn Loader,
407) -> Result<Document, IncludeError> {
408 let entry_origin = source_path.as_ref().map(|p| Arc::new(p.clone()));
409 let host_dir = source_path
410 .as_ref()
411 .and_then(|p| p.parent().map(Path::to_path_buf))
412 .unwrap_or_else(|| config.root.clone());
413
414 let mut doc = parse_no_attach(source).map_err(|message| IncludeError::ParseFailed {
415 path: source_path.clone().unwrap_or_default(),
416 message,
417 })?;
418
419 if let Some(origin) = entry_origin.as_ref() {
420 stamp_doc(&mut doc, origin);
421 }
422
423 // Seed the chain with the lexically-normalized entry path (when known)
424 // so an include that loops back to the entry is detected as a cycle.
425 // Normalization here is essential — `target_path` values produced by
426 // `resolve_path` are also lexically normalized, so an unnormalized
427 // entry would never compare equal to its normalized self.
428 let mut chain: Vec<PathBuf> = source_path
429 .as_ref()
430 .map(|p| vec![lexical_normalize(p)])
431 .unwrap_or_default();
432 let mut state = ResolverState {
433 config,
434 loader,
435 chain: &mut chain,
436 depth: 0,
437 total_resolved: 0,
438 };
439
440 splice_in_session_container(doc.root.children.as_mut_vec(), &host_dir, &mut state)?;
441
442 let doc = AttachAnnotations::new()
443 .run(doc)
444 .map_err(|e| IncludeError::ParseFailed {
445 path: source_path.unwrap_or_default(),
446 message: format!("annotation attachment failed: {e}"),
447 })?;
448
449 Ok(doc)
450}
451
452// ============================================================================
453// Splicing
454// ============================================================================
455
/// Per-resolution state threaded through the recursive walker. Keeps the
/// signatures of the splice/process functions short and ensures
/// `chain`/`depth` are updated in lock-step (push/pop, +1/back-out) at
/// each include site.
struct ResolverState<'a> {
    /// Limits and resolution root for this run.
    config: &'a ResolveConfig,
    /// The only channel through which included sources are obtained.
    loader: &'a dyn Loader,
    /// Active resolution stack: identities of the files currently being
    /// resolved. Pushed when we begin walking a loaded file and popped
    /// when its tree is fully resolved. A load whose identity is already
    /// on the stack is a cycle.
    ///
    /// Two kinds of identity end up here. The entry path (when known) is
    /// seeded in *lexically normalized* form by `resolve_from_source`;
    /// every later element is the `canonical_path` the loader returned for
    /// an included file. The resolver itself never touches `std::fs`, so
    /// any symlink/case-fold canonicalization is the loader's job. A cycle
    /// back to the entry is therefore only recognised when the loader's
    /// canonical path for the entry equals the seeded normalized path.
    chain: &'a mut Vec<PathBuf>,
    /// Number of include hops from the entry point. Each recursion into a
    /// loaded file increments by 1. Hitting `config.max_depth` is an error.
    depth: usize,
    /// Total includes resolved across the entire walk (depth × breadth).
    /// Incremented on every successful load. Hitting
    /// `config.max_total_includes` aborts with `TotalIncludesExceeded` —
    /// caps adversarial fan-out that `max_depth` alone wouldn't catch.
    total_resolved: usize,
}
485
486fn splice_in_session_container(
487 children: &mut Vec<ContentItem>,
488 host_dir: &Path,
489 state: &mut ResolverState<'_>,
490) -> Result<(), IncludeError> {
491 // Post-order: recurse into nested containers first, splice this
492 // container's includes second. The recurse step walks the *original*
493 // tree; the splice step inserts already-fully-resolved content
494 // (recursion happens inside `process_includes`), which is therefore
495 // never re-walked.
496 recurse_into_children(children, host_dir, state)?;
497 process_includes(children, host_dir, state, ContainerKind::Session)
498}
499
500fn splice_in_general_container(
501 container: &mut GeneralContainer,
502 host_dir: &Path,
503 state: &mut ResolverState<'_>,
504 kind: ContainerKind,
505) -> Result<(), IncludeError> {
506 recurse_into_children(container.as_mut_vec(), host_dir, state)?;
507 process_includes(container.as_mut_vec(), host_dir, state, kind)
508}
509
510// Allow &mut Vec because `splice` needs Vec-specific operations.
511#[allow(clippy::ptr_arg)]
512fn process_includes(
513 children: &mut Vec<ContentItem>,
514 host_dir: &Path,
515 state: &mut ResolverState<'_>,
516 kind: ContainerKind,
517) -> Result<(), IncludeError> {
518 // Collect indices of standalone include annotations in this container.
519 let include_indices: Vec<usize> = children
520 .iter()
521 .enumerate()
522 .filter_map(|(i, item)| match item {
523 ContentItem::Annotation(a) if a.is_include() => Some(i),
524 _ => None,
525 })
526 .collect();
527
528 // Process in reverse order so earlier indices stay valid.
529 for i in include_indices.into_iter().rev() {
530 let annotation = match &children[i] {
531 ContentItem::Annotation(a) => a.clone(),
532 _ => unreachable!("index came from include filter"),
533 };
534
535 let splice_items = resolve_one_include(&annotation, host_dir, state, kind)?;
536
537 // Replace the include annotation with the splice content.
538 // The annotation itself stays in the children list immediately
539 // before the splice, so the post-resolution AttachAnnotations
540 // pass moves it onto the first spliced node by the standard
541 // "attach to next sibling" rule.
542 let mut replacement = Vec::with_capacity(splice_items.len() + 1);
543 replacement.push(ContentItem::Annotation(annotation));
544 replacement.extend(splice_items);
545 children.splice(i..=i, replacement);
546 }
547
548 Ok(())
549}
550
/// Resolve a single include annotation: path → load → parse → recurse →
/// stamp → policy-check → splice list.
///
/// The recursion happens *here*: after parsing the loaded file, we walk
/// its tree with the loaded file's own directory as `host_dir`, with the
/// loaded file pushed onto `state.chain` and `state.depth` bumped by 1.
/// When this call returns, the splice list is fully resolved and ready to
/// be inserted into the host container.
///
/// # Errors
///
/// - [`IncludeError::MissingSrc`] when the annotation has no `src`.
/// - Whatever `resolve_path` rejects (absolute path, root escape).
/// - [`IncludeError::DepthExceeded`] / [`IncludeError::TotalIncludesExceeded`]
///   when a limit is hit; both are checked before the loader is called.
/// - A mapped [`LoadError`] when the loader fails.
/// - [`IncludeError::Cycle`] when the loaded file is already on the chain.
/// - [`IncludeError::ParseFailed`] when the loaded text does not parse.
/// - Any error from resolving the loaded file's own includes.
/// - [`IncludeError::ContainerPolicy`] when the resolved content is not
///   allowed in `parent_kind`.
fn resolve_one_include(
    annotation: &crate::lex::ast::elements::annotation::Annotation,
    host_dir: &Path,
    state: &mut ResolverState<'_>,
    parent_kind: ContainerKind,
) -> Result<Vec<ContentItem>, IncludeError> {
    let src = annotation
        .include_src()
        .ok_or_else(|| IncludeError::MissingSrc {
            include_site: annotation.location.clone(),
        })?;

    let target_path = resolve_path(&src, host_dir, &state.config.root)?;

    // Depth check before the loader is consulted. `state.depth` counts the
    // include hops already taken to reach the host file (0 for the entry);
    // resolving this site would take one more. Sites whose host is at
    // depth `0..max_depth` are allowed, so chains may be at most
    // `max_depth` hops long; a site whose host already sits at
    // `max_depth` is rejected.
    if state.depth >= state.config.max_depth {
        return Err(IncludeError::DepthExceeded {
            include_site: annotation.location.clone(),
            limit: state.config.max_depth,
            chain: state.chain.clone(),
        });
    }

    // Total-count check before loading. Caps fan-out — a doc with
    // 100k top-level includes would blow past max_total_includes long
    // before max_depth would catch anything.
    if state.total_resolved >= state.config.max_total_includes {
        return Err(IncludeError::TotalIncludesExceeded {
            include_site: annotation.location.clone(),
            limit: state.config.max_total_includes,
        });
    }

    // Load via the injected loader. The loader returns the source plus
    // a *canonical* identity for the resource — for FsLoader that's
    // post-`fs::canonicalize` (symlinks resolved, case-folded on
    // case-insensitive FS); for MemoryLoader it's the lookup key. We
    // use the canonical path for cycle detection so a symlink loop or
    // a case-folded re-include is caught here rather than slipping
    // through to `max_depth`.
    //
    // Loader errors are mapped explicitly because only this call site
    // knows the include annotation's range.
    let LoadedFile {
        source: target_source,
        canonical_path,
    } = state.loader.load(&target_path).map_err(|e| match e {
        LoadError::NotFound { path } => IncludeError::NotFound {
            include_site: annotation.location.clone(),
            path,
        },
        LoadError::OutsideRoot { path, root } => IncludeError::RootEscape { path, root },
        LoadError::TooLarge { path, size, limit } => IncludeError::FileTooLarge {
            include_site: annotation.location.clone(),
            path,
            size,
            limit,
        },
        LoadError::Io { path, message } => IncludeError::LoaderIo { path, message },
    })?;
    // Counted as soon as the load succeeds, even if a later step
    // (cycle check, parse, policy) rejects the file.
    state.total_resolved += 1;

    // Cycle check uses the canonical path so symlink/case-fold cycles
    // are caught even though `target_path` (which we used for the load
    // request) was just lexically resolved.
    if state.chain.iter().any(|p| p == &canonical_path) {
        return Err(IncludeError::Cycle {
            include_site: annotation.location.clone(),
            path: canonical_path,
            chain: state.chain.clone(),
        });
    }

    let mut included =
        parse_no_attach(&target_source).map_err(|message| IncludeError::ParseFailed {
            path: canonical_path.clone(),
            message,
        })?;

    // Stamp before recursing: nodes spliced in from deeper files have
    // already been stamped with their own origin by the nested call.
    // NOTE(review): this relies on the order only; whether `stamp_doc`
    // would overwrite an existing origin is not visible here.
    let target_origin = Arc::new(canonical_path.clone());
    stamp_doc(&mut included, &target_origin);

    // Recursively resolve includes inside the loaded file. The host_dir
    // for that walk is the loaded file's own canonical parent; the
    // chain gains the canonical path and depth bumps by 1 — both are
    // popped/restored on the way back so siblings see the same state.
    let included_dir = canonical_path
        .parent()
        .map(Path::to_path_buf)
        .unwrap_or_else(|| state.config.root.clone());

    state.chain.push(canonical_path.clone());
    let saved_depth = state.depth;
    state.depth = saved_depth + 1;
    let recurse_result =
        splice_in_session_container(included.root.children.as_mut_vec(), &included_dir, state);
    // Restore chain/depth before propagating any error so the state is
    // consistent on both the success and the failure path.
    state.depth = saved_depth;
    state.chain.pop();
    recurse_result?;

    let splice_items = prepare_splice_list(included);
    validate_against_kind(
        &splice_items,
        parent_kind,
        &annotation.location,
        &canonical_path,
    )?;

    Ok(splice_items)
}
668
669#[allow(clippy::ptr_arg)]
670fn recurse_into_children(
671 children: &mut Vec<ContentItem>,
672 host_dir: &Path,
673 state: &mut ResolverState<'_>,
674) -> Result<(), IncludeError> {
675 for item in children.iter_mut() {
676 match item {
677 ContentItem::Session(s) => {
678 splice_in_session_container(s.children.as_mut_vec(), host_dir, state)?;
679 }
680 ContentItem::Definition(d) => {
681 splice_in_general_container(
682 &mut d.children,
683 host_dir,
684 state,
685 ContainerKind::Definition,
686 )?;
687 }
688 ContentItem::Annotation(a) if !a.is_include() => {
689 splice_in_general_container(
690 &mut a.children,
691 host_dir,
692 state,
693 ContainerKind::AnnotationBody,
694 )?;
695 }
696 ContentItem::List(l) => {
697 for li in l.items.as_mut_vec().iter_mut() {
698 if let ContentItem::ListItem(item) = li {
699 splice_in_general_container(
700 &mut item.children,
701 host_dir,
702 state,
703 ContainerKind::ListItem,
704 )?;
705 }
706 }
707 }
708 _ => {}
709 }
710 }
711 Ok(())
712}
713
714fn prepare_splice_list(mut included: Document) -> Vec<ContentItem> {
715 let mut items: Vec<ContentItem> = Vec::new();
716
717 // Document title → Paragraph, prepended.
718 // Equivalent to what a textual paste would parse (an unindented line
719 // becomes a paragraph in the host's context). Per the revised
720 // spec §5.2 this is "do nothing" semantics — converting matches what
721 // the parser would do if the included source were inlined and reparsed.
722 if let Some(title) = included.title {
723 let location = title.location.clone();
724 let para = Paragraph::from_line(title.as_str().to_string()).at(location);
725 items.push(ContentItem::Paragraph(para));
726 }
727
728 // Document-level annotations → regular annotations, prepended.
729 for ann in included.annotations {
730 items.push(ContentItem::Annotation(ann));
731 }
732
733 // Body of the included document.
734 items.append(included.root.children.as_mut_vec());
735
736 items
737}
738
739fn validate_against_kind(
740 items: &[ContentItem],
741 kind: ContainerKind,
742 site: &Range,
743 file: &Path,
744) -> Result<(), IncludeError> {
745 if kind.allows_sessions() {
746 return Ok(());
747 }
748 if items.iter().any(|i| matches!(i, ContentItem::Session(_))) {
749 return Err(IncludeError::ContainerPolicy {
750 include_site: site.clone(),
751 container: kind.name(),
752 file: file.to_path_buf(),
753 violation: "Sessions",
754 });
755 }
756 Ok(())
757}
758
759// ============================================================================
760// Path resolution
761// ============================================================================
762
763/// Resolve a file-reference target string the same way the include
764/// resolver resolves include paths.
765///
766/// Use this when consuming `ReferenceType::File { target }` (or any other
767/// node-attached path) so that relative paths resolve from the *authoring*
768/// file's directory, not from wherever the merged document happens to be
769/// rooted. Pass `ref_origin` as the [`Range::origin_path`] of the inline's
770/// containing node (or `None` if the node was never stamped — in that case
771/// the path is treated as if authored at the root).
772///
773/// Behaviour matches the include resolver:
774/// - Root-absolute targets (leading `/`) resolve under `root`.
775/// - Other targets resolve relative to `ref_origin`'s parent (or `root`
776/// when `ref_origin` is `None`).
777/// - The result is lexically normalized and checked against `root` —
778/// paths that escape it return `RootEscape`.
779///
780/// This is a sister to the resolver's internal `resolve_path` and shares
781/// the same lexical-normalization caveat: it does not touch the filesystem.
782pub fn resolve_file_reference(
783 target: &str,
784 ref_origin: Option<&Path>,
785 root: &Path,
786) -> Result<PathBuf, IncludeError> {
787 let host_dir: PathBuf = ref_origin
788 .and_then(|p| p.parent())
789 .map(Path::to_path_buf)
790 .unwrap_or_else(|| root.to_path_buf());
791 resolve_path(target, &host_dir, root)
792}
793
794fn resolve_path(src: &str, host_dir: &Path, root: &Path) -> Result<PathBuf, IncludeError> {
795 let candidate = if let Some(rel) = src.strip_prefix('/') {
796 // Root-absolute (Lex spec convention): leading `/` means "from
797 // the resolution root", not "filesystem root".
798 root.join(rel)
799 } else {
800 // Anything else must be a relative path. Reject inputs the
801 // host platform would treat as absolute (Windows `C:\foo`,
802 // `\\server\share`, `\foo`) up front: the spec forbids
803 // platform-absolute paths from entering the resolution
804 // pipeline. Without this, `host_dir.join(src)` would silently
805 // discard `host_dir` because Rust's `PathBuf::join` replaces
806 // the base when the joined path is absolute. The downstream
807 // root-escape check would still catch the security side, but
808 // we'd surface a misleading "escapes root" error instead of
809 // "absolute paths not allowed", and we'd be relying on
810 // `PathBuf::join`'s override semantics for the security
811 // outcome rather than holding the line at the input boundary.
812 if Path::new(src).is_absolute() {
813 return Err(IncludeError::AbsolutePath {
814 path: PathBuf::from(src),
815 });
816 }
817 host_dir.join(src)
818 };
819 let normalized = lexical_normalize(&candidate);
820 let canonical_root = lexical_normalize(root);
821 if !normalized.starts_with(&canonical_root) {
822 return Err(IncludeError::RootEscape {
823 path: normalized,
824 root: canonical_root,
825 });
826 }
827 Ok(normalized)
828}
829
/// Normalize a path purely lexically: drop `.` components and fold `..`
/// into the directory name that precedes it. The filesystem is never
/// consulted.
///
/// `std::fs::canonicalize` needs the path to exist, which rules it out for
/// tests driven by [`MemoryLoader`]. A lexical form is enough for
/// include-site resolution: the resolver only needs a stable identity for
/// cycle detection and a uniform shape for the root-escape prefix check.
///
/// A `..` cancels the previous component only when that component is a real
/// directory name (`Component::Normal`). If the buffer is empty, or ends in
/// another `..` or a root marker, the `..` is kept.
///
/// Keeping unmatched `..` components is what stops `../../etc/passwd` from
/// shrinking to `etc/passwd`: a bare `PathBuf::pop` would also strip a
/// trailing `..`, losing one level and yielding a path that wrongly appears
/// to sit under the root. With the `..` preserved, the normalized path
/// stays outside any sane root and the escape check fires.
fn lexical_normalize(p: &Path) -> PathBuf {
    use std::path::Component;

    p.components().fold(PathBuf::new(), |mut acc, part| {
        match part {
            // `.` never contributes anything.
            Component::CurDir => {}
            // `name/..` cancels out.
            Component::ParentDir
                if matches!(acc.components().next_back(), Some(Component::Normal(_))) =>
            {
                acc.pop();
            }
            // Nothing cancellable precedes it: keep the `..` visible.
            Component::ParentDir => acc.push(".."),
            // Prefix, root and ordinary names are copied through verbatim.
            kept => acc.push(kept.as_os_str()),
        }
        acc
    })
}
871
872// ============================================================================
873// Origin stamping
874// ============================================================================
875//
876// Walk every node in a Document and set `Range.origin_path` on each
877// `.location` field. The walk only stamps the *block-level* `.location`
878// fields here; finer-grained inline ranges land in PR 6 when file-ref
879// resolution starts consulting them.
880
881fn stamp_doc(doc: &mut Document, origin: &Arc<PathBuf>) {
882 if let Some(title) = doc.title.as_mut() {
883 title.location.origin_path = Some(Arc::clone(origin));
884 }
885 for ann in doc.annotations.iter_mut() {
886 stamp_annotation(ann, origin);
887 }
888 stamp_session(&mut doc.root, origin);
889}
890
891fn stamp_session(s: &mut Session, origin: &Arc<PathBuf>) {
892 s.location.origin_path = Some(Arc::clone(origin));
893 if let Some(loc) = s.title.location.as_mut() {
894 loc.origin_path = Some(Arc::clone(origin));
895 }
896 for ann in s.annotations.iter_mut() {
897 stamp_annotation(ann, origin);
898 }
899 for item in s.children.as_mut_vec().iter_mut() {
900 stamp_item(item, origin);
901 }
902}
903
904fn stamp_annotation(
905 a: &mut crate::lex::ast::elements::annotation::Annotation,
906 origin: &Arc<PathBuf>,
907) {
908 a.location.origin_path = Some(Arc::clone(origin));
909 a.data.location.origin_path = Some(Arc::clone(origin));
910 for item in a.children.as_mut_vec().iter_mut() {
911 stamp_item(item, origin);
912 }
913}
914
915fn stamp_item(item: &mut ContentItem, origin: &Arc<PathBuf>) {
916 match item {
917 ContentItem::Session(s) => stamp_session(s, origin),
918 ContentItem::Annotation(a) => stamp_annotation(a, origin),
919 ContentItem::Paragraph(p) => {
920 p.location.origin_path = Some(Arc::clone(origin));
921 for ann in p.annotations.iter_mut() {
922 stamp_annotation(ann, origin);
923 }
924 for line in p.lines.iter_mut() {
925 stamp_item(line, origin);
926 }
927 }
928 ContentItem::List(l) => {
929 l.location.origin_path = Some(Arc::clone(origin));
930 for li in l.items.as_mut_vec().iter_mut() {
931 stamp_item(li, origin);
932 }
933 }
934 ContentItem::ListItem(li) => {
935 li.location.origin_path = Some(Arc::clone(origin));
936 for ann in li.annotations.iter_mut() {
937 stamp_annotation(ann, origin);
938 }
939 for child in li.children.as_mut_vec().iter_mut() {
940 stamp_item(child, origin);
941 }
942 }
943 ContentItem::Definition(d) => {
944 d.location.origin_path = Some(Arc::clone(origin));
945 for ann in d.annotations.iter_mut() {
946 stamp_annotation(ann, origin);
947 }
948 for child in d.children.as_mut_vec().iter_mut() {
949 stamp_item(child, origin);
950 }
951 }
952 ContentItem::VerbatimBlock(v) => {
953 v.location.origin_path = Some(Arc::clone(origin));
954 }
955 ContentItem::VerbatimLine(vl) => {
956 vl.location.origin_path = Some(Arc::clone(origin));
957 }
958 ContentItem::Table(t) => {
959 t.location.origin_path = Some(Arc::clone(origin));
960 }
961 ContentItem::TextLine(tl) => {
962 tl.location.origin_path = Some(Arc::clone(origin));
963 }
964 ContentItem::BlankLineGroup(b) => {
965 b.location.origin_path = Some(Arc::clone(origin));
966 }
967 }
968}
969
970// ============================================================================
971// Parser glue
972// ============================================================================
973
974/// Parse `source` into a Document but skip the annotation-attachment stage,
975/// so include annotations are findable in container children lists.
976fn parse_no_attach(source: &str) -> Result<Document, String> {
977 crate::lex::testing::parse_without_annotation_attachment(source)
978}
979
980// ============================================================================
981// Filesystem-backed loader
982// ============================================================================
983
/// [`Loader`] that reads files from the filesystem with `std::fs::read_to_string`.
///
/// This is the production loader used by the CLI; the LSP wraps it with a
/// file-watch invalidation layer in PR 8. lex-core's *resolver* code does not
/// reference `std::fs` — `FsLoader` is the one place where it does, isolated
/// behind the [`Loader`] trait so the rest of the crate stays sandbox- and
/// WASM-friendly.
///
/// `FsLoader` is constructed with the resolution root and rechecks every
/// load against it post-`fs::canonicalize`, so a symlink pointing outside
/// the root is rejected even though the lexical-only check in
/// [`resolve_path`] cannot see it. Also rejects non-regular files (devices,
/// FIFOs, directories) before reading, so the loader can't be tricked into
/// blocking on `/dev/zero` or allocating against an open device.
///
/// Errors map:
/// - target does not exist (missing file, broken symlink, …) →
///   [`LoadError::NotFound`]
/// - any other canonicalization failure (e.g. permission denied at a
///   parent) → [`LoadError::Io`]
/// - canonical path doesn't sit under canonical root → [`LoadError::OutsideRoot`]
/// - target is not a regular file → [`LoadError::Io`] with a clear message
/// - target is larger than the per-file size cap → [`LoadError::TooLarge`]
/// - any other I/O error during read → [`LoadError::Io`]
pub struct FsLoader {
    /// Filesystem-canonical resolution root, computed once in
    /// `FsLoader::new`. An empty root is read as the current directory.
    /// If canonicalization fails (e.g., the configured root doesn't exist
    /// on disk), the root is stored verbatim; such a root is not expected
    /// to prefix any canonical load path, so loads surface as
    /// `LoadError::OutsideRoot` instead of silently disabling the security
    /// check.
    canonical_root: PathBuf,
    /// Per-file size cap (bytes). Loads of larger files surface as
    /// `LoadError::TooLarge` before any bytes are read into memory.
    /// Default [`FsLoader::DEFAULT_MAX_FILE_SIZE`].
    max_file_size: u64,
}

impl FsLoader {
    /// Default per-file size cap: 10 MiB. Generous for realistic Lex
    /// source documents (text only) and tight enough to bound memory
    /// allocation per include against an adversarial 1 GB file.
    pub const DEFAULT_MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;

    /// Construct a loader rooted at `root` with default size limits.
    ///
    /// The loader stores `root`'s fs-canonical form (with symlinks
    /// resolved); subsequent loads validate that the requested path's
    /// canonical form lives under it.
    ///
    /// An empty `root` (what `Path::parent` yields for a bare file name)
    /// is treated as `.`. Left empty it could not be canonicalized, and an
    /// empty root is a prefix of *every* path, which would turn the bounds
    /// check into a no-op.
    pub fn new(root: PathBuf) -> Self {
        let root = if root.as_os_str().is_empty() {
            PathBuf::from(".")
        } else {
            root
        };
        // On failure keep the (non-empty) input verbatim: a relative or
        // nonexistent root is not expected to prefix a canonical path, so
        // the loader fails closed.
        let canonical_root = std::fs::canonicalize(&root).unwrap_or(root);
        Self {
            canonical_root,
            max_file_size: Self::DEFAULT_MAX_FILE_SIZE,
        }
    }

    /// Override the default per-file size cap (bytes). Use to widen the
    /// limit for projects with genuinely large source files, or tighten
    /// it for stricter sandboxes (e.g., LSPs serving untrusted content).
    pub fn with_max_file_size(mut self, max_file_size: u64) -> Self {
        self.max_file_size = max_file_size;
        self
    }
}
1045
1046impl Loader for FsLoader {
1047 fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
1048 // 1. Canonicalize. Resolves symlinks and `..` segments against the
1049 // real filesystem. NotFound / broken-symlink / permission errors
1050 // all surface here.
1051 let canonical_path = std::fs::canonicalize(path).map_err(|e| match e.kind() {
1052 std::io::ErrorKind::NotFound => LoadError::NotFound {
1053 path: path.to_path_buf(),
1054 },
1055 _ => LoadError::Io {
1056 path: path.to_path_buf(),
1057 message: e.to_string(),
1058 },
1059 })?;
1060
1061 // 2. Bounds check against the *canonical* root. This is the
1062 // actual security gate against symlink traversal — the lexical
1063 // check in resolve_path can't see through symlinks.
1064 if !canonical_path.starts_with(&self.canonical_root) {
1065 return Err(LoadError::OutsideRoot {
1066 path: canonical_path,
1067 root: self.canonical_root.clone(),
1068 });
1069 }
1070
1071 // 3. Reject non-regular files. Without this, an attacker (with
1072 // write access to the repo) could symlink an include target to
1073 // `/dev/zero` or a FIFO and block / OOM the reader. The
1074 // is_file() metadata call is a cheap sanity check.
1075 let meta = std::fs::metadata(&canonical_path).map_err(|e| LoadError::Io {
1076 path: canonical_path.clone(),
1077 message: e.to_string(),
1078 })?;
1079 if !meta.is_file() {
1080 return Err(LoadError::Io {
1081 path: canonical_path,
1082 message: "include target is not a regular file".to_string(),
1083 });
1084 }
1085
1086 // 4. Size cap. Bounds memory allocation per include against an
1087 // adversarial 1 GB file before any bytes hit the heap.
1088 let size = meta.len();
1089 if size > self.max_file_size {
1090 return Err(LoadError::TooLarge {
1091 path: canonical_path,
1092 size,
1093 limit: self.max_file_size,
1094 });
1095 }
1096
1097 // 5. Read. By this point we know the path is a regular file under
1098 // the canonical root and within the size cap; anything that
1099 // fails here is a real I/O error worth surfacing.
1100 let source = std::fs::read_to_string(&canonical_path).map_err(|e| LoadError::Io {
1101 path: canonical_path.clone(),
1102 message: e.to_string(),
1103 })?;
1104
1105 Ok(LoadedFile {
1106 source,
1107 canonical_path,
1108 })
1109 }
1110}
1111
1112// ============================================================================
1113// Test fixtures (test-support feature + cfg(test))
1114// ============================================================================
1115
/// In-memory [`Loader`]: a table from path to source text, for tests and
/// other callers that must not touch the filesystem.
#[cfg(any(test, feature = "test-support"))]
pub struct MemoryLoader {
    /// Registered sources, keyed by the exact path they were inserted under.
    files: std::collections::HashMap<PathBuf, String>,
}

#[cfg(any(test, feature = "test-support"))]
impl MemoryLoader {
    /// Create a loader with no files. Register files with
    /// [`MemoryLoader::insert`].
    pub fn new() -> Self {
        let files = std::collections::HashMap::new();
        Self { files }
    }

    /// Register `contents` as the source text of `path`, replacing any
    /// earlier entry for the same path. Returns `self` so calls can be
    /// chained.
    pub fn insert<P: Into<PathBuf>, S: Into<String>>(&mut self, path: P, contents: S) -> &mut Self {
        let key: PathBuf = path.into();
        let text: String = contents.into();
        self.files.insert(key, text);
        self
    }

    /// Convenience constructor: build a loader from any iterator of
    /// `(path, contents)` pairs. Pairs are inserted in iteration order, so
    /// a later pair for the same path wins.
    pub fn from_pairs<I, P, S>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (P, S)>,
        P: Into<PathBuf>,
        S: Into<String>,
    {
        pairs
            .into_iter()
            .fold(Self::new(), |mut loader, (path, contents)| {
                loader.insert(path, contents);
                loader
            })
    }
}

#[cfg(any(test, feature = "test-support"))]
impl Default for MemoryLoader {
    /// Same as [`MemoryLoader::new`]: an empty loader.
    fn default() -> Self {
        MemoryLoader::new()
    }
}
1159
#[cfg(any(test, feature = "test-support"))]
impl Loader for MemoryLoader {
    /// Look `path` up in the in-memory table.
    ///
    /// Returns a copy of the registered text, or [`LoadError::NotFound`]
    /// when nothing was registered under exactly this path.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
        // There are no symlinks in memory, so the lookup key doubles as the
        // canonical identity. The resolver's cycle detection compares
        // `LoadedFile::canonical_path` values, which in tests are the
        // lexically-normalized paths the resolver already produces.
        match self.files.get(path) {
            Some(text) => Ok(LoadedFile {
                source: text.clone(),
                canonical_path: path.to_path_buf(),
            }),
            None => Err(LoadError::NotFound {
                path: path.to_path_buf(),
            }),
        }
    }
}
1180
1181// ============================================================================
1182// Tests
1183// ============================================================================
1184
1185#[cfg(test)]
1186mod tests;