lex_core/lex/includes.rs
1//! Include resolution for Lex documents.
2//!
3//! This module turns `:: lex.include src="..." ::` annotations into spliced
4//! content from the referenced files. It is *opt-in*: callers that want the
5//! unresolved tree (the formatter, tree-sitter parity, editor tooling that
6//! displays include statements as authored) skip this pass entirely. The
7//! parser itself never touches the filesystem — all I/O goes through the
8//! injected [`Loader`] trait.
9//!
10//! See `comms/specs/proposals/includes.lex` for the full design.
11//!
12//! # Status
13//!
//! This module is being built up across PRs 3–7:
15//!
16//! - PR 3: skeleton — trait, config, errors, stub.
17//! - PR 4: single-pass splice + container-policy validation +
18//! doc-title/doc-annotation conversion + origin stamping + root-escape
19//! check.
20//! - PR 5: recursive resolution into included files + cycle detection
21//! (chain stack) + depth limit. Each loaded file gets walked in its OWN
22//! directory, so relative paths inside an included file resolve from
23//! that file's directory, not the entry's.
24//! - PR 6: origin-aware reference helpers. [`resolve_file_reference`]
25//! resolves a `ReferenceType::File` target from the authoring file's
26//! directory using `Range.origin_path`.
27//! `Document::find_annotation_by_label_in_origin` scopes footnote
28//! lookups to the file the reference was authored in.
29//! - PR 7 (this PR): [`FsLoader`] — production loader that reads from the
30//! filesystem with `std::fs::read_to_string`. CLI wires the resolver
31//! into `lex convert` and `lex inspect` (default-on, opt-out via
32//! `--no-includes`); `lex format` never expands.
33//!
34//! # Layering
35//!
36//! Of all of lex-core, only [`FsLoader`] references `std::fs`. The
37//! resolver itself does no I/O — it always goes through the [`Loader`]
38//! trait. Callers can swap loaders to keep the resolver sandboxed:
39//!
40//! - The LSP wraps [`FsLoader`] with file-watch invalidation (PR 8).
41//! - WASM builds provide a JS-backed loader instead of [`FsLoader`].
42//! - Tests use [`MemoryLoader`] (gated behind `test-support`).
43//!
44//! For tests, lex-core itself ships [`MemoryLoader`] gated behind the
45//! `test-support` cargo feature. It is not intended for production use.
46
47// `IncludeError` carries diagnostic context (paths, source ranges,
48// handler messages) on every variant; the `result_large_err` lint
49// would have us box the whole error or split it into a thinner shape
50// just to satisfy the size heuristic. The enum is already part of
51// the public API and the error path is rare; suppress the lint for
52// this module rather than churn the public surface.
53#![allow(clippy::result_large_err)]
54
55use crate::lex::assembling::AttachAnnotations;
56use crate::lex::ast::elements::container::GeneralContainer;
57use crate::lex::ast::elements::content_item::ContentItem;
58use crate::lex::ast::elements::session::Session;
59use crate::lex::ast::range::Range;
60use crate::lex::ast::Document;
61use crate::lex::transforms::Runnable;
62use lex_extension::handler::HandlerError;
63use lex_extension_host::registry::Registry;
64use std::path::{Path, PathBuf};
65use std::sync::Arc;
66
/// Tunables for the include resolution pass: the sandbox root plus the
/// depth and fan-out limits the resolver enforces.
#[derive(Debug, Clone)]
pub struct ResolveConfig {
    /// Directory every include path must resolve under. An include that
    /// canonicalizes outside this root is reported as
    /// [`IncludeError::RootEscape`].
    ///
    /// Must be an **absolute** path. Lexical normalization treats `.`
    /// and `..` against an empty buffer as no-ops, so a relative or
    /// unnormalized root weakens the root-escape prefix check. Callers
    /// (CLI, LSP) should canonicalize the root before building a
    /// `ResolveConfig`.
    pub root: PathBuf,
    /// Maximum include depth; defaults to
    /// [`ResolveConfig::DEFAULT_MAX_DEPTH`] (8). Reaching the limit is an
    /// error, never a silent truncation. The resolver additionally clamps
    /// this value to [`KERNEL_DEPTH_BACKSTOP`], so settings above the
    /// backstop have no further effect.
    pub max_depth: usize,
    /// Maximum total number of `lex.include` annotations resolved across
    /// the whole tree (depth × breadth); defaults to
    /// [`ResolveConfig::DEFAULT_MAX_TOTAL_INCLUDES`] (1000).
    ///
    /// `max_depth` bounds chain length but not breadth: a document with
    /// 100 thousand top-level includes at depth 1 stays inside
    /// `max_depth` yet can still OOM the resolver / LSP / CI. Reaching
    /// this limit is an error, never a silent truncation.
    pub max_total_includes: usize,
}

impl ResolveConfig {
    /// Default depth limit — deep enough for any reasonable atomization
    /// strategy (aggregator → per-chapter → per-section), shallow enough
    /// to keep the resolver's worst-case work predictable.
    pub const DEFAULT_MAX_DEPTH: usize = 8;

    /// Default total-include limit (DoS bound). Roomy enough for a
    /// book-length document made of thousands of small fragments, tight
    /// enough to contain adversarial fan-out within a few seconds of
    /// resolver work.
    pub const DEFAULT_MAX_TOTAL_INCLUDES: usize = 1000;

    /// Build a config for `root` with both limits set to their defaults.
    pub fn with_root(root: PathBuf) -> Self {
        let max_depth = Self::DEFAULT_MAX_DEPTH;
        let max_total_includes = Self::DEFAULT_MAX_TOTAL_INCLUDES;
        ResolveConfig {
            root,
            max_depth,
            max_total_includes,
        }
    }
}
114
/// A pluggable source-text loader.
///
/// Implementations decide where bytes come from (filesystem, in-memory map,
/// virtual filesystem, content-addressed store, …). The resolver never
/// calls `std::fs` itself; every read goes through this trait, which keeps
/// the resolver pure and usable in WASM, sandboxes, and unit tests.
pub trait Loader {
    /// Load the source text for `path` and return both the contents and a
    /// canonical identity for the loaded resource. The path is what the
    /// resolver decided on after applying the rules in §4 of the proposal.
    ///
    /// `LoadedFile::canonical_path` is the loader's authoritative identity
    /// for the resource. For [`FsLoader`] this is the filesystem-canonical
    /// path (symlinks resolved, case-folded if the underlying FS is
    /// case-insensitive); for [`MemoryLoader`] it's the lookup key (since
    /// memory loaders have no symlinks). The resolver uses this for cycle
    /// detection and for stamping `Range.origin_path` on the loaded tree.
    ///
    /// # Errors
    ///
    /// Returns a [`LoadError`] describing why the resource could not be
    /// produced (missing, outside the loader's boundary, too large, or an
    /// underlying I/O failure).
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError>;
}
134
/// Result of a successful [`Loader::load`]: the text that was read plus
/// the identity the loader assigns to the resource it came from.
#[derive(Debug, Clone)]
pub struct LoadedFile {
    /// The file's source text.
    pub source: String,
    /// The loader's authoritative identity for the resource. See
    /// [`Loader::load`] for how loaders decide this.
    pub canonical_path: PathBuf,
}
144
/// Failures a [`Loader`] can report back to the resolver.
#[derive(Debug, Clone)]
pub enum LoadError {
    /// Nothing exists at the requested path.
    NotFound { path: PathBuf },
    /// The resource exists but resolves outside the loader's allowed
    /// boundary. The lexical resolver normalizes `..` in the requested
    /// path, but loaders backed by a real filesystem must check again
    /// after canonicalization to catch symlinks that escape a boundary
    /// which lexically-correct paths cannot cross.
    OutsideRoot { path: PathBuf, root: PathBuf },
    /// The resource exists but is larger than the loader's configured
    /// limit. `size` and `limit` are in bytes. The resolver maps this to
    /// [`IncludeError::FileTooLarge`] with the offending annotation's site.
    TooLarge {
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// Underlying I/O error (or the virtual-filesystem equivalent).
    Io { path: PathBuf, message: String },
}

impl std::fmt::Display for LoadError {
    /// Render a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Io { path, message } => {
                write!(f, "io error reading {}: {message}", path.display())
            }
            Self::TooLarge { path, size, limit } => write!(
                f,
                "include file {} is {size} bytes, exceeds limit of {limit} bytes",
                path.display()
            ),
            Self::OutsideRoot { path, root } => write!(
                f,
                "include path {} resolves outside loader root {}",
                path.display(),
                root.display()
            ),
            Self::NotFound { path } => write!(f, "include not found: {}", path.display()),
        }
    }
}

impl std::error::Error for LoadError {}
191
/// Errors the include resolver can produce.
///
/// Most variants carry `include_site`, the range of the offending
/// annotation in its host file, so diagnostics can point at the line that
/// triggered the failure.
#[derive(Debug, Clone)]
pub enum IncludeError {
    /// An include chain looped back on itself. `chain` is the resolution
    /// stack at the moment the duplicate `path` was about to be pushed,
    /// in source-order (entry first, deepest last). `include_site` is the
    /// range of the offending `lex.include` annotation in its host file —
    /// useful for diagnostics that highlight the exact line.
    ///
    /// The resolver fills `path` and every `chain` entry with the origin
    /// path of the corresponding invocation site; a site without a stamped
    /// origin contributes an empty path.
    Cycle {
        include_site: Range,
        path: PathBuf,
        chain: Vec<PathBuf>,
    },
    /// The include depth exceeded [`ResolveConfig::max_depth`]. `chain`
    /// shows the resolution stack at the moment of failure, in source
    /// order. `include_site` is the range of the offending
    /// `lex.include` annotation in its host file.
    ///
    /// `limit` is the limit that actually fired: the lower of
    /// `max_depth` and [`KERNEL_DEPTH_BACKSTOP`].
    DepthExceeded {
        include_site: Range,
        limit: usize,
        chain: Vec<PathBuf>,
    },
    /// The total number of includes resolved across the document
    /// exceeded [`ResolveConfig::max_total_includes`]. Bounds adversarial
    /// fan-out (which `max_depth` alone does not). `include_site` is the
    /// `lex.include` annotation that pushed the count past the limit.
    TotalIncludesExceeded { include_site: Range, limit: usize },
    /// The included file's size exceeded the loader's configured limit.
    /// Surfaced by loaders that read from a real filesystem (FsLoader)
    /// to bound memory allocation per include. `include_site` is the
    /// offending annotation; `size` and `limit` are in bytes.
    FileTooLarge {
        include_site: Range,
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// A path resolved outside the configured [`ResolveConfig::root`].
    RootEscape { path: PathBuf, root: PathBuf },
    /// The include `src` was a platform-absolute filesystem path
    /// (e.g. Windows `C:\foo`, `\\server\share`, `\foo`). The spec
    /// forbids absolute filesystem paths from entering the
    /// resolution pipeline; the *root-absolute* form (leading `/`
    /// resolved against the includes root) is the only spec-allowed
    /// way to write a path that doesn't start from the host's
    /// directory. On Unix the only thing that's `Path::is_absolute()`
    /// is a leading `/`, which is consumed by the root-absolute
    /// branch first; this variant therefore only fires in practice
    /// for Windows-shaped absolute paths.
    AbsolutePath { path: PathBuf },
    /// The loader could not find or read the included file. `include_site`
    /// is the range of the offending `lex.include` annotation in its host
    /// file, so editors can squiggle the line that asked for the missing
    /// file rather than the document head.
    NotFound { include_site: Range, path: PathBuf },
    /// The loader returned text that the parser rejected.
    ///
    /// `resolve_from_source` also uses this variant when the entry source
    /// fails to parse and when the final annotation-attachment pass fails;
    /// in both cases `path` is the entry path, or empty when none was
    /// supplied.
    ParseFailed { path: PathBuf, message: String },
    /// The included file's content is not legal in the include site's
    /// parent container.
    ///
    /// Today this only occurs when an included file has top-level Sessions
    /// and the include site is inside a `GeneralContainer` (Definition,
    /// ListItem, or another Annotation's body). The `violation` field
    /// names the offending content kind (e.g. `"Sessions"`) so future
    /// container/policy combinations can reuse this variant without a
    /// breaking change.
    ContainerPolicy {
        include_site: Range,
        container: &'static str,
        file: PathBuf,
        violation: &'static str,
    },
    /// Loader propagated a non-`NotFound` I/O error.
    LoaderIo { path: PathBuf, message: String },
    /// `lex.include` annotation was missing the mandatory `src=` parameter.
    MissingSrc { include_site: Range },
    /// A registered handler returned an error the pass could not map
    /// onto a more specific variant — typically a third-party
    /// namespace's resolve hook surfacing an internal failure, or an
    /// unrecognised handler-defined code from `lex.*` built-ins. The
    /// `code` is the string identifier the registry attaches to the
    /// diagnostic (`"handler.internal"`, `"handler.custom"`, …).
    /// `label` names the annotation label whose handler failed.
    HandlerFailed {
        include_site: Range,
        label: String,
        code: String,
        message: String,
    },
}
281
282impl std::fmt::Display for IncludeError {
283 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
284 match self {
285 IncludeError::Cycle { path, chain, .. } => {
286 let chain_display: Vec<String> =
287 chain.iter().map(|p| p.display().to_string()).collect();
288 write!(
289 f,
290 "include cycle: {} (chain: {})",
291 path.display(),
292 chain_display.join(" -> ")
293 )
294 }
295 IncludeError::DepthExceeded { limit, chain, .. } => {
296 let chain_display: Vec<String> =
297 chain.iter().map(|p| p.display().to_string()).collect();
298 write!(
299 f,
300 "include depth exceeded limit of {limit} (chain: {})",
301 chain_display.join(" -> ")
302 )
303 }
304 IncludeError::TotalIncludesExceeded { limit, .. } => {
305 write!(f, "total include count exceeded limit of {limit}")
306 }
307 IncludeError::FileTooLarge {
308 path, size, limit, ..
309 } => {
310 write!(
311 f,
312 "included file {} is {size} bytes, exceeds limit of {limit} bytes",
313 path.display()
314 )
315 }
316 IncludeError::RootEscape { path, root } => write!(
317 f,
318 "include path {} escapes resolution root {}",
319 path.display(),
320 root.display()
321 ),
322 IncludeError::AbsolutePath { path } => write!(
323 f,
324 "include src {} is a platform-absolute path; \
325 the spec forbids absolute filesystem paths — use a relative path \
326 (chapters/01.lex) or a root-absolute path (/shared/01.lex)",
327 path.display()
328 ),
329 IncludeError::NotFound { path, .. } => {
330 write!(f, "include not found: {}", path.display())
331 }
332 IncludeError::ParseFailed { path, message } => {
333 write!(f, "failed to parse {}: {message}", path.display())
334 }
335 IncludeError::ContainerPolicy {
336 container,
337 file,
338 violation,
339 ..
340 } => write!(
341 f,
342 "included file {} contains {} but include site is inside {} \
343 (which does not allow {})",
344 file.display(),
345 violation,
346 container,
347 violation
348 ),
349 IncludeError::LoaderIo { path, message } => {
350 write!(f, "loader error reading {}: {message}", path.display())
351 }
352 IncludeError::MissingSrc { .. } => {
353 write!(f, "lex.include annotation missing required src= parameter")
354 }
355 IncludeError::HandlerFailed {
356 label,
357 code,
358 message,
359 ..
360 } => write!(f, "extension handler `{label}` failed ({code}): {message}"),
361 }
362 }
363}
364
365impl std::error::Error for IncludeError {}
366
367// No `From<LoadError>` impl: `IncludeError::NotFound` carries the include
368// site (the `lex.include` annotation's range), which a loader doesn't know
369// about. Callers map `LoadError` explicitly at the call site, where the
370// site is available.
371
/// The kind of container an include site lives in. The splice-time policy
/// check keys off this (today the only rule is "no Sessions inside a
/// `GeneralContainer`").
#[derive(Debug, Clone, Copy)]
enum ContainerKind {
    /// `Document.root.children` or `Session.children` — accepts everything.
    Session,
    /// `Definition.children` — `GeneralContainer`.
    Definition,
    /// `Annotation.children` — `GeneralContainer`.
    AnnotationBody,
    /// `ListItem.children` — `GeneralContainer`.
    ListItem,
}

impl ContainerKind {
    /// Human-readable container name used in container-policy errors.
    fn name(self) -> &'static str {
        match self {
            Self::ListItem => "ListItem",
            Self::AnnotationBody => "Annotation body",
            Self::Definition => "Definition",
            Self::Session => "Session",
        }
    }

    /// Whether spliced content may contain Sessions in this container.
    /// Only session-like containers allow them.
    fn allows_sessions(self) -> bool {
        match self {
            Self::Session => true,
            Self::Definition | Self::AnnotationBody | Self::ListItem => false,
        }
    }
}
400
/// Hard cap on resolution depth, applied even when the
/// configurable [`ResolveConfig::max_depth`] is set higher. Bounds
/// adversarial varying-position recursion (a handler that returns
/// content with a different invocation site each iteration so the
/// cycle key never matches) so the resolver always terminates.
///
/// The resolver enforces `min(config.max_depth, KERNEL_DEPTH_BACKSTOP)`
/// and reports that effective value in [`IncludeError::DepthExceeded`].
pub const KERNEL_DEPTH_BACKSTOP: usize = 32;
407
408/// Resolve every `hooks.resolve = true` labelled annotation starting
409/// from `source`, dispatching through `registry`, and recursively
410/// processing the spliced content.
411///
412/// `source_path` identifies the entry-point file. It is used to
413/// (a) stamp `Range.origin_path` on every node so downstream code
414/// (file-ref resolution, diagnostics, LSP goto) can report locations
415/// against the authoring file, and (b) provide the host directory
416/// the built-in `lex.include` handler resolves relative `src=` paths
417/// against (via `LabelCtx.node.origin`). When `None`, origin stamping
418/// is skipped on the entry and the handler resolves relative paths
419/// against `config.root`.
420///
421/// # Generic dispatch
422///
423/// Every label whose schema declares `hooks.resolve = true` flows
424/// through the same path: build a [`LabelCtx`] from the annotation,
425/// call [`Registry::dispatch_resolve_raw`], decode the returned
426/// [`WireNode`] back into typed [`ContentItem`]s via
427/// [`crate::lex::wire::from_wire_node`], and splice in place. The
428/// built-in `lex.include` handler is registered the same way as any
429/// third-party namespace.
430///
431/// # Pre/post-attachment
432///
433/// Internally this re-parses the entry source *without* annotation
434/// attachment so labelled annotations stay visible as standalone
435/// children. The handler does its own `parse_no_attach` for loaded
436/// content. After all splices, [`AttachAnnotations`] runs once on
437/// the merged tree.
438///
439/// # Recursion + cycle detection
440///
441/// Cycle detection keys on `(label, origin_path, start_position)` of
442/// the invocation site. A handler that returns content containing
443/// another invocation at the same source position is caught
444/// immediately. A handler that varies the invocation position each
445/// iteration terminates at `min(config.max_depth, KERNEL_DEPTH_BACKSTOP)`
446/// with `IncludeError::DepthExceeded`. The total-includes counter
447/// caps adversarial fan-out independent of depth.
448pub fn resolve_from_source(
449 source: &str,
450 source_path: Option<PathBuf>,
451 config: &ResolveConfig,
452 registry: &Registry,
453) -> Result<Document, IncludeError> {
454 let entry_origin = source_path.as_ref().map(|p| Arc::new(p.clone()));
455
456 let mut doc = parse_no_attach(source).map_err(|message| IncludeError::ParseFailed {
457 path: source_path.clone().unwrap_or_default(),
458 message,
459 })?;
460
461 if let Some(origin) = entry_origin.as_ref() {
462 stamp_doc(&mut doc, origin);
463 }
464
465 let mut chain: Vec<ResolveKey> = Vec::new();
466 let mut state = ResolverState {
467 config,
468 registry,
469 chain: &mut chain,
470 depth: 0,
471 total_resolved: 0,
472 };
473
474 splice_in_session_container(doc.root.children.as_mut_vec(), &mut state)?;
475
476 let doc = AttachAnnotations::new()
477 .run(doc)
478 .map_err(|e| IncludeError::ParseFailed {
479 path: source_path.unwrap_or_default(),
480 message: format!("annotation attachment failed: {e}"),
481 })?;
482
483 Ok(doc)
484}
485
486// ============================================================================
487// Splicing
488// ============================================================================
489
490/// One frame on the resolve-pass cycle stack. Two invocations at the
491/// same `(label, origin, start)` position are a cycle, regardless of
492/// what parameters either invocation uses — a handler that varies
493/// params per call (random IDs, timestamps) cannot defeat the
494/// detector by changing param values.
495#[derive(Debug, Clone, PartialEq)]
496struct ResolveKey {
497 label: String,
498 /// `Range.origin_path` of the annotation — the file the
499 /// invocation was authored in. `None` when stamping was skipped
500 /// (e.g., entry source loaded from a string with no path).
501 origin: Option<PathBuf>,
502 start: crate::lex::ast::range::Position,
503}
504
505impl ResolveKey {
506 fn from_annotation(a: &crate::lex::ast::elements::annotation::Annotation) -> Self {
507 Self {
508 label: a.data.label.value.clone(),
509 origin: a.location.origin_path.as_ref().map(|p| (**p).clone()),
510 start: a.location.start,
511 }
512 }
513}
514
/// Per-resolution state threaded through the recursive walker. Keeps the
/// signatures of the splice/process functions short and ensures
/// `chain`/`depth` are updated in lock-step (push/pop, +1/back-out) at
/// each invocation.
struct ResolverState<'a> {
    /// Root and limits for this pass.
    config: &'a ResolveConfig,
    /// Registry consulted for label schemas and used to dispatch resolve
    /// handlers.
    registry: &'a Registry,
    /// Active resolution stack of `(label, origin, position)` keys.
    /// Pushed when we begin dispatching for an invocation and popped
    /// when its splice subtree is fully resolved. A push that finds
    /// the same key already on the stack is a cycle.
    chain: &'a mut Vec<ResolveKey>,
    /// Number of dispatch hops from the entry point. Each recursion
    /// increments by 1. Hitting `config.max_depth` or the
    /// [`KERNEL_DEPTH_BACKSTOP`] (whichever is lower) is an error.
    depth: usize,
    /// Total invocations resolved across the entire walk
    /// (depth × breadth). Incremented each time a handler returns
    /// content to splice; invocations the handler leaves unexpanded are
    /// not counted. Hitting `config.max_total_includes` aborts with
    /// `TotalIncludesExceeded`.
    total_resolved: usize,
}
537
538fn splice_in_session_container(
539 children: &mut Vec<ContentItem>,
540 state: &mut ResolverState<'_>,
541) -> Result<(), IncludeError> {
542 // Post-order: recurse into nested containers first, splice this
543 // container's invocations second. Recursion happens inside
544 // `process_resolves` for any spliced subtree, so that subtree
545 // is never re-walked at the parent level.
546 recurse_into_children(children, state)?;
547 process_resolves(children, state, ContainerKind::Session)
548}
549
550fn splice_in_general_container(
551 container: &mut GeneralContainer,
552 state: &mut ResolverState<'_>,
553 kind: ContainerKind,
554) -> Result<(), IncludeError> {
555 recurse_into_children(container.as_mut_vec(), state)?;
556 process_resolves(container.as_mut_vec(), state, kind)
557}
558
559/// Walk the children of a container, dispatch every annotation whose
560/// schema declares `hooks.resolve = true` through the registry, and
561/// splice the returned content in place of the annotation. Recurses
562/// into the spliced content so nested invocations resolve too.
563// Allow &mut Vec because `splice` needs Vec-specific operations.
564#[allow(clippy::ptr_arg)]
565fn process_resolves(
566 children: &mut Vec<ContentItem>,
567 state: &mut ResolverState<'_>,
568 kind: ContainerKind,
569) -> Result<(), IncludeError> {
570 // Collect indices of annotations whose schema has hooks.resolve.
571 let resolve_indices: Vec<usize> = children
572 .iter()
573 .enumerate()
574 .filter_map(|(i, item)| match item {
575 ContentItem::Annotation(a) => {
576 let label = &a.data.label.value;
577 if state
578 .registry
579 .schema_for(label)
580 .map(|s| s.hooks.resolve)
581 .unwrap_or(false)
582 {
583 Some(i)
584 } else {
585 None
586 }
587 }
588 _ => None,
589 })
590 .collect();
591
592 for i in resolve_indices.into_iter().rev() {
593 let annotation = match &children[i] {
594 ContentItem::Annotation(a) => a.clone(),
595 _ => unreachable!("index came from resolve filter"),
596 };
597
598 match resolve_one_invocation(&annotation, state, kind)? {
599 ResolveOutcome::Spliced(splice_items) => {
600 // Replace the annotation with `[annotation, ...splice_items]`.
601 // The annotation itself stays in the children list immediately
602 // before the splice, so the post-resolution AttachAnnotations
603 // pass moves it onto the first spliced node by the standard
604 // "attach to next sibling" rule.
605 let mut replacement = Vec::with_capacity(splice_items.len() + 1);
606 replacement.push(ContentItem::Annotation(annotation));
607 replacement.extend(splice_items);
608 children.splice(i..=i, replacement);
609 }
610 ResolveOutcome::Unexpanded => {
611 // Handler opted out of expanding this invocation. The
612 // annotation stays in place, but its body wasn't
613 // walked by `recurse_into_children` (that walker
614 // skips resolve-hooked annotations to avoid double-
615 // resolution). Walk the body now so any nested
616 // invocations inside the unexpanded annotation get
617 // resolved on the way back up.
618 let mut owned = annotation;
619 splice_in_general_container(
620 &mut owned.children,
621 state,
622 ContainerKind::AnnotationBody,
623 )?;
624 children[i] = ContentItem::Annotation(owned);
625 }
626 }
627 }
628
629 Ok(())
630}
631
/// Outcome of dispatching a single resolve-hooked annotation. The
/// pass needs to distinguish between "handler returned content,
/// splice it in" and "handler opted out, leave the annotation
/// alone": the second case still requires walking the annotation's
/// body for nested invocations because `recurse_into_children`
/// otherwise skips resolve-hooked annotations to prevent double-
/// resolution.
enum ResolveOutcome {
    /// The handler returned content; these are the decoded items, with
    /// nested invocations already resolved, ready to be spliced into the
    /// parent container.
    Spliced(Vec<ContentItem>),
    /// The handler returned nothing to splice; the annotation stays in
    /// place and the caller must walk its body.
    Unexpanded,
}
643
644/// Dispatch a single resolve-hooked annotation through the registry,
645/// decode the returned `WireNode` back into typed children, then
646/// recursively walk the splice items so nested invocations resolve
647/// before the splice is placed into the parent container.
648///
649/// Returns [`ResolveOutcome::Unexpanded`] when the handler returned
650/// `Ok(None)` (third-party handlers can opt out of expanding a
651/// particular invocation). The caller is then responsible for
652/// walking the annotation's body for nested invocations — the
653/// resolve walker normally skips resolve-hooked annotations'
654/// bodies.
655fn resolve_one_invocation(
656 annotation: &crate::lex::ast::elements::annotation::Annotation,
657 state: &mut ResolverState<'_>,
658 parent_kind: ContainerKind,
659) -> Result<ResolveOutcome, IncludeError> {
660 let label = &annotation.data.label.value;
661 let key = ResolveKey::from_annotation(annotation);
662
663 // Cycle check on (label, origin, start) of the invocation site.
664 if state.chain.contains(&key) {
665 return Err(IncludeError::Cycle {
666 include_site: annotation.location.clone(),
667 path: key.origin.clone().unwrap_or_default(),
668 chain: state
669 .chain
670 .iter()
671 .map(|k| k.origin.clone().unwrap_or_default())
672 .collect(),
673 });
674 }
675
676 // Depth check. The effective limit is the lower of the
677 // user-facing `config.max_depth` (default 8) and the hard
678 // [`KERNEL_DEPTH_BACKSTOP`] (32, fixed). The kernel backstop
679 // exists for adversarial varying-position recursion that the
680 // cycle key can't catch — even if a user bumps `max_depth`
681 // higher than 32 for legitimate deep atomization, the backstop
682 // still terminates. The error reports `effective_depth_limit`
683 // (the actual cap that fired) rather than `config.max_depth`,
684 // so when the backstop is the binding limit the user sees `32`
685 // and not the (higher) config value.
686 let effective_depth_limit = state.config.max_depth.min(KERNEL_DEPTH_BACKSTOP);
687 if state.depth >= effective_depth_limit {
688 return Err(IncludeError::DepthExceeded {
689 include_site: annotation.location.clone(),
690 limit: effective_depth_limit,
691 chain: state
692 .chain
693 .iter()
694 .map(|k| k.origin.clone().unwrap_or_default())
695 .collect(),
696 });
697 }
698
699 // Total-count check before dispatch.
700 if state.total_resolved >= state.config.max_total_includes {
701 return Err(IncludeError::TotalIncludesExceeded {
702 include_site: annotation.location.clone(),
703 limit: state.config.max_total_includes,
704 });
705 }
706
707 let ctx = build_label_ctx(annotation);
708
709 let wire_node = match state.registry.dispatch_resolve_raw(&ctx) {
710 Ok(Some(node)) => node,
711 Ok(None) => {
712 // Handler returned "nothing to splice" — leave the
713 // annotation in place. The caller still needs to walk
714 // its body for nested invocations (built-in lex.include
715 // never returns None; this path is reachable only via
716 // third-party handlers that opt out per-invocation).
717 return Ok(ResolveOutcome::Unexpanded);
718 }
719 Err(handler_err) => {
720 return Err(handler_error_to_include_error(
721 &handler_err,
722 label,
723 &annotation.location,
724 ));
725 }
726 };
727
728 state.total_resolved += 1;
729
730 // Decode the wire payload into typed lex-core ContentItems.
731 let mut splice_items = decode_wire_to_items(&wire_node, label, &annotation.location)?;
732
733 // Recurse into the spliced subtree FIRST so nested resolve-hooked
734 // annotations are processed before the splice lands. Validation
735 // must wait until *after* this step: a nested invocation can
736 // splice in content (e.g. a top-level `Session` from a chained
737 // `lex.include`) that wasn't in the handler's original output,
738 // and the final shape is what has to satisfy the parent
739 // container's policy.
740 //
741 // The `IncludeError::ContainerPolicy.file` field describes the
742 // *spliced content's* source file (the file containing the
743 // disallowed shape), not the invocation site. Take it from the
744 // handler-returned wire payload's origin when present, falling
745 // back to the first decoded item's origin path if the wire
746 // payload didn't stamp a `Document` origin.
747 let included_path = wire_node_origin_pathbuf(&wire_node)
748 .or_else(|| splice_items_first_origin(&splice_items))
749 .unwrap_or_default();
750 state.chain.push(key);
751 let saved_depth = state.depth;
752 state.depth = saved_depth + 1;
753 let recurse_result = splice_in_session_container(&mut splice_items, state);
754 state.depth = saved_depth;
755 state.chain.pop();
756 recurse_result?;
757
758 // Container-policy validation: enforce no-Sessions inside
759 // `GeneralContainer` (Definition / Annotation body / ListItem).
760 // Runs against the post-recursion splice list so nested
761 // expansions can't smuggle disallowed shapes past the check.
762 validate_against_kind(
763 &splice_items,
764 parent_kind,
765 &annotation.location,
766 &included_path,
767 )?;
768
769 Ok(ResolveOutcome::Spliced(splice_items))
770}
771
772/// Build a [`LabelCtx`] from a lex-core [`Annotation`]. The body is
773/// derived from the annotation's children (parsed-Lex form), the
774/// params from `Annotation::data::parameters`, and the host node info
775/// from `Annotation::location`.
776fn build_label_ctx(
777 a: &crate::lex::ast::elements::annotation::Annotation,
778) -> lex_extension::wire::LabelCtx {
779 use crate::lex::wire::to_wire_node;
780 use lex_extension::wire::{AnnotationBody, LabelCtx, NodeRef};
781
782 let label = a.data.label.value.clone();
783 let params = {
784 // Pass *semantic* parameter values to handlers (quotes
785 // stripped, escape sequences resolved). Handlers consume
786 // params as JSON values, where there is no "quoted string"
787 // vs "unquoted token" distinction; only the decoded value
788 // is meaningful. The codec's `parameters_to_json` (used by
789 // `annotation_to_wire` for round-tripping annotation
790 // *content*) keeps the raw form to preserve source — the
791 // two paths intentionally differ.
792 let mut obj = serde_json::Map::with_capacity(a.data.parameters.len());
793 for p in &a.data.parameters {
794 obj.insert(p.key.clone(), serde_json::Value::String(p.unquoted_value()));
795 }
796 serde_json::Value::Object(obj)
797 };
798 let body = if a.children.is_empty() {
799 AnnotationBody::None
800 } else {
801 let wire_children: Vec<lex_extension::wire::WireNode> =
802 a.children.iter().map(to_wire_node).collect();
803 AnnotationBody::Lex {
804 children: wire_children,
805 }
806 };
807 let range = lex_extension::wire::Range::new(
808 lex_extension::wire::Position::new(
809 u32::try_from(a.location.start.line).unwrap_or(u32::MAX),
810 u32::try_from(a.location.start.column).unwrap_or(u32::MAX),
811 ),
812 lex_extension::wire::Position::new(
813 u32::try_from(a.location.end.line).unwrap_or(u32::MAX),
814 u32::try_from(a.location.end.column).unwrap_or(u32::MAX),
815 ),
816 );
817 let origin = a
818 .location
819 .origin_path
820 .as_ref()
821 .map(|p| p.to_string_lossy().into_owned());
822 LabelCtx {
823 label,
824 params,
825 body,
826 node: NodeRef {
827 kind: "annotation".into(),
828 range,
829 origin,
830 },
831 }
832}
833
834/// Convert a handler-returned [`WireNode`] back into a list of
835/// [`ContentItem`]s ready for splicing. `WireNode::Document` is
836/// unwrapped (its children become the splice list); any other root
837/// shape is wrapped as a single-item list.
838///
839/// `invocation_label` is the label whose handler produced `wire` —
840/// threaded through so wire-decode failures are attributed to the
841/// real namespace rather than a hardcoded `lex.include`. A
842/// third-party `acme.expand` handler that returns malformed wire
843/// will surface as `IncludeError::HandlerFailed { label:
844/// "acme.expand", .. }`.
845/// Lift a [`WireNode`]'s top-level `origin` field into a `PathBuf`
846/// when present. Used by the resolve pass to attribute
847/// container-policy errors to the *spliced content's* source file
848/// rather than the invocation site.
849fn wire_node_origin_pathbuf(node: &lex_extension::wire::WireNode) -> Option<PathBuf> {
850 use lex_extension::wire::WireNode as W;
851 let s = match node {
852 W::Document { origin, .. } => origin.as_deref(),
853 W::Session { origin, .. } => origin.as_deref(),
854 W::Definition { origin, .. } => origin.as_deref(),
855 W::Paragraph { origin, .. } => origin.as_deref(),
856 W::List { origin, .. } => origin.as_deref(),
857 W::Verbatim { origin, .. } => origin.as_deref(),
858 W::Table { origin, .. } => origin.as_deref(),
859 W::Annotation { origin, .. } => origin.as_deref(),
860 W::Blank { origin, .. } => origin.as_deref(),
861 _ => None,
862 };
863 s.map(PathBuf::from)
864}
865
866/// Fallback when `WireNode::Document.origin` is unset: walk the
867/// decoded splice list and return the first item that carries an
868/// origin. The interner from `from_wire_node` ensures every item
869/// shares one Arc per origin string, so iterating is cheap.
870fn splice_items_first_origin(items: &[ContentItem]) -> Option<PathBuf> {
871 for item in items {
872 let r = match item {
873 ContentItem::Paragraph(p) => &p.location,
874 ContentItem::Session(s) => &s.location,
875 ContentItem::Definition(d) => &d.location,
876 ContentItem::List(l) => &l.location,
877 ContentItem::ListItem(li) => &li.location,
878 ContentItem::Annotation(a) => &a.location,
879 ContentItem::VerbatimBlock(v) => &v.location,
880 ContentItem::VerbatimLine(vl) => &vl.location,
881 ContentItem::Table(t) => &t.location,
882 ContentItem::TextLine(tl) => &tl.location,
883 ContentItem::BlankLineGroup(blg) => &blg.location,
884 };
885 if let Some(arc) = r.origin_path.as_ref() {
886 return Some((**arc).clone());
887 }
888 }
889 None
890}
891
892fn decode_wire_to_items(
893 wire: &lex_extension::wire::WireNode,
894 invocation_label: &str,
895 include_site: &Range,
896) -> Result<Vec<ContentItem>, IncludeError> {
897 use crate::lex::wire::from_wire_node;
898
899 from_wire_node(wire).map_err(|e| IncludeError::HandlerFailed {
900 include_site: include_site.clone(),
901 label: invocation_label.to_string(),
902 code: "wire.decode".into(),
903 message: format!("decoding handler-returned wire payload failed: {e}"),
904 })
905}
906
907/// Map a [`HandlerError`] returned by the registry into the most
908/// specific [`IncludeError`] variant available. Codes in the
909/// `-32001..=-32005` range emitted by [`crate::lex::builtins::LexIncludeHandler`]
910/// translate back to their corresponding pre-extension-system
911/// variants so existing CLI/LSP error rendering and the integration
912/// test suite keep working unchanged. Unknown codes (third-party
913/// namespaces, future built-ins) surface as `HandlerFailed`.
914fn handler_error_to_include_error(
915 err: &HandlerError,
916 label: &str,
917 include_site: &Range,
918) -> IncludeError {
919 use crate::lex::builtins::include::{
920 CODE_ABSOLUTE_PATH, CODE_IO, CODE_MISSING_SRC, CODE_NOT_FOUND, CODE_OUTSIDE_ROOT,
921 CODE_PARSE_FAILED, CODE_TOO_LARGE,
922 };
923
924 match err {
925 HandlerError::Custom {
926 code,
927 message,
928 data,
929 } => match *code {
930 CODE_NOT_FOUND => IncludeError::NotFound {
931 include_site: include_site.clone(),
932 path: data_str(data, "path")
933 .map(PathBuf::from)
934 .unwrap_or_default(),
935 },
936 CODE_OUTSIDE_ROOT => IncludeError::RootEscape {
937 path: data_str(data, "path")
938 .map(PathBuf::from)
939 .unwrap_or_default(),
940 root: data_str(data, "root")
941 .map(PathBuf::from)
942 .unwrap_or_default(),
943 },
944 CODE_TOO_LARGE => IncludeError::FileTooLarge {
945 include_site: include_site.clone(),
946 path: data_str(data, "path")
947 .map(PathBuf::from)
948 .unwrap_or_default(),
949 size: data_u64(data, "size").unwrap_or(0),
950 limit: data_u64(data, "limit").unwrap_or(0),
951 },
952 CODE_ABSOLUTE_PATH => IncludeError::AbsolutePath {
953 path: data_str(data, "path")
954 .map(PathBuf::from)
955 .unwrap_or_default(),
956 },
957 CODE_IO => IncludeError::LoaderIo {
958 path: data_str(data, "path")
959 .map(PathBuf::from)
960 .unwrap_or_default(),
961 message: message.clone(),
962 },
963 CODE_MISSING_SRC => IncludeError::MissingSrc {
964 include_site: include_site.clone(),
965 },
966 CODE_PARSE_FAILED => IncludeError::ParseFailed {
967 path: data_str(data, "path")
968 .map(PathBuf::from)
969 .unwrap_or_default(),
970 message: data_str(data, "message").unwrap_or_else(|| message.clone()),
971 },
972 other => IncludeError::HandlerFailed {
973 include_site: include_site.clone(),
974 label: label.to_string(),
975 code: format!("handler.custom({other})"),
976 message: message.clone(),
977 },
978 },
979 HandlerError::Internal { message } => IncludeError::HandlerFailed {
980 include_site: include_site.clone(),
981 label: label.to_string(),
982 code: "handler.internal".into(),
983 message: message.clone(),
984 },
985 HandlerError::Unsupported { detail } => IncludeError::HandlerFailed {
986 include_site: include_site.clone(),
987 label: label.to_string(),
988 code: "handler.unsupported".into(),
989 message: detail.clone(),
990 },
991 }
992}
993
994fn data_str(data: &Option<serde_json::Value>, key: &str) -> Option<String> {
995 data.as_ref()?.get(key)?.as_str().map(str::to_string)
996}
997
998fn data_u64(data: &Option<serde_json::Value>, key: &str) -> Option<u64> {
999 data.as_ref()?.get(key)?.as_u64()
1000}
1001
1002#[allow(clippy::ptr_arg)]
1003fn recurse_into_children(
1004 children: &mut Vec<ContentItem>,
1005 state: &mut ResolverState<'_>,
1006) -> Result<(), IncludeError> {
1007 for item in children.iter_mut() {
1008 match item {
1009 ContentItem::Session(s) => {
1010 splice_in_session_container(s.children.as_mut_vec(), state)?;
1011 }
1012 ContentItem::Definition(d) => {
1013 splice_in_general_container(&mut d.children, state, ContainerKind::Definition)?;
1014 }
1015 ContentItem::Annotation(a) => {
1016 // Skip the body of annotations whose schema declares
1017 // `hooks.resolve = true` — those are dispatched at the
1018 // parent level by `process_resolves`. Walking their
1019 // bodies *here* would trip the resolve again on the
1020 // same invocation.
1021 //
1022 // The body is still walked when the resolve actually
1023 // runs: `process_resolves` calls
1024 // `resolve_one_invocation`, and the
1025 // [`ResolveOutcome::Spliced`] arm walks the splice
1026 // subtree (which replaces the annotation), while the
1027 // [`ResolveOutcome::Unexpanded`] arm explicitly
1028 // walks the kept annotation's body via
1029 // `splice_in_general_container`. So nested
1030 // resolve-hooked annotations inside an unexpanded
1031 // outer annotation are still reached.
1032 //
1033 // Non-resolve-hooked annotations recurse normally
1034 // here so their nested bodies get processed.
1035 let is_resolve_hooked = state
1036 .registry
1037 .schema_for(&a.data.label.value)
1038 .map(|s| s.hooks.resolve)
1039 .unwrap_or(false);
1040 if !is_resolve_hooked {
1041 splice_in_general_container(
1042 &mut a.children,
1043 state,
1044 ContainerKind::AnnotationBody,
1045 )?;
1046 }
1047 }
1048 ContentItem::List(l) => {
1049 for li in l.items.as_mut_vec().iter_mut() {
1050 if let ContentItem::ListItem(item) = li {
1051 splice_in_general_container(
1052 &mut item.children,
1053 state,
1054 ContainerKind::ListItem,
1055 )?;
1056 }
1057 }
1058 }
1059 _ => {}
1060 }
1061 }
1062 Ok(())
1063}
1064
1065fn validate_against_kind(
1066 items: &[ContentItem],
1067 kind: ContainerKind,
1068 site: &Range,
1069 file: &Path,
1070) -> Result<(), IncludeError> {
1071 if kind.allows_sessions() {
1072 return Ok(());
1073 }
1074 if items.iter().any(|i| matches!(i, ContentItem::Session(_))) {
1075 return Err(IncludeError::ContainerPolicy {
1076 include_site: site.clone(),
1077 container: kind.name(),
1078 file: file.to_path_buf(),
1079 violation: "Sessions",
1080 });
1081 }
1082 Ok(())
1083}
1084
1085// ============================================================================
1086// Path resolution
1087// ============================================================================
1088
1089/// Resolve a file-reference target string the same way the include
1090/// resolver resolves include paths.
1091///
1092/// Use this when consuming `ReferenceType::File { target }` (or any other
1093/// node-attached path) so that relative paths resolve from the *authoring*
1094/// file's directory, not from wherever the merged document happens to be
1095/// rooted. Pass `ref_origin` as the [`Range::origin_path`] of the inline's
1096/// containing node (or `None` if the node was never stamped — in that case
1097/// the path is treated as if authored at the root).
1098///
1099/// Behaviour matches the include resolver:
1100/// - Root-absolute targets (leading `/`) resolve under `root`.
1101/// - Other targets resolve relative to `ref_origin`'s parent (or `root`
1102/// when `ref_origin` is `None`).
1103/// - The result is lexically normalized and checked against `root` —
1104/// paths that escape it return `RootEscape`.
1105///
1106/// This is a sister to the resolver's internal `resolve_path` and shares
1107/// the same lexical-normalization caveat: it does not touch the filesystem.
1108pub fn resolve_file_reference(
1109 target: &str,
1110 ref_origin: Option<&Path>,
1111 root: &Path,
1112) -> Result<PathBuf, IncludeError> {
1113 let host_dir: PathBuf = ref_origin
1114 .and_then(|p| p.parent())
1115 .map(Path::to_path_buf)
1116 .unwrap_or_else(|| root.to_path_buf());
1117 resolve_path(target, &host_dir, root)
1118}
1119
1120fn resolve_path(src: &str, host_dir: &Path, root: &Path) -> Result<PathBuf, IncludeError> {
1121 let candidate = if let Some(rel) = src.strip_prefix('/') {
1122 // Root-absolute (Lex spec convention): leading `/` means "from
1123 // the resolution root", not "filesystem root".
1124 root.join(rel)
1125 } else {
1126 // Anything else must be a relative path. Reject inputs the
1127 // host platform would treat as absolute (Windows `C:\foo`,
1128 // `\\server\share`, `\foo`) up front: the spec forbids
1129 // platform-absolute paths from entering the resolution
1130 // pipeline. Without this, `host_dir.join(src)` would silently
1131 // discard `host_dir` because Rust's `PathBuf::join` replaces
1132 // the base when the joined path is absolute. The downstream
1133 // root-escape check would still catch the security side, but
1134 // we'd surface a misleading "escapes root" error instead of
1135 // "absolute paths not allowed", and we'd be relying on
1136 // `PathBuf::join`'s override semantics for the security
1137 // outcome rather than holding the line at the input boundary.
1138 if Path::new(src).is_absolute() {
1139 return Err(IncludeError::AbsolutePath {
1140 path: PathBuf::from(src),
1141 });
1142 }
1143 host_dir.join(src)
1144 };
1145 let normalized = lexical_normalize(&candidate);
1146 let canonical_root = lexical_normalize(root);
1147 if !normalized.starts_with(&canonical_root) {
1148 return Err(IncludeError::RootEscape {
1149 path: normalized,
1150 root: canonical_root,
1151 });
1152 }
1153 Ok(normalized)
1154}
1155
/// Lexical (no-filesystem) path normalization: drops `.` components and
/// resolves `..` against preceding directory names.
///
/// Filesystem-based canonicalization (`std::fs::canonicalize`) requires
/// the path to exist, which breaks tests that use [`MemoryLoader`]. The
/// lexical version is sufficient for include-site path resolution
/// because the resolver only needs a stable identity for cycle detection
/// and a uniform shape for the root-escape prefix check.
///
/// A `..` cancels the previous component only when that component is a
/// real directory name (`Component::Normal`). If the buffer is empty, or
/// ends in another `..` or a root marker, the `..` is *kept*.
///
/// Keeping unmatched `..` is what stops `../../etc/passwd` from
/// collapsing to `etc/passwd` and slipping past the root-escape check:
/// a bare `PathBuf::pop` would happily strip a trailing `..` (since
/// `Path::new("..").parent()` is `Some("")`), silently losing it and
/// producing a path that falsely starts with the root prefix. Each
/// preserved `..` keeps the normalized path outside any sane root, so
/// the escape check fires correctly.
fn lexical_normalize(p: &Path) -> PathBuf {
    use std::path::Component;

    p.components().fold(PathBuf::new(), |mut acc, part| {
        match part {
            // `.` contributes nothing.
            Component::CurDir => {}
            Component::ParentDir => {
                let ends_in_dir_name =
                    matches!(acc.components().next_back(), Some(Component::Normal(_)));
                if ends_in_dir_name {
                    acc.pop();
                } else {
                    acc.push("..");
                }
            }
            // Prefix, root and ordinary names are copied through as-is.
            kept => acc.push(kept.as_os_str()),
        }
        acc
    })
}
1197
1198// ============================================================================
1199// Origin stamping
1200// ============================================================================
1201//
1202// Walk every node in a Document and set `Range.origin_path` on each
1203// `.location` field. The walk only stamps the *block-level* `.location`
1204// fields here; finer-grained inline ranges land in PR 6 when file-ref
1205// resolution starts consulting them.
1206
1207pub(crate) fn stamp_doc(doc: &mut Document, origin: &Arc<PathBuf>) {
1208 if let Some(title) = doc.title.as_mut() {
1209 title.location.origin_path = Some(Arc::clone(origin));
1210 }
1211 for ann in doc.annotations.iter_mut() {
1212 stamp_annotation(ann, origin);
1213 }
1214 stamp_session(&mut doc.root, origin);
1215}
1216
1217fn stamp_session(s: &mut Session, origin: &Arc<PathBuf>) {
1218 s.location.origin_path = Some(Arc::clone(origin));
1219 if let Some(loc) = s.title.location.as_mut() {
1220 loc.origin_path = Some(Arc::clone(origin));
1221 }
1222 for ann in s.annotations.iter_mut() {
1223 stamp_annotation(ann, origin);
1224 }
1225 for item in s.children.as_mut_vec().iter_mut() {
1226 stamp_item(item, origin);
1227 }
1228}
1229
1230fn stamp_annotation(
1231 a: &mut crate::lex::ast::elements::annotation::Annotation,
1232 origin: &Arc<PathBuf>,
1233) {
1234 a.location.origin_path = Some(Arc::clone(origin));
1235 a.data.location.origin_path = Some(Arc::clone(origin));
1236 for item in a.children.as_mut_vec().iter_mut() {
1237 stamp_item(item, origin);
1238 }
1239}
1240
1241fn stamp_item(item: &mut ContentItem, origin: &Arc<PathBuf>) {
1242 match item {
1243 ContentItem::Session(s) => stamp_session(s, origin),
1244 ContentItem::Annotation(a) => stamp_annotation(a, origin),
1245 ContentItem::Paragraph(p) => {
1246 p.location.origin_path = Some(Arc::clone(origin));
1247 for ann in p.annotations.iter_mut() {
1248 stamp_annotation(ann, origin);
1249 }
1250 for line in p.lines.iter_mut() {
1251 stamp_item(line, origin);
1252 }
1253 }
1254 ContentItem::List(l) => {
1255 l.location.origin_path = Some(Arc::clone(origin));
1256 for li in l.items.as_mut_vec().iter_mut() {
1257 stamp_item(li, origin);
1258 }
1259 }
1260 ContentItem::ListItem(li) => {
1261 li.location.origin_path = Some(Arc::clone(origin));
1262 for ann in li.annotations.iter_mut() {
1263 stamp_annotation(ann, origin);
1264 }
1265 for child in li.children.as_mut_vec().iter_mut() {
1266 stamp_item(child, origin);
1267 }
1268 }
1269 ContentItem::Definition(d) => {
1270 d.location.origin_path = Some(Arc::clone(origin));
1271 for ann in d.annotations.iter_mut() {
1272 stamp_annotation(ann, origin);
1273 }
1274 for child in d.children.as_mut_vec().iter_mut() {
1275 stamp_item(child, origin);
1276 }
1277 }
1278 ContentItem::VerbatimBlock(v) => {
1279 v.location.origin_path = Some(Arc::clone(origin));
1280 }
1281 ContentItem::VerbatimLine(vl) => {
1282 vl.location.origin_path = Some(Arc::clone(origin));
1283 }
1284 ContentItem::Table(t) => {
1285 t.location.origin_path = Some(Arc::clone(origin));
1286 }
1287 ContentItem::TextLine(tl) => {
1288 tl.location.origin_path = Some(Arc::clone(origin));
1289 }
1290 ContentItem::BlankLineGroup(b) => {
1291 b.location.origin_path = Some(Arc::clone(origin));
1292 }
1293 }
1294}
1295
1296// ============================================================================
1297// Parser glue
1298// ============================================================================
1299
/// Parse `source` into a [`Document`] but skip the annotation-attachment
/// stage, so include annotations are findable in container children lists.
///
/// Thin wrapper: delegates to
/// `crate::lex::testing::parse_without_annotation_attachment` and returns
/// its result unchanged; a failure is reported as a `String` message.
// NOTE(review): this helper lives outside the test section yet depends on
// the `testing` module — confirm that module is compiled in non-test builds.
pub(crate) fn parse_no_attach(source: &str) -> Result<Document, String> {
    crate::lex::testing::parse_without_annotation_attachment(source)
}
1305
1306// ============================================================================
1307// Filesystem-backed loader
1308// ============================================================================
1309
/// [`Loader`] that reads files from the filesystem with
/// `std::fs::read_to_string`.
///
/// This is the production loader used by the CLI; the LSP wraps it with
/// a file-watch invalidation layer in PR 8. lex-core's *resolver* code
/// does not reference `std::fs` — `FsLoader` is the one place where it
/// does, isolated behind the [`Loader`] trait so the rest of the crate
/// stays sandbox- and WASM-friendly.
///
/// `FsLoader` is constructed with the resolution root and rechecks every
/// load against it post-`fs::canonicalize`, so a symlink pointing
/// outside the root is rejected even though the lexical-only check in
/// [`resolve_path`] cannot see it. It also rejects non-regular files
/// (devices, FIFOs, directories) before reading, so the loader can't be
/// tricked into blocking on `/dev/zero` or allocating against an open
/// device.
///
/// Error map for `load`:
/// - canonicalization fails with `ErrorKind::NotFound` (file missing,
///   broken symlink, …) → [`LoadError::NotFound`]
/// - canonicalization fails for any other reason (e.g. permission
///   denied at a parent) → [`LoadError::Io`]
/// - canonical path doesn't sit under the canonical root →
///   [`LoadError::OutsideRoot`]
/// - target is not a regular file → [`LoadError::Io`] with a clear message
/// - file is larger than the configured cap → [`LoadError::TooLarge`]
/// - any other I/O error during metadata lookup or read → [`LoadError::Io`]
pub struct FsLoader {
    /// Filesystem-canonical resolution root, computed once in
    /// `FsLoader::new`. If canonicalization fails (e.g., the configured
    /// root doesn't exist on disk), the input is stored verbatim; the
    /// bounds check is then expected to fail for every load, which the
    /// user sees as `LoadError::OutsideRoot` rather than a silently
    /// disabled security check.
    canonical_root: PathBuf,
    /// Per-file size cap (bytes). Larger files surface as
    /// `LoadError::TooLarge` before any bytes are read into memory.
    /// Defaults to [`FsLoader::DEFAULT_MAX_FILE_SIZE`].
    max_file_size: u64,
}

impl FsLoader {
    /// Default per-file size cap: 10 MiB. Generous for realistic Lex
    /// source documents (text only) and tight enough to bound memory
    /// allocation per include against an adversarial 1 GB file.
    pub const DEFAULT_MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;

    /// Construct a loader rooted at `root` with the default size cap.
    ///
    /// The loader stores `root`'s fs-canonical form (symlinks resolved),
    /// or `root` itself when it cannot be canonicalized. Subsequent
    /// loads validate that the requested path's canonical form lives
    /// under it.
    pub fn new(root: PathBuf) -> Self {
        let canonical_root = match std::fs::canonicalize(&root) {
            Ok(resolved) => resolved,
            Err(_) => root,
        };
        Self {
            canonical_root,
            max_file_size: Self::DEFAULT_MAX_FILE_SIZE,
        }
    }

    /// Builder-style override of the per-file size cap (bytes). Use it
    /// to widen the limit for projects with genuinely large source
    /// files, or to tighten it for stricter sandboxes (e.g., LSPs
    /// serving untrusted content).
    pub fn with_max_file_size(self, max_file_size: u64) -> Self {
        Self {
            max_file_size,
            ..self
        }
    }
}
1371
1372impl Loader for FsLoader {
1373 fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
1374 // 1. Canonicalize. Resolves symlinks and `..` segments against the
1375 // real filesystem. NotFound / broken-symlink / permission errors
1376 // all surface here.
1377 let canonical_path = std::fs::canonicalize(path).map_err(|e| match e.kind() {
1378 std::io::ErrorKind::NotFound => LoadError::NotFound {
1379 path: path.to_path_buf(),
1380 },
1381 _ => LoadError::Io {
1382 path: path.to_path_buf(),
1383 message: e.to_string(),
1384 },
1385 })?;
1386
1387 // 2. Bounds check against the *canonical* root. This is the
1388 // actual security gate against symlink traversal — the lexical
1389 // check in resolve_path can't see through symlinks.
1390 if !canonical_path.starts_with(&self.canonical_root) {
1391 return Err(LoadError::OutsideRoot {
1392 path: canonical_path,
1393 root: self.canonical_root.clone(),
1394 });
1395 }
1396
1397 // 3. Reject non-regular files. Without this, an attacker (with
1398 // write access to the repo) could symlink an include target to
1399 // `/dev/zero` or a FIFO and block / OOM the reader. The
1400 // is_file() metadata call is a cheap sanity check.
1401 let meta = std::fs::metadata(&canonical_path).map_err(|e| LoadError::Io {
1402 path: canonical_path.clone(),
1403 message: e.to_string(),
1404 })?;
1405 if !meta.is_file() {
1406 return Err(LoadError::Io {
1407 path: canonical_path,
1408 message: "include target is not a regular file".to_string(),
1409 });
1410 }
1411
1412 // 4. Size cap. Bounds memory allocation per include against an
1413 // adversarial 1 GB file before any bytes hit the heap.
1414 let size = meta.len();
1415 if size > self.max_file_size {
1416 return Err(LoadError::TooLarge {
1417 path: canonical_path,
1418 size,
1419 limit: self.max_file_size,
1420 });
1421 }
1422
1423 // 5. Read. By this point we know the path is a regular file under
1424 // the canonical root and within the size cap; anything that
1425 // fails here is a real I/O error worth surfacing.
1426 let source = std::fs::read_to_string(&canonical_path).map_err(|e| LoadError::Io {
1427 path: canonical_path.clone(),
1428 message: e.to_string(),
1429 })?;
1430
1431 Ok(LoadedFile {
1432 source,
1433 canonical_path,
1434 })
1435 }
1436}
1437
1438// ============================================================================
1439// Test fixtures (test-support feature + cfg(test))
1440// ============================================================================
1441
/// In-memory [`Loader`] backed by a `HashMap<PathBuf, String>`.
///
/// Only compiled for tests or with the `test-support` feature.
#[cfg(any(test, feature = "test-support"))]
pub struct MemoryLoader {
    /// Registered files: lookup path → source text.
    files: std::collections::HashMap<PathBuf, String>,
}

#[cfg(any(test, feature = "test-support"))]
impl MemoryLoader {
    /// Create an empty loader. Add files with [`MemoryLoader::insert`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Register a file at `path` with the given source text, replacing
    /// any file previously registered at the same path. Returns `self`
    /// so calls can be chained.
    pub fn insert<P: Into<PathBuf>, S: Into<String>>(&mut self, path: P, contents: S) -> &mut Self {
        let (key, text): (PathBuf, String) = (path.into(), contents.into());
        self.files.insert(key, text);
        self
    }

    /// Convenience constructor: build a loader from any iterator of
    /// `(path, contents)` pairs. When a path occurs more than once, the
    /// last pair wins.
    pub fn from_pairs<I, P, S>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (P, S)>,
        P: Into<PathBuf>,
        S: Into<String>,
    {
        Self {
            files: pairs
                .into_iter()
                .map(|(path, contents)| (path.into(), contents.into()))
                .collect(),
        }
    }
}

#[cfg(any(test, feature = "test-support"))]
impl Default for MemoryLoader {
    /// An empty loader, identical to [`MemoryLoader::new`].
    fn default() -> Self {
        Self {
            files: std::collections::HashMap::new(),
        }
    }
}
1485
#[cfg(any(test, feature = "test-support"))]
impl Loader for MemoryLoader {
    /// Look `path` up verbatim among the registered files.
    ///
    /// Memory loaders have no symlinks, so the lookup key *is* the
    /// canonical identity and is echoed back as
    /// `LoadedFile::canonical_path`. Cycle detection in the resolver
    /// compares those values; for tests this matches the
    /// lexically-normalized paths the resolver already produces.
    /// An unregistered path yields `LoadError::NotFound`.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
        match self.files.get(path) {
            Some(text) => Ok(LoadedFile {
                source: text.clone(),
                canonical_path: path.to_path_buf(),
            }),
            None => Err(LoadError::NotFound {
                path: path.to_path_buf(),
            }),
        }
    }
}
1506
1507// ============================================================================
1508// Tests
1509// ============================================================================
1510
1511#[cfg(test)]
1512mod tests;