lex_core/lex/includes.rs
1//! Include resolution for Lex documents.
2//!
3//! This module turns `:: lex.include src="..." ::` annotations into spliced
4//! content from the referenced files. It is *opt-in*: callers that want the
5//! unresolved tree (the formatter, tree-sitter parity, editor tooling that
6//! displays include statements as authored) skip this pass entirely. The
7//! parser itself never touches the filesystem — all I/O goes through the
8//! injected [`Loader`] trait.
9//!
10//! See `comms/specs/proposals/includes.lex` for the full design.
11//!
12//! # Status
13//!
14//! This module is being built up across PRs 3–7:
15//!
16//! - PR 3: skeleton — trait, config, errors, stub.
17//! - PR 4: single-pass splice + container-policy validation +
18//! doc-title/doc-annotation conversion + origin stamping + root-escape
19//! check.
20//! - PR 5: recursive resolution into included files + cycle detection
21//! (chain stack) + depth limit. Each loaded file gets walked in its OWN
22//! directory, so relative paths inside an included file resolve from
23//! that file's directory, not the entry's.
24//! - PR 6: origin-aware reference helpers. [`resolve_file_reference`]
25//! resolves a `ReferenceType::File` target from the authoring file's
26//! directory using `Range.origin_path`.
27//! `Document::find_annotation_by_label_in_origin` scopes footnote
28//! lookups to the file the reference was authored in.
29//! - PR 7 (this PR): [`FsLoader`] — production loader that reads from the
30//! filesystem with `std::fs::read_to_string`. CLI wires the resolver
31//! into `lex convert` and `lex inspect` (default-on, opt-out via
32//! `--no-includes`); `lex format` never expands.
33//!
34//! # Layering
35//!
36//! Of all of lex-core, only [`FsLoader`] references `std::fs`. The
37//! resolver itself does no I/O — it always goes through the [`Loader`]
38//! trait. Callers can swap loaders to keep the resolver sandboxed:
39//!
40//! - The LSP wraps [`FsLoader`] with file-watch invalidation (PR 8).
41//! - WASM builds provide a JS-backed loader instead of [`FsLoader`].
42//! - Tests use [`MemoryLoader`] (gated behind `test-support`).
43//!
44//! For tests, lex-core itself ships [`MemoryLoader`] gated behind the
45//! `test-support` cargo feature. It is not intended for production use.
46
47// `IncludeError` carries diagnostic context (paths, source ranges,
48// handler messages) on every variant; the `result_large_err` lint
49// would have us box the whole error or split it into a thinner shape
50// just to satisfy the size heuristic. The enum is already part of
51// the public API and the error path is rare; suppress the lint for
52// this module rather than churn the public surface.
53#![allow(clippy::result_large_err)]
54
55use crate::lex::assembling::stages::{ApplyTableConfig, NormalizeLabels};
56use crate::lex::assembling::AttachAnnotations;
57use crate::lex::ast::elements::container::GeneralContainer;
58use crate::lex::ast::elements::content_item::ContentItem;
59use crate::lex::ast::elements::session::Session;
60use crate::lex::ast::range::Range;
61use crate::lex::ast::Document;
62use crate::lex::transforms::Runnable;
63use lex_extension::handler::HandlerError;
64use lex_extension_host::registry::Registry;
65use std::path::{Path, PathBuf};
66use std::sync::Arc;
67
/// Settings that govern a single include-resolution pass.
#[derive(Debug, Clone)]
pub struct ResolveConfig {
    /// The directory every include path must resolve beneath. An include
    /// that lands outside of it is reported as
    /// [`IncludeError::RootEscape`].
    ///
    /// This must be an **absolute** path. Lexical normalization treats
    /// `.` and `..` against an empty buffer as no-ops, so a relative or
    /// unnormalized root weakens the root-escape prefix check. Callers
    /// (CLI, LSP) are expected to canonicalize the root before building
    /// a `ResolveConfig`.
    pub root: PathBuf,
    /// Longest permitted include chain. Defaults to
    /// [`ResolveConfig::DEFAULT_MAX_DEPTH`]. Reaching the limit is an
    /// error; content is never silently truncated.
    pub max_depth: usize,
    /// Upper bound on the number of `lex.include` annotations resolved
    /// over the whole tree (depth × breadth). Defaults to
    /// [`ResolveConfig::DEFAULT_MAX_TOTAL_INCLUDES`].
    ///
    /// `max_depth` only limits how long a chain may get, not how wide
    /// the tree may be: a document with a huge number of depth-1
    /// includes stays within `max_depth` yet can still exhaust memory
    /// in the resolver / LSP / CI. Reaching this limit is likewise an
    /// error rather than a silent truncation.
    pub max_total_includes: usize,
}

impl ResolveConfig {
    /// Default include-depth limit: deep enough for a layered
    /// atomization strategy (aggregator → per-chapter → per-section)
    /// while keeping the resolver's worst-case work predictable.
    pub const DEFAULT_MAX_DEPTH: usize = 8;

    /// Default cap on the total include count (a DoS bound): roomy
    /// enough for a book-length document made of many small fragments,
    /// small enough to contain adversarial fan-out.
    pub const DEFAULT_MAX_TOTAL_INCLUDES: usize = 1_000;

    /// Builds a config rooted at `root` with both limits set to their
    /// defaults.
    pub fn with_root(root: PathBuf) -> Self {
        let max_depth = Self::DEFAULT_MAX_DEPTH;
        let max_total_includes = Self::DEFAULT_MAX_TOTAL_INCLUDES;
        Self {
            root,
            max_depth,
            max_total_includes,
        }
    }
}
115
116/// A pluggable source-text loader.
117///
118/// Implementations decide where bytes come from (filesystem, in-memory map,
119/// virtual filesystem, content-addressed store, …). lex-core never references
120/// `std::fs` directly through this trait; that keeps the resolver pure and
121/// usable in WASM, sandboxes, and unit tests.
122pub trait Loader {
123 /// Load the source text for `path` and return both the contents and a
124 /// canonical identity for the loaded resource. The path is what the
125 /// resolver decided on after applying the rules in §4 of the proposal.
126 ///
127 /// `LoadedFile::canonical_path` is the loader's authoritative identity
128 /// for the resource. For [`FsLoader`] this is the filesystem-canonical
129 /// path (symlinks resolved, case-folded if the underlying FS is
130 /// case-insensitive); for [`MemoryLoader`] it's the lookup key (since
131 /// memory loaders have no symlinks). The resolver uses this for cycle
132 /// detection and for stamping `Range.origin_path` on the loaded tree.
133 fn load(&self, path: &Path) -> Result<LoadedFile, LoadError>;
134}
135
/// What a successful [`Loader::load`] call hands back.
#[derive(Debug, Clone)]
pub struct LoadedFile {
    /// Full source text of the loaded resource.
    pub source: String,
    /// Authoritative identity the loader assigns to the resource; see
    /// [`Loader::load`] for how each loader picks it.
    pub canonical_path: PathBuf,
}
145
/// Failure modes of a [`Loader`].
#[derive(Debug, Clone)]
pub enum LoadError {
    /// Nothing exists at the requested path.
    NotFound { path: PathBuf },
    /// The resource exists but lies outside the loader's allowed
    /// boundary. The resolver already normalizes `..` lexically, but a
    /// loader backed by a real filesystem has to re-check after
    /// canonicalization: a symlink can leave the boundary even when the
    /// requested path looks lexically fine.
    OutsideRoot { path: PathBuf, root: PathBuf },
    /// The resource exists but is bigger than the loader's configured
    /// limit. Both `size` and `limit` are byte counts. The resolver
    /// turns this into [`IncludeError::FileTooLarge`], adding the site
    /// of the offending annotation.
    TooLarge {
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// An underlying I/O failure (or its virtual-filesystem analogue).
    Io { path: PathBuf, message: String },
}

impl std::fmt::Display for LoadError {
    /// Renders a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotFound { path } => {
                let shown = path.display();
                write!(f, "include not found: {}", shown)
            }
            Self::OutsideRoot { path, root } => {
                let shown = path.display();
                let boundary = root.display();
                write!(
                    f,
                    "include path {} resolves outside loader root {}",
                    shown, boundary
                )
            }
            Self::TooLarge { path, size, limit } => {
                let shown = path.display();
                write!(
                    f,
                    "include file {} is {size} bytes, exceeds limit of {limit} bytes",
                    shown
                )
            }
            Self::Io { path, message } => {
                let shown = path.display();
                write!(f, "io error reading {}: {message}", shown)
            }
        }
    }
}

impl std::error::Error for LoadError {}
192
193/// Errors the include resolver can produce.
194#[derive(Debug, Clone)]
195pub enum IncludeError {
196 /// An include chain looped back on itself. `chain` is the resolution
197 /// stack at the moment the duplicate `path` was about to be pushed,
198 /// in source-order (entry first, deepest last). `include_site` is the
199 /// range of the offending `lex.include` annotation in its host file —
200 /// useful for diagnostics that highlight the exact line.
201 Cycle {
202 include_site: Range,
203 path: PathBuf,
204 chain: Vec<PathBuf>,
205 },
206 /// The include depth exceeded [`ResolveConfig::max_depth`]. `chain`
207 /// shows the resolution stack at the moment of failure, in source
208 /// order. `include_site` is the range of the offending
209 /// `lex.include` annotation in its host file.
210 DepthExceeded {
211 include_site: Range,
212 limit: usize,
213 chain: Vec<PathBuf>,
214 },
215 /// The total number of includes resolved across the document
216 /// exceeded [`ResolveConfig::max_total_includes`]. Bounds adversarial
217 /// fan-out (which `max_depth` alone does not). `include_site` is the
218 /// `lex.include` annotation that pushed the count past the limit.
219 TotalIncludesExceeded { include_site: Range, limit: usize },
220 /// The included file's size exceeded the loader's configured limit.
221 /// Surfaced by loaders that read from a real filesystem (FsLoader)
222 /// to bound memory allocation per include. `include_site` is the
223 /// offending annotation; `size` and `limit` are in bytes.
224 FileTooLarge {
225 include_site: Range,
226 path: PathBuf,
227 size: u64,
228 limit: u64,
229 },
230 /// A path resolved outside the configured [`ResolveConfig::root`].
231 RootEscape { path: PathBuf, root: PathBuf },
232 /// The include `src` was a platform-absolute filesystem path
233 /// (e.g. Windows `C:\foo`, `\\server\share`, `\foo`). The spec
234 /// forbids absolute filesystem paths from entering the
235 /// resolution pipeline; the *root-absolute* form (leading `/`
236 /// resolved against the includes root) is the only spec-allowed
237 /// way to write a path that doesn't start from the host's
238 /// directory. On Unix the only thing that's `Path::is_absolute()`
239 /// is a leading `/`, which is consumed by the root-absolute
240 /// branch first; this variant therefore only fires in practice
241 /// for Windows-shaped absolute paths.
242 AbsolutePath { path: PathBuf },
243 /// The loader could not find or read the included file. `include_site`
244 /// is the range of the offending `lex.include` annotation in its host
245 /// file, so editors can squiggle the line that asked for the missing
246 /// file rather than the document head.
247 NotFound { include_site: Range, path: PathBuf },
248 /// The loader returned text that the parser rejected.
249 ParseFailed { path: PathBuf, message: String },
250 /// The included file's content is not legal in the include site's
251 /// parent container.
252 ///
253 /// Today this only occurs when an included file has top-level Sessions
254 /// and the include site is inside a `GeneralContainer` (Definition,
255 /// ListItem, or another Annotation's body). The `violation` field
256 /// names the offending content kind (e.g. `"Sessions"`) so future
257 /// container/policy combinations can reuse this variant without a
258 /// breaking change.
259 ContainerPolicy {
260 include_site: Range,
261 container: &'static str,
262 file: PathBuf,
263 violation: &'static str,
264 },
265 /// Loader propagated a non-`NotFound` I/O error.
266 LoaderIo { path: PathBuf, message: String },
267 /// `lex.include` annotation was missing the mandatory `src=` parameter.
268 MissingSrc { include_site: Range },
269 /// A registered handler returned an error the pass could not map
270 /// onto a more specific variant — typically a third-party
271 /// namespace's resolve hook surfacing an internal failure, or an
272 /// unrecognised handler-defined code from `lex.*` built-ins. The
273 /// `code` is the string identifier the registry attaches to the
274 /// diagnostic (`"handler.internal"`, `"handler.custom"`, …).
275 HandlerFailed {
276 include_site: Range,
277 label: String,
278 code: String,
279 message: String,
280 },
281}
282
283impl std::fmt::Display for IncludeError {
284 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
285 match self {
286 IncludeError::Cycle { path, chain, .. } => {
287 let chain_display: Vec<String> =
288 chain.iter().map(|p| p.display().to_string()).collect();
289 write!(
290 f,
291 "include cycle: {} (chain: {})",
292 path.display(),
293 chain_display.join(" -> ")
294 )
295 }
296 IncludeError::DepthExceeded { limit, chain, .. } => {
297 let chain_display: Vec<String> =
298 chain.iter().map(|p| p.display().to_string()).collect();
299 write!(
300 f,
301 "include depth exceeded limit of {limit} (chain: {})",
302 chain_display.join(" -> ")
303 )
304 }
305 IncludeError::TotalIncludesExceeded { limit, .. } => {
306 write!(f, "total include count exceeded limit of {limit}")
307 }
308 IncludeError::FileTooLarge {
309 path, size, limit, ..
310 } => {
311 write!(
312 f,
313 "included file {} is {size} bytes, exceeds limit of {limit} bytes",
314 path.display()
315 )
316 }
317 IncludeError::RootEscape { path, root } => write!(
318 f,
319 "include path {} escapes resolution root {}",
320 path.display(),
321 root.display()
322 ),
323 IncludeError::AbsolutePath { path } => write!(
324 f,
325 "include src {} is a platform-absolute path; \
326 the spec forbids absolute filesystem paths — use a relative path \
327 (chapters/01.lex) or a root-absolute path (/shared/01.lex)",
328 path.display()
329 ),
330 IncludeError::NotFound { path, .. } => {
331 write!(f, "include not found: {}", path.display())
332 }
333 IncludeError::ParseFailed { path, message } => {
334 write!(f, "failed to parse {}: {message}", path.display())
335 }
336 IncludeError::ContainerPolicy {
337 container,
338 file,
339 violation,
340 ..
341 } => write!(
342 f,
343 "included file {} contains {} but include site is inside {} \
344 (which does not allow {})",
345 file.display(),
346 violation,
347 container,
348 violation
349 ),
350 IncludeError::LoaderIo { path, message } => {
351 write!(f, "loader error reading {}: {message}", path.display())
352 }
353 IncludeError::MissingSrc { .. } => {
354 write!(f, "lex.include annotation missing required src= parameter")
355 }
356 IncludeError::HandlerFailed {
357 label,
358 code,
359 message,
360 ..
361 } => write!(f, "extension handler `{label}` failed ({code}): {message}"),
362 }
363 }
364}
365
366impl std::error::Error for IncludeError {}
367
368// No `From<LoadError>` impl: `IncludeError::NotFound` carries the include
369// site (the `lex.include` annotation's range), which a loader doesn't know
370// about. Callers map `LoadError` explicitly at the call site, where the
371// site is available.
372
/// The kind of container an include site lives in. It selects the
/// splice-time policy check; the only one today is "no Sessions in a
/// `GeneralContainer`".
#[derive(Debug, Clone, Copy)]
enum ContainerKind {
    /// `Document.root.children` or `Session.children` — anything goes.
    Session,
    /// `Definition.children` — a `GeneralContainer`.
    Definition,
    /// `Annotation.children` — a `GeneralContainer`.
    AnnotationBody,
    /// `ListItem.children` — a `GeneralContainer`.
    ListItem,
}

impl ContainerKind {
    /// Human-readable container name used in diagnostics.
    fn name(self) -> &'static str {
        match self {
            Self::Session => "Session",
            Self::Definition => "Definition",
            Self::AnnotationBody => "Annotation body",
            Self::ListItem => "ListItem",
        }
    }

    /// Whether top-level Sessions may be spliced into this container.
    /// Only the session-level container accepts them.
    fn allows_sessions(self) -> bool {
        match self {
            Self::Session => true,
            Self::Definition | Self::AnnotationBody | Self::ListItem => false,
        }
    }
}
401
/// Non-configurable ceiling on resolution depth. It applies even when
/// [`ResolveConfig::max_depth`] is set to something larger, so the
/// resolver always terminates. Its purpose is to stop adversarial
/// varying-position recursion: a handler that returns content with a
/// different invocation site on every iteration, so the cycle key
/// never repeats.
pub const KERNEL_DEPTH_BACKSTOP: usize = 32;
408
409/// Resolve every `hooks.resolve = true` labelled annotation starting
410/// from `source`, dispatching through `registry`, and recursively
411/// processing the spliced content.
412///
413/// `source_path` identifies the entry-point file. It is used to
414/// (a) stamp `Range.origin_path` on every node so downstream code
415/// (file-ref resolution, diagnostics, LSP goto) can report locations
416/// against the authoring file, and (b) provide the host directory
417/// the built-in `lex.include` handler resolves relative `src=` paths
418/// against (via `LabelCtx.node.origin`). When `None`, origin stamping
419/// is skipped on the entry and the handler resolves relative paths
420/// against `config.root`.
421///
422/// # Generic dispatch
423///
424/// Every label whose schema declares `hooks.resolve = true` flows
425/// through the same path: build a [`LabelCtx`] from the annotation,
426/// call [`Registry::dispatch_resolve_raw`], decode the returned
427/// [`WireNode`] back into typed [`ContentItem`]s via
428/// [`crate::lex::wire::from_wire_node`], and splice in place. The
429/// built-in `lex.include` handler is registered the same way as any
430/// third-party namespace.
431///
432/// # Pre/post-attachment
433///
434/// Internally this re-parses the entry source *without* annotation
435/// attachment so labelled annotations stay visible as standalone
436/// children. The handler does its own `parse_no_attach` for loaded
437/// content. After all splices, [`AttachAnnotations`] runs once on
438/// the merged tree.
439///
440/// # Recursion + cycle detection
441///
442/// Cycle detection keys on `(label, origin_path, start_position)` of
443/// the invocation site. A handler that returns content containing
444/// another invocation at the same source position is caught
445/// immediately. A handler that varies the invocation position each
446/// iteration terminates at `min(config.max_depth, KERNEL_DEPTH_BACKSTOP)`
447/// with `IncludeError::DepthExceeded`. The total-includes counter
448/// caps adversarial fan-out independent of depth.
449pub fn resolve_from_source(
450 source: &str,
451 source_path: Option<PathBuf>,
452 config: &ResolveConfig,
453 registry: &Registry,
454) -> Result<Document, IncludeError> {
455 let entry_origin = source_path.as_ref().map(|p| Arc::new(p.clone()));
456
457 let mut doc = parse_no_attach(source).map_err(|message| IncludeError::ParseFailed {
458 path: source_path.clone().unwrap_or_default(),
459 message,
460 })?;
461
462 if let Some(origin) = entry_origin.as_ref() {
463 stamp_doc(&mut doc, origin);
464 }
465
466 // Normalise labels in the entry source BEFORE the resolve walk so
467 // shortcut spellings (`:: include ::`, `:: image ::`, …) are
468 // rewritten to their canonical form. The resolve dispatcher keys
469 // on `registry.schema_for(label)` with the canonical spelling, so
470 // without this an `:: include src=... ::` annotation would be
471 // skipped because no schema is registered under the bare alias.
472 //
473 // Permissive mode: unknown labels are left as-is rather than
474 // erroring. The standard parse pipeline enforces strict-mode
475 // namespace policy (`STRING_TO_AST`); the resolve entry point is
476 // a downstream stage that just needs the shortcut table applied
477 // so dispatch finds the right handler.
478 let mut doc =
479 NormalizeLabels::permissive()
480 .run(doc)
481 .map_err(|e| IncludeError::ParseFailed {
482 path: source_path.clone().unwrap_or_default(),
483 message: format!("label normalisation failed: {e}"),
484 })?;
485
486 let mut chain: Vec<ResolveKey> = Vec::new();
487 let mut state = ResolverState {
488 config,
489 registry,
490 chain: &mut chain,
491 depth: 0,
492 total_resolved: 0,
493 };
494
495 splice_in_session_container(doc.root.children.as_mut_vec(), &mut state)?;
496
497 let doc = AttachAnnotations::new()
498 .run(doc)
499 .map_err(|e| IncludeError::ParseFailed {
500 path: source_path.clone().unwrap_or_default(),
501 message: format!("annotation attachment failed: {e}"),
502 })?;
503
504 // Re-normalise after splicing. Each included file is parsed via
505 // `parse_no_attach` (no normalisation), so shortcut labels in the
506 // spliced content — e.g. `:: image src=... ::` inside an included
507 // chapter — need rewriting before downstream IR/format passes can
508 // dispatch them.
509 let doc = NormalizeLabels::permissive()
510 .run(doc)
511 .map_err(|e| IncludeError::ParseFailed {
512 path: source_path.clone().unwrap_or_default(),
513 message: format!("label normalisation failed: {e}"),
514 })?;
515
516 // Apply table configuration so `:: table header=N align=... ::`
517 // annotations attached to tables (here or in spliced content) take
518 // effect — matches the order the standard pipeline runs them.
519 let doc = ApplyTableConfig::new()
520 .run(doc)
521 .map_err(|e| IncludeError::ParseFailed {
522 path: source_path.unwrap_or_default(),
523 message: format!("table config application failed: {e}"),
524 })?;
525
526 Ok(doc)
527}
528
529// ============================================================================
530// Splicing
531// ============================================================================
532
533/// One frame on the resolve-pass cycle stack. Two invocations at the
534/// same `(label, origin, start)` position are a cycle, regardless of
535/// what parameters either invocation uses — a handler that varies
536/// params per call (random IDs, timestamps) cannot defeat the
537/// detector by changing param values.
538#[derive(Debug, Clone, PartialEq)]
539struct ResolveKey {
540 label: String,
541 /// `Range.origin_path` of the annotation — the file the
542 /// invocation was authored in. `None` when stamping was skipped
543 /// (e.g., entry source loaded from a string with no path).
544 origin: Option<PathBuf>,
545 start: crate::lex::ast::range::Position,
546}
547
548impl ResolveKey {
549 fn from_annotation(a: &crate::lex::ast::elements::annotation::Annotation) -> Self {
550 Self {
551 label: a.data.label.value.clone(),
552 origin: a.location.origin_path.as_ref().map(|p| (**p).clone()),
553 start: a.location.start,
554 }
555 }
556}
557
558/// Per-resolution state threaded through the recursive walker. Keeps the
559/// signatures of the splice/process functions short and ensures
560/// `chain`/`depth` are updated in lock-step (push/pop, +1/back-out) at
561/// each invocation.
562struct ResolverState<'a> {
563 config: &'a ResolveConfig,
564 registry: &'a Registry,
565 /// Active resolution stack of `(label, origin, position)` keys.
566 /// Pushed when we begin dispatching for an invocation and popped
567 /// when its splice subtree is fully resolved. A push that finds
568 /// the same key already on the stack is a cycle.
569 chain: &'a mut Vec<ResolveKey>,
570 /// Number of dispatch hops from the entry point. Each recursion
571 /// increments by 1. Hitting `config.max_depth` or the
572 /// [`KERNEL_DEPTH_BACKSTOP`] (whichever is lower) is an error.
573 depth: usize,
574 /// Total invocations resolved across the entire walk
575 /// (depth × breadth). Incremented on every successful dispatch.
576 /// Hitting `config.max_total_includes` aborts with
577 /// `TotalIncludesExceeded`.
578 total_resolved: usize,
579}
580
581fn splice_in_session_container(
582 children: &mut Vec<ContentItem>,
583 state: &mut ResolverState<'_>,
584) -> Result<(), IncludeError> {
585 // Post-order: recurse into nested containers first, splice this
586 // container's invocations second. Recursion happens inside
587 // `process_resolves` for any spliced subtree, so that subtree
588 // is never re-walked at the parent level.
589 recurse_into_children(children, state)?;
590 process_resolves(children, state, ContainerKind::Session)
591}
592
593fn splice_in_general_container(
594 container: &mut GeneralContainer,
595 state: &mut ResolverState<'_>,
596 kind: ContainerKind,
597) -> Result<(), IncludeError> {
598 recurse_into_children(container.as_mut_vec(), state)?;
599 process_resolves(container.as_mut_vec(), state, kind)
600}
601
602/// Walk the children of a container, dispatch every annotation whose
603/// schema declares `hooks.resolve = true` through the registry, and
604/// splice the returned content in place of the annotation. Recurses
605/// into the spliced content so nested invocations resolve too.
606// Allow &mut Vec because `splice` needs Vec-specific operations.
607#[allow(clippy::ptr_arg)]
608fn process_resolves(
609 children: &mut Vec<ContentItem>,
610 state: &mut ResolverState<'_>,
611 kind: ContainerKind,
612) -> Result<(), IncludeError> {
613 // Collect indices of annotations whose schema has hooks.resolve.
614 let resolve_indices: Vec<usize> = children
615 .iter()
616 .enumerate()
617 .filter_map(|(i, item)| match item {
618 ContentItem::Annotation(a) => {
619 let label = &a.data.label.value;
620 if state
621 .registry
622 .schema_for(label)
623 .map(|s| s.hooks.resolve)
624 .unwrap_or(false)
625 {
626 Some(i)
627 } else {
628 None
629 }
630 }
631 _ => None,
632 })
633 .collect();
634
635 for i in resolve_indices.into_iter().rev() {
636 let annotation = match &children[i] {
637 ContentItem::Annotation(a) => a.clone(),
638 _ => unreachable!("index came from resolve filter"),
639 };
640
641 match resolve_one_invocation(&annotation, state, kind)? {
642 ResolveOutcome::Spliced(splice_items) => {
643 // Expansion replaces the directive with the included content. The
644 // `lex.include` annotation is consumed — drop it. (It used to be
645 // kept in the stream as provenance, relying on the serializer
646 // dropping attached annotations; now that the serializer emits
647 // them (lex#682), keeping it would leak `:: lex.include ::` into
648 // expanded output. Origin provenance is tracked on
649 // `Range.origin_path`, not this node.)
650 children.splice(i..=i, splice_items);
651 }
652 ResolveOutcome::Unexpanded => {
653 // Handler opted out of expanding this invocation. The
654 // annotation stays in place, but its body wasn't
655 // walked by `recurse_into_children` (that walker
656 // skips resolve-hooked annotations to avoid double-
657 // resolution). Walk the body now so any nested
658 // invocations inside the unexpanded annotation get
659 // resolved on the way back up.
660 let mut owned = annotation;
661 splice_in_general_container(
662 &mut owned.children,
663 state,
664 ContainerKind::AnnotationBody,
665 )?;
666 children[i] = ContentItem::Annotation(owned);
667 }
668 }
669 }
670
671 Ok(())
672}
673
674/// Outcome of dispatching a single resolve-hooked annotation. The
675/// pass needs to distinguish between "handler returned content,
676/// splice it in" and "handler opted out, leave the annotation
677/// alone": the second case still requires walking the annotation's
678/// body for nested invocations because `recurse_into_children`
679/// otherwise skips resolve-hooked annotations to prevent double-
680/// resolution.
681enum ResolveOutcome {
682 Spliced(Vec<ContentItem>),
683 Unexpanded,
684}
685
686/// Dispatch a single resolve-hooked annotation through the registry,
687/// decode the returned `WireNode` back into typed children, then
688/// recursively walk the splice items so nested invocations resolve
689/// before the splice is placed into the parent container.
690///
691/// Returns [`ResolveOutcome::Unexpanded`] when the handler returned
692/// `Ok(None)` (third-party handlers can opt out of expanding a
693/// particular invocation). The caller is then responsible for
694/// walking the annotation's body for nested invocations — the
695/// resolve walker normally skips resolve-hooked annotations'
696/// bodies.
697fn resolve_one_invocation(
698 annotation: &crate::lex::ast::elements::annotation::Annotation,
699 state: &mut ResolverState<'_>,
700 parent_kind: ContainerKind,
701) -> Result<ResolveOutcome, IncludeError> {
702 let label = &annotation.data.label.value;
703 let key = ResolveKey::from_annotation(annotation);
704
705 // Cycle check on (label, origin, start) of the invocation site.
706 if state.chain.contains(&key) {
707 return Err(IncludeError::Cycle {
708 include_site: annotation.location.clone(),
709 path: key.origin.clone().unwrap_or_default(),
710 chain: state
711 .chain
712 .iter()
713 .map(|k| k.origin.clone().unwrap_or_default())
714 .collect(),
715 });
716 }
717
718 // Depth check. The effective limit is the lower of the
719 // user-facing `config.max_depth` (default 8) and the hard
720 // [`KERNEL_DEPTH_BACKSTOP`] (32, fixed). The kernel backstop
721 // exists for adversarial varying-position recursion that the
722 // cycle key can't catch — even if a user bumps `max_depth`
723 // higher than 32 for legitimate deep atomization, the backstop
724 // still terminates. The error reports `effective_depth_limit`
725 // (the actual cap that fired) rather than `config.max_depth`,
726 // so when the backstop is the binding limit the user sees `32`
727 // and not the (higher) config value.
728 let effective_depth_limit = state.config.max_depth.min(KERNEL_DEPTH_BACKSTOP);
729 if state.depth >= effective_depth_limit {
730 return Err(IncludeError::DepthExceeded {
731 include_site: annotation.location.clone(),
732 limit: effective_depth_limit,
733 chain: state
734 .chain
735 .iter()
736 .map(|k| k.origin.clone().unwrap_or_default())
737 .collect(),
738 });
739 }
740
741 // Total-count check before dispatch.
742 if state.total_resolved >= state.config.max_total_includes {
743 return Err(IncludeError::TotalIncludesExceeded {
744 include_site: annotation.location.clone(),
745 limit: state.config.max_total_includes,
746 });
747 }
748
749 let ctx = build_label_ctx(annotation);
750
751 let wire_node = match state.registry.dispatch_resolve_raw(&ctx) {
752 Ok(Some(node)) => node,
753 Ok(None) => {
754 // Handler returned "nothing to splice" — leave the
755 // annotation in place. The caller still needs to walk
756 // its body for nested invocations (built-in lex.include
757 // never returns None; this path is reachable only via
758 // third-party handlers that opt out per-invocation).
759 return Ok(ResolveOutcome::Unexpanded);
760 }
761 Err(handler_err) => {
762 return Err(handler_error_to_include_error(
763 &handler_err,
764 label,
765 &annotation.location,
766 ));
767 }
768 };
769
770 state.total_resolved += 1;
771
772 // Decode the wire payload into typed lex-core ContentItems.
773 let mut splice_items = decode_wire_to_items(&wire_node, label, &annotation.location)?;
774
775 // Recurse into the spliced subtree FIRST so nested resolve-hooked
776 // annotations are processed before the splice lands. Validation
777 // must wait until *after* this step: a nested invocation can
778 // splice in content (e.g. a top-level `Session` from a chained
779 // `lex.include`) that wasn't in the handler's original output,
780 // and the final shape is what has to satisfy the parent
781 // container's policy.
782 //
783 // The `IncludeError::ContainerPolicy.file` field describes the
784 // *spliced content's* source file (the file containing the
785 // disallowed shape), not the invocation site. Take it from the
786 // handler-returned wire payload's origin when present, falling
787 // back to the first decoded item's origin path if the wire
788 // payload didn't stamp a `Document` origin.
789 let included_path = wire_node_origin_pathbuf(&wire_node)
790 .or_else(|| splice_items_first_origin(&splice_items))
791 .unwrap_or_default();
792 state.chain.push(key);
793 let saved_depth = state.depth;
794 state.depth = saved_depth + 1;
795 let recurse_result = splice_in_session_container(&mut splice_items, state);
796 state.depth = saved_depth;
797 state.chain.pop();
798 recurse_result?;
799
800 // Container-policy validation: enforce no-Sessions inside
801 // `GeneralContainer` (Definition / Annotation body / ListItem).
802 // Runs against the post-recursion splice list so nested
803 // expansions can't smuggle disallowed shapes past the check.
804 validate_against_kind(
805 &splice_items,
806 parent_kind,
807 &annotation.location,
808 &included_path,
809 )?;
810
811 Ok(ResolveOutcome::Spliced(splice_items))
812}
813
814/// Build a [`LabelCtx`] from a lex-core [`Annotation`]. The body is
815/// derived from the annotation's children (parsed-Lex form), the
816/// params from `Annotation::data::parameters`, and the host node info
817/// from `Annotation::location`.
818fn build_label_ctx(
819 a: &crate::lex::ast::elements::annotation::Annotation,
820) -> lex_extension::wire::LabelCtx {
821 use crate::lex::wire::to_wire_node;
822 use lex_extension::wire::{AnnotationBody, LabelCtx, NodeRef};
823
824 let label = a.data.label.value.clone();
825 let params = {
826 // Pass *semantic* parameter values to handlers (quotes
827 // stripped, escape sequences resolved). Handlers consume
828 // params as JSON values, where there is no "quoted string"
829 // vs "unquoted token" distinction; only the decoded value
830 // is meaningful. The codec's `parameters_to_json` (used by
831 // `annotation_to_wire` for round-tripping annotation
832 // *content*) keeps the raw form to preserve source — the
833 // two paths intentionally differ.
834 let mut obj = serde_json::Map::with_capacity(a.data.parameters.len());
835 for p in &a.data.parameters {
836 obj.insert(p.key.clone(), serde_json::Value::String(p.unquoted_value()));
837 }
838 serde_json::Value::Object(obj)
839 };
840 let body = if a.children.is_empty() {
841 AnnotationBody::None
842 } else {
843 let wire_children: Vec<lex_extension::wire::WireNode> =
844 a.children.iter().map(to_wire_node).collect();
845 AnnotationBody::Lex {
846 children: wire_children,
847 }
848 };
849 let range = lex_extension::wire::Range::new(
850 lex_extension::wire::Position::new(
851 u32::try_from(a.location.start.line).unwrap_or(u32::MAX),
852 u32::try_from(a.location.start.column).unwrap_or(u32::MAX),
853 ),
854 lex_extension::wire::Position::new(
855 u32::try_from(a.location.end.line).unwrap_or(u32::MAX),
856 u32::try_from(a.location.end.column).unwrap_or(u32::MAX),
857 ),
858 );
859 let origin = a
860 .location
861 .origin_path
862 .as_ref()
863 .map(|p| p.to_string_lossy().into_owned());
864 LabelCtx {
865 label,
866 params,
867 body,
868 node: NodeRef {
869 kind: "annotation".into(),
870 range,
871 origin,
872 },
873 }
874}
875
876/// Convert a handler-returned [`WireNode`] back into a list of
877/// [`ContentItem`]s ready for splicing. `WireNode::Document` is
878/// unwrapped (its children become the splice list); any other root
879/// shape is wrapped as a single-item list.
880///
881/// `invocation_label` is the label whose handler produced `wire` —
882/// threaded through so wire-decode failures are attributed to the
883/// real namespace rather than a hardcoded `lex.include`. A
884/// third-party `acme.expand` handler that returns malformed wire
885/// will surface as `IncludeError::HandlerFailed { label:
886/// "acme.expand", .. }`.
887/// Lift a [`WireNode`]'s top-level `origin` field into a `PathBuf`
888/// when present. Used by the resolve pass to attribute
889/// container-policy errors to the *spliced content's* source file
890/// rather than the invocation site.
891fn wire_node_origin_pathbuf(node: &lex_extension::wire::WireNode) -> Option<PathBuf> {
892 use lex_extension::wire::WireNode as W;
893 let s = match node {
894 W::Document { origin, .. } => origin.as_deref(),
895 W::Session { origin, .. } => origin.as_deref(),
896 W::Definition { origin, .. } => origin.as_deref(),
897 W::Paragraph { origin, .. } => origin.as_deref(),
898 W::List { origin, .. } => origin.as_deref(),
899 W::Verbatim { origin, .. } => origin.as_deref(),
900 W::Table { origin, .. } => origin.as_deref(),
901 W::Annotation { origin, .. } => origin.as_deref(),
902 W::Blank { origin, .. } => origin.as_deref(),
903 _ => None,
904 };
905 s.map(PathBuf::from)
906}
907
908/// Fallback when `WireNode::Document.origin` is unset: walk the
909/// decoded splice list and return the first item that carries an
910/// origin. The interner from `from_wire_node` ensures every item
911/// shares one Arc per origin string, so iterating is cheap.
912fn splice_items_first_origin(items: &[ContentItem]) -> Option<PathBuf> {
913 for item in items {
914 let r = match item {
915 ContentItem::Paragraph(p) => &p.location,
916 ContentItem::Session(s) => &s.location,
917 ContentItem::Definition(d) => &d.location,
918 ContentItem::List(l) => &l.location,
919 ContentItem::ListItem(li) => &li.location,
920 ContentItem::Annotation(a) => &a.location,
921 ContentItem::VerbatimBlock(v) => &v.location,
922 ContentItem::VerbatimLine(vl) => &vl.location,
923 ContentItem::Table(t) => &t.location,
924 ContentItem::TextLine(tl) => &tl.location,
925 ContentItem::BlankLineGroup(blg) => &blg.location,
926 };
927 if let Some(arc) = r.origin_path.as_ref() {
928 return Some((**arc).clone());
929 }
930 }
931 None
932}
933
934fn decode_wire_to_items(
935 wire: &lex_extension::wire::WireNode,
936 invocation_label: &str,
937 include_site: &Range,
938) -> Result<Vec<ContentItem>, IncludeError> {
939 use crate::lex::wire::from_wire_node;
940
941 from_wire_node(wire).map_err(|e| IncludeError::HandlerFailed {
942 include_site: include_site.clone(),
943 label: invocation_label.to_string(),
944 code: "wire.decode".into(),
945 message: format!("decoding handler-returned wire payload failed: {e}"),
946 })
947}
948
949/// Map a [`HandlerError`] returned by the registry into the most
950/// specific [`IncludeError`] variant available. Codes in the
951/// `-32001..=-32005` range emitted by [`crate::lex::builtins::LexIncludeHandler`]
952/// translate back to their corresponding pre-extension-system
953/// variants so existing CLI/LSP error rendering and the integration
954/// test suite keep working unchanged. Unknown codes (third-party
955/// namespaces, future built-ins) surface as `HandlerFailed`.
956fn handler_error_to_include_error(
957 err: &HandlerError,
958 label: &str,
959 include_site: &Range,
960) -> IncludeError {
961 use crate::lex::builtins::include::{
962 CODE_ABSOLUTE_PATH, CODE_IO, CODE_MISSING_SRC, CODE_NOT_FOUND, CODE_OUTSIDE_ROOT,
963 CODE_PARSE_FAILED, CODE_TOO_LARGE,
964 };
965
966 match err {
967 HandlerError::Custom {
968 code,
969 message,
970 data,
971 } => match *code {
972 CODE_NOT_FOUND => IncludeError::NotFound {
973 include_site: include_site.clone(),
974 path: data_str(data, "path")
975 .map(PathBuf::from)
976 .unwrap_or_default(),
977 },
978 CODE_OUTSIDE_ROOT => IncludeError::RootEscape {
979 path: data_str(data, "path")
980 .map(PathBuf::from)
981 .unwrap_or_default(),
982 root: data_str(data, "root")
983 .map(PathBuf::from)
984 .unwrap_or_default(),
985 },
986 CODE_TOO_LARGE => IncludeError::FileTooLarge {
987 include_site: include_site.clone(),
988 path: data_str(data, "path")
989 .map(PathBuf::from)
990 .unwrap_or_default(),
991 size: data_u64(data, "size").unwrap_or(0),
992 limit: data_u64(data, "limit").unwrap_or(0),
993 },
994 CODE_ABSOLUTE_PATH => IncludeError::AbsolutePath {
995 path: data_str(data, "path")
996 .map(PathBuf::from)
997 .unwrap_or_default(),
998 },
999 CODE_IO => IncludeError::LoaderIo {
1000 path: data_str(data, "path")
1001 .map(PathBuf::from)
1002 .unwrap_or_default(),
1003 message: message.clone(),
1004 },
1005 CODE_MISSING_SRC => IncludeError::MissingSrc {
1006 include_site: include_site.clone(),
1007 },
1008 CODE_PARSE_FAILED => IncludeError::ParseFailed {
1009 path: data_str(data, "path")
1010 .map(PathBuf::from)
1011 .unwrap_or_default(),
1012 message: data_str(data, "message").unwrap_or_else(|| message.clone()),
1013 },
1014 other => IncludeError::HandlerFailed {
1015 include_site: include_site.clone(),
1016 label: label.to_string(),
1017 code: format!("handler.custom({other})"),
1018 message: message.clone(),
1019 },
1020 },
1021 HandlerError::Internal { message } => IncludeError::HandlerFailed {
1022 include_site: include_site.clone(),
1023 label: label.to_string(),
1024 code: "handler.internal".into(),
1025 message: message.clone(),
1026 },
1027 HandlerError::Unsupported { detail } => IncludeError::HandlerFailed {
1028 include_site: include_site.clone(),
1029 label: label.to_string(),
1030 code: "handler.unsupported".into(),
1031 message: detail.clone(),
1032 },
1033 }
1034}
1035
1036fn data_str(data: &Option<serde_json::Value>, key: &str) -> Option<String> {
1037 data.as_ref()?.get(key)?.as_str().map(str::to_string)
1038}
1039
1040fn data_u64(data: &Option<serde_json::Value>, key: &str) -> Option<u64> {
1041 data.as_ref()?.get(key)?.as_u64()
1042}
1043
1044#[allow(clippy::ptr_arg)]
1045fn recurse_into_children(
1046 children: &mut Vec<ContentItem>,
1047 state: &mut ResolverState<'_>,
1048) -> Result<(), IncludeError> {
1049 for item in children.iter_mut() {
1050 match item {
1051 ContentItem::Session(s) => {
1052 splice_in_session_container(s.children.as_mut_vec(), state)?;
1053 }
1054 ContentItem::Definition(d) => {
1055 splice_in_general_container(&mut d.children, state, ContainerKind::Definition)?;
1056 }
1057 ContentItem::Annotation(a) => {
1058 // Skip the body of annotations whose schema declares
1059 // `hooks.resolve = true` — those are dispatched at the
1060 // parent level by `process_resolves`. Walking their
1061 // bodies *here* would trip the resolve again on the
1062 // same invocation.
1063 //
1064 // The body is still walked when the resolve actually
1065 // runs: `process_resolves` calls
1066 // `resolve_one_invocation`, and the
1067 // [`ResolveOutcome::Spliced`] arm walks the splice
1068 // subtree (which replaces the annotation), while the
1069 // [`ResolveOutcome::Unexpanded`] arm explicitly
1070 // walks the kept annotation's body via
1071 // `splice_in_general_container`. So nested
1072 // resolve-hooked annotations inside an unexpanded
1073 // outer annotation are still reached.
1074 //
1075 // Non-resolve-hooked annotations recurse normally
1076 // here so their nested bodies get processed.
1077 let is_resolve_hooked = state
1078 .registry
1079 .schema_for(&a.data.label.value)
1080 .map(|s| s.hooks.resolve)
1081 .unwrap_or(false);
1082 if !is_resolve_hooked {
1083 splice_in_general_container(
1084 &mut a.children,
1085 state,
1086 ContainerKind::AnnotationBody,
1087 )?;
1088 }
1089 }
1090 ContentItem::List(l) => {
1091 for li in l.items.as_mut_vec().iter_mut() {
1092 if let ContentItem::ListItem(item) = li {
1093 splice_in_general_container(
1094 &mut item.children,
1095 state,
1096 ContainerKind::ListItem,
1097 )?;
1098 }
1099 }
1100 }
1101 _ => {}
1102 }
1103 }
1104 Ok(())
1105}
1106
1107fn validate_against_kind(
1108 items: &[ContentItem],
1109 kind: ContainerKind,
1110 site: &Range,
1111 file: &Path,
1112) -> Result<(), IncludeError> {
1113 if kind.allows_sessions() {
1114 return Ok(());
1115 }
1116 if items.iter().any(|i| matches!(i, ContentItem::Session(_))) {
1117 return Err(IncludeError::ContainerPolicy {
1118 include_site: site.clone(),
1119 container: kind.name(),
1120 file: file.to_path_buf(),
1121 violation: "Sessions",
1122 });
1123 }
1124 Ok(())
1125}
1126
1127// ============================================================================
1128// Path resolution
1129// ============================================================================
1130
1131/// Resolve a file-reference target string the same way the include
1132/// resolver resolves include paths.
1133///
1134/// Use this when consuming `ReferenceType::File { target }` (or any other
1135/// node-attached path) so that relative paths resolve from the *authoring*
1136/// file's directory, not from wherever the merged document happens to be
1137/// rooted. Pass `ref_origin` as the [`Range::origin_path`] of the inline's
1138/// containing node (or `None` if the node was never stamped — in that case
1139/// the path is treated as if authored at the root).
1140///
1141/// Behaviour matches the include resolver:
1142/// - Root-absolute targets (leading `/`) resolve under `root`.
1143/// - Other targets resolve relative to `ref_origin`'s parent (or `root`
1144/// when `ref_origin` is `None`).
1145/// - The result is lexically normalized and checked against `root` —
1146/// paths that escape it return `RootEscape`.
1147///
1148/// This is a sister to the resolver's internal `resolve_path` and shares
1149/// the same lexical-normalization caveat: it does not touch the filesystem.
1150pub fn resolve_file_reference(
1151 target: &str,
1152 ref_origin: Option<&Path>,
1153 root: &Path,
1154) -> Result<PathBuf, IncludeError> {
1155 let host_dir: PathBuf = ref_origin
1156 .and_then(|p| p.parent())
1157 .map(Path::to_path_buf)
1158 .unwrap_or_else(|| root.to_path_buf());
1159 resolve_path(target, &host_dir, root)
1160}
1161
1162fn resolve_path(src: &str, host_dir: &Path, root: &Path) -> Result<PathBuf, IncludeError> {
1163 let candidate = if let Some(rel) = src.strip_prefix('/') {
1164 // Root-absolute (Lex spec convention): leading `/` means "from
1165 // the resolution root", not "filesystem root".
1166 root.join(rel)
1167 } else {
1168 // Anything else must be a relative path. Reject inputs the
1169 // host platform would treat as absolute (Windows `C:\foo`,
1170 // `\\server\share`, `\foo`) up front: the spec forbids
1171 // platform-absolute paths from entering the resolution
1172 // pipeline. Without this, `host_dir.join(src)` would silently
1173 // discard `host_dir` because Rust's `PathBuf::join` replaces
1174 // the base when the joined path is absolute. The downstream
1175 // root-escape check would still catch the security side, but
1176 // we'd surface a misleading "escapes root" error instead of
1177 // "absolute paths not allowed", and we'd be relying on
1178 // `PathBuf::join`'s override semantics for the security
1179 // outcome rather than holding the line at the input boundary.
1180 if Path::new(src).is_absolute() {
1181 return Err(IncludeError::AbsolutePath {
1182 path: PathBuf::from(src),
1183 });
1184 }
1185 host_dir.join(src)
1186 };
1187 let normalized = lexical_normalize(&candidate);
1188 let canonical_root = lexical_normalize(root);
1189 if !normalized.starts_with(&canonical_root) {
1190 return Err(IncludeError::RootEscape {
1191 path: normalized,
1192 root: canonical_root,
1193 });
1194 }
1195 Ok(normalized)
1196}
1197
/// Normalize a path purely lexically: drop `.` components and fold
/// `name/..` pairs, without consulting the filesystem.
///
/// `std::fs::canonicalize` is not usable here because it requires the
/// path to exist, which in-memory loaders cannot satisfy. A lexical
/// form is enough for the resolver's needs: a stable identity for
/// cycle detection and a uniform shape for the root-prefix check.
///
/// A `..` cancels the preceding component only when that component is
/// an ordinary name (`Component::Normal`). If the accumulated path is
/// empty, or ends in `..`, a root, or a prefix, the `..` is *kept*.
/// This is deliberate: calling `PathBuf::pop` unconditionally would
/// strip an earlier `..`, letting `../../etc/passwd` collapse towards
/// `etc/passwd` and slip past the root-escape check. Keeping unmatched
/// `..` components leaves such paths visibly outside any root.
fn lexical_normalize(p: &Path) -> PathBuf {
    use std::path::Component;

    p.components().fold(PathBuf::new(), |mut acc, part| {
        match part {
            // `.` contributes nothing.
            Component::CurDir => {}
            Component::ParentDir => {
                let ends_in_name =
                    matches!(acc.components().next_back(), Some(Component::Normal(_)));
                if ends_in_name {
                    acc.pop();
                } else {
                    // Nothing cancellable: preserve the `..`.
                    acc.push("..");
                }
            }
            Component::Prefix(_) | Component::RootDir | Component::Normal(_) => {
                acc.push(part.as_os_str());
            }
        }
        acc
    })
}
1239
1240// ============================================================================
1241// Origin stamping
1242// ============================================================================
1243//
1244// Walk every node in a Document and set `Range.origin_path` on each
1245// `.location` field. The walk only stamps the *block-level* `.location`
1246// fields here; finer-grained inline ranges land in PR 6 when file-ref
1247// resolution starts consulting them.
1248
1249pub(crate) fn stamp_doc(doc: &mut Document, origin: &Arc<PathBuf>) {
1250 if let Some(title) = doc.title.as_mut() {
1251 title.location.origin_path = Some(Arc::clone(origin));
1252 }
1253 for ann in doc.annotations.iter_mut() {
1254 stamp_annotation(ann, origin);
1255 }
1256 stamp_session(&mut doc.root, origin);
1257}
1258
1259fn stamp_session(s: &mut Session, origin: &Arc<PathBuf>) {
1260 s.location.origin_path = Some(Arc::clone(origin));
1261 if let Some(loc) = s.title.location.as_mut() {
1262 loc.origin_path = Some(Arc::clone(origin));
1263 }
1264 for ann in s.annotations.iter_mut() {
1265 stamp_annotation(ann, origin);
1266 }
1267 for item in s.children.as_mut_vec().iter_mut() {
1268 stamp_item(item, origin);
1269 }
1270}
1271
1272fn stamp_annotation(
1273 a: &mut crate::lex::ast::elements::annotation::Annotation,
1274 origin: &Arc<PathBuf>,
1275) {
1276 a.location.origin_path = Some(Arc::clone(origin));
1277 a.data.location.origin_path = Some(Arc::clone(origin));
1278 for item in a.children.as_mut_vec().iter_mut() {
1279 stamp_item(item, origin);
1280 }
1281}
1282
1283fn stamp_item(item: &mut ContentItem, origin: &Arc<PathBuf>) {
1284 match item {
1285 ContentItem::Session(s) => stamp_session(s, origin),
1286 ContentItem::Annotation(a) => stamp_annotation(a, origin),
1287 ContentItem::Paragraph(p) => {
1288 p.location.origin_path = Some(Arc::clone(origin));
1289 for ann in p.annotations.iter_mut() {
1290 stamp_annotation(ann, origin);
1291 }
1292 for line in p.lines.iter_mut() {
1293 stamp_item(line, origin);
1294 }
1295 }
1296 ContentItem::List(l) => {
1297 l.location.origin_path = Some(Arc::clone(origin));
1298 for li in l.items.as_mut_vec().iter_mut() {
1299 stamp_item(li, origin);
1300 }
1301 }
1302 ContentItem::ListItem(li) => {
1303 li.location.origin_path = Some(Arc::clone(origin));
1304 for ann in li.annotations.iter_mut() {
1305 stamp_annotation(ann, origin);
1306 }
1307 for child in li.children.as_mut_vec().iter_mut() {
1308 stamp_item(child, origin);
1309 }
1310 }
1311 ContentItem::Definition(d) => {
1312 d.location.origin_path = Some(Arc::clone(origin));
1313 for ann in d.annotations.iter_mut() {
1314 stamp_annotation(ann, origin);
1315 }
1316 for child in d.children.as_mut_vec().iter_mut() {
1317 stamp_item(child, origin);
1318 }
1319 }
1320 ContentItem::VerbatimBlock(v) => {
1321 v.location.origin_path = Some(Arc::clone(origin));
1322 }
1323 ContentItem::VerbatimLine(vl) => {
1324 vl.location.origin_path = Some(Arc::clone(origin));
1325 }
1326 ContentItem::Table(t) => {
1327 t.location.origin_path = Some(Arc::clone(origin));
1328 }
1329 ContentItem::TextLine(tl) => {
1330 tl.location.origin_path = Some(Arc::clone(origin));
1331 }
1332 ContentItem::BlankLineGroup(b) => {
1333 b.location.origin_path = Some(Arc::clone(origin));
1334 }
1335 }
1336}
1337
1338// ============================================================================
1339// Parser glue
1340// ============================================================================
1341
/// Parse `source` into a Document but skip the annotation-attachment stage,
/// so include annotations are findable in container children lists.
///
/// Thin wrapper: forwards `source` unchanged to
/// `crate::lex::testing::parse_without_annotation_attachment` and returns
/// that function's result as-is — the parsed `Document` on success, or its
/// `String` error otherwise.
pub(crate) fn parse_no_attach(source: &str) -> Result<Document, String> {
    crate::lex::testing::parse_without_annotation_attachment(source)
}
1347
1348// ============================================================================
1349// Filesystem-backed loader
1350// ============================================================================
1351
/// [`Loader`] that reads files from the filesystem with `std::fs::read_to_string`.
///
/// This is the production loader used by the CLI; the LSP wraps it with a
/// file-watch invalidation layer in PR 8. lex-core's *resolver* code does not
/// reference `std::fs` — `FsLoader` is the one place where it does, isolated
/// behind the [`Loader`] trait so the rest of the crate stays sandbox- and
/// WASM-friendly.
///
/// `FsLoader` is constructed with the resolution root and rechecks every
/// load against it post-`fs::canonicalize`, so a symlink pointing outside
/// the root is rejected even though the lexical-only check in
/// [`resolve_path`] cannot see it. Also rejects non-regular files (devices,
/// FIFOs, directories) before reading, so the loader can't be tricked into
/// blocking on `/dev/zero` or allocating against an open device.
///
/// Errors map (in the order `load` performs its checks):
/// - canonicalization fails with `io::ErrorKind::NotFound` (file missing,
///   broken symlink, …) → [`LoadError::NotFound`]
/// - canonicalization fails for any other reason (e.g. permission denied
///   at a parent) → [`LoadError::Io`]
/// - canonical path doesn't sit under canonical root → [`LoadError::OutsideRoot`]
/// - target is not a regular file → [`LoadError::Io`] with a clear message
/// - file is larger than the configured cap → [`LoadError::TooLarge`]
/// - any other I/O error during metadata lookup or read → [`LoadError::Io`]
pub struct FsLoader {
    /// Filesystem-canonical resolution root. Constructed once at
    /// `FsLoader::new`; if canonicalization fails (e.g., the configured
    /// root doesn't exist on disk), we fall back to the input verbatim
    /// and the bounds check will simply never pass — visible to the user
    /// as a `LoadError::OutsideRoot` instead of silently disabling the
    /// security check.
    canonical_root: PathBuf,
    /// Per-file size cap (bytes). Loads of larger files surface as
    /// `LoadError::TooLarge` before any bytes are read into memory.
    /// Default [`FsLoader::DEFAULT_MAX_FILE_SIZE`].
    max_file_size: u64,
}

impl FsLoader {
    /// Default per-file size cap: 10 MiB. Generous for realistic Lex
    /// source documents (text only) and tight enough to bound memory
    /// allocation per include against an adversarial 1 GB file.
    pub const DEFAULT_MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;

    /// Construct a loader rooted at `root` with default size limits.
    /// The loader stores `root`'s fs-canonical form (with symlinks
    /// resolved); subsequent loads validate that the requested path's
    /// canonical form lives under it.
    ///
    /// If `root` cannot be canonicalized it is stored verbatim rather
    /// than causing construction to fail.
    #[must_use]
    pub fn new(root: PathBuf) -> Self {
        let canonical_root = std::fs::canonicalize(&root).unwrap_or(root);
        Self {
            canonical_root,
            max_file_size: Self::DEFAULT_MAX_FILE_SIZE,
        }
    }

    /// Override the default per-file size cap (bytes). Use to widen the
    /// limit for projects with genuinely large source files, or tighten
    /// it for stricter sandboxes (e.g., LSPs serving untrusted content).
    ///
    /// Builder-style: consumes the loader and returns the updated one,
    /// so the result must be kept.
    #[must_use]
    pub fn with_max_file_size(mut self, max_file_size: u64) -> Self {
        self.max_file_size = max_file_size;
        self
    }
}
1413
1414impl Loader for FsLoader {
1415 fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
1416 // 1. Canonicalize. Resolves symlinks and `..` segments against the
1417 // real filesystem. NotFound / broken-symlink / permission errors
1418 // all surface here.
1419 let canonical_path = std::fs::canonicalize(path).map_err(|e| match e.kind() {
1420 std::io::ErrorKind::NotFound => LoadError::NotFound {
1421 path: path.to_path_buf(),
1422 },
1423 _ => LoadError::Io {
1424 path: path.to_path_buf(),
1425 message: e.to_string(),
1426 },
1427 })?;
1428
1429 // 2. Bounds check against the *canonical* root. This is the
1430 // actual security gate against symlink traversal — the lexical
1431 // check in resolve_path can't see through symlinks.
1432 if !canonical_path.starts_with(&self.canonical_root) {
1433 return Err(LoadError::OutsideRoot {
1434 path: canonical_path,
1435 root: self.canonical_root.clone(),
1436 });
1437 }
1438
1439 // 3. Reject non-regular files. Without this, an attacker (with
1440 // write access to the repo) could symlink an include target to
1441 // `/dev/zero` or a FIFO and block / OOM the reader. The
1442 // is_file() metadata call is a cheap sanity check.
1443 let meta = std::fs::metadata(&canonical_path).map_err(|e| LoadError::Io {
1444 path: canonical_path.clone(),
1445 message: e.to_string(),
1446 })?;
1447 if !meta.is_file() {
1448 return Err(LoadError::Io {
1449 path: canonical_path,
1450 message: "include target is not a regular file".to_string(),
1451 });
1452 }
1453
1454 // 4. Size cap. Bounds memory allocation per include against an
1455 // adversarial 1 GB file before any bytes hit the heap.
1456 let size = meta.len();
1457 if size > self.max_file_size {
1458 return Err(LoadError::TooLarge {
1459 path: canonical_path,
1460 size,
1461 limit: self.max_file_size,
1462 });
1463 }
1464
1465 // 5. Read. By this point we know the path is a regular file under
1466 // the canonical root and within the size cap; anything that
1467 // fails here is a real I/O error worth surfacing.
1468 let source = std::fs::read_to_string(&canonical_path).map_err(|e| LoadError::Io {
1469 path: canonical_path.clone(),
1470 message: e.to_string(),
1471 })?;
1472
1473 Ok(LoadedFile {
1474 source,
1475 canonical_path,
1476 })
1477 }
1478}
1479
1480// ============================================================================
1481// Test fixtures (test-support feature + cfg(test))
1482// ============================================================================
1483
/// In-memory [`Loader`] that serves file contents from a
/// `HashMap<PathBuf, String>` instead of the filesystem.
#[cfg(any(test, feature = "test-support"))]
pub struct MemoryLoader {
    files: std::collections::HashMap<PathBuf, String>,
}

#[cfg(any(test, feature = "test-support"))]
impl MemoryLoader {
    /// Create a loader with no files. Populate it with
    /// [`MemoryLoader::insert`].
    pub fn new() -> Self {
        Self {
            files: Default::default(),
        }
    }

    /// Register `contents` as the source text of the file at `path`,
    /// replacing any earlier registration for the same path. Returns
    /// `&mut Self` so calls can be chained.
    pub fn insert<P: Into<PathBuf>, S: Into<String>>(&mut self, path: P, contents: S) -> &mut Self {
        let (key, text) = (path.into(), contents.into());
        self.files.insert(key, text);
        self
    }

    /// Convenience constructor: build a loader from any iterator of
    /// `(path, contents)` pairs. When a path occurs more than once the
    /// last pair wins.
    pub fn from_pairs<I, P, S>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (P, S)>,
        P: Into<PathBuf>,
        S: Into<String>,
    {
        let files = pairs
            .into_iter()
            .map(|(path, contents)| (path.into(), contents.into()))
            .collect();
        Self { files }
    }
}

#[cfg(any(test, feature = "test-support"))]
impl Default for MemoryLoader {
    /// Equivalent to [`MemoryLoader::new`].
    fn default() -> Self {
        Self::new()
    }
}
1527
#[cfg(any(test, feature = "test-support"))]
impl Loader for MemoryLoader {
    /// Look `path` up in the in-memory table.
    ///
    /// A hit returns a copy of the registered text with
    /// `canonical_path` set to `path` itself: a memory loader has no
    /// symlinks, so the lookup key *is* the canonical identity that the
    /// resolver's cycle detection compares. A miss is reported as
    /// [`LoadError::NotFound`] for `path`.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
        match self.files.get(path) {
            Some(text) => Ok(LoadedFile {
                source: text.clone(),
                canonical_path: path.to_path_buf(),
            }),
            None => Err(LoadError::NotFound {
                path: path.to_path_buf(),
            }),
        }
    }
}
1548
1549// ============================================================================
1550// Tests
1551// ============================================================================
1552
1553#[cfg(test)]
1554mod tests;