lex_core/lex/includes.rs
1//! Include resolution for Lex documents.
2//!
3//! This module turns `:: lex.include src="..." ::` annotations into spliced
4//! content from the referenced files. It is *opt-in*: callers that want the
5//! unresolved tree (the formatter, tree-sitter parity, editor tooling that
6//! displays include statements as authored) skip this pass entirely. The
7//! parser itself never touches the filesystem — all I/O goes through the
8//! injected [`Loader`] trait.
9//!
10//! See `comms/specs/proposals/includes.lex` for the full design.
11//!
12//! # Status
13//!
//! This module is being built up across PRs 3–7:
15//!
16//! - PR 3: skeleton — trait, config, errors, stub.
17//! - PR 4: single-pass splice + container-policy validation +
18//! doc-title/doc-annotation conversion + origin stamping + root-escape
19//! check.
20//! - PR 5: recursive resolution into included files + cycle detection
21//! (chain stack) + depth limit. Each loaded file gets walked in its OWN
22//! directory, so relative paths inside an included file resolve from
23//! that file's directory, not the entry's.
24//! - PR 6: origin-aware reference helpers. [`resolve_file_reference`]
25//! resolves a `ReferenceType::File` target from the authoring file's
26//! directory using `Range.origin_path`.
27//! `Document::find_annotation_by_label_in_origin` scopes footnote
28//! lookups to the file the reference was authored in.
29//! - PR 7 (this PR): [`FsLoader`] — production loader that reads from the
30//! filesystem with `std::fs::read_to_string`. CLI wires the resolver
31//! into `lex convert` and `lex inspect` (default-on, opt-out via
32//! `--no-includes`); `lex format` never expands.
33//!
34//! # Layering
35//!
36//! Of all of lex-core, only [`FsLoader`] references `std::fs`. The
37//! resolver itself does no I/O — it always goes through the [`Loader`]
38//! trait. Callers can swap loaders to keep the resolver sandboxed:
39//!
40//! - The LSP wraps [`FsLoader`] with file-watch invalidation (PR 8).
41//! - WASM builds provide a JS-backed loader instead of [`FsLoader`].
42//! - Tests use [`MemoryLoader`] (gated behind `test-support`).
43//!
//! [`MemoryLoader`] ships inside lex-core itself, behind the `test-support`
//! cargo feature, and is not intended for production use.
46
47// `IncludeError` carries diagnostic context (paths, source ranges,
48// handler messages) on every variant; the `result_large_err` lint
49// would have us box the whole error or split it into a thinner shape
50// just to satisfy the size heuristic. The enum is already part of
51// the public API and the error path is rare; suppress the lint for
52// this module rather than churn the public surface.
53#![allow(clippy::result_large_err)]
54
55use crate::lex::assembling::stages::{ApplyTableConfig, NormalizeLabels};
56use crate::lex::assembling::AttachAnnotations;
57use crate::lex::ast::elements::container::GeneralContainer;
58use crate::lex::ast::elements::content_item::ContentItem;
59use crate::lex::ast::elements::session::Session;
60use crate::lex::ast::range::Range;
61use crate::lex::ast::Document;
62use crate::lex::transforms::Runnable;
63use lex_extension::handler::HandlerError;
64use lex_extension_host::registry::Registry;
65use std::path::{Path, PathBuf};
66use std::sync::Arc;
67
/// Tunables for the include resolution pass.
#[derive(Debug, Clone)]
pub struct ResolveConfig {
    /// Directory every include path must resolve under. An include that
    /// canonicalizes outside of it is reported as
    /// [`IncludeError::RootEscape`].
    ///
    /// This must be an **absolute** path. Lexical normalization treats `.`
    /// and `..` against an empty buffer as no-ops, so a relative or
    /// unnormalized root weakens the root-escape prefix check. Callers
    /// (CLI, LSP) are expected to canonicalize the root before building a
    /// `ResolveConfig`.
    pub root: PathBuf,
    /// Upper bound on include nesting. Defaults to
    /// [`ResolveConfig::DEFAULT_MAX_DEPTH`] (8). Reaching the bound is an
    /// error; nothing is silently truncated.
    pub max_depth: usize,
    /// Upper bound on the total number of `lex.include` annotations
    /// resolved across the whole tree (depth × breadth). Defaults to
    /// [`ResolveConfig::DEFAULT_MAX_TOTAL_INCLUDES`] (1000).
    ///
    /// `max_depth` limits chain length only; it says nothing about
    /// breadth. A document with 100 thousand top-level includes at depth 1
    /// stays within `max_depth` yet can still OOM the resolver / LSP / CI,
    /// which is what this cap prevents. Reaching the bound is an error;
    /// nothing is silently truncated.
    pub max_total_includes: usize,
}

impl ResolveConfig {
    /// Default nesting limit: deep enough for any reasonable atomization
    /// strategy (aggregator → per-chapter → per-section) while keeping the
    /// resolver's worst-case work predictable.
    pub const DEFAULT_MAX_DEPTH: usize = 8;

    /// Default total-include limit (DoS bound): roomy enough for a
    /// book-length document made of thousands of small fragments, tight
    /// enough to contain adversarial fan-out within a few seconds of
    /// resolver work.
    pub const DEFAULT_MAX_TOTAL_INCLUDES: usize = 1000;

    /// Build a config rooted at `root` with both limits at their defaults.
    pub fn with_root(root: PathBuf) -> Self {
        let max_depth = Self::DEFAULT_MAX_DEPTH;
        let max_total_includes = Self::DEFAULT_MAX_TOTAL_INCLUDES;
        Self {
            root,
            max_depth,
            max_total_includes,
        }
    }
}
115
/// A pluggable source-text loader.
///
/// Implementations decide where bytes come from (filesystem, in-memory map,
/// virtual filesystem, content-addressed store, …). The resolver obtains
/// source text only through this trait and never calls `std::fs` itself;
/// that keeps the resolver pure and usable in WASM, sandboxes, and unit
/// tests.
pub trait Loader {
    /// Load the source text for `path` and return both the contents and a
    /// canonical identity for the loaded resource. The path is what the
    /// resolver decided on after applying the rules in §4 of the proposal.
    ///
    /// `LoadedFile::canonical_path` is the loader's authoritative identity
    /// for the resource. For [`FsLoader`] this is the filesystem-canonical
    /// path (symlinks resolved, case-folded if the underlying FS is
    /// case-insensitive); for [`MemoryLoader`] it's the lookup key (since
    /// memory loaders have no symlinks). The resolver uses this for cycle
    /// detection and for stamping `Range.origin_path` on the loaded tree.
    ///
    /// # Errors
    ///
    /// Returns a [`LoadError`] when the resource cannot be provided; see
    /// the variants of [`LoadError`] for the individual failure modes.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError>;
}
135
/// Result of a successful [`Loader::load`]: the loaded text together with
/// the identity the loader assigned to the resource it came from.
#[derive(Debug, Clone)]
pub struct LoadedFile {
    /// The file's source text.
    pub source: String,
    /// The loader's authoritative identity for the resource. See
    /// [`Loader::load`] for how loaders decide this.
    pub canonical_path: PathBuf,
}
145
/// Failure modes a [`Loader`] can report.
#[derive(Debug, Clone)]
pub enum LoadError {
    /// Nothing exists at the requested path as far as the loader can tell.
    NotFound { path: PathBuf },
    /// The resource exists but resolves outside the boundary the loader is
    /// allowed to serve. The lexical resolver already normalizes `..` in
    /// the requested path; loaders backed by a real filesystem must check
    /// again after canonicalization to catch symlinks that escape a
    /// boundary which lexically-correct paths cannot leave.
    OutsideRoot { path: PathBuf, root: PathBuf },
    /// The resource exists but is larger than the loader's configured
    /// limit. `size` and `limit` are byte counts. The resolver maps this
    /// to [`IncludeError::FileTooLarge`], adding the offending
    /// annotation's site.
    TooLarge {
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// An underlying I/O failure (or the virtual-filesystem equivalent).
    Io { path: PathBuf, message: String },
}

impl std::fmt::Display for LoadError {
    /// Render a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotFound { path: missing } => {
                write!(f, "include not found: {}", missing.display())
            }
            Self::OutsideRoot {
                path: requested,
                root: boundary,
            } => {
                write!(
                    f,
                    "include path {} resolves outside loader root {}",
                    requested.display(),
                    boundary.display()
                )
            }
            Self::TooLarge {
                path: oversized,
                size,
                limit,
            } => {
                write!(
                    f,
                    "include file {} is {size} bytes, exceeds limit of {limit} bytes",
                    oversized.display()
                )
            }
            Self::Io {
                path: target,
                message,
            } => {
                write!(f, "io error reading {}: {message}", target.display())
            }
        }
    }
}

impl std::error::Error for LoadError {}
192
/// Errors the include resolver can produce.
///
/// Most variants carry an `include_site`: the range of the annotation that
/// triggered the failure, in the file that annotation was authored in.
#[derive(Debug, Clone)]
pub enum IncludeError {
    /// An include chain looped back on itself. `chain` is the resolution
    /// stack at the moment the duplicate `path` was about to be pushed,
    /// in source-order (entry first, deepest last). `include_site` is the
    /// range of the offending `lex.include` annotation in its host file —
    /// useful for diagnostics that highlight the exact line.
    ///
    /// As built by the resolver, `path` and every `chain` entry are the
    /// origin paths of the invocation sites involved; a site without a
    /// stamped origin contributes an empty `PathBuf`.
    Cycle {
        include_site: Range,
        path: PathBuf,
        chain: Vec<PathBuf>,
    },
    /// The include depth exceeded [`ResolveConfig::max_depth`]. `chain`
    /// shows the resolution stack at the moment of failure, in source
    /// order. `include_site` is the range of the offending
    /// `lex.include` annotation in its host file.
    ///
    /// `limit` is the cap that actually fired: the lower of
    /// `ResolveConfig::max_depth` and [`KERNEL_DEPTH_BACKSTOP`].
    DepthExceeded {
        include_site: Range,
        limit: usize,
        chain: Vec<PathBuf>,
    },
    /// The total number of includes resolved across the document
    /// exceeded [`ResolveConfig::max_total_includes`]. Bounds adversarial
    /// fan-out (which `max_depth` alone does not). `include_site` is the
    /// `lex.include` annotation that pushed the count past the limit.
    TotalIncludesExceeded { include_site: Range, limit: usize },
    /// The included file's size exceeded the loader's configured limit.
    /// Surfaced by loaders that read from a real filesystem (FsLoader)
    /// to bound memory allocation per include. `include_site` is the
    /// offending annotation; `size` and `limit` are in bytes.
    FileTooLarge {
        include_site: Range,
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// A path resolved outside the configured [`ResolveConfig::root`].
    RootEscape { path: PathBuf, root: PathBuf },
    /// The include `src` was a platform-absolute filesystem path
    /// (e.g. Windows `C:\foo`, `\\server\share`, `\foo`). The spec
    /// forbids absolute filesystem paths from entering the
    /// resolution pipeline; the *root-absolute* form (leading `/`
    /// resolved against the includes root) is the only spec-allowed
    /// way to write a path that doesn't start from the host's
    /// directory. On Unix the only thing that's `Path::is_absolute()`
    /// is a leading `/`, which is consumed by the root-absolute
    /// branch first; this variant therefore only fires in practice
    /// for Windows-shaped absolute paths.
    AbsolutePath { path: PathBuf },
    /// The loader could not find or read the included file. `include_site`
    /// is the range of the offending `lex.include` annotation in its host
    /// file, so editors can squiggle the line that asked for the missing
    /// file rather than the document head.
    NotFound { include_site: Range, path: PathBuf },
    /// The loader returned text that the parser rejected. Also used by
    /// the resolver entry point when a post-splice pipeline stage (label
    /// normalisation, annotation attachment, table configuration) fails;
    /// `message` names the stage in that case.
    ParseFailed { path: PathBuf, message: String },
    /// The included file's content is not legal in the include site's
    /// parent container.
    ///
    /// Today this only occurs when an included file has top-level Sessions
    /// and the include site is inside a `GeneralContainer` (Definition,
    /// ListItem, or another Annotation's body). The `violation` field
    /// names the offending content kind (e.g. `"Sessions"`) so future
    /// container/policy combinations can reuse this variant without a
    /// breaking change.
    ContainerPolicy {
        include_site: Range,
        container: &'static str,
        file: PathBuf,
        violation: &'static str,
    },
    /// Loader propagated a non-`NotFound` I/O error.
    LoaderIo { path: PathBuf, message: String },
    /// `lex.include` annotation was missing the mandatory `src=` parameter.
    MissingSrc { include_site: Range },
    /// A registered handler returned an error the pass could not map
    /// onto a more specific variant — typically a third-party
    /// namespace's resolve hook surfacing an internal failure, or an
    /// unrecognised handler-defined code from `lex.*` built-ins. The
    /// `code` is the string identifier the registry attaches to the
    /// diagnostic (`"handler.internal"`, `"handler.custom"`, …).
    HandlerFailed {
        include_site: Range,
        label: String,
        code: String,
        message: String,
    },
}
282
283impl std::fmt::Display for IncludeError {
284 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
285 match self {
286 IncludeError::Cycle { path, chain, .. } => {
287 let chain_display: Vec<String> =
288 chain.iter().map(|p| p.display().to_string()).collect();
289 write!(
290 f,
291 "include cycle: {} (chain: {})",
292 path.display(),
293 chain_display.join(" -> ")
294 )
295 }
296 IncludeError::DepthExceeded { limit, chain, .. } => {
297 let chain_display: Vec<String> =
298 chain.iter().map(|p| p.display().to_string()).collect();
299 write!(
300 f,
301 "include depth exceeded limit of {limit} (chain: {})",
302 chain_display.join(" -> ")
303 )
304 }
305 IncludeError::TotalIncludesExceeded { limit, .. } => {
306 write!(f, "total include count exceeded limit of {limit}")
307 }
308 IncludeError::FileTooLarge {
309 path, size, limit, ..
310 } => {
311 write!(
312 f,
313 "included file {} is {size} bytes, exceeds limit of {limit} bytes",
314 path.display()
315 )
316 }
317 IncludeError::RootEscape { path, root } => write!(
318 f,
319 "include path {} escapes resolution root {}",
320 path.display(),
321 root.display()
322 ),
323 IncludeError::AbsolutePath { path } => write!(
324 f,
325 "include src {} is a platform-absolute path; \
326 the spec forbids absolute filesystem paths — use a relative path \
327 (chapters/01.lex) or a root-absolute path (/shared/01.lex)",
328 path.display()
329 ),
330 IncludeError::NotFound { path, .. } => {
331 write!(f, "include not found: {}", path.display())
332 }
333 IncludeError::ParseFailed { path, message } => {
334 write!(f, "failed to parse {}: {message}", path.display())
335 }
336 IncludeError::ContainerPolicy {
337 container,
338 file,
339 violation,
340 ..
341 } => write!(
342 f,
343 "included file {} contains {} but include site is inside {} \
344 (which does not allow {})",
345 file.display(),
346 violation,
347 container,
348 violation
349 ),
350 IncludeError::LoaderIo { path, message } => {
351 write!(f, "loader error reading {}: {message}", path.display())
352 }
353 IncludeError::MissingSrc { .. } => {
354 write!(f, "lex.include annotation missing required src= parameter")
355 }
356 IncludeError::HandlerFailed {
357 label,
358 code,
359 message,
360 ..
361 } => write!(f, "extension handler `{label}` failed ({code}): {message}"),
362 }
363 }
364}
365
366impl std::error::Error for IncludeError {}
367
368// No `From<LoadError>` impl: `IncludeError::NotFound` carries the include
369// site (the `lex.include` annotation's range), which a loader doesn't know
370// about. Callers map `LoadError` explicitly at the call site, where the
371// site is available.
372
/// The kind of container an include site lives in. It selects the
/// splice-time policy check; today the only rule is "no Sessions inside a
/// `GeneralContainer`".
#[derive(Debug, Clone, Copy)]
enum ContainerKind {
    /// `Document.root.children` or `Session.children` — accepts everything.
    Session,
    /// `Definition.children` — a `GeneralContainer`.
    Definition,
    /// `Annotation.children` — a `GeneralContainer`.
    AnnotationBody,
    /// `ListItem.children` — a `GeneralContainer`.
    ListItem,
}

impl ContainerKind {
    /// Human-readable container name used in diagnostics.
    fn name(self) -> &'static str {
        match self {
            Self::ListItem => "ListItem",
            Self::AnnotationBody => "Annotation body",
            Self::Definition => "Definition",
            Self::Session => "Session",
        }
    }

    /// Whether Sessions may be spliced into this container. Only the
    /// session-level container permits them.
    fn allows_sessions(self) -> bool {
        match self {
            Self::Session => true,
            Self::Definition | Self::AnnotationBody | Self::ListItem => false,
        }
    }
}
401
/// Hard cap on resolution depth, applied even when the
/// configurable [`ResolveConfig::max_depth`] is set higher. Bounds
/// adversarial varying-position recursion (a handler that returns
/// content with a different invocation site each iteration so the
/// cycle key never matches) so the resolver always terminates.
///
/// The resolver enforces `min(config.max_depth, KERNEL_DEPTH_BACKSTOP)`,
/// and that effective value is what `IncludeError::DepthExceeded`
/// reports as its `limit`.
pub const KERNEL_DEPTH_BACKSTOP: usize = 32;
408
409/// Resolve every `hooks.resolve = true` labelled annotation starting
410/// from `source`, dispatching through `registry`, and recursively
411/// processing the spliced content.
412///
413/// `source_path` identifies the entry-point file. It is used to
414/// (a) stamp `Range.origin_path` on every node so downstream code
415/// (file-ref resolution, diagnostics, LSP goto) can report locations
416/// against the authoring file, and (b) provide the host directory
417/// the built-in `lex.include` handler resolves relative `src=` paths
418/// against (via `LabelCtx.node.origin`). When `None`, origin stamping
419/// is skipped on the entry and the handler resolves relative paths
420/// against `config.root`.
421///
422/// # Generic dispatch
423///
424/// Every label whose schema declares `hooks.resolve = true` flows
425/// through the same path: build a [`LabelCtx`] from the annotation,
426/// call [`Registry::dispatch_resolve_raw`], decode the returned
427/// [`WireNode`] back into typed [`ContentItem`]s via
428/// [`crate::lex::wire::from_wire_node`], and splice in place. The
429/// built-in `lex.include` handler is registered the same way as any
430/// third-party namespace.
431///
432/// # Pre/post-attachment
433///
434/// Internally this re-parses the entry source *without* annotation
435/// attachment so labelled annotations stay visible as standalone
436/// children. The handler does its own `parse_no_attach` for loaded
437/// content. After all splices, [`AttachAnnotations`] runs once on
438/// the merged tree.
439///
440/// # Recursion + cycle detection
441///
442/// Cycle detection keys on `(label, origin_path, start_position)` of
443/// the invocation site. A handler that returns content containing
444/// another invocation at the same source position is caught
445/// immediately. A handler that varies the invocation position each
446/// iteration terminates at `min(config.max_depth, KERNEL_DEPTH_BACKSTOP)`
447/// with `IncludeError::DepthExceeded`. The total-includes counter
448/// caps adversarial fan-out independent of depth.
449pub fn resolve_from_source(
450 source: &str,
451 source_path: Option<PathBuf>,
452 config: &ResolveConfig,
453 registry: &Registry,
454) -> Result<Document, IncludeError> {
455 let entry_origin = source_path.as_ref().map(|p| Arc::new(p.clone()));
456
457 let mut doc = parse_no_attach(source).map_err(|message| IncludeError::ParseFailed {
458 path: source_path.clone().unwrap_or_default(),
459 message,
460 })?;
461
462 if let Some(origin) = entry_origin.as_ref() {
463 stamp_doc(&mut doc, origin);
464 }
465
466 // Normalise labels in the entry source BEFORE the resolve walk so
467 // shortcut spellings (`:: include ::`, `:: image ::`, …) are
468 // rewritten to their canonical form. The resolve dispatcher keys
469 // on `registry.schema_for(label)` with the canonical spelling, so
470 // without this an `:: include src=... ::` annotation would be
471 // skipped because no schema is registered under the bare alias.
472 //
473 // Permissive mode: unknown labels are left as-is rather than
474 // erroring. The standard parse pipeline enforces strict-mode
475 // namespace policy (`STRING_TO_AST`); the resolve entry point is
476 // a downstream stage that just needs the shortcut table applied
477 // so dispatch finds the right handler.
478 let mut doc =
479 NormalizeLabels::permissive()
480 .run(doc)
481 .map_err(|e| IncludeError::ParseFailed {
482 path: source_path.clone().unwrap_or_default(),
483 message: format!("label normalisation failed: {e}"),
484 })?;
485
486 let mut chain: Vec<ResolveKey> = Vec::new();
487 let mut state = ResolverState {
488 config,
489 registry,
490 chain: &mut chain,
491 depth: 0,
492 total_resolved: 0,
493 };
494
495 splice_in_session_container(doc.root.children.as_mut_vec(), &mut state)?;
496
497 let doc = AttachAnnotations::new()
498 .run(doc)
499 .map_err(|e| IncludeError::ParseFailed {
500 path: source_path.clone().unwrap_or_default(),
501 message: format!("annotation attachment failed: {e}"),
502 })?;
503
504 // Re-normalise after splicing. Each included file is parsed via
505 // `parse_no_attach` (no normalisation), so shortcut labels in the
506 // spliced content — e.g. `:: image src=... ::` inside an included
507 // chapter — need rewriting before downstream IR/format passes can
508 // dispatch them.
509 let doc = NormalizeLabels::permissive()
510 .run(doc)
511 .map_err(|e| IncludeError::ParseFailed {
512 path: source_path.clone().unwrap_or_default(),
513 message: format!("label normalisation failed: {e}"),
514 })?;
515
516 // Apply table configuration so `:: table header=N align=... ::`
517 // annotations attached to tables (here or in spliced content) take
518 // effect — matches the order the standard pipeline runs them.
519 let doc = ApplyTableConfig::new()
520 .run(doc)
521 .map_err(|e| IncludeError::ParseFailed {
522 path: source_path.unwrap_or_default(),
523 message: format!("table config application failed: {e}"),
524 })?;
525
526 Ok(doc)
527}
528
529// ============================================================================
530// Splicing
531// ============================================================================
532
533/// One frame on the resolve-pass cycle stack. Two invocations at the
534/// same `(label, origin, start)` position are a cycle, regardless of
535/// what parameters either invocation uses — a handler that varies
536/// params per call (random IDs, timestamps) cannot defeat the
537/// detector by changing param values.
538#[derive(Debug, Clone, PartialEq)]
539struct ResolveKey {
540 label: String,
541 /// `Range.origin_path` of the annotation — the file the
542 /// invocation was authored in. `None` when stamping was skipped
543 /// (e.g., entry source loaded from a string with no path).
544 origin: Option<PathBuf>,
545 start: crate::lex::ast::range::Position,
546}
547
548impl ResolveKey {
549 fn from_annotation(a: &crate::lex::ast::elements::annotation::Annotation) -> Self {
550 Self {
551 label: a.data.label.value.clone(),
552 origin: a.location.origin_path.as_ref().map(|p| (**p).clone()),
553 start: a.location.start,
554 }
555 }
556}
557
/// Per-resolution state threaded through the recursive walker. Keeps the
/// signatures of the splice/process functions short and ensures
/// `chain`/`depth` are updated in lock-step (push/pop, +1/back-out) at
/// each invocation.
///
/// `'a` ties the state to the caller-owned config, registry and chain
/// vector; the state itself owns only the two counters.
struct ResolverState<'a> {
    /// Limits and root for this resolution run.
    config: &'a ResolveConfig,
    /// Source of label schemas and resolve handlers.
    registry: &'a Registry,
    /// Active resolution stack of `(label, origin, position)` keys.
    /// Pushed when we begin dispatching for an invocation and popped
    /// when its splice subtree is fully resolved. A push that finds
    /// the same key already on the stack is a cycle.
    chain: &'a mut Vec<ResolveKey>,
    /// Number of dispatch hops from the entry point. Each recursion
    /// increments by 1. Hitting `config.max_depth` or the
    /// [`KERNEL_DEPTH_BACKSTOP`] (whichever is lower) is an error.
    depth: usize,
    /// Total invocations resolved across the entire walk
    /// (depth × breadth). Incremented on every successful dispatch.
    /// Hitting `config.max_total_includes` aborts with
    /// `TotalIncludesExceeded`.
    total_resolved: usize,
}
580
581fn splice_in_session_container(
582 children: &mut Vec<ContentItem>,
583 state: &mut ResolverState<'_>,
584) -> Result<(), IncludeError> {
585 // Post-order: recurse into nested containers first, splice this
586 // container's invocations second. Recursion happens inside
587 // `process_resolves` for any spliced subtree, so that subtree
588 // is never re-walked at the parent level.
589 recurse_into_children(children, state)?;
590 process_resolves(children, state, ContainerKind::Session)
591}
592
593fn splice_in_general_container(
594 container: &mut GeneralContainer,
595 state: &mut ResolverState<'_>,
596 kind: ContainerKind,
597) -> Result<(), IncludeError> {
598 recurse_into_children(container.as_mut_vec(), state)?;
599 process_resolves(container.as_mut_vec(), state, kind)
600}
601
602/// Walk the children of a container, dispatch every annotation whose
603/// schema declares `hooks.resolve = true` through the registry, and
604/// splice the returned content in place of the annotation. Recurses
605/// into the spliced content so nested invocations resolve too.
606// Allow &mut Vec because `splice` needs Vec-specific operations.
607#[allow(clippy::ptr_arg)]
608fn process_resolves(
609 children: &mut Vec<ContentItem>,
610 state: &mut ResolverState<'_>,
611 kind: ContainerKind,
612) -> Result<(), IncludeError> {
613 // Collect indices of annotations whose schema has hooks.resolve.
614 let resolve_indices: Vec<usize> = children
615 .iter()
616 .enumerate()
617 .filter_map(|(i, item)| match item {
618 ContentItem::Annotation(a) => {
619 let label = &a.data.label.value;
620 if state
621 .registry
622 .schema_for(label)
623 .map(|s| s.hooks.resolve)
624 .unwrap_or(false)
625 {
626 Some(i)
627 } else {
628 None
629 }
630 }
631 _ => None,
632 })
633 .collect();
634
635 for i in resolve_indices.into_iter().rev() {
636 let annotation = match &children[i] {
637 ContentItem::Annotation(a) => a.clone(),
638 _ => unreachable!("index came from resolve filter"),
639 };
640
641 match resolve_one_invocation(&annotation, state, kind)? {
642 ResolveOutcome::Spliced(splice_items) => {
643 // Replace the annotation with `[annotation, ...splice_items]`.
644 // The annotation itself stays in the children list immediately
645 // before the splice, so the post-resolution AttachAnnotations
646 // pass moves it onto the first spliced node by the standard
647 // "attach to next sibling" rule.
648 let mut replacement = Vec::with_capacity(splice_items.len() + 1);
649 replacement.push(ContentItem::Annotation(annotation));
650 replacement.extend(splice_items);
651 children.splice(i..=i, replacement);
652 }
653 ResolveOutcome::Unexpanded => {
654 // Handler opted out of expanding this invocation. The
655 // annotation stays in place, but its body wasn't
656 // walked by `recurse_into_children` (that walker
657 // skips resolve-hooked annotations to avoid double-
658 // resolution). Walk the body now so any nested
659 // invocations inside the unexpanded annotation get
660 // resolved on the way back up.
661 let mut owned = annotation;
662 splice_in_general_container(
663 &mut owned.children,
664 state,
665 ContainerKind::AnnotationBody,
666 )?;
667 children[i] = ContentItem::Annotation(owned);
668 }
669 }
670 }
671
672 Ok(())
673}
674
/// Outcome of dispatching a single resolve-hooked annotation. The
/// pass needs to distinguish between "handler returned content,
/// splice it in" and "handler opted out, leave the annotation
/// alone": the second case still requires walking the annotation's
/// body for nested invocations because `recurse_into_children`
/// otherwise skips resolve-hooked annotations to prevent double-
/// resolution.
enum ResolveOutcome {
    /// The handler produced content. The items have already been
    /// recursively resolved and validated against the parent container;
    /// the caller places them after the annotation.
    Spliced(Vec<ContentItem>),
    /// The handler returned `Ok(None)`. The annotation stays as authored
    /// and the caller walks its body for nested invocations.
    Unexpanded,
}
686
687/// Dispatch a single resolve-hooked annotation through the registry,
688/// decode the returned `WireNode` back into typed children, then
689/// recursively walk the splice items so nested invocations resolve
690/// before the splice is placed into the parent container.
691///
692/// Returns [`ResolveOutcome::Unexpanded`] when the handler returned
693/// `Ok(None)` (third-party handlers can opt out of expanding a
694/// particular invocation). The caller is then responsible for
695/// walking the annotation's body for nested invocations — the
696/// resolve walker normally skips resolve-hooked annotations'
697/// bodies.
698fn resolve_one_invocation(
699 annotation: &crate::lex::ast::elements::annotation::Annotation,
700 state: &mut ResolverState<'_>,
701 parent_kind: ContainerKind,
702) -> Result<ResolveOutcome, IncludeError> {
703 let label = &annotation.data.label.value;
704 let key = ResolveKey::from_annotation(annotation);
705
706 // Cycle check on (label, origin, start) of the invocation site.
707 if state.chain.contains(&key) {
708 return Err(IncludeError::Cycle {
709 include_site: annotation.location.clone(),
710 path: key.origin.clone().unwrap_or_default(),
711 chain: state
712 .chain
713 .iter()
714 .map(|k| k.origin.clone().unwrap_or_default())
715 .collect(),
716 });
717 }
718
719 // Depth check. The effective limit is the lower of the
720 // user-facing `config.max_depth` (default 8) and the hard
721 // [`KERNEL_DEPTH_BACKSTOP`] (32, fixed). The kernel backstop
722 // exists for adversarial varying-position recursion that the
723 // cycle key can't catch — even if a user bumps `max_depth`
724 // higher than 32 for legitimate deep atomization, the backstop
725 // still terminates. The error reports `effective_depth_limit`
726 // (the actual cap that fired) rather than `config.max_depth`,
727 // so when the backstop is the binding limit the user sees `32`
728 // and not the (higher) config value.
729 let effective_depth_limit = state.config.max_depth.min(KERNEL_DEPTH_BACKSTOP);
730 if state.depth >= effective_depth_limit {
731 return Err(IncludeError::DepthExceeded {
732 include_site: annotation.location.clone(),
733 limit: effective_depth_limit,
734 chain: state
735 .chain
736 .iter()
737 .map(|k| k.origin.clone().unwrap_or_default())
738 .collect(),
739 });
740 }
741
742 // Total-count check before dispatch.
743 if state.total_resolved >= state.config.max_total_includes {
744 return Err(IncludeError::TotalIncludesExceeded {
745 include_site: annotation.location.clone(),
746 limit: state.config.max_total_includes,
747 });
748 }
749
750 let ctx = build_label_ctx(annotation);
751
752 let wire_node = match state.registry.dispatch_resolve_raw(&ctx) {
753 Ok(Some(node)) => node,
754 Ok(None) => {
755 // Handler returned "nothing to splice" — leave the
756 // annotation in place. The caller still needs to walk
757 // its body for nested invocations (built-in lex.include
758 // never returns None; this path is reachable only via
759 // third-party handlers that opt out per-invocation).
760 return Ok(ResolveOutcome::Unexpanded);
761 }
762 Err(handler_err) => {
763 return Err(handler_error_to_include_error(
764 &handler_err,
765 label,
766 &annotation.location,
767 ));
768 }
769 };
770
771 state.total_resolved += 1;
772
773 // Decode the wire payload into typed lex-core ContentItems.
774 let mut splice_items = decode_wire_to_items(&wire_node, label, &annotation.location)?;
775
776 // Recurse into the spliced subtree FIRST so nested resolve-hooked
777 // annotations are processed before the splice lands. Validation
778 // must wait until *after* this step: a nested invocation can
779 // splice in content (e.g. a top-level `Session` from a chained
780 // `lex.include`) that wasn't in the handler's original output,
781 // and the final shape is what has to satisfy the parent
782 // container's policy.
783 //
784 // The `IncludeError::ContainerPolicy.file` field describes the
785 // *spliced content's* source file (the file containing the
786 // disallowed shape), not the invocation site. Take it from the
787 // handler-returned wire payload's origin when present, falling
788 // back to the first decoded item's origin path if the wire
789 // payload didn't stamp a `Document` origin.
790 let included_path = wire_node_origin_pathbuf(&wire_node)
791 .or_else(|| splice_items_first_origin(&splice_items))
792 .unwrap_or_default();
793 state.chain.push(key);
794 let saved_depth = state.depth;
795 state.depth = saved_depth + 1;
796 let recurse_result = splice_in_session_container(&mut splice_items, state);
797 state.depth = saved_depth;
798 state.chain.pop();
799 recurse_result?;
800
801 // Container-policy validation: enforce no-Sessions inside
802 // `GeneralContainer` (Definition / Annotation body / ListItem).
803 // Runs against the post-recursion splice list so nested
804 // expansions can't smuggle disallowed shapes past the check.
805 validate_against_kind(
806 &splice_items,
807 parent_kind,
808 &annotation.location,
809 &included_path,
810 )?;
811
812 Ok(ResolveOutcome::Spliced(splice_items))
813}
814
815/// Build a [`LabelCtx`] from a lex-core [`Annotation`]. The body is
816/// derived from the annotation's children (parsed-Lex form), the
817/// params from `Annotation::data::parameters`, and the host node info
818/// from `Annotation::location`.
819fn build_label_ctx(
820 a: &crate::lex::ast::elements::annotation::Annotation,
821) -> lex_extension::wire::LabelCtx {
822 use crate::lex::wire::to_wire_node;
823 use lex_extension::wire::{AnnotationBody, LabelCtx, NodeRef};
824
825 let label = a.data.label.value.clone();
826 let params = {
827 // Pass *semantic* parameter values to handlers (quotes
828 // stripped, escape sequences resolved). Handlers consume
829 // params as JSON values, where there is no "quoted string"
830 // vs "unquoted token" distinction; only the decoded value
831 // is meaningful. The codec's `parameters_to_json` (used by
832 // `annotation_to_wire` for round-tripping annotation
833 // *content*) keeps the raw form to preserve source — the
834 // two paths intentionally differ.
835 let mut obj = serde_json::Map::with_capacity(a.data.parameters.len());
836 for p in &a.data.parameters {
837 obj.insert(p.key.clone(), serde_json::Value::String(p.unquoted_value()));
838 }
839 serde_json::Value::Object(obj)
840 };
841 let body = if a.children.is_empty() {
842 AnnotationBody::None
843 } else {
844 let wire_children: Vec<lex_extension::wire::WireNode> =
845 a.children.iter().map(to_wire_node).collect();
846 AnnotationBody::Lex {
847 children: wire_children,
848 }
849 };
850 let range = lex_extension::wire::Range::new(
851 lex_extension::wire::Position::new(
852 u32::try_from(a.location.start.line).unwrap_or(u32::MAX),
853 u32::try_from(a.location.start.column).unwrap_or(u32::MAX),
854 ),
855 lex_extension::wire::Position::new(
856 u32::try_from(a.location.end.line).unwrap_or(u32::MAX),
857 u32::try_from(a.location.end.column).unwrap_or(u32::MAX),
858 ),
859 );
860 let origin = a
861 .location
862 .origin_path
863 .as_ref()
864 .map(|p| p.to_string_lossy().into_owned());
865 LabelCtx {
866 label,
867 params,
868 body,
869 node: NodeRef {
870 kind: "annotation".into(),
871 range,
872 origin,
873 },
874 }
875}
876
877/// Convert a handler-returned [`WireNode`] back into a list of
878/// [`ContentItem`]s ready for splicing. `WireNode::Document` is
879/// unwrapped (its children become the splice list); any other root
880/// shape is wrapped as a single-item list.
881///
882/// `invocation_label` is the label whose handler produced `wire` —
883/// threaded through so wire-decode failures are attributed to the
884/// real namespace rather than a hardcoded `lex.include`. A
885/// third-party `acme.expand` handler that returns malformed wire
886/// will surface as `IncludeError::HandlerFailed { label:
887/// "acme.expand", .. }`.
888/// Lift a [`WireNode`]'s top-level `origin` field into a `PathBuf`
889/// when present. Used by the resolve pass to attribute
890/// container-policy errors to the *spliced content's* source file
891/// rather than the invocation site.
892fn wire_node_origin_pathbuf(node: &lex_extension::wire::WireNode) -> Option<PathBuf> {
893 use lex_extension::wire::WireNode as W;
894 let s = match node {
895 W::Document { origin, .. } => origin.as_deref(),
896 W::Session { origin, .. } => origin.as_deref(),
897 W::Definition { origin, .. } => origin.as_deref(),
898 W::Paragraph { origin, .. } => origin.as_deref(),
899 W::List { origin, .. } => origin.as_deref(),
900 W::Verbatim { origin, .. } => origin.as_deref(),
901 W::Table { origin, .. } => origin.as_deref(),
902 W::Annotation { origin, .. } => origin.as_deref(),
903 W::Blank { origin, .. } => origin.as_deref(),
904 _ => None,
905 };
906 s.map(PathBuf::from)
907}
908
909/// Fallback when `WireNode::Document.origin` is unset: walk the
910/// decoded splice list and return the first item that carries an
911/// origin. The interner from `from_wire_node` ensures every item
912/// shares one Arc per origin string, so iterating is cheap.
913fn splice_items_first_origin(items: &[ContentItem]) -> Option<PathBuf> {
914 for item in items {
915 let r = match item {
916 ContentItem::Paragraph(p) => &p.location,
917 ContentItem::Session(s) => &s.location,
918 ContentItem::Definition(d) => &d.location,
919 ContentItem::List(l) => &l.location,
920 ContentItem::ListItem(li) => &li.location,
921 ContentItem::Annotation(a) => &a.location,
922 ContentItem::VerbatimBlock(v) => &v.location,
923 ContentItem::VerbatimLine(vl) => &vl.location,
924 ContentItem::Table(t) => &t.location,
925 ContentItem::TextLine(tl) => &tl.location,
926 ContentItem::BlankLineGroup(blg) => &blg.location,
927 };
928 if let Some(arc) = r.origin_path.as_ref() {
929 return Some((**arc).clone());
930 }
931 }
932 None
933}
934
935fn decode_wire_to_items(
936 wire: &lex_extension::wire::WireNode,
937 invocation_label: &str,
938 include_site: &Range,
939) -> Result<Vec<ContentItem>, IncludeError> {
940 use crate::lex::wire::from_wire_node;
941
942 from_wire_node(wire).map_err(|e| IncludeError::HandlerFailed {
943 include_site: include_site.clone(),
944 label: invocation_label.to_string(),
945 code: "wire.decode".into(),
946 message: format!("decoding handler-returned wire payload failed: {e}"),
947 })
948}
949
950/// Map a [`HandlerError`] returned by the registry into the most
951/// specific [`IncludeError`] variant available. Codes in the
952/// `-32001..=-32005` range emitted by [`crate::lex::builtins::LexIncludeHandler`]
953/// translate back to their corresponding pre-extension-system
954/// variants so existing CLI/LSP error rendering and the integration
955/// test suite keep working unchanged. Unknown codes (third-party
956/// namespaces, future built-ins) surface as `HandlerFailed`.
957fn handler_error_to_include_error(
958 err: &HandlerError,
959 label: &str,
960 include_site: &Range,
961) -> IncludeError {
962 use crate::lex::builtins::include::{
963 CODE_ABSOLUTE_PATH, CODE_IO, CODE_MISSING_SRC, CODE_NOT_FOUND, CODE_OUTSIDE_ROOT,
964 CODE_PARSE_FAILED, CODE_TOO_LARGE,
965 };
966
967 match err {
968 HandlerError::Custom {
969 code,
970 message,
971 data,
972 } => match *code {
973 CODE_NOT_FOUND => IncludeError::NotFound {
974 include_site: include_site.clone(),
975 path: data_str(data, "path")
976 .map(PathBuf::from)
977 .unwrap_or_default(),
978 },
979 CODE_OUTSIDE_ROOT => IncludeError::RootEscape {
980 path: data_str(data, "path")
981 .map(PathBuf::from)
982 .unwrap_or_default(),
983 root: data_str(data, "root")
984 .map(PathBuf::from)
985 .unwrap_or_default(),
986 },
987 CODE_TOO_LARGE => IncludeError::FileTooLarge {
988 include_site: include_site.clone(),
989 path: data_str(data, "path")
990 .map(PathBuf::from)
991 .unwrap_or_default(),
992 size: data_u64(data, "size").unwrap_or(0),
993 limit: data_u64(data, "limit").unwrap_or(0),
994 },
995 CODE_ABSOLUTE_PATH => IncludeError::AbsolutePath {
996 path: data_str(data, "path")
997 .map(PathBuf::from)
998 .unwrap_or_default(),
999 },
1000 CODE_IO => IncludeError::LoaderIo {
1001 path: data_str(data, "path")
1002 .map(PathBuf::from)
1003 .unwrap_or_default(),
1004 message: message.clone(),
1005 },
1006 CODE_MISSING_SRC => IncludeError::MissingSrc {
1007 include_site: include_site.clone(),
1008 },
1009 CODE_PARSE_FAILED => IncludeError::ParseFailed {
1010 path: data_str(data, "path")
1011 .map(PathBuf::from)
1012 .unwrap_or_default(),
1013 message: data_str(data, "message").unwrap_or_else(|| message.clone()),
1014 },
1015 other => IncludeError::HandlerFailed {
1016 include_site: include_site.clone(),
1017 label: label.to_string(),
1018 code: format!("handler.custom({other})"),
1019 message: message.clone(),
1020 },
1021 },
1022 HandlerError::Internal { message } => IncludeError::HandlerFailed {
1023 include_site: include_site.clone(),
1024 label: label.to_string(),
1025 code: "handler.internal".into(),
1026 message: message.clone(),
1027 },
1028 HandlerError::Unsupported { detail } => IncludeError::HandlerFailed {
1029 include_site: include_site.clone(),
1030 label: label.to_string(),
1031 code: "handler.unsupported".into(),
1032 message: detail.clone(),
1033 },
1034 }
1035}
1036
1037fn data_str(data: &Option<serde_json::Value>, key: &str) -> Option<String> {
1038 data.as_ref()?.get(key)?.as_str().map(str::to_string)
1039}
1040
1041fn data_u64(data: &Option<serde_json::Value>, key: &str) -> Option<u64> {
1042 data.as_ref()?.get(key)?.as_u64()
1043}
1044
1045#[allow(clippy::ptr_arg)]
1046fn recurse_into_children(
1047 children: &mut Vec<ContentItem>,
1048 state: &mut ResolverState<'_>,
1049) -> Result<(), IncludeError> {
1050 for item in children.iter_mut() {
1051 match item {
1052 ContentItem::Session(s) => {
1053 splice_in_session_container(s.children.as_mut_vec(), state)?;
1054 }
1055 ContentItem::Definition(d) => {
1056 splice_in_general_container(&mut d.children, state, ContainerKind::Definition)?;
1057 }
1058 ContentItem::Annotation(a) => {
1059 // Skip the body of annotations whose schema declares
1060 // `hooks.resolve = true` — those are dispatched at the
1061 // parent level by `process_resolves`. Walking their
1062 // bodies *here* would trip the resolve again on the
1063 // same invocation.
1064 //
1065 // The body is still walked when the resolve actually
1066 // runs: `process_resolves` calls
1067 // `resolve_one_invocation`, and the
1068 // [`ResolveOutcome::Spliced`] arm walks the splice
1069 // subtree (which replaces the annotation), while the
1070 // [`ResolveOutcome::Unexpanded`] arm explicitly
1071 // walks the kept annotation's body via
1072 // `splice_in_general_container`. So nested
1073 // resolve-hooked annotations inside an unexpanded
1074 // outer annotation are still reached.
1075 //
1076 // Non-resolve-hooked annotations recurse normally
1077 // here so their nested bodies get processed.
1078 let is_resolve_hooked = state
1079 .registry
1080 .schema_for(&a.data.label.value)
1081 .map(|s| s.hooks.resolve)
1082 .unwrap_or(false);
1083 if !is_resolve_hooked {
1084 splice_in_general_container(
1085 &mut a.children,
1086 state,
1087 ContainerKind::AnnotationBody,
1088 )?;
1089 }
1090 }
1091 ContentItem::List(l) => {
1092 for li in l.items.as_mut_vec().iter_mut() {
1093 if let ContentItem::ListItem(item) = li {
1094 splice_in_general_container(
1095 &mut item.children,
1096 state,
1097 ContainerKind::ListItem,
1098 )?;
1099 }
1100 }
1101 }
1102 _ => {}
1103 }
1104 }
1105 Ok(())
1106}
1107
1108fn validate_against_kind(
1109 items: &[ContentItem],
1110 kind: ContainerKind,
1111 site: &Range,
1112 file: &Path,
1113) -> Result<(), IncludeError> {
1114 if kind.allows_sessions() {
1115 return Ok(());
1116 }
1117 if items.iter().any(|i| matches!(i, ContentItem::Session(_))) {
1118 return Err(IncludeError::ContainerPolicy {
1119 include_site: site.clone(),
1120 container: kind.name(),
1121 file: file.to_path_buf(),
1122 violation: "Sessions",
1123 });
1124 }
1125 Ok(())
1126}
1127
1128// ============================================================================
1129// Path resolution
1130// ============================================================================
1131
1132/// Resolve a file-reference target string the same way the include
1133/// resolver resolves include paths.
1134///
1135/// Use this when consuming `ReferenceType::File { target }` (or any other
1136/// node-attached path) so that relative paths resolve from the *authoring*
1137/// file's directory, not from wherever the merged document happens to be
1138/// rooted. Pass `ref_origin` as the [`Range::origin_path`] of the inline's
1139/// containing node (or `None` if the node was never stamped — in that case
1140/// the path is treated as if authored at the root).
1141///
1142/// Behaviour matches the include resolver:
1143/// - Root-absolute targets (leading `/`) resolve under `root`.
1144/// - Other targets resolve relative to `ref_origin`'s parent (or `root`
1145/// when `ref_origin` is `None`).
1146/// - The result is lexically normalized and checked against `root` —
1147/// paths that escape it return `RootEscape`.
1148///
1149/// This is a sister to the resolver's internal `resolve_path` and shares
1150/// the same lexical-normalization caveat: it does not touch the filesystem.
1151pub fn resolve_file_reference(
1152 target: &str,
1153 ref_origin: Option<&Path>,
1154 root: &Path,
1155) -> Result<PathBuf, IncludeError> {
1156 let host_dir: PathBuf = ref_origin
1157 .and_then(|p| p.parent())
1158 .map(Path::to_path_buf)
1159 .unwrap_or_else(|| root.to_path_buf());
1160 resolve_path(target, &host_dir, root)
1161}
1162
1163fn resolve_path(src: &str, host_dir: &Path, root: &Path) -> Result<PathBuf, IncludeError> {
1164 let candidate = if let Some(rel) = src.strip_prefix('/') {
1165 // Root-absolute (Lex spec convention): leading `/` means "from
1166 // the resolution root", not "filesystem root".
1167 root.join(rel)
1168 } else {
1169 // Anything else must be a relative path. Reject inputs the
1170 // host platform would treat as absolute (Windows `C:\foo`,
1171 // `\\server\share`, `\foo`) up front: the spec forbids
1172 // platform-absolute paths from entering the resolution
1173 // pipeline. Without this, `host_dir.join(src)` would silently
1174 // discard `host_dir` because Rust's `PathBuf::join` replaces
1175 // the base when the joined path is absolute. The downstream
1176 // root-escape check would still catch the security side, but
1177 // we'd surface a misleading "escapes root" error instead of
1178 // "absolute paths not allowed", and we'd be relying on
1179 // `PathBuf::join`'s override semantics for the security
1180 // outcome rather than holding the line at the input boundary.
1181 if Path::new(src).is_absolute() {
1182 return Err(IncludeError::AbsolutePath {
1183 path: PathBuf::from(src),
1184 });
1185 }
1186 host_dir.join(src)
1187 };
1188 let normalized = lexical_normalize(&candidate);
1189 let canonical_root = lexical_normalize(root);
1190 if !normalized.starts_with(&canonical_root) {
1191 return Err(IncludeError::RootEscape {
1192 path: normalized,
1193 root: canonical_root,
1194 });
1195 }
1196 Ok(normalized)
1197}
1198
/// Normalize a path purely lexically: drop `.` components and collapse
/// `name/..` pairs, without consulting the filesystem.
///
/// `std::fs::canonicalize` is not usable here because it requires the
/// path to exist, which breaks tests that use [`MemoryLoader`]. A lexical
/// form is enough for the resolver: it only needs a stable identity for
/// cycle detection and a uniform shape for the root-escape prefix check.
///
/// A `..` is collapsed only when the component accumulated immediately
/// before it is a real name (`Component::Normal`). If the accumulator is
/// empty, or ends in another `..` or a root marker, the `..` is *kept*.
///
/// Keeping unmatched `..` components is what stops `../../etc/passwd`
/// from degrading into `etc/passwd`: a blind `PathBuf::pop` would strip a
/// trailing `..` (since `Path::new("..").parent()` is `Some("")`) and
/// produce a path that falsely starts with the root prefix. With the `..`
/// preserved, the normalized path stays outside any sane root and the
/// escape check fires correctly.
fn lexical_normalize(p: &Path) -> PathBuf {
    use std::path::Component;

    p.components()
        .fold(PathBuf::new(), |mut normalized, component| {
            match component {
                // `.` contributes nothing.
                Component::CurDir => {}
                // `..` cancels the preceding component only if that
                // component is a real directory name.
                Component::ParentDir
                    if matches!(
                        normalized.components().next_back(),
                        Some(Component::Normal(_))
                    ) =>
                {
                    normalized.pop();
                }
                // Names, roots, prefixes and un-cancellable `..` are
                // appended verbatim.
                other => normalized.push(other.as_os_str()),
            }
            normalized
        })
}
1240
1241// ============================================================================
1242// Origin stamping
1243// ============================================================================
1244//
1245// Walk every node in a Document and set `Range.origin_path` on each
1246// `.location` field. The walk only stamps the *block-level* `.location`
1247// fields here; finer-grained inline ranges land in PR 6 when file-ref
1248// resolution starts consulting them.
1249
1250pub(crate) fn stamp_doc(doc: &mut Document, origin: &Arc<PathBuf>) {
1251 if let Some(title) = doc.title.as_mut() {
1252 title.location.origin_path = Some(Arc::clone(origin));
1253 }
1254 for ann in doc.annotations.iter_mut() {
1255 stamp_annotation(ann, origin);
1256 }
1257 stamp_session(&mut doc.root, origin);
1258}
1259
1260fn stamp_session(s: &mut Session, origin: &Arc<PathBuf>) {
1261 s.location.origin_path = Some(Arc::clone(origin));
1262 if let Some(loc) = s.title.location.as_mut() {
1263 loc.origin_path = Some(Arc::clone(origin));
1264 }
1265 for ann in s.annotations.iter_mut() {
1266 stamp_annotation(ann, origin);
1267 }
1268 for item in s.children.as_mut_vec().iter_mut() {
1269 stamp_item(item, origin);
1270 }
1271}
1272
1273fn stamp_annotation(
1274 a: &mut crate::lex::ast::elements::annotation::Annotation,
1275 origin: &Arc<PathBuf>,
1276) {
1277 a.location.origin_path = Some(Arc::clone(origin));
1278 a.data.location.origin_path = Some(Arc::clone(origin));
1279 for item in a.children.as_mut_vec().iter_mut() {
1280 stamp_item(item, origin);
1281 }
1282}
1283
1284fn stamp_item(item: &mut ContentItem, origin: &Arc<PathBuf>) {
1285 match item {
1286 ContentItem::Session(s) => stamp_session(s, origin),
1287 ContentItem::Annotation(a) => stamp_annotation(a, origin),
1288 ContentItem::Paragraph(p) => {
1289 p.location.origin_path = Some(Arc::clone(origin));
1290 for ann in p.annotations.iter_mut() {
1291 stamp_annotation(ann, origin);
1292 }
1293 for line in p.lines.iter_mut() {
1294 stamp_item(line, origin);
1295 }
1296 }
1297 ContentItem::List(l) => {
1298 l.location.origin_path = Some(Arc::clone(origin));
1299 for li in l.items.as_mut_vec().iter_mut() {
1300 stamp_item(li, origin);
1301 }
1302 }
1303 ContentItem::ListItem(li) => {
1304 li.location.origin_path = Some(Arc::clone(origin));
1305 for ann in li.annotations.iter_mut() {
1306 stamp_annotation(ann, origin);
1307 }
1308 for child in li.children.as_mut_vec().iter_mut() {
1309 stamp_item(child, origin);
1310 }
1311 }
1312 ContentItem::Definition(d) => {
1313 d.location.origin_path = Some(Arc::clone(origin));
1314 for ann in d.annotations.iter_mut() {
1315 stamp_annotation(ann, origin);
1316 }
1317 for child in d.children.as_mut_vec().iter_mut() {
1318 stamp_item(child, origin);
1319 }
1320 }
1321 ContentItem::VerbatimBlock(v) => {
1322 v.location.origin_path = Some(Arc::clone(origin));
1323 }
1324 ContentItem::VerbatimLine(vl) => {
1325 vl.location.origin_path = Some(Arc::clone(origin));
1326 }
1327 ContentItem::Table(t) => {
1328 t.location.origin_path = Some(Arc::clone(origin));
1329 }
1330 ContentItem::TextLine(tl) => {
1331 tl.location.origin_path = Some(Arc::clone(origin));
1332 }
1333 ContentItem::BlankLineGroup(b) => {
1334 b.location.origin_path = Some(Arc::clone(origin));
1335 }
1336 }
1337}
1338
1339// ============================================================================
1340// Parser glue
1341// ============================================================================
1342
/// Parse `source` into a [`Document`] but skip the annotation-attachment
/// stage, so include annotations are findable in container children lists.
///
/// This is a thin wrapper: it delegates to
/// `crate::lex::testing::parse_without_annotation_attachment` and returns
/// that function's result unchanged.
///
/// # Errors
///
/// Returns the `String` error produced by the underlying parse call.
pub(crate) fn parse_no_attach(source: &str) -> Result<Document, String> {
    crate::lex::testing::parse_without_annotation_attachment(source)
}
1348
1349// ============================================================================
1350// Filesystem-backed loader
1351// ============================================================================
1352
/// Filesystem-backed [`Loader`]: reads include targets with
/// `std::fs::read_to_string`.
///
/// This is the production loader used by the CLI; the LSP wraps it with a
/// file-watch invalidation layer in PR 8. lex-core's *resolver* code does
/// not reference `std::fs` — `FsLoader` is the one place that does,
/// isolated behind the [`Loader`] trait so the rest of the crate stays
/// sandbox- and WASM-friendly.
///
/// The loader is built around a resolution root and re-validates every
/// load against it *after* `fs::canonicalize`, so a symlink pointing
/// outside the root is rejected even though the lexical-only check in
/// [`resolve_path`] cannot see it. Non-regular files (devices, FIFOs,
/// directories) are refused before reading, so the loader can't be
/// tricked into blocking on `/dev/zero` or allocating against an open
/// device.
///
/// Error mapping performed by `load`:
/// - canonicalization fails with `ErrorKind::NotFound` →
///   [`LoadError::NotFound`]
/// - canonicalization fails for any other reason → [`LoadError::Io`]
/// - canonical path doesn't sit under the canonical root →
///   [`LoadError::OutsideRoot`]
/// - target is not a regular file → [`LoadError::Io`] with a clear message
/// - target is larger than the configured cap → [`LoadError::TooLarge`]
/// - any other I/O error while inspecting or reading the file →
///   [`LoadError::Io`]
pub struct FsLoader {
    /// Filesystem-canonical resolution root, computed once in
    /// `FsLoader::new`. If canonicalization fails (e.g., the configured
    /// root doesn't exist on disk) the input is stored verbatim; loads
    /// are still checked against it, so the security check is never
    /// silently disabled.
    canonical_root: PathBuf,
    /// Per-file size cap in bytes. A file whose metadata reports a larger
    /// size is refused with `LoadError::TooLarge` before its contents are
    /// read. Defaults to [`FsLoader::DEFAULT_MAX_FILE_SIZE`].
    max_file_size: u64,
}

impl FsLoader {
    /// Default per-file size cap: 10 MiB. Generous for realistic Lex
    /// source documents (text only) and tight enough to bound memory
    /// allocation per include against an adversarial 1 GB file.
    pub const DEFAULT_MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;

    /// Build a loader rooted at `root`, using the default size cap.
    ///
    /// The root is stored in its fs-canonical form (symlinks resolved)
    /// when that can be computed, and verbatim otherwise. Every later
    /// load must canonicalize to a path underneath it.
    pub fn new(root: PathBuf) -> Self {
        let canonical_root = match std::fs::canonicalize(&root) {
            Ok(resolved) => resolved,
            // Root missing or unreadable: keep what the caller gave us.
            Err(_) => root,
        };
        Self {
            canonical_root,
            max_file_size: Self::DEFAULT_MAX_FILE_SIZE,
        }
    }

    /// Replace the per-file size cap (bytes) and return the loader.
    ///
    /// Widen it for projects with genuinely large source files, or
    /// tighten it for stricter sandboxes (e.g., LSPs serving untrusted
    /// content).
    pub fn with_max_file_size(mut self, max_file_size: u64) -> Self {
        self.max_file_size = max_file_size;
        self
    }
}
1414
1415impl Loader for FsLoader {
1416 fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
1417 // 1. Canonicalize. Resolves symlinks and `..` segments against the
1418 // real filesystem. NotFound / broken-symlink / permission errors
1419 // all surface here.
1420 let canonical_path = std::fs::canonicalize(path).map_err(|e| match e.kind() {
1421 std::io::ErrorKind::NotFound => LoadError::NotFound {
1422 path: path.to_path_buf(),
1423 },
1424 _ => LoadError::Io {
1425 path: path.to_path_buf(),
1426 message: e.to_string(),
1427 },
1428 })?;
1429
1430 // 2. Bounds check against the *canonical* root. This is the
1431 // actual security gate against symlink traversal — the lexical
1432 // check in resolve_path can't see through symlinks.
1433 if !canonical_path.starts_with(&self.canonical_root) {
1434 return Err(LoadError::OutsideRoot {
1435 path: canonical_path,
1436 root: self.canonical_root.clone(),
1437 });
1438 }
1439
1440 // 3. Reject non-regular files. Without this, an attacker (with
1441 // write access to the repo) could symlink an include target to
1442 // `/dev/zero` or a FIFO and block / OOM the reader. The
1443 // is_file() metadata call is a cheap sanity check.
1444 let meta = std::fs::metadata(&canonical_path).map_err(|e| LoadError::Io {
1445 path: canonical_path.clone(),
1446 message: e.to_string(),
1447 })?;
1448 if !meta.is_file() {
1449 return Err(LoadError::Io {
1450 path: canonical_path,
1451 message: "include target is not a regular file".to_string(),
1452 });
1453 }
1454
1455 // 4. Size cap. Bounds memory allocation per include against an
1456 // adversarial 1 GB file before any bytes hit the heap.
1457 let size = meta.len();
1458 if size > self.max_file_size {
1459 return Err(LoadError::TooLarge {
1460 path: canonical_path,
1461 size,
1462 limit: self.max_file_size,
1463 });
1464 }
1465
1466 // 5. Read. By this point we know the path is a regular file under
1467 // the canonical root and within the size cap; anything that
1468 // fails here is a real I/O error worth surfacing.
1469 let source = std::fs::read_to_string(&canonical_path).map_err(|e| LoadError::Io {
1470 path: canonical_path.clone(),
1471 message: e.to_string(),
1472 })?;
1473
1474 Ok(LoadedFile {
1475 source,
1476 canonical_path,
1477 })
1478 }
1479}
1480
1481// ============================================================================
1482// Test fixtures (test-support feature + cfg(test))
1483// ============================================================================
1484
/// In-memory [`Loader`] for tests: files live in a
/// `HashMap<PathBuf, String>` keyed by the exact path they were
/// registered under.
///
/// `MemoryLoader::default()` and [`MemoryLoader::new`] both yield an
/// empty loader.
#[cfg(any(test, feature = "test-support"))]
#[derive(Default)]
pub struct MemoryLoader {
    files: std::collections::HashMap<PathBuf, String>,
}

#[cfg(any(test, feature = "test-support"))]
impl MemoryLoader {
    /// Create an empty loader. Add files with [`MemoryLoader::insert`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Register `contents` as the source text of the file at `path`,
    /// replacing any text previously registered under the same path.
    /// Returns `&mut Self` so calls can be chained.
    pub fn insert<P: Into<PathBuf>, S: Into<String>>(&mut self, path: P, contents: S) -> &mut Self {
        let (key, text): (PathBuf, String) = (path.into(), contents.into());
        self.files.insert(key, text);
        self
    }

    /// Convenience constructor: build a loader from any iterator of
    /// `(path, contents)` pairs. When a path occurs more than once, the
    /// last pair wins.
    pub fn from_pairs<I, P, S>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (P, S)>,
        P: Into<PathBuf>,
        S: Into<String>,
    {
        Self {
            files: pairs
                .into_iter()
                .map(|(path, contents)| (path.into(), contents.into()))
                .collect(),
        }
    }
}
1528
#[cfg(any(test, feature = "test-support"))]
impl Loader for MemoryLoader {
    /// Look `path` up in the in-memory table.
    ///
    /// Memory loaders have no symlinks, so the lookup key *is* the
    /// canonical identity: the returned `canonical_path` is `path`
    /// itself. Cycle detection in the resolver compares
    /// `LoadedFile::canonical_path` values; for tests this matches the
    /// lexically-normalized paths the resolver already produces.
    ///
    /// # Errors
    ///
    /// [`LoadError::NotFound`] when nothing is registered under `path`.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
        match self.files.get(path) {
            Some(contents) => Ok(LoadedFile {
                source: contents.clone(),
                canonical_path: path.to_path_buf(),
            }),
            None => Err(LoadError::NotFound {
                path: path.to_path_buf(),
            }),
        }
    }
}
1549
1550// ============================================================================
1551// Tests
1552// ============================================================================
1553
1554#[cfg(test)]
1555mod tests;