lex_core/lex/includes.rs
1//! Include resolution for Lex documents.
2//!
3//! This module turns `:: lex.include src="..." ::` annotations into spliced
4//! content from the referenced files. It is *opt-in*: callers that want the
5//! unresolved tree (the formatter, tree-sitter parity, editor tooling that
6//! displays include statements as authored) skip this pass entirely. The
7//! parser itself never touches the filesystem — all I/O goes through the
8//! injected [`Loader`] trait.
9//!
10//! See `comms/specs/proposals/includes.lex` for the full design.
11//!
12//! # Status
13//!
//! This module is being built up across PRs 3–7:
15//!
16//! - PR 3: skeleton — trait, config, errors, stub.
17//! - PR 4: single-pass splice + container-policy validation +
18//! doc-title/doc-annotation conversion + origin stamping + root-escape
19//! check.
20//! - PR 5: recursive resolution into included files + cycle detection
21//! (chain stack) + depth limit. Each loaded file gets walked in its OWN
22//! directory, so relative paths inside an included file resolve from
23//! that file's directory, not the entry's.
24//! - PR 6: origin-aware reference helpers. [`resolve_file_reference`]
25//! resolves a `ReferenceType::File` target from the authoring file's
26//! directory using `Range.origin_path`.
27//! `Document::find_annotation_by_label_in_origin` scopes footnote
28//! lookups to the file the reference was authored in.
29//! - PR 7 (this PR): [`FsLoader`] — production loader that reads from the
30//! filesystem with `std::fs::read_to_string`. CLI wires the resolver
31//! into `lex convert` and `lex inspect` (default-on, opt-out via
32//! `--no-includes`); `lex format` never expands.
33//!
34//! # Layering
35//!
36//! Of all of lex-core, only [`FsLoader`] references `std::fs`. The
37//! resolver itself does no I/O — it always goes through the [`Loader`]
38//! trait. Callers can swap loaders to keep the resolver sandboxed:
39//!
40//! - The LSP wraps [`FsLoader`] with file-watch invalidation (PR 8).
41//! - WASM builds provide a JS-backed loader instead of [`FsLoader`].
42//! - Tests use [`MemoryLoader`] (gated behind `test-support`).
43//!
44//! For tests, lex-core itself ships [`MemoryLoader`] gated behind the
45//! `test-support` cargo feature. It is not intended for production use.
46
47// `IncludeError` carries diagnostic context (paths, source ranges,
48// handler messages) on every variant; the `result_large_err` lint
49// would have us box the whole error or split it into a thinner shape
50// just to satisfy the size heuristic. The enum is already part of
51// the public API and the error path is rare; suppress the lint for
52// this module rather than churn the public surface.
53#![allow(clippy::result_large_err)]
54
55use crate::lex::assembling::stages::{ApplyTableConfig, NormalizeLabels};
56use crate::lex::assembling::AttachAnnotations;
57use crate::lex::ast::elements::container::GeneralContainer;
58use crate::lex::ast::elements::content_item::ContentItem;
59use crate::lex::ast::elements::session::Session;
60use crate::lex::ast::range::Range;
61use crate::lex::ast::Document;
62use crate::lex::transforms::Runnable;
63use lex_extension::handler::HandlerError;
64use lex_extension_host::registry::Registry;
65use std::path::{Path, PathBuf};
66use std::sync::Arc;
67
/// Settings that bound the include resolution pass: where include paths
/// are allowed to land and how much work the resolver may do.
#[derive(Debug, Clone)]
pub struct ResolveConfig {
    /// Root directory every include path has to stay under. An include
    /// that canonicalizes to somewhere outside it is reported as
    /// [`IncludeError::RootEscape`].
    ///
    /// This must be an **absolute** path. Lexical normalization treats
    /// `.` and `..` against an empty buffer as no-ops, so a relative or
    /// unnormalized root weakens the root-escape prefix check. Callers
    /// (CLI, LSP) should canonicalize the root before building a
    /// `ResolveConfig`.
    pub root: PathBuf,
    /// Longest include chain allowed. Defaults to 8
    /// (see [`ResolveConfig::DEFAULT_MAX_DEPTH`]). Reaching the limit is
    /// reported as an error; content is never silently truncated.
    pub max_depth: usize,
    /// Upper bound on the total number of `lex.include` annotations
    /// resolved across the whole tree (depth × breadth). Defaults to 1000
    /// (see [`ResolveConfig::DEFAULT_MAX_TOTAL_INCLUDES`]).
    ///
    /// This caps fan-out, which `max_depth` cannot: that limit bounds the
    /// chain length but not the breadth. A document with 100 thousand
    /// top-level includes at depth 1 stays within `max_depth` yet can
    /// still OOM the resolver / LSP / CI. Reaching this limit is reported
    /// as an error; content is never silently truncated.
    pub max_total_includes: usize,
}

impl ResolveConfig {
    /// Default for [`ResolveConfig::max_depth`]. Deep enough for any
    /// reasonable atomization strategy (aggregator → per-chapter →
    /// per-section) while keeping the resolver's worst-case work
    /// predictable.
    pub const DEFAULT_MAX_DEPTH: usize = 8;

    /// Default for [`ResolveConfig::max_total_includes`] (a DoS bound).
    /// Roomy enough for a book-length document made of thousands of small
    /// fragments, tight enough to contain adversarial fan-out within a
    /// few seconds of resolver work.
    pub const DEFAULT_MAX_TOTAL_INCLUDES: usize = 1_000;

    /// Build a config rooted at `root`, with both limits set to their
    /// defaults.
    pub fn with_root(root: PathBuf) -> Self {
        let max_depth = Self::DEFAULT_MAX_DEPTH;
        let max_total_includes = Self::DEFAULT_MAX_TOTAL_INCLUDES;
        Self {
            root,
            max_depth,
            max_total_includes,
        }
    }
}
115
116/// A pluggable source-text loader.
117///
118/// Implementations decide where bytes come from (filesystem, in-memory map,
119/// virtual filesystem, content-addressed store, …). lex-core never references
120/// `std::fs` directly through this trait; that keeps the resolver pure and
121/// usable in WASM, sandboxes, and unit tests.
122pub trait Loader {
123 /// Load the source text for `path` and return both the contents and a
124 /// canonical identity for the loaded resource. The path is what the
125 /// resolver decided on after applying the rules in §4 of the proposal.
126 ///
127 /// `LoadedFile::canonical_path` is the loader's authoritative identity
128 /// for the resource. For [`FsLoader`] this is the filesystem-canonical
129 /// path (symlinks resolved, case-folded if the underlying FS is
130 /// case-insensitive); for [`MemoryLoader`] it's the lookup key (since
131 /// memory loaders have no symlinks). The resolver uses this for cycle
132 /// detection and for stamping `Range.origin_path` on the loaded tree.
133 fn load(&self, path: &Path) -> Result<LoadedFile, LoadError>;
134}
135
/// What a successful [`Loader::load`] hands back.
#[derive(Debug, Clone)]
pub struct LoadedFile {
    /// Source text of the loaded file.
    pub source: String,
    /// Authoritative identity the loader assigns to the resource.
    /// [`Loader::load`] describes how loaders choose it.
    pub canonical_path: PathBuf,
}
145
/// Failure modes of a [`Loader`].
#[derive(Debug, Clone)]
pub enum LoadError {
    /// No resource exists at the requested path.
    NotFound { path: PathBuf },
    /// The resource exists but resolves outside the boundary the loader
    /// allows. The lexical resolver normalizes `..` in the requested path,
    /// but loaders that touch a real filesystem must check again after
    /// canonicalization to catch symlinks that escape the boundary in ways
    /// lexically-correct paths cannot.
    OutsideRoot { path: PathBuf, root: PathBuf },
    /// The resource exists but is larger than the loader's configured
    /// limit. `size` and `limit` are byte counts. The resolver maps this to
    /// [`IncludeError::FileTooLarge`], adding the offending annotation's
    /// site.
    TooLarge {
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// An underlying I/O error (or its virtual-filesystem equivalent).
    Io { path: PathBuf, message: String },
}

impl std::fmt::Display for LoadError {
    /// Render a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotFound { path } => {
                let shown = path.display();
                write!(f, "include not found: {shown}")
            }
            Self::OutsideRoot { path, root } => {
                let shown_path = path.display();
                let shown_root = root.display();
                write!(
                    f,
                    "include path {shown_path} resolves outside loader root {shown_root}"
                )
            }
            Self::TooLarge { path, size, limit } => {
                let shown = path.display();
                write!(
                    f,
                    "include file {shown} is {size} bytes, exceeds limit of {limit} bytes"
                )
            }
            Self::Io { path, message } => {
                let shown = path.display();
                write!(f, "io error reading {shown}: {message}")
            }
        }
    }
}

impl std::error::Error for LoadError {}
192
193/// Errors the include resolver can produce.
194#[derive(Debug, Clone)]
195pub enum IncludeError {
196 /// An include chain looped back on itself. `chain` is the resolution
197 /// stack at the moment the duplicate `path` was about to be pushed,
198 /// in source-order (entry first, deepest last). `include_site` is the
199 /// range of the offending `lex.include` annotation in its host file —
200 /// useful for diagnostics that highlight the exact line.
201 Cycle {
202 include_site: Range,
203 path: PathBuf,
204 chain: Vec<PathBuf>,
205 },
206 /// The include depth exceeded [`ResolveConfig::max_depth`]. `chain`
207 /// shows the resolution stack at the moment of failure, in source
208 /// order. `include_site` is the range of the offending
209 /// `lex.include` annotation in its host file.
210 DepthExceeded {
211 include_site: Range,
212 limit: usize,
213 chain: Vec<PathBuf>,
214 },
215 /// The total number of includes resolved across the document
216 /// exceeded [`ResolveConfig::max_total_includes`]. Bounds adversarial
217 /// fan-out (which `max_depth` alone does not). `include_site` is the
218 /// `lex.include` annotation that pushed the count past the limit.
219 TotalIncludesExceeded { include_site: Range, limit: usize },
220 /// The included file's size exceeded the loader's configured limit.
221 /// Surfaced by loaders that read from a real filesystem (FsLoader)
222 /// to bound memory allocation per include. `include_site` is the
223 /// offending annotation; `size` and `limit` are in bytes.
224 FileTooLarge {
225 include_site: Range,
226 path: PathBuf,
227 size: u64,
228 limit: u64,
229 },
230 /// A path resolved outside the configured [`ResolveConfig::root`].
231 RootEscape { path: PathBuf, root: PathBuf },
232 /// The include `src` was a platform-absolute filesystem path
233 /// (e.g. Windows `C:\foo`, `\\server\share`, `\foo`). The spec
234 /// forbids absolute filesystem paths from entering the
235 /// resolution pipeline; the *root-absolute* form (leading `/`
236 /// resolved against the includes root) is the only spec-allowed
237 /// way to write a path that doesn't start from the host's
238 /// directory. On Unix the only thing that's `Path::is_absolute()`
239 /// is a leading `/`, which is consumed by the root-absolute
240 /// branch first; this variant therefore only fires in practice
241 /// for Windows-shaped absolute paths.
242 AbsolutePath { path: PathBuf },
243 /// The loader could not find or read the included file. `include_site`
244 /// is the range of the offending `lex.include` annotation in its host
245 /// file, so editors can squiggle the line that asked for the missing
246 /// file rather than the document head.
247 NotFound { include_site: Range, path: PathBuf },
248 /// The loader returned text that the parser rejected.
249 ParseFailed { path: PathBuf, message: String },
250 /// The included file's content is not legal in the include site's
251 /// parent container.
252 ///
253 /// Today this only occurs when an included file has top-level Sessions
254 /// and the include site is inside a `GeneralContainer` (Definition,
255 /// ListItem, or another Annotation's body). The `violation` field
256 /// names the offending content kind (e.g. `"Sessions"`) so future
257 /// container/policy combinations can reuse this variant without a
258 /// breaking change.
259 ContainerPolicy {
260 include_site: Range,
261 container: &'static str,
262 file: PathBuf,
263 violation: &'static str,
264 },
265 /// Loader propagated a non-`NotFound` I/O error.
266 LoaderIo { path: PathBuf, message: String },
267 /// `lex.include` annotation was missing the mandatory `src=` parameter.
268 MissingSrc { include_site: Range },
269 /// A registered handler returned an error the pass could not map
270 /// onto a more specific variant — typically a third-party
271 /// namespace's resolve hook surfacing an internal failure, or an
272 /// unrecognised handler-defined code from `lex.*` built-ins. The
273 /// `code` is the string identifier the registry attaches to the
274 /// diagnostic (`"handler.internal"`, `"handler.custom"`, …).
275 HandlerFailed {
276 include_site: Range,
277 label: String,
278 code: String,
279 message: String,
280 },
281}
282
283impl std::fmt::Display for IncludeError {
284 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
285 match self {
286 IncludeError::Cycle { path, chain, .. } => {
287 let chain_display: Vec<String> =
288 chain.iter().map(|p| p.display().to_string()).collect();
289 write!(
290 f,
291 "include cycle: {} (chain: {})",
292 path.display(),
293 chain_display.join(" -> ")
294 )
295 }
296 IncludeError::DepthExceeded { limit, chain, .. } => {
297 let chain_display: Vec<String> =
298 chain.iter().map(|p| p.display().to_string()).collect();
299 write!(
300 f,
301 "include depth exceeded limit of {limit} (chain: {})",
302 chain_display.join(" -> ")
303 )
304 }
305 IncludeError::TotalIncludesExceeded { limit, .. } => {
306 write!(f, "total include count exceeded limit of {limit}")
307 }
308 IncludeError::FileTooLarge {
309 path, size, limit, ..
310 } => {
311 write!(
312 f,
313 "included file {} is {size} bytes, exceeds limit of {limit} bytes",
314 path.display()
315 )
316 }
317 IncludeError::RootEscape { path, root } => write!(
318 f,
319 "include path {} escapes resolution root {}",
320 path.display(),
321 root.display()
322 ),
323 IncludeError::AbsolutePath { path } => write!(
324 f,
325 "include src {} is a platform-absolute path; \
326 the spec forbids absolute filesystem paths — use a relative path \
327 (chapters/01.lex) or a root-absolute path (/shared/01.lex)",
328 path.display()
329 ),
330 IncludeError::NotFound { path, .. } => {
331 write!(f, "include not found: {}", path.display())
332 }
333 IncludeError::ParseFailed { path, message } => {
334 write!(f, "failed to parse {}: {message}", path.display())
335 }
336 IncludeError::ContainerPolicy {
337 container,
338 file,
339 violation,
340 ..
341 } => write!(
342 f,
343 "included file {} contains {} but include site is inside {} \
344 (which does not allow {})",
345 file.display(),
346 violation,
347 container,
348 violation
349 ),
350 IncludeError::LoaderIo { path, message } => {
351 write!(f, "loader error reading {}: {message}", path.display())
352 }
353 IncludeError::MissingSrc { .. } => {
354 write!(f, "lex.include annotation missing required src= parameter")
355 }
356 IncludeError::HandlerFailed {
357 label,
358 code,
359 message,
360 ..
361 } => write!(f, "extension handler `{label}` failed ({code}): {message}"),
362 }
363 }
364}
365
366impl std::error::Error for IncludeError {}
367
368// No `From<LoadError>` impl: `IncludeError::NotFound` carries the include
369// site (the `lex.include` annotation's range), which a loader doesn't know
370// about. Callers map `LoadError` explicitly at the call site, where the
371// site is available.
372
/// The kind of container an include site lives in. It selects the
/// splice-time policy check (today the only one is "no Sessions in
/// `GeneralContainer`").
#[derive(Debug, Clone, Copy)]
enum ContainerKind {
    /// `Document.root.children` or `Session.children` — accepts everything.
    Session,
    /// `Definition.children` — a `GeneralContainer`.
    Definition,
    /// `Annotation.children` — a `GeneralContainer`.
    AnnotationBody,
    /// `ListItem.children` — a `GeneralContainer`.
    ListItem,
}

impl ContainerKind {
    /// Human-readable container name used in diagnostics.
    fn name(self) -> &'static str {
        match self {
            Self::Session => "Session",
            Self::Definition => "Definition",
            Self::AnnotationBody => "Annotation body",
            Self::ListItem => "ListItem",
        }
    }

    /// Whether Sessions may be spliced into this kind of container.
    /// Only the session-like container accepts them.
    fn allows_sessions(self) -> bool {
        match self {
            Self::Session => true,
            Self::Definition | Self::AnnotationBody | Self::ListItem => false,
        }
    }
}
401
/// Non-configurable ceiling on resolution depth. It applies even when
/// [`ResolveConfig::max_depth`] is set higher, so the resolver always
/// terminates under adversarial varying-position recursion: a handler
/// that returns content with a different invocation site on every
/// iteration, so that the cycle key never matches.
pub const KERNEL_DEPTH_BACKSTOP: usize = 32;
408
409/// Resolve every `hooks.resolve = true` labelled annotation starting
410/// from `source`, dispatching through `registry`, and recursively
411/// processing the spliced content.
412///
413/// `source_path` identifies the entry-point file. It is used to
414/// (a) stamp `Range.origin_path` on every node so downstream code
415/// (file-ref resolution, diagnostics, LSP goto) can report locations
416/// against the authoring file, and (b) provide the host directory
417/// the built-in `lex.include` handler resolves relative `src=` paths
418/// against (via `LabelCtx.node.origin`). When `None`, origin stamping
419/// is skipped on the entry and the handler resolves relative paths
420/// against `config.root`.
421///
422/// # Generic dispatch
423///
424/// Every label whose schema declares `hooks.resolve = true` flows
425/// through the same path: build a [`LabelCtx`] from the annotation,
426/// call [`Registry::dispatch_resolve_raw`], decode the returned
427/// [`WireNode`] back into typed [`ContentItem`]s via
428/// [`crate::lex::wire::from_wire_node`], and splice in place. The
429/// built-in `lex.include` handler is registered the same way as any
430/// third-party namespace.
431///
432/// # Pre/post-attachment
433///
434/// Internally this re-parses the entry source *without* annotation
435/// attachment so labelled annotations stay visible as standalone
436/// children. The handler does its own `parse_no_attach` for loaded
437/// content. After all splices, [`AttachAnnotations`] runs once on
438/// the merged tree.
439///
440/// # Recursion + cycle detection
441///
442/// Cycle detection keys on `(label, origin_path, start_position)` of
443/// the invocation site. A handler that returns content containing
444/// another invocation at the same source position is caught
445/// immediately. A handler that varies the invocation position each
446/// iteration terminates at `min(config.max_depth, KERNEL_DEPTH_BACKSTOP)`
447/// with `IncludeError::DepthExceeded`. The total-includes counter
448/// caps adversarial fan-out independent of depth.
449pub fn resolve_from_source(
450 source: &str,
451 source_path: Option<PathBuf>,
452 config: &ResolveConfig,
453 registry: &Registry,
454) -> Result<Document, IncludeError> {
455 let entry_origin = source_path.as_ref().map(|p| Arc::new(p.clone()));
456
457 // Run the SHARED parser front-end (the same one `run_string_to_ast`
458 // uses): source → assembled Document (annotations still standalone)
459 // plus the reference-line pre-pass results. This is the de-duplication
460 // fix for lex#722 — before this, the resolver had its own hand-rolled
461 // copy of the front-end (`parse_without_annotation_attachment`) that
462 // never ran the reference-line pre-pass, so whole-element anchors were
463 // silently dropped on the default `lexd <file> --to <fmt>` path. Now
464 // there is exactly one front-end and it can't drift.
465 let (mut doc, prepass) = crate::lex::transforms::standard::parse_to_attached_root(
466 source.to_string(),
467 )
468 .map_err(|e| IncludeError::ParseFailed {
469 path: source_path.clone().unwrap_or_default(),
470 message: e.to_string(),
471 })?;
472
473 // Carry the entry file's reference lines (whole-element anchors) onto
474 // the document so the babel serializers / LSP documentLink can render
475 // them. These ranges are in the entry source's original coordinates,
476 // which is correct for the entry file. Reference lines that live
477 // *inside* included files are handled separately after splicing (see
478 // below); they are NOT in `prepass`, which only saw the entry source.
479 doc.reference_lines = prepass.reference_lines;
480 doc.reference_line_diagnostics = prepass.diagnostics;
481
482 if let Some(origin) = entry_origin.as_ref() {
483 stamp_doc(&mut doc, origin);
484 }
485
486 // Normalise labels in the entry source BEFORE the resolve walk so
487 // shortcut spellings (`:: include ::`, `:: image ::`, …) are
488 // rewritten to their canonical form. The resolve dispatcher keys
489 // on `registry.schema_for(label)` with the canonical spelling, so
490 // without this an `:: include src=... ::` annotation would be
491 // skipped because no schema is registered under the bare alias.
492 //
493 // Permissive mode: unknown labels are left as-is rather than
494 // erroring. The standard parse pipeline enforces strict-mode
495 // namespace policy (`STRING_TO_AST`); the resolve entry point is
496 // a downstream stage that just needs the shortcut table applied
497 // so dispatch finds the right handler.
498 let mut doc =
499 NormalizeLabels::permissive()
500 .run(doc)
501 .map_err(|e| IncludeError::ParseFailed {
502 path: source_path.clone().unwrap_or_default(),
503 message: format!("label normalisation failed: {e}"),
504 })?;
505
506 let mut chain: Vec<ResolveKey> = Vec::new();
507 let mut state = ResolverState {
508 config,
509 registry,
510 chain: &mut chain,
511 depth: 0,
512 total_resolved: 0,
513 };
514
515 splice_in_session_container(doc.root.children.as_mut_vec(), &mut state)?;
516
517 let doc = AttachAnnotations::new()
518 .run(doc)
519 .map_err(|e| IncludeError::ParseFailed {
520 path: source_path.clone().unwrap_or_default(),
521 message: format!("annotation attachment failed: {e}"),
522 })?;
523
524 // Re-normalise after splicing. Each included file is parsed via
525 // `parse_no_attach` (no normalisation), so shortcut labels in the
526 // spliced content — e.g. `:: image src=... ::` inside an included
527 // chapter — need rewriting before downstream IR/format passes can
528 // dispatch them.
529 let doc = NormalizeLabels::permissive()
530 .run(doc)
531 .map_err(|e| IncludeError::ParseFailed {
532 path: source_path.clone().unwrap_or_default(),
533 message: format!("label normalisation failed: {e}"),
534 })?;
535
536 // Apply table configuration so `:: table header=N align=... ::`
537 // annotations attached to tables (here or in spliced content) take
538 // effect — matches the order the standard pipeline runs them.
539 let doc = ApplyTableConfig::new()
540 .run(doc)
541 .map_err(|e| IncludeError::ParseFailed {
542 path: source_path.unwrap_or_default(),
543 message: format!("table config application failed: {e}"),
544 })?;
545
546 Ok(doc)
547}
548
549// ============================================================================
550// Splicing
551// ============================================================================
552
553/// One frame on the resolve-pass cycle stack. Two invocations at the
554/// same `(label, origin, start)` position are a cycle, regardless of
555/// what parameters either invocation uses — a handler that varies
556/// params per call (random IDs, timestamps) cannot defeat the
557/// detector by changing param values.
558#[derive(Debug, Clone, PartialEq)]
559struct ResolveKey {
560 label: String,
561 /// `Range.origin_path` of the annotation — the file the
562 /// invocation was authored in. `None` when stamping was skipped
563 /// (e.g., entry source loaded from a string with no path).
564 origin: Option<PathBuf>,
565 start: crate::lex::ast::range::Position,
566}
567
568impl ResolveKey {
569 fn from_annotation(a: &crate::lex::ast::elements::annotation::Annotation) -> Self {
570 Self {
571 label: a.data.label.value.clone(),
572 origin: a.location.origin_path.as_ref().map(|p| (**p).clone()),
573 start: a.location.start,
574 }
575 }
576}
577
578/// Per-resolution state threaded through the recursive walker. Keeps the
579/// signatures of the splice/process functions short and ensures
580/// `chain`/`depth` are updated in lock-step (push/pop, +1/back-out) at
581/// each invocation.
582struct ResolverState<'a> {
583 config: &'a ResolveConfig,
584 registry: &'a Registry,
585 /// Active resolution stack of `(label, origin, position)` keys.
586 /// Pushed when we begin dispatching for an invocation and popped
587 /// when its splice subtree is fully resolved. A push that finds
588 /// the same key already on the stack is a cycle.
589 chain: &'a mut Vec<ResolveKey>,
590 /// Number of dispatch hops from the entry point. Each recursion
591 /// increments by 1. Hitting `config.max_depth` or the
592 /// [`KERNEL_DEPTH_BACKSTOP`] (whichever is lower) is an error.
593 depth: usize,
594 /// Total invocations resolved across the entire walk
595 /// (depth × breadth). Incremented on every successful dispatch.
596 /// Hitting `config.max_total_includes` aborts with
597 /// `TotalIncludesExceeded`.
598 total_resolved: usize,
599}
600
601fn splice_in_session_container(
602 children: &mut Vec<ContentItem>,
603 state: &mut ResolverState<'_>,
604) -> Result<(), IncludeError> {
605 // Post-order: recurse into nested containers first, splice this
606 // container's invocations second. Recursion happens inside
607 // `process_resolves` for any spliced subtree, so that subtree
608 // is never re-walked at the parent level.
609 recurse_into_children(children, state)?;
610 process_resolves(children, state, ContainerKind::Session)
611}
612
613fn splice_in_general_container(
614 container: &mut GeneralContainer,
615 state: &mut ResolverState<'_>,
616 kind: ContainerKind,
617) -> Result<(), IncludeError> {
618 recurse_into_children(container.as_mut_vec(), state)?;
619 process_resolves(container.as_mut_vec(), state, kind)
620}
621
622/// Walk the children of a container, dispatch every annotation whose
623/// schema declares `hooks.resolve = true` through the registry, and
624/// splice the returned content in place of the annotation. Recurses
625/// into the spliced content so nested invocations resolve too.
626// Allow &mut Vec because `splice` needs Vec-specific operations.
627#[allow(clippy::ptr_arg)]
628fn process_resolves(
629 children: &mut Vec<ContentItem>,
630 state: &mut ResolverState<'_>,
631 kind: ContainerKind,
632) -> Result<(), IncludeError> {
633 // Collect indices of annotations whose schema has hooks.resolve.
634 let resolve_indices: Vec<usize> = children
635 .iter()
636 .enumerate()
637 .filter_map(|(i, item)| match item {
638 ContentItem::Annotation(a) => {
639 let label = &a.data.label.value;
640 if state
641 .registry
642 .schema_for(label)
643 .map(|s| s.hooks.resolve)
644 .unwrap_or(false)
645 {
646 Some(i)
647 } else {
648 None
649 }
650 }
651 _ => None,
652 })
653 .collect();
654
655 for i in resolve_indices.into_iter().rev() {
656 let annotation = match &children[i] {
657 ContentItem::Annotation(a) => a.clone(),
658 _ => unreachable!("index came from resolve filter"),
659 };
660
661 match resolve_one_invocation(&annotation, state, kind)? {
662 ResolveOutcome::Spliced(splice_items) => {
663 // Expansion replaces the directive with the included content. The
664 // `lex.include` annotation is consumed — drop it. (It used to be
665 // kept in the stream as provenance, relying on the serializer
666 // dropping attached annotations; now that the serializer emits
667 // them (lex#682), keeping it would leak `:: lex.include ::` into
668 // expanded output. Origin provenance is tracked on
669 // `Range.origin_path`, not this node.)
670 children.splice(i..=i, splice_items);
671 }
672 ResolveOutcome::Unexpanded => {
673 // Handler opted out of expanding this invocation. The
674 // annotation stays in place, but its body wasn't
675 // walked by `recurse_into_children` (that walker
676 // skips resolve-hooked annotations to avoid double-
677 // resolution). Walk the body now so any nested
678 // invocations inside the unexpanded annotation get
679 // resolved on the way back up.
680 let mut owned = annotation;
681 splice_in_general_container(
682 &mut owned.children,
683 state,
684 ContainerKind::AnnotationBody,
685 )?;
686 children[i] = ContentItem::Annotation(owned);
687 }
688 }
689 }
690
691 Ok(())
692}
693
694/// Outcome of dispatching a single resolve-hooked annotation. The
695/// pass needs to distinguish between "handler returned content,
696/// splice it in" and "handler opted out, leave the annotation
697/// alone": the second case still requires walking the annotation's
698/// body for nested invocations because `recurse_into_children`
699/// otherwise skips resolve-hooked annotations to prevent double-
700/// resolution.
701enum ResolveOutcome {
702 Spliced(Vec<ContentItem>),
703 Unexpanded,
704}
705
706/// Dispatch a single resolve-hooked annotation through the registry,
707/// decode the returned `WireNode` back into typed children, then
708/// recursively walk the splice items so nested invocations resolve
709/// before the splice is placed into the parent container.
710///
711/// Returns [`ResolveOutcome::Unexpanded`] when the handler returned
712/// `Ok(None)` (third-party handlers can opt out of expanding a
713/// particular invocation). The caller is then responsible for
714/// walking the annotation's body for nested invocations — the
715/// resolve walker normally skips resolve-hooked annotations'
716/// bodies.
717fn resolve_one_invocation(
718 annotation: &crate::lex::ast::elements::annotation::Annotation,
719 state: &mut ResolverState<'_>,
720 parent_kind: ContainerKind,
721) -> Result<ResolveOutcome, IncludeError> {
722 let label = &annotation.data.label.value;
723 let key = ResolveKey::from_annotation(annotation);
724
725 // Cycle check on (label, origin, start) of the invocation site.
726 if state.chain.contains(&key) {
727 return Err(IncludeError::Cycle {
728 include_site: annotation.location.clone(),
729 path: key.origin.clone().unwrap_or_default(),
730 chain: state
731 .chain
732 .iter()
733 .map(|k| k.origin.clone().unwrap_or_default())
734 .collect(),
735 });
736 }
737
738 // Depth check. The effective limit is the lower of the
739 // user-facing `config.max_depth` (default 8) and the hard
740 // [`KERNEL_DEPTH_BACKSTOP`] (32, fixed). The kernel backstop
741 // exists for adversarial varying-position recursion that the
742 // cycle key can't catch — even if a user bumps `max_depth`
743 // higher than 32 for legitimate deep atomization, the backstop
744 // still terminates. The error reports `effective_depth_limit`
745 // (the actual cap that fired) rather than `config.max_depth`,
746 // so when the backstop is the binding limit the user sees `32`
747 // and not the (higher) config value.
748 let effective_depth_limit = state.config.max_depth.min(KERNEL_DEPTH_BACKSTOP);
749 if state.depth >= effective_depth_limit {
750 return Err(IncludeError::DepthExceeded {
751 include_site: annotation.location.clone(),
752 limit: effective_depth_limit,
753 chain: state
754 .chain
755 .iter()
756 .map(|k| k.origin.clone().unwrap_or_default())
757 .collect(),
758 });
759 }
760
761 // Total-count check before dispatch.
762 if state.total_resolved >= state.config.max_total_includes {
763 return Err(IncludeError::TotalIncludesExceeded {
764 include_site: annotation.location.clone(),
765 limit: state.config.max_total_includes,
766 });
767 }
768
769 let ctx = build_label_ctx(annotation);
770
771 let wire_node = match state.registry.dispatch_resolve_raw(&ctx) {
772 Ok(Some(node)) => node,
773 Ok(None) => {
774 // Handler returned "nothing to splice" — leave the
775 // annotation in place. The caller still needs to walk
776 // its body for nested invocations (built-in lex.include
777 // never returns None; this path is reachable only via
778 // third-party handlers that opt out per-invocation).
779 return Ok(ResolveOutcome::Unexpanded);
780 }
781 Err(handler_err) => {
782 return Err(handler_error_to_include_error(
783 &handler_err,
784 label,
785 &annotation.location,
786 ));
787 }
788 };
789
790 state.total_resolved += 1;
791
792 // Decode the wire payload into typed lex-core ContentItems.
793 let mut splice_items = decode_wire_to_items(&wire_node, label, &annotation.location)?;
794
795 // Recurse into the spliced subtree FIRST so nested resolve-hooked
796 // annotations are processed before the splice lands. Validation
797 // must wait until *after* this step: a nested invocation can
798 // splice in content (e.g. a top-level `Session` from a chained
799 // `lex.include`) that wasn't in the handler's original output,
800 // and the final shape is what has to satisfy the parent
801 // container's policy.
802 //
803 // The `IncludeError::ContainerPolicy.file` field describes the
804 // *spliced content's* source file (the file containing the
805 // disallowed shape), not the invocation site. Take it from the
806 // handler-returned wire payload's origin when present, falling
807 // back to the first decoded item's origin path if the wire
808 // payload didn't stamp a `Document` origin.
809 let included_path = wire_node_origin_pathbuf(&wire_node)
810 .or_else(|| splice_items_first_origin(&splice_items))
811 .unwrap_or_default();
812 state.chain.push(key);
813 let saved_depth = state.depth;
814 state.depth = saved_depth + 1;
815 let recurse_result = splice_in_session_container(&mut splice_items, state);
816 state.depth = saved_depth;
817 state.chain.pop();
818 recurse_result?;
819
820 // Container-policy validation: enforce no-Sessions inside
821 // `GeneralContainer` (Definition / Annotation body / ListItem).
822 // Runs against the post-recursion splice list so nested
823 // expansions can't smuggle disallowed shapes past the check.
824 validate_against_kind(
825 &splice_items,
826 parent_kind,
827 &annotation.location,
828 &included_path,
829 )?;
830
831 Ok(ResolveOutcome::Spliced(splice_items))
832}
833
834/// Build a [`LabelCtx`] from a lex-core [`Annotation`]. The body is
835/// derived from the annotation's children (parsed-Lex form), the
836/// params from `Annotation::data::parameters`, and the host node info
837/// from `Annotation::location`.
838fn build_label_ctx(
839 a: &crate::lex::ast::elements::annotation::Annotation,
840) -> lex_extension::wire::LabelCtx {
841 use crate::lex::wire::to_wire_node;
842 use lex_extension::wire::{AnnotationBody, LabelCtx, NodeRef};
843
844 let label = a.data.label.value.clone();
845 let params = {
846 // Pass *semantic* parameter values to handlers (quotes
847 // stripped, escape sequences resolved). Handlers consume
848 // params as JSON values, where there is no "quoted string"
849 // vs "unquoted token" distinction; only the decoded value
850 // is meaningful. The codec's `parameters_to_json` (used by
851 // `annotation_to_wire` for round-tripping annotation
852 // *content*) keeps the raw form to preserve source — the
853 // two paths intentionally differ.
854 let mut obj = serde_json::Map::with_capacity(a.data.parameters.len());
855 for p in &a.data.parameters {
856 obj.insert(p.key.clone(), serde_json::Value::String(p.unquoted_value()));
857 }
858 serde_json::Value::Object(obj)
859 };
860 let body = if a.children.is_empty() {
861 AnnotationBody::None
862 } else {
863 let wire_children: Vec<lex_extension::wire::WireNode> =
864 a.children.iter().map(to_wire_node).collect();
865 AnnotationBody::Lex {
866 children: wire_children,
867 }
868 };
869 let range = lex_extension::wire::Range::new(
870 lex_extension::wire::Position::new(
871 u32::try_from(a.location.start.line).unwrap_or(u32::MAX),
872 u32::try_from(a.location.start.column).unwrap_or(u32::MAX),
873 ),
874 lex_extension::wire::Position::new(
875 u32::try_from(a.location.end.line).unwrap_or(u32::MAX),
876 u32::try_from(a.location.end.column).unwrap_or(u32::MAX),
877 ),
878 );
879 let origin = a
880 .location
881 .origin_path
882 .as_ref()
883 .map(|p| p.to_string_lossy().into_owned());
884 LabelCtx {
885 label,
886 params,
887 body,
888 node: NodeRef {
889 kind: "annotation".into(),
890 range,
891 origin,
892 },
893 }
894}
895
896/// Convert a handler-returned [`WireNode`] back into a list of
897/// [`ContentItem`]s ready for splicing. `WireNode::Document` is
898/// unwrapped (its children become the splice list); any other root
899/// shape is wrapped as a single-item list.
900///
901/// `invocation_label` is the label whose handler produced `wire` —
902/// threaded through so wire-decode failures are attributed to the
903/// real namespace rather than a hardcoded `lex.include`. A
904/// third-party `acme.expand` handler that returns malformed wire
905/// will surface as `IncludeError::HandlerFailed { label:
906/// "acme.expand", .. }`.
907/// Lift a [`WireNode`]'s top-level `origin` field into a `PathBuf`
908/// when present. Used by the resolve pass to attribute
909/// container-policy errors to the *spliced content's* source file
910/// rather than the invocation site.
911fn wire_node_origin_pathbuf(node: &lex_extension::wire::WireNode) -> Option<PathBuf> {
912 use lex_extension::wire::WireNode as W;
913 let s = match node {
914 W::Document { origin, .. } => origin.as_deref(),
915 W::Session { origin, .. } => origin.as_deref(),
916 W::Definition { origin, .. } => origin.as_deref(),
917 W::Paragraph { origin, .. } => origin.as_deref(),
918 W::List { origin, .. } => origin.as_deref(),
919 W::Verbatim { origin, .. } => origin.as_deref(),
920 W::Table { origin, .. } => origin.as_deref(),
921 W::Annotation { origin, .. } => origin.as_deref(),
922 W::Blank { origin, .. } => origin.as_deref(),
923 _ => None,
924 };
925 s.map(PathBuf::from)
926}
927
928/// Fallback when `WireNode::Document.origin` is unset: walk the
929/// decoded splice list and return the first item that carries an
930/// origin. The interner from `from_wire_node` ensures every item
931/// shares one Arc per origin string, so iterating is cheap.
932fn splice_items_first_origin(items: &[ContentItem]) -> Option<PathBuf> {
933 for item in items {
934 let r = match item {
935 ContentItem::Paragraph(p) => &p.location,
936 ContentItem::Session(s) => &s.location,
937 ContentItem::Definition(d) => &d.location,
938 ContentItem::List(l) => &l.location,
939 ContentItem::ListItem(li) => &li.location,
940 ContentItem::Annotation(a) => &a.location,
941 ContentItem::VerbatimBlock(v) => &v.location,
942 ContentItem::VerbatimLine(vl) => &vl.location,
943 ContentItem::Table(t) => &t.location,
944 ContentItem::TextLine(tl) => &tl.location,
945 ContentItem::BlankLineGroup(blg) => &blg.location,
946 };
947 if let Some(arc) = r.origin_path.as_ref() {
948 return Some((**arc).clone());
949 }
950 }
951 None
952}
953
954fn decode_wire_to_items(
955 wire: &lex_extension::wire::WireNode,
956 invocation_label: &str,
957 include_site: &Range,
958) -> Result<Vec<ContentItem>, IncludeError> {
959 use crate::lex::wire::from_wire_node;
960
961 from_wire_node(wire).map_err(|e| IncludeError::HandlerFailed {
962 include_site: include_site.clone(),
963 label: invocation_label.to_string(),
964 code: "wire.decode".into(),
965 message: format!("decoding handler-returned wire payload failed: {e}"),
966 })
967}
968
969/// Map a [`HandlerError`] returned by the registry into the most
970/// specific [`IncludeError`] variant available. Codes in the
971/// `-32001..=-32005` range emitted by [`crate::lex::builtins::LexIncludeHandler`]
972/// translate back to their corresponding pre-extension-system
973/// variants so existing CLI/LSP error rendering and the integration
974/// test suite keep working unchanged. Unknown codes (third-party
975/// namespaces, future built-ins) surface as `HandlerFailed`.
976fn handler_error_to_include_error(
977 err: &HandlerError,
978 label: &str,
979 include_site: &Range,
980) -> IncludeError {
981 use crate::lex::builtins::include::{
982 CODE_ABSOLUTE_PATH, CODE_IO, CODE_MISSING_SRC, CODE_NOT_FOUND, CODE_OUTSIDE_ROOT,
983 CODE_PARSE_FAILED, CODE_TOO_LARGE,
984 };
985
986 match err {
987 HandlerError::Custom {
988 code,
989 message,
990 data,
991 } => match *code {
992 CODE_NOT_FOUND => IncludeError::NotFound {
993 include_site: include_site.clone(),
994 path: data_str(data, "path")
995 .map(PathBuf::from)
996 .unwrap_or_default(),
997 },
998 CODE_OUTSIDE_ROOT => IncludeError::RootEscape {
999 path: data_str(data, "path")
1000 .map(PathBuf::from)
1001 .unwrap_or_default(),
1002 root: data_str(data, "root")
1003 .map(PathBuf::from)
1004 .unwrap_or_default(),
1005 },
1006 CODE_TOO_LARGE => IncludeError::FileTooLarge {
1007 include_site: include_site.clone(),
1008 path: data_str(data, "path")
1009 .map(PathBuf::from)
1010 .unwrap_or_default(),
1011 size: data_u64(data, "size").unwrap_or(0),
1012 limit: data_u64(data, "limit").unwrap_or(0),
1013 },
1014 CODE_ABSOLUTE_PATH => IncludeError::AbsolutePath {
1015 path: data_str(data, "path")
1016 .map(PathBuf::from)
1017 .unwrap_or_default(),
1018 },
1019 CODE_IO => IncludeError::LoaderIo {
1020 path: data_str(data, "path")
1021 .map(PathBuf::from)
1022 .unwrap_or_default(),
1023 message: message.clone(),
1024 },
1025 CODE_MISSING_SRC => IncludeError::MissingSrc {
1026 include_site: include_site.clone(),
1027 },
1028 CODE_PARSE_FAILED => IncludeError::ParseFailed {
1029 path: data_str(data, "path")
1030 .map(PathBuf::from)
1031 .unwrap_or_default(),
1032 message: data_str(data, "message").unwrap_or_else(|| message.clone()),
1033 },
1034 other => IncludeError::HandlerFailed {
1035 include_site: include_site.clone(),
1036 label: label.to_string(),
1037 code: format!("handler.custom({other})"),
1038 message: message.clone(),
1039 },
1040 },
1041 HandlerError::Internal { message } => IncludeError::HandlerFailed {
1042 include_site: include_site.clone(),
1043 label: label.to_string(),
1044 code: "handler.internal".into(),
1045 message: message.clone(),
1046 },
1047 HandlerError::Unsupported { detail } => IncludeError::HandlerFailed {
1048 include_site: include_site.clone(),
1049 label: label.to_string(),
1050 code: "handler.unsupported".into(),
1051 message: detail.clone(),
1052 },
1053 }
1054}
1055
1056fn data_str(data: &Option<serde_json::Value>, key: &str) -> Option<String> {
1057 data.as_ref()?.get(key)?.as_str().map(str::to_string)
1058}
1059
1060fn data_u64(data: &Option<serde_json::Value>, key: &str) -> Option<u64> {
1061 data.as_ref()?.get(key)?.as_u64()
1062}
1063
1064#[allow(clippy::ptr_arg)]
1065fn recurse_into_children(
1066 children: &mut Vec<ContentItem>,
1067 state: &mut ResolverState<'_>,
1068) -> Result<(), IncludeError> {
1069 for item in children.iter_mut() {
1070 match item {
1071 ContentItem::Session(s) => {
1072 splice_in_session_container(s.children.as_mut_vec(), state)?;
1073 }
1074 ContentItem::Definition(d) => {
1075 splice_in_general_container(&mut d.children, state, ContainerKind::Definition)?;
1076 }
1077 ContentItem::Annotation(a) => {
1078 // Skip the body of annotations whose schema declares
1079 // `hooks.resolve = true` — those are dispatched at the
1080 // parent level by `process_resolves`. Walking their
1081 // bodies *here* would trip the resolve again on the
1082 // same invocation.
1083 //
1084 // The body is still walked when the resolve actually
1085 // runs: `process_resolves` calls
1086 // `resolve_one_invocation`, and the
1087 // [`ResolveOutcome::Spliced`] arm walks the splice
1088 // subtree (which replaces the annotation), while the
1089 // [`ResolveOutcome::Unexpanded`] arm explicitly
1090 // walks the kept annotation's body via
1091 // `splice_in_general_container`. So nested
1092 // resolve-hooked annotations inside an unexpanded
1093 // outer annotation are still reached.
1094 //
1095 // Non-resolve-hooked annotations recurse normally
1096 // here so their nested bodies get processed.
1097 let is_resolve_hooked = state
1098 .registry
1099 .schema_for(&a.data.label.value)
1100 .map(|s| s.hooks.resolve)
1101 .unwrap_or(false);
1102 if !is_resolve_hooked {
1103 splice_in_general_container(
1104 &mut a.children,
1105 state,
1106 ContainerKind::AnnotationBody,
1107 )?;
1108 }
1109 }
1110 ContentItem::List(l) => {
1111 for li in l.items.as_mut_vec().iter_mut() {
1112 if let ContentItem::ListItem(item) = li {
1113 splice_in_general_container(
1114 &mut item.children,
1115 state,
1116 ContainerKind::ListItem,
1117 )?;
1118 }
1119 }
1120 }
1121 _ => {}
1122 }
1123 }
1124 Ok(())
1125}
1126
1127fn validate_against_kind(
1128 items: &[ContentItem],
1129 kind: ContainerKind,
1130 site: &Range,
1131 file: &Path,
1132) -> Result<(), IncludeError> {
1133 if kind.allows_sessions() {
1134 return Ok(());
1135 }
1136 if items.iter().any(|i| matches!(i, ContentItem::Session(_))) {
1137 return Err(IncludeError::ContainerPolicy {
1138 include_site: site.clone(),
1139 container: kind.name(),
1140 file: file.to_path_buf(),
1141 violation: "Sessions",
1142 });
1143 }
1144 Ok(())
1145}
1146
1147// ============================================================================
1148// Path resolution
1149// ============================================================================
1150
1151/// Resolve a file-reference target string the same way the include
1152/// resolver resolves include paths.
1153///
1154/// Use this when consuming `ReferenceType::File { target }` (or any other
1155/// node-attached path) so that relative paths resolve from the *authoring*
1156/// file's directory, not from wherever the merged document happens to be
1157/// rooted. Pass `ref_origin` as the [`Range::origin_path`] of the inline's
1158/// containing node (or `None` if the node was never stamped — in that case
1159/// the path is treated as if authored at the root).
1160///
1161/// Behaviour matches the include resolver:
1162/// - Root-absolute targets (leading `/`) resolve under `root`.
1163/// - Other targets resolve relative to `ref_origin`'s parent (or `root`
1164/// when `ref_origin` is `None`).
1165/// - The result is lexically normalized and checked against `root` —
1166/// paths that escape it return `RootEscape`.
1167///
1168/// This is a sister to the resolver's internal `resolve_path` and shares
1169/// the same lexical-normalization caveat: it does not touch the filesystem.
1170pub fn resolve_file_reference(
1171 target: &str,
1172 ref_origin: Option<&Path>,
1173 root: &Path,
1174) -> Result<PathBuf, IncludeError> {
1175 let host_dir: PathBuf = ref_origin
1176 .and_then(|p| p.parent())
1177 .map(Path::to_path_buf)
1178 .unwrap_or_else(|| root.to_path_buf());
1179 resolve_path(target, &host_dir, root)
1180}
1181
1182fn resolve_path(src: &str, host_dir: &Path, root: &Path) -> Result<PathBuf, IncludeError> {
1183 let candidate = if let Some(rel) = src.strip_prefix('/') {
1184 // Root-absolute (Lex spec convention): leading `/` means "from
1185 // the resolution root", not "filesystem root".
1186 root.join(rel)
1187 } else {
1188 // Anything else must be a relative path. Reject inputs the
1189 // host platform would treat as absolute (Windows `C:\foo`,
1190 // `\\server\share`, `\foo`) up front: the spec forbids
1191 // platform-absolute paths from entering the resolution
1192 // pipeline. Without this, `host_dir.join(src)` would silently
1193 // discard `host_dir` because Rust's `PathBuf::join` replaces
1194 // the base when the joined path is absolute. The downstream
1195 // root-escape check would still catch the security side, but
1196 // we'd surface a misleading "escapes root" error instead of
1197 // "absolute paths not allowed", and we'd be relying on
1198 // `PathBuf::join`'s override semantics for the security
1199 // outcome rather than holding the line at the input boundary.
1200 if Path::new(src).is_absolute() {
1201 return Err(IncludeError::AbsolutePath {
1202 path: PathBuf::from(src),
1203 });
1204 }
1205 host_dir.join(src)
1206 };
1207 let normalized = lexical_normalize(&candidate);
1208 let canonical_root = lexical_normalize(root);
1209 if !normalized.starts_with(&canonical_root) {
1210 return Err(IncludeError::RootEscape {
1211 path: normalized,
1212 root: canonical_root,
1213 });
1214 }
1215 Ok(normalized)
1216}
1217
/// Normalize a path purely lexically: drop `.` components and cancel
/// each `..` against a preceding directory name, without consulting the
/// filesystem.
///
/// `std::fs::canonicalize` needs the path to exist, which rules it out
/// for in-memory loaders. A lexical pass is enough here because the
/// resolver only needs a stable identity for cycle detection and a
/// uniform shape for the root-escape prefix check.
///
/// A `..` cancels the previous component only when that component is a
/// real name (`Component::Normal`). If nothing has been kept yet, or the
/// previous component is another `..` or a root/prefix marker, the `..`
/// is *kept*. So `../../etc/passwd` stays `../../etc/passwd` instead of
/// collapsing to `etc/passwd`, and every unmatched `..` keeps the result
/// visibly outside any sane root so the escape check can fire.
fn lexical_normalize(p: &Path) -> PathBuf {
    use std::path::Component;

    // Components that survive normalization, in order.
    let mut kept: Vec<Component<'_>> = Vec::new();
    for part in p.components() {
        match part {
            Component::CurDir => {}
            Component::ParentDir => {
                if matches!(kept.last(), Some(Component::Normal(_))) {
                    kept.pop();
                } else {
                    kept.push(Component::ParentDir);
                }
            }
            other => kept.push(other),
        }
    }
    kept.into_iter().collect()
}
1259
1260// ============================================================================
1261// Origin stamping
1262// ============================================================================
1263//
1264// Walk every node in a Document and set `Range.origin_path` on each
1265// `.location` field. The walk only stamps the *block-level* `.location`
1266// fields here; finer-grained inline ranges land in PR 6 when file-ref
1267// resolution starts consulting them.
1268
1269pub(crate) fn stamp_doc(doc: &mut Document, origin: &Arc<PathBuf>) {
1270 if let Some(title) = doc.title.as_mut() {
1271 title.location.origin_path = Some(Arc::clone(origin));
1272 }
1273 for ann in doc.annotations.iter_mut() {
1274 stamp_annotation(ann, origin);
1275 }
1276 stamp_session(&mut doc.root, origin);
1277}
1278
1279fn stamp_session(s: &mut Session, origin: &Arc<PathBuf>) {
1280 s.location.origin_path = Some(Arc::clone(origin));
1281 if let Some(loc) = s.title.location.as_mut() {
1282 loc.origin_path = Some(Arc::clone(origin));
1283 }
1284 for ann in s.annotations.iter_mut() {
1285 stamp_annotation(ann, origin);
1286 }
1287 for item in s.children.as_mut_vec().iter_mut() {
1288 stamp_item(item, origin);
1289 }
1290}
1291
1292fn stamp_annotation(
1293 a: &mut crate::lex::ast::elements::annotation::Annotation,
1294 origin: &Arc<PathBuf>,
1295) {
1296 a.location.origin_path = Some(Arc::clone(origin));
1297 a.data.location.origin_path = Some(Arc::clone(origin));
1298 for item in a.children.as_mut_vec().iter_mut() {
1299 stamp_item(item, origin);
1300 }
1301}
1302
1303fn stamp_item(item: &mut ContentItem, origin: &Arc<PathBuf>) {
1304 match item {
1305 ContentItem::Session(s) => stamp_session(s, origin),
1306 ContentItem::Annotation(a) => stamp_annotation(a, origin),
1307 ContentItem::Paragraph(p) => {
1308 p.location.origin_path = Some(Arc::clone(origin));
1309 for ann in p.annotations.iter_mut() {
1310 stamp_annotation(ann, origin);
1311 }
1312 for line in p.lines.iter_mut() {
1313 stamp_item(line, origin);
1314 }
1315 }
1316 ContentItem::List(l) => {
1317 l.location.origin_path = Some(Arc::clone(origin));
1318 for li in l.items.as_mut_vec().iter_mut() {
1319 stamp_item(li, origin);
1320 }
1321 }
1322 ContentItem::ListItem(li) => {
1323 li.location.origin_path = Some(Arc::clone(origin));
1324 for ann in li.annotations.iter_mut() {
1325 stamp_annotation(ann, origin);
1326 }
1327 for child in li.children.as_mut_vec().iter_mut() {
1328 stamp_item(child, origin);
1329 }
1330 }
1331 ContentItem::Definition(d) => {
1332 d.location.origin_path = Some(Arc::clone(origin));
1333 for ann in d.annotations.iter_mut() {
1334 stamp_annotation(ann, origin);
1335 }
1336 for child in d.children.as_mut_vec().iter_mut() {
1337 stamp_item(child, origin);
1338 }
1339 }
1340 ContentItem::VerbatimBlock(v) => {
1341 v.location.origin_path = Some(Arc::clone(origin));
1342 }
1343 ContentItem::VerbatimLine(vl) => {
1344 vl.location.origin_path = Some(Arc::clone(origin));
1345 }
1346 ContentItem::Table(t) => {
1347 t.location.origin_path = Some(Arc::clone(origin));
1348 }
1349 ContentItem::TextLine(tl) => {
1350 tl.location.origin_path = Some(Arc::clone(origin));
1351 }
1352 ContentItem::BlankLineGroup(b) => {
1353 b.location.origin_path = Some(Arc::clone(origin));
1354 }
1355 }
1356}
1357
1358// ============================================================================
1359// Parser glue
1360// ============================================================================
1361
1362/// Parse `source` into a Document but skip the annotation-attachment stage,
1363/// so include annotations are findable in container children lists.
1364///
1365/// Runs the shared parser front-end ([`parse_to_attached_root`]) — the same
1366/// one `run_string_to_ast` and `resolve_from_source` use — so the
1367/// reference-line pre-pass and any future front-end stage can never drift
1368/// from the standard path (lex#722). This is used by the built-in
1369/// `lex.include` handler to parse *included* files.
1370///
1371/// The returned document does **not** carry `reference_lines`: included
1372/// files reach the parent tree through the wire-AST codec, which has no
1373/// `reference_lines` field, so whole-element anchors authored *inside* an
1374/// included file are not propagated to the merged document (see the
1375/// follow-up note in `resolve_from_source`). The pre-pass still runs here
1376/// (it must, to keep a reference line from being mistaken for a structural
1377/// blank line in the included file's own parse), but its result is dropped
1378/// rather than emitted as a wrong-coordinate range in the merged document.
1379pub(crate) fn parse_no_attach(source: &str) -> Result<Document, String> {
1380 crate::lex::transforms::standard::parse_to_attached_root(source.to_string())
1381 .map(|(doc, _prepass)| doc)
1382 .map_err(|e| e.to_string())
1383}
1384
1385// ============================================================================
1386// Filesystem-backed loader
1387// ============================================================================
1388
/// [`Loader`] that reads files from the filesystem.
///
/// This is the production loader used by the CLI; the LSP wraps it with a
/// file-watch invalidation layer in PR 8. lex-core's *resolver* code does not
/// reference `std::fs` — `FsLoader` is the one place where it does, isolated
/// behind the [`Loader`] trait so the rest of the crate stays sandbox- and
/// WASM-friendly.
///
/// `FsLoader` is constructed with the resolution root and rechecks every
/// load against it post-`fs::canonicalize`, so a symlink pointing outside
/// the root is rejected even though the lexical-only check in
/// [`resolve_path`] cannot see it. It also rejects non-regular files
/// (devices, FIFOs, directories) and files above the configured size cap
/// before reading.
///
/// Errors map:
/// - canonicalization fails with `ErrorKind::NotFound` (missing file,
///   broken symlink) → [`LoadError::NotFound`]
/// - canonicalization fails for any other reason (e.g. permission
///   denied) → [`LoadError::Io`]
/// - canonical path doesn't sit under canonical root → [`LoadError::OutsideRoot`]
/// - target is not a regular file → [`LoadError::Io`] with a clear message
/// - target is larger than the size cap → [`LoadError::TooLarge`]
/// - any other I/O error during read → [`LoadError::Io`]
#[derive(Debug, Clone)]
pub struct FsLoader {
    /// Filesystem-canonical resolution root, computed once in
    /// [`FsLoader::new`]. If canonicalization fails (e.g. the configured
    /// root doesn't exist on disk) the input is stored verbatim; the
    /// bounds check is never skipped, so loads then fail (with
    /// `NotFound` / `OutsideRoot`) instead of the security check being
    /// silently disabled.
    canonical_root: PathBuf,
    /// Per-file size cap (bytes). Loads of larger files surface as
    /// `LoadError::TooLarge`.
    /// Default [`FsLoader::DEFAULT_MAX_FILE_SIZE`].
    max_file_size: u64,
}

impl FsLoader {
    /// Default per-file size cap: 10 MiB. Generous for realistic Lex
    /// source documents (text only) and tight enough to bound memory
    /// allocation per include against an adversarial 1 GB file.
    pub const DEFAULT_MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;

    /// Construct a loader rooted at `root` with default size limits.
    ///
    /// The loader stores `root`'s fs-canonical form (symlinks resolved);
    /// subsequent loads validate that the requested path's canonical
    /// form lives under it. When `root` cannot be canonicalized it is
    /// stored as given.
    pub fn new(root: PathBuf) -> Self {
        let canonical_root = std::fs::canonicalize(&root).unwrap_or(root);
        Self {
            canonical_root,
            max_file_size: Self::DEFAULT_MAX_FILE_SIZE,
        }
    }

    /// Override the default per-file size cap (bytes). Use to widen the
    /// limit for projects with genuinely large source files, or tighten
    /// it for stricter sandboxes (e.g., LSPs serving untrusted content).
    ///
    /// Consumes and returns the loader, builder-style; discarding the
    /// return value discards the configured loader.
    #[must_use]
    pub fn with_max_file_size(mut self, max_file_size: u64) -> Self {
        self.max_file_size = max_file_size;
        self
    }
}
1450
1451impl Loader for FsLoader {
1452 fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
1453 // 1. Canonicalize. Resolves symlinks and `..` segments against the
1454 // real filesystem. NotFound / broken-symlink / permission errors
1455 // all surface here.
1456 let canonical_path = std::fs::canonicalize(path).map_err(|e| match e.kind() {
1457 std::io::ErrorKind::NotFound => LoadError::NotFound {
1458 path: path.to_path_buf(),
1459 },
1460 _ => LoadError::Io {
1461 path: path.to_path_buf(),
1462 message: e.to_string(),
1463 },
1464 })?;
1465
1466 // 2. Bounds check against the *canonical* root. This is the
1467 // actual security gate against symlink traversal — the lexical
1468 // check in resolve_path can't see through symlinks.
1469 if !canonical_path.starts_with(&self.canonical_root) {
1470 return Err(LoadError::OutsideRoot {
1471 path: canonical_path,
1472 root: self.canonical_root.clone(),
1473 });
1474 }
1475
1476 // 3. Reject non-regular files. Without this, an attacker (with
1477 // write access to the repo) could symlink an include target to
1478 // `/dev/zero` or a FIFO and block / OOM the reader. The
1479 // is_file() metadata call is a cheap sanity check.
1480 let meta = std::fs::metadata(&canonical_path).map_err(|e| LoadError::Io {
1481 path: canonical_path.clone(),
1482 message: e.to_string(),
1483 })?;
1484 if !meta.is_file() {
1485 return Err(LoadError::Io {
1486 path: canonical_path,
1487 message: "include target is not a regular file".to_string(),
1488 });
1489 }
1490
1491 // 4. Size cap. Bounds memory allocation per include against an
1492 // adversarial 1 GB file before any bytes hit the heap.
1493 let size = meta.len();
1494 if size > self.max_file_size {
1495 return Err(LoadError::TooLarge {
1496 path: canonical_path,
1497 size,
1498 limit: self.max_file_size,
1499 });
1500 }
1501
1502 // 5. Read. By this point we know the path is a regular file under
1503 // the canonical root and within the size cap; anything that
1504 // fails here is a real I/O error worth surfacing.
1505 let source = std::fs::read_to_string(&canonical_path).map_err(|e| LoadError::Io {
1506 path: canonical_path.clone(),
1507 message: e.to_string(),
1508 })?;
1509
1510 Ok(LoadedFile {
1511 source,
1512 canonical_path,
1513 })
1514 }
1515}
1516
1517// ============================================================================
1518// Test fixtures (test-support feature + cfg(test))
1519// ============================================================================
1520
/// In-memory [`Loader`]: a map from path to source text, for tests and
/// other callers that must not touch the filesystem.
#[cfg(any(test, feature = "test-support"))]
pub struct MemoryLoader {
    /// Registered files, keyed by the exact path they were inserted under.
    files: std::collections::HashMap<PathBuf, String>,
}

#[cfg(any(test, feature = "test-support"))]
impl MemoryLoader {
    /// Create a loader with no files registered. Use
    /// [`MemoryLoader::insert`] to add some.
    pub fn new() -> Self {
        let files = std::collections::HashMap::new();
        Self { files }
    }

    /// Register `contents` as the source text of the file at `path`,
    /// replacing any earlier registration for the same path. Returns
    /// `&mut Self` so calls can be chained.
    pub fn insert<P: Into<PathBuf>, S: Into<String>>(&mut self, path: P, contents: S) -> &mut Self {
        let (key, text) = (path.into(), contents.into());
        self.files.insert(key, text);
        self
    }

    /// Build a loader from any iterator of `(path, contents)` pairs.
    /// When a path occurs more than once, the last pair wins.
    pub fn from_pairs<I, P, S>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (P, S)>,
        P: Into<PathBuf>,
        S: Into<String>,
    {
        let files = pairs
            .into_iter()
            .map(|(path, contents)| (path.into(), contents.into()))
            .collect();
        Self { files }
    }
}

#[cfg(any(test, feature = "test-support"))]
impl Default for MemoryLoader {
    /// Same as [`MemoryLoader::new`]: an empty loader.
    fn default() -> Self {
        MemoryLoader::new()
    }
}
1564
#[cfg(any(test, feature = "test-support"))]
impl Loader for MemoryLoader {
    /// Look `path` up by exact key and return a copy of its registered
    /// source.
    ///
    /// Memory loaders have no symlinks, so the lookup key *is* the
    /// canonical identity: `canonical_path` is `path` unchanged. Cycle
    /// detection in the resolver compares `LoadedFile::canonical_path`
    /// values; for tests this matches the lexically-normalized paths
    /// the resolver already produces.
    ///
    /// # Errors
    ///
    /// [`LoadError::NotFound`] when nothing is registered under `path`.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
        match self.files.get(path) {
            Some(text) => Ok(LoadedFile {
                source: text.clone(),
                canonical_path: path.to_path_buf(),
            }),
            None => Err(LoadError::NotFound {
                path: path.to_path_buf(),
            }),
        }
    }
}
1585
1586// ============================================================================
1587// Tests
1588// ============================================================================
1589
1590#[cfg(test)]
1591mod tests;