Skip to main content

lex_core/lex/
includes.rs

1//! Include resolution for Lex documents.
2//!
3//! This module turns `:: lex.include src="..." ::` annotations into spliced
4//! content from the referenced files. It is *opt-in*: callers that want the
5//! unresolved tree (the formatter, tree-sitter parity, editor tooling that
6//! displays include statements as authored) skip this pass entirely. The
7//! parser itself never touches the filesystem — all I/O goes through the
8//! injected [`Loader`] trait.
9//!
10//! See `comms/specs/proposals/includes.lex` for the full design.
11//!
12//! # Status
13//!
//! This module is being built up across PRs 3–7:
15//!
16//! - PR 3: skeleton — trait, config, errors, stub.
17//! - PR 4: single-pass splice + container-policy validation +
18//!   doc-title/doc-annotation conversion + origin stamping + root-escape
19//!   check.
20//! - PR 5: recursive resolution into included files + cycle detection
21//!   (chain stack) + depth limit. Each loaded file gets walked in its OWN
22//!   directory, so relative paths inside an included file resolve from
23//!   that file's directory, not the entry's.
24//! - PR 6: origin-aware reference helpers. [`resolve_file_reference`]
25//!   resolves a `ReferenceType::File` target from the authoring file's
26//!   directory using `Range.origin_path`.
27//!   `Document::find_annotation_by_label_in_origin` scopes footnote
28//!   lookups to the file the reference was authored in.
29//! - PR 7 (this PR): [`FsLoader`] — production loader that reads from the
30//!   filesystem with `std::fs::read_to_string`. CLI wires the resolver
31//!   into `lex convert` and `lex inspect` (default-on, opt-out via
32//!   `--no-includes`); `lex format` never expands.
33//!
34//! # Layering
35//!
36//! Of all of lex-core, only [`FsLoader`] references `std::fs`. The
37//! resolver itself does no I/O — it always goes through the [`Loader`]
38//! trait. Callers can swap loaders to keep the resolver sandboxed:
39//!
40//! - The LSP wraps [`FsLoader`] with file-watch invalidation (PR 8).
41//! - WASM builds provide a JS-backed loader instead of [`FsLoader`].
42//! - Tests use [`MemoryLoader`] (gated behind `test-support`).
43//!
44//! For tests, lex-core itself ships [`MemoryLoader`] gated behind the
45//! `test-support` cargo feature. It is not intended for production use.
46
47// `IncludeError` carries diagnostic context (paths, source ranges,
48// handler messages) on every variant; the `result_large_err` lint
49// would have us box the whole error or split it into a thinner shape
50// just to satisfy the size heuristic. The enum is already part of
51// the public API and the error path is rare; suppress the lint for
52// this module rather than churn the public surface.
53#![allow(clippy::result_large_err)]
54
55use crate::lex::assembling::AttachAnnotations;
56use crate::lex::ast::elements::container::GeneralContainer;
57use crate::lex::ast::elements::content_item::ContentItem;
58use crate::lex::ast::elements::session::Session;
59use crate::lex::ast::range::Range;
60use crate::lex::ast::Document;
61use crate::lex::transforms::Runnable;
62use lex_extension::handler::HandlerError;
63use lex_extension_host::registry::Registry;
64use std::path::{Path, PathBuf};
65use std::sync::Arc;
66
/// Settings that govern a single include-resolution pass.
#[derive(Debug, Clone)]
pub struct ResolveConfig {
    /// Root directory that every include path must stay under. An include
    /// that canonicalizes outside of it is reported as
    /// [`IncludeError::RootEscape`].
    ///
    /// This must be an **absolute** path. Lexical normalization treats `.`
    /// and `..` against an empty buffer as no-ops, so a relative or
    /// unnormalized root weakens the root-escape prefix check. Callers
    /// (CLI, LSP) should canonicalize the root before building a
    /// `ResolveConfig`.
    pub root: PathBuf,
    /// Maximum include depth; [`ResolveConfig::with_root`] sets it to
    /// [`ResolveConfig::DEFAULT_MAX_DEPTH`] (8). Reaching the limit is an
    /// error, never a silent truncation.
    pub max_depth: usize,
    /// Maximum total number of `lex.include` annotations resolved across
    /// the whole tree (depth × breadth); [`ResolveConfig::with_root`] sets
    /// it to [`ResolveConfig::DEFAULT_MAX_TOTAL_INCLUDES`] (1000).
    ///
    /// This caps fan-out: `max_depth` bounds chain length but not breadth,
    /// so a document with 100 thousand top-level includes at depth 1 stays
    /// within `max_depth` yet can still OOM the resolver / LSP / CI.
    /// Reaching the limit is an error, never a silent truncation.
    pub max_total_includes: usize,
}

impl ResolveConfig {
    /// Default include-depth limit: deep enough for any reasonable
    /// atomization strategy (aggregator → per-chapter → per-section) while
    /// keeping the resolver's worst-case work predictable.
    pub const DEFAULT_MAX_DEPTH: usize = 8;

    /// Default cap on the total include count (DoS bound). Generous enough
    /// for a book-length document made of thousands of small fragments,
    /// tight enough to contain adversarial fan-out within a few seconds of
    /// resolver work.
    pub const DEFAULT_MAX_TOTAL_INCLUDES: usize = 1000;

    /// Build a config rooted at `root` with both limits set to their
    /// defaults.
    pub fn with_root(root: PathBuf) -> Self {
        let max_depth = Self::DEFAULT_MAX_DEPTH;
        let max_total_includes = Self::DEFAULT_MAX_TOTAL_INCLUDES;
        ResolveConfig {
            root,
            max_depth,
            max_total_includes,
        }
    }
}
114
/// A pluggable source-text loader.
///
/// Implementations decide where bytes come from (filesystem, in-memory map,
/// virtual filesystem, content-addressed store, …). The resolver obtains
/// every included source through this trait instead of calling `std::fs`
/// itself; that keeps the resolver pure and usable in WASM, sandboxes, and
/// unit tests.
pub trait Loader {
    /// Load the source text for `path` and return both the contents and a
    /// canonical identity for the loaded resource. The path is what the
    /// resolver decided on after applying the rules in §4 of the proposal.
    ///
    /// `LoadedFile::canonical_path` is the loader's authoritative identity
    /// for the resource. For [`FsLoader`] this is the filesystem-canonical
    /// path (symlinks resolved, case-folded if the underlying FS is
    /// case-insensitive); for [`MemoryLoader`] it's the lookup key (since
    /// memory loaders have no symlinks). The resolver uses this for cycle
    /// detection and for stamping `Range.origin_path` on the loaded tree.
    ///
    /// # Errors
    ///
    /// Returns a [`LoadError`] describing why the resource could not be
    /// loaded.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError>;
}
134
/// What a [`Loader`] hands back from a successful [`Loader::load`]: the
/// source text plus the identity the loader assigned to it.
#[derive(Debug, Clone)]
pub struct LoadedFile {
    /// The file's source text.
    pub source: String,
    /// The loader's authoritative identity for the resource. See
    /// [`Loader::load`] for how loaders decide this.
    pub canonical_path: PathBuf,
}
144
/// Failure modes a [`Loader`] can report.
#[derive(Debug, Clone)]
pub enum LoadError {
    /// Nothing exists at the requested path.
    NotFound { path: PathBuf },
    /// The resource exists but resolves outside the loader's allowed
    /// boundary. The lexical resolver already normalizes `..` in the
    /// requested path; loaders backed by a real filesystem must check again
    /// after canonicalization to catch symlinks that escape the boundary
    /// in ways a lexically-correct path cannot.
    OutsideRoot { path: PathBuf, root: PathBuf },
    /// The resource exists but is bigger than the loader's configured
    /// limit. `size` and `limit` are byte counts. The resolver maps this to
    /// [`IncludeError::FileTooLarge`], attaching the offending annotation's
    /// site.
    TooLarge {
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// An underlying I/O failure (or the virtual-filesystem equivalent).
    Io { path: PathBuf, message: String },
}

impl std::fmt::Display for LoadError {
    /// Render a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotFound { path } => {
                let shown = path.display();
                write!(f, "include not found: {}", shown)
            }
            Self::OutsideRoot { path, root } => {
                let shown_path = path.display();
                let shown_root = root.display();
                write!(
                    f,
                    "include path {} resolves outside loader root {}",
                    shown_path, shown_root
                )
            }
            Self::TooLarge { path, size, limit } => {
                let shown = path.display();
                write!(
                    f,
                    "include file {} is {size} bytes, exceeds limit of {limit} bytes",
                    shown
                )
            }
            Self::Io { path, message } => {
                let shown = path.display();
                write!(f, "io error reading {}: {message}", shown)
            }
        }
    }
}

impl std::error::Error for LoadError {}
191
/// Errors the include resolver can produce.
///
/// Most variants carry `include_site`, the range of the offending
/// annotation in its host file, so diagnostics can point at the exact
/// line that triggered the failure.
#[derive(Debug, Clone)]
pub enum IncludeError {
    /// An include chain looped back on itself. `chain` is the resolution
    /// stack at the moment the duplicate was about to be pushed, in
    /// source-order (entry first, deepest last). `path` is the origin path
    /// of the repeated invocation site (empty when the site carries no
    /// origin). `include_site` is the range of the offending `lex.include`
    /// annotation in its host file — useful for diagnostics that highlight
    /// the exact line.
    Cycle {
        include_site: Range,
        path: PathBuf,
        chain: Vec<PathBuf>,
    },
    /// The include depth exceeded the effective limit — the lower of
    /// [`ResolveConfig::max_depth`] and [`KERNEL_DEPTH_BACKSTOP`]. `limit`
    /// reports the cap that actually fired. `chain` shows the resolution
    /// stack at the moment of failure, in source order. `include_site` is
    /// the range of the offending `lex.include` annotation in its host
    /// file.
    DepthExceeded {
        include_site: Range,
        limit: usize,
        chain: Vec<PathBuf>,
    },
    /// The total number of includes resolved across the document
    /// exceeded [`ResolveConfig::max_total_includes`]. Bounds adversarial
    /// fan-out (which `max_depth` alone does not). `include_site` is the
    /// `lex.include` annotation that pushed the count past the limit.
    TotalIncludesExceeded { include_site: Range, limit: usize },
    /// The included file's size exceeded the loader's configured limit.
    /// Surfaced by loaders that read from a real filesystem (FsLoader)
    /// to bound memory allocation per include. `include_site` is the
    /// offending annotation; `size` and `limit` are in bytes.
    FileTooLarge {
        include_site: Range,
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// A path resolved outside the configured [`ResolveConfig::root`].
    RootEscape { path: PathBuf, root: PathBuf },
    /// The include `src` was a platform-absolute filesystem path
    /// (e.g. Windows `C:\foo`, `\\server\share`, `\foo`). The spec
    /// forbids absolute filesystem paths from entering the
    /// resolution pipeline; the *root-absolute* form (leading `/`
    /// resolved against the includes root) is the only spec-allowed
    /// way to write a path that doesn't start from the host's
    /// directory. On Unix the only thing that's `Path::is_absolute()`
    /// is a leading `/`, which is consumed by the root-absolute
    /// branch first; this variant therefore only fires in practice
    /// for Windows-shaped absolute paths.
    AbsolutePath { path: PathBuf },
    /// The loader could not find the included file. `include_site`
    /// is the range of the offending `lex.include` annotation in its host
    /// file, so editors can squiggle the line that asked for the missing
    /// file rather than the document head.
    NotFound { include_site: Range, path: PathBuf },
    /// Source text was rejected by the parser, or the post-resolution
    /// annotation-attachment pass failed. `path` is empty when the entry
    /// source was supplied without a path.
    ParseFailed { path: PathBuf, message: String },
    /// The included file's content is not legal in the include site's
    /// parent container.
    ///
    /// Today this only occurs when an included file has top-level Sessions
    /// and the include site is inside a `GeneralContainer` (Definition,
    /// ListItem, or another Annotation's body). The `violation` field
    /// names the offending content kind (e.g. `"Sessions"`) so future
    /// container/policy combinations can reuse this variant without a
    /// breaking change.
    ContainerPolicy {
        include_site: Range,
        container: &'static str,
        file: PathBuf,
        violation: &'static str,
    },
    /// Loader propagated a non-`NotFound` I/O error.
    LoaderIo { path: PathBuf, message: String },
    /// `lex.include` annotation was missing the mandatory `src=` parameter.
    MissingSrc { include_site: Range },
    /// A registered handler returned an error the pass could not map
    /// onto a more specific variant — typically a third-party
    /// namespace's resolve hook surfacing an internal failure, or an
    /// unrecognised handler-defined code from `lex.*` built-ins. The
    /// `code` is the string identifier the registry attaches to the
    /// diagnostic (`"handler.internal"`, `"handler.custom"`, …).
    HandlerFailed {
        include_site: Range,
        label: String,
        code: String,
        message: String,
    },
}
281
282impl std::fmt::Display for IncludeError {
283    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
284        match self {
285            IncludeError::Cycle { path, chain, .. } => {
286                let chain_display: Vec<String> =
287                    chain.iter().map(|p| p.display().to_string()).collect();
288                write!(
289                    f,
290                    "include cycle: {} (chain: {})",
291                    path.display(),
292                    chain_display.join(" -> ")
293                )
294            }
295            IncludeError::DepthExceeded { limit, chain, .. } => {
296                let chain_display: Vec<String> =
297                    chain.iter().map(|p| p.display().to_string()).collect();
298                write!(
299                    f,
300                    "include depth exceeded limit of {limit} (chain: {})",
301                    chain_display.join(" -> ")
302                )
303            }
304            IncludeError::TotalIncludesExceeded { limit, .. } => {
305                write!(f, "total include count exceeded limit of {limit}")
306            }
307            IncludeError::FileTooLarge {
308                path, size, limit, ..
309            } => {
310                write!(
311                    f,
312                    "included file {} is {size} bytes, exceeds limit of {limit} bytes",
313                    path.display()
314                )
315            }
316            IncludeError::RootEscape { path, root } => write!(
317                f,
318                "include path {} escapes resolution root {}",
319                path.display(),
320                root.display()
321            ),
322            IncludeError::AbsolutePath { path } => write!(
323                f,
324                "include src {} is a platform-absolute path; \
325                 the spec forbids absolute filesystem paths — use a relative path \
326                 (chapters/01.lex) or a root-absolute path (/shared/01.lex)",
327                path.display()
328            ),
329            IncludeError::NotFound { path, .. } => {
330                write!(f, "include not found: {}", path.display())
331            }
332            IncludeError::ParseFailed { path, message } => {
333                write!(f, "failed to parse {}: {message}", path.display())
334            }
335            IncludeError::ContainerPolicy {
336                container,
337                file,
338                violation,
339                ..
340            } => write!(
341                f,
342                "included file {} contains {} but include site is inside {} \
343                 (which does not allow {})",
344                file.display(),
345                violation,
346                container,
347                violation
348            ),
349            IncludeError::LoaderIo { path, message } => {
350                write!(f, "loader error reading {}: {message}", path.display())
351            }
352            IncludeError::MissingSrc { .. } => {
353                write!(f, "lex.include annotation missing required src= parameter")
354            }
355            IncludeError::HandlerFailed {
356                label,
357                code,
358                message,
359                ..
360            } => write!(f, "extension handler `{label}` failed ({code}): {message}"),
361        }
362    }
363}
364
365impl std::error::Error for IncludeError {}
366
367// No `From<LoadError>` impl: `IncludeError::NotFound` carries the include
368// site (the `lex.include` annotation's range), which a loader doesn't know
369// about. Callers map `LoadError` explicitly at the call site, where the
370// site is available.
371
/// The kind of container an include site lives in. It drives the
/// splice-time policy check (currently just "no Sessions in a
/// `GeneralContainer`").
#[derive(Debug, Clone, Copy)]
enum ContainerKind {
    /// `Document.root.children` or `Session.children` — accepts everything.
    Session,
    /// `Definition.children` — `GeneralContainer`.
    Definition,
    /// `Annotation.children` — `GeneralContainer`.
    AnnotationBody,
    /// `ListItem.children` — `GeneralContainer`.
    ListItem,
}

impl ContainerKind {
    /// Human-readable container name used in diagnostics.
    fn name(self) -> &'static str {
        match self {
            Self::Session => "Session",
            Self::Definition => "Definition",
            Self::AnnotationBody => "Annotation body",
            Self::ListItem => "ListItem",
        }
    }

    /// Whether top-level Sessions may be spliced into this container.
    /// Only session containers accept them.
    fn allows_sessions(self) -> bool {
        match self {
            Self::Session => true,
            Self::Definition | Self::AnnotationBody | Self::ListItem => false,
        }
    }
}
400
/// Hard cap on resolution depth, applied even when the
/// configurable [`ResolveConfig::max_depth`] is set higher: the
/// resolver enforces the lower of the two values. Bounds
/// adversarial varying-position recursion (a handler that returns
/// content with a different invocation site each iteration so the
/// cycle key never matches) so the resolver always terminates.
pub const KERNEL_DEPTH_BACKSTOP: usize = 32;
407
408/// Resolve every `hooks.resolve = true` labelled annotation starting
409/// from `source`, dispatching through `registry`, and recursively
410/// processing the spliced content.
411///
412/// `source_path` identifies the entry-point file. It is used to
413/// (a) stamp `Range.origin_path` on every node so downstream code
414/// (file-ref resolution, diagnostics, LSP goto) can report locations
415/// against the authoring file, and (b) provide the host directory
416/// the built-in `lex.include` handler resolves relative `src=` paths
417/// against (via `LabelCtx.node.origin`). When `None`, origin stamping
418/// is skipped on the entry and the handler resolves relative paths
419/// against `config.root`.
420///
421/// # Generic dispatch
422///
423/// Every label whose schema declares `hooks.resolve = true` flows
424/// through the same path: build a [`LabelCtx`] from the annotation,
425/// call [`Registry::dispatch_resolve_raw`], decode the returned
426/// [`WireNode`] back into typed [`ContentItem`]s via
427/// [`crate::lex::wire::from_wire_node`], and splice in place. The
428/// built-in `lex.include` handler is registered the same way as any
429/// third-party namespace.
430///
431/// # Pre/post-attachment
432///
433/// Internally this re-parses the entry source *without* annotation
434/// attachment so labelled annotations stay visible as standalone
435/// children. The handler does its own `parse_no_attach` for loaded
436/// content. After all splices, [`AttachAnnotations`] runs once on
437/// the merged tree.
438///
439/// # Recursion + cycle detection
440///
441/// Cycle detection keys on `(label, origin_path, start_position)` of
442/// the invocation site. A handler that returns content containing
443/// another invocation at the same source position is caught
444/// immediately. A handler that varies the invocation position each
445/// iteration terminates at `min(config.max_depth, KERNEL_DEPTH_BACKSTOP)`
446/// with `IncludeError::DepthExceeded`. The total-includes counter
447/// caps adversarial fan-out independent of depth.
448pub fn resolve_from_source(
449    source: &str,
450    source_path: Option<PathBuf>,
451    config: &ResolveConfig,
452    registry: &Registry,
453) -> Result<Document, IncludeError> {
454    let entry_origin = source_path.as_ref().map(|p| Arc::new(p.clone()));
455
456    let mut doc = parse_no_attach(source).map_err(|message| IncludeError::ParseFailed {
457        path: source_path.clone().unwrap_or_default(),
458        message,
459    })?;
460
461    if let Some(origin) = entry_origin.as_ref() {
462        stamp_doc(&mut doc, origin);
463    }
464
465    let mut chain: Vec<ResolveKey> = Vec::new();
466    let mut state = ResolverState {
467        config,
468        registry,
469        chain: &mut chain,
470        depth: 0,
471        total_resolved: 0,
472    };
473
474    splice_in_session_container(doc.root.children.as_mut_vec(), &mut state)?;
475
476    let doc = AttachAnnotations::new()
477        .run(doc)
478        .map_err(|e| IncludeError::ParseFailed {
479            path: source_path.unwrap_or_default(),
480            message: format!("annotation attachment failed: {e}"),
481        })?;
482
483    Ok(doc)
484}
485
486// ============================================================================
487// Splicing
488// ============================================================================
489
490/// One frame on the resolve-pass cycle stack. Two invocations at the
491/// same `(label, origin, start)` position are a cycle, regardless of
492/// what parameters either invocation uses — a handler that varies
493/// params per call (random IDs, timestamps) cannot defeat the
494/// detector by changing param values.
495#[derive(Debug, Clone, PartialEq)]
496struct ResolveKey {
497    label: String,
498    /// `Range.origin_path` of the annotation — the file the
499    /// invocation was authored in. `None` when stamping was skipped
500    /// (e.g., entry source loaded from a string with no path).
501    origin: Option<PathBuf>,
502    start: crate::lex::ast::range::Position,
503}
504
505impl ResolveKey {
506    fn from_annotation(a: &crate::lex::ast::elements::annotation::Annotation) -> Self {
507        Self {
508            label: a.data.label.value.clone(),
509            origin: a.location.origin_path.as_ref().map(|p| (**p).clone()),
510            start: a.location.start,
511        }
512    }
513}
514
/// Per-resolution state threaded through the recursive walker. Keeps the
/// signatures of the splice/process functions short and ensures
/// `chain`/`depth` are updated in lock-step (push/pop, +1/back-out) at
/// each invocation.
struct ResolverState<'a> {
    /// Limits (depth, total count) and root supplied by the caller.
    config: &'a ResolveConfig,
    /// Registry used to look up label schemas and dispatch resolve hooks.
    registry: &'a Registry,
    /// Active resolution stack of `(label, origin, position)` keys.
    /// Pushed when we begin dispatching for an invocation and popped
    /// when its splice subtree is fully resolved. A push that finds
    /// the same key already on the stack is a cycle.
    chain: &'a mut Vec<ResolveKey>,
    /// Number of dispatch hops from the entry point. Each recursion
    /// increments by 1. Hitting `config.max_depth` or the
    /// [`KERNEL_DEPTH_BACKSTOP`] (whichever is lower) is an error.
    depth: usize,
    /// Total invocations resolved across the entire walk
    /// (depth × breadth). Incremented each time a handler returns
    /// content to splice. Hitting `config.max_total_includes` aborts
    /// with `TotalIncludesExceeded`.
    total_resolved: usize,
}
537
538fn splice_in_session_container(
539    children: &mut Vec<ContentItem>,
540    state: &mut ResolverState<'_>,
541) -> Result<(), IncludeError> {
542    // Post-order: recurse into nested containers first, splice this
543    // container's invocations second. Recursion happens inside
544    // `process_resolves` for any spliced subtree, so that subtree
545    // is never re-walked at the parent level.
546    recurse_into_children(children, state)?;
547    process_resolves(children, state, ContainerKind::Session)
548}
549
550fn splice_in_general_container(
551    container: &mut GeneralContainer,
552    state: &mut ResolverState<'_>,
553    kind: ContainerKind,
554) -> Result<(), IncludeError> {
555    recurse_into_children(container.as_mut_vec(), state)?;
556    process_resolves(container.as_mut_vec(), state, kind)
557}
558
559/// Walk the children of a container, dispatch every annotation whose
560/// schema declares `hooks.resolve = true` through the registry, and
561/// splice the returned content in place of the annotation. Recurses
562/// into the spliced content so nested invocations resolve too.
563// Allow &mut Vec because `splice` needs Vec-specific operations.
564#[allow(clippy::ptr_arg)]
565fn process_resolves(
566    children: &mut Vec<ContentItem>,
567    state: &mut ResolverState<'_>,
568    kind: ContainerKind,
569) -> Result<(), IncludeError> {
570    // Collect indices of annotations whose schema has hooks.resolve.
571    let resolve_indices: Vec<usize> = children
572        .iter()
573        .enumerate()
574        .filter_map(|(i, item)| match item {
575            ContentItem::Annotation(a) => {
576                let label = &a.data.label.value;
577                if state
578                    .registry
579                    .schema_for(label)
580                    .map(|s| s.hooks.resolve)
581                    .unwrap_or(false)
582                {
583                    Some(i)
584                } else {
585                    None
586                }
587            }
588            _ => None,
589        })
590        .collect();
591
592    for i in resolve_indices.into_iter().rev() {
593        let annotation = match &children[i] {
594            ContentItem::Annotation(a) => a.clone(),
595            _ => unreachable!("index came from resolve filter"),
596        };
597
598        match resolve_one_invocation(&annotation, state, kind)? {
599            ResolveOutcome::Spliced(splice_items) => {
600                // Replace the annotation with `[annotation, ...splice_items]`.
601                // The annotation itself stays in the children list immediately
602                // before the splice, so the post-resolution AttachAnnotations
603                // pass moves it onto the first spliced node by the standard
604                // "attach to next sibling" rule.
605                let mut replacement = Vec::with_capacity(splice_items.len() + 1);
606                replacement.push(ContentItem::Annotation(annotation));
607                replacement.extend(splice_items);
608                children.splice(i..=i, replacement);
609            }
610            ResolveOutcome::Unexpanded => {
611                // Handler opted out of expanding this invocation. The
612                // annotation stays in place, but its body wasn't
613                // walked by `recurse_into_children` (that walker
614                // skips resolve-hooked annotations to avoid double-
615                // resolution). Walk the body now so any nested
616                // invocations inside the unexpanded annotation get
617                // resolved on the way back up.
618                let mut owned = annotation;
619                splice_in_general_container(
620                    &mut owned.children,
621                    state,
622                    ContainerKind::AnnotationBody,
623                )?;
624                children[i] = ContentItem::Annotation(owned);
625            }
626        }
627    }
628
629    Ok(())
630}
631
/// Outcome of dispatching a single resolve-hooked annotation. The
/// pass needs to distinguish between "handler returned content,
/// splice it in" and "handler opted out, leave the annotation
/// alone": the second case still requires walking the annotation's
/// body for nested invocations because `recurse_into_children`
/// otherwise skips resolve-hooked annotations to prevent double-
/// resolution.
enum ResolveOutcome {
    /// The handler returned content; these items are to be spliced into
    /// the parent container.
    Spliced(Vec<ContentItem>),
    /// The handler returned nothing to splice; the annotation stays as
    /// authored.
    Unexpanded,
}
643
644/// Dispatch a single resolve-hooked annotation through the registry,
645/// decode the returned `WireNode` back into typed children, then
646/// recursively walk the splice items so nested invocations resolve
647/// before the splice is placed into the parent container.
648///
649/// Returns [`ResolveOutcome::Unexpanded`] when the handler returned
650/// `Ok(None)` (third-party handlers can opt out of expanding a
651/// particular invocation). The caller is then responsible for
652/// walking the annotation's body for nested invocations — the
653/// resolve walker normally skips resolve-hooked annotations'
654/// bodies.
655fn resolve_one_invocation(
656    annotation: &crate::lex::ast::elements::annotation::Annotation,
657    state: &mut ResolverState<'_>,
658    parent_kind: ContainerKind,
659) -> Result<ResolveOutcome, IncludeError> {
660    let label = &annotation.data.label.value;
661    let key = ResolveKey::from_annotation(annotation);
662
663    // Cycle check on (label, origin, start) of the invocation site.
664    if state.chain.contains(&key) {
665        return Err(IncludeError::Cycle {
666            include_site: annotation.location.clone(),
667            path: key.origin.clone().unwrap_or_default(),
668            chain: state
669                .chain
670                .iter()
671                .map(|k| k.origin.clone().unwrap_or_default())
672                .collect(),
673        });
674    }
675
676    // Depth check. The effective limit is the lower of the
677    // user-facing `config.max_depth` (default 8) and the hard
678    // [`KERNEL_DEPTH_BACKSTOP`] (32, fixed). The kernel backstop
679    // exists for adversarial varying-position recursion that the
680    // cycle key can't catch — even if a user bumps `max_depth`
681    // higher than 32 for legitimate deep atomization, the backstop
682    // still terminates. The error reports `effective_depth_limit`
683    // (the actual cap that fired) rather than `config.max_depth`,
684    // so when the backstop is the binding limit the user sees `32`
685    // and not the (higher) config value.
686    let effective_depth_limit = state.config.max_depth.min(KERNEL_DEPTH_BACKSTOP);
687    if state.depth >= effective_depth_limit {
688        return Err(IncludeError::DepthExceeded {
689            include_site: annotation.location.clone(),
690            limit: effective_depth_limit,
691            chain: state
692                .chain
693                .iter()
694                .map(|k| k.origin.clone().unwrap_or_default())
695                .collect(),
696        });
697    }
698
699    // Total-count check before dispatch.
700    if state.total_resolved >= state.config.max_total_includes {
701        return Err(IncludeError::TotalIncludesExceeded {
702            include_site: annotation.location.clone(),
703            limit: state.config.max_total_includes,
704        });
705    }
706
707    let ctx = build_label_ctx(annotation);
708
709    let wire_node = match state.registry.dispatch_resolve_raw(&ctx) {
710        Ok(Some(node)) => node,
711        Ok(None) => {
712            // Handler returned "nothing to splice" — leave the
713            // annotation in place. The caller still needs to walk
714            // its body for nested invocations (built-in lex.include
715            // never returns None; this path is reachable only via
716            // third-party handlers that opt out per-invocation).
717            return Ok(ResolveOutcome::Unexpanded);
718        }
719        Err(handler_err) => {
720            return Err(handler_error_to_include_error(
721                &handler_err,
722                label,
723                &annotation.location,
724            ));
725        }
726    };
727
728    state.total_resolved += 1;
729
730    // Decode the wire payload into typed lex-core ContentItems.
731    let mut splice_items = decode_wire_to_items(&wire_node, label, &annotation.location)?;
732
733    // Recurse into the spliced subtree FIRST so nested resolve-hooked
734    // annotations are processed before the splice lands. Validation
735    // must wait until *after* this step: a nested invocation can
736    // splice in content (e.g. a top-level `Session` from a chained
737    // `lex.include`) that wasn't in the handler's original output,
738    // and the final shape is what has to satisfy the parent
739    // container's policy.
740    //
741    // The `IncludeError::ContainerPolicy.file` field describes the
742    // *spliced content's* source file (the file containing the
743    // disallowed shape), not the invocation site. Take it from the
744    // handler-returned wire payload's origin when present, falling
745    // back to the first decoded item's origin path if the wire
746    // payload didn't stamp a `Document` origin.
747    let included_path = wire_node_origin_pathbuf(&wire_node)
748        .or_else(|| splice_items_first_origin(&splice_items))
749        .unwrap_or_default();
750    state.chain.push(key);
751    let saved_depth = state.depth;
752    state.depth = saved_depth + 1;
753    let recurse_result = splice_in_session_container(&mut splice_items, state);
754    state.depth = saved_depth;
755    state.chain.pop();
756    recurse_result?;
757
758    // Container-policy validation: enforce no-Sessions inside
759    // `GeneralContainer` (Definition / Annotation body / ListItem).
760    // Runs against the post-recursion splice list so nested
761    // expansions can't smuggle disallowed shapes past the check.
762    validate_against_kind(
763        &splice_items,
764        parent_kind,
765        &annotation.location,
766        &included_path,
767    )?;
768
769    Ok(ResolveOutcome::Spliced(splice_items))
770}
771
772/// Build a [`LabelCtx`] from a lex-core [`Annotation`]. The body is
773/// derived from the annotation's children (parsed-Lex form), the
774/// params from `Annotation::data::parameters`, and the host node info
775/// from `Annotation::location`.
776fn build_label_ctx(
777    a: &crate::lex::ast::elements::annotation::Annotation,
778) -> lex_extension::wire::LabelCtx {
779    use crate::lex::wire::to_wire_node;
780    use lex_extension::wire::{AnnotationBody, LabelCtx, NodeRef};
781
782    let label = a.data.label.value.clone();
783    let params = {
784        // Pass *semantic* parameter values to handlers (quotes
785        // stripped, escape sequences resolved). Handlers consume
786        // params as JSON values, where there is no "quoted string"
787        // vs "unquoted token" distinction; only the decoded value
788        // is meaningful. The codec's `parameters_to_json` (used by
789        // `annotation_to_wire` for round-tripping annotation
790        // *content*) keeps the raw form to preserve source — the
791        // two paths intentionally differ.
792        let mut obj = serde_json::Map::with_capacity(a.data.parameters.len());
793        for p in &a.data.parameters {
794            obj.insert(p.key.clone(), serde_json::Value::String(p.unquoted_value()));
795        }
796        serde_json::Value::Object(obj)
797    };
798    let body = if a.children.is_empty() {
799        AnnotationBody::None
800    } else {
801        let wire_children: Vec<lex_extension::wire::WireNode> =
802            a.children.iter().map(to_wire_node).collect();
803        AnnotationBody::Lex {
804            children: wire_children,
805        }
806    };
807    let range = lex_extension::wire::Range::new(
808        lex_extension::wire::Position::new(
809            u32::try_from(a.location.start.line).unwrap_or(u32::MAX),
810            u32::try_from(a.location.start.column).unwrap_or(u32::MAX),
811        ),
812        lex_extension::wire::Position::new(
813            u32::try_from(a.location.end.line).unwrap_or(u32::MAX),
814            u32::try_from(a.location.end.column).unwrap_or(u32::MAX),
815        ),
816    );
817    let origin = a
818        .location
819        .origin_path
820        .as_ref()
821        .map(|p| p.to_string_lossy().into_owned());
822    LabelCtx {
823        label,
824        params,
825        body,
826        node: NodeRef {
827            kind: "annotation".into(),
828            range,
829            origin,
830        },
831    }
832}
833
834/// Convert a handler-returned [`WireNode`] back into a list of
835/// [`ContentItem`]s ready for splicing. `WireNode::Document` is
836/// unwrapped (its children become the splice list); any other root
837/// shape is wrapped as a single-item list.
838///
839/// `invocation_label` is the label whose handler produced `wire` —
840/// threaded through so wire-decode failures are attributed to the
841/// real namespace rather than a hardcoded `lex.include`. A
842/// third-party `acme.expand` handler that returns malformed wire
843/// will surface as `IncludeError::HandlerFailed { label:
844/// "acme.expand", .. }`.
845/// Lift a [`WireNode`]'s top-level `origin` field into a `PathBuf`
846/// when present. Used by the resolve pass to attribute
847/// container-policy errors to the *spliced content's* source file
848/// rather than the invocation site.
849fn wire_node_origin_pathbuf(node: &lex_extension::wire::WireNode) -> Option<PathBuf> {
850    use lex_extension::wire::WireNode as W;
851    let s = match node {
852        W::Document { origin, .. } => origin.as_deref(),
853        W::Session { origin, .. } => origin.as_deref(),
854        W::Definition { origin, .. } => origin.as_deref(),
855        W::Paragraph { origin, .. } => origin.as_deref(),
856        W::List { origin, .. } => origin.as_deref(),
857        W::Verbatim { origin, .. } => origin.as_deref(),
858        W::Table { origin, .. } => origin.as_deref(),
859        W::Annotation { origin, .. } => origin.as_deref(),
860        W::Blank { origin, .. } => origin.as_deref(),
861        _ => None,
862    };
863    s.map(PathBuf::from)
864}
865
866/// Fallback when `WireNode::Document.origin` is unset: walk the
867/// decoded splice list and return the first item that carries an
868/// origin. The interner from `from_wire_node` ensures every item
869/// shares one Arc per origin string, so iterating is cheap.
870fn splice_items_first_origin(items: &[ContentItem]) -> Option<PathBuf> {
871    for item in items {
872        let r = match item {
873            ContentItem::Paragraph(p) => &p.location,
874            ContentItem::Session(s) => &s.location,
875            ContentItem::Definition(d) => &d.location,
876            ContentItem::List(l) => &l.location,
877            ContentItem::ListItem(li) => &li.location,
878            ContentItem::Annotation(a) => &a.location,
879            ContentItem::VerbatimBlock(v) => &v.location,
880            ContentItem::VerbatimLine(vl) => &vl.location,
881            ContentItem::Table(t) => &t.location,
882            ContentItem::TextLine(tl) => &tl.location,
883            ContentItem::BlankLineGroup(blg) => &blg.location,
884        };
885        if let Some(arc) = r.origin_path.as_ref() {
886            return Some((**arc).clone());
887        }
888    }
889    None
890}
891
892fn decode_wire_to_items(
893    wire: &lex_extension::wire::WireNode,
894    invocation_label: &str,
895    include_site: &Range,
896) -> Result<Vec<ContentItem>, IncludeError> {
897    use crate::lex::wire::from_wire_node;
898
899    from_wire_node(wire).map_err(|e| IncludeError::HandlerFailed {
900        include_site: include_site.clone(),
901        label: invocation_label.to_string(),
902        code: "wire.decode".into(),
903        message: format!("decoding handler-returned wire payload failed: {e}"),
904    })
905}
906
907/// Map a [`HandlerError`] returned by the registry into the most
908/// specific [`IncludeError`] variant available. Codes in the
909/// `-32001..=-32005` range emitted by [`crate::lex::builtins::LexIncludeHandler`]
910/// translate back to their corresponding pre-extension-system
911/// variants so existing CLI/LSP error rendering and the integration
912/// test suite keep working unchanged. Unknown codes (third-party
913/// namespaces, future built-ins) surface as `HandlerFailed`.
914fn handler_error_to_include_error(
915    err: &HandlerError,
916    label: &str,
917    include_site: &Range,
918) -> IncludeError {
919    use crate::lex::builtins::include::{
920        CODE_ABSOLUTE_PATH, CODE_IO, CODE_MISSING_SRC, CODE_NOT_FOUND, CODE_OUTSIDE_ROOT,
921        CODE_PARSE_FAILED, CODE_TOO_LARGE,
922    };
923
924    match err {
925        HandlerError::Custom {
926            code,
927            message,
928            data,
929        } => match *code {
930            CODE_NOT_FOUND => IncludeError::NotFound {
931                include_site: include_site.clone(),
932                path: data_str(data, "path")
933                    .map(PathBuf::from)
934                    .unwrap_or_default(),
935            },
936            CODE_OUTSIDE_ROOT => IncludeError::RootEscape {
937                path: data_str(data, "path")
938                    .map(PathBuf::from)
939                    .unwrap_or_default(),
940                root: data_str(data, "root")
941                    .map(PathBuf::from)
942                    .unwrap_or_default(),
943            },
944            CODE_TOO_LARGE => IncludeError::FileTooLarge {
945                include_site: include_site.clone(),
946                path: data_str(data, "path")
947                    .map(PathBuf::from)
948                    .unwrap_or_default(),
949                size: data_u64(data, "size").unwrap_or(0),
950                limit: data_u64(data, "limit").unwrap_or(0),
951            },
952            CODE_ABSOLUTE_PATH => IncludeError::AbsolutePath {
953                path: data_str(data, "path")
954                    .map(PathBuf::from)
955                    .unwrap_or_default(),
956            },
957            CODE_IO => IncludeError::LoaderIo {
958                path: data_str(data, "path")
959                    .map(PathBuf::from)
960                    .unwrap_or_default(),
961                message: message.clone(),
962            },
963            CODE_MISSING_SRC => IncludeError::MissingSrc {
964                include_site: include_site.clone(),
965            },
966            CODE_PARSE_FAILED => IncludeError::ParseFailed {
967                path: data_str(data, "path")
968                    .map(PathBuf::from)
969                    .unwrap_or_default(),
970                message: data_str(data, "message").unwrap_or_else(|| message.clone()),
971            },
972            other => IncludeError::HandlerFailed {
973                include_site: include_site.clone(),
974                label: label.to_string(),
975                code: format!("handler.custom({other})"),
976                message: message.clone(),
977            },
978        },
979        HandlerError::Internal { message } => IncludeError::HandlerFailed {
980            include_site: include_site.clone(),
981            label: label.to_string(),
982            code: "handler.internal".into(),
983            message: message.clone(),
984        },
985        HandlerError::Unsupported { detail } => IncludeError::HandlerFailed {
986            include_site: include_site.clone(),
987            label: label.to_string(),
988            code: "handler.unsupported".into(),
989            message: detail.clone(),
990        },
991    }
992}
993
994fn data_str(data: &Option<serde_json::Value>, key: &str) -> Option<String> {
995    data.as_ref()?.get(key)?.as_str().map(str::to_string)
996}
997
998fn data_u64(data: &Option<serde_json::Value>, key: &str) -> Option<u64> {
999    data.as_ref()?.get(key)?.as_u64()
1000}
1001
1002#[allow(clippy::ptr_arg)]
1003fn recurse_into_children(
1004    children: &mut Vec<ContentItem>,
1005    state: &mut ResolverState<'_>,
1006) -> Result<(), IncludeError> {
1007    for item in children.iter_mut() {
1008        match item {
1009            ContentItem::Session(s) => {
1010                splice_in_session_container(s.children.as_mut_vec(), state)?;
1011            }
1012            ContentItem::Definition(d) => {
1013                splice_in_general_container(&mut d.children, state, ContainerKind::Definition)?;
1014            }
1015            ContentItem::Annotation(a) => {
1016                // Skip the body of annotations whose schema declares
1017                // `hooks.resolve = true` — those are dispatched at the
1018                // parent level by `process_resolves`. Walking their
1019                // bodies *here* would trip the resolve again on the
1020                // same invocation.
1021                //
1022                // The body is still walked when the resolve actually
1023                // runs: `process_resolves` calls
1024                // `resolve_one_invocation`, and the
1025                // [`ResolveOutcome::Spliced`] arm walks the splice
1026                // subtree (which replaces the annotation), while the
1027                // [`ResolveOutcome::Unexpanded`] arm explicitly
1028                // walks the kept annotation's body via
1029                // `splice_in_general_container`. So nested
1030                // resolve-hooked annotations inside an unexpanded
1031                // outer annotation are still reached.
1032                //
1033                // Non-resolve-hooked annotations recurse normally
1034                // here so their nested bodies get processed.
1035                let is_resolve_hooked = state
1036                    .registry
1037                    .schema_for(&a.data.label.value)
1038                    .map(|s| s.hooks.resolve)
1039                    .unwrap_or(false);
1040                if !is_resolve_hooked {
1041                    splice_in_general_container(
1042                        &mut a.children,
1043                        state,
1044                        ContainerKind::AnnotationBody,
1045                    )?;
1046                }
1047            }
1048            ContentItem::List(l) => {
1049                for li in l.items.as_mut_vec().iter_mut() {
1050                    if let ContentItem::ListItem(item) = li {
1051                        splice_in_general_container(
1052                            &mut item.children,
1053                            state,
1054                            ContainerKind::ListItem,
1055                        )?;
1056                    }
1057                }
1058            }
1059            _ => {}
1060        }
1061    }
1062    Ok(())
1063}
1064
1065fn validate_against_kind(
1066    items: &[ContentItem],
1067    kind: ContainerKind,
1068    site: &Range,
1069    file: &Path,
1070) -> Result<(), IncludeError> {
1071    if kind.allows_sessions() {
1072        return Ok(());
1073    }
1074    if items.iter().any(|i| matches!(i, ContentItem::Session(_))) {
1075        return Err(IncludeError::ContainerPolicy {
1076            include_site: site.clone(),
1077            container: kind.name(),
1078            file: file.to_path_buf(),
1079            violation: "Sessions",
1080        });
1081    }
1082    Ok(())
1083}
1084
1085// ============================================================================
1086// Path resolution
1087// ============================================================================
1088
1089/// Resolve a file-reference target string the same way the include
1090/// resolver resolves include paths.
1091///
1092/// Use this when consuming `ReferenceType::File { target }` (or any other
1093/// node-attached path) so that relative paths resolve from the *authoring*
1094/// file's directory, not from wherever the merged document happens to be
1095/// rooted. Pass `ref_origin` as the [`Range::origin_path`] of the inline's
1096/// containing node (or `None` if the node was never stamped — in that case
1097/// the path is treated as if authored at the root).
1098///
1099/// Behaviour matches the include resolver:
1100/// - Root-absolute targets (leading `/`) resolve under `root`.
1101/// - Other targets resolve relative to `ref_origin`'s parent (or `root`
1102///   when `ref_origin` is `None`).
1103/// - The result is lexically normalized and checked against `root` —
1104///   paths that escape it return `RootEscape`.
1105///
1106/// This is a sister to the resolver's internal `resolve_path` and shares
1107/// the same lexical-normalization caveat: it does not touch the filesystem.
1108pub fn resolve_file_reference(
1109    target: &str,
1110    ref_origin: Option<&Path>,
1111    root: &Path,
1112) -> Result<PathBuf, IncludeError> {
1113    let host_dir: PathBuf = ref_origin
1114        .and_then(|p| p.parent())
1115        .map(Path::to_path_buf)
1116        .unwrap_or_else(|| root.to_path_buf());
1117    resolve_path(target, &host_dir, root)
1118}
1119
1120fn resolve_path(src: &str, host_dir: &Path, root: &Path) -> Result<PathBuf, IncludeError> {
1121    let candidate = if let Some(rel) = src.strip_prefix('/') {
1122        // Root-absolute (Lex spec convention): leading `/` means "from
1123        // the resolution root", not "filesystem root".
1124        root.join(rel)
1125    } else {
1126        // Anything else must be a relative path. Reject inputs the
1127        // host platform would treat as absolute (Windows `C:\foo`,
1128        // `\\server\share`, `\foo`) up front: the spec forbids
1129        // platform-absolute paths from entering the resolution
1130        // pipeline. Without this, `host_dir.join(src)` would silently
1131        // discard `host_dir` because Rust's `PathBuf::join` replaces
1132        // the base when the joined path is absolute. The downstream
1133        // root-escape check would still catch the security side, but
1134        // we'd surface a misleading "escapes root" error instead of
1135        // "absolute paths not allowed", and we'd be relying on
1136        // `PathBuf::join`'s override semantics for the security
1137        // outcome rather than holding the line at the input boundary.
1138        if Path::new(src).is_absolute() {
1139            return Err(IncludeError::AbsolutePath {
1140                path: PathBuf::from(src),
1141            });
1142        }
1143        host_dir.join(src)
1144    };
1145    let normalized = lexical_normalize(&candidate);
1146    let canonical_root = lexical_normalize(root);
1147    if !normalized.starts_with(&canonical_root) {
1148        return Err(IncludeError::RootEscape {
1149            path: normalized,
1150            root: canonical_root,
1151        });
1152    }
1153    Ok(normalized)
1154}
1155
/// Normalize a path purely lexically: drop `.` components and cancel
/// `..` against a preceding directory name, without consulting the
/// filesystem.
///
/// `std::fs::canonicalize` is not an option here because it requires the
/// path to exist, which would break tests driven by [`MemoryLoader`].
/// Lexical normalization is enough for include-site resolution: the
/// resolver only needs a stable identity for cycle detection and a
/// uniform shape for the root-escape prefix check.
///
/// A `..` cancels the previous component only when that component is a
/// real directory name (`Component::Normal`). If the buffer is empty, or
/// ends in another `..` or a root marker, the `..` is *kept*.
///
/// Keeping it is what stops `../../etc/passwd` from collapsing to
/// `etc/passwd` and slipping past the root-escape check. A bare
/// `PathBuf::pop` would happily strip a trailing `..` (since
/// `Path::new("..").parent()` is `Some("")`), silently swallowing the
/// second `..` and yielding a path that falsely appears to sit under the
/// root. Every unmatched `..` that survives keeps the result outside any
/// sane root, so the escape check fires as intended.
fn lexical_normalize(p: &Path) -> PathBuf {
    use std::path::Component;

    let mut normalized = PathBuf::new();
    for component in p.components() {
        match component {
            // `.` contributes nothing.
            Component::CurDir => {}
            Component::ParentDir => {
                let ends_in_dir_name = matches!(
                    normalized.components().next_back(),
                    Some(Component::Normal(_))
                );
                if ends_in_dir_name {
                    normalized.pop();
                } else {
                    // Nothing legitimate to cancel: preserve the `..`.
                    normalized.push("..");
                }
            }
            Component::Prefix(_) | Component::RootDir | Component::Normal(_) => {
                normalized.push(component.as_os_str());
            }
        }
    }
    normalized
}
1197
1198// ============================================================================
1199// Origin stamping
1200// ============================================================================
1201//
1202// Walk every node in a Document and set `Range.origin_path` on each
1203// `.location` field. The walk only stamps the *block-level* `.location`
1204// fields here; finer-grained inline ranges land in PR 6 when file-ref
1205// resolution starts consulting them.
1206
1207pub(crate) fn stamp_doc(doc: &mut Document, origin: &Arc<PathBuf>) {
1208    if let Some(title) = doc.title.as_mut() {
1209        title.location.origin_path = Some(Arc::clone(origin));
1210    }
1211    for ann in doc.annotations.iter_mut() {
1212        stamp_annotation(ann, origin);
1213    }
1214    stamp_session(&mut doc.root, origin);
1215}
1216
1217fn stamp_session(s: &mut Session, origin: &Arc<PathBuf>) {
1218    s.location.origin_path = Some(Arc::clone(origin));
1219    if let Some(loc) = s.title.location.as_mut() {
1220        loc.origin_path = Some(Arc::clone(origin));
1221    }
1222    for ann in s.annotations.iter_mut() {
1223        stamp_annotation(ann, origin);
1224    }
1225    for item in s.children.as_mut_vec().iter_mut() {
1226        stamp_item(item, origin);
1227    }
1228}
1229
1230fn stamp_annotation(
1231    a: &mut crate::lex::ast::elements::annotation::Annotation,
1232    origin: &Arc<PathBuf>,
1233) {
1234    a.location.origin_path = Some(Arc::clone(origin));
1235    a.data.location.origin_path = Some(Arc::clone(origin));
1236    for item in a.children.as_mut_vec().iter_mut() {
1237        stamp_item(item, origin);
1238    }
1239}
1240
1241fn stamp_item(item: &mut ContentItem, origin: &Arc<PathBuf>) {
1242    match item {
1243        ContentItem::Session(s) => stamp_session(s, origin),
1244        ContentItem::Annotation(a) => stamp_annotation(a, origin),
1245        ContentItem::Paragraph(p) => {
1246            p.location.origin_path = Some(Arc::clone(origin));
1247            for ann in p.annotations.iter_mut() {
1248                stamp_annotation(ann, origin);
1249            }
1250            for line in p.lines.iter_mut() {
1251                stamp_item(line, origin);
1252            }
1253        }
1254        ContentItem::List(l) => {
1255            l.location.origin_path = Some(Arc::clone(origin));
1256            for li in l.items.as_mut_vec().iter_mut() {
1257                stamp_item(li, origin);
1258            }
1259        }
1260        ContentItem::ListItem(li) => {
1261            li.location.origin_path = Some(Arc::clone(origin));
1262            for ann in li.annotations.iter_mut() {
1263                stamp_annotation(ann, origin);
1264            }
1265            for child in li.children.as_mut_vec().iter_mut() {
1266                stamp_item(child, origin);
1267            }
1268        }
1269        ContentItem::Definition(d) => {
1270            d.location.origin_path = Some(Arc::clone(origin));
1271            for ann in d.annotations.iter_mut() {
1272                stamp_annotation(ann, origin);
1273            }
1274            for child in d.children.as_mut_vec().iter_mut() {
1275                stamp_item(child, origin);
1276            }
1277        }
1278        ContentItem::VerbatimBlock(v) => {
1279            v.location.origin_path = Some(Arc::clone(origin));
1280        }
1281        ContentItem::VerbatimLine(vl) => {
1282            vl.location.origin_path = Some(Arc::clone(origin));
1283        }
1284        ContentItem::Table(t) => {
1285            t.location.origin_path = Some(Arc::clone(origin));
1286        }
1287        ContentItem::TextLine(tl) => {
1288            tl.location.origin_path = Some(Arc::clone(origin));
1289        }
1290        ContentItem::BlankLineGroup(b) => {
1291            b.location.origin_path = Some(Arc::clone(origin));
1292        }
1293    }
1294}
1295
1296// ============================================================================
1297// Parser glue
1298// ============================================================================
1299
1300/// Parse `source` into a Document but skip the annotation-attachment stage,
1301/// so include annotations are findable in container children lists.
1302pub(crate) fn parse_no_attach(source: &str) -> Result<Document, String> {
1303    crate::lex::testing::parse_without_annotation_attachment(source)
1304}
1305
1306// ============================================================================
1307// Filesystem-backed loader
1308// ============================================================================
1309
/// Filesystem-backed [`Loader`].
///
/// This is the production loader used by the CLI; the LSP wraps it with a
/// file-watch invalidation layer in PR 8. lex-core's *resolver* code does
/// not reference `std::fs` — `FsLoader` is the one place that does,
/// isolated behind the [`Loader`] trait so the rest of the crate stays
/// sandbox- and WASM-friendly.
///
/// An `FsLoader` is built around a resolution root and re-validates every
/// load against it *after* `fs::canonicalize`, so a symlink pointing
/// outside the root is rejected even though the lexical-only check in
/// [`resolve_path`] cannot see it. Non-regular files (devices, FIFOs,
/// directories) are rejected before reading, so the loader can't be
/// tricked into blocking on `/dev/zero` or allocating against an open
/// device.
///
/// Error mapping performed by `load`:
/// - canonicalization fails with `ErrorKind::NotFound` (missing file,
///   broken symlink, …) → [`LoadError::NotFound`]
/// - canonicalization fails for any other reason (e.g. permission
///   denied) → [`LoadError::Io`]
/// - canonical path is not under the canonical root →
///   [`LoadError::OutsideRoot`]
/// - target is not a regular file → [`LoadError::Io`] with a clear message
/// - file size exceeds the configured cap → [`LoadError::TooLarge`]
/// - any other I/O error while reading → [`LoadError::Io`]
pub struct FsLoader {
    /// Filesystem-canonical resolution root, computed once in
    /// [`FsLoader::new`]. If canonicalization fails (e.g. the configured
    /// root doesn't exist on disk) the input is stored verbatim; the
    /// bounds check then simply never passes, which the user sees as
    /// `LoadError::OutsideRoot` rather than a silently disabled security
    /// check.
    canonical_root: PathBuf,
    /// Per-file size cap in bytes. Files whose size exceeds it surface as
    /// `LoadError::TooLarge`. Defaults to
    /// [`FsLoader::DEFAULT_MAX_FILE_SIZE`].
    max_file_size: u64,
}

impl FsLoader {
    /// Default per-file size cap: 10 MiB. Generous for realistic Lex
    /// source documents (text only) yet tight enough to bound the memory
    /// a single include can claim when pointed at an adversarial 1 GB
    /// file.
    pub const DEFAULT_MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;

    /// Build a loader rooted at `root` with the default size cap.
    ///
    /// The root is stored in its fs-canonical form (symlinks resolved)
    /// when that can be computed, and as given otherwise. Later loads
    /// check that the requested path's canonical form lives under it.
    pub fn new(root: PathBuf) -> Self {
        let canonical_root = match std::fs::canonicalize(&root) {
            Ok(resolved) => resolved,
            // Fall back to the caller's path rather than failing construction.
            Err(_) => root,
        };
        Self {
            canonical_root,
            max_file_size: Self::DEFAULT_MAX_FILE_SIZE,
        }
    }

    /// Replace the per-file size cap (bytes), builder-style. Widen it for
    /// projects with genuinely large sources, or tighten it for stricter
    /// sandboxes (e.g. LSPs serving untrusted content).
    pub fn with_max_file_size(self, max_file_size: u64) -> Self {
        Self {
            max_file_size,
            ..self
        }
    }
}
1371
1372impl Loader for FsLoader {
1373    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
1374        // 1. Canonicalize. Resolves symlinks and `..` segments against the
1375        //    real filesystem. NotFound / broken-symlink / permission errors
1376        //    all surface here.
1377        let canonical_path = std::fs::canonicalize(path).map_err(|e| match e.kind() {
1378            std::io::ErrorKind::NotFound => LoadError::NotFound {
1379                path: path.to_path_buf(),
1380            },
1381            _ => LoadError::Io {
1382                path: path.to_path_buf(),
1383                message: e.to_string(),
1384            },
1385        })?;
1386
1387        // 2. Bounds check against the *canonical* root. This is the
1388        //    actual security gate against symlink traversal — the lexical
1389        //    check in resolve_path can't see through symlinks.
1390        if !canonical_path.starts_with(&self.canonical_root) {
1391            return Err(LoadError::OutsideRoot {
1392                path: canonical_path,
1393                root: self.canonical_root.clone(),
1394            });
1395        }
1396
1397        // 3. Reject non-regular files. Without this, an attacker (with
1398        //    write access to the repo) could symlink an include target to
1399        //    `/dev/zero` or a FIFO and block / OOM the reader. The
1400        //    is_file() metadata call is a cheap sanity check.
1401        let meta = std::fs::metadata(&canonical_path).map_err(|e| LoadError::Io {
1402            path: canonical_path.clone(),
1403            message: e.to_string(),
1404        })?;
1405        if !meta.is_file() {
1406            return Err(LoadError::Io {
1407                path: canonical_path,
1408                message: "include target is not a regular file".to_string(),
1409            });
1410        }
1411
1412        // 4. Size cap. Bounds memory allocation per include against an
1413        //    adversarial 1 GB file before any bytes hit the heap.
1414        let size = meta.len();
1415        if size > self.max_file_size {
1416            return Err(LoadError::TooLarge {
1417                path: canonical_path,
1418                size,
1419                limit: self.max_file_size,
1420            });
1421        }
1422
1423        // 5. Read. By this point we know the path is a regular file under
1424        //    the canonical root and within the size cap; anything that
1425        //    fails here is a real I/O error worth surfacing.
1426        let source = std::fs::read_to_string(&canonical_path).map_err(|e| LoadError::Io {
1427            path: canonical_path.clone(),
1428            message: e.to_string(),
1429        })?;
1430
1431        Ok(LoadedFile {
1432            source,
1433            canonical_path,
1434        })
1435    }
1436}
1437
1438// ============================================================================
1439// Test fixtures (test-support feature + cfg(test))
1440// ============================================================================
1441
/// In-memory [`Loader`] backed by a `HashMap<PathBuf, String>`.
///
/// Only compiled for unit tests or when the `test-support` feature is
/// enabled. Files are looked up by the exact path they were registered
/// under; no filesystem access takes place.
///
/// Derives `Debug` and `Clone` so a loader can show up in assertion
/// output and be duplicated between test cases.
#[cfg(any(test, feature = "test-support"))]
#[derive(Debug, Clone)]
pub struct MemoryLoader {
    /// Registered files, keyed by lookup path and holding the source text.
    files: std::collections::HashMap<PathBuf, String>,
}
1447
#[cfg(any(test, feature = "test-support"))]
impl MemoryLoader {
    /// Build a loader with no files registered. Populate it afterwards
    /// with [`MemoryLoader::insert`].
    pub fn new() -> Self {
        let files = std::collections::HashMap::new();
        Self { files }
    }

    /// Store `contents` as the source text for `path`.
    ///
    /// Text previously registered under the same path is replaced. The
    /// loader is returned by mutable reference so calls can be chained.
    pub fn insert<P: Into<PathBuf>, S: Into<String>>(&mut self, path: P, contents: S) -> &mut Self {
        let key: PathBuf = path.into();
        let text: String = contents.into();
        self.files.insert(key, text);
        self
    }

    /// Convenience constructor: gather any iterator of `(path, contents)`
    /// pairs into a fresh loader.
    ///
    /// When the same path appears more than once, the last pair wins.
    pub fn from_pairs<I, P, S>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (P, S)>,
        P: Into<PathBuf>,
        S: Into<String>,
    {
        let files: std::collections::HashMap<PathBuf, String> = pairs
            .into_iter()
            .map(|(path, contents)| (path.into(), contents.into()))
            .collect();
        Self { files }
    }
}
1478
#[cfg(any(test, feature = "test-support"))]
impl Default for MemoryLoader {
    /// Same as [`MemoryLoader::new`]: a loader with no files registered.
    fn default() -> Self {
        MemoryLoader::new()
    }
}
1485
#[cfg(any(test, feature = "test-support"))]
impl Loader for MemoryLoader {
    /// Return the text registered under `path`, or
    /// [`LoadError::NotFound`] when nothing was registered for it.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
        // An in-memory map has no symlinks, so the key used for the
        // lookup doubles as the file's canonical identity. The resolver's
        // cycle detection compares `LoadedFile::canonical_path` values,
        // and in tests those line up with the lexically-normalized paths
        // the resolver already hands us.
        match self.files.get(path) {
            Some(text) => Ok(LoadedFile {
                source: text.clone(),
                canonical_path: path.to_path_buf(),
            }),
            None => Err(LoadError::NotFound {
                path: path.to_path_buf(),
            }),
        }
    }
}
1506
1507// ============================================================================
1508// Tests
1509// ============================================================================
1510
1511#[cfg(test)]
1512mod tests;