Skip to main content

lex_core/lex/
includes.rs

1//! Include resolution for Lex documents.
2//!
3//! This module turns `:: lex.include src="..." ::` annotations into spliced
4//! content from the referenced files. It is *opt-in*: callers that want the
5//! unresolved tree (the formatter, tree-sitter parity, editor tooling that
6//! displays include statements as authored) skip this pass entirely. The
7//! parser itself never touches the filesystem — all I/O goes through the
8//! injected [`Loader`] trait.
9//!
10//! See `comms/specs/proposals/includes.lex` for the full design.
11//!
12//! # Status
13//!
//! This module is being built up across PRs 3–7:
15//!
16//! - PR 3: skeleton — trait, config, errors, stub.
17//! - PR 4: single-pass splice + container-policy validation +
18//!   doc-title/doc-annotation conversion + origin stamping + root-escape
19//!   check.
20//! - PR 5: recursive resolution into included files + cycle detection
21//!   (chain stack) + depth limit. Each loaded file gets walked in its OWN
22//!   directory, so relative paths inside an included file resolve from
23//!   that file's directory, not the entry's.
24//! - PR 6: origin-aware reference helpers. [`resolve_file_reference`]
25//!   resolves a `ReferenceType::File` target from the authoring file's
26//!   directory using `Range.origin_path`.
27//!   `Document::find_annotation_by_label_in_origin` scopes footnote
28//!   lookups to the file the reference was authored in.
29//! - PR 7 (this PR): [`FsLoader`] — production loader that reads from the
30//!   filesystem with `std::fs::read_to_string`. CLI wires the resolver
31//!   into `lex convert` and `lex inspect` (default-on, opt-out via
32//!   `--no-includes`); `lex format` never expands.
33//!
34//! # Layering
35//!
36//! Of all of lex-core, only [`FsLoader`] references `std::fs`. The
37//! resolver itself does no I/O — it always goes through the [`Loader`]
38//! trait. Callers can swap loaders to keep the resolver sandboxed:
39//!
40//! - The LSP wraps [`FsLoader`] with file-watch invalidation (PR 8).
41//! - WASM builds provide a JS-backed loader instead of [`FsLoader`].
42//! - Tests use [`MemoryLoader`] (gated behind `test-support`).
43//!
44//! For tests, lex-core itself ships [`MemoryLoader`] gated behind the
45//! `test-support` cargo feature. It is not intended for production use.
46
47// `IncludeError` carries diagnostic context (paths, source ranges,
48// handler messages) on every variant; the `result_large_err` lint
49// would have us box the whole error or split it into a thinner shape
50// just to satisfy the size heuristic. The enum is already part of
51// the public API and the error path is rare; suppress the lint for
52// this module rather than churn the public surface.
53#![allow(clippy::result_large_err)]
54
55use crate::lex::assembling::stages::{ApplyTableConfig, NormalizeLabels};
56use crate::lex::assembling::AttachAnnotations;
57use crate::lex::ast::elements::container::GeneralContainer;
58use crate::lex::ast::elements::content_item::ContentItem;
59use crate::lex::ast::elements::session::Session;
60use crate::lex::ast::range::Range;
61use crate::lex::ast::Document;
62use crate::lex::transforms::Runnable;
63use lex_extension::handler::HandlerError;
64use lex_extension_host::registry::Registry;
65use std::path::{Path, PathBuf};
66use std::sync::Arc;
67
/// Settings that bound the include resolution pass.
#[derive(Debug, Clone)]
pub struct ResolveConfig {
    /// Root directory every include path must resolve under. An include
    /// that canonicalizes to a location outside this directory is
    /// reported as [`IncludeError::RootEscape`].
    ///
    /// The root must be an **absolute** path. Lexical normalization
    /// treats `.` and `..` against an empty buffer as no-ops, so a
    /// relative or unnormalized root weakens the root-escape prefix
    /// check. Callers (CLI, LSP) are expected to canonicalize the root
    /// before building a `ResolveConfig`.
    pub root: PathBuf,
    /// Upper bound on include nesting. Defaults to 8
    /// (see [`ResolveConfig::DEFAULT_MAX_DEPTH`]). Reaching the bound is
    /// an error; content is never silently truncated.
    pub max_depth: usize,
    /// Upper bound on the total number of `lex.include` annotations
    /// resolved over the whole tree (depth × breadth). Defaults to 1000
    /// (see [`ResolveConfig::DEFAULT_MAX_TOTAL_INCLUDES`]).
    ///
    /// This is the fan-out cap: `max_depth` limits how long a chain can
    /// get, but says nothing about breadth. A document with 100 thousand
    /// top-level includes at depth 1 is within `max_depth` yet can still
    /// OOM the resolver / LSP / CI. Reaching the bound is an error;
    /// content is never silently truncated.
    pub max_total_includes: usize,
}

impl ResolveConfig {
    /// Default include depth limit. Deep enough for any reasonable
    /// atomization strategy (aggregator → per-chapter → per-section),
    /// shallow enough that the resolver's worst-case work stays
    /// predictable.
    pub const DEFAULT_MAX_DEPTH: usize = 8;

    /// Default total include limit (DoS bound). Roomy enough for a
    /// book-length document made of thousands of small fragments, tight
    /// enough to contain adversarial fan-out within a few seconds of
    /// resolver work.
    pub const DEFAULT_MAX_TOTAL_INCLUDES: usize = 1000;

    /// Build a config for `root`, with both limits set to their defaults.
    pub fn with_root(root: PathBuf) -> Self {
        let max_depth = Self::DEFAULT_MAX_DEPTH;
        let max_total_includes = Self::DEFAULT_MAX_TOTAL_INCLUDES;
        Self {
            root,
            max_depth,
            max_total_includes,
        }
    }
}
115
116/// A pluggable source-text loader.
117///
118/// Implementations decide where bytes come from (filesystem, in-memory map,
119/// virtual filesystem, content-addressed store, …). lex-core never references
120/// `std::fs` directly through this trait; that keeps the resolver pure and
121/// usable in WASM, sandboxes, and unit tests.
122pub trait Loader {
123    /// Load the source text for `path` and return both the contents and a
124    /// canonical identity for the loaded resource. The path is what the
125    /// resolver decided on after applying the rules in §4 of the proposal.
126    ///
127    /// `LoadedFile::canonical_path` is the loader's authoritative identity
128    /// for the resource. For [`FsLoader`] this is the filesystem-canonical
129    /// path (symlinks resolved, case-folded if the underlying FS is
130    /// case-insensitive); for [`MemoryLoader`] it's the lookup key (since
131    /// memory loaders have no symlinks). The resolver uses this for cycle
132    /// detection and for stamping `Range.origin_path` on the loaded tree.
133    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError>;
134}
135
/// What a successful [`Loader::load`] hands back.
#[derive(Debug, Clone)]
pub struct LoadedFile {
    /// Source text of the loaded file.
    pub source: String,
    /// Authoritative identity of the resource, as chosen by the loader.
    /// [`Loader::load`] describes how each loader picks it.
    pub canonical_path: PathBuf,
}
145
/// Failure modes of a [`Loader`].
#[derive(Debug, Clone)]
pub enum LoadError {
    /// Nothing exists at the requested path.
    NotFound { path: PathBuf },
    /// The resource exists, but it resolves outside the boundary the
    /// loader is allowed to read from. The lexical resolver already
    /// normalizes `..` in the requested path; loaders backed by a real
    /// filesystem must check again after canonicalization, because a
    /// symlink can escape the boundary through a path that is lexically
    /// inside it.
    OutsideRoot { path: PathBuf, root: PathBuf },
    /// The resource exists, but it is larger than the loader's configured
    /// limit. `size` and `limit` are byte counts. The resolver turns this
    /// into [`IncludeError::FileTooLarge`], adding the offending
    /// annotation's site.
    TooLarge {
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// An underlying I/O failure (or the virtual-filesystem equivalent).
    Io { path: PathBuf, message: String },
}

impl std::fmt::Display for LoadError {
    /// Write the one-line, human-readable message for this error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotFound { path } => {
                write!(f, "include not found: {}", path.display())
            }
            Self::OutsideRoot { path, root } => {
                write!(
                    f,
                    "include path {} resolves outside loader root {}",
                    path.display(),
                    root.display()
                )
            }
            Self::TooLarge { path, size, limit } => {
                write!(
                    f,
                    "include file {} is {size} bytes, exceeds limit of {limit} bytes",
                    path.display()
                )
            }
            Self::Io { path, message } => {
                write!(f, "io error reading {}: {message}", path.display())
            }
        }
    }
}

impl std::error::Error for LoadError {}
192
193/// Errors the include resolver can produce.
194#[derive(Debug, Clone)]
195pub enum IncludeError {
196    /// An include chain looped back on itself. `chain` is the resolution
197    /// stack at the moment the duplicate `path` was about to be pushed,
198    /// in source-order (entry first, deepest last). `include_site` is the
199    /// range of the offending `lex.include` annotation in its host file —
200    /// useful for diagnostics that highlight the exact line.
201    Cycle {
202        include_site: Range,
203        path: PathBuf,
204        chain: Vec<PathBuf>,
205    },
206    /// The include depth exceeded [`ResolveConfig::max_depth`]. `chain`
207    /// shows the resolution stack at the moment of failure, in source
208    /// order. `include_site` is the range of the offending
209    /// `lex.include` annotation in its host file.
210    DepthExceeded {
211        include_site: Range,
212        limit: usize,
213        chain: Vec<PathBuf>,
214    },
215    /// The total number of includes resolved across the document
216    /// exceeded [`ResolveConfig::max_total_includes`]. Bounds adversarial
217    /// fan-out (which `max_depth` alone does not). `include_site` is the
218    /// `lex.include` annotation that pushed the count past the limit.
219    TotalIncludesExceeded { include_site: Range, limit: usize },
220    /// The included file's size exceeded the loader's configured limit.
221    /// Surfaced by loaders that read from a real filesystem (FsLoader)
222    /// to bound memory allocation per include. `include_site` is the
223    /// offending annotation; `size` and `limit` are in bytes.
224    FileTooLarge {
225        include_site: Range,
226        path: PathBuf,
227        size: u64,
228        limit: u64,
229    },
230    /// A path resolved outside the configured [`ResolveConfig::root`].
231    RootEscape { path: PathBuf, root: PathBuf },
232    /// The include `src` was a platform-absolute filesystem path
233    /// (e.g. Windows `C:\foo`, `\\server\share`, `\foo`). The spec
234    /// forbids absolute filesystem paths from entering the
235    /// resolution pipeline; the *root-absolute* form (leading `/`
236    /// resolved against the includes root) is the only spec-allowed
237    /// way to write a path that doesn't start from the host's
238    /// directory. On Unix the only thing that's `Path::is_absolute()`
239    /// is a leading `/`, which is consumed by the root-absolute
240    /// branch first; this variant therefore only fires in practice
241    /// for Windows-shaped absolute paths.
242    AbsolutePath { path: PathBuf },
243    /// The loader could not find or read the included file. `include_site`
244    /// is the range of the offending `lex.include` annotation in its host
245    /// file, so editors can squiggle the line that asked for the missing
246    /// file rather than the document head.
247    NotFound { include_site: Range, path: PathBuf },
248    /// The loader returned text that the parser rejected.
249    ParseFailed { path: PathBuf, message: String },
250    /// The included file's content is not legal in the include site's
251    /// parent container.
252    ///
253    /// Today this only occurs when an included file has top-level Sessions
254    /// and the include site is inside a `GeneralContainer` (Definition,
255    /// ListItem, or another Annotation's body). The `violation` field
256    /// names the offending content kind (e.g. `"Sessions"`) so future
257    /// container/policy combinations can reuse this variant without a
258    /// breaking change.
259    ContainerPolicy {
260        include_site: Range,
261        container: &'static str,
262        file: PathBuf,
263        violation: &'static str,
264    },
265    /// Loader propagated a non-`NotFound` I/O error.
266    LoaderIo { path: PathBuf, message: String },
267    /// `lex.include` annotation was missing the mandatory `src=` parameter.
268    MissingSrc { include_site: Range },
269    /// A registered handler returned an error the pass could not map
270    /// onto a more specific variant — typically a third-party
271    /// namespace's resolve hook surfacing an internal failure, or an
272    /// unrecognised handler-defined code from `lex.*` built-ins. The
273    /// `code` is the string identifier the registry attaches to the
274    /// diagnostic (`"handler.internal"`, `"handler.custom"`, …).
275    HandlerFailed {
276        include_site: Range,
277        label: String,
278        code: String,
279        message: String,
280    },
281}
282
283impl std::fmt::Display for IncludeError {
284    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
285        match self {
286            IncludeError::Cycle { path, chain, .. } => {
287                let chain_display: Vec<String> =
288                    chain.iter().map(|p| p.display().to_string()).collect();
289                write!(
290                    f,
291                    "include cycle: {} (chain: {})",
292                    path.display(),
293                    chain_display.join(" -> ")
294                )
295            }
296            IncludeError::DepthExceeded { limit, chain, .. } => {
297                let chain_display: Vec<String> =
298                    chain.iter().map(|p| p.display().to_string()).collect();
299                write!(
300                    f,
301                    "include depth exceeded limit of {limit} (chain: {})",
302                    chain_display.join(" -> ")
303                )
304            }
305            IncludeError::TotalIncludesExceeded { limit, .. } => {
306                write!(f, "total include count exceeded limit of {limit}")
307            }
308            IncludeError::FileTooLarge {
309                path, size, limit, ..
310            } => {
311                write!(
312                    f,
313                    "included file {} is {size} bytes, exceeds limit of {limit} bytes",
314                    path.display()
315                )
316            }
317            IncludeError::RootEscape { path, root } => write!(
318                f,
319                "include path {} escapes resolution root {}",
320                path.display(),
321                root.display()
322            ),
323            IncludeError::AbsolutePath { path } => write!(
324                f,
325                "include src {} is a platform-absolute path; \
326                 the spec forbids absolute filesystem paths — use a relative path \
327                 (chapters/01.lex) or a root-absolute path (/shared/01.lex)",
328                path.display()
329            ),
330            IncludeError::NotFound { path, .. } => {
331                write!(f, "include not found: {}", path.display())
332            }
333            IncludeError::ParseFailed { path, message } => {
334                write!(f, "failed to parse {}: {message}", path.display())
335            }
336            IncludeError::ContainerPolicy {
337                container,
338                file,
339                violation,
340                ..
341            } => write!(
342                f,
343                "included file {} contains {} but include site is inside {} \
344                 (which does not allow {})",
345                file.display(),
346                violation,
347                container,
348                violation
349            ),
350            IncludeError::LoaderIo { path, message } => {
351                write!(f, "loader error reading {}: {message}", path.display())
352            }
353            IncludeError::MissingSrc { .. } => {
354                write!(f, "lex.include annotation missing required src= parameter")
355            }
356            IncludeError::HandlerFailed {
357                label,
358                code,
359                message,
360                ..
361            } => write!(f, "extension handler `{label}` failed ({code}): {message}"),
362        }
363    }
364}
365
366impl std::error::Error for IncludeError {}
367
368// No `From<LoadError>` impl: `IncludeError::NotFound` carries the include
369// site (the `lex.include` annotation's range), which a loader doesn't know
370// about. Callers map `LoadError` explicitly at the call site, where the
371// site is available.
372
/// The kind of container an include site lives in. It selects the
/// splice-time policy check; the only one today is "no Sessions in
/// `GeneralContainer`".
#[derive(Debug, Clone, Copy)]
enum ContainerKind {
    /// `Document.root.children` or `Session.children` — accepts everything.
    Session,
    /// `Definition.children` — a `GeneralContainer`.
    Definition,
    /// `Annotation.children` — a `GeneralContainer`.
    AnnotationBody,
    /// `ListItem.children` — a `GeneralContainer`.
    ListItem,
}

impl ContainerKind {
    /// Human-readable name of the container kind, as used in
    /// diagnostics.
    fn name(self) -> &'static str {
        match self {
            Self::Session => "Session",
            Self::Definition => "Definition",
            Self::AnnotationBody => "Annotation body",
            Self::ListItem => "ListItem",
        }
    }

    /// Whether Sessions may be spliced into this kind of container.
    /// Only `Session` containers accept them.
    fn allows_sessions(self) -> bool {
        match self {
            Self::Session => true,
            Self::Definition | Self::AnnotationBody | Self::ListItem => false,
        }
    }
}
401
/// Non-configurable ceiling on resolution depth. It applies even when
/// [`ResolveConfig::max_depth`] is set higher. Its job is to bound
/// adversarial varying-position recursion — a handler that returns
/// content with a different invocation site on every iteration, so the
/// cycle key never matches — guaranteeing that the resolver always
/// terminates.
pub const KERNEL_DEPTH_BACKSTOP: usize = 32;
408
409/// Resolve every `hooks.resolve = true` labelled annotation starting
410/// from `source`, dispatching through `registry`, and recursively
411/// processing the spliced content.
412///
413/// `source_path` identifies the entry-point file. It is used to
414/// (a) stamp `Range.origin_path` on every node so downstream code
415/// (file-ref resolution, diagnostics, LSP goto) can report locations
416/// against the authoring file, and (b) provide the host directory
417/// the built-in `lex.include` handler resolves relative `src=` paths
418/// against (via `LabelCtx.node.origin`). When `None`, origin stamping
419/// is skipped on the entry and the handler resolves relative paths
420/// against `config.root`.
421///
422/// # Generic dispatch
423///
424/// Every label whose schema declares `hooks.resolve = true` flows
425/// through the same path: build a [`LabelCtx`] from the annotation,
426/// call [`Registry::dispatch_resolve_raw`], decode the returned
427/// [`WireNode`] back into typed [`ContentItem`]s via
428/// [`crate::lex::wire::from_wire_node`], and splice in place. The
429/// built-in `lex.include` handler is registered the same way as any
430/// third-party namespace.
431///
432/// # Pre/post-attachment
433///
434/// Internally this re-parses the entry source *without* annotation
435/// attachment so labelled annotations stay visible as standalone
436/// children. The handler does its own `parse_no_attach` for loaded
437/// content. After all splices, [`AttachAnnotations`] runs once on
438/// the merged tree.
439///
440/// # Recursion + cycle detection
441///
442/// Cycle detection keys on `(label, origin_path, start_position)` of
443/// the invocation site. A handler that returns content containing
444/// another invocation at the same source position is caught
445/// immediately. A handler that varies the invocation position each
446/// iteration terminates at `min(config.max_depth, KERNEL_DEPTH_BACKSTOP)`
447/// with `IncludeError::DepthExceeded`. The total-includes counter
448/// caps adversarial fan-out independent of depth.
449pub fn resolve_from_source(
450    source: &str,
451    source_path: Option<PathBuf>,
452    config: &ResolveConfig,
453    registry: &Registry,
454) -> Result<Document, IncludeError> {
455    let entry_origin = source_path.as_ref().map(|p| Arc::new(p.clone()));
456
457    let mut doc = parse_no_attach(source).map_err(|message| IncludeError::ParseFailed {
458        path: source_path.clone().unwrap_or_default(),
459        message,
460    })?;
461
462    if let Some(origin) = entry_origin.as_ref() {
463        stamp_doc(&mut doc, origin);
464    }
465
466    // Normalise labels in the entry source BEFORE the resolve walk so
467    // shortcut spellings (`:: include ::`, `:: image ::`, …) are
468    // rewritten to their canonical form. The resolve dispatcher keys
469    // on `registry.schema_for(label)` with the canonical spelling, so
470    // without this an `:: include src=... ::` annotation would be
471    // skipped because no schema is registered under the bare alias.
472    //
473    // Permissive mode: unknown labels are left as-is rather than
474    // erroring. The standard parse pipeline enforces strict-mode
475    // namespace policy (`STRING_TO_AST`); the resolve entry point is
476    // a downstream stage that just needs the shortcut table applied
477    // so dispatch finds the right handler.
478    let mut doc =
479        NormalizeLabels::permissive()
480            .run(doc)
481            .map_err(|e| IncludeError::ParseFailed {
482                path: source_path.clone().unwrap_or_default(),
483                message: format!("label normalisation failed: {e}"),
484            })?;
485
486    let mut chain: Vec<ResolveKey> = Vec::new();
487    let mut state = ResolverState {
488        config,
489        registry,
490        chain: &mut chain,
491        depth: 0,
492        total_resolved: 0,
493    };
494
495    splice_in_session_container(doc.root.children.as_mut_vec(), &mut state)?;
496
497    let doc = AttachAnnotations::new()
498        .run(doc)
499        .map_err(|e| IncludeError::ParseFailed {
500            path: source_path.clone().unwrap_or_default(),
501            message: format!("annotation attachment failed: {e}"),
502        })?;
503
504    // Re-normalise after splicing. Each included file is parsed via
505    // `parse_no_attach` (no normalisation), so shortcut labels in the
506    // spliced content — e.g. `:: image src=... ::` inside an included
507    // chapter — need rewriting before downstream IR/format passes can
508    // dispatch them.
509    let doc = NormalizeLabels::permissive()
510        .run(doc)
511        .map_err(|e| IncludeError::ParseFailed {
512            path: source_path.clone().unwrap_or_default(),
513            message: format!("label normalisation failed: {e}"),
514        })?;
515
516    // Apply table configuration so `:: table header=N align=... ::`
517    // annotations attached to tables (here or in spliced content) take
518    // effect — matches the order the standard pipeline runs them.
519    let doc = ApplyTableConfig::new()
520        .run(doc)
521        .map_err(|e| IncludeError::ParseFailed {
522            path: source_path.unwrap_or_default(),
523            message: format!("table config application failed: {e}"),
524        })?;
525
526    Ok(doc)
527}
528
529// ============================================================================
530// Splicing
531// ============================================================================
532
533/// One frame on the resolve-pass cycle stack. Two invocations at the
534/// same `(label, origin, start)` position are a cycle, regardless of
535/// what parameters either invocation uses — a handler that varies
536/// params per call (random IDs, timestamps) cannot defeat the
537/// detector by changing param values.
538#[derive(Debug, Clone, PartialEq)]
539struct ResolveKey {
540    label: String,
541    /// `Range.origin_path` of the annotation — the file the
542    /// invocation was authored in. `None` when stamping was skipped
543    /// (e.g., entry source loaded from a string with no path).
544    origin: Option<PathBuf>,
545    start: crate::lex::ast::range::Position,
546}
547
548impl ResolveKey {
549    fn from_annotation(a: &crate::lex::ast::elements::annotation::Annotation) -> Self {
550        Self {
551            label: a.data.label.value.clone(),
552            origin: a.location.origin_path.as_ref().map(|p| (**p).clone()),
553            start: a.location.start,
554        }
555    }
556}
557
558/// Per-resolution state threaded through the recursive walker. Keeps the
559/// signatures of the splice/process functions short and ensures
560/// `chain`/`depth` are updated in lock-step (push/pop, +1/back-out) at
561/// each invocation.
562struct ResolverState<'a> {
563    config: &'a ResolveConfig,
564    registry: &'a Registry,
565    /// Active resolution stack of `(label, origin, position)` keys.
566    /// Pushed when we begin dispatching for an invocation and popped
567    /// when its splice subtree is fully resolved. A push that finds
568    /// the same key already on the stack is a cycle.
569    chain: &'a mut Vec<ResolveKey>,
570    /// Number of dispatch hops from the entry point. Each recursion
571    /// increments by 1. Hitting `config.max_depth` or the
572    /// [`KERNEL_DEPTH_BACKSTOP`] (whichever is lower) is an error.
573    depth: usize,
574    /// Total invocations resolved across the entire walk
575    /// (depth × breadth). Incremented on every successful dispatch.
576    /// Hitting `config.max_total_includes` aborts with
577    /// `TotalIncludesExceeded`.
578    total_resolved: usize,
579}
580
581fn splice_in_session_container(
582    children: &mut Vec<ContentItem>,
583    state: &mut ResolverState<'_>,
584) -> Result<(), IncludeError> {
585    // Post-order: recurse into nested containers first, splice this
586    // container's invocations second. Recursion happens inside
587    // `process_resolves` for any spliced subtree, so that subtree
588    // is never re-walked at the parent level.
589    recurse_into_children(children, state)?;
590    process_resolves(children, state, ContainerKind::Session)
591}
592
593fn splice_in_general_container(
594    container: &mut GeneralContainer,
595    state: &mut ResolverState<'_>,
596    kind: ContainerKind,
597) -> Result<(), IncludeError> {
598    recurse_into_children(container.as_mut_vec(), state)?;
599    process_resolves(container.as_mut_vec(), state, kind)
600}
601
602/// Walk the children of a container, dispatch every annotation whose
603/// schema declares `hooks.resolve = true` through the registry, and
604/// splice the returned content in place of the annotation. Recurses
605/// into the spliced content so nested invocations resolve too.
606// Allow &mut Vec because `splice` needs Vec-specific operations.
607#[allow(clippy::ptr_arg)]
608fn process_resolves(
609    children: &mut Vec<ContentItem>,
610    state: &mut ResolverState<'_>,
611    kind: ContainerKind,
612) -> Result<(), IncludeError> {
613    // Collect indices of annotations whose schema has hooks.resolve.
614    let resolve_indices: Vec<usize> = children
615        .iter()
616        .enumerate()
617        .filter_map(|(i, item)| match item {
618            ContentItem::Annotation(a) => {
619                let label = &a.data.label.value;
620                if state
621                    .registry
622                    .schema_for(label)
623                    .map(|s| s.hooks.resolve)
624                    .unwrap_or(false)
625                {
626                    Some(i)
627                } else {
628                    None
629                }
630            }
631            _ => None,
632        })
633        .collect();
634
635    for i in resolve_indices.into_iter().rev() {
636        let annotation = match &children[i] {
637            ContentItem::Annotation(a) => a.clone(),
638            _ => unreachable!("index came from resolve filter"),
639        };
640
641        match resolve_one_invocation(&annotation, state, kind)? {
642            ResolveOutcome::Spliced(splice_items) => {
643                // Expansion replaces the directive with the included content. The
644                // `lex.include` annotation is consumed — drop it. (It used to be
645                // kept in the stream as provenance, relying on the serializer
646                // dropping attached annotations; now that the serializer emits
647                // them (lex#682), keeping it would leak `:: lex.include ::` into
648                // expanded output. Origin provenance is tracked on
649                // `Range.origin_path`, not this node.)
650                children.splice(i..=i, splice_items);
651            }
652            ResolveOutcome::Unexpanded => {
653                // Handler opted out of expanding this invocation. The
654                // annotation stays in place, but its body wasn't
655                // walked by `recurse_into_children` (that walker
656                // skips resolve-hooked annotations to avoid double-
657                // resolution). Walk the body now so any nested
658                // invocations inside the unexpanded annotation get
659                // resolved on the way back up.
660                let mut owned = annotation;
661                splice_in_general_container(
662                    &mut owned.children,
663                    state,
664                    ContainerKind::AnnotationBody,
665                )?;
666                children[i] = ContentItem::Annotation(owned);
667            }
668        }
669    }
670
671    Ok(())
672}
673
674/// Outcome of dispatching a single resolve-hooked annotation. The
675/// pass needs to distinguish between "handler returned content,
676/// splice it in" and "handler opted out, leave the annotation
677/// alone": the second case still requires walking the annotation's
678/// body for nested invocations because `recurse_into_children`
679/// otherwise skips resolve-hooked annotations to prevent double-
680/// resolution.
681enum ResolveOutcome {
682    Spliced(Vec<ContentItem>),
683    Unexpanded,
684}
685
686/// Dispatch a single resolve-hooked annotation through the registry,
687/// decode the returned `WireNode` back into typed children, then
688/// recursively walk the splice items so nested invocations resolve
689/// before the splice is placed into the parent container.
690///
691/// Returns [`ResolveOutcome::Unexpanded`] when the handler returned
692/// `Ok(None)` (third-party handlers can opt out of expanding a
693/// particular invocation). The caller is then responsible for
694/// walking the annotation's body for nested invocations — the
695/// resolve walker normally skips resolve-hooked annotations'
696/// bodies.
697fn resolve_one_invocation(
698    annotation: &crate::lex::ast::elements::annotation::Annotation,
699    state: &mut ResolverState<'_>,
700    parent_kind: ContainerKind,
701) -> Result<ResolveOutcome, IncludeError> {
702    let label = &annotation.data.label.value;
703    let key = ResolveKey::from_annotation(annotation);
704
705    // Cycle check on (label, origin, start) of the invocation site.
706    if state.chain.contains(&key) {
707        return Err(IncludeError::Cycle {
708            include_site: annotation.location.clone(),
709            path: key.origin.clone().unwrap_or_default(),
710            chain: state
711                .chain
712                .iter()
713                .map(|k| k.origin.clone().unwrap_or_default())
714                .collect(),
715        });
716    }
717
718    // Depth check. The effective limit is the lower of the
719    // user-facing `config.max_depth` (default 8) and the hard
720    // [`KERNEL_DEPTH_BACKSTOP`] (32, fixed). The kernel backstop
721    // exists for adversarial varying-position recursion that the
722    // cycle key can't catch — even if a user bumps `max_depth`
723    // higher than 32 for legitimate deep atomization, the backstop
724    // still terminates. The error reports `effective_depth_limit`
725    // (the actual cap that fired) rather than `config.max_depth`,
726    // so when the backstop is the binding limit the user sees `32`
727    // and not the (higher) config value.
728    let effective_depth_limit = state.config.max_depth.min(KERNEL_DEPTH_BACKSTOP);
729    if state.depth >= effective_depth_limit {
730        return Err(IncludeError::DepthExceeded {
731            include_site: annotation.location.clone(),
732            limit: effective_depth_limit,
733            chain: state
734                .chain
735                .iter()
736                .map(|k| k.origin.clone().unwrap_or_default())
737                .collect(),
738        });
739    }
740
741    // Total-count check before dispatch.
742    if state.total_resolved >= state.config.max_total_includes {
743        return Err(IncludeError::TotalIncludesExceeded {
744            include_site: annotation.location.clone(),
745            limit: state.config.max_total_includes,
746        });
747    }
748
749    let ctx = build_label_ctx(annotation);
750
751    let wire_node = match state.registry.dispatch_resolve_raw(&ctx) {
752        Ok(Some(node)) => node,
753        Ok(None) => {
754            // Handler returned "nothing to splice" — leave the
755            // annotation in place. The caller still needs to walk
756            // its body for nested invocations (built-in lex.include
757            // never returns None; this path is reachable only via
758            // third-party handlers that opt out per-invocation).
759            return Ok(ResolveOutcome::Unexpanded);
760        }
761        Err(handler_err) => {
762            return Err(handler_error_to_include_error(
763                &handler_err,
764                label,
765                &annotation.location,
766            ));
767        }
768    };
769
770    state.total_resolved += 1;
771
772    // Decode the wire payload into typed lex-core ContentItems.
773    let mut splice_items = decode_wire_to_items(&wire_node, label, &annotation.location)?;
774
775    // Recurse into the spliced subtree FIRST so nested resolve-hooked
776    // annotations are processed before the splice lands. Validation
777    // must wait until *after* this step: a nested invocation can
778    // splice in content (e.g. a top-level `Session` from a chained
779    // `lex.include`) that wasn't in the handler's original output,
780    // and the final shape is what has to satisfy the parent
781    // container's policy.
782    //
783    // The `IncludeError::ContainerPolicy.file` field describes the
784    // *spliced content's* source file (the file containing the
785    // disallowed shape), not the invocation site. Take it from the
786    // handler-returned wire payload's origin when present, falling
787    // back to the first decoded item's origin path if the wire
788    // payload didn't stamp a `Document` origin.
789    let included_path = wire_node_origin_pathbuf(&wire_node)
790        .or_else(|| splice_items_first_origin(&splice_items))
791        .unwrap_or_default();
792    state.chain.push(key);
793    let saved_depth = state.depth;
794    state.depth = saved_depth + 1;
795    let recurse_result = splice_in_session_container(&mut splice_items, state);
796    state.depth = saved_depth;
797    state.chain.pop();
798    recurse_result?;
799
800    // Container-policy validation: enforce no-Sessions inside
801    // `GeneralContainer` (Definition / Annotation body / ListItem).
802    // Runs against the post-recursion splice list so nested
803    // expansions can't smuggle disallowed shapes past the check.
804    validate_against_kind(
805        &splice_items,
806        parent_kind,
807        &annotation.location,
808        &included_path,
809    )?;
810
811    Ok(ResolveOutcome::Spliced(splice_items))
812}
813
814/// Build a [`LabelCtx`] from a lex-core [`Annotation`]. The body is
815/// derived from the annotation's children (parsed-Lex form), the
816/// params from `Annotation::data::parameters`, and the host node info
817/// from `Annotation::location`.
818fn build_label_ctx(
819    a: &crate::lex::ast::elements::annotation::Annotation,
820) -> lex_extension::wire::LabelCtx {
821    use crate::lex::wire::to_wire_node;
822    use lex_extension::wire::{AnnotationBody, LabelCtx, NodeRef};
823
824    let label = a.data.label.value.clone();
825    let params = {
826        // Pass *semantic* parameter values to handlers (quotes
827        // stripped, escape sequences resolved). Handlers consume
828        // params as JSON values, where there is no "quoted string"
829        // vs "unquoted token" distinction; only the decoded value
830        // is meaningful. The codec's `parameters_to_json` (used by
831        // `annotation_to_wire` for round-tripping annotation
832        // *content*) keeps the raw form to preserve source — the
833        // two paths intentionally differ.
834        let mut obj = serde_json::Map::with_capacity(a.data.parameters.len());
835        for p in &a.data.parameters {
836            obj.insert(p.key.clone(), serde_json::Value::String(p.unquoted_value()));
837        }
838        serde_json::Value::Object(obj)
839    };
840    let body = if a.children.is_empty() {
841        AnnotationBody::None
842    } else {
843        let wire_children: Vec<lex_extension::wire::WireNode> =
844            a.children.iter().map(to_wire_node).collect();
845        AnnotationBody::Lex {
846            children: wire_children,
847        }
848    };
849    let range = lex_extension::wire::Range::new(
850        lex_extension::wire::Position::new(
851            u32::try_from(a.location.start.line).unwrap_or(u32::MAX),
852            u32::try_from(a.location.start.column).unwrap_or(u32::MAX),
853        ),
854        lex_extension::wire::Position::new(
855            u32::try_from(a.location.end.line).unwrap_or(u32::MAX),
856            u32::try_from(a.location.end.column).unwrap_or(u32::MAX),
857        ),
858    );
859    let origin = a
860        .location
861        .origin_path
862        .as_ref()
863        .map(|p| p.to_string_lossy().into_owned());
864    LabelCtx {
865        label,
866        params,
867        body,
868        node: NodeRef {
869            kind: "annotation".into(),
870            range,
871            origin,
872        },
873    }
874}
875
876/// Convert a handler-returned [`WireNode`] back into a list of
877/// [`ContentItem`]s ready for splicing. `WireNode::Document` is
878/// unwrapped (its children become the splice list); any other root
879/// shape is wrapped as a single-item list.
880///
881/// `invocation_label` is the label whose handler produced `wire` —
882/// threaded through so wire-decode failures are attributed to the
883/// real namespace rather than a hardcoded `lex.include`. A
884/// third-party `acme.expand` handler that returns malformed wire
885/// will surface as `IncludeError::HandlerFailed { label:
886/// "acme.expand", .. }`.
887/// Lift a [`WireNode`]'s top-level `origin` field into a `PathBuf`
888/// when present. Used by the resolve pass to attribute
889/// container-policy errors to the *spliced content's* source file
890/// rather than the invocation site.
891fn wire_node_origin_pathbuf(node: &lex_extension::wire::WireNode) -> Option<PathBuf> {
892    use lex_extension::wire::WireNode as W;
893    let s = match node {
894        W::Document { origin, .. } => origin.as_deref(),
895        W::Session { origin, .. } => origin.as_deref(),
896        W::Definition { origin, .. } => origin.as_deref(),
897        W::Paragraph { origin, .. } => origin.as_deref(),
898        W::List { origin, .. } => origin.as_deref(),
899        W::Verbatim { origin, .. } => origin.as_deref(),
900        W::Table { origin, .. } => origin.as_deref(),
901        W::Annotation { origin, .. } => origin.as_deref(),
902        W::Blank { origin, .. } => origin.as_deref(),
903        _ => None,
904    };
905    s.map(PathBuf::from)
906}
907
908/// Fallback when `WireNode::Document.origin` is unset: walk the
909/// decoded splice list and return the first item that carries an
910/// origin. The interner from `from_wire_node` ensures every item
911/// shares one Arc per origin string, so iterating is cheap.
912fn splice_items_first_origin(items: &[ContentItem]) -> Option<PathBuf> {
913    for item in items {
914        let r = match item {
915            ContentItem::Paragraph(p) => &p.location,
916            ContentItem::Session(s) => &s.location,
917            ContentItem::Definition(d) => &d.location,
918            ContentItem::List(l) => &l.location,
919            ContentItem::ListItem(li) => &li.location,
920            ContentItem::Annotation(a) => &a.location,
921            ContentItem::VerbatimBlock(v) => &v.location,
922            ContentItem::VerbatimLine(vl) => &vl.location,
923            ContentItem::Table(t) => &t.location,
924            ContentItem::TextLine(tl) => &tl.location,
925            ContentItem::BlankLineGroup(blg) => &blg.location,
926        };
927        if let Some(arc) = r.origin_path.as_ref() {
928            return Some((**arc).clone());
929        }
930    }
931    None
932}
933
934fn decode_wire_to_items(
935    wire: &lex_extension::wire::WireNode,
936    invocation_label: &str,
937    include_site: &Range,
938) -> Result<Vec<ContentItem>, IncludeError> {
939    use crate::lex::wire::from_wire_node;
940
941    from_wire_node(wire).map_err(|e| IncludeError::HandlerFailed {
942        include_site: include_site.clone(),
943        label: invocation_label.to_string(),
944        code: "wire.decode".into(),
945        message: format!("decoding handler-returned wire payload failed: {e}"),
946    })
947}
948
949/// Map a [`HandlerError`] returned by the registry into the most
950/// specific [`IncludeError`] variant available. Codes in the
951/// `-32001..=-32005` range emitted by [`crate::lex::builtins::LexIncludeHandler`]
952/// translate back to their corresponding pre-extension-system
953/// variants so existing CLI/LSP error rendering and the integration
954/// test suite keep working unchanged. Unknown codes (third-party
955/// namespaces, future built-ins) surface as `HandlerFailed`.
956fn handler_error_to_include_error(
957    err: &HandlerError,
958    label: &str,
959    include_site: &Range,
960) -> IncludeError {
961    use crate::lex::builtins::include::{
962        CODE_ABSOLUTE_PATH, CODE_IO, CODE_MISSING_SRC, CODE_NOT_FOUND, CODE_OUTSIDE_ROOT,
963        CODE_PARSE_FAILED, CODE_TOO_LARGE,
964    };
965
966    match err {
967        HandlerError::Custom {
968            code,
969            message,
970            data,
971        } => match *code {
972            CODE_NOT_FOUND => IncludeError::NotFound {
973                include_site: include_site.clone(),
974                path: data_str(data, "path")
975                    .map(PathBuf::from)
976                    .unwrap_or_default(),
977            },
978            CODE_OUTSIDE_ROOT => IncludeError::RootEscape {
979                path: data_str(data, "path")
980                    .map(PathBuf::from)
981                    .unwrap_or_default(),
982                root: data_str(data, "root")
983                    .map(PathBuf::from)
984                    .unwrap_or_default(),
985            },
986            CODE_TOO_LARGE => IncludeError::FileTooLarge {
987                include_site: include_site.clone(),
988                path: data_str(data, "path")
989                    .map(PathBuf::from)
990                    .unwrap_or_default(),
991                size: data_u64(data, "size").unwrap_or(0),
992                limit: data_u64(data, "limit").unwrap_or(0),
993            },
994            CODE_ABSOLUTE_PATH => IncludeError::AbsolutePath {
995                path: data_str(data, "path")
996                    .map(PathBuf::from)
997                    .unwrap_or_default(),
998            },
999            CODE_IO => IncludeError::LoaderIo {
1000                path: data_str(data, "path")
1001                    .map(PathBuf::from)
1002                    .unwrap_or_default(),
1003                message: message.clone(),
1004            },
1005            CODE_MISSING_SRC => IncludeError::MissingSrc {
1006                include_site: include_site.clone(),
1007            },
1008            CODE_PARSE_FAILED => IncludeError::ParseFailed {
1009                path: data_str(data, "path")
1010                    .map(PathBuf::from)
1011                    .unwrap_or_default(),
1012                message: data_str(data, "message").unwrap_or_else(|| message.clone()),
1013            },
1014            other => IncludeError::HandlerFailed {
1015                include_site: include_site.clone(),
1016                label: label.to_string(),
1017                code: format!("handler.custom({other})"),
1018                message: message.clone(),
1019            },
1020        },
1021        HandlerError::Internal { message } => IncludeError::HandlerFailed {
1022            include_site: include_site.clone(),
1023            label: label.to_string(),
1024            code: "handler.internal".into(),
1025            message: message.clone(),
1026        },
1027        HandlerError::Unsupported { detail } => IncludeError::HandlerFailed {
1028            include_site: include_site.clone(),
1029            label: label.to_string(),
1030            code: "handler.unsupported".into(),
1031            message: detail.clone(),
1032        },
1033    }
1034}
1035
1036fn data_str(data: &Option<serde_json::Value>, key: &str) -> Option<String> {
1037    data.as_ref()?.get(key)?.as_str().map(str::to_string)
1038}
1039
1040fn data_u64(data: &Option<serde_json::Value>, key: &str) -> Option<u64> {
1041    data.as_ref()?.get(key)?.as_u64()
1042}
1043
1044#[allow(clippy::ptr_arg)]
1045fn recurse_into_children(
1046    children: &mut Vec<ContentItem>,
1047    state: &mut ResolverState<'_>,
1048) -> Result<(), IncludeError> {
1049    for item in children.iter_mut() {
1050        match item {
1051            ContentItem::Session(s) => {
1052                splice_in_session_container(s.children.as_mut_vec(), state)?;
1053            }
1054            ContentItem::Definition(d) => {
1055                splice_in_general_container(&mut d.children, state, ContainerKind::Definition)?;
1056            }
1057            ContentItem::Annotation(a) => {
1058                // Skip the body of annotations whose schema declares
1059                // `hooks.resolve = true` — those are dispatched at the
1060                // parent level by `process_resolves`. Walking their
1061                // bodies *here* would trip the resolve again on the
1062                // same invocation.
1063                //
1064                // The body is still walked when the resolve actually
1065                // runs: `process_resolves` calls
1066                // `resolve_one_invocation`, and the
1067                // [`ResolveOutcome::Spliced`] arm walks the splice
1068                // subtree (which replaces the annotation), while the
1069                // [`ResolveOutcome::Unexpanded`] arm explicitly
1070                // walks the kept annotation's body via
1071                // `splice_in_general_container`. So nested
1072                // resolve-hooked annotations inside an unexpanded
1073                // outer annotation are still reached.
1074                //
1075                // Non-resolve-hooked annotations recurse normally
1076                // here so their nested bodies get processed.
1077                let is_resolve_hooked = state
1078                    .registry
1079                    .schema_for(&a.data.label.value)
1080                    .map(|s| s.hooks.resolve)
1081                    .unwrap_or(false);
1082                if !is_resolve_hooked {
1083                    splice_in_general_container(
1084                        &mut a.children,
1085                        state,
1086                        ContainerKind::AnnotationBody,
1087                    )?;
1088                }
1089            }
1090            ContentItem::List(l) => {
1091                for li in l.items.as_mut_vec().iter_mut() {
1092                    if let ContentItem::ListItem(item) = li {
1093                        splice_in_general_container(
1094                            &mut item.children,
1095                            state,
1096                            ContainerKind::ListItem,
1097                        )?;
1098                    }
1099                }
1100            }
1101            _ => {}
1102        }
1103    }
1104    Ok(())
1105}
1106
1107fn validate_against_kind(
1108    items: &[ContentItem],
1109    kind: ContainerKind,
1110    site: &Range,
1111    file: &Path,
1112) -> Result<(), IncludeError> {
1113    if kind.allows_sessions() {
1114        return Ok(());
1115    }
1116    if items.iter().any(|i| matches!(i, ContentItem::Session(_))) {
1117        return Err(IncludeError::ContainerPolicy {
1118            include_site: site.clone(),
1119            container: kind.name(),
1120            file: file.to_path_buf(),
1121            violation: "Sessions",
1122        });
1123    }
1124    Ok(())
1125}
1126
1127// ============================================================================
1128// Path resolution
1129// ============================================================================
1130
1131/// Resolve a file-reference target string the same way the include
1132/// resolver resolves include paths.
1133///
1134/// Use this when consuming `ReferenceType::File { target }` (or any other
1135/// node-attached path) so that relative paths resolve from the *authoring*
1136/// file's directory, not from wherever the merged document happens to be
1137/// rooted. Pass `ref_origin` as the [`Range::origin_path`] of the inline's
1138/// containing node (or `None` if the node was never stamped — in that case
1139/// the path is treated as if authored at the root).
1140///
1141/// Behaviour matches the include resolver:
1142/// - Root-absolute targets (leading `/`) resolve under `root`.
1143/// - Other targets resolve relative to `ref_origin`'s parent (or `root`
1144///   when `ref_origin` is `None`).
1145/// - The result is lexically normalized and checked against `root` —
1146///   paths that escape it return `RootEscape`.
1147///
1148/// This is a sister to the resolver's internal `resolve_path` and shares
1149/// the same lexical-normalization caveat: it does not touch the filesystem.
1150pub fn resolve_file_reference(
1151    target: &str,
1152    ref_origin: Option<&Path>,
1153    root: &Path,
1154) -> Result<PathBuf, IncludeError> {
1155    let host_dir: PathBuf = ref_origin
1156        .and_then(|p| p.parent())
1157        .map(Path::to_path_buf)
1158        .unwrap_or_else(|| root.to_path_buf());
1159    resolve_path(target, &host_dir, root)
1160}
1161
1162fn resolve_path(src: &str, host_dir: &Path, root: &Path) -> Result<PathBuf, IncludeError> {
1163    let candidate = if let Some(rel) = src.strip_prefix('/') {
1164        // Root-absolute (Lex spec convention): leading `/` means "from
1165        // the resolution root", not "filesystem root".
1166        root.join(rel)
1167    } else {
1168        // Anything else must be a relative path. Reject inputs the
1169        // host platform would treat as absolute (Windows `C:\foo`,
1170        // `\\server\share`, `\foo`) up front: the spec forbids
1171        // platform-absolute paths from entering the resolution
1172        // pipeline. Without this, `host_dir.join(src)` would silently
1173        // discard `host_dir` because Rust's `PathBuf::join` replaces
1174        // the base when the joined path is absolute. The downstream
1175        // root-escape check would still catch the security side, but
1176        // we'd surface a misleading "escapes root" error instead of
1177        // "absolute paths not allowed", and we'd be relying on
1178        // `PathBuf::join`'s override semantics for the security
1179        // outcome rather than holding the line at the input boundary.
1180        if Path::new(src).is_absolute() {
1181            return Err(IncludeError::AbsolutePath {
1182                path: PathBuf::from(src),
1183            });
1184        }
1185        host_dir.join(src)
1186    };
1187    let normalized = lexical_normalize(&candidate);
1188    let canonical_root = lexical_normalize(root);
1189    if !normalized.starts_with(&canonical_root) {
1190        return Err(IncludeError::RootEscape {
1191            path: normalized,
1192            root: canonical_root,
1193        });
1194    }
1195    Ok(normalized)
1196}
1197
/// Normalize a path purely lexically: drop `.` components and collapse
/// `name/..` pairs, without consulting the filesystem.
///
/// `std::fs::canonicalize` needs the path to exist, which breaks tests
/// built on [`MemoryLoader`]. A lexical pass is enough for include-site
/// resolution: the resolver only needs a stable identity for cycle
/// detection and a uniform shape for the root-escape prefix check.
///
/// A `..` cancels the previous component only when that component is a
/// real directory name (`Component::Normal`). If the buffer is empty,
/// or ends in another `..` or a root marker, the `..` is *kept*.
///
/// Keeping it is what stops `../../etc/passwd` from collapsing to
/// `etc/passwd`. A plain `PathBuf::pop` would strip a trailing `..`
/// (`Path::new("..").parent()` is `Some("")`), losing the second `..`
/// and yielding a path that falsely starts with the root prefix. Each
/// unmatched `..` that survives keeps the result outside any sane
/// root, so the escape check fires.
fn lexical_normalize(p: &Path) -> PathBuf {
    use std::path::Component;

    p.components().fold(PathBuf::new(), |mut acc, component| {
        match component {
            Component::CurDir => {}
            Component::ParentDir => {
                let ends_in_dir_name =
                    matches!(acc.components().next_back(), Some(Component::Normal(_)));
                if ends_in_dir_name {
                    acc.pop();
                } else {
                    acc.push("..");
                }
            }
            other => acc.push(other.as_os_str()),
        }
        acc
    })
}
1239
1240// ============================================================================
1241// Origin stamping
1242// ============================================================================
1243//
1244// Walk every node in a Document and set `Range.origin_path` on each
1245// `.location` field. The walk only stamps the *block-level* `.location`
1246// fields here; finer-grained inline ranges land in PR 6 when file-ref
1247// resolution starts consulting them.
1248
1249pub(crate) fn stamp_doc(doc: &mut Document, origin: &Arc<PathBuf>) {
1250    if let Some(title) = doc.title.as_mut() {
1251        title.location.origin_path = Some(Arc::clone(origin));
1252    }
1253    for ann in doc.annotations.iter_mut() {
1254        stamp_annotation(ann, origin);
1255    }
1256    stamp_session(&mut doc.root, origin);
1257}
1258
1259fn stamp_session(s: &mut Session, origin: &Arc<PathBuf>) {
1260    s.location.origin_path = Some(Arc::clone(origin));
1261    if let Some(loc) = s.title.location.as_mut() {
1262        loc.origin_path = Some(Arc::clone(origin));
1263    }
1264    for ann in s.annotations.iter_mut() {
1265        stamp_annotation(ann, origin);
1266    }
1267    for item in s.children.as_mut_vec().iter_mut() {
1268        stamp_item(item, origin);
1269    }
1270}
1271
1272fn stamp_annotation(
1273    a: &mut crate::lex::ast::elements::annotation::Annotation,
1274    origin: &Arc<PathBuf>,
1275) {
1276    a.location.origin_path = Some(Arc::clone(origin));
1277    a.data.location.origin_path = Some(Arc::clone(origin));
1278    for item in a.children.as_mut_vec().iter_mut() {
1279        stamp_item(item, origin);
1280    }
1281}
1282
1283fn stamp_item(item: &mut ContentItem, origin: &Arc<PathBuf>) {
1284    match item {
1285        ContentItem::Session(s) => stamp_session(s, origin),
1286        ContentItem::Annotation(a) => stamp_annotation(a, origin),
1287        ContentItem::Paragraph(p) => {
1288            p.location.origin_path = Some(Arc::clone(origin));
1289            for ann in p.annotations.iter_mut() {
1290                stamp_annotation(ann, origin);
1291            }
1292            for line in p.lines.iter_mut() {
1293                stamp_item(line, origin);
1294            }
1295        }
1296        ContentItem::List(l) => {
1297            l.location.origin_path = Some(Arc::clone(origin));
1298            for li in l.items.as_mut_vec().iter_mut() {
1299                stamp_item(li, origin);
1300            }
1301        }
1302        ContentItem::ListItem(li) => {
1303            li.location.origin_path = Some(Arc::clone(origin));
1304            for ann in li.annotations.iter_mut() {
1305                stamp_annotation(ann, origin);
1306            }
1307            for child in li.children.as_mut_vec().iter_mut() {
1308                stamp_item(child, origin);
1309            }
1310        }
1311        ContentItem::Definition(d) => {
1312            d.location.origin_path = Some(Arc::clone(origin));
1313            for ann in d.annotations.iter_mut() {
1314                stamp_annotation(ann, origin);
1315            }
1316            for child in d.children.as_mut_vec().iter_mut() {
1317                stamp_item(child, origin);
1318            }
1319        }
1320        ContentItem::VerbatimBlock(v) => {
1321            v.location.origin_path = Some(Arc::clone(origin));
1322        }
1323        ContentItem::VerbatimLine(vl) => {
1324            vl.location.origin_path = Some(Arc::clone(origin));
1325        }
1326        ContentItem::Table(t) => {
1327            t.location.origin_path = Some(Arc::clone(origin));
1328        }
1329        ContentItem::TextLine(tl) => {
1330            tl.location.origin_path = Some(Arc::clone(origin));
1331        }
1332        ContentItem::BlankLineGroup(b) => {
1333            b.location.origin_path = Some(Arc::clone(origin));
1334        }
1335    }
1336}
1337
1338// ============================================================================
1339// Parser glue
1340// ============================================================================
1341
/// Parse `source` into a Document but skip the annotation-attachment stage,
/// so include annotations are findable in container children lists.
///
/// Thin forwarder: the result of
/// `crate::lex::testing::parse_without_annotation_attachment` is returned
/// unchanged, with parse failures reported as a `String`.
// NOTE(review): this reaches into the `testing` module from non-test code —
// confirm that module is compiled outside `cfg(test)` builds.
pub(crate) fn parse_no_attach(source: &str) -> Result<Document, String> {
    crate::lex::testing::parse_without_annotation_attachment(source)
}
1347
1348// ============================================================================
1349// Filesystem-backed loader
1350// ============================================================================
1351
/// [`Loader`] that reads files from the filesystem with `std::fs::read_to_string`.
///
/// This is the production loader used by the CLI; the LSP wraps it with a
/// file-watch invalidation layer in PR 8. lex-core's *resolver* code does not
/// reference `std::fs` — `FsLoader` is the one place where it does, isolated
/// behind the [`Loader`] trait so the rest of the crate stays sandbox- and
/// WASM-friendly.
///
/// `FsLoader` is constructed with the resolution root and rechecks every
/// load against it post-`fs::canonicalize`, so a symlink pointing outside
/// the root is rejected even though the lexical-only check in
/// [`resolve_path`] cannot see it. Also rejects non-regular files (devices,
/// FIFOs, directories) before reading, so the loader can't be tricked into
/// blocking on `/dev/zero` or allocating against an open device.
///
/// Errors map (in the order the checks run):
/// - canonicalization fails with `ErrorKind::NotFound` →
///   [`LoadError::NotFound`]
/// - canonicalization fails with any other error kind → [`LoadError::Io`]
/// - canonical path doesn't sit under canonical root → [`LoadError::OutsideRoot`]
/// - reading the target's metadata fails → [`LoadError::Io`]
/// - target is not a regular file → [`LoadError::Io`] with a clear message
/// - target is larger than `max_file_size` → [`LoadError::TooLarge`]
/// - any other I/O error during read → [`LoadError::Io`]
pub struct FsLoader {
    /// Filesystem-canonical resolution root. Constructed once at
    /// `FsLoader::new`; if canonicalization fails (e.g., the configured
    /// root doesn't exist on disk), we fall back to the input verbatim
    /// and the bounds check will simply never pass — visible to the user
    /// as a `LoadError::OutsideRoot` instead of silently disabling the
    /// security check.
    canonical_root: PathBuf,
    /// Per-file size cap (bytes). Loads of larger files surface as
    /// `LoadError::TooLarge` before any bytes are read into memory.
    /// Default [`FsLoader::DEFAULT_MAX_FILE_SIZE`].
    max_file_size: u64,
}
1386
1387impl FsLoader {
1388    /// Default per-file size cap: 10 MiB. Generous for realistic Lex
1389    /// source documents (text only) and tight enough to bound memory
1390    /// allocation per include against an adversarial 1 GB file.
1391    pub const DEFAULT_MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
1392
1393    /// Construct a loader rooted at `root` with default size limits.
1394    /// The loader stores `root`'s fs-canonical form (with symlinks
1395    /// resolved); subsequent loads validate that the requested path's
1396    /// canonical form lives under it.
1397    pub fn new(root: PathBuf) -> Self {
1398        let canonical_root = std::fs::canonicalize(&root).unwrap_or(root);
1399        Self {
1400            canonical_root,
1401            max_file_size: Self::DEFAULT_MAX_FILE_SIZE,
1402        }
1403    }
1404
1405    /// Override the default per-file size cap (bytes). Use to widen the
1406    /// limit for projects with genuinely large source files, or tighten
1407    /// it for stricter sandboxes (e.g., LSPs serving untrusted content).
1408    pub fn with_max_file_size(mut self, max_file_size: u64) -> Self {
1409        self.max_file_size = max_file_size;
1410        self
1411    }
1412}
1413
1414impl Loader for FsLoader {
1415    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
1416        // 1. Canonicalize. Resolves symlinks and `..` segments against the
1417        //    real filesystem. NotFound / broken-symlink / permission errors
1418        //    all surface here.
1419        let canonical_path = std::fs::canonicalize(path).map_err(|e| match e.kind() {
1420            std::io::ErrorKind::NotFound => LoadError::NotFound {
1421                path: path.to_path_buf(),
1422            },
1423            _ => LoadError::Io {
1424                path: path.to_path_buf(),
1425                message: e.to_string(),
1426            },
1427        })?;
1428
1429        // 2. Bounds check against the *canonical* root. This is the
1430        //    actual security gate against symlink traversal — the lexical
1431        //    check in resolve_path can't see through symlinks.
1432        if !canonical_path.starts_with(&self.canonical_root) {
1433            return Err(LoadError::OutsideRoot {
1434                path: canonical_path,
1435                root: self.canonical_root.clone(),
1436            });
1437        }
1438
1439        // 3. Reject non-regular files. Without this, an attacker (with
1440        //    write access to the repo) could symlink an include target to
1441        //    `/dev/zero` or a FIFO and block / OOM the reader. The
1442        //    is_file() metadata call is a cheap sanity check.
1443        let meta = std::fs::metadata(&canonical_path).map_err(|e| LoadError::Io {
1444            path: canonical_path.clone(),
1445            message: e.to_string(),
1446        })?;
1447        if !meta.is_file() {
1448            return Err(LoadError::Io {
1449                path: canonical_path,
1450                message: "include target is not a regular file".to_string(),
1451            });
1452        }
1453
1454        // 4. Size cap. Bounds memory allocation per include against an
1455        //    adversarial 1 GB file before any bytes hit the heap.
1456        let size = meta.len();
1457        if size > self.max_file_size {
1458            return Err(LoadError::TooLarge {
1459                path: canonical_path,
1460                size,
1461                limit: self.max_file_size,
1462            });
1463        }
1464
1465        // 5. Read. By this point we know the path is a regular file under
1466        //    the canonical root and within the size cap; anything that
1467        //    fails here is a real I/O error worth surfacing.
1468        let source = std::fs::read_to_string(&canonical_path).map_err(|e| LoadError::Io {
1469            path: canonical_path.clone(),
1470            message: e.to_string(),
1471        })?;
1472
1473        Ok(LoadedFile {
1474            source,
1475            canonical_path,
1476        })
1477    }
1478}
1479
1480// ============================================================================
1481// Test fixtures (test-support feature + cfg(test))
1482// ============================================================================
1483
/// In-memory [`Loader`] backed by a `HashMap<PathBuf, String>`.
///
/// Only compiled for unit tests or when the `test-support` feature is
/// enabled. Derives `Debug` and `Clone` so fixtures can be printed in
/// assertion failures and duplicated between test cases.
#[cfg(any(test, feature = "test-support"))]
#[derive(Debug, Clone)]
pub struct MemoryLoader {
    /// Registered sources, keyed by the path they were inserted under.
    files: std::collections::HashMap<PathBuf, String>,
}
1489
#[cfg(any(test, feature = "test-support"))]
impl MemoryLoader {
    /// Build a loader with no files registered. Populate it with
    /// [`MemoryLoader::insert`].
    pub fn new() -> Self {
        let files = std::collections::HashMap::new();
        Self { files }
    }

    /// Store `contents` as the source text for `path`, replacing any text
    /// previously registered under the same path. Returns `self` so calls
    /// can be chained.
    pub fn insert<P: Into<PathBuf>, S: Into<String>>(&mut self, path: P, contents: S) -> &mut Self {
        let key: PathBuf = path.into();
        let text: String = contents.into();
        self.files.insert(key, text);
        self
    }

    /// Shorthand for creating a loader and inserting every
    /// `(path, contents)` pair yielded by `pairs`, in iteration order.
    pub fn from_pairs<I, P, S>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (P, S)>,
        P: Into<PathBuf>,
        S: Into<String>,
    {
        pairs
            .into_iter()
            .fold(Self::new(), |mut built, (path, contents)| {
                built.insert(path, contents);
                built
            })
    }
}
1520
#[cfg(any(test, feature = "test-support"))]
impl Default for MemoryLoader {
    /// Delegates to [`MemoryLoader::new`].
    fn default() -> Self {
        MemoryLoader::new()
    }
}
1527
#[cfg(any(test, feature = "test-support"))]
impl Loader for MemoryLoader {
    /// Look `path` up in the in-memory table.
    ///
    /// Returns a copy of the registered text, or [`LoadError::NotFound`]
    /// when nothing was inserted under exactly this path.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
        // There are no symlinks in memory, so the lookup key doubles as
        // the canonical identity. The resolver's cycle detection compares
        // `LoadedFile::canonical_path` values; in tests those line up with
        // the lexically-normalized paths the resolver already produces.
        match self.files.get(path) {
            Some(text) => Ok(LoadedFile {
                source: text.clone(),
                canonical_path: path.to_path_buf(),
            }),
            None => Err(LoadError::NotFound {
                path: path.to_path_buf(),
            }),
        }
    }
}
1548
1549// ============================================================================
1550// Tests
1551// ============================================================================
1552
1553#[cfg(test)]
1554mod tests;