Skip to main content

lex_core/lex/
includes.rs

1//! Include resolution for Lex documents.
2//!
3//! This module turns `:: lex.include src="..." ::` annotations into spliced
4//! content from the referenced files. It is *opt-in*: callers that want the
5//! unresolved tree (the formatter, tree-sitter parity, editor tooling that
6//! displays include statements as authored) skip this pass entirely. The
7//! parser itself never touches the filesystem — all I/O goes through the
8//! injected [`Loader`] trait.
9//!
10//! See `comms/specs/proposals/includes.lex` for the full design.
11//!
12//! # Status
13//!
//! This module is being built up across PRs 3–7:
15//!
16//! - PR 3: skeleton — trait, config, errors, stub.
17//! - PR 4: single-pass splice + container-policy validation +
18//!   doc-title/doc-annotation conversion + origin stamping + root-escape
19//!   check.
20//! - PR 5: recursive resolution into included files + cycle detection
21//!   (chain stack) + depth limit. Each loaded file gets walked in its OWN
22//!   directory, so relative paths inside an included file resolve from
23//!   that file's directory, not the entry's.
24//! - PR 6: origin-aware reference helpers. [`resolve_file_reference`]
25//!   resolves a `ReferenceType::File` target from the authoring file's
26//!   directory using `Range.origin_path`.
27//!   `Document::find_annotation_by_label_in_origin` scopes footnote
28//!   lookups to the file the reference was authored in.
29//! - PR 7 (this PR): [`FsLoader`] — production loader that reads from the
30//!   filesystem with `std::fs::read_to_string`. CLI wires the resolver
31//!   into `lex convert` and `lex inspect` (default-on, opt-out via
32//!   `--no-includes`); `lex format` never expands.
33//!
34//! # Layering
35//!
36//! Of all of lex-core, only [`FsLoader`] references `std::fs`. The
37//! resolver itself does no I/O — it always goes through the [`Loader`]
38//! trait. Callers can swap loaders to keep the resolver sandboxed:
39//!
40//! - The LSP wraps [`FsLoader`] with file-watch invalidation (PR 8).
41//! - WASM builds provide a JS-backed loader instead of [`FsLoader`].
42//! - Tests use [`MemoryLoader`] (gated behind `test-support`).
43//!
44//! For tests, lex-core itself ships [`MemoryLoader`] gated behind the
45//! `test-support` cargo feature. It is not intended for production use.
46
47// `IncludeError` carries diagnostic context (paths, source ranges,
48// handler messages) on every variant; the `result_large_err` lint
49// would have us box the whole error or split it into a thinner shape
50// just to satisfy the size heuristic. The enum is already part of
51// the public API and the error path is rare; suppress the lint for
52// this module rather than churn the public surface.
53#![allow(clippy::result_large_err)]
54
55use crate::lex::assembling::stages::{ApplyTableConfig, NormalizeLabels};
56use crate::lex::assembling::AttachAnnotations;
57use crate::lex::ast::elements::container::GeneralContainer;
58use crate::lex::ast::elements::content_item::ContentItem;
59use crate::lex::ast::elements::session::Session;
60use crate::lex::ast::range::Range;
61use crate::lex::ast::Document;
62use crate::lex::transforms::Runnable;
63use lex_extension::handler::HandlerError;
64use lex_extension_host::registry::Registry;
65use std::path::{Path, PathBuf};
66use std::sync::Arc;
67
/// Limits and path boundary that govern one include resolution pass.
#[derive(Debug, Clone)]
pub struct ResolveConfig {
    /// Boundary directory for include resolution. An include whose path
    /// canonicalizes outside this directory is reported as
    /// [`IncludeError::RootEscape`].
    ///
    /// The path must be **absolute**. Lexical normalization treats `.`
    /// and `..` against an empty buffer as no-ops, so a relative or
    /// unnormalized root weakens the root-escape prefix check. Callers
    /// (CLI, LSP) are expected to canonicalize the root before building
    /// a `ResolveConfig`.
    pub root: PathBuf,
    /// Longest permitted include chain. Defaults to
    /// [`ResolveConfig::DEFAULT_MAX_DEPTH`] (8). Reaching the limit is
    /// an error; content is never silently truncated.
    pub max_depth: usize,
    /// Upper bound on the number of `lex.include` annotations resolved
    /// over the whole tree (depth × breadth). Defaults to
    /// [`ResolveConfig::DEFAULT_MAX_TOTAL_INCLUDES`] (1000).
    ///
    /// `max_depth` limits chain length but says nothing about breadth:
    /// a document with 100 thousand top-level includes at depth 1 stays
    /// within `max_depth` yet can still OOM the resolver / LSP / CI.
    /// Reaching this limit is an error; content is never silently
    /// truncated.
    pub max_total_includes: usize,
}

impl ResolveConfig {
    /// Default include depth limit. Deep enough for any reasonable
    /// atomization strategy (aggregator → per-chapter → per-section)
    /// while keeping the resolver's worst-case work predictable.
    pub const DEFAULT_MAX_DEPTH: usize = 8;

    /// Default cap on the total include count (DoS bound). Roomy enough
    /// for a book-length document made of thousands of small fragments,
    /// small enough to contain adversarial fan-out within a few seconds
    /// of resolver work.
    pub const DEFAULT_MAX_TOTAL_INCLUDES: usize = 1000;

    /// Build a config rooted at `root` with both limits at their defaults.
    pub fn with_root(root: PathBuf) -> Self {
        let max_depth = Self::DEFAULT_MAX_DEPTH;
        let max_total_includes = Self::DEFAULT_MAX_TOTAL_INCLUDES;
        ResolveConfig {
            root,
            max_depth,
            max_total_includes,
        }
    }
}
115
/// A pluggable source-text loader.
///
/// Implementations decide where bytes come from (filesystem, in-memory map,
/// virtual filesystem, content-addressed store, …). The resolver performs
/// all of its I/O through this trait rather than referencing `std::fs`
/// directly; that keeps the resolver pure and usable in WASM, sandboxes,
/// and unit tests.
pub trait Loader {
    /// Load the source text for `path` and return both the contents and a
    /// canonical identity for the loaded resource. The path is what the
    /// resolver decided on after applying the rules in §4 of the proposal.
    ///
    /// `LoadedFile::canonical_path` is the loader's authoritative identity
    /// for the resource. For [`FsLoader`] this is the filesystem-canonical
    /// path (symlinks resolved, case-folded if the underlying FS is
    /// case-insensitive); for [`MemoryLoader`] it's the lookup key (since
    /// memory loaders have no symlinks). The resolver uses this for cycle
    /// detection and for stamping `Range.origin_path` on the loaded tree.
    ///
    /// # Errors
    ///
    /// Returns a [`LoadError`] when the resource cannot be provided; the
    /// variants distinguish a missing resource, a resource outside the
    /// loader's boundary, an oversized resource, and other I/O failures.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError>;
}
135
/// Result of a successful [`Loader::load`]: the text that was read plus
/// the identity the loader assigned to the resource.
#[derive(Debug, Clone)]
pub struct LoadedFile {
    /// The file's source text.
    pub source: String,
    /// The loader's authoritative identity for the resource. See
    /// [`Loader::load`] for how loaders decide this.
    pub canonical_path: PathBuf,
}
145
/// Failure modes of a [`Loader`].
#[derive(Debug, Clone)]
pub enum LoadError {
    /// Nothing exists at the requested path as far as the loader can tell.
    NotFound { path: PathBuf },
    /// The resource exists but lies outside the boundary the loader is
    /// allowed to serve. The lexical resolver already normalizes `..` in
    /// the requested path; loaders backed by a real filesystem must check
    /// again after canonicalization, because a symlink can escape the
    /// boundary through a path that is lexically inside it.
    OutsideRoot { path: PathBuf, root: PathBuf },
    /// The resource exists but is bigger than the loader's configured
    /// limit. `size` and `limit` are byte counts. The resolver maps this
    /// to [`IncludeError::FileTooLarge`], attaching the offending
    /// annotation's site.
    TooLarge {
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// An underlying I/O error (or the virtual-filesystem equivalent).
    Io { path: PathBuf, message: String },
}

impl std::fmt::Display for LoadError {
    /// Writes a single-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotFound { path } => {
                let shown = path.display();
                write!(f, "include not found: {shown}")
            }
            Self::OutsideRoot { path, root } => {
                let (shown, boundary) = (path.display(), root.display());
                write!(
                    f,
                    "include path {shown} resolves outside loader root {boundary}"
                )
            }
            Self::TooLarge { path, size, limit } => {
                let shown = path.display();
                write!(
                    f,
                    "include file {shown} is {size} bytes, exceeds limit of {limit} bytes"
                )
            }
            Self::Io { path, message } => {
                let shown = path.display();
                write!(f, "io error reading {shown}: {message}")
            }
        }
    }
}

impl std::error::Error for LoadError {}
192
/// Errors the include resolver can produce.
///
/// Variants that originate at a specific `lex.include` annotation carry
/// an `include_site` range so diagnostics can point at the annotation
/// that triggered the failure.
#[derive(Debug, Clone)]
pub enum IncludeError {
    /// An include chain looped back on itself. `chain` is the resolution
    /// stack at the moment the duplicate `path` was about to be pushed,
    /// in source-order (entry first, deepest last). `include_site` is the
    /// range of the offending `lex.include` annotation in its host file —
    /// useful for diagnostics that highlight the exact line.
    ///
    /// `path` and the `chain` entries are the origin paths of the
    /// invocation sites; an invocation with no recorded origin shows up
    /// as an empty path.
    Cycle {
        include_site: Range,
        path: PathBuf,
        chain: Vec<PathBuf>,
    },
    /// The include depth reached the effective depth limit. `limit` is
    /// the cap that actually fired: the lower of
    /// [`ResolveConfig::max_depth`] and [`KERNEL_DEPTH_BACKSTOP`].
    /// `chain` shows the resolution stack at the moment of failure, in
    /// source order. `include_site` is the range of the offending
    /// `lex.include` annotation in its host file.
    DepthExceeded {
        include_site: Range,
        limit: usize,
        chain: Vec<PathBuf>,
    },
    /// The total number of includes resolved across the document
    /// exceeded [`ResolveConfig::max_total_includes`]. Bounds adversarial
    /// fan-out (which `max_depth` alone does not). `include_site` is the
    /// `lex.include` annotation that pushed the count past the limit.
    TotalIncludesExceeded { include_site: Range, limit: usize },
    /// The included file's size exceeded the loader's configured limit.
    /// Surfaced by loaders that read from a real filesystem (FsLoader)
    /// to bound memory allocation per include. `include_site` is the
    /// offending annotation; `size` and `limit` are in bytes.
    FileTooLarge {
        include_site: Range,
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// A path resolved outside the configured [`ResolveConfig::root`].
    RootEscape { path: PathBuf, root: PathBuf },
    /// The include `src` was a platform-absolute filesystem path
    /// (e.g. Windows `C:\foo`, `\\server\share`, `\foo`). The spec
    /// forbids absolute filesystem paths from entering the
    /// resolution pipeline; the *root-absolute* form (leading `/`
    /// resolved against the includes root) is the only spec-allowed
    /// way to write a path that doesn't start from the host's
    /// directory. On Unix the only thing that's `Path::is_absolute()`
    /// is a leading `/`, which is consumed by the root-absolute
    /// branch first; this variant therefore only fires in practice
    /// for Windows-shaped absolute paths.
    AbsolutePath { path: PathBuf },
    /// The loader could not find the included file. `include_site`
    /// is the range of the offending `lex.include` annotation in its host
    /// file, so editors can squiggle the line that asked for the missing
    /// file rather than the document head. Other loader I/O failures are
    /// reported as [`IncludeError::LoaderIo`].
    NotFound { include_site: Range, path: PathBuf },
    /// The loader returned text that the parser rejected.
    ///
    /// `resolve_from_source` also uses this variant, with the entry path,
    /// when one of its post-parse passes fails (label normalisation,
    /// annotation attachment, table configuration); `message` names the
    /// failing pass in that case.
    ParseFailed { path: PathBuf, message: String },
    /// The included file's content is not legal in the include site's
    /// parent container.
    ///
    /// Today this only occurs when an included file has top-level Sessions
    /// and the include site is inside a `GeneralContainer` (Definition,
    /// ListItem, or another Annotation's body). The `violation` field
    /// names the offending content kind (e.g. `"Sessions"`) so future
    /// container/policy combinations can reuse this variant without a
    /// breaking change.
    ContainerPolicy {
        include_site: Range,
        container: &'static str,
        file: PathBuf,
        violation: &'static str,
    },
    /// Loader propagated a non-`NotFound` I/O error.
    LoaderIo { path: PathBuf, message: String },
    /// `lex.include` annotation was missing the mandatory `src=` parameter.
    MissingSrc { include_site: Range },
    /// A registered handler returned an error the pass could not map
    /// onto a more specific variant — typically a third-party
    /// namespace's resolve hook surfacing an internal failure, or an
    /// unrecognised handler-defined code from `lex.*` built-ins. The
    /// `code` is the string identifier the registry attaches to the
    /// diagnostic (`"handler.internal"`, `"handler.custom"`, …).
    HandlerFailed {
        include_site: Range,
        label: String,
        code: String,
        message: String,
    },
}
282
283impl std::fmt::Display for IncludeError {
284    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
285        match self {
286            IncludeError::Cycle { path, chain, .. } => {
287                let chain_display: Vec<String> =
288                    chain.iter().map(|p| p.display().to_string()).collect();
289                write!(
290                    f,
291                    "include cycle: {} (chain: {})",
292                    path.display(),
293                    chain_display.join(" -> ")
294                )
295            }
296            IncludeError::DepthExceeded { limit, chain, .. } => {
297                let chain_display: Vec<String> =
298                    chain.iter().map(|p| p.display().to_string()).collect();
299                write!(
300                    f,
301                    "include depth exceeded limit of {limit} (chain: {})",
302                    chain_display.join(" -> ")
303                )
304            }
305            IncludeError::TotalIncludesExceeded { limit, .. } => {
306                write!(f, "total include count exceeded limit of {limit}")
307            }
308            IncludeError::FileTooLarge {
309                path, size, limit, ..
310            } => {
311                write!(
312                    f,
313                    "included file {} is {size} bytes, exceeds limit of {limit} bytes",
314                    path.display()
315                )
316            }
317            IncludeError::RootEscape { path, root } => write!(
318                f,
319                "include path {} escapes resolution root {}",
320                path.display(),
321                root.display()
322            ),
323            IncludeError::AbsolutePath { path } => write!(
324                f,
325                "include src {} is a platform-absolute path; \
326                 the spec forbids absolute filesystem paths — use a relative path \
327                 (chapters/01.lex) or a root-absolute path (/shared/01.lex)",
328                path.display()
329            ),
330            IncludeError::NotFound { path, .. } => {
331                write!(f, "include not found: {}", path.display())
332            }
333            IncludeError::ParseFailed { path, message } => {
334                write!(f, "failed to parse {}: {message}", path.display())
335            }
336            IncludeError::ContainerPolicy {
337                container,
338                file,
339                violation,
340                ..
341            } => write!(
342                f,
343                "included file {} contains {} but include site is inside {} \
344                 (which does not allow {})",
345                file.display(),
346                violation,
347                container,
348                violation
349            ),
350            IncludeError::LoaderIo { path, message } => {
351                write!(f, "loader error reading {}: {message}", path.display())
352            }
353            IncludeError::MissingSrc { .. } => {
354                write!(f, "lex.include annotation missing required src= parameter")
355            }
356            IncludeError::HandlerFailed {
357                label,
358                code,
359                message,
360                ..
361            } => write!(f, "extension handler `{label}` failed ({code}): {message}"),
362        }
363    }
364}
365
366impl std::error::Error for IncludeError {}
367
368// No `From<LoadError>` impl: `IncludeError::NotFound` carries the include
369// site (the `lex.include` annotation's range), which a loader doesn't know
370// about. Callers map `LoadError` explicitly at the call site, where the
371// site is available.
372
/// The kind of container an include site lives in. Drives the splice-time
/// policy check (currently just "no Sessions in `GeneralContainer`").
#[derive(Debug, Clone, Copy)]
enum ContainerKind {
    /// `Document.root.children` or `Session.children` — accepts everything.
    Session,
    /// `Definition.children` — `GeneralContainer`.
    Definition,
    /// `Annotation.children` — `GeneralContainer`.
    AnnotationBody,
    /// `ListItem.children` — `GeneralContainer`.
    ListItem,
}

impl ContainerKind {
    /// Human-readable container name, as used in diagnostics.
    fn name(self) -> &'static str {
        match self {
            Self::Session => "Session",
            Self::Definition => "Definition",
            Self::AnnotationBody => "Annotation body",
            Self::ListItem => "ListItem",
        }
    }

    /// Whether Sessions may be spliced into this container. Spelled out
    /// per variant so adding a new kind forces a decision here.
    fn allows_sessions(self) -> bool {
        match self {
            Self::Session => true,
            Self::Definition | Self::AnnotationBody | Self::ListItem => false,
        }
    }
}
401
/// Hard cap on resolution depth, applied even when the
/// configurable [`ResolveConfig::max_depth`] is set higher. Bounds
/// adversarial varying-position recursion (a handler that returns
/// content with a different invocation site each iteration so the
/// cycle key never matches) so the resolver always terminates.
///
/// The resolver enforces `min(config.max_depth, KERNEL_DEPTH_BACKSTOP)`,
/// and that effective value is what `IncludeError::DepthExceeded`
/// reports as its `limit`.
pub const KERNEL_DEPTH_BACKSTOP: usize = 32;
408
409/// Resolve every `hooks.resolve = true` labelled annotation starting
410/// from `source`, dispatching through `registry`, and recursively
411/// processing the spliced content.
412///
413/// `source_path` identifies the entry-point file. It is used to
414/// (a) stamp `Range.origin_path` on every node so downstream code
415/// (file-ref resolution, diagnostics, LSP goto) can report locations
416/// against the authoring file, and (b) provide the host directory
417/// the built-in `lex.include` handler resolves relative `src=` paths
418/// against (via `LabelCtx.node.origin`). When `None`, origin stamping
419/// is skipped on the entry and the handler resolves relative paths
420/// against `config.root`.
421///
422/// # Generic dispatch
423///
424/// Every label whose schema declares `hooks.resolve = true` flows
425/// through the same path: build a [`LabelCtx`] from the annotation,
426/// call [`Registry::dispatch_resolve_raw`], decode the returned
427/// [`WireNode`] back into typed [`ContentItem`]s via
428/// [`crate::lex::wire::from_wire_node`], and splice in place. The
429/// built-in `lex.include` handler is registered the same way as any
430/// third-party namespace.
431///
432/// # Pre/post-attachment
433///
434/// Internally this re-parses the entry source *without* annotation
435/// attachment so labelled annotations stay visible as standalone
436/// children. The handler does its own `parse_no_attach` for loaded
437/// content. After all splices, [`AttachAnnotations`] runs once on
438/// the merged tree.
439///
440/// # Recursion + cycle detection
441///
442/// Cycle detection keys on `(label, origin_path, start_position)` of
443/// the invocation site. A handler that returns content containing
444/// another invocation at the same source position is caught
445/// immediately. A handler that varies the invocation position each
446/// iteration terminates at `min(config.max_depth, KERNEL_DEPTH_BACKSTOP)`
447/// with `IncludeError::DepthExceeded`. The total-includes counter
448/// caps adversarial fan-out independent of depth.
449pub fn resolve_from_source(
450    source: &str,
451    source_path: Option<PathBuf>,
452    config: &ResolveConfig,
453    registry: &Registry,
454) -> Result<Document, IncludeError> {
455    let entry_origin = source_path.as_ref().map(|p| Arc::new(p.clone()));
456
457    let mut doc = parse_no_attach(source).map_err(|message| IncludeError::ParseFailed {
458        path: source_path.clone().unwrap_or_default(),
459        message,
460    })?;
461
462    if let Some(origin) = entry_origin.as_ref() {
463        stamp_doc(&mut doc, origin);
464    }
465
466    // Normalise labels in the entry source BEFORE the resolve walk so
467    // shortcut spellings (`:: include ::`, `:: image ::`, …) are
468    // rewritten to their canonical form. The resolve dispatcher keys
469    // on `registry.schema_for(label)` with the canonical spelling, so
470    // without this an `:: include src=... ::` annotation would be
471    // skipped because no schema is registered under the bare alias.
472    //
473    // Permissive mode: unknown labels are left as-is rather than
474    // erroring. The standard parse pipeline enforces strict-mode
475    // namespace policy (`STRING_TO_AST`); the resolve entry point is
476    // a downstream stage that just needs the shortcut table applied
477    // so dispatch finds the right handler.
478    let mut doc =
479        NormalizeLabels::permissive()
480            .run(doc)
481            .map_err(|e| IncludeError::ParseFailed {
482                path: source_path.clone().unwrap_or_default(),
483                message: format!("label normalisation failed: {e}"),
484            })?;
485
486    let mut chain: Vec<ResolveKey> = Vec::new();
487    let mut state = ResolverState {
488        config,
489        registry,
490        chain: &mut chain,
491        depth: 0,
492        total_resolved: 0,
493    };
494
495    splice_in_session_container(doc.root.children.as_mut_vec(), &mut state)?;
496
497    let doc = AttachAnnotations::new()
498        .run(doc)
499        .map_err(|e| IncludeError::ParseFailed {
500            path: source_path.clone().unwrap_or_default(),
501            message: format!("annotation attachment failed: {e}"),
502        })?;
503
504    // Re-normalise after splicing. Each included file is parsed via
505    // `parse_no_attach` (no normalisation), so shortcut labels in the
506    // spliced content — e.g. `:: image src=... ::` inside an included
507    // chapter — need rewriting before downstream IR/format passes can
508    // dispatch them.
509    let doc = NormalizeLabels::permissive()
510        .run(doc)
511        .map_err(|e| IncludeError::ParseFailed {
512            path: source_path.clone().unwrap_or_default(),
513            message: format!("label normalisation failed: {e}"),
514        })?;
515
516    // Apply table configuration so `:: table header=N align=... ::`
517    // annotations attached to tables (here or in spliced content) take
518    // effect — matches the order the standard pipeline runs them.
519    let doc = ApplyTableConfig::new()
520        .run(doc)
521        .map_err(|e| IncludeError::ParseFailed {
522            path: source_path.unwrap_or_default(),
523            message: format!("table config application failed: {e}"),
524        })?;
525
526    Ok(doc)
527}
528
529// ============================================================================
530// Splicing
531// ============================================================================
532
533/// One frame on the resolve-pass cycle stack. Two invocations at the
534/// same `(label, origin, start)` position are a cycle, regardless of
535/// what parameters either invocation uses — a handler that varies
536/// params per call (random IDs, timestamps) cannot defeat the
537/// detector by changing param values.
538#[derive(Debug, Clone, PartialEq)]
539struct ResolveKey {
540    label: String,
541    /// `Range.origin_path` of the annotation — the file the
542    /// invocation was authored in. `None` when stamping was skipped
543    /// (e.g., entry source loaded from a string with no path).
544    origin: Option<PathBuf>,
545    start: crate::lex::ast::range::Position,
546}
547
548impl ResolveKey {
549    fn from_annotation(a: &crate::lex::ast::elements::annotation::Annotation) -> Self {
550        Self {
551            label: a.data.label.value.clone(),
552            origin: a.location.origin_path.as_ref().map(|p| (**p).clone()),
553            start: a.location.start,
554        }
555    }
556}
557
/// Per-resolution state threaded through the recursive walker. Keeps the
/// signatures of the splice/process functions short and ensures
/// `chain`/`depth` are updated in lock-step (push/pop, +1/back-out) at
/// each invocation.
///
/// The `'a` lifetime ties the borrowed config, registry, and chain to the
/// `resolve_from_source` call that constructs this state.
struct ResolverState<'a> {
    /// Limits and root for this resolution pass.
    config: &'a ResolveConfig,
    /// Registry used for schema lookup when deciding which annotations
    /// are resolve-hooked.
    registry: &'a Registry,
    /// Active resolution stack of `(label, origin, position)` keys.
    /// Pushed when we begin dispatching for an invocation and popped
    /// when its splice subtree is fully resolved. A push that finds
    /// the same key already on the stack is a cycle.
    chain: &'a mut Vec<ResolveKey>,
    /// Number of dispatch hops from the entry point. Each recursion
    /// increments by 1. Hitting `config.max_depth` or the
    /// [`KERNEL_DEPTH_BACKSTOP`] (whichever is lower) is an error.
    depth: usize,
    /// Total invocations resolved across the entire walk
    /// (depth × breadth). Incremented on every successful dispatch.
    /// Hitting `config.max_total_includes` aborts with
    /// `TotalIncludesExceeded`.
    total_resolved: usize,
}
580
581fn splice_in_session_container(
582    children: &mut Vec<ContentItem>,
583    state: &mut ResolverState<'_>,
584) -> Result<(), IncludeError> {
585    // Post-order: recurse into nested containers first, splice this
586    // container's invocations second. Recursion happens inside
587    // `process_resolves` for any spliced subtree, so that subtree
588    // is never re-walked at the parent level.
589    recurse_into_children(children, state)?;
590    process_resolves(children, state, ContainerKind::Session)
591}
592
593fn splice_in_general_container(
594    container: &mut GeneralContainer,
595    state: &mut ResolverState<'_>,
596    kind: ContainerKind,
597) -> Result<(), IncludeError> {
598    recurse_into_children(container.as_mut_vec(), state)?;
599    process_resolves(container.as_mut_vec(), state, kind)
600}
601
602/// Walk the children of a container, dispatch every annotation whose
603/// schema declares `hooks.resolve = true` through the registry, and
604/// splice the returned content in place of the annotation. Recurses
605/// into the spliced content so nested invocations resolve too.
606// Allow &mut Vec because `splice` needs Vec-specific operations.
607#[allow(clippy::ptr_arg)]
608fn process_resolves(
609    children: &mut Vec<ContentItem>,
610    state: &mut ResolverState<'_>,
611    kind: ContainerKind,
612) -> Result<(), IncludeError> {
613    // Collect indices of annotations whose schema has hooks.resolve.
614    let resolve_indices: Vec<usize> = children
615        .iter()
616        .enumerate()
617        .filter_map(|(i, item)| match item {
618            ContentItem::Annotation(a) => {
619                let label = &a.data.label.value;
620                if state
621                    .registry
622                    .schema_for(label)
623                    .map(|s| s.hooks.resolve)
624                    .unwrap_or(false)
625                {
626                    Some(i)
627                } else {
628                    None
629                }
630            }
631            _ => None,
632        })
633        .collect();
634
635    for i in resolve_indices.into_iter().rev() {
636        let annotation = match &children[i] {
637            ContentItem::Annotation(a) => a.clone(),
638            _ => unreachable!("index came from resolve filter"),
639        };
640
641        match resolve_one_invocation(&annotation, state, kind)? {
642            ResolveOutcome::Spliced(splice_items) => {
643                // Replace the annotation with `[annotation, ...splice_items]`.
644                // The annotation itself stays in the children list immediately
645                // before the splice, so the post-resolution AttachAnnotations
646                // pass moves it onto the first spliced node by the standard
647                // "attach to next sibling" rule.
648                let mut replacement = Vec::with_capacity(splice_items.len() + 1);
649                replacement.push(ContentItem::Annotation(annotation));
650                replacement.extend(splice_items);
651                children.splice(i..=i, replacement);
652            }
653            ResolveOutcome::Unexpanded => {
654                // Handler opted out of expanding this invocation. The
655                // annotation stays in place, but its body wasn't
656                // walked by `recurse_into_children` (that walker
657                // skips resolve-hooked annotations to avoid double-
658                // resolution). Walk the body now so any nested
659                // invocations inside the unexpanded annotation get
660                // resolved on the way back up.
661                let mut owned = annotation;
662                splice_in_general_container(
663                    &mut owned.children,
664                    state,
665                    ContainerKind::AnnotationBody,
666                )?;
667                children[i] = ContentItem::Annotation(owned);
668            }
669        }
670    }
671
672    Ok(())
673}
674
/// Outcome of dispatching a single resolve-hooked annotation. The
/// pass needs to distinguish between "handler returned content,
/// splice it in" and "handler opted out, leave the annotation
/// alone": the second case still requires walking the annotation's
/// body for nested invocations because `recurse_into_children`
/// otherwise skips resolve-hooked annotations to prevent double-
/// resolution.
enum ResolveOutcome {
    /// The handler produced content; these items are placed in the
    /// parent container directly after the invoking annotation.
    Spliced(Vec<ContentItem>),
    /// The handler declined to expand the invocation; the annotation
    /// stays as authored and the caller walks its body.
    Unexpanded,
}
686
/// Dispatch a single resolve-hooked annotation through the registry,
/// decode the returned `WireNode` back into typed children, then
/// recursively walk the splice items so nested invocations resolve
/// before the splice is placed into the parent container.
///
/// Returns [`ResolveOutcome::Unexpanded`] when the handler returned
/// `Ok(None)` (third-party handlers can opt out of expanding a
/// particular invocation). The caller is then responsible for
/// walking the annotation's body for nested invocations — the
/// resolve walker normally skips resolve-hooked annotations'
/// bodies.
///
/// `parent_kind` is the kind of container the splice will land in; it
/// is only consulted by the final container-policy validation.
///
/// # Errors
///
/// The checks run in this order and the first failure is returned:
///
/// - [`IncludeError::Cycle`] when the invocation's key is already on
///   `state.chain`.
/// - [`IncludeError::DepthExceeded`] when `state.depth` has reached the
///   lower of `config.max_depth` and `KERNEL_DEPTH_BACKSTOP`.
/// - [`IncludeError::TotalIncludesExceeded`] when `state.total_resolved`
///   has reached `config.max_total_includes`.
/// - Whatever `handler_error_to_include_error` maps a handler failure to.
/// - [`IncludeError::HandlerFailed`] (code `wire.decode`) when the
///   returned wire payload cannot be decoded.
/// - Any error raised while recursing into the spliced items.
/// - [`IncludeError::ContainerPolicy`] when the post-recursion splice
///   list violates `parent_kind`'s policy.
fn resolve_one_invocation(
    annotation: &crate::lex::ast::elements::annotation::Annotation,
    state: &mut ResolverState<'_>,
    parent_kind: ContainerKind,
) -> Result<ResolveOutcome, IncludeError> {
    let label = &annotation.data.label.value;
    let key = ResolveKey::from_annotation(annotation);

    // Cycle check on (label, origin, start) of the invocation site.
    if state.chain.contains(&key) {
        return Err(IncludeError::Cycle {
            include_site: annotation.location.clone(),
            path: key.origin.clone().unwrap_or_default(),
            chain: state
                .chain
                .iter()
                .map(|k| k.origin.clone().unwrap_or_default())
                .collect(),
        });
    }

    // Depth check. The effective limit is the lower of the
    // user-facing `config.max_depth` (default 8) and the hard
    // [`KERNEL_DEPTH_BACKSTOP`] (32, fixed). The kernel backstop
    // exists for adversarial varying-position recursion that the
    // cycle key can't catch — even if a user bumps `max_depth`
    // higher than 32 for legitimate deep atomization, the backstop
    // still terminates. The error reports `effective_depth_limit`
    // (the actual cap that fired) rather than `config.max_depth`,
    // so when the backstop is the binding limit the user sees `32`
    // and not the (higher) config value.
    let effective_depth_limit = state.config.max_depth.min(KERNEL_DEPTH_BACKSTOP);
    if state.depth >= effective_depth_limit {
        return Err(IncludeError::DepthExceeded {
            include_site: annotation.location.clone(),
            limit: effective_depth_limit,
            chain: state
                .chain
                .iter()
                .map(|k| k.origin.clone().unwrap_or_default())
                .collect(),
        });
    }

    // Total-count check before dispatch.
    if state.total_resolved >= state.config.max_total_includes {
        return Err(IncludeError::TotalIncludesExceeded {
            include_site: annotation.location.clone(),
            limit: state.config.max_total_includes,
        });
    }

    let ctx = build_label_ctx(annotation);

    let wire_node = match state.registry.dispatch_resolve_raw(&ctx) {
        Ok(Some(node)) => node,
        Ok(None) => {
            // Handler returned "nothing to splice" — leave the
            // annotation in place. The caller still needs to walk
            // its body for nested invocations (built-in lex.include
            // never returns None; this path is reachable only via
            // third-party handlers that opt out per-invocation).
            return Ok(ResolveOutcome::Unexpanded);
        }
        Err(handler_err) => {
            return Err(handler_error_to_include_error(
                &handler_err,
                label,
                &annotation.location,
            ));
        }
    };

    // Only invocations whose handler actually returned content count
    // towards `max_total_includes`; opt-outs and handler errors
    // returned above do not.
    state.total_resolved += 1;

    // Decode the wire payload into typed lex-core ContentItems.
    let mut splice_items = decode_wire_to_items(&wire_node, label, &annotation.location)?;

    // Recurse into the spliced subtree FIRST so nested resolve-hooked
    // annotations are processed before the splice lands. Validation
    // must wait until *after* this step: a nested invocation can
    // splice in content (e.g. a top-level `Session` from a chained
    // `lex.include`) that wasn't in the handler's original output,
    // and the final shape is what has to satisfy the parent
    // container's policy.
    //
    // The `IncludeError::ContainerPolicy.file` field describes the
    // *spliced content's* source file (the file containing the
    // disallowed shape), not the invocation site. Take it from the
    // handler-returned wire payload's origin when present, falling
    // back to the first decoded item's origin path if the wire
    // payload didn't stamp a `Document` origin.
    let included_path = wire_node_origin_pathbuf(&wire_node)
        .or_else(|| splice_items_first_origin(&splice_items))
        .unwrap_or_default();
    // Push this invocation onto the chain and bump the depth for the
    // duration of the nested walk. Both are restored *before* the
    // result is propagated with `?`, so the resolver state stays
    // balanced even when the nested walk fails.
    state.chain.push(key);
    let saved_depth = state.depth;
    state.depth = saved_depth + 1;
    let recurse_result = splice_in_session_container(&mut splice_items, state);
    state.depth = saved_depth;
    state.chain.pop();
    recurse_result?;

    // Container-policy validation: enforce no-Sessions inside
    // `GeneralContainer` (Definition / Annotation body / ListItem).
    // Runs against the post-recursion splice list so nested
    // expansions can't smuggle disallowed shapes past the check.
    validate_against_kind(
        &splice_items,
        parent_kind,
        &annotation.location,
        &included_path,
    )?;

    Ok(ResolveOutcome::Spliced(splice_items))
}
814
815/// Build a [`LabelCtx`] from a lex-core [`Annotation`]. The body is
816/// derived from the annotation's children (parsed-Lex form), the
817/// params from `Annotation::data::parameters`, and the host node info
818/// from `Annotation::location`.
819fn build_label_ctx(
820    a: &crate::lex::ast::elements::annotation::Annotation,
821) -> lex_extension::wire::LabelCtx {
822    use crate::lex::wire::to_wire_node;
823    use lex_extension::wire::{AnnotationBody, LabelCtx, NodeRef};
824
825    let label = a.data.label.value.clone();
826    let params = {
827        // Pass *semantic* parameter values to handlers (quotes
828        // stripped, escape sequences resolved). Handlers consume
829        // params as JSON values, where there is no "quoted string"
830        // vs "unquoted token" distinction; only the decoded value
831        // is meaningful. The codec's `parameters_to_json` (used by
832        // `annotation_to_wire` for round-tripping annotation
833        // *content*) keeps the raw form to preserve source — the
834        // two paths intentionally differ.
835        let mut obj = serde_json::Map::with_capacity(a.data.parameters.len());
836        for p in &a.data.parameters {
837            obj.insert(p.key.clone(), serde_json::Value::String(p.unquoted_value()));
838        }
839        serde_json::Value::Object(obj)
840    };
841    let body = if a.children.is_empty() {
842        AnnotationBody::None
843    } else {
844        let wire_children: Vec<lex_extension::wire::WireNode> =
845            a.children.iter().map(to_wire_node).collect();
846        AnnotationBody::Lex {
847            children: wire_children,
848        }
849    };
850    let range = lex_extension::wire::Range::new(
851        lex_extension::wire::Position::new(
852            u32::try_from(a.location.start.line).unwrap_or(u32::MAX),
853            u32::try_from(a.location.start.column).unwrap_or(u32::MAX),
854        ),
855        lex_extension::wire::Position::new(
856            u32::try_from(a.location.end.line).unwrap_or(u32::MAX),
857            u32::try_from(a.location.end.column).unwrap_or(u32::MAX),
858        ),
859    );
860    let origin = a
861        .location
862        .origin_path
863        .as_ref()
864        .map(|p| p.to_string_lossy().into_owned());
865    LabelCtx {
866        label,
867        params,
868        body,
869        node: NodeRef {
870            kind: "annotation".into(),
871            range,
872            origin,
873        },
874    }
875}
876
/// Lift a [`WireNode`]'s top-level `origin` field into a `PathBuf`
/// when present. Used by the resolve pass to attribute
/// container-policy errors to the *spliced content's* source file
/// rather than the invocation site.
///
/// Only the variants listed in the match are inspected; any other
/// variant is treated as carrying no origin and yields `None`.
fn wire_node_origin_pathbuf(node: &lex_extension::wire::WireNode) -> Option<PathBuf> {
    use lex_extension::wire::WireNode as W;
    let s = match node {
        W::Document { origin, .. } => origin.as_deref(),
        W::Session { origin, .. } => origin.as_deref(),
        W::Definition { origin, .. } => origin.as_deref(),
        W::Paragraph { origin, .. } => origin.as_deref(),
        W::List { origin, .. } => origin.as_deref(),
        W::Verbatim { origin, .. } => origin.as_deref(),
        W::Table { origin, .. } => origin.as_deref(),
        W::Annotation { origin, .. } => origin.as_deref(),
        W::Blank { origin, .. } => origin.as_deref(),
        _ => None,
    };
    s.map(PathBuf::from)
}

/// Fallback when the handler-returned root node carries no origin
/// (e.g. `WireNode::Document.origin` is unset): walk the decoded
/// splice list and return the first item that carries an origin.
/// The interner from `from_wire_node` ensures every item
/// shares one Arc per origin string, so iterating is cheap.
///
/// Only the top-level items are inspected (nested children are not
/// searched). Returns `None` when no top-level item has an origin.
fn splice_items_first_origin(items: &[ContentItem]) -> Option<PathBuf> {
    for item in items {
        let r = match item {
            ContentItem::Paragraph(p) => &p.location,
            ContentItem::Session(s) => &s.location,
            ContentItem::Definition(d) => &d.location,
            ContentItem::List(l) => &l.location,
            ContentItem::ListItem(li) => &li.location,
            ContentItem::Annotation(a) => &a.location,
            ContentItem::VerbatimBlock(v) => &v.location,
            ContentItem::VerbatimLine(vl) => &vl.location,
            ContentItem::Table(t) => &t.location,
            ContentItem::TextLine(tl) => &tl.location,
            ContentItem::BlankLineGroup(blg) => &blg.location,
        };
        if let Some(arc) = r.origin_path.as_ref() {
            // Clone the path out of the shared `Arc` so the caller owns it.
            return Some((**arc).clone());
        }
    }
    None
}

/// Convert a handler-returned [`WireNode`] back into a list of
/// [`ContentItem`]s ready for splicing. `WireNode::Document` is
/// unwrapped (its children become the splice list); any other root
/// shape is wrapped as a single-item list.
///
/// `invocation_label` is the label whose handler produced `wire` —
/// threaded through so wire-decode failures are attributed to the
/// real namespace rather than a hardcoded `lex.include`. A
/// third-party `acme.expand` handler that returns malformed wire
/// will surface as `IncludeError::HandlerFailed { label:
/// "acme.expand", .. }`.
///
/// Decoding itself is delegated to `from_wire_node`; a failure there
/// is reported with code `wire.decode` and `include_site` as the
/// error location.
fn decode_wire_to_items(
    wire: &lex_extension::wire::WireNode,
    invocation_label: &str,
    include_site: &Range,
) -> Result<Vec<ContentItem>, IncludeError> {
    use crate::lex::wire::from_wire_node;

    from_wire_node(wire).map_err(|e| IncludeError::HandlerFailed {
        include_site: include_site.clone(),
        label: invocation_label.to_string(),
        code: "wire.decode".into(),
        message: format!("decoding handler-returned wire payload failed: {e}"),
    })
}
949
950/// Map a [`HandlerError`] returned by the registry into the most
951/// specific [`IncludeError`] variant available. Codes in the
952/// `-32001..=-32005` range emitted by [`crate::lex::builtins::LexIncludeHandler`]
953/// translate back to their corresponding pre-extension-system
954/// variants so existing CLI/LSP error rendering and the integration
955/// test suite keep working unchanged. Unknown codes (third-party
956/// namespaces, future built-ins) surface as `HandlerFailed`.
957fn handler_error_to_include_error(
958    err: &HandlerError,
959    label: &str,
960    include_site: &Range,
961) -> IncludeError {
962    use crate::lex::builtins::include::{
963        CODE_ABSOLUTE_PATH, CODE_IO, CODE_MISSING_SRC, CODE_NOT_FOUND, CODE_OUTSIDE_ROOT,
964        CODE_PARSE_FAILED, CODE_TOO_LARGE,
965    };
966
967    match err {
968        HandlerError::Custom {
969            code,
970            message,
971            data,
972        } => match *code {
973            CODE_NOT_FOUND => IncludeError::NotFound {
974                include_site: include_site.clone(),
975                path: data_str(data, "path")
976                    .map(PathBuf::from)
977                    .unwrap_or_default(),
978            },
979            CODE_OUTSIDE_ROOT => IncludeError::RootEscape {
980                path: data_str(data, "path")
981                    .map(PathBuf::from)
982                    .unwrap_or_default(),
983                root: data_str(data, "root")
984                    .map(PathBuf::from)
985                    .unwrap_or_default(),
986            },
987            CODE_TOO_LARGE => IncludeError::FileTooLarge {
988                include_site: include_site.clone(),
989                path: data_str(data, "path")
990                    .map(PathBuf::from)
991                    .unwrap_or_default(),
992                size: data_u64(data, "size").unwrap_or(0),
993                limit: data_u64(data, "limit").unwrap_or(0),
994            },
995            CODE_ABSOLUTE_PATH => IncludeError::AbsolutePath {
996                path: data_str(data, "path")
997                    .map(PathBuf::from)
998                    .unwrap_or_default(),
999            },
1000            CODE_IO => IncludeError::LoaderIo {
1001                path: data_str(data, "path")
1002                    .map(PathBuf::from)
1003                    .unwrap_or_default(),
1004                message: message.clone(),
1005            },
1006            CODE_MISSING_SRC => IncludeError::MissingSrc {
1007                include_site: include_site.clone(),
1008            },
1009            CODE_PARSE_FAILED => IncludeError::ParseFailed {
1010                path: data_str(data, "path")
1011                    .map(PathBuf::from)
1012                    .unwrap_or_default(),
1013                message: data_str(data, "message").unwrap_or_else(|| message.clone()),
1014            },
1015            other => IncludeError::HandlerFailed {
1016                include_site: include_site.clone(),
1017                label: label.to_string(),
1018                code: format!("handler.custom({other})"),
1019                message: message.clone(),
1020            },
1021        },
1022        HandlerError::Internal { message } => IncludeError::HandlerFailed {
1023            include_site: include_site.clone(),
1024            label: label.to_string(),
1025            code: "handler.internal".into(),
1026            message: message.clone(),
1027        },
1028        HandlerError::Unsupported { detail } => IncludeError::HandlerFailed {
1029            include_site: include_site.clone(),
1030            label: label.to_string(),
1031            code: "handler.unsupported".into(),
1032            message: detail.clone(),
1033        },
1034    }
1035}
1036
1037fn data_str(data: &Option<serde_json::Value>, key: &str) -> Option<String> {
1038    data.as_ref()?.get(key)?.as_str().map(str::to_string)
1039}
1040
1041fn data_u64(data: &Option<serde_json::Value>, key: &str) -> Option<u64> {
1042    data.as_ref()?.get(key)?.as_u64()
1043}
1044
1045#[allow(clippy::ptr_arg)]
1046fn recurse_into_children(
1047    children: &mut Vec<ContentItem>,
1048    state: &mut ResolverState<'_>,
1049) -> Result<(), IncludeError> {
1050    for item in children.iter_mut() {
1051        match item {
1052            ContentItem::Session(s) => {
1053                splice_in_session_container(s.children.as_mut_vec(), state)?;
1054            }
1055            ContentItem::Definition(d) => {
1056                splice_in_general_container(&mut d.children, state, ContainerKind::Definition)?;
1057            }
1058            ContentItem::Annotation(a) => {
1059                // Skip the body of annotations whose schema declares
1060                // `hooks.resolve = true` — those are dispatched at the
1061                // parent level by `process_resolves`. Walking their
1062                // bodies *here* would trip the resolve again on the
1063                // same invocation.
1064                //
1065                // The body is still walked when the resolve actually
1066                // runs: `process_resolves` calls
1067                // `resolve_one_invocation`, and the
1068                // [`ResolveOutcome::Spliced`] arm walks the splice
1069                // subtree (which replaces the annotation), while the
1070                // [`ResolveOutcome::Unexpanded`] arm explicitly
1071                // walks the kept annotation's body via
1072                // `splice_in_general_container`. So nested
1073                // resolve-hooked annotations inside an unexpanded
1074                // outer annotation are still reached.
1075                //
1076                // Non-resolve-hooked annotations recurse normally
1077                // here so their nested bodies get processed.
1078                let is_resolve_hooked = state
1079                    .registry
1080                    .schema_for(&a.data.label.value)
1081                    .map(|s| s.hooks.resolve)
1082                    .unwrap_or(false);
1083                if !is_resolve_hooked {
1084                    splice_in_general_container(
1085                        &mut a.children,
1086                        state,
1087                        ContainerKind::AnnotationBody,
1088                    )?;
1089                }
1090            }
1091            ContentItem::List(l) => {
1092                for li in l.items.as_mut_vec().iter_mut() {
1093                    if let ContentItem::ListItem(item) = li {
1094                        splice_in_general_container(
1095                            &mut item.children,
1096                            state,
1097                            ContainerKind::ListItem,
1098                        )?;
1099                    }
1100                }
1101            }
1102            _ => {}
1103        }
1104    }
1105    Ok(())
1106}
1107
1108fn validate_against_kind(
1109    items: &[ContentItem],
1110    kind: ContainerKind,
1111    site: &Range,
1112    file: &Path,
1113) -> Result<(), IncludeError> {
1114    if kind.allows_sessions() {
1115        return Ok(());
1116    }
1117    if items.iter().any(|i| matches!(i, ContentItem::Session(_))) {
1118        return Err(IncludeError::ContainerPolicy {
1119            include_site: site.clone(),
1120            container: kind.name(),
1121            file: file.to_path_buf(),
1122            violation: "Sessions",
1123        });
1124    }
1125    Ok(())
1126}
1127
1128// ============================================================================
1129// Path resolution
1130// ============================================================================
1131
1132/// Resolve a file-reference target string the same way the include
1133/// resolver resolves include paths.
1134///
1135/// Use this when consuming `ReferenceType::File { target }` (or any other
1136/// node-attached path) so that relative paths resolve from the *authoring*
1137/// file's directory, not from wherever the merged document happens to be
1138/// rooted. Pass `ref_origin` as the [`Range::origin_path`] of the inline's
1139/// containing node (or `None` if the node was never stamped — in that case
1140/// the path is treated as if authored at the root).
1141///
1142/// Behaviour matches the include resolver:
1143/// - Root-absolute targets (leading `/`) resolve under `root`.
1144/// - Other targets resolve relative to `ref_origin`'s parent (or `root`
1145///   when `ref_origin` is `None`).
1146/// - The result is lexically normalized and checked against `root` —
1147///   paths that escape it return `RootEscape`.
1148///
1149/// This is a sister to the resolver's internal `resolve_path` and shares
1150/// the same lexical-normalization caveat: it does not touch the filesystem.
1151pub fn resolve_file_reference(
1152    target: &str,
1153    ref_origin: Option<&Path>,
1154    root: &Path,
1155) -> Result<PathBuf, IncludeError> {
1156    let host_dir: PathBuf = ref_origin
1157        .and_then(|p| p.parent())
1158        .map(Path::to_path_buf)
1159        .unwrap_or_else(|| root.to_path_buf());
1160    resolve_path(target, &host_dir, root)
1161}
1162
1163fn resolve_path(src: &str, host_dir: &Path, root: &Path) -> Result<PathBuf, IncludeError> {
1164    let candidate = if let Some(rel) = src.strip_prefix('/') {
1165        // Root-absolute (Lex spec convention): leading `/` means "from
1166        // the resolution root", not "filesystem root".
1167        root.join(rel)
1168    } else {
1169        // Anything else must be a relative path. Reject inputs the
1170        // host platform would treat as absolute (Windows `C:\foo`,
1171        // `\\server\share`, `\foo`) up front: the spec forbids
1172        // platform-absolute paths from entering the resolution
1173        // pipeline. Without this, `host_dir.join(src)` would silently
1174        // discard `host_dir` because Rust's `PathBuf::join` replaces
1175        // the base when the joined path is absolute. The downstream
1176        // root-escape check would still catch the security side, but
1177        // we'd surface a misleading "escapes root" error instead of
1178        // "absolute paths not allowed", and we'd be relying on
1179        // `PathBuf::join`'s override semantics for the security
1180        // outcome rather than holding the line at the input boundary.
1181        if Path::new(src).is_absolute() {
1182            return Err(IncludeError::AbsolutePath {
1183                path: PathBuf::from(src),
1184            });
1185        }
1186        host_dir.join(src)
1187    };
1188    let normalized = lexical_normalize(&candidate);
1189    let canonical_root = lexical_normalize(root);
1190    if !normalized.starts_with(&canonical_root) {
1191        return Err(IncludeError::RootEscape {
1192            path: normalized,
1193            root: canonical_root,
1194        });
1195    }
1196    Ok(normalized)
1197}
1198
/// Normalize a path purely lexically: drop `.` components and fold
/// `name/..` pairs, without ever consulting the filesystem.
///
/// `std::fs::canonicalize` needs the path to exist, which rules it out
/// for in-memory loaders such as [`MemoryLoader`]. A lexical pass is
/// enough here: the resolver only needs a stable identity for cycle
/// detection and a uniform shape for the root-escape prefix check.
///
/// A `..` removes the previous component only when that component is
/// a real name (`Component::Normal`). If the accumulated path is
/// empty, or ends in `..` or a root/prefix marker, the `..` is
/// *kept*. That is deliberate: blindly popping would turn
/// `../../etc/passwd` into `etc/passwd` (`PathBuf::pop` happily
/// strips a trailing `..`), making an escaping path look like it
/// sits under the root. Keeping every unmatched `..` leaves the
/// escape visible to the caller's root check.
fn lexical_normalize(p: &Path) -> PathBuf {
    use std::path::Component;

    p.components().fold(PathBuf::new(), |mut acc, component| {
        match component {
            Component::CurDir => {}
            Component::ParentDir => {
                let ends_in_name =
                    matches!(acc.components().next_back(), Some(Component::Normal(_)));
                if ends_in_name {
                    acc.pop();
                } else {
                    acc.push("..");
                }
            }
            Component::Prefix(_) | Component::RootDir | Component::Normal(_) => {
                acc.push(component.as_os_str());
            }
        }
        acc
    })
}
1240
1241// ============================================================================
1242// Origin stamping
1243// ============================================================================
1244//
1245// Walk every node in a Document and set `Range.origin_path` on each
1246// `.location` field. The walk only stamps the *block-level* `.location`
1247// fields here; finer-grained inline ranges land in PR 6 when file-ref
1248// resolution starts consulting them.
1249
1250pub(crate) fn stamp_doc(doc: &mut Document, origin: &Arc<PathBuf>) {
1251    if let Some(title) = doc.title.as_mut() {
1252        title.location.origin_path = Some(Arc::clone(origin));
1253    }
1254    for ann in doc.annotations.iter_mut() {
1255        stamp_annotation(ann, origin);
1256    }
1257    stamp_session(&mut doc.root, origin);
1258}
1259
1260fn stamp_session(s: &mut Session, origin: &Arc<PathBuf>) {
1261    s.location.origin_path = Some(Arc::clone(origin));
1262    if let Some(loc) = s.title.location.as_mut() {
1263        loc.origin_path = Some(Arc::clone(origin));
1264    }
1265    for ann in s.annotations.iter_mut() {
1266        stamp_annotation(ann, origin);
1267    }
1268    for item in s.children.as_mut_vec().iter_mut() {
1269        stamp_item(item, origin);
1270    }
1271}
1272
1273fn stamp_annotation(
1274    a: &mut crate::lex::ast::elements::annotation::Annotation,
1275    origin: &Arc<PathBuf>,
1276) {
1277    a.location.origin_path = Some(Arc::clone(origin));
1278    a.data.location.origin_path = Some(Arc::clone(origin));
1279    for item in a.children.as_mut_vec().iter_mut() {
1280        stamp_item(item, origin);
1281    }
1282}
1283
1284fn stamp_item(item: &mut ContentItem, origin: &Arc<PathBuf>) {
1285    match item {
1286        ContentItem::Session(s) => stamp_session(s, origin),
1287        ContentItem::Annotation(a) => stamp_annotation(a, origin),
1288        ContentItem::Paragraph(p) => {
1289            p.location.origin_path = Some(Arc::clone(origin));
1290            for ann in p.annotations.iter_mut() {
1291                stamp_annotation(ann, origin);
1292            }
1293            for line in p.lines.iter_mut() {
1294                stamp_item(line, origin);
1295            }
1296        }
1297        ContentItem::List(l) => {
1298            l.location.origin_path = Some(Arc::clone(origin));
1299            for li in l.items.as_mut_vec().iter_mut() {
1300                stamp_item(li, origin);
1301            }
1302        }
1303        ContentItem::ListItem(li) => {
1304            li.location.origin_path = Some(Arc::clone(origin));
1305            for ann in li.annotations.iter_mut() {
1306                stamp_annotation(ann, origin);
1307            }
1308            for child in li.children.as_mut_vec().iter_mut() {
1309                stamp_item(child, origin);
1310            }
1311        }
1312        ContentItem::Definition(d) => {
1313            d.location.origin_path = Some(Arc::clone(origin));
1314            for ann in d.annotations.iter_mut() {
1315                stamp_annotation(ann, origin);
1316            }
1317            for child in d.children.as_mut_vec().iter_mut() {
1318                stamp_item(child, origin);
1319            }
1320        }
1321        ContentItem::VerbatimBlock(v) => {
1322            v.location.origin_path = Some(Arc::clone(origin));
1323        }
1324        ContentItem::VerbatimLine(vl) => {
1325            vl.location.origin_path = Some(Arc::clone(origin));
1326        }
1327        ContentItem::Table(t) => {
1328            t.location.origin_path = Some(Arc::clone(origin));
1329        }
1330        ContentItem::TextLine(tl) => {
1331            tl.location.origin_path = Some(Arc::clone(origin));
1332        }
1333        ContentItem::BlankLineGroup(b) => {
1334            b.location.origin_path = Some(Arc::clone(origin));
1335        }
1336    }
1337}
1338
1339// ============================================================================
1340// Parser glue
1341// ============================================================================
1342
/// Parse `source` into a Document but skip the annotation-attachment stage,
/// so include annotations are findable in container children lists.
///
/// Thin wrapper: the work is delegated to
/// `crate::lex::testing::parse_without_annotation_attachment`, and its
/// result (including the `String` error on failure) is returned as is.
pub(crate) fn parse_no_attach(source: &str) -> Result<Document, String> {
    crate::lex::testing::parse_without_annotation_attachment(source)
}
1348
1349// ============================================================================
1350// Filesystem-backed loader
1351// ============================================================================
1352
/// [`Loader`] that reads files from the filesystem with `std::fs::read_to_string`.
///
/// This is the production loader used by the CLI; the LSP wraps it with a
/// file-watch invalidation layer in PR 8. lex-core's *resolver* code does not
/// reference `std::fs` — `FsLoader` is the one place where it does, isolated
/// behind the [`Loader`] trait so the rest of the crate stays sandbox- and
/// WASM-friendly.
///
/// `FsLoader` is constructed with the resolution root and rechecks every
/// load against it post-`fs::canonicalize`, so a symlink pointing outside
/// the root is rejected even though the lexical-only check in
/// [`resolve_path`] cannot see it. Also rejects non-regular files (devices,
/// FIFOs, directories) before reading, so the loader can't be tricked into
/// blocking on `/dev/zero` or allocating against an open device.
///
/// Errors map:
/// - canonicalization fails with `ErrorKind::NotFound` (file missing,
///   broken symlink, …) → [`LoadError::NotFound`]
/// - canonicalization fails for any other reason (e.g. permission denied
///   at a parent) → [`LoadError::Io`]
/// - canonical path doesn't sit under canonical root → [`LoadError::OutsideRoot`]
/// - target is not a regular file → [`LoadError::Io`] with a clear message
/// - file is larger than the configured cap → [`LoadError::TooLarge`]
/// - any other I/O error during read → [`LoadError::Io`]
pub struct FsLoader {
    /// Filesystem-canonical resolution root. Constructed once at
    /// `FsLoader::new`; if canonicalization fails (e.g., the configured
    /// root doesn't exist on disk), we fall back to the input verbatim
    /// and the bounds check will simply never pass — visible to the user
    /// as a `LoadError::OutsideRoot` instead of silently disabling the
    /// security check. An empty input root is interpreted as the current
    /// directory (`.`) before canonicalization, so this field is never
    /// the empty path.
    canonical_root: PathBuf,
    /// Per-file size cap (bytes). Loads of larger files surface as
    /// `LoadError::TooLarge` before any bytes are read into memory.
    /// Default [`FsLoader::DEFAULT_MAX_FILE_SIZE`].
    max_file_size: u64,
}

impl FsLoader {
    /// Default per-file size cap: 10 MiB. Generous for realistic Lex
    /// source documents (text only) and tight enough to bound memory
    /// allocation per include against an adversarial 1 GB file.
    pub const DEFAULT_MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;

    /// Construct a loader rooted at `root` with default size limits.
    /// The loader stores `root`'s fs-canonical form (with symlinks
    /// resolved); subsequent loads validate that the requested path's
    /// canonical form lives under it.
    ///
    /// An empty `root` is treated as the current directory. If the root
    /// cannot be canonicalized it is stored verbatim.
    pub fn new(root: PathBuf) -> Self {
        // `Path::new("doc.lex").parent()` is `Some("")`, so an empty root
        // is an easy value to end up with. It must not be stored as is:
        // `fs::canonicalize("")` fails, and every path `starts_with("")`,
        // which would make the bounds check in `load` pass for *any*
        // file. Map it to `.` so it means what the caller intended.
        let root = if root.as_os_str().is_empty() {
            PathBuf::from(".")
        } else {
            root
        };
        let canonical_root = std::fs::canonicalize(&root).unwrap_or(root);
        Self {
            canonical_root,
            max_file_size: Self::DEFAULT_MAX_FILE_SIZE,
        }
    }

    /// Override the default per-file size cap (bytes). Use to widen the
    /// limit for projects with genuinely large source files, or tighten
    /// it for stricter sandboxes (e.g., LSPs serving untrusted content).
    pub fn with_max_file_size(mut self, max_file_size: u64) -> Self {
        self.max_file_size = max_file_size;
        self
    }
}
1414
1415impl Loader for FsLoader {
1416    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
1417        // 1. Canonicalize. Resolves symlinks and `..` segments against the
1418        //    real filesystem. NotFound / broken-symlink / permission errors
1419        //    all surface here.
1420        let canonical_path = std::fs::canonicalize(path).map_err(|e| match e.kind() {
1421            std::io::ErrorKind::NotFound => LoadError::NotFound {
1422                path: path.to_path_buf(),
1423            },
1424            _ => LoadError::Io {
1425                path: path.to_path_buf(),
1426                message: e.to_string(),
1427            },
1428        })?;
1429
1430        // 2. Bounds check against the *canonical* root. This is the
1431        //    actual security gate against symlink traversal — the lexical
1432        //    check in resolve_path can't see through symlinks.
1433        if !canonical_path.starts_with(&self.canonical_root) {
1434            return Err(LoadError::OutsideRoot {
1435                path: canonical_path,
1436                root: self.canonical_root.clone(),
1437            });
1438        }
1439
1440        // 3. Reject non-regular files. Without this, an attacker (with
1441        //    write access to the repo) could symlink an include target to
1442        //    `/dev/zero` or a FIFO and block / OOM the reader. The
1443        //    is_file() metadata call is a cheap sanity check.
1444        let meta = std::fs::metadata(&canonical_path).map_err(|e| LoadError::Io {
1445            path: canonical_path.clone(),
1446            message: e.to_string(),
1447        })?;
1448        if !meta.is_file() {
1449            return Err(LoadError::Io {
1450                path: canonical_path,
1451                message: "include target is not a regular file".to_string(),
1452            });
1453        }
1454
1455        // 4. Size cap. Bounds memory allocation per include against an
1456        //    adversarial 1 GB file before any bytes hit the heap.
1457        let size = meta.len();
1458        if size > self.max_file_size {
1459            return Err(LoadError::TooLarge {
1460                path: canonical_path,
1461                size,
1462                limit: self.max_file_size,
1463            });
1464        }
1465
1466        // 5. Read. By this point we know the path is a regular file under
1467        //    the canonical root and within the size cap; anything that
1468        //    fails here is a real I/O error worth surfacing.
1469        let source = std::fs::read_to_string(&canonical_path).map_err(|e| LoadError::Io {
1470            path: canonical_path.clone(),
1471            message: e.to_string(),
1472        })?;
1473
1474        Ok(LoadedFile {
1475            source,
1476            canonical_path,
1477        })
1478    }
1479}
1480
1481// ============================================================================
1482// Test fixtures (test-support feature + cfg(test))
1483// ============================================================================
1484
/// In-memory [`Loader`] backed by a `HashMap<PathBuf, String>`.
///
/// Paths are used verbatim as lookup keys. The loader is `Debug` and
/// `Clone` so fixtures can be printed in assertion failures and
/// duplicated between test cases; `Default` yields an empty loader.
#[cfg(any(test, feature = "test-support"))]
#[derive(Debug, Clone, Default)]
pub struct MemoryLoader {
    /// Registered files: path → source text.
    files: std::collections::HashMap<PathBuf, String>,
}

#[cfg(any(test, feature = "test-support"))]
impl MemoryLoader {
    /// Create an empty loader. Add files with [`MemoryLoader::insert`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Register a file at `path` with the given source text.
    ///
    /// Registering the same path again replaces the earlier contents.
    /// Returns `&mut Self` so calls can be chained.
    pub fn insert<P: Into<PathBuf>, S: Into<String>>(&mut self, path: P, contents: S) -> &mut Self {
        self.files.insert(path.into(), contents.into());
        self
    }

    /// Convenience constructor: build a loader from any iterator of
    /// `(path, contents)` pairs.
    ///
    /// When a path appears more than once, the last pair wins.
    pub fn from_pairs<I, P, S>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (P, S)>,
        P: Into<PathBuf>,
        S: Into<String>,
    {
        Self {
            files: pairs
                .into_iter()
                .map(|(path, contents)| (path.into(), contents.into()))
                .collect(),
        }
    }
}
1528
#[cfg(any(test, feature = "test-support"))]
impl Loader for MemoryLoader {
    /// Look up `path` among the registered files.
    ///
    /// Returns a copy of the stored source with `canonical_path` set to
    /// `path` itself, or [`LoadError::NotFound`] when nothing was
    /// registered under that exact path.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
        // An in-memory store has no symlinks, so the lookup key doubles
        // as the canonical identity. The resolver's cycle detection
        // compares `LoadedFile::canonical_path` values; in tests those
        // line up with the lexically-normalized paths the resolver
        // already produces.
        match self.files.get(path) {
            Some(contents) => Ok(LoadedFile {
                source: contents.clone(),
                canonical_path: path.to_path_buf(),
            }),
            None => Err(LoadError::NotFound {
                path: path.to_path_buf(),
            }),
        }
    }
}
1549
1550// ============================================================================
1551// Tests
1552// ============================================================================
1553
1554#[cfg(test)]
1555mod tests;