Skip to main content

lex_core/lex/
includes.rs

1//! Include resolution for Lex documents.
2//!
3//! This module turns `:: lex.include src="..." ::` annotations into spliced
4//! content from the referenced files. It is *opt-in*: callers that want the
5//! unresolved tree (the formatter, tree-sitter parity, editor tooling that
6//! displays include statements as authored) skip this pass entirely. The
7//! parser itself never touches the filesystem — all I/O goes through the
8//! injected [`Loader`] trait.
9//!
10//! See `comms/specs/proposals/includes.lex` for the full design.
11//!
12//! # Status
13//!
//! This module is being built up across PRs 3–7:
15//!
16//! - PR 3: skeleton — trait, config, errors, stub.
17//! - PR 4: single-pass splice + container-policy validation +
18//!   doc-title/doc-annotation conversion + origin stamping + root-escape
19//!   check.
20//! - PR 5: recursive resolution into included files + cycle detection
21//!   (chain stack) + depth limit. Each loaded file gets walked in its OWN
22//!   directory, so relative paths inside an included file resolve from
23//!   that file's directory, not the entry's.
24//! - PR 6: origin-aware reference helpers. [`resolve_file_reference`]
25//!   resolves a `ReferenceType::File` target from the authoring file's
26//!   directory using `Range.origin_path`.
27//!   `Document::find_annotation_by_label_in_origin` scopes footnote
28//!   lookups to the file the reference was authored in.
29//! - PR 7 (this PR): [`FsLoader`] — production loader that reads from the
30//!   filesystem with `std::fs::read_to_string`. CLI wires the resolver
31//!   into `lex convert` and `lex inspect` (default-on, opt-out via
32//!   `--no-includes`); `lex format` never expands.
33//!
34//! # Layering
35//!
36//! Of all of lex-core, only [`FsLoader`] references `std::fs`. The
37//! resolver itself does no I/O — it always goes through the [`Loader`]
38//! trait. Callers can swap loaders to keep the resolver sandboxed:
39//!
40//! - The LSP wraps [`FsLoader`] with file-watch invalidation (PR 8).
41//! - WASM builds provide a JS-backed loader instead of [`FsLoader`].
42//! - Tests use [`MemoryLoader`] (gated behind `test-support`).
43//!
44//! For tests, lex-core itself ships [`MemoryLoader`] gated behind the
45//! `test-support` cargo feature. It is not intended for production use.
46
47// `IncludeError` carries diagnostic context (paths, source ranges,
48// handler messages) on every variant; the `result_large_err` lint
49// would have us box the whole error or split it into a thinner shape
50// just to satisfy the size heuristic. The enum is already part of
51// the public API and the error path is rare; suppress the lint for
52// this module rather than churn the public surface.
53#![allow(clippy::result_large_err)]
54
55use crate::lex::assembling::stages::{ApplyTableConfig, NormalizeLabels};
56use crate::lex::assembling::AttachAnnotations;
57use crate::lex::ast::elements::container::GeneralContainer;
58use crate::lex::ast::elements::content_item::ContentItem;
59use crate::lex::ast::elements::session::Session;
60use crate::lex::ast::range::Range;
61use crate::lex::ast::Document;
62use crate::lex::transforms::Runnable;
63use lex_extension::handler::HandlerError;
64use lex_extension_host::registry::Registry;
65use std::path::{Path, PathBuf};
66use std::sync::Arc;
67
/// Settings that govern a single include-resolution pass.
#[derive(Debug, Clone)]
pub struct ResolveConfig {
    /// Root directory every include path must stay under. An include that
    /// resolves outside of it is reported as [`IncludeError::RootEscape`].
    ///
    /// Supply an **absolute** path. Lexical normalization treats `.` and
    /// `..` against an empty buffer as no-ops, so a relative or
    /// unnormalized root weakens the root-escape prefix check. Callers
    /// (CLI, LSP) are expected to canonicalize the root before building a
    /// `ResolveConfig`.
    pub root: PathBuf,
    /// Upper bound on include nesting. Defaults to
    /// [`ResolveConfig::DEFAULT_MAX_DEPTH`] (8). Reaching the bound is
    /// reported as an error; content is never silently truncated.
    pub max_depth: usize,
    /// Upper bound on the number of `lex.include` annotations resolved
    /// over the whole tree (depth × breadth). Defaults to
    /// [`ResolveConfig::DEFAULT_MAX_TOTAL_INCLUDES`] (1000).
    ///
    /// This is the fan-out cap: `max_depth` limits how long a chain can
    /// get but says nothing about breadth, so a document with a huge
    /// number of depth-1 includes would pass the depth check and could
    /// still exhaust memory in the resolver / LSP / CI. Reaching the
    /// bound is reported as an error; content is never silently
    /// truncated.
    pub max_total_includes: usize,
}

impl ResolveConfig {
    /// Default include-depth bound. Deep enough for a reasonable
    /// atomization strategy (aggregator → per-chapter → per-section) while
    /// keeping the resolver's worst-case work predictable.
    pub const DEFAULT_MAX_DEPTH: usize = 8;

    /// Default bound on the total include count (DoS protection). Sized
    /// for a book-length document made of many small fragments, yet small
    /// enough to contain adversarial fan-out.
    pub const DEFAULT_MAX_TOTAL_INCLUDES: usize = 1000;

    /// Build a config rooted at `root` with both limits set to their
    /// defaults.
    pub fn with_root(root: PathBuf) -> Self {
        let max_depth = Self::DEFAULT_MAX_DEPTH;
        let max_total_includes = Self::DEFAULT_MAX_TOTAL_INCLUDES;
        Self {
            root,
            max_depth,
            max_total_includes,
        }
    }
}
115
116/// A pluggable source-text loader.
117///
118/// Implementations decide where bytes come from (filesystem, in-memory map,
119/// virtual filesystem, content-addressed store, …). lex-core never references
120/// `std::fs` directly through this trait; that keeps the resolver pure and
121/// usable in WASM, sandboxes, and unit tests.
122pub trait Loader {
123    /// Load the source text for `path` and return both the contents and a
124    /// canonical identity for the loaded resource. The path is what the
125    /// resolver decided on after applying the rules in §4 of the proposal.
126    ///
127    /// `LoadedFile::canonical_path` is the loader's authoritative identity
128    /// for the resource. For [`FsLoader`] this is the filesystem-canonical
129    /// path (symlinks resolved, case-folded if the underlying FS is
130    /// case-insensitive); for [`MemoryLoader`] it's the lookup key (since
131    /// memory loaders have no symlinks). The resolver uses this for cycle
132    /// detection and for stamping `Range.origin_path` on the loaded tree.
133    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError>;
134}
135
/// What a [`Loader::load`] call yields on success.
#[derive(Debug, Clone)]
pub struct LoadedFile {
    /// Source text of the loaded resource.
    pub source: String,
    /// Authoritative identity the loader assigns to the resource; see
    /// [`Loader::load`] for how each loader picks it.
    pub canonical_path: PathBuf,
}
145
/// Failure modes of a [`Loader`].
#[derive(Debug, Clone)]
pub enum LoadError {
    /// Nothing exists at the requested path.
    NotFound { path: PathBuf },
    /// The resource exists but lies outside the boundary the loader is
    /// allowed to serve. The resolver already normalizes `..` lexically;
    /// loaders backed by a real filesystem must check again after
    /// canonicalization, because a symlink can lead outside the boundary
    /// even when the lexical path stays inside it.
    OutsideRoot { path: PathBuf, root: PathBuf },
    /// The resource exists but is bigger than the loader's configured
    /// limit. `size` and `limit` are byte counts. The resolver maps this
    /// to [`IncludeError::FileTooLarge`], adding the offending
    /// annotation's site.
    TooLarge {
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// An I/O failure from the backing store (or the virtual-filesystem
    /// equivalent).
    Io { path: PathBuf, message: String },
}

impl std::fmt::Display for LoadError {
    /// Write the one-line, human-readable message for this error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotFound { path } => {
                let shown = path.display();
                write!(f, "include not found: {}", shown)
            }
            Self::OutsideRoot { path, root } => {
                let shown_path = path.display();
                let shown_root = root.display();
                write!(
                    f,
                    "include path {} resolves outside loader root {}",
                    shown_path, shown_root
                )
            }
            Self::TooLarge { path, size, limit } => {
                let shown = path.display();
                write!(
                    f,
                    "include file {} is {size} bytes, exceeds limit of {limit} bytes",
                    shown
                )
            }
            Self::Io { path, message } => {
                let shown = path.display();
                write!(f, "io error reading {}: {message}", shown)
            }
        }
    }
}

// No underlying `source()`: every variant carries its context as plain data.
impl std::error::Error for LoadError {}
192
193/// Errors the include resolver can produce.
194#[derive(Debug, Clone)]
195pub enum IncludeError {
196    /// An include chain looped back on itself. `chain` is the resolution
197    /// stack at the moment the duplicate `path` was about to be pushed,
198    /// in source-order (entry first, deepest last). `include_site` is the
199    /// range of the offending `lex.include` annotation in its host file —
200    /// useful for diagnostics that highlight the exact line.
201    Cycle {
202        include_site: Range,
203        path: PathBuf,
204        chain: Vec<PathBuf>,
205    },
206    /// The include depth exceeded [`ResolveConfig::max_depth`]. `chain`
207    /// shows the resolution stack at the moment of failure, in source
208    /// order. `include_site` is the range of the offending
209    /// `lex.include` annotation in its host file.
210    DepthExceeded {
211        include_site: Range,
212        limit: usize,
213        chain: Vec<PathBuf>,
214    },
215    /// The total number of includes resolved across the document
216    /// exceeded [`ResolveConfig::max_total_includes`]. Bounds adversarial
217    /// fan-out (which `max_depth` alone does not). `include_site` is the
218    /// `lex.include` annotation that pushed the count past the limit.
219    TotalIncludesExceeded { include_site: Range, limit: usize },
220    /// The included file's size exceeded the loader's configured limit.
221    /// Surfaced by loaders that read from a real filesystem (FsLoader)
222    /// to bound memory allocation per include. `include_site` is the
223    /// offending annotation; `size` and `limit` are in bytes.
224    FileTooLarge {
225        include_site: Range,
226        path: PathBuf,
227        size: u64,
228        limit: u64,
229    },
230    /// A path resolved outside the configured [`ResolveConfig::root`].
231    RootEscape { path: PathBuf, root: PathBuf },
232    /// The include `src` was a platform-absolute filesystem path
233    /// (e.g. Windows `C:\foo`, `\\server\share`, `\foo`). The spec
234    /// forbids absolute filesystem paths from entering the
235    /// resolution pipeline; the *root-absolute* form (leading `/`
236    /// resolved against the includes root) is the only spec-allowed
237    /// way to write a path that doesn't start from the host's
238    /// directory. On Unix the only thing that's `Path::is_absolute()`
239    /// is a leading `/`, which is consumed by the root-absolute
240    /// branch first; this variant therefore only fires in practice
241    /// for Windows-shaped absolute paths.
242    AbsolutePath { path: PathBuf },
243    /// The loader could not find or read the included file. `include_site`
244    /// is the range of the offending `lex.include` annotation in its host
245    /// file, so editors can squiggle the line that asked for the missing
246    /// file rather than the document head.
247    NotFound { include_site: Range, path: PathBuf },
248    /// The loader returned text that the parser rejected.
249    ParseFailed { path: PathBuf, message: String },
250    /// The included file's content is not legal in the include site's
251    /// parent container.
252    ///
253    /// Today this only occurs when an included file has top-level Sessions
254    /// and the include site is inside a `GeneralContainer` (Definition,
255    /// ListItem, or another Annotation's body). The `violation` field
256    /// names the offending content kind (e.g. `"Sessions"`) so future
257    /// container/policy combinations can reuse this variant without a
258    /// breaking change.
259    ContainerPolicy {
260        include_site: Range,
261        container: &'static str,
262        file: PathBuf,
263        violation: &'static str,
264    },
265    /// Loader propagated a non-`NotFound` I/O error.
266    LoaderIo { path: PathBuf, message: String },
267    /// `lex.include` annotation was missing the mandatory `src=` parameter.
268    MissingSrc { include_site: Range },
269    /// A registered handler returned an error the pass could not map
270    /// onto a more specific variant — typically a third-party
271    /// namespace's resolve hook surfacing an internal failure, or an
272    /// unrecognised handler-defined code from `lex.*` built-ins. The
273    /// `code` is the string identifier the registry attaches to the
274    /// diagnostic (`"handler.internal"`, `"handler.custom"`, …).
275    HandlerFailed {
276        include_site: Range,
277        label: String,
278        code: String,
279        message: String,
280    },
281}
282
283impl std::fmt::Display for IncludeError {
284    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
285        match self {
286            IncludeError::Cycle { path, chain, .. } => {
287                let chain_display: Vec<String> =
288                    chain.iter().map(|p| p.display().to_string()).collect();
289                write!(
290                    f,
291                    "include cycle: {} (chain: {})",
292                    path.display(),
293                    chain_display.join(" -> ")
294                )
295            }
296            IncludeError::DepthExceeded { limit, chain, .. } => {
297                let chain_display: Vec<String> =
298                    chain.iter().map(|p| p.display().to_string()).collect();
299                write!(
300                    f,
301                    "include depth exceeded limit of {limit} (chain: {})",
302                    chain_display.join(" -> ")
303                )
304            }
305            IncludeError::TotalIncludesExceeded { limit, .. } => {
306                write!(f, "total include count exceeded limit of {limit}")
307            }
308            IncludeError::FileTooLarge {
309                path, size, limit, ..
310            } => {
311                write!(
312                    f,
313                    "included file {} is {size} bytes, exceeds limit of {limit} bytes",
314                    path.display()
315                )
316            }
317            IncludeError::RootEscape { path, root } => write!(
318                f,
319                "include path {} escapes resolution root {}",
320                path.display(),
321                root.display()
322            ),
323            IncludeError::AbsolutePath { path } => write!(
324                f,
325                "include src {} is a platform-absolute path; \
326                 the spec forbids absolute filesystem paths — use a relative path \
327                 (chapters/01.lex) or a root-absolute path (/shared/01.lex)",
328                path.display()
329            ),
330            IncludeError::NotFound { path, .. } => {
331                write!(f, "include not found: {}", path.display())
332            }
333            IncludeError::ParseFailed { path, message } => {
334                write!(f, "failed to parse {}: {message}", path.display())
335            }
336            IncludeError::ContainerPolicy {
337                container,
338                file,
339                violation,
340                ..
341            } => write!(
342                f,
343                "included file {} contains {} but include site is inside {} \
344                 (which does not allow {})",
345                file.display(),
346                violation,
347                container,
348                violation
349            ),
350            IncludeError::LoaderIo { path, message } => {
351                write!(f, "loader error reading {}: {message}", path.display())
352            }
353            IncludeError::MissingSrc { .. } => {
354                write!(f, "lex.include annotation missing required src= parameter")
355            }
356            IncludeError::HandlerFailed {
357                label,
358                code,
359                message,
360                ..
361            } => write!(f, "extension handler `{label}` failed ({code}): {message}"),
362        }
363    }
364}
365
366impl std::error::Error for IncludeError {}
367
368// No `From<LoadError>` impl: `IncludeError::NotFound` carries the include
369// site (the `lex.include` annotation's range), which a loader doesn't know
370// about. Callers map `LoadError` explicitly at the call site, where the
371// site is available.
372
/// The kind of container an include site lives in. It selects the
/// splice-time policy check; the only check today is "no Sessions in a
/// `GeneralContainer`".
#[derive(Debug, Clone, Copy)]
enum ContainerKind {
    /// `Document.root.children` or `Session.children` — anything may be
    /// spliced here.
    Session,
    /// `Definition.children` — a `GeneralContainer`.
    Definition,
    /// `Annotation.children` — a `GeneralContainer`.
    AnnotationBody,
    /// `ListItem.children` — a `GeneralContainer`.
    ListItem,
}

impl ContainerKind {
    /// Human-readable container name used in diagnostics.
    fn name(self) -> &'static str {
        match self {
            Self::Session => "Session",
            Self::Definition => "Definition",
            Self::AnnotationBody => "Annotation body",
            Self::ListItem => "ListItem",
        }
    }

    /// Whether Sessions may be spliced into this container. Only the
    /// `Session` kind accepts them.
    fn allows_sessions(self) -> bool {
        match self {
            Self::Session => true,
            Self::Definition | Self::AnnotationBody | Self::ListItem => false,
        }
    }
}
401
/// Fixed ceiling on resolution depth that applies even when
/// [`ResolveConfig::max_depth`] is configured higher. It guarantees the
/// resolver terminates under adversarial varying-position recursion: a
/// handler that returns content whose invocation site differs on every
/// iteration never repeats a cycle key, so only a depth bound stops it.
pub const KERNEL_DEPTH_BACKSTOP: usize = 32;
408
409/// Resolve every `hooks.resolve = true` labelled annotation starting
410/// from `source`, dispatching through `registry`, and recursively
411/// processing the spliced content.
412///
413/// `source_path` identifies the entry-point file. It is used to
414/// (a) stamp `Range.origin_path` on every node so downstream code
415/// (file-ref resolution, diagnostics, LSP goto) can report locations
416/// against the authoring file, and (b) provide the host directory
417/// the built-in `lex.include` handler resolves relative `src=` paths
418/// against (via `LabelCtx.node.origin`). When `None`, origin stamping
419/// is skipped on the entry and the handler resolves relative paths
420/// against `config.root`.
421///
422/// # Generic dispatch
423///
424/// Every label whose schema declares `hooks.resolve = true` flows
425/// through the same path: build a [`LabelCtx`] from the annotation,
426/// call [`Registry::dispatch_resolve_raw`], decode the returned
427/// [`WireNode`] back into typed [`ContentItem`]s via
428/// [`crate::lex::wire::from_wire_node`], and splice in place. The
429/// built-in `lex.include` handler is registered the same way as any
430/// third-party namespace.
431///
432/// # Pre/post-attachment
433///
434/// Internally this re-parses the entry source *without* annotation
435/// attachment so labelled annotations stay visible as standalone
436/// children. The handler does its own `parse_no_attach` for loaded
437/// content. After all splices, [`AttachAnnotations`] runs once on
438/// the merged tree.
439///
440/// # Recursion + cycle detection
441///
442/// Cycle detection keys on `(label, origin_path, start_position)` of
443/// the invocation site. A handler that returns content containing
444/// another invocation at the same source position is caught
445/// immediately. A handler that varies the invocation position each
446/// iteration terminates at `min(config.max_depth, KERNEL_DEPTH_BACKSTOP)`
447/// with `IncludeError::DepthExceeded`. The total-includes counter
448/// caps adversarial fan-out independent of depth.
449pub fn resolve_from_source(
450    source: &str,
451    source_path: Option<PathBuf>,
452    config: &ResolveConfig,
453    registry: &Registry,
454) -> Result<Document, IncludeError> {
455    let entry_origin = source_path.as_ref().map(|p| Arc::new(p.clone()));
456
457    // Run the SHARED parser front-end (the same one `run_string_to_ast`
458    // uses): source → assembled Document (annotations still standalone)
459    // plus the reference-line pre-pass results. This is the de-duplication
460    // fix for lex#722 — before this, the resolver had its own hand-rolled
461    // copy of the front-end (`parse_without_annotation_attachment`) that
462    // never ran the reference-line pre-pass, so whole-element anchors were
463    // silently dropped on the default `lexd <file> --to <fmt>` path. Now
464    // there is exactly one front-end and it can't drift.
465    let (mut doc, prepass) = crate::lex::transforms::standard::parse_to_attached_root(
466        source.to_string(),
467    )
468    .map_err(|e| IncludeError::ParseFailed {
469        path: source_path.clone().unwrap_or_default(),
470        message: e.to_string(),
471    })?;
472
473    // Carry the entry file's reference lines (whole-element anchors) onto
474    // the document so the babel serializers / LSP documentLink can render
475    // them. These ranges are in the entry source's original coordinates,
476    // which is correct for the entry file. Reference lines that live
477    // *inside* included files are handled separately after splicing (see
478    // below); they are NOT in `prepass`, which only saw the entry source.
479    doc.reference_lines = prepass.reference_lines;
480    doc.reference_line_diagnostics = prepass.diagnostics;
481
482    if let Some(origin) = entry_origin.as_ref() {
483        stamp_doc(&mut doc, origin);
484    }
485
486    // Normalise labels in the entry source BEFORE the resolve walk so
487    // shortcut spellings (`:: include ::`, `:: image ::`, …) are
488    // rewritten to their canonical form. The resolve dispatcher keys
489    // on `registry.schema_for(label)` with the canonical spelling, so
490    // without this an `:: include src=... ::` annotation would be
491    // skipped because no schema is registered under the bare alias.
492    //
493    // Permissive mode: unknown labels are left as-is rather than
494    // erroring. The standard parse pipeline enforces strict-mode
495    // namespace policy (`STRING_TO_AST`); the resolve entry point is
496    // a downstream stage that just needs the shortcut table applied
497    // so dispatch finds the right handler.
498    let mut doc =
499        NormalizeLabels::permissive()
500            .run(doc)
501            .map_err(|e| IncludeError::ParseFailed {
502                path: source_path.clone().unwrap_or_default(),
503                message: format!("label normalisation failed: {e}"),
504            })?;
505
506    let mut chain: Vec<ResolveKey> = Vec::new();
507    let mut state = ResolverState {
508        config,
509        registry,
510        chain: &mut chain,
511        depth: 0,
512        total_resolved: 0,
513    };
514
515    splice_in_session_container(doc.root.children.as_mut_vec(), &mut state)?;
516
517    let doc = AttachAnnotations::new()
518        .run(doc)
519        .map_err(|e| IncludeError::ParseFailed {
520            path: source_path.clone().unwrap_or_default(),
521            message: format!("annotation attachment failed: {e}"),
522        })?;
523
524    // Re-normalise after splicing. Each included file is parsed via
525    // `parse_no_attach` (no normalisation), so shortcut labels in the
526    // spliced content — e.g. `:: image src=... ::` inside an included
527    // chapter — need rewriting before downstream IR/format passes can
528    // dispatch them.
529    let doc = NormalizeLabels::permissive()
530        .run(doc)
531        .map_err(|e| IncludeError::ParseFailed {
532            path: source_path.clone().unwrap_or_default(),
533            message: format!("label normalisation failed: {e}"),
534        })?;
535
536    // Apply table configuration so `:: table header=N align=... ::`
537    // annotations attached to tables (here or in spliced content) take
538    // effect — matches the order the standard pipeline runs them.
539    let doc = ApplyTableConfig::new()
540        .run(doc)
541        .map_err(|e| IncludeError::ParseFailed {
542            path: source_path.unwrap_or_default(),
543            message: format!("table config application failed: {e}"),
544        })?;
545
546    Ok(doc)
547}
548
549// ============================================================================
550// Splicing
551// ============================================================================
552
553/// One frame on the resolve-pass cycle stack. Two invocations at the
554/// same `(label, origin, start)` position are a cycle, regardless of
555/// what parameters either invocation uses — a handler that varies
556/// params per call (random IDs, timestamps) cannot defeat the
557/// detector by changing param values.
558#[derive(Debug, Clone, PartialEq)]
559struct ResolveKey {
560    label: String,
561    /// `Range.origin_path` of the annotation — the file the
562    /// invocation was authored in. `None` when stamping was skipped
563    /// (e.g., entry source loaded from a string with no path).
564    origin: Option<PathBuf>,
565    start: crate::lex::ast::range::Position,
566}
567
568impl ResolveKey {
569    fn from_annotation(a: &crate::lex::ast::elements::annotation::Annotation) -> Self {
570        Self {
571            label: a.data.label.value.clone(),
572            origin: a.location.origin_path.as_ref().map(|p| (**p).clone()),
573            start: a.location.start,
574        }
575    }
576}
577
578/// Per-resolution state threaded through the recursive walker. Keeps the
579/// signatures of the splice/process functions short and ensures
580/// `chain`/`depth` are updated in lock-step (push/pop, +1/back-out) at
581/// each invocation.
582struct ResolverState<'a> {
583    config: &'a ResolveConfig,
584    registry: &'a Registry,
585    /// Active resolution stack of `(label, origin, position)` keys.
586    /// Pushed when we begin dispatching for an invocation and popped
587    /// when its splice subtree is fully resolved. A push that finds
588    /// the same key already on the stack is a cycle.
589    chain: &'a mut Vec<ResolveKey>,
590    /// Number of dispatch hops from the entry point. Each recursion
591    /// increments by 1. Hitting `config.max_depth` or the
592    /// [`KERNEL_DEPTH_BACKSTOP`] (whichever is lower) is an error.
593    depth: usize,
594    /// Total invocations resolved across the entire walk
595    /// (depth × breadth). Incremented on every successful dispatch.
596    /// Hitting `config.max_total_includes` aborts with
597    /// `TotalIncludesExceeded`.
598    total_resolved: usize,
599}
600
601fn splice_in_session_container(
602    children: &mut Vec<ContentItem>,
603    state: &mut ResolverState<'_>,
604) -> Result<(), IncludeError> {
605    // Post-order: recurse into nested containers first, splice this
606    // container's invocations second. Recursion happens inside
607    // `process_resolves` for any spliced subtree, so that subtree
608    // is never re-walked at the parent level.
609    recurse_into_children(children, state)?;
610    process_resolves(children, state, ContainerKind::Session)
611}
612
613fn splice_in_general_container(
614    container: &mut GeneralContainer,
615    state: &mut ResolverState<'_>,
616    kind: ContainerKind,
617) -> Result<(), IncludeError> {
618    recurse_into_children(container.as_mut_vec(), state)?;
619    process_resolves(container.as_mut_vec(), state, kind)
620}
621
622/// Walk the children of a container, dispatch every annotation whose
623/// schema declares `hooks.resolve = true` through the registry, and
624/// splice the returned content in place of the annotation. Recurses
625/// into the spliced content so nested invocations resolve too.
626// Allow &mut Vec because `splice` needs Vec-specific operations.
627#[allow(clippy::ptr_arg)]
628fn process_resolves(
629    children: &mut Vec<ContentItem>,
630    state: &mut ResolverState<'_>,
631    kind: ContainerKind,
632) -> Result<(), IncludeError> {
633    // Collect indices of annotations whose schema has hooks.resolve.
634    let resolve_indices: Vec<usize> = children
635        .iter()
636        .enumerate()
637        .filter_map(|(i, item)| match item {
638            ContentItem::Annotation(a) => {
639                let label = &a.data.label.value;
640                if state
641                    .registry
642                    .schema_for(label)
643                    .map(|s| s.hooks.resolve)
644                    .unwrap_or(false)
645                {
646                    Some(i)
647                } else {
648                    None
649                }
650            }
651            _ => None,
652        })
653        .collect();
654
655    for i in resolve_indices.into_iter().rev() {
656        let annotation = match &children[i] {
657            ContentItem::Annotation(a) => a.clone(),
658            _ => unreachable!("index came from resolve filter"),
659        };
660
661        match resolve_one_invocation(&annotation, state, kind)? {
662            ResolveOutcome::Spliced(splice_items) => {
663                // Expansion replaces the directive with the included content. The
664                // `lex.include` annotation is consumed — drop it. (It used to be
665                // kept in the stream as provenance, relying on the serializer
666                // dropping attached annotations; now that the serializer emits
667                // them (lex#682), keeping it would leak `:: lex.include ::` into
668                // expanded output. Origin provenance is tracked on
669                // `Range.origin_path`, not this node.)
670                children.splice(i..=i, splice_items);
671            }
672            ResolveOutcome::Unexpanded => {
673                // Handler opted out of expanding this invocation. The
674                // annotation stays in place, but its body wasn't
675                // walked by `recurse_into_children` (that walker
676                // skips resolve-hooked annotations to avoid double-
677                // resolution). Walk the body now so any nested
678                // invocations inside the unexpanded annotation get
679                // resolved on the way back up.
680                let mut owned = annotation;
681                splice_in_general_container(
682                    &mut owned.children,
683                    state,
684                    ContainerKind::AnnotationBody,
685                )?;
686                children[i] = ContentItem::Annotation(owned);
687            }
688        }
689    }
690
691    Ok(())
692}
693
694/// Outcome of dispatching a single resolve-hooked annotation. The
695/// pass needs to distinguish between "handler returned content,
696/// splice it in" and "handler opted out, leave the annotation
697/// alone": the second case still requires walking the annotation's
698/// body for nested invocations because `recurse_into_children`
699/// otherwise skips resolve-hooked annotations to prevent double-
700/// resolution.
701enum ResolveOutcome {
702    Spliced(Vec<ContentItem>),
703    Unexpanded,
704}
705
706/// Dispatch a single resolve-hooked annotation through the registry,
707/// decode the returned `WireNode` back into typed children, then
708/// recursively walk the splice items so nested invocations resolve
709/// before the splice is placed into the parent container.
710///
711/// Returns [`ResolveOutcome::Unexpanded`] when the handler returned
712/// `Ok(None)` (third-party handlers can opt out of expanding a
713/// particular invocation). The caller is then responsible for
714/// walking the annotation's body for nested invocations — the
715/// resolve walker normally skips resolve-hooked annotations'
716/// bodies.
717fn resolve_one_invocation(
718    annotation: &crate::lex::ast::elements::annotation::Annotation,
719    state: &mut ResolverState<'_>,
720    parent_kind: ContainerKind,
721) -> Result<ResolveOutcome, IncludeError> {
722    let label = &annotation.data.label.value;
723    let key = ResolveKey::from_annotation(annotation);
724
725    // Cycle check on (label, origin, start) of the invocation site.
726    if state.chain.contains(&key) {
727        return Err(IncludeError::Cycle {
728            include_site: annotation.location.clone(),
729            path: key.origin.clone().unwrap_or_default(),
730            chain: state
731                .chain
732                .iter()
733                .map(|k| k.origin.clone().unwrap_or_default())
734                .collect(),
735        });
736    }
737
738    // Depth check. The effective limit is the lower of the
739    // user-facing `config.max_depth` (default 8) and the hard
740    // [`KERNEL_DEPTH_BACKSTOP`] (32, fixed). The kernel backstop
741    // exists for adversarial varying-position recursion that the
742    // cycle key can't catch — even if a user bumps `max_depth`
743    // higher than 32 for legitimate deep atomization, the backstop
744    // still terminates. The error reports `effective_depth_limit`
745    // (the actual cap that fired) rather than `config.max_depth`,
746    // so when the backstop is the binding limit the user sees `32`
747    // and not the (higher) config value.
748    let effective_depth_limit = state.config.max_depth.min(KERNEL_DEPTH_BACKSTOP);
749    if state.depth >= effective_depth_limit {
750        return Err(IncludeError::DepthExceeded {
751            include_site: annotation.location.clone(),
752            limit: effective_depth_limit,
753            chain: state
754                .chain
755                .iter()
756                .map(|k| k.origin.clone().unwrap_or_default())
757                .collect(),
758        });
759    }
760
761    // Total-count check before dispatch.
762    if state.total_resolved >= state.config.max_total_includes {
763        return Err(IncludeError::TotalIncludesExceeded {
764            include_site: annotation.location.clone(),
765            limit: state.config.max_total_includes,
766        });
767    }
768
769    let ctx = build_label_ctx(annotation);
770
771    let wire_node = match state.registry.dispatch_resolve_raw(&ctx) {
772        Ok(Some(node)) => node,
773        Ok(None) => {
774            // Handler returned "nothing to splice" — leave the
775            // annotation in place. The caller still needs to walk
776            // its body for nested invocations (built-in lex.include
777            // never returns None; this path is reachable only via
778            // third-party handlers that opt out per-invocation).
779            return Ok(ResolveOutcome::Unexpanded);
780        }
781        Err(handler_err) => {
782            return Err(handler_error_to_include_error(
783                &handler_err,
784                label,
785                &annotation.location,
786            ));
787        }
788    };
789
790    state.total_resolved += 1;
791
792    // Decode the wire payload into typed lex-core ContentItems.
793    let mut splice_items = decode_wire_to_items(&wire_node, label, &annotation.location)?;
794
795    // Recurse into the spliced subtree FIRST so nested resolve-hooked
796    // annotations are processed before the splice lands. Validation
797    // must wait until *after* this step: a nested invocation can
798    // splice in content (e.g. a top-level `Session` from a chained
799    // `lex.include`) that wasn't in the handler's original output,
800    // and the final shape is what has to satisfy the parent
801    // container's policy.
802    //
803    // The `IncludeError::ContainerPolicy.file` field describes the
804    // *spliced content's* source file (the file containing the
805    // disallowed shape), not the invocation site. Take it from the
806    // handler-returned wire payload's origin when present, falling
807    // back to the first decoded item's origin path if the wire
808    // payload didn't stamp a `Document` origin.
809    let included_path = wire_node_origin_pathbuf(&wire_node)
810        .or_else(|| splice_items_first_origin(&splice_items))
811        .unwrap_or_default();
812    state.chain.push(key);
813    let saved_depth = state.depth;
814    state.depth = saved_depth + 1;
815    let recurse_result = splice_in_session_container(&mut splice_items, state);
816    state.depth = saved_depth;
817    state.chain.pop();
818    recurse_result?;
819
820    // Container-policy validation: enforce no-Sessions inside
821    // `GeneralContainer` (Definition / Annotation body / ListItem).
822    // Runs against the post-recursion splice list so nested
823    // expansions can't smuggle disallowed shapes past the check.
824    validate_against_kind(
825        &splice_items,
826        parent_kind,
827        &annotation.location,
828        &included_path,
829    )?;
830
831    Ok(ResolveOutcome::Spliced(splice_items))
832}
833
834/// Build a [`LabelCtx`] from a lex-core [`Annotation`]. The body is
835/// derived from the annotation's children (parsed-Lex form), the
836/// params from `Annotation::data::parameters`, and the host node info
837/// from `Annotation::location`.
838fn build_label_ctx(
839    a: &crate::lex::ast::elements::annotation::Annotation,
840) -> lex_extension::wire::LabelCtx {
841    use crate::lex::wire::to_wire_node;
842    use lex_extension::wire::{AnnotationBody, LabelCtx, NodeRef};
843
844    let label = a.data.label.value.clone();
845    let params = {
846        // Pass *semantic* parameter values to handlers (quotes
847        // stripped, escape sequences resolved). Handlers consume
848        // params as JSON values, where there is no "quoted string"
849        // vs "unquoted token" distinction; only the decoded value
850        // is meaningful. The codec's `parameters_to_json` (used by
851        // `annotation_to_wire` for round-tripping annotation
852        // *content*) keeps the raw form to preserve source — the
853        // two paths intentionally differ.
854        let mut obj = serde_json::Map::with_capacity(a.data.parameters.len());
855        for p in &a.data.parameters {
856            obj.insert(p.key.clone(), serde_json::Value::String(p.unquoted_value()));
857        }
858        serde_json::Value::Object(obj)
859    };
860    let body = if a.children.is_empty() {
861        AnnotationBody::None
862    } else {
863        let wire_children: Vec<lex_extension::wire::WireNode> =
864            a.children.iter().map(to_wire_node).collect();
865        AnnotationBody::Lex {
866            children: wire_children,
867        }
868    };
869    let range = lex_extension::wire::Range::new(
870        lex_extension::wire::Position::new(
871            u32::try_from(a.location.start.line).unwrap_or(u32::MAX),
872            u32::try_from(a.location.start.column).unwrap_or(u32::MAX),
873        ),
874        lex_extension::wire::Position::new(
875            u32::try_from(a.location.end.line).unwrap_or(u32::MAX),
876            u32::try_from(a.location.end.column).unwrap_or(u32::MAX),
877        ),
878    );
879    let origin = a
880        .location
881        .origin_path
882        .as_ref()
883        .map(|p| p.to_string_lossy().into_owned());
884    LabelCtx {
885        label,
886        params,
887        body,
888        node: NodeRef {
889            kind: "annotation".into(),
890            range,
891            origin,
892        },
893    }
894}
895
896/// Convert a handler-returned [`WireNode`] back into a list of
897/// [`ContentItem`]s ready for splicing. `WireNode::Document` is
898/// unwrapped (its children become the splice list); any other root
899/// shape is wrapped as a single-item list.
900///
901/// `invocation_label` is the label whose handler produced `wire` —
902/// threaded through so wire-decode failures are attributed to the
903/// real namespace rather than a hardcoded `lex.include`. A
904/// third-party `acme.expand` handler that returns malformed wire
905/// will surface as `IncludeError::HandlerFailed { label:
906/// "acme.expand", .. }`.
907/// Lift a [`WireNode`]'s top-level `origin` field into a `PathBuf`
908/// when present. Used by the resolve pass to attribute
909/// container-policy errors to the *spliced content's* source file
910/// rather than the invocation site.
911fn wire_node_origin_pathbuf(node: &lex_extension::wire::WireNode) -> Option<PathBuf> {
912    use lex_extension::wire::WireNode as W;
913    let s = match node {
914        W::Document { origin, .. } => origin.as_deref(),
915        W::Session { origin, .. } => origin.as_deref(),
916        W::Definition { origin, .. } => origin.as_deref(),
917        W::Paragraph { origin, .. } => origin.as_deref(),
918        W::List { origin, .. } => origin.as_deref(),
919        W::Verbatim { origin, .. } => origin.as_deref(),
920        W::Table { origin, .. } => origin.as_deref(),
921        W::Annotation { origin, .. } => origin.as_deref(),
922        W::Blank { origin, .. } => origin.as_deref(),
923        _ => None,
924    };
925    s.map(PathBuf::from)
926}
927
928/// Fallback when `WireNode::Document.origin` is unset: walk the
929/// decoded splice list and return the first item that carries an
930/// origin. The interner from `from_wire_node` ensures every item
931/// shares one Arc per origin string, so iterating is cheap.
932fn splice_items_first_origin(items: &[ContentItem]) -> Option<PathBuf> {
933    for item in items {
934        let r = match item {
935            ContentItem::Paragraph(p) => &p.location,
936            ContentItem::Session(s) => &s.location,
937            ContentItem::Definition(d) => &d.location,
938            ContentItem::List(l) => &l.location,
939            ContentItem::ListItem(li) => &li.location,
940            ContentItem::Annotation(a) => &a.location,
941            ContentItem::VerbatimBlock(v) => &v.location,
942            ContentItem::VerbatimLine(vl) => &vl.location,
943            ContentItem::Table(t) => &t.location,
944            ContentItem::TextLine(tl) => &tl.location,
945            ContentItem::BlankLineGroup(blg) => &blg.location,
946        };
947        if let Some(arc) = r.origin_path.as_ref() {
948            return Some((**arc).clone());
949        }
950    }
951    None
952}
953
954fn decode_wire_to_items(
955    wire: &lex_extension::wire::WireNode,
956    invocation_label: &str,
957    include_site: &Range,
958) -> Result<Vec<ContentItem>, IncludeError> {
959    use crate::lex::wire::from_wire_node;
960
961    from_wire_node(wire).map_err(|e| IncludeError::HandlerFailed {
962        include_site: include_site.clone(),
963        label: invocation_label.to_string(),
964        code: "wire.decode".into(),
965        message: format!("decoding handler-returned wire payload failed: {e}"),
966    })
967}
968
969/// Map a [`HandlerError`] returned by the registry into the most
970/// specific [`IncludeError`] variant available. Codes in the
971/// `-32001..=-32005` range emitted by [`crate::lex::builtins::LexIncludeHandler`]
972/// translate back to their corresponding pre-extension-system
973/// variants so existing CLI/LSP error rendering and the integration
974/// test suite keep working unchanged. Unknown codes (third-party
975/// namespaces, future built-ins) surface as `HandlerFailed`.
976fn handler_error_to_include_error(
977    err: &HandlerError,
978    label: &str,
979    include_site: &Range,
980) -> IncludeError {
981    use crate::lex::builtins::include::{
982        CODE_ABSOLUTE_PATH, CODE_IO, CODE_MISSING_SRC, CODE_NOT_FOUND, CODE_OUTSIDE_ROOT,
983        CODE_PARSE_FAILED, CODE_TOO_LARGE,
984    };
985
986    match err {
987        HandlerError::Custom {
988            code,
989            message,
990            data,
991        } => match *code {
992            CODE_NOT_FOUND => IncludeError::NotFound {
993                include_site: include_site.clone(),
994                path: data_str(data, "path")
995                    .map(PathBuf::from)
996                    .unwrap_or_default(),
997            },
998            CODE_OUTSIDE_ROOT => IncludeError::RootEscape {
999                path: data_str(data, "path")
1000                    .map(PathBuf::from)
1001                    .unwrap_or_default(),
1002                root: data_str(data, "root")
1003                    .map(PathBuf::from)
1004                    .unwrap_or_default(),
1005            },
1006            CODE_TOO_LARGE => IncludeError::FileTooLarge {
1007                include_site: include_site.clone(),
1008                path: data_str(data, "path")
1009                    .map(PathBuf::from)
1010                    .unwrap_or_default(),
1011                size: data_u64(data, "size").unwrap_or(0),
1012                limit: data_u64(data, "limit").unwrap_or(0),
1013            },
1014            CODE_ABSOLUTE_PATH => IncludeError::AbsolutePath {
1015                path: data_str(data, "path")
1016                    .map(PathBuf::from)
1017                    .unwrap_or_default(),
1018            },
1019            CODE_IO => IncludeError::LoaderIo {
1020                path: data_str(data, "path")
1021                    .map(PathBuf::from)
1022                    .unwrap_or_default(),
1023                message: message.clone(),
1024            },
1025            CODE_MISSING_SRC => IncludeError::MissingSrc {
1026                include_site: include_site.clone(),
1027            },
1028            CODE_PARSE_FAILED => IncludeError::ParseFailed {
1029                path: data_str(data, "path")
1030                    .map(PathBuf::from)
1031                    .unwrap_or_default(),
1032                message: data_str(data, "message").unwrap_or_else(|| message.clone()),
1033            },
1034            other => IncludeError::HandlerFailed {
1035                include_site: include_site.clone(),
1036                label: label.to_string(),
1037                code: format!("handler.custom({other})"),
1038                message: message.clone(),
1039            },
1040        },
1041        HandlerError::Internal { message } => IncludeError::HandlerFailed {
1042            include_site: include_site.clone(),
1043            label: label.to_string(),
1044            code: "handler.internal".into(),
1045            message: message.clone(),
1046        },
1047        HandlerError::Unsupported { detail } => IncludeError::HandlerFailed {
1048            include_site: include_site.clone(),
1049            label: label.to_string(),
1050            code: "handler.unsupported".into(),
1051            message: detail.clone(),
1052        },
1053    }
1054}
1055
1056fn data_str(data: &Option<serde_json::Value>, key: &str) -> Option<String> {
1057    data.as_ref()?.get(key)?.as_str().map(str::to_string)
1058}
1059
1060fn data_u64(data: &Option<serde_json::Value>, key: &str) -> Option<u64> {
1061    data.as_ref()?.get(key)?.as_u64()
1062}
1063
1064#[allow(clippy::ptr_arg)]
1065fn recurse_into_children(
1066    children: &mut Vec<ContentItem>,
1067    state: &mut ResolverState<'_>,
1068) -> Result<(), IncludeError> {
1069    for item in children.iter_mut() {
1070        match item {
1071            ContentItem::Session(s) => {
1072                splice_in_session_container(s.children.as_mut_vec(), state)?;
1073            }
1074            ContentItem::Definition(d) => {
1075                splice_in_general_container(&mut d.children, state, ContainerKind::Definition)?;
1076            }
1077            ContentItem::Annotation(a) => {
1078                // Skip the body of annotations whose schema declares
1079                // `hooks.resolve = true` — those are dispatched at the
1080                // parent level by `process_resolves`. Walking their
1081                // bodies *here* would trip the resolve again on the
1082                // same invocation.
1083                //
1084                // The body is still walked when the resolve actually
1085                // runs: `process_resolves` calls
1086                // `resolve_one_invocation`, and the
1087                // [`ResolveOutcome::Spliced`] arm walks the splice
1088                // subtree (which replaces the annotation), while the
1089                // [`ResolveOutcome::Unexpanded`] arm explicitly
1090                // walks the kept annotation's body via
1091                // `splice_in_general_container`. So nested
1092                // resolve-hooked annotations inside an unexpanded
1093                // outer annotation are still reached.
1094                //
1095                // Non-resolve-hooked annotations recurse normally
1096                // here so their nested bodies get processed.
1097                let is_resolve_hooked = state
1098                    .registry
1099                    .schema_for(&a.data.label.value)
1100                    .map(|s| s.hooks.resolve)
1101                    .unwrap_or(false);
1102                if !is_resolve_hooked {
1103                    splice_in_general_container(
1104                        &mut a.children,
1105                        state,
1106                        ContainerKind::AnnotationBody,
1107                    )?;
1108                }
1109            }
1110            ContentItem::List(l) => {
1111                for li in l.items.as_mut_vec().iter_mut() {
1112                    if let ContentItem::ListItem(item) = li {
1113                        splice_in_general_container(
1114                            &mut item.children,
1115                            state,
1116                            ContainerKind::ListItem,
1117                        )?;
1118                    }
1119                }
1120            }
1121            _ => {}
1122        }
1123    }
1124    Ok(())
1125}
1126
1127fn validate_against_kind(
1128    items: &[ContentItem],
1129    kind: ContainerKind,
1130    site: &Range,
1131    file: &Path,
1132) -> Result<(), IncludeError> {
1133    if kind.allows_sessions() {
1134        return Ok(());
1135    }
1136    if items.iter().any(|i| matches!(i, ContentItem::Session(_))) {
1137        return Err(IncludeError::ContainerPolicy {
1138            include_site: site.clone(),
1139            container: kind.name(),
1140            file: file.to_path_buf(),
1141            violation: "Sessions",
1142        });
1143    }
1144    Ok(())
1145}
1146
1147// ============================================================================
1148// Path resolution
1149// ============================================================================
1150
1151/// Resolve a file-reference target string the same way the include
1152/// resolver resolves include paths.
1153///
1154/// Use this when consuming `ReferenceType::File { target }` (or any other
1155/// node-attached path) so that relative paths resolve from the *authoring*
1156/// file's directory, not from wherever the merged document happens to be
1157/// rooted. Pass `ref_origin` as the [`Range::origin_path`] of the inline's
1158/// containing node (or `None` if the node was never stamped — in that case
1159/// the path is treated as if authored at the root).
1160///
1161/// Behaviour matches the include resolver:
1162/// - Root-absolute targets (leading `/`) resolve under `root`.
1163/// - Other targets resolve relative to `ref_origin`'s parent (or `root`
1164///   when `ref_origin` is `None`).
1165/// - The result is lexically normalized and checked against `root` —
1166///   paths that escape it return `RootEscape`.
1167///
1168/// This is a sister to the resolver's internal `resolve_path` and shares
1169/// the same lexical-normalization caveat: it does not touch the filesystem.
1170pub fn resolve_file_reference(
1171    target: &str,
1172    ref_origin: Option<&Path>,
1173    root: &Path,
1174) -> Result<PathBuf, IncludeError> {
1175    let host_dir: PathBuf = ref_origin
1176        .and_then(|p| p.parent())
1177        .map(Path::to_path_buf)
1178        .unwrap_or_else(|| root.to_path_buf());
1179    resolve_path(target, &host_dir, root)
1180}
1181
1182fn resolve_path(src: &str, host_dir: &Path, root: &Path) -> Result<PathBuf, IncludeError> {
1183    let candidate = if let Some(rel) = src.strip_prefix('/') {
1184        // Root-absolute (Lex spec convention): leading `/` means "from
1185        // the resolution root", not "filesystem root".
1186        root.join(rel)
1187    } else {
1188        // Anything else must be a relative path. Reject inputs the
1189        // host platform would treat as absolute (Windows `C:\foo`,
1190        // `\\server\share`, `\foo`) up front: the spec forbids
1191        // platform-absolute paths from entering the resolution
1192        // pipeline. Without this, `host_dir.join(src)` would silently
1193        // discard `host_dir` because Rust's `PathBuf::join` replaces
1194        // the base when the joined path is absolute. The downstream
1195        // root-escape check would still catch the security side, but
1196        // we'd surface a misleading "escapes root" error instead of
1197        // "absolute paths not allowed", and we'd be relying on
1198        // `PathBuf::join`'s override semantics for the security
1199        // outcome rather than holding the line at the input boundary.
1200        if Path::new(src).is_absolute() {
1201            return Err(IncludeError::AbsolutePath {
1202                path: PathBuf::from(src),
1203            });
1204        }
1205        host_dir.join(src)
1206    };
1207    let normalized = lexical_normalize(&candidate);
1208    let canonical_root = lexical_normalize(root);
1209    if !normalized.starts_with(&canonical_root) {
1210        return Err(IncludeError::RootEscape {
1211            path: normalized,
1212            root: canonical_root,
1213        });
1214    }
1215    Ok(normalized)
1216}
1217
/// Normalize a path purely by its text: drop `.` components and fold
/// `name/..` pairs, without consulting the filesystem.
///
/// `std::fs::canonicalize` is not usable here because it requires the
/// path to exist, which in-memory loaders cannot guarantee. A lexical
/// form is enough for the resolver: it only needs a stable identity
/// for cycle detection and a uniform shape for the root-prefix check.
///
/// A `..` cancels the previous component only when that component is
/// a real directory name (`Component::Normal`). If the buffer is
/// empty, or ends in another `..` or a root/prefix marker, the `..`
/// is kept. That is what stops `../../etc/passwd` from collapsing to
/// `etc/passwd`: an unconditional `PathBuf::pop` would strip an
/// earlier `..`, producing a path that wrongly appears to sit under
/// the root. Every unmatched `..` that survives keeps the result
/// outside any sane root, so the escape check still fires.
fn lexical_normalize(p: &Path) -> PathBuf {
    use std::path::Component;

    let mut normalized = PathBuf::new();
    for component in p.components() {
        match component {
            Component::CurDir => continue,
            // `..` directly after a directory name cancels that name.
            Component::ParentDir
                if matches!(
                    normalized.components().next_back(),
                    Some(Component::Normal(_))
                ) =>
            {
                normalized.pop();
            }
            // Everything else — including a `..` that has nothing to
            // cancel — is appended verbatim.
            kept => normalized.push(kept.as_os_str()),
        }
    }
    normalized
}
1259
1260// ============================================================================
1261// Origin stamping
1262// ============================================================================
1263//
1264// Walk every node in a Document and set `Range.origin_path` on each
1265// `.location` field. The walk only stamps the *block-level* `.location`
1266// fields here; finer-grained inline ranges land in PR 6 when file-ref
1267// resolution starts consulting them.
1268
1269pub(crate) fn stamp_doc(doc: &mut Document, origin: &Arc<PathBuf>) {
1270    if let Some(title) = doc.title.as_mut() {
1271        title.location.origin_path = Some(Arc::clone(origin));
1272    }
1273    for ann in doc.annotations.iter_mut() {
1274        stamp_annotation(ann, origin);
1275    }
1276    stamp_session(&mut doc.root, origin);
1277}
1278
1279fn stamp_session(s: &mut Session, origin: &Arc<PathBuf>) {
1280    s.location.origin_path = Some(Arc::clone(origin));
1281    if let Some(loc) = s.title.location.as_mut() {
1282        loc.origin_path = Some(Arc::clone(origin));
1283    }
1284    for ann in s.annotations.iter_mut() {
1285        stamp_annotation(ann, origin);
1286    }
1287    for item in s.children.as_mut_vec().iter_mut() {
1288        stamp_item(item, origin);
1289    }
1290}
1291
1292fn stamp_annotation(
1293    a: &mut crate::lex::ast::elements::annotation::Annotation,
1294    origin: &Arc<PathBuf>,
1295) {
1296    a.location.origin_path = Some(Arc::clone(origin));
1297    a.data.location.origin_path = Some(Arc::clone(origin));
1298    for item in a.children.as_mut_vec().iter_mut() {
1299        stamp_item(item, origin);
1300    }
1301}
1302
1303fn stamp_item(item: &mut ContentItem, origin: &Arc<PathBuf>) {
1304    match item {
1305        ContentItem::Session(s) => stamp_session(s, origin),
1306        ContentItem::Annotation(a) => stamp_annotation(a, origin),
1307        ContentItem::Paragraph(p) => {
1308            p.location.origin_path = Some(Arc::clone(origin));
1309            for ann in p.annotations.iter_mut() {
1310                stamp_annotation(ann, origin);
1311            }
1312            for line in p.lines.iter_mut() {
1313                stamp_item(line, origin);
1314            }
1315        }
1316        ContentItem::List(l) => {
1317            l.location.origin_path = Some(Arc::clone(origin));
1318            for li in l.items.as_mut_vec().iter_mut() {
1319                stamp_item(li, origin);
1320            }
1321        }
1322        ContentItem::ListItem(li) => {
1323            li.location.origin_path = Some(Arc::clone(origin));
1324            for ann in li.annotations.iter_mut() {
1325                stamp_annotation(ann, origin);
1326            }
1327            for child in li.children.as_mut_vec().iter_mut() {
1328                stamp_item(child, origin);
1329            }
1330        }
1331        ContentItem::Definition(d) => {
1332            d.location.origin_path = Some(Arc::clone(origin));
1333            for ann in d.annotations.iter_mut() {
1334                stamp_annotation(ann, origin);
1335            }
1336            for child in d.children.as_mut_vec().iter_mut() {
1337                stamp_item(child, origin);
1338            }
1339        }
1340        ContentItem::VerbatimBlock(v) => {
1341            v.location.origin_path = Some(Arc::clone(origin));
1342        }
1343        ContentItem::VerbatimLine(vl) => {
1344            vl.location.origin_path = Some(Arc::clone(origin));
1345        }
1346        ContentItem::Table(t) => {
1347            t.location.origin_path = Some(Arc::clone(origin));
1348        }
1349        ContentItem::TextLine(tl) => {
1350            tl.location.origin_path = Some(Arc::clone(origin));
1351        }
1352        ContentItem::BlankLineGroup(b) => {
1353            b.location.origin_path = Some(Arc::clone(origin));
1354        }
1355    }
1356}
1357
1358// ============================================================================
1359// Parser glue
1360// ============================================================================
1361
1362/// Parse `source` into a Document but skip the annotation-attachment stage,
1363/// so include annotations are findable in container children lists.
1364///
1365/// Runs the shared parser front-end ([`parse_to_attached_root`]) — the same
1366/// one `run_string_to_ast` and `resolve_from_source` use — so the
1367/// reference-line pre-pass and any future front-end stage can never drift
1368/// from the standard path (lex#722). This is used by the built-in
1369/// `lex.include` handler to parse *included* files.
1370///
1371/// The returned document does **not** carry `reference_lines`: included
1372/// files reach the parent tree through the wire-AST codec, which has no
1373/// `reference_lines` field, so whole-element anchors authored *inside* an
1374/// included file are not propagated to the merged document (see the
1375/// follow-up note in `resolve_from_source`). The pre-pass still runs here
1376/// (it must, to keep a reference line from being mistaken for a structural
1377/// blank line in the included file's own parse), but its result is dropped
1378/// rather than emitted as a wrong-coordinate range in the merged document.
1379pub(crate) fn parse_no_attach(source: &str) -> Result<Document, String> {
1380    crate::lex::transforms::standard::parse_to_attached_root(source.to_string())
1381        .map(|(doc, _prepass)| doc)
1382        .map_err(|e| e.to_string())
1383}
1384
1385// ============================================================================
1386// Filesystem-backed loader
1387// ============================================================================
1388
/// Filesystem-backed [`Loader`]: reads files with
/// `std::fs::read_to_string`.
///
/// This is the production loader behind the CLI; the LSP layers
/// file-watch invalidation on top of it (PR 8). It is the single place
/// in lex-core that talks to `std::fs` — the resolver itself only ever
/// sees the [`Loader`] trait, which keeps the rest of the crate
/// sandbox- and WASM-friendly.
///
/// The loader remembers the resolution root and re-validates every
/// load against it *after* `fs::canonicalize`, so a symlink that
/// points outside the root is refused even though the purely lexical
/// check in [`resolve_path`] cannot detect it. Non-regular files
/// (devices, FIFOs, directories) are refused before any read, so the
/// loader cannot be made to block on `/dev/zero` or allocate against
/// an open device.
///
/// Error mapping:
/// - canonicalization fails (missing file, permission denied on a
///   parent, broken symlink, …) → [`LoadError::NotFound`]
/// - canonical path is not under the canonical root →
///   [`LoadError::OutsideRoot`]
/// - target is not a regular file → [`LoadError::Io`] with a clear
///   message
/// - any other I/O failure while reading → [`LoadError::Io`]
pub struct FsLoader {
    /// Resolution root in filesystem-canonical form, computed once in
    /// [`FsLoader::new`]. If the root cannot be canonicalized (for
    /// example it does not exist on disk) the caller's path is stored
    /// verbatim; the bounds check then never passes, which the user
    /// sees as `LoadError::OutsideRoot` rather than a silently
    /// disabled security check.
    canonical_root: PathBuf,
    /// Largest file, in bytes, the loader will read. Bigger files are
    /// reported as `LoadError::TooLarge` before any content is pulled
    /// into memory. Starts at [`FsLoader::DEFAULT_MAX_FILE_SIZE`].
    max_file_size: u64,
}

impl FsLoader {
    /// Default per-file size cap: 10 MiB — roomy for real (text-only)
    /// Lex documents, yet small enough to bound the memory a single
    /// include can claim when pointed at an adversarial 1 GB file.
    pub const DEFAULT_MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;

    /// Create a loader rooted at `root` using the default size cap.
    ///
    /// The root is stored in fs-canonical form (symlinks resolved);
    /// later loads check that the requested path's canonical form sits
    /// beneath it. When canonicalization fails, `root` is kept as
    /// given.
    pub fn new(root: PathBuf) -> Self {
        let canonical_root = match std::fs::canonicalize(&root) {
            Ok(resolved) => resolved,
            Err(_) => root,
        };
        Self {
            canonical_root,
            max_file_size: Self::DEFAULT_MAX_FILE_SIZE,
        }
    }

    /// Replace the per-file size cap (bytes), builder-style. Raise it
    /// for projects with genuinely large sources, or lower it for
    /// stricter sandboxes (e.g. an LSP serving untrusted content).
    pub fn with_max_file_size(self, max_file_size: u64) -> Self {
        Self {
            max_file_size,
            ..self
        }
    }
}
1450
1451impl Loader for FsLoader {
1452    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
1453        // 1. Canonicalize. Resolves symlinks and `..` segments against the
1454        //    real filesystem. NotFound / broken-symlink / permission errors
1455        //    all surface here.
1456        let canonical_path = std::fs::canonicalize(path).map_err(|e| match e.kind() {
1457            std::io::ErrorKind::NotFound => LoadError::NotFound {
1458                path: path.to_path_buf(),
1459            },
1460            _ => LoadError::Io {
1461                path: path.to_path_buf(),
1462                message: e.to_string(),
1463            },
1464        })?;
1465
1466        // 2. Bounds check against the *canonical* root. This is the
1467        //    actual security gate against symlink traversal — the lexical
1468        //    check in resolve_path can't see through symlinks.
1469        if !canonical_path.starts_with(&self.canonical_root) {
1470            return Err(LoadError::OutsideRoot {
1471                path: canonical_path,
1472                root: self.canonical_root.clone(),
1473            });
1474        }
1475
1476        // 3. Reject non-regular files. Without this, an attacker (with
1477        //    write access to the repo) could symlink an include target to
1478        //    `/dev/zero` or a FIFO and block / OOM the reader. The
1479        //    is_file() metadata call is a cheap sanity check.
1480        let meta = std::fs::metadata(&canonical_path).map_err(|e| LoadError::Io {
1481            path: canonical_path.clone(),
1482            message: e.to_string(),
1483        })?;
1484        if !meta.is_file() {
1485            return Err(LoadError::Io {
1486                path: canonical_path,
1487                message: "include target is not a regular file".to_string(),
1488            });
1489        }
1490
1491        // 4. Size cap. Bounds memory allocation per include against an
1492        //    adversarial 1 GB file before any bytes hit the heap.
1493        let size = meta.len();
1494        if size > self.max_file_size {
1495            return Err(LoadError::TooLarge {
1496                path: canonical_path,
1497                size,
1498                limit: self.max_file_size,
1499            });
1500        }
1501
1502        // 5. Read. By this point we know the path is a regular file under
1503        //    the canonical root and within the size cap; anything that
1504        //    fails here is a real I/O error worth surfacing.
1505        let source = std::fs::read_to_string(&canonical_path).map_err(|e| LoadError::Io {
1506            path: canonical_path.clone(),
1507            message: e.to_string(),
1508        })?;
1509
1510        Ok(LoadedFile {
1511            source,
1512            canonical_path,
1513        })
1514    }
1515}
1516
1517// ============================================================================
1518// Test fixtures (test-support feature + cfg(test))
1519// ============================================================================
1520
/// [`Loader`] that serves files from an in-memory map of path → source text.
#[cfg(any(test, feature = "test-support"))]
pub struct MemoryLoader {
    files: std::collections::HashMap<PathBuf, String>,
}

#[cfg(any(test, feature = "test-support"))]
impl MemoryLoader {
    /// Start with no files registered; populate with [`MemoryLoader::insert`].
    pub fn new() -> Self {
        let files = std::collections::HashMap::new();
        Self { files }
    }

    /// Store `contents` under `path`, replacing any earlier entry for the
    /// same path. Returns `&mut Self` so calls can be chained.
    pub fn insert<P, S>(&mut self, path: P, contents: S) -> &mut Self
    where
        P: Into<PathBuf>,
        S: Into<String>,
    {
        let key: PathBuf = path.into();
        let text: String = contents.into();
        self.files.insert(key, text);
        self
    }

    /// Build a loader from any iterator of `(path, contents)` pairs,
    /// inserting them in iteration order.
    pub fn from_pairs<I, P, S>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (P, S)>,
        P: Into<PathBuf>,
        S: Into<String>,
    {
        pairs
            .into_iter()
            .fold(Self::new(), |mut loader, (path, contents)| {
                loader.insert(path, contents);
                loader
            })
    }
}
1557
#[cfg(any(test, feature = "test-support"))]
impl Default for MemoryLoader {
    /// Same as [`MemoryLoader::new`].
    fn default() -> Self {
        MemoryLoader::new()
    }
}
1564
#[cfg(any(test, feature = "test-support"))]
impl Loader for MemoryLoader {
    /// Return a copy of the text registered under exactly `path`, or
    /// [`LoadError::NotFound`] when nothing is registered there.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
        // There are no symlinks in memory, so the lookup key doubles as the
        // canonical identity. The resolver's cycle detection compares
        // `LoadedFile::canonical_path` values; in tests those line up with
        // the lexically-normalized paths the resolver already produces.
        match self.files.get(path) {
            Some(text) => Ok(LoadedFile {
                source: text.clone(),
                canonical_path: path.to_path_buf(),
            }),
            None => Err(LoadError::NotFound {
                path: path.to_path_buf(),
            }),
        }
    }
}
1585
1586// ============================================================================
1587// Tests
1588// ============================================================================
1589
1590#[cfg(test)]
1591mod tests;