Skip to main content

lex_core/lex/
includes.rs

1//! Include resolution for Lex documents.
2//!
3//! This module turns `:: lex.include src="..." ::` annotations into spliced
4//! content from the referenced files. It is *opt-in*: callers that want the
5//! unresolved tree (the formatter, tree-sitter parity, editor tooling that
6//! displays include statements as authored) skip this pass entirely. The
7//! parser itself never touches the filesystem — all I/O goes through the
8//! injected [`Loader`] trait.
9//!
10//! See `comms/specs/proposals/includes.lex` for the full design.
11//!
12//! # Status
13//!
//! This module is being built up across PRs 3–7:
15//!
16//! - PR 3: skeleton — trait, config, errors, stub.
17//! - PR 4: single-pass splice + container-policy validation +
18//!   doc-title/doc-annotation conversion + origin stamping + root-escape
19//!   check.
20//! - PR 5: recursive resolution into included files + cycle detection
21//!   (chain stack) + depth limit. Each loaded file gets walked in its OWN
22//!   directory, so relative paths inside an included file resolve from
23//!   that file's directory, not the entry's.
24//! - PR 6: origin-aware reference helpers. [`resolve_file_reference`]
25//!   resolves a `ReferenceType::File` target from the authoring file's
26//!   directory using `Range.origin_path`.
27//!   `Document::find_annotation_by_label_in_origin` scopes footnote
28//!   lookups to the file the reference was authored in.
29//! - PR 7 (this PR): [`FsLoader`] — production loader that reads from the
30//!   filesystem with `std::fs::read_to_string`. CLI wires the resolver
31//!   into `lex convert` and `lex inspect` (default-on, opt-out via
32//!   `--no-includes`); `lex format` never expands.
33//!
34//! # Layering
35//!
36//! Of all of lex-core, only [`FsLoader`] references `std::fs`. The
37//! resolver itself does no I/O — it always goes through the [`Loader`]
38//! trait. Callers can swap loaders to keep the resolver sandboxed:
39//!
40//! - The LSP wraps [`FsLoader`] with file-watch invalidation (PR 8).
41//! - WASM builds provide a JS-backed loader instead of [`FsLoader`].
42//! - Tests use [`MemoryLoader`] (gated behind `test-support`).
43//!
44//! For tests, lex-core itself ships [`MemoryLoader`] gated behind the
45//! `test-support` cargo feature. It is not intended for production use.
46
47use crate::lex::assembling::AttachAnnotations;
48use crate::lex::ast::elements::container::GeneralContainer;
49use crate::lex::ast::elements::content_item::ContentItem;
50use crate::lex::ast::elements::paragraph::Paragraph;
51use crate::lex::ast::elements::session::Session;
52use crate::lex::ast::range::Range;
53use crate::lex::ast::Document;
54use crate::lex::transforms::Runnable;
55use std::path::{Path, PathBuf};
56use std::sync::Arc;
57
/// Configuration for the include resolution pass.
///
/// Compares by value (`PartialEq`/`Eq`) so callers and tests can assert on
/// the configuration they built.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ResolveConfig {
    /// Directory all include paths resolve under. Any include that
    /// canonicalizes outside this root is a [`IncludeError::RootEscape`].
    ///
    /// Must be an **absolute** path. Lexical normalization treats `.`
    /// and `..` against an empty buffer as no-ops; passing a relative
    /// or unnormalized root weakens the root-escape prefix check.
    /// Callers (CLI, LSP) should canonicalize the root before
    /// constructing `ResolveConfig`.
    pub root: PathBuf,
    /// Maximum include depth. Default 8 (see [`ResolveConfig::DEFAULT_MAX_DEPTH`]).
    /// Hitting the limit is an error, not a silent truncation.
    pub max_depth: usize,
    /// Maximum total number of `lex.include` annotations resolved across
    /// the whole tree (depth × breadth). Default 1000
    /// (see [`ResolveConfig::DEFAULT_MAX_TOTAL_INCLUDES`]).
    ///
    /// Caps fan-out: `max_depth` alone bounds chain length but not
    /// breadth. A document with 100 thousand top-level includes at depth
    /// 1 sits inside `max_depth` but can still OOM the resolver / LSP /
    /// CI. Hitting this limit is an error, not a silent truncation.
    pub max_total_includes: usize,
}

impl ResolveConfig {
    /// Default maximum include depth — enough for any reasonable atomization
    /// strategy (aggregator → per-chapter → per-section), bounded enough to
    /// keep the resolver's worst-case work predictable.
    pub const DEFAULT_MAX_DEPTH: usize = 8;

    /// Default maximum total include count (DoS bound). Generous enough
    /// for a book-length document with thousands of small fragments,
    /// tight enough to contain adversarial fan-out within a few seconds
    /// of resolver work.
    pub const DEFAULT_MAX_TOTAL_INCLUDES: usize = 1000;

    /// Construct a config rooted at `root` with both limits set to their
    /// defaults ([`Self::DEFAULT_MAX_DEPTH`] and
    /// [`Self::DEFAULT_MAX_TOTAL_INCLUDES`]).
    ///
    /// `root` is stored as given; no normalization happens here.
    pub fn with_root(root: PathBuf) -> Self {
        Self {
            root,
            max_depth: Self::DEFAULT_MAX_DEPTH,
            max_total_includes: Self::DEFAULT_MAX_TOTAL_INCLUDES,
        }
    }
}
105
106/// A pluggable source-text loader.
107///
108/// Implementations decide where bytes come from (filesystem, in-memory map,
109/// virtual filesystem, content-addressed store, …). lex-core never references
110/// `std::fs` directly through this trait; that keeps the resolver pure and
111/// usable in WASM, sandboxes, and unit tests.
112pub trait Loader {
113    /// Load the source text for `path` and return both the contents and a
114    /// canonical identity for the loaded resource. The path is what the
115    /// resolver decided on after applying the rules in §4 of the proposal.
116    ///
117    /// `LoadedFile::canonical_path` is the loader's authoritative identity
118    /// for the resource. For [`FsLoader`] this is the filesystem-canonical
119    /// path (symlinks resolved, case-folded if the underlying FS is
120    /// case-insensitive); for [`MemoryLoader`] it's the lookup key (since
121    /// memory loaders have no symlinks). The resolver uses this for cycle
122    /// detection and for stamping `Range.origin_path` on the loaded tree.
123    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError>;
124}
125
/// Result of a successful [`Loader::load`].
///
/// Compares by value so loader tests can assert on the whole result.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LoadedFile {
    /// The file's source text.
    pub source: String,
    /// The loader's authoritative identity for the resource. See
    /// [`Loader::load`] for how loaders decide this.
    pub canonical_path: PathBuf,
}
135
/// Errors a [`Loader`] can produce.
///
/// Compares by value so tests can match on an exact expected error.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LoadError {
    /// The loader could not find a resource at the given path.
    NotFound { path: PathBuf },
    /// The resource exists but resolves outside the loader's allowed
    /// boundary. The lexical resolver normalizes `..` in the requested
    /// path, but loaders that touch a real filesystem must do a second
    /// check post-canonicalization to catch symlinks that escape the
    /// boundary lexically-correct paths can't reach.
    OutsideRoot { path: PathBuf, root: PathBuf },
    /// The resource exists but its size exceeds the loader's configured
    /// limit. `size` and `limit` are in bytes. The resolver maps this to
    /// [`IncludeError::FileTooLarge`] with the offending annotation's site.
    TooLarge {
        path: PathBuf,
        size: u64,
        limit: u64,
    },
    /// Underlying I/O error (or virtual-filesystem equivalent).
    Io { path: PathBuf, message: String },
}

impl std::fmt::Display for LoadError {
    /// Render a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotFound { path } => {
                write!(f, "include not found: {}", path.display())
            }
            Self::OutsideRoot { path, root } => write!(
                f,
                "include path {} resolves outside loader root {}",
                path.display(),
                root.display()
            ),
            Self::TooLarge { path, size, limit } => write!(
                f,
                "include file {} is {size} bytes, exceeds limit of {limit} bytes",
                path.display()
            ),
            Self::Io { path, message } => {
                write!(f, "io error reading {}: {message}", path.display())
            }
        }
    }
}

impl std::error::Error for LoadError {}
182
183/// Errors the include resolver can produce.
184#[derive(Debug, Clone)]
185pub enum IncludeError {
186    /// An include chain looped back on itself. `chain` is the resolution
187    /// stack at the moment the duplicate `path` was about to be pushed,
188    /// in source-order (entry first, deepest last). `include_site` is the
189    /// range of the offending `lex.include` annotation in its host file —
190    /// useful for diagnostics that highlight the exact line.
191    Cycle {
192        include_site: Range,
193        path: PathBuf,
194        chain: Vec<PathBuf>,
195    },
196    /// The include depth exceeded [`ResolveConfig::max_depth`]. `chain`
197    /// shows the resolution stack at the moment of failure, in source
198    /// order. `include_site` is the range of the offending
199    /// `lex.include` annotation in its host file.
200    DepthExceeded {
201        include_site: Range,
202        limit: usize,
203        chain: Vec<PathBuf>,
204    },
205    /// The total number of includes resolved across the document
206    /// exceeded [`ResolveConfig::max_total_includes`]. Bounds adversarial
207    /// fan-out (which `max_depth` alone does not). `include_site` is the
208    /// `lex.include` annotation that pushed the count past the limit.
209    TotalIncludesExceeded { include_site: Range, limit: usize },
210    /// The included file's size exceeded the loader's configured limit.
211    /// Surfaced by loaders that read from a real filesystem (FsLoader)
212    /// to bound memory allocation per include. `include_site` is the
213    /// offending annotation; `size` and `limit` are in bytes.
214    FileTooLarge {
215        include_site: Range,
216        path: PathBuf,
217        size: u64,
218        limit: u64,
219    },
220    /// A path resolved outside the configured [`ResolveConfig::root`].
221    RootEscape { path: PathBuf, root: PathBuf },
222    /// The include `src` was a platform-absolute filesystem path
223    /// (e.g. Windows `C:\foo`, `\\server\share`, `\foo`). The spec
224    /// forbids absolute filesystem paths from entering the
225    /// resolution pipeline; the *root-absolute* form (leading `/`
226    /// resolved against the includes root) is the only spec-allowed
227    /// way to write a path that doesn't start from the host's
228    /// directory. On Unix the only thing that's `Path::is_absolute()`
229    /// is a leading `/`, which is consumed by the root-absolute
230    /// branch first; this variant therefore only fires in practice
231    /// for Windows-shaped absolute paths.
232    AbsolutePath { path: PathBuf },
233    /// The loader could not find or read the included file. `include_site`
234    /// is the range of the offending `lex.include` annotation in its host
235    /// file, so editors can squiggle the line that asked for the missing
236    /// file rather than the document head.
237    NotFound { include_site: Range, path: PathBuf },
238    /// The loader returned text that the parser rejected.
239    ParseFailed { path: PathBuf, message: String },
240    /// The included file's content is not legal in the include site's
241    /// parent container.
242    ///
243    /// Today this only occurs when an included file has top-level Sessions
244    /// and the include site is inside a `GeneralContainer` (Definition,
245    /// ListItem, or another Annotation's body). The `violation` field
246    /// names the offending content kind (e.g. `"Sessions"`) so future
247    /// container/policy combinations can reuse this variant without a
248    /// breaking change.
249    ContainerPolicy {
250        include_site: Range,
251        container: &'static str,
252        file: PathBuf,
253        violation: &'static str,
254    },
255    /// Loader propagated a non-`NotFound` I/O error.
256    LoaderIo { path: PathBuf, message: String },
257    /// `lex.include` annotation was missing the mandatory `src=` parameter.
258    MissingSrc { include_site: Range },
259}
260
261impl std::fmt::Display for IncludeError {
262    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
263        match self {
264            IncludeError::Cycle { path, chain, .. } => {
265                let chain_display: Vec<String> =
266                    chain.iter().map(|p| p.display().to_string()).collect();
267                write!(
268                    f,
269                    "include cycle: {} (chain: {})",
270                    path.display(),
271                    chain_display.join(" -> ")
272                )
273            }
274            IncludeError::DepthExceeded { limit, chain, .. } => {
275                let chain_display: Vec<String> =
276                    chain.iter().map(|p| p.display().to_string()).collect();
277                write!(
278                    f,
279                    "include depth exceeded limit of {limit} (chain: {})",
280                    chain_display.join(" -> ")
281                )
282            }
283            IncludeError::TotalIncludesExceeded { limit, .. } => {
284                write!(f, "total include count exceeded limit of {limit}")
285            }
286            IncludeError::FileTooLarge {
287                path, size, limit, ..
288            } => {
289                write!(
290                    f,
291                    "included file {} is {size} bytes, exceeds limit of {limit} bytes",
292                    path.display()
293                )
294            }
295            IncludeError::RootEscape { path, root } => write!(
296                f,
297                "include path {} escapes resolution root {}",
298                path.display(),
299                root.display()
300            ),
301            IncludeError::AbsolutePath { path } => write!(
302                f,
303                "include src {} is a platform-absolute path; \
304                 the spec forbids absolute filesystem paths — use a relative path \
305                 (chapters/01.lex) or a root-absolute path (/shared/01.lex)",
306                path.display()
307            ),
308            IncludeError::NotFound { path, .. } => {
309                write!(f, "include not found: {}", path.display())
310            }
311            IncludeError::ParseFailed { path, message } => {
312                write!(f, "failed to parse {}: {message}", path.display())
313            }
314            IncludeError::ContainerPolicy {
315                container,
316                file,
317                violation,
318                ..
319            } => write!(
320                f,
321                "included file {} contains {} but include site is inside {} \
322                 (which does not allow {})",
323                file.display(),
324                violation,
325                container,
326                violation
327            ),
328            IncludeError::LoaderIo { path, message } => {
329                write!(f, "loader error reading {}: {message}", path.display())
330            }
331            IncludeError::MissingSrc { .. } => {
332                write!(f, "lex.include annotation missing required src= parameter")
333            }
334        }
335    }
336}
337
338impl std::error::Error for IncludeError {}
339
340// No `From<LoadError>` impl: `IncludeError::NotFound` carries the include
341// site (the `lex.include` annotation's range), which a loader doesn't know
342// about. Callers map `LoadError` explicitly at the call site, where the
343// site is available.
344
/// Which container the include site sits in. Determines the splice-time
/// policy check (the only one today is "no Sessions in `GeneralContainer`").
#[derive(Debug, Clone, Copy)]
enum ContainerKind {
    /// `Document.root.children` or `Session.children` — accepts everything.
    Session,
    /// `Definition.children` — `GeneralContainer`.
    Definition,
    /// `Annotation.children` — `GeneralContainer`.
    AnnotationBody,
    /// `ListItem.children` — `GeneralContainer`.
    ListItem,
}

impl ContainerKind {
    /// Human-readable container name used in `ContainerPolicy` diagnostics.
    fn name(self) -> &'static str {
        match self {
            Self::Session => "Session",
            Self::Definition => "Definition",
            Self::AnnotationBody => "Annotation body",
            Self::ListItem => "ListItem",
        }
    }

    /// Whether Sessions may be spliced into this container.
    ///
    /// Written as an exhaustive match so that adding a variant forces an
    /// explicit decision here instead of silently defaulting to `false`.
    fn allows_sessions(self) -> bool {
        match self {
            Self::Session => true,
            Self::Definition | Self::AnnotationBody | Self::ListItem => false,
        }
    }
}
373
374/// Resolve `:: lex.include ::` annotations starting from `source`, recursively.
375///
376/// `source_path` identifies the entry-point file. It is used to (a) resolve
377/// relative include paths against the entry file's directory, (b) stamp
378/// `Range.origin_path` on every node so downstream code (file-ref resolution,
379/// diagnostics, LSP goto) can report locations against the authoring file,
380/// and (c) seed the cycle-detection chain so an include cycle that loops
381/// back to the entry is caught. When `None`, relative paths resolve against
382/// `config.root`, origin stamping is skipped on the entry, and the chain
383/// starts empty.
384///
385/// # Pre/post-attachment
386///
387/// Internally this re-parses each source (entry + every loaded file) *without*
388/// annotation attachment so `lex.include` annotations are visible as standalone
389/// children where the splice can replace them in-place. After all splices,
390/// [`AttachAnnotations`] runs once on the merged tree, which lands the include
391/// annotation on the first spliced node by the standard "attach to next
392/// sibling" rule. This matches the textual paste mental model from the proposal.
393///
394/// # Recursion
395///
396/// Each loaded file is fully resolved (its own includes replaced) *before*
397/// being spliced into the host. The recursion uses each file's own directory
398/// as `host_dir`, so a relative path inside an included file resolves from
399/// that file's location — not the entry's. An active-chain stack of
400/// canonicalized paths gates against cycles; the depth counter gates against
401/// pathological nesting (default 8, configurable via [`ResolveConfig::max_depth`]).
402pub fn resolve_from_source(
403    source: &str,
404    source_path: Option<PathBuf>,
405    config: &ResolveConfig,
406    loader: &dyn Loader,
407) -> Result<Document, IncludeError> {
408    let entry_origin = source_path.as_ref().map(|p| Arc::new(p.clone()));
409    let host_dir = source_path
410        .as_ref()
411        .and_then(|p| p.parent().map(Path::to_path_buf))
412        .unwrap_or_else(|| config.root.clone());
413
414    let mut doc = parse_no_attach(source).map_err(|message| IncludeError::ParseFailed {
415        path: source_path.clone().unwrap_or_default(),
416        message,
417    })?;
418
419    if let Some(origin) = entry_origin.as_ref() {
420        stamp_doc(&mut doc, origin);
421    }
422
423    // Seed the chain with the lexically-normalized entry path (when known)
424    // so an include that loops back to the entry is detected as a cycle.
425    // Normalization here is essential — `target_path` values produced by
426    // `resolve_path` are also lexically normalized, so an unnormalized
427    // entry would never compare equal to its normalized self.
428    let mut chain: Vec<PathBuf> = source_path
429        .as_ref()
430        .map(|p| vec![lexical_normalize(p)])
431        .unwrap_or_default();
432    let mut state = ResolverState {
433        config,
434        loader,
435        chain: &mut chain,
436        depth: 0,
437        total_resolved: 0,
438    };
439
440    splice_in_session_container(doc.root.children.as_mut_vec(), &host_dir, &mut state)?;
441
442    let doc = AttachAnnotations::new()
443        .run(doc)
444        .map_err(|e| IncludeError::ParseFailed {
445            path: source_path.unwrap_or_default(),
446            message: format!("annotation attachment failed: {e}"),
447        })?;
448
449    Ok(doc)
450}
451
452// ============================================================================
453// Splicing
454// ============================================================================
455
456/// Per-resolution state threaded through the recursive walker. Keeps the
457/// signatures of the splice/process functions short and ensures
458/// `chain`/`depth` are updated in lock-step (push/pop, +1/back-out) at
459/// each include site.
460struct ResolverState<'a> {
461    config: &'a ResolveConfig,
462    loader: &'a dyn Loader,
463    /// Active resolution stack: lexically-normalized absolute paths
464    /// currently being resolved. Pushed when we begin loading a file and
465    /// popped when its tree is fully resolved. A push that finds the
466    /// path already on the stack is a cycle.
467    ///
468    /// Normalization (not filesystem canonicalization) is what's used
469    /// here: the resolver never touches `std::fs`, so symlink resolution
470    /// is out. Two paths that lexically refer to the same file (after
471    /// `.`/`..` collapse) compare equal; two paths reaching the same
472    /// inode via different routes do not. For real-FS use cases this is
473    /// fine because `FsLoader` will canonicalize on load before the
474    /// chain comparison sees the path.
475    chain: &'a mut Vec<PathBuf>,
476    /// Number of include hops from the entry point. Each recursion into a
477    /// loaded file increments by 1. Hitting `config.max_depth` is an error.
478    depth: usize,
479    /// Total includes resolved across the entire walk (depth × breadth).
480    /// Incremented on every successful load. Hitting
481    /// `config.max_total_includes` aborts with `TotalIncludesExceeded` —
482    /// caps adversarial fan-out that `max_depth` alone wouldn't catch.
483    total_resolved: usize,
484}
485
486fn splice_in_session_container(
487    children: &mut Vec<ContentItem>,
488    host_dir: &Path,
489    state: &mut ResolverState<'_>,
490) -> Result<(), IncludeError> {
491    // Post-order: recurse into nested containers first, splice this
492    // container's includes second. The recurse step walks the *original*
493    // tree; the splice step inserts already-fully-resolved content
494    // (recursion happens inside `process_includes`), which is therefore
495    // never re-walked.
496    recurse_into_children(children, host_dir, state)?;
497    process_includes(children, host_dir, state, ContainerKind::Session)
498}
499
500fn splice_in_general_container(
501    container: &mut GeneralContainer,
502    host_dir: &Path,
503    state: &mut ResolverState<'_>,
504    kind: ContainerKind,
505) -> Result<(), IncludeError> {
506    recurse_into_children(container.as_mut_vec(), host_dir, state)?;
507    process_includes(container.as_mut_vec(), host_dir, state, kind)
508}
509
510// Allow &mut Vec because `splice` needs Vec-specific operations.
511#[allow(clippy::ptr_arg)]
512fn process_includes(
513    children: &mut Vec<ContentItem>,
514    host_dir: &Path,
515    state: &mut ResolverState<'_>,
516    kind: ContainerKind,
517) -> Result<(), IncludeError> {
518    // Collect indices of standalone include annotations in this container.
519    let include_indices: Vec<usize> = children
520        .iter()
521        .enumerate()
522        .filter_map(|(i, item)| match item {
523            ContentItem::Annotation(a) if a.is_include() => Some(i),
524            _ => None,
525        })
526        .collect();
527
528    // Process in reverse order so earlier indices stay valid.
529    for i in include_indices.into_iter().rev() {
530        let annotation = match &children[i] {
531            ContentItem::Annotation(a) => a.clone(),
532            _ => unreachable!("index came from include filter"),
533        };
534
535        let splice_items = resolve_one_include(&annotation, host_dir, state, kind)?;
536
537        // Replace the include annotation with the splice content.
538        // The annotation itself stays in the children list immediately
539        // before the splice, so the post-resolution AttachAnnotations
540        // pass moves it onto the first spliced node by the standard
541        // "attach to next sibling" rule.
542        let mut replacement = Vec::with_capacity(splice_items.len() + 1);
543        replacement.push(ContentItem::Annotation(annotation));
544        replacement.extend(splice_items);
545        children.splice(i..=i, replacement);
546    }
547
548    Ok(())
549}
550
551/// Resolve a single include annotation: path → load → parse → recurse →
552/// stamp → policy-check → splice list.
553///
554/// The recursion happens *here*: after parsing the loaded file, we walk
555/// its tree with the loaded file's own directory as `host_dir`, with the
556/// loaded file pushed onto `state.chain` and `state.depth` bumped by 1.
557/// When this call returns, the splice list is fully resolved and ready to
558/// be inserted into the host container.
559fn resolve_one_include(
560    annotation: &crate::lex::ast::elements::annotation::Annotation,
561    host_dir: &Path,
562    state: &mut ResolverState<'_>,
563    parent_kind: ContainerKind,
564) -> Result<Vec<ContentItem>, IncludeError> {
565    let src = annotation
566        .include_src()
567        .ok_or_else(|| IncludeError::MissingSrc {
568            include_site: annotation.location.clone(),
569        })?;
570
571    let target_path = resolve_path(&src, host_dir, &state.config.root)?;
572
573    // Depth check before any FS access. A site sitting exactly at
574    // `max_depth` is fine; one that would push us *past* it is the
575    // failure case.
576    if state.depth >= state.config.max_depth {
577        return Err(IncludeError::DepthExceeded {
578            include_site: annotation.location.clone(),
579            limit: state.config.max_depth,
580            chain: state.chain.clone(),
581        });
582    }
583
584    // Total-count check before loading. Caps fan-out — a doc with
585    // 100k top-level includes would blow past max_total_includes long
586    // before max_depth would catch anything.
587    if state.total_resolved >= state.config.max_total_includes {
588        return Err(IncludeError::TotalIncludesExceeded {
589            include_site: annotation.location.clone(),
590            limit: state.config.max_total_includes,
591        });
592    }
593
594    // Load via the injected loader. The loader returns the source plus
595    // a *canonical* identity for the resource — for FsLoader that's
596    // post-`fs::canonicalize` (symlinks resolved, case-folded on
597    // case-insensitive FS); for MemoryLoader it's the lookup key. We
598    // use the canonical path for cycle detection so a symlink loop or
599    // a case-folded re-include is caught here rather than slipping
600    // through to `max_depth`.
601    let LoadedFile {
602        source: target_source,
603        canonical_path,
604    } = state.loader.load(&target_path).map_err(|e| match e {
605        LoadError::NotFound { path } => IncludeError::NotFound {
606            include_site: annotation.location.clone(),
607            path,
608        },
609        LoadError::OutsideRoot { path, root } => IncludeError::RootEscape { path, root },
610        LoadError::TooLarge { path, size, limit } => IncludeError::FileTooLarge {
611            include_site: annotation.location.clone(),
612            path,
613            size,
614            limit,
615        },
616        LoadError::Io { path, message } => IncludeError::LoaderIo { path, message },
617    })?;
618    state.total_resolved += 1;
619
620    // Cycle check uses the canonical path so symlink/case-fold cycles
621    // are caught even though `target_path` (which we used for the load
622    // request) was just lexically resolved.
623    if state.chain.iter().any(|p| p == &canonical_path) {
624        return Err(IncludeError::Cycle {
625            include_site: annotation.location.clone(),
626            path: canonical_path,
627            chain: state.chain.clone(),
628        });
629    }
630
631    let mut included =
632        parse_no_attach(&target_source).map_err(|message| IncludeError::ParseFailed {
633            path: canonical_path.clone(),
634            message,
635        })?;
636
637    let target_origin = Arc::new(canonical_path.clone());
638    stamp_doc(&mut included, &target_origin);
639
640    // Recursively resolve includes inside the loaded file. The host_dir
641    // for that walk is the loaded file's own canonical parent; the
642    // chain gains the canonical path and depth bumps by 1 — both are
643    // popped/restored on the way back so siblings see the same state.
644    let included_dir = canonical_path
645        .parent()
646        .map(Path::to_path_buf)
647        .unwrap_or_else(|| state.config.root.clone());
648
649    state.chain.push(canonical_path.clone());
650    let saved_depth = state.depth;
651    state.depth = saved_depth + 1;
652    let recurse_result =
653        splice_in_session_container(included.root.children.as_mut_vec(), &included_dir, state);
654    state.depth = saved_depth;
655    state.chain.pop();
656    recurse_result?;
657
658    let splice_items = prepare_splice_list(included);
659    validate_against_kind(
660        &splice_items,
661        parent_kind,
662        &annotation.location,
663        &canonical_path,
664    )?;
665
666    Ok(splice_items)
667}
668
669#[allow(clippy::ptr_arg)]
670fn recurse_into_children(
671    children: &mut Vec<ContentItem>,
672    host_dir: &Path,
673    state: &mut ResolverState<'_>,
674) -> Result<(), IncludeError> {
675    for item in children.iter_mut() {
676        match item {
677            ContentItem::Session(s) => {
678                splice_in_session_container(s.children.as_mut_vec(), host_dir, state)?;
679            }
680            ContentItem::Definition(d) => {
681                splice_in_general_container(
682                    &mut d.children,
683                    host_dir,
684                    state,
685                    ContainerKind::Definition,
686                )?;
687            }
688            ContentItem::Annotation(a) if !a.is_include() => {
689                splice_in_general_container(
690                    &mut a.children,
691                    host_dir,
692                    state,
693                    ContainerKind::AnnotationBody,
694                )?;
695            }
696            ContentItem::List(l) => {
697                for li in l.items.as_mut_vec().iter_mut() {
698                    if let ContentItem::ListItem(item) = li {
699                        splice_in_general_container(
700                            &mut item.children,
701                            host_dir,
702                            state,
703                            ContainerKind::ListItem,
704                        )?;
705                    }
706                }
707            }
708            _ => {}
709        }
710    }
711    Ok(())
712}
713
714fn prepare_splice_list(mut included: Document) -> Vec<ContentItem> {
715    let mut items: Vec<ContentItem> = Vec::new();
716
717    // Document title → Paragraph, prepended.
718    // Equivalent to what a textual paste would parse (an unindented line
719    // becomes a paragraph in the host's context). Per the revised
720    // spec §5.2 this is "do nothing" semantics — converting matches what
721    // the parser would do if the included source were inlined and reparsed.
722    if let Some(title) = included.title {
723        let location = title.location.clone();
724        let para = Paragraph::from_line(title.as_str().to_string()).at(location);
725        items.push(ContentItem::Paragraph(para));
726    }
727
728    // Document-level annotations → regular annotations, prepended.
729    for ann in included.annotations {
730        items.push(ContentItem::Annotation(ann));
731    }
732
733    // Body of the included document.
734    items.append(included.root.children.as_mut_vec());
735
736    items
737}
738
739fn validate_against_kind(
740    items: &[ContentItem],
741    kind: ContainerKind,
742    site: &Range,
743    file: &Path,
744) -> Result<(), IncludeError> {
745    if kind.allows_sessions() {
746        return Ok(());
747    }
748    if items.iter().any(|i| matches!(i, ContentItem::Session(_))) {
749        return Err(IncludeError::ContainerPolicy {
750            include_site: site.clone(),
751            container: kind.name(),
752            file: file.to_path_buf(),
753            violation: "Sessions",
754        });
755    }
756    Ok(())
757}
758
759// ============================================================================
760// Path resolution
761// ============================================================================
762
763/// Resolve a file-reference target string the same way the include
764/// resolver resolves include paths.
765///
766/// Use this when consuming `ReferenceType::File { target }` (or any other
767/// node-attached path) so that relative paths resolve from the *authoring*
768/// file's directory, not from wherever the merged document happens to be
769/// rooted. Pass `ref_origin` as the [`Range::origin_path`] of the inline's
770/// containing node (or `None` if the node was never stamped — in that case
771/// the path is treated as if authored at the root).
772///
773/// Behaviour matches the include resolver:
774/// - Root-absolute targets (leading `/`) resolve under `root`.
775/// - Other targets resolve relative to `ref_origin`'s parent (or `root`
776///   when `ref_origin` is `None`).
777/// - The result is lexically normalized and checked against `root` —
778///   paths that escape it return `RootEscape`.
779///
780/// This is a sister to the resolver's internal `resolve_path` and shares
781/// the same lexical-normalization caveat: it does not touch the filesystem.
782pub fn resolve_file_reference(
783    target: &str,
784    ref_origin: Option<&Path>,
785    root: &Path,
786) -> Result<PathBuf, IncludeError> {
787    let host_dir: PathBuf = ref_origin
788        .and_then(|p| p.parent())
789        .map(Path::to_path_buf)
790        .unwrap_or_else(|| root.to_path_buf());
791    resolve_path(target, &host_dir, root)
792}
793
794fn resolve_path(src: &str, host_dir: &Path, root: &Path) -> Result<PathBuf, IncludeError> {
795    let candidate = if let Some(rel) = src.strip_prefix('/') {
796        // Root-absolute (Lex spec convention): leading `/` means "from
797        // the resolution root", not "filesystem root".
798        root.join(rel)
799    } else {
800        // Anything else must be a relative path. Reject inputs the
801        // host platform would treat as absolute (Windows `C:\foo`,
802        // `\\server\share`, `\foo`) up front: the spec forbids
803        // platform-absolute paths from entering the resolution
804        // pipeline. Without this, `host_dir.join(src)` would silently
805        // discard `host_dir` because Rust's `PathBuf::join` replaces
806        // the base when the joined path is absolute. The downstream
807        // root-escape check would still catch the security side, but
808        // we'd surface a misleading "escapes root" error instead of
809        // "absolute paths not allowed", and we'd be relying on
810        // `PathBuf::join`'s override semantics for the security
811        // outcome rather than holding the line at the input boundary.
812        if Path::new(src).is_absolute() {
813            return Err(IncludeError::AbsolutePath {
814                path: PathBuf::from(src),
815            });
816        }
817        host_dir.join(src)
818    };
819    let normalized = lexical_normalize(&candidate);
820    let canonical_root = lexical_normalize(root);
821    if !normalized.starts_with(&canonical_root) {
822        return Err(IncludeError::RootEscape {
823            path: normalized,
824            root: canonical_root,
825        });
826    }
827    Ok(normalized)
828}
829
/// Normalize a path lexically — no filesystem access — by resolving `.`
/// and `..` components.
///
/// `std::fs::canonicalize` needs the path to exist, which would break
/// tests built on [`MemoryLoader`]. A lexical pass is enough for
/// include-site resolution: the resolver only needs a stable identity for
/// cycle detection and a uniform shape for the root-escape prefix check.
///
/// Rules, applied component by component:
/// - `.` is dropped.
/// - `..` removes the previous component **only** when that component is a
///   real name (`Component::Normal`). If the buffer is empty, or ends in
///   another `..` or a root marker, the `..` is kept.
/// - Every other component (prefix, root, normal name) is appended as-is.
///
/// Keeping unmatched `..` components is deliberate: it stops
/// `../../etc/passwd` from collapsing to `etc/passwd`. A naive
/// `PathBuf::pop` would strip a trailing `..` (because
/// `Path::new("..").parent()` is `Some("")`), silently losing a level and
/// yielding a path that falsely appears to sit under the root. With the
/// `..` preserved, the normalized path stays outside any sane root and the
/// escape check fires.
fn lexical_normalize(p: &Path) -> PathBuf {
    p.components().fold(PathBuf::new(), |mut acc, part| {
        match part {
            std::path::Component::CurDir => {}
            std::path::Component::ParentDir => {
                // Only a real directory name may be cancelled by `..`.
                let ends_in_name = matches!(
                    acc.components().next_back(),
                    Some(std::path::Component::Normal(_))
                );
                if ends_in_name {
                    acc.pop();
                } else {
                    acc.push("..");
                }
            }
            std::path::Component::Prefix(_)
            | std::path::Component::RootDir
            | std::path::Component::Normal(_) => acc.push(part.as_os_str()),
        }
        acc
    })
}
871
872// ============================================================================
873// Origin stamping
874// ============================================================================
875//
876// Walk every node in a Document and set `Range.origin_path` on each
877// `.location` field. The walk only stamps the *block-level* `.location`
878// fields here; finer-grained inline ranges land in PR 6 when file-ref
879// resolution starts consulting them.
880
881fn stamp_doc(doc: &mut Document, origin: &Arc<PathBuf>) {
882    if let Some(title) = doc.title.as_mut() {
883        title.location.origin_path = Some(Arc::clone(origin));
884    }
885    for ann in doc.annotations.iter_mut() {
886        stamp_annotation(ann, origin);
887    }
888    stamp_session(&mut doc.root, origin);
889}
890
891fn stamp_session(s: &mut Session, origin: &Arc<PathBuf>) {
892    s.location.origin_path = Some(Arc::clone(origin));
893    if let Some(loc) = s.title.location.as_mut() {
894        loc.origin_path = Some(Arc::clone(origin));
895    }
896    for ann in s.annotations.iter_mut() {
897        stamp_annotation(ann, origin);
898    }
899    for item in s.children.as_mut_vec().iter_mut() {
900        stamp_item(item, origin);
901    }
902}
903
904fn stamp_annotation(
905    a: &mut crate::lex::ast::elements::annotation::Annotation,
906    origin: &Arc<PathBuf>,
907) {
908    a.location.origin_path = Some(Arc::clone(origin));
909    a.data.location.origin_path = Some(Arc::clone(origin));
910    for item in a.children.as_mut_vec().iter_mut() {
911        stamp_item(item, origin);
912    }
913}
914
915fn stamp_item(item: &mut ContentItem, origin: &Arc<PathBuf>) {
916    match item {
917        ContentItem::Session(s) => stamp_session(s, origin),
918        ContentItem::Annotation(a) => stamp_annotation(a, origin),
919        ContentItem::Paragraph(p) => {
920            p.location.origin_path = Some(Arc::clone(origin));
921            for ann in p.annotations.iter_mut() {
922                stamp_annotation(ann, origin);
923            }
924            for line in p.lines.iter_mut() {
925                stamp_item(line, origin);
926            }
927        }
928        ContentItem::List(l) => {
929            l.location.origin_path = Some(Arc::clone(origin));
930            for li in l.items.as_mut_vec().iter_mut() {
931                stamp_item(li, origin);
932            }
933        }
934        ContentItem::ListItem(li) => {
935            li.location.origin_path = Some(Arc::clone(origin));
936            for ann in li.annotations.iter_mut() {
937                stamp_annotation(ann, origin);
938            }
939            for child in li.children.as_mut_vec().iter_mut() {
940                stamp_item(child, origin);
941            }
942        }
943        ContentItem::Definition(d) => {
944            d.location.origin_path = Some(Arc::clone(origin));
945            for ann in d.annotations.iter_mut() {
946                stamp_annotation(ann, origin);
947            }
948            for child in d.children.as_mut_vec().iter_mut() {
949                stamp_item(child, origin);
950            }
951        }
952        ContentItem::VerbatimBlock(v) => {
953            v.location.origin_path = Some(Arc::clone(origin));
954        }
955        ContentItem::VerbatimLine(vl) => {
956            vl.location.origin_path = Some(Arc::clone(origin));
957        }
958        ContentItem::Table(t) => {
959            t.location.origin_path = Some(Arc::clone(origin));
960        }
961        ContentItem::TextLine(tl) => {
962            tl.location.origin_path = Some(Arc::clone(origin));
963        }
964        ContentItem::BlankLineGroup(b) => {
965            b.location.origin_path = Some(Arc::clone(origin));
966        }
967    }
968}
969
970// ============================================================================
971// Parser glue
972// ============================================================================
973
/// Parse `source` into a [`Document`] with the annotation-attachment stage
/// skipped, so include annotations are findable in container children lists.
///
/// This is a thin wrapper: it delegates to
/// `crate::lex::testing::parse_without_annotation_attachment` and returns
/// that function's `Result` unchanged (the error is a plain `String`).
///
/// NOTE(review): this is reached from non-test code but lives behind the
/// `testing` module path — confirm that module is compiled into regular
/// builds.
fn parse_no_attach(source: &str) -> Result<Document, String> {
    crate::lex::testing::parse_without_annotation_attachment(source)
}
979
980// ============================================================================
981// Filesystem-backed loader
982// ============================================================================
983
/// [`Loader`] that reads files from the filesystem with `std::fs::read_to_string`.
///
/// This is the production loader used by the CLI; the LSP wraps it with a
/// file-watch invalidation layer in PR 8. lex-core's *resolver* code does
/// not reference `std::fs` — `FsLoader` is the one place that does,
/// isolated behind the [`Loader`] trait so the rest of the crate stays
/// sandbox- and WASM-friendly.
///
/// `FsLoader` is built with the resolution root and re-checks every load
/// against it *after* `fs::canonicalize`, so a symlink pointing outside
/// the root is rejected even though the lexical-only check in
/// [`resolve_path`] cannot see it. Non-regular files (devices, FIFOs,
/// directories) are rejected before reading, so the loader can't be
/// tricked into blocking on `/dev/zero` or allocating against an open
/// device.
///
/// Error mapping, in the order the checks run:
/// - canonicalization fails with `ErrorKind::NotFound` →
///   [`LoadError::NotFound`]
/// - canonicalization fails for any other reason → [`LoadError::Io`]
/// - canonical path is not under the canonical root →
///   [`LoadError::OutsideRoot`]
/// - metadata cannot be read, or the target is not a regular file →
///   [`LoadError::Io`]
/// - file is larger than the configured cap → [`LoadError::TooLarge`]
/// - reading the file fails → [`LoadError::Io`]
pub struct FsLoader {
    /// Filesystem-canonical resolution root, computed once in
    /// `FsLoader::new`. If canonicalization fails (e.g., the configured
    /// root doesn't exist on disk) the input is stored verbatim; the
    /// bounds check is still performed against that value rather than
    /// being silently disabled.
    canonical_root: PathBuf,
    /// Per-file size cap in bytes. Larger files are reported as
    /// `LoadError::TooLarge` before any bytes are read into memory.
    /// Defaults to [`FsLoader::DEFAULT_MAX_FILE_SIZE`].
    max_file_size: u64,
}

impl FsLoader {
    /// Default per-file size cap: 10 MiB. Generous for realistic Lex
    /// source documents (text only), tight enough to bound the memory
    /// allocated per include against an adversarially large file.
    pub const DEFAULT_MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;

    /// Build a loader rooted at `root` with the default size limit.
    ///
    /// The loader keeps `root`'s fs-canonical form (symlinks resolved);
    /// when canonicalization fails, `root` is kept exactly as given.
    /// Subsequent loads require the requested path's canonical form to
    /// live under the stored root.
    pub fn new(root: PathBuf) -> Self {
        let canonical_root = match std::fs::canonicalize(&root) {
            Ok(resolved) => resolved,
            Err(_) => root,
        };
        Self {
            canonical_root,
            max_file_size: Self::DEFAULT_MAX_FILE_SIZE,
        }
    }

    /// Replace the per-file size cap (bytes), builder-style.
    ///
    /// Widen it for projects with genuinely large source files, or
    /// tighten it for stricter sandboxes (e.g., LSPs serving untrusted
    /// content).
    pub fn with_max_file_size(self, max_file_size: u64) -> Self {
        Self {
            max_file_size,
            ..self
        }
    }
}
1045
1046impl Loader for FsLoader {
1047    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
1048        // 1. Canonicalize. Resolves symlinks and `..` segments against the
1049        //    real filesystem. NotFound / broken-symlink / permission errors
1050        //    all surface here.
1051        let canonical_path = std::fs::canonicalize(path).map_err(|e| match e.kind() {
1052            std::io::ErrorKind::NotFound => LoadError::NotFound {
1053                path: path.to_path_buf(),
1054            },
1055            _ => LoadError::Io {
1056                path: path.to_path_buf(),
1057                message: e.to_string(),
1058            },
1059        })?;
1060
1061        // 2. Bounds check against the *canonical* root. This is the
1062        //    actual security gate against symlink traversal — the lexical
1063        //    check in resolve_path can't see through symlinks.
1064        if !canonical_path.starts_with(&self.canonical_root) {
1065            return Err(LoadError::OutsideRoot {
1066                path: canonical_path,
1067                root: self.canonical_root.clone(),
1068            });
1069        }
1070
1071        // 3. Reject non-regular files. Without this, an attacker (with
1072        //    write access to the repo) could symlink an include target to
1073        //    `/dev/zero` or a FIFO and block / OOM the reader. The
1074        //    is_file() metadata call is a cheap sanity check.
1075        let meta = std::fs::metadata(&canonical_path).map_err(|e| LoadError::Io {
1076            path: canonical_path.clone(),
1077            message: e.to_string(),
1078        })?;
1079        if !meta.is_file() {
1080            return Err(LoadError::Io {
1081                path: canonical_path,
1082                message: "include target is not a regular file".to_string(),
1083            });
1084        }
1085
1086        // 4. Size cap. Bounds memory allocation per include against an
1087        //    adversarial 1 GB file before any bytes hit the heap.
1088        let size = meta.len();
1089        if size > self.max_file_size {
1090            return Err(LoadError::TooLarge {
1091                path: canonical_path,
1092                size,
1093                limit: self.max_file_size,
1094            });
1095        }
1096
1097        // 5. Read. By this point we know the path is a regular file under
1098        //    the canonical root and within the size cap; anything that
1099        //    fails here is a real I/O error worth surfacing.
1100        let source = std::fs::read_to_string(&canonical_path).map_err(|e| LoadError::Io {
1101            path: canonical_path.clone(),
1102            message: e.to_string(),
1103        })?;
1104
1105        Ok(LoadedFile {
1106            source,
1107            canonical_path,
1108        })
1109    }
1110}
1111
1112// ============================================================================
1113// Test fixtures (test-support feature + cfg(test))
1114// ============================================================================
1115
/// In-memory [`Loader`] backed by a `HashMap<PathBuf, String>`.
///
/// Only compiled for tests or when the `test-support` feature is enabled.
#[cfg(any(test, feature = "test-support"))]
pub struct MemoryLoader {
    /// Registered files: lookup path → source text.
    files: std::collections::HashMap<PathBuf, String>,
}

#[cfg(any(test, feature = "test-support"))]
impl MemoryLoader {
    /// Create a loader with no files. Register files with
    /// [`MemoryLoader::insert`].
    pub fn new() -> Self {
        let files = std::collections::HashMap::new();
        Self { files }
    }

    /// Register `contents` as the source text of `path`, replacing any
    /// text previously registered for the same path. Returns `self` so
    /// calls can be chained.
    pub fn insert<P: Into<PathBuf>, S: Into<String>>(&mut self, path: P, contents: S) -> &mut Self {
        let (key, text) = (path.into(), contents.into());
        self.files.insert(key, text);
        self
    }

    /// Convenience constructor: build a loader from any iterator of
    /// `(path, contents)` pairs. When a path appears more than once, the
    /// last pair wins.
    pub fn from_pairs<I, P, S>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (P, S)>,
        P: Into<PathBuf>,
        S: Into<String>,
    {
        let files = pairs
            .into_iter()
            .map(|(path, contents)| (path.into(), contents.into()))
            .collect();
        Self { files }
    }
}

#[cfg(any(test, feature = "test-support"))]
impl Default for MemoryLoader {
    /// Same as [`MemoryLoader::new`]: an empty loader.
    fn default() -> Self {
        MemoryLoader::new()
    }
}
1159
#[cfg(any(test, feature = "test-support"))]
impl Loader for MemoryLoader {
    /// Look `path` up in the registered files.
    ///
    /// The lookup is an exact match on the key. A hit returns a copy of
    /// the stored text; a miss returns [`LoadError::NotFound`] carrying
    /// `path`.
    fn load(&self, path: &Path) -> Result<LoadedFile, LoadError> {
        // Memory loaders have no symlinks, so the lookup key *is* the
        // canonical identity. The resolver's cycle detection compares
        // `LoadedFile::canonical_path` values; in tests those match the
        // lexically-normalized paths the resolver already produces.
        match self.files.get(path) {
            Some(text) => Ok(LoadedFile {
                source: text.clone(),
                canonical_path: path.to_path_buf(),
            }),
            None => Err(LoadError::NotFound {
                path: path.to_path_buf(),
            }),
        }
    }
}
1180
1181// ============================================================================
1182// Tests
1183// ============================================================================
1184
1185#[cfg(test)]
1186mod tests;