Skip to main content

lex_core/lex/
includes.rs

1//! Include resolution for Lex documents.
2//!
3//! This module turns `:: lex.include src="..." ::` annotations into spliced
4//! content from the referenced files. It is *opt-in*: callers that want the
5//! unresolved tree (the formatter, tree-sitter parity, editor tooling that
6//! displays include statements as authored) skip this pass entirely. The
7//! parser itself never touches the filesystem — all I/O goes through the
8//! injected [`Loader`] trait.
9//!
10//! See `comms/specs/proposals/includes.lex` for the full design.
11//!
12//! # Status
13//!
14//! This module is being built up across PRs 3–7:
15//!
16//! - PR 3: skeleton — trait, config, errors, stub.
17//! - PR 4: single-pass splice + container-policy validation +
18//!   doc-title/doc-annotation conversion + origin stamping + root-escape
19//!   check.
20//! - PR 5: recursive resolution into included files + cycle detection
21//!   (chain stack) + depth limit. Each loaded file gets walked in its OWN
22//!   directory, so relative paths inside an included file resolve from
23//!   that file's directory, not the entry's.
24//! - PR 6: origin-aware reference helpers. [`resolve_file_reference`]
25//!   resolves a `ReferenceType::File` target from the authoring file's
26//!   directory using `Range.origin_path`.
27//!   `Document::find_annotation_by_label_in_origin` scopes footnote
28//!   lookups to the file the reference was authored in.
29//! - PR 7 (this PR): [`FsLoader`] — production loader that reads from the
30//!   filesystem with `std::fs::read_to_string`. CLI wires the resolver
31//!   into `lex convert` and `lex inspect` (default-on, opt-out via
32//!   `--no-includes`); `lex format` never expands.
33//!
34//! # Layering
35//!
36//! Of all of lex-core, only [`FsLoader`] references `std::fs`. The
37//! resolver itself does no I/O — it always goes through the [`Loader`]
38//! trait. Callers can swap loaders to keep the resolver sandboxed:
39//!
40//! - The LSP wraps [`FsLoader`] with file-watch invalidation (PR 8).
41//! - WASM builds provide a JS-backed loader instead of [`FsLoader`].
42//! - Tests use [`MemoryLoader`] (gated behind `test-support`).
43//!
44//! For tests, lex-core itself ships [`MemoryLoader`] gated behind the
45//! `test-support` cargo feature. It is not intended for production use.
46
47use crate::lex::assembling::AttachAnnotations;
48use crate::lex::ast::elements::container::GeneralContainer;
49use crate::lex::ast::elements::content_item::ContentItem;
50use crate::lex::ast::elements::paragraph::Paragraph;
51use crate::lex::ast::elements::session::Session;
52use crate::lex::ast::range::Range;
53use crate::lex::ast::Document;
54use crate::lex::transforms::Runnable;
55use std::path::{Path, PathBuf};
56use std::sync::Arc;
57
/// Settings that govern one include-resolution run.
#[derive(Debug, Clone)]
pub struct ResolveConfig {
    /// Directory every include must stay under. An include whose
    /// normalized path does not sit below this directory is rejected with
    /// [`IncludeError::RootEscape`].
    ///
    /// Supply an **absolute**, already-normalized path. The resolver only
    /// normalizes paths lexically and compares them against this root by
    /// prefix, so a relative or unnormalized root weakens the escape
    /// check. Callers (CLI, LSP) should canonicalize the root before
    /// building a `ResolveConfig`.
    pub root: PathBuf,
    /// Upper bound on include nesting. Defaults to
    /// [`ResolveConfig::DEFAULT_MAX_DEPTH`]. Exceeding it is reported as
    /// an error; content is never silently truncated.
    pub max_depth: usize,
}

impl ResolveConfig {
    /// Nesting limit used when the caller does not pick one. Deep enough
    /// for a layered layout (aggregator → per-chapter → per-section) while
    /// keeping the resolver's worst-case work bounded.
    pub const DEFAULT_MAX_DEPTH: usize = 8;

    /// Build a config rooted at `root` that uses
    /// [`ResolveConfig::DEFAULT_MAX_DEPTH`] as its depth limit.
    pub fn with_root(root: PathBuf) -> Self {
        let max_depth = Self::DEFAULT_MAX_DEPTH;
        Self { root, max_depth }
    }
}
89
90/// A pluggable source-text loader.
91///
92/// Implementations decide where bytes come from (filesystem, in-memory map,
93/// virtual filesystem, content-addressed store, …). lex-core never references
94/// `std::fs` directly through this trait; that keeps the resolver pure and
95/// usable in WASM, sandboxes, and unit tests.
96pub trait Loader {
97    /// Load the source text for `path`. The path is the canonical absolute
98    /// path the resolver decided on after applying the rules in §4 of the
99    /// proposal.
100    fn load(&self, path: &Path) -> Result<String, LoadError>;
101}
102
/// Failure modes of [`Loader::load`].
#[derive(Debug, Clone)]
pub enum LoadError {
    /// Nothing exists at the requested path.
    NotFound { path: PathBuf },
    /// Reading the resource failed for a reason other than absence
    /// (an I/O error, or the virtual-filesystem equivalent). `message`
    /// carries the loader's description of the failure.
    Io { path: PathBuf, message: String },
}

impl std::fmt::Display for LoadError {
    /// Render a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotFound { path } => {
                let shown = path.display();
                write!(f, "include not found: {}", shown)
            }
            Self::Io { path, message } => {
                let shown = path.display();
                write!(f, "io error reading {}: {message}", shown)
            }
        }
    }
}

impl std::error::Error for LoadError {}
124
125/// Errors the include resolver can produce.
126#[derive(Debug, Clone)]
127pub enum IncludeError {
128    /// An include chain looped back on itself. `chain` is the resolution
129    /// stack at the moment the duplicate `path` was about to be pushed,
130    /// in source-order (entry first, deepest last). `include_site` is the
131    /// range of the offending `lex.include` annotation in its host file —
132    /// useful for diagnostics that highlight the exact line.
133    Cycle {
134        include_site: Range,
135        path: PathBuf,
136        chain: Vec<PathBuf>,
137    },
138    /// The include depth exceeded [`ResolveConfig::max_depth`]. `chain`
139    /// shows the resolution stack at the moment of failure, in source
140    /// order. `include_site` is the range of the offending
141    /// `lex.include` annotation in its host file.
142    DepthExceeded {
143        include_site: Range,
144        limit: usize,
145        chain: Vec<PathBuf>,
146    },
147    /// A path resolved outside the configured [`ResolveConfig::root`].
148    RootEscape { path: PathBuf, root: PathBuf },
149    /// The loader could not find or read the included file.
150    NotFound { path: PathBuf },
151    /// The loader returned text that the parser rejected.
152    ParseFailed { path: PathBuf, message: String },
153    /// The included file's content is not legal in the include site's
154    /// parent container.
155    ///
156    /// Today this only occurs when an included file has top-level Sessions
157    /// and the include site is inside a `GeneralContainer` (Definition,
158    /// ListItem, or another Annotation's body). The `violation` field
159    /// names the offending content kind (e.g. `"Sessions"`) so future
160    /// container/policy combinations can reuse this variant without a
161    /// breaking change.
162    ContainerPolicy {
163        include_site: Range,
164        container: &'static str,
165        file: PathBuf,
166        violation: &'static str,
167    },
168    /// Loader propagated a non-`NotFound` I/O error.
169    LoaderIo { path: PathBuf, message: String },
170    /// `lex.include` annotation was missing the mandatory `src=` parameter.
171    MissingSrc { include_site: Range },
172}
173
174impl std::fmt::Display for IncludeError {
175    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
176        match self {
177            IncludeError::Cycle { path, chain, .. } => {
178                let chain_display: Vec<String> =
179                    chain.iter().map(|p| p.display().to_string()).collect();
180                write!(
181                    f,
182                    "include cycle: {} (chain: {})",
183                    path.display(),
184                    chain_display.join(" -> ")
185                )
186            }
187            IncludeError::DepthExceeded { limit, chain, .. } => {
188                let chain_display: Vec<String> =
189                    chain.iter().map(|p| p.display().to_string()).collect();
190                write!(
191                    f,
192                    "include depth exceeded limit of {limit} (chain: {})",
193                    chain_display.join(" -> ")
194                )
195            }
196            IncludeError::RootEscape { path, root } => write!(
197                f,
198                "include path {} escapes resolution root {}",
199                path.display(),
200                root.display()
201            ),
202            IncludeError::NotFound { path } => write!(f, "include not found: {}", path.display()),
203            IncludeError::ParseFailed { path, message } => {
204                write!(f, "failed to parse {}: {message}", path.display())
205            }
206            IncludeError::ContainerPolicy {
207                container,
208                file,
209                violation,
210                ..
211            } => write!(
212                f,
213                "included file {} contains {} but include site is inside {} \
214                 (which does not allow {})",
215                file.display(),
216                violation,
217                container,
218                violation
219            ),
220            IncludeError::LoaderIo { path, message } => {
221                write!(f, "loader error reading {}: {message}", path.display())
222            }
223            IncludeError::MissingSrc { .. } => {
224                write!(f, "lex.include annotation missing required src= parameter")
225            }
226        }
227    }
228}
229
230impl std::error::Error for IncludeError {}
231
232impl From<LoadError> for IncludeError {
233    fn from(err: LoadError) -> Self {
234        match err {
235            LoadError::NotFound { path } => IncludeError::NotFound { path },
236            LoadError::Io { path, message } => IncludeError::LoaderIo { path, message },
237        }
238    }
239}
240
/// The kind of container an include site lives in. It drives the
/// splice-time policy check, whose single rule today is "no Sessions
/// inside a `GeneralContainer`".
#[derive(Debug, Clone, Copy)]
enum ContainerKind {
    /// `Document.root.children` or `Session.children`; takes any content.
    Session,
    /// `Definition.children`, a `GeneralContainer`.
    Definition,
    /// `Annotation.children`, a `GeneralContainer`.
    AnnotationBody,
    /// `ListItem.children`, a `GeneralContainer`.
    ListItem,
}

impl ContainerKind {
    /// Human-readable container name used in error messages.
    fn name(self) -> &'static str {
        match self {
            Self::Session => "Session",
            Self::Definition => "Definition",
            Self::AnnotationBody => "Annotation body",
            Self::ListItem => "ListItem",
        }
    }

    /// Whether spliced content may contain Sessions in this container.
    fn allows_sessions(self) -> bool {
        match self {
            Self::Session => true,
            Self::Definition | Self::AnnotationBody | Self::ListItem => false,
        }
    }
}
269
270/// Resolve `:: lex.include ::` annotations starting from `source`, recursively.
271///
272/// `source_path` identifies the entry-point file. It is used to (a) resolve
273/// relative include paths against the entry file's directory, (b) stamp
274/// `Range.origin_path` on every node so downstream code (file-ref resolution,
275/// diagnostics, LSP goto) can report locations against the authoring file,
276/// and (c) seed the cycle-detection chain so an include cycle that loops
277/// back to the entry is caught. When `None`, relative paths resolve against
278/// `config.root`, origin stamping is skipped on the entry, and the chain
279/// starts empty.
280///
281/// # Pre/post-attachment
282///
283/// Internally this re-parses each source (entry + every loaded file) *without*
284/// annotation attachment so `lex.include` annotations are visible as standalone
285/// children where the splice can replace them in-place. After all splices,
286/// [`AttachAnnotations`] runs once on the merged tree, which lands the include
287/// annotation on the first spliced node by the standard "attach to next
288/// sibling" rule. This matches the textual paste mental model from the proposal.
289///
290/// # Recursion
291///
292/// Each loaded file is fully resolved (its own includes replaced) *before*
293/// being spliced into the host. The recursion uses each file's own directory
294/// as `host_dir`, so a relative path inside an included file resolves from
295/// that file's location — not the entry's. An active-chain stack of
296/// canonicalized paths gates against cycles; the depth counter gates against
297/// pathological nesting (default 8, configurable via [`ResolveConfig::max_depth`]).
298pub fn resolve_from_source(
299    source: &str,
300    source_path: Option<PathBuf>,
301    config: &ResolveConfig,
302    loader: &dyn Loader,
303) -> Result<Document, IncludeError> {
304    let entry_origin = source_path.as_ref().map(|p| Arc::new(p.clone()));
305    let host_dir = source_path
306        .as_ref()
307        .and_then(|p| p.parent().map(Path::to_path_buf))
308        .unwrap_or_else(|| config.root.clone());
309
310    let mut doc = parse_no_attach(source).map_err(|message| IncludeError::ParseFailed {
311        path: source_path.clone().unwrap_or_default(),
312        message,
313    })?;
314
315    if let Some(origin) = entry_origin.as_ref() {
316        stamp_doc(&mut doc, origin);
317    }
318
319    // Seed the chain with the lexically-normalized entry path (when known)
320    // so an include that loops back to the entry is detected as a cycle.
321    // Normalization here is essential — `target_path` values produced by
322    // `resolve_path` are also lexically normalized, so an unnormalized
323    // entry would never compare equal to its normalized self.
324    let mut chain: Vec<PathBuf> = source_path
325        .as_ref()
326        .map(|p| vec![lexical_normalize(p)])
327        .unwrap_or_default();
328    let mut state = ResolverState {
329        config,
330        loader,
331        chain: &mut chain,
332        depth: 0,
333    };
334
335    splice_in_session_container(doc.root.children.as_mut_vec(), &host_dir, &mut state)?;
336
337    let doc = AttachAnnotations::new()
338        .run(doc)
339        .map_err(|e| IncludeError::ParseFailed {
340            path: source_path.unwrap_or_default(),
341            message: format!("annotation attachment failed: {e}"),
342        })?;
343
344    Ok(doc)
345}
346
347// ============================================================================
348// Splicing
349// ============================================================================
350
351/// Per-resolution state threaded through the recursive walker. Keeps the
352/// signatures of the splice/process functions short and ensures
353/// `chain`/`depth` are updated in lock-step (push/pop, +1/back-out) at
354/// each include site.
355struct ResolverState<'a> {
356    config: &'a ResolveConfig,
357    loader: &'a dyn Loader,
358    /// Active resolution stack: lexically-normalized absolute paths
359    /// currently being resolved. Pushed when we begin loading a file and
360    /// popped when its tree is fully resolved. A push that finds the
361    /// path already on the stack is a cycle.
362    ///
363    /// Normalization (not filesystem canonicalization) is what's used
364    /// here: the resolver never touches `std::fs`, so symlink resolution
365    /// is out. Two paths that lexically refer to the same file (after
366    /// `.`/`..` collapse) compare equal; two paths reaching the same
367    /// inode via different routes do not. For real-FS use cases this is
368    /// fine because `FsLoader` will canonicalize on load before the
369    /// chain comparison sees the path.
370    chain: &'a mut Vec<PathBuf>,
371    /// Number of include hops from the entry point. Each recursion into a
372    /// loaded file increments by 1. Hitting `config.max_depth` is an error.
373    depth: usize,
374}
375
376fn splice_in_session_container(
377    children: &mut Vec<ContentItem>,
378    host_dir: &Path,
379    state: &mut ResolverState<'_>,
380) -> Result<(), IncludeError> {
381    // Post-order: recurse into nested containers first, splice this
382    // container's includes second. The recurse step walks the *original*
383    // tree; the splice step inserts already-fully-resolved content
384    // (recursion happens inside `process_includes`), which is therefore
385    // never re-walked.
386    recurse_into_children(children, host_dir, state)?;
387    process_includes(children, host_dir, state, ContainerKind::Session)
388}
389
390fn splice_in_general_container(
391    container: &mut GeneralContainer,
392    host_dir: &Path,
393    state: &mut ResolverState<'_>,
394    kind: ContainerKind,
395) -> Result<(), IncludeError> {
396    recurse_into_children(container.as_mut_vec(), host_dir, state)?;
397    process_includes(container.as_mut_vec(), host_dir, state, kind)
398}
399
400// Allow &mut Vec because `splice` needs Vec-specific operations.
401#[allow(clippy::ptr_arg)]
402fn process_includes(
403    children: &mut Vec<ContentItem>,
404    host_dir: &Path,
405    state: &mut ResolverState<'_>,
406    kind: ContainerKind,
407) -> Result<(), IncludeError> {
408    // Collect indices of standalone include annotations in this container.
409    let include_indices: Vec<usize> = children
410        .iter()
411        .enumerate()
412        .filter_map(|(i, item)| match item {
413            ContentItem::Annotation(a) if a.is_include() => Some(i),
414            _ => None,
415        })
416        .collect();
417
418    // Process in reverse order so earlier indices stay valid.
419    for i in include_indices.into_iter().rev() {
420        let annotation = match &children[i] {
421            ContentItem::Annotation(a) => a.clone(),
422            _ => unreachable!("index came from include filter"),
423        };
424
425        let splice_items = resolve_one_include(&annotation, host_dir, state, kind)?;
426
427        // Replace the include annotation with the splice content.
428        // The annotation itself stays in the children list immediately
429        // before the splice, so the post-resolution AttachAnnotations
430        // pass moves it onto the first spliced node by the standard
431        // "attach to next sibling" rule.
432        let mut replacement = Vec::with_capacity(splice_items.len() + 1);
433        replacement.push(ContentItem::Annotation(annotation));
434        replacement.extend(splice_items);
435        children.splice(i..=i, replacement);
436    }
437
438    Ok(())
439}
440
441/// Resolve a single include annotation: path → load → parse → recurse →
442/// stamp → policy-check → splice list.
443///
444/// The recursion happens *here*: after parsing the loaded file, we walk
445/// its tree with the loaded file's own directory as `host_dir`, with the
446/// loaded file pushed onto `state.chain` and `state.depth` bumped by 1.
447/// When this call returns, the splice list is fully resolved and ready to
448/// be inserted into the host container.
449fn resolve_one_include(
450    annotation: &crate::lex::ast::elements::annotation::Annotation,
451    host_dir: &Path,
452    state: &mut ResolverState<'_>,
453    parent_kind: ContainerKind,
454) -> Result<Vec<ContentItem>, IncludeError> {
455    let src = annotation
456        .include_src()
457        .ok_or_else(|| IncludeError::MissingSrc {
458            include_site: annotation.location.clone(),
459        })?;
460
461    let target_path = resolve_path(&src, host_dir, &state.config.root)?;
462
463    // Cycle check before load — keep loader free of duplicate work.
464    if state.chain.iter().any(|p| p == &target_path) {
465        return Err(IncludeError::Cycle {
466            include_site: annotation.location.clone(),
467            path: target_path,
468            chain: state.chain.clone(),
469        });
470    }
471
472    // Depth check before recursing into the loaded file. A site that sits
473    // exactly at `max_depth` is fine; a site that would push us *past* it
474    // is the failure case.
475    if state.depth >= state.config.max_depth {
476        return Err(IncludeError::DepthExceeded {
477            include_site: annotation.location.clone(),
478            limit: state.config.max_depth,
479            chain: state.chain.clone(),
480        });
481    }
482
483    let target_source = state.loader.load(&target_path)?;
484
485    let mut included =
486        parse_no_attach(&target_source).map_err(|message| IncludeError::ParseFailed {
487            path: target_path.clone(),
488            message,
489        })?;
490
491    let target_origin = Arc::new(target_path.clone());
492    stamp_doc(&mut included, &target_origin);
493
494    // Recursively resolve includes inside the loaded file. The host_dir
495    // for that walk is the loaded file's own parent; the chain gains
496    // this path and depth bumps by 1 — both are popped/restored on the
497    // way back so siblings see the same state we got.
498    let included_dir = target_path
499        .parent()
500        .map(Path::to_path_buf)
501        .unwrap_or_else(|| state.config.root.clone());
502
503    state.chain.push(target_path.clone());
504    let saved_depth = state.depth;
505    state.depth = saved_depth + 1;
506    let recurse_result =
507        splice_in_session_container(included.root.children.as_mut_vec(), &included_dir, state);
508    state.depth = saved_depth;
509    state.chain.pop();
510    recurse_result?;
511
512    let splice_items = prepare_splice_list(included);
513    validate_against_kind(
514        &splice_items,
515        parent_kind,
516        &annotation.location,
517        &target_path,
518    )?;
519
520    Ok(splice_items)
521}
522
523#[allow(clippy::ptr_arg)]
524fn recurse_into_children(
525    children: &mut Vec<ContentItem>,
526    host_dir: &Path,
527    state: &mut ResolverState<'_>,
528) -> Result<(), IncludeError> {
529    for item in children.iter_mut() {
530        match item {
531            ContentItem::Session(s) => {
532                splice_in_session_container(s.children.as_mut_vec(), host_dir, state)?;
533            }
534            ContentItem::Definition(d) => {
535                splice_in_general_container(
536                    &mut d.children,
537                    host_dir,
538                    state,
539                    ContainerKind::Definition,
540                )?;
541            }
542            ContentItem::Annotation(a) if !a.is_include() => {
543                splice_in_general_container(
544                    &mut a.children,
545                    host_dir,
546                    state,
547                    ContainerKind::AnnotationBody,
548                )?;
549            }
550            ContentItem::List(l) => {
551                for li in l.items.as_mut_vec().iter_mut() {
552                    if let ContentItem::ListItem(item) = li {
553                        splice_in_general_container(
554                            &mut item.children,
555                            host_dir,
556                            state,
557                            ContainerKind::ListItem,
558                        )?;
559                    }
560                }
561            }
562            _ => {}
563        }
564    }
565    Ok(())
566}
567
568fn prepare_splice_list(mut included: Document) -> Vec<ContentItem> {
569    let mut items: Vec<ContentItem> = Vec::new();
570
571    // Document title → Paragraph, prepended.
572    // Equivalent to what a textual paste would parse (an unindented line
573    // becomes a paragraph in the host's context). Per the revised
574    // spec §5.2 this is "do nothing" semantics — converting matches what
575    // the parser would do if the included source were inlined and reparsed.
576    if let Some(title) = included.title {
577        let location = title.location.clone();
578        let para = Paragraph::from_line(title.as_str().to_string()).at(location);
579        items.push(ContentItem::Paragraph(para));
580    }
581
582    // Document-level annotations → regular annotations, prepended.
583    for ann in included.annotations {
584        items.push(ContentItem::Annotation(ann));
585    }
586
587    // Body of the included document.
588    items.append(included.root.children.as_mut_vec());
589
590    items
591}
592
593fn validate_against_kind(
594    items: &[ContentItem],
595    kind: ContainerKind,
596    site: &Range,
597    file: &Path,
598) -> Result<(), IncludeError> {
599    if kind.allows_sessions() {
600        return Ok(());
601    }
602    if items.iter().any(|i| matches!(i, ContentItem::Session(_))) {
603        return Err(IncludeError::ContainerPolicy {
604            include_site: site.clone(),
605            container: kind.name(),
606            file: file.to_path_buf(),
607            violation: "Sessions",
608        });
609    }
610    Ok(())
611}
612
613// ============================================================================
614// Path resolution
615// ============================================================================
616
617/// Resolve a file-reference target string the same way the include
618/// resolver resolves include paths.
619///
620/// Use this when consuming `ReferenceType::File { target }` (or any other
621/// node-attached path) so that relative paths resolve from the *authoring*
622/// file's directory, not from wherever the merged document happens to be
623/// rooted. Pass `ref_origin` as the [`Range::origin_path`] of the inline's
624/// containing node (or `None` if the node was never stamped — in that case
625/// the path is treated as if authored at the root).
626///
627/// Behaviour matches the include resolver:
628/// - Root-absolute targets (leading `/`) resolve under `root`.
629/// - Other targets resolve relative to `ref_origin`'s parent (or `root`
630///   when `ref_origin` is `None`).
631/// - The result is lexically normalized and checked against `root` —
632///   paths that escape it return `RootEscape`.
633///
634/// This is a sister to the resolver's internal `resolve_path` and shares
635/// the same lexical-normalization caveat: it does not touch the filesystem.
636pub fn resolve_file_reference(
637    target: &str,
638    ref_origin: Option<&Path>,
639    root: &Path,
640) -> Result<PathBuf, IncludeError> {
641    let host_dir: PathBuf = ref_origin
642        .and_then(|p| p.parent())
643        .map(Path::to_path_buf)
644        .unwrap_or_else(|| root.to_path_buf());
645    resolve_path(target, &host_dir, root)
646}
647
648fn resolve_path(src: &str, host_dir: &Path, root: &Path) -> Result<PathBuf, IncludeError> {
649    let candidate = if let Some(rel) = src.strip_prefix('/') {
650        // Root-absolute: leading slash means "from the resolution root".
651        root.join(rel)
652    } else {
653        // Relative: from the host file's directory.
654        host_dir.join(src)
655    };
656    let normalized = lexical_normalize(&candidate);
657    let canonical_root = lexical_normalize(root);
658    if !normalized.starts_with(&canonical_root) {
659        return Err(IncludeError::RootEscape {
660            path: normalized,
661            root: canonical_root,
662        });
663    }
664    Ok(normalized)
665}
666
/// Normalize a path purely by inspecting its components; the filesystem
/// is never consulted.
///
/// `std::fs::canonicalize` needs the path to exist, which rules it out
/// for in-memory loaders. A lexical pass is enough for the resolver: it
/// only needs a stable identity for cycle detection and a uniform shape
/// for the root-escape prefix check.
///
/// Rules:
/// - `.` components are dropped;
/// - `..` removes the preceding component only when that component is an
///   ordinary name (`Component::Normal`);
/// - otherwise (nothing accumulated yet, or the last component is itself
///   `..` or a root/prefix marker) the `..` is *kept*.
///
/// Keeping unmatched `..` components is what stops `../../etc/passwd`
/// from collapsing into `etc/passwd`. Blindly calling `PathBuf::pop` would
/// swallow a leading `..` and yield a path that wrongly appears to sit
/// under the root; with the `..` preserved, the normalized path stays
/// outside any sensible root and the escape check fires.
fn lexical_normalize(p: &Path) -> PathBuf {
    use std::path::Component;

    p.components().fold(PathBuf::new(), |mut acc, part| {
        match part {
            Component::CurDir => {}
            Component::ParentDir => {
                let ends_in_name =
                    matches!(acc.components().next_back(), Some(Component::Normal(_)));
                if ends_in_name {
                    acc.pop();
                } else {
                    acc.push("..");
                }
            }
            Component::Prefix(_) | Component::RootDir | Component::Normal(_) => {
                acc.push(part.as_os_str())
            }
        }
        acc
    })
}
708
709// ============================================================================
710// Origin stamping
711// ============================================================================
712//
713// Walk every node in a Document and set `Range.origin_path` on each
714// `.location` field. The walk only stamps the *block-level* `.location`
715// fields here; finer-grained inline ranges land in PR 6 when file-ref
716// resolution starts consulting them.
717
718fn stamp_doc(doc: &mut Document, origin: &Arc<PathBuf>) {
719    if let Some(title) = doc.title.as_mut() {
720        title.location.origin_path = Some(Arc::clone(origin));
721    }
722    for ann in doc.annotations.iter_mut() {
723        stamp_annotation(ann, origin);
724    }
725    stamp_session(&mut doc.root, origin);
726}
727
728fn stamp_session(s: &mut Session, origin: &Arc<PathBuf>) {
729    s.location.origin_path = Some(Arc::clone(origin));
730    if let Some(loc) = s.title.location.as_mut() {
731        loc.origin_path = Some(Arc::clone(origin));
732    }
733    for ann in s.annotations.iter_mut() {
734        stamp_annotation(ann, origin);
735    }
736    for item in s.children.as_mut_vec().iter_mut() {
737        stamp_item(item, origin);
738    }
739}
740
741fn stamp_annotation(
742    a: &mut crate::lex::ast::elements::annotation::Annotation,
743    origin: &Arc<PathBuf>,
744) {
745    a.location.origin_path = Some(Arc::clone(origin));
746    a.data.location.origin_path = Some(Arc::clone(origin));
747    for item in a.children.as_mut_vec().iter_mut() {
748        stamp_item(item, origin);
749    }
750}
751
752fn stamp_item(item: &mut ContentItem, origin: &Arc<PathBuf>) {
753    match item {
754        ContentItem::Session(s) => stamp_session(s, origin),
755        ContentItem::Annotation(a) => stamp_annotation(a, origin),
756        ContentItem::Paragraph(p) => {
757            p.location.origin_path = Some(Arc::clone(origin));
758            for ann in p.annotations.iter_mut() {
759                stamp_annotation(ann, origin);
760            }
761            for line in p.lines.iter_mut() {
762                stamp_item(line, origin);
763            }
764        }
765        ContentItem::List(l) => {
766            l.location.origin_path = Some(Arc::clone(origin));
767            for li in l.items.as_mut_vec().iter_mut() {
768                stamp_item(li, origin);
769            }
770        }
771        ContentItem::ListItem(li) => {
772            li.location.origin_path = Some(Arc::clone(origin));
773            for ann in li.annotations.iter_mut() {
774                stamp_annotation(ann, origin);
775            }
776            for child in li.children.as_mut_vec().iter_mut() {
777                stamp_item(child, origin);
778            }
779        }
780        ContentItem::Definition(d) => {
781            d.location.origin_path = Some(Arc::clone(origin));
782            for ann in d.annotations.iter_mut() {
783                stamp_annotation(ann, origin);
784            }
785            for child in d.children.as_mut_vec().iter_mut() {
786                stamp_item(child, origin);
787            }
788        }
789        ContentItem::VerbatimBlock(v) => {
790            v.location.origin_path = Some(Arc::clone(origin));
791        }
792        ContentItem::VerbatimLine(vl) => {
793            vl.location.origin_path = Some(Arc::clone(origin));
794        }
795        ContentItem::Table(t) => {
796            t.location.origin_path = Some(Arc::clone(origin));
797        }
798        ContentItem::TextLine(tl) => {
799            tl.location.origin_path = Some(Arc::clone(origin));
800        }
801        ContentItem::BlankLineGroup(b) => {
802            b.location.origin_path = Some(Arc::clone(origin));
803        }
804    }
805}
806
807// ============================================================================
808// Parser glue
809// ============================================================================
810
811/// Parse `source` into a Document but skip the annotation-attachment stage,
812/// so include annotations are findable in container children lists.
813fn parse_no_attach(source: &str) -> Result<Document, String> {
814    crate::lex::testing::parse_without_annotation_attachment(source)
815}
816
817// ============================================================================
818// Filesystem-backed loader
819// ============================================================================
820
/// [`Loader`] that reads files from the filesystem with `std::fs::read_to_string`.
///
/// This is the production loader used by the CLI; the LSP wraps it with a
/// file-watch invalidation layer in PR 8. lex-core's *resolver* code does not
/// reference `std::fs` — `FsLoader` is the one place where it does, isolated
/// behind the [`Loader`] trait so the rest of the crate stays sandbox- and
/// WASM-friendly.
///
/// `FsLoader` is stateless; construct one at the start of a resolution and
/// share it for the duration. Errors map cleanly:
/// - `std::io::ErrorKind::NotFound` → [`LoadError::NotFound`]
/// - any other I/O error → [`LoadError::Io`]
///
/// Being a field-less unit struct, it is trivially `Copy`, and its
/// `Default` is derived rather than hand-written.
#[derive(Debug, Clone, Copy, Default)]
pub struct FsLoader;

impl FsLoader {
    /// Create a filesystem loader.
    ///
    /// Equivalent to [`FsLoader::default`]; `const` so it can also be used
    /// in constant and static initializers.
    pub const fn new() -> Self {
        Self
    }
}
846
847impl Loader for FsLoader {
848    fn load(&self, path: &Path) -> Result<String, LoadError> {
849        std::fs::read_to_string(path).map_err(|e| match e.kind() {
850            std::io::ErrorKind::NotFound => LoadError::NotFound {
851                path: path.to_path_buf(),
852            },
853            _ => LoadError::Io {
854                path: path.to_path_buf(),
855                message: e.to_string(),
856            },
857        })
858    }
859}
860
861// ============================================================================
862// Test fixtures (test-support feature + cfg(test))
863// ============================================================================
864
/// In-memory [`Loader`] backed by a `HashMap<PathBuf, String>`.
///
/// Only compiled for tests and for the `test-support` feature. Paths are
/// used as map keys exactly as given.
#[cfg(any(test, feature = "test-support"))]
#[derive(Debug, Clone, Default)]
pub struct MemoryLoader {
    /// Registered sources, keyed by the path they were inserted under.
    files: std::collections::HashMap<PathBuf, String>,
}

#[cfg(any(test, feature = "test-support"))]
impl MemoryLoader {
    /// Create an empty loader. Add files with [`MemoryLoader::insert`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Register a file at `path` with the given source text.
    ///
    /// Inserting under a path that is already registered replaces the
    /// previous contents. Returns `&mut Self` so calls can be chained.
    pub fn insert<P: Into<PathBuf>, S: Into<String>>(&mut self, path: P, contents: S) -> &mut Self {
        self.files.insert(path.into(), contents.into());
        self
    }

    /// Convenience constructor: build a loader from any iterator of
    /// `(path, contents)` pairs.
    ///
    /// When the same path appears more than once, the last pair wins.
    pub fn from_pairs<I, P, S>(pairs: I) -> Self
    where
        I: IntoIterator<Item = (P, S)>,
        P: Into<PathBuf>,
        S: Into<String>,
    {
        Self {
            files: pairs
                .into_iter()
                .map(|(path, contents)| (path.into(), contents.into()))
                .collect(),
        }
    }
}
908
#[cfg(any(test, feature = "test-support"))]
impl Loader for MemoryLoader {
    /// Return a copy of the source registered under `path`.
    ///
    /// Paths with no registered entry yield [`LoadError::NotFound`]
    /// carrying the requested path.
    fn load(&self, path: &Path) -> Result<String, LoadError> {
        match self.files.get(path) {
            Some(contents) => Ok(contents.clone()),
            None => Err(LoadError::NotFound {
                path: path.to_path_buf(),
            }),
        }
    }
}
920
921// ============================================================================
922// Tests
923// ============================================================================
924
925#[cfg(test)]
926mod tests;