Skip to main content

nodex_core/builder/
mod.rs

1pub mod cache;
2pub mod resolver;
3pub mod scanner;
4pub mod validator;
5
6use indexmap::IndexMap;
7use rayon::prelude::*;
8use std::collections::BTreeMap;
9use std::path::Path;
10
11use crate::config::Config;
12use crate::error::{Error, Result};
13use crate::model::{Graph, Node, RawEdge};
14use crate::parser::{self, ParsedDocument};
15
16use cache::BuildCache;
17use resolver::{build_id_set, build_path_index, resolve_edges};
18use validator::validate_supersedes_dag;
19
20/// Build result with stats for CLI output.
21pub struct BuildResult {
22    pub graph: Graph,
23    pub stats: BuildStats,
24}
25
26#[derive(Debug, serde::Serialize)]
27pub struct BuildStats {
28    pub nodes: usize,
29    pub edges: usize,
30    pub cached: usize,
31    pub parsed: usize,
32    #[serde(skip_serializing_if = "Vec::is_empty")]
33    pub warnings: Vec<String>,
34}
35
36/// Build the full document graph.
37pub fn build(root: &Path, config: &Config, full_rebuild: bool) -> Result<BuildResult> {
38    // 1. Scan scope
39    let paths = scanner::scan_scope(root, config)?;
40
41    // 2. Load cache (unless full rebuild). Invalidates if config
42    // changed OR if the nodex binary itself was upgraded — the cache
43    // holds serialised `Node` / `RawEdge` / `Confidence` values, so a
44    // struct-shape change in a new version would otherwise let an old
45    // cache silently produce stale nodes on the next build. Mixing
46    // `CARGO_PKG_VERSION` into the hashed input makes every upgrade a
47    // one-time full rebuild, which is cheap and correct.
48    //
49    // `Config` is a plain, fully-serialisable struct — silently
50    // falling back to an empty hash on serialisation failure (the
51    // previous `unwrap_or_default`) would let a changed config reuse
52    // stale cache entries. `expect` makes the invariant explicit so
53    // anyone adding a non-serialisable field to `Config` fails fast.
54    let cache_path = root.join(&config.output.dir).join("cache.json");
55    let config_json = serde_json::to_string(config)
56        .expect("Config is defined entirely over serializable primitives");
57    let config_hash = cache::compute_hash(&format!(
58        "nodex={}\n{}",
59        env!("CARGO_PKG_VERSION"),
60        config_json
61    ));
62    let (mut cache, cache_warning) = if full_rebuild {
63        (BuildCache::default(), None)
64    } else {
65        BuildCache::load(&cache_path, &config_hash)
66    };
67    cache.config_hash = config_hash;
68
69    // 3. Read file contents (parallel). Collect read errors for warning.
70    let read_results: Vec<(
71        std::path::PathBuf,
72        std::result::Result<String, std::io::Error>,
73    )> = paths
74        .par_iter()
75        .map(|rel_path| {
76            let abs_path = root.join(rel_path);
77            let result = std::fs::read_to_string(&abs_path);
78            (rel_path.clone(), result)
79        })
80        .collect();
81
82    let mut read_warnings = Vec::new();
83    let mut file_contents: Vec<(std::path::PathBuf, String)> = Vec::new();
84    for (rel_path, result) in read_results {
85        match result {
86            Ok(content) => file_contents.push((rel_path, content)),
87            Err(e) => read_warnings.push(format!("skipped {}: {e}", rel_path.display())),
88        }
89    }
90
91    // 4. Parse documents (parallel, with caching)
92    let mut cached_count = 0usize;
93    let mut parsed_count = 0usize;
94
95    // Separate into cached hits and cache misses
96    let mut cached_results: Vec<(Node, Vec<RawEdge>)> = Vec::new();
97    let mut to_parse: Vec<(std::path::PathBuf, String)> = Vec::new();
98
99    for (rel_path, content) in &file_contents {
100        if let Some(entry) = cache.get(rel_path, content) {
101            cached_results.push((
102                entry.node.clone(),
103                entry.raw_edges.iter().cloned().map(RawEdge::from).collect(),
104            ));
105            cached_count += 1;
106        } else {
107            to_parse.push((rel_path.clone(), content.clone()));
108        }
109    }
110
111    // Parse cache misses in parallel
112    let fresh_results: Vec<Result<(std::path::PathBuf, String, ParsedDocument)>> = to_parse
113        .par_iter()
114        .map(|(rel_path, content)| {
115            let doc = parser::parse_document(rel_path, content, config)?;
116            Ok((rel_path.clone(), content.clone(), doc))
117        })
118        .collect();
119
120    let mut all_nodes: Vec<(String, Node)> = Vec::new();
121    let mut all_raw_edges: Vec<(String, std::path::PathBuf, Vec<RawEdge>)> = Vec::new();
122
123    // Collect cached results
124    for (node, raw_edges) in cached_results {
125        let id = node.id.clone();
126        let path = node.path.clone();
127        all_raw_edges.push((id.clone(), path, raw_edges));
128        all_nodes.push((id, node));
129    }
130
131    // Collect fresh results and update cache
132    for result in fresh_results {
133        let (rel_path, content, doc) = result?;
134        parsed_count += 1;
135
136        cache.insert(rel_path, &content, doc.node.clone(), &doc.raw_edges);
137
138        let id = doc.node.id.clone();
139        let path = doc.node.path.clone();
140        all_raw_edges.push((id.clone(), path, doc.raw_edges));
141        all_nodes.push((id, doc.node));
142    }
143
144    // 5. Check for duplicate ids
145    {
146        let mut seen: BTreeMap<&str, &Path> = BTreeMap::new();
147        for (id, node) in &all_nodes {
148            if let Some(&first_path) = seen.get(id.as_str()) {
149                return Err(Error::DuplicateId {
150                    id: id.clone(),
151                    first: first_path.to_path_buf(),
152                    second: node.path.clone(),
153                });
154            }
155            seen.insert(id.as_str(), &node.path);
156        }
157    }
158
159    // 6. Build resolution indices
160    let path_index = build_path_index(&all_nodes);
161    let id_set = build_id_set(&all_nodes);
162
163    // 7. Resolve edges
164    let mut edges = Vec::new();
165    for (source_id, source_path, raw_edges) in all_raw_edges {
166        let resolved = resolve_edges(&source_id, raw_edges, &source_path, &path_index, &id_set);
167        edges.extend(resolved);
168    }
169
170    // 7b. Translate each `superseded_by` scalar into its canonical
171    //     `supersedes` edge. frontmatter `supersedes: [X]` on node N
172    //     yields edge N → X; frontmatter `superseded_by: Y` on node M
173    //     yields edge Y → M (same direction, different authoring style).
174    //     Without this step, documents that author only the
175    //     `superseded_by` field never show up in `backlinks` / `node`
176    //     incoming, and `chain` had to traverse a scalar pointer that
177    //     lived outside the edge graph — two representations of the
178    //     same relation. Materialising both into edges unifies the
179    //     graph so every query uses the same traversal.
180    edges.extend(derive_superseded_by_edges(&all_nodes));
181
182    // Dedupe by (source, target, relation) so documents that declare
183    // both sides (N.supersedes=[X] AND X.superseded_by=N) produce a
184    // single edge rather than two identical ones. The body-link
185    // resolver never produces duplicates, so this only affects
186    // frontmatter-sourced edges.
187    dedupe_edges(&mut edges);
188
189    // 8. Validate supersedes DAG
190    validate_supersedes_dag(&edges)?;
191
192    // 9. Sort edges for deterministic output
193    edges.sort_by(|a, b| {
194        a.source
195            .cmp(&b.source)
196            .then_with(|| a.relation.cmp(&b.relation))
197            .then_with(|| a.location.cmp(&b.location))
198    });
199
200    // 10. Build sorted node map
201    let mut node_map = IndexMap::new();
202    all_nodes.sort_by(|a, b| a.0.cmp(&b.0));
203    for (id, node) in all_nodes {
204        node_map.insert(id, node);
205    }
206
207    // 11. Clean cache and save
208    let valid_paths: Vec<_> = file_contents.iter().map(|(p, _)| p.clone()).collect();
209    cache.retain_paths(&valid_paths);
210    let mut warnings = read_warnings;
211    if let Some(msg) = cache_warning {
212        warnings.push(msg);
213    }
214    if let Err(e) = cache.save(&cache_path) {
215        warnings.push(format!("cache save failed: {e}"));
216    }
217
218    let stats = BuildStats {
219        nodes: node_map.len(),
220        edges: edges.len(),
221        cached: cached_count,
222        parsed: parsed_count,
223        warnings,
224    };
225
226    Ok(BuildResult {
227        graph: Graph::new(node_map, edges),
228        stats,
229    })
230}
231
232/// Build the edges implied by `superseded_by` scalars. Each `M.superseded_by = Y`
233/// becomes an edge `Y → M` with relation `"supersedes"`, matching the
234/// canonical direction produced by `supersedes: [...]` vectors. When Y
235/// isn't itself a known node id the synthesized edge is skipped — an
236/// unresolved target here is better caught by the regular body-link
237/// path than by being smuggled into the graph as `ResolvedTarget::Unresolved`.
238fn derive_superseded_by_edges(
239    all_nodes: &[(String, crate::model::Node)],
240) -> Vec<crate::model::Edge> {
241    use crate::model::{Confidence, Edge, ResolvedTarget};
242    let known_ids: std::collections::BTreeSet<&str> =
243        all_nodes.iter().map(|(id, _)| id.as_str()).collect();
244    let mut out = Vec::new();
245    for (id, node) in all_nodes {
246        let Some(ref successor) = node.superseded_by else {
247            continue;
248        };
249        if !known_ids.contains(successor.as_str()) {
250            // `successor` isn't a known node id — skip. The standard
251            // resolver will record this as an unresolved edge from the
252            // body-link pipeline if the content references it.
253            continue;
254        }
255        out.push(Edge {
256            source: successor.clone(),
257            target: ResolvedTarget::resolved(id.as_str()),
258            relation: "supersedes".to_string(),
259            confidence: Confidence::Extracted,
260            location: format!("frontmatter:superseded_by@{id}"),
261        });
262    }
263    out
264}
265
266/// Remove duplicate `(source, target, relation)` edges while preserving
267/// the first occurrence's location. The canonical representation keeps
268/// the original edge (usually a direct `supersedes` declaration) and
269/// discards the mirrored one derived from `superseded_by`.
270fn dedupe_edges(edges: &mut Vec<crate::model::Edge>) {
271    use crate::model::ResolvedTarget;
272    let mut seen: std::collections::BTreeSet<(String, String, String)> =
273        std::collections::BTreeSet::new();
274    edges.retain(|edge| {
275        let target_key = match &edge.target {
276            ResolvedTarget::Resolved { id } => format!("r:{id}"),
277            ResolvedTarget::Unresolved { raw, .. } => format!("u:{raw}"),
278        };
279        seen.insert((edge.source.clone(), target_key, edge.relation.clone()))
280    });
281}