Skip to main content

trusty_memory/bootstrap/
scan.rs

1//! Project-file scanners for the KG bootstrap seeder.
2//!
3//! Why: Splitting scanner logic into its own module keeps each file under the
4//! 500-SLOC cap and lets unit tests target the scanners directly without
5//! pulling in the async entry point or type definitions.
6//! What: `scan_project` is the top-level blocking orchestrator; the private
7//! per-file functions (`scan_cargo_toml`, `scan_package_json`,
8//! `scan_pyproject_toml`, `scan_go_mod`, `scan_claude_md`, `scan_git_config`,
9//! `read_origin_url`) each extract triples from one well-known file format.
10//! Test: `scan_project_extracts_cargo_facts`,
11//! `scan_project_extracts_package_json`,
12//! `scan_project_falls_back_to_palace_id_when_no_manifest`,
13//! and others in the `tests` sub-module.
14
15use std::path::Path;
16
17use anyhow::Result;
18
19use super::types::{BootstrapTriple, ScannedFile};
20
21/// Blocking scanner: walk well-known files under `root` and extract triples.
22///
23/// Why: Pulled out as a sync function so the file I/O + TOML/JSON parsing
24/// run on a blocking thread (via `spawn_blocking`) and the algorithm itself
25/// is trivially unit-testable against fixture directories.
26/// What: Returns `(triples, per_file_summary, project_subject)`. The
27/// project subject is derived from the first manifest that yields a name;
28/// falls back to `fallback_subject` (typically the palace id) when none
29/// match.
30/// Test: `scan_project_extracts_cargo_facts`,
31/// `scan_project_extracts_package_json`,
32/// `scan_project_falls_back_to_palace_id_when_no_manifest`.
33pub fn scan_project(
34    root: &Path,
35    fallback_subject: &str,
36) -> Result<(Vec<BootstrapTriple>, Vec<ScannedFile>, String)> {
37    let mut triples: Vec<BootstrapTriple> = Vec::new();
38    let mut summary: Vec<ScannedFile> = Vec::new();
39    let mut project_subject: Option<String> = None;
40
41    // 1. Cargo.toml
42    let before = triples.len();
43    if let Some(name) = scan_cargo_toml(root, &mut triples) {
44        project_subject.get_or_insert(name);
45    }
46    if triples.len() > before {
47        summary.push(ScannedFile {
48            file: "Cargo.toml".to_string(),
49            triples: triples.len() - before,
50        });
51    }
52
53    // 2. package.json
54    let before = triples.len();
55    if let Some(name) = scan_package_json(root, &mut triples) {
56        project_subject.get_or_insert(name);
57    }
58    if triples.len() > before {
59        summary.push(ScannedFile {
60            file: "package.json".to_string(),
61            triples: triples.len() - before,
62        });
63    }
64
65    // 3. pyproject.toml
66    let before = triples.len();
67    if let Some(name) = scan_pyproject_toml(root, &mut triples) {
68        project_subject.get_or_insert(name);
69    }
70    if triples.len() > before {
71        summary.push(ScannedFile {
72            file: "pyproject.toml".to_string(),
73            triples: triples.len() - before,
74        });
75    }
76
77    // 4. go.mod
78    let before = triples.len();
79    if let Some(name) = scan_go_mod(root, &mut triples) {
80        project_subject.get_or_insert(name);
81    }
82    if triples.len() > before {
83        summary.push(ScannedFile {
84            file: "go.mod".to_string(),
85            triples: triples.len() - before,
86        });
87    }
88
89    // 5. CLAUDE.md — first H1 heading as descriptive name. Does not set
90    //    project_subject (the manifest sources are stronger signals) but
91    //    contributes a `has_description` triple when the subject is known.
92    let before = triples.len();
93    scan_claude_md(root, project_subject.as_deref(), &mut triples);
94    if triples.len() > before {
95        summary.push(ScannedFile {
96            file: "CLAUDE.md".to_string(),
97            triples: triples.len() - before,
98        });
99    }
100
101    // 6. .git/config — source repo URL.
102    let before = triples.len();
103    scan_git_config(root, project_subject.as_deref(), &mut triples);
104    if triples.len() > before {
105        summary.push(ScannedFile {
106            file: ".git/config".to_string(),
107            triples: triples.len() - before,
108        });
109    }
110
111    let subject = project_subject.unwrap_or_else(|| fallback_subject.to_string());
112
113    // Rewrite any triples that used a placeholder subject (only the
114    // CLAUDE.md / .git/config scanners are subject-dependent; if no manifest
115    // matched, those scanners ran with subject=None and produced nothing, so
116    // this is currently a no-op — but keeping the loop makes future scanner
117    // additions safe).
118    for t in &mut triples {
119        if t.subject.is_empty() {
120            t.subject = subject.clone();
121        }
122    }
123
124    Ok((triples, summary, subject))
125}
126
127/// Scan `Cargo.toml`. Returns the package/workspace name if extracted.
128///
129/// Why: Rust projects are the primary trusty-tools target; we want
130/// `has_language=Rust`, `has_version`, `has_edition`, `has_rust_version`,
131/// and `workspace_member` triples auto-populated so `kg_query` against the
132/// project name returns useful context immediately.
133/// What: Parses the TOML; emits `(name, has_language, "Rust")` always when
134/// the manifest exists, plus version/edition/rust-version/workspace member
135/// triples when present.
136/// Test: `scan_project_extracts_cargo_facts`.
137fn scan_cargo_toml(root: &Path, out: &mut Vec<BootstrapTriple>) -> Option<String> {
138    let manifest = root.join("Cargo.toml");
139    let raw = std::fs::read_to_string(&manifest).ok()?;
140    let parsed: toml::Value = match toml::from_str(&raw) {
141        Ok(v) => v,
142        Err(e) => {
143            tracing::debug!("bootstrap: parse Cargo.toml failed: {e:#}");
144            return None;
145        }
146    };
147
148    // Workspace root manifests may have no [package] section. Use the
149    // workspace.package.name if present; otherwise derive from the dir name.
150    let name = parsed
151        .get("package")
152        .and_then(|p| p.get("name"))
153        .and_then(|n| n.as_str())
154        .map(|s| s.to_string())
155        .or_else(|| {
156            parsed
157                .get("workspace")
158                .and_then(|w| w.get("package"))
159                .and_then(|p| p.get("name"))
160                .and_then(|n| n.as_str())
161                .map(|s| s.to_string())
162        })
163        .or_else(|| {
164            root.file_name()
165                .and_then(|n| n.to_str())
166                .map(|s| s.to_string())
167        })?;
168
169    out.push(BootstrapTriple {
170        subject: name.clone(),
171        predicate: "has_language".to_string(),
172        object: "Rust".to_string(),
173        provenance: "bootstrap:cargo.toml".to_string(),
174    });
175
176    if let Some(version) = parsed
177        .get("package")
178        .and_then(|p| p.get("version"))
179        .and_then(|v| v.as_str())
180    {
181        out.push(BootstrapTriple {
182            subject: name.clone(),
183            predicate: "has_version".to_string(),
184            object: version.to_string(),
185            provenance: "bootstrap:cargo.toml".to_string(),
186        });
187    }
188    if let Some(edition) = parsed
189        .get("package")
190        .and_then(|p| p.get("edition"))
191        .and_then(|v| v.as_str())
192    {
193        out.push(BootstrapTriple {
194            subject: name.clone(),
195            predicate: "has_edition".to_string(),
196            object: edition.to_string(),
197            provenance: "bootstrap:cargo.toml".to_string(),
198        });
199    }
200    if let Some(rv) = parsed
201        .get("package")
202        .and_then(|p| p.get("rust-version"))
203        .and_then(|v| v.as_str())
204    {
205        out.push(BootstrapTriple {
206            subject: name.clone(),
207            predicate: "has_rust_version".to_string(),
208            object: rv.to_string(),
209            provenance: "bootstrap:cargo.toml".to_string(),
210        });
211    }
212
213    // Workspace members (capped at 64 to avoid flooding the KG on huge
214    // monorepos; bootstrap is a coarse seeder, not an exhaustive index).
215    if let Some(members) = parsed
216        .get("workspace")
217        .and_then(|w| w.get("members"))
218        .and_then(|m| m.as_array())
219    {
220        for member in members.iter().take(64) {
221            if let Some(s) = member.as_str() {
222                out.push(BootstrapTriple {
223                    subject: name.clone(),
224                    predicate: "has_workspace_member".to_string(),
225                    object: s.to_string(),
226                    provenance: "bootstrap:cargo.toml".to_string(),
227                });
228            }
229        }
230    }
231
232    Some(name)
233}
234
235/// Scan `package.json`.
236///
237/// Why: Node/TypeScript projects are the second most common target. We want
238/// `has_language=JavaScript`, `has_version`, and `has_dependency` triples.
239/// What: Parses the JSON; emits language/version triples + one
240/// `has_dependency` per top-level key in the `dependencies` object (cap 64).
241/// Test: `scan_project_extracts_package_json`.
242fn scan_package_json(root: &Path, out: &mut Vec<BootstrapTriple>) -> Option<String> {
243    let manifest = root.join("package.json");
244    let raw = std::fs::read_to_string(&manifest).ok()?;
245    let parsed: serde_json::Value = match serde_json::from_str(&raw) {
246        Ok(v) => v,
247        Err(e) => {
248            tracing::debug!("bootstrap: parse package.json failed: {e:#}");
249            return None;
250        }
251    };
252    let name = parsed.get("name").and_then(|n| n.as_str())?.to_string();
253
254    out.push(BootstrapTriple {
255        subject: name.clone(),
256        predicate: "has_language".to_string(),
257        object: "JavaScript".to_string(),
258        provenance: "bootstrap:package.json".to_string(),
259    });
260
261    if let Some(version) = parsed.get("version").and_then(|v| v.as_str()) {
262        out.push(BootstrapTriple {
263            subject: name.clone(),
264            predicate: "has_version".to_string(),
265            object: version.to_string(),
266            provenance: "bootstrap:package.json".to_string(),
267        });
268    }
269
270    if let Some(deps) = parsed.get("dependencies").and_then(|d| d.as_object()) {
271        for (k, _) in deps.iter().take(64) {
272            out.push(BootstrapTriple {
273                subject: name.clone(),
274                predicate: "has_dependency".to_string(),
275                object: k.clone(),
276                provenance: "bootstrap:package.json".to_string(),
277            });
278        }
279    }
280
281    Some(name)
282}
283
284/// Scan `pyproject.toml`.
285///
286/// Why: Python projects use PEP-621 `[project]` metadata; surfacing the
287/// language tag + version + `requires-python` makes Python repos legible to
288/// the KG without manual assertions.
289/// What: Parses the TOML; emits language/version/requires-python triples
290/// when the `[project]` table is present.
291/// Test: `scan_project_extracts_pyproject`.
292fn scan_pyproject_toml(root: &Path, out: &mut Vec<BootstrapTriple>) -> Option<String> {
293    let manifest = root.join("pyproject.toml");
294    let raw = std::fs::read_to_string(&manifest).ok()?;
295    let parsed: toml::Value = match toml::from_str(&raw) {
296        Ok(v) => v,
297        Err(e) => {
298            tracing::debug!("bootstrap: parse pyproject.toml failed: {e:#}");
299            return None;
300        }
301    };
302    let project = parsed.get("project")?;
303    let name = project.get("name").and_then(|n| n.as_str())?.to_string();
304
305    out.push(BootstrapTriple {
306        subject: name.clone(),
307        predicate: "has_language".to_string(),
308        object: "Python".to_string(),
309        provenance: "bootstrap:pyproject.toml".to_string(),
310    });
311
312    if let Some(v) = project.get("version").and_then(|v| v.as_str()) {
313        out.push(BootstrapTriple {
314            subject: name.clone(),
315            predicate: "has_version".to_string(),
316            object: v.to_string(),
317            provenance: "bootstrap:pyproject.toml".to_string(),
318        });
319    }
320    if let Some(rp) = project.get("requires-python").and_then(|v| v.as_str()) {
321        out.push(BootstrapTriple {
322            subject: name.clone(),
323            predicate: "requires_python".to_string(),
324            object: rp.to_string(),
325            provenance: "bootstrap:pyproject.toml".to_string(),
326        });
327    }
328
329    Some(name)
330}
331
332/// Scan `go.mod` for the module name.
333///
334/// Why: Go projects encode their canonical name on the `module` line of
335/// `go.mod`; surfacing it as the project subject lets Go repos opt into the
336/// same KG shape as Rust/Node/Python.
337/// What: Reads `go.mod`, extracts the `module <name>` directive, and emits
338/// `(name, has_language, "Go")` plus `(name, has_module_path, <name>)`.
339/// Test: `scan_project_extracts_go_mod`.
340fn scan_go_mod(root: &Path, out: &mut Vec<BootstrapTriple>) -> Option<String> {
341    let raw = std::fs::read_to_string(root.join("go.mod")).ok()?;
342    let module = raw
343        .lines()
344        .find_map(|line| line.trim().strip_prefix("module "))
345        .map(|s| s.trim().to_string())?;
346    if module.is_empty() {
347        return None;
348    }
349    out.push(BootstrapTriple {
350        subject: module.clone(),
351        predicate: "has_language".to_string(),
352        object: "Go".to_string(),
353        provenance: "bootstrap:go.mod".to_string(),
354    });
355    out.push(BootstrapTriple {
356        subject: module.clone(),
357        predicate: "has_module_path".to_string(),
358        object: module.clone(),
359        provenance: "bootstrap:go.mod".to_string(),
360    });
361    Some(module)
362}
363
364/// Scan `CLAUDE.md` for the first H1 heading; attach as project description.
365///
366/// Why: Trusty-* projects use `CLAUDE.md` as the canonical orientation
367/// document; the first H1 line is invariably the project name/tagline and
368/// makes a good `has_description` triple.
369/// What: Walks lines, finds the first `# Title` heading, strips the prefix,
370/// and pushes a `has_description` triple under `subject` (when known).
371/// Test: `scan_project_extracts_claude_md_h1`.
372fn scan_claude_md(root: &Path, subject: Option<&str>, out: &mut Vec<BootstrapTriple>) {
373    let Some(subject) = subject else {
374        // No project subject yet — skip; we don't want orphan triples.
375        return;
376    };
377    let Ok(raw) = std::fs::read_to_string(root.join("CLAUDE.md")) else {
378        return;
379    };
380    if let Some(h1) = raw.lines().find_map(|line| {
381        let t = line.trim_start();
382        t.strip_prefix("# ")
383            .filter(|rest| !rest.is_empty())
384            .map(|s| s.trim().to_string())
385    }) {
386        out.push(BootstrapTriple {
387            subject: subject.to_string(),
388            predicate: "has_description".to_string(),
389            object: h1,
390            provenance: "bootstrap:claude.md".to_string(),
391        });
392    }
393}
394
395/// Scan the git origin URL for the project rooted at `root`.
396///
397/// Why: Tying a project to its source repo URL is the single highest-signal
398/// fact for downstream tooling (link to issues, find upstream, etc.). The
399/// canonical source is `[remote "origin"] url = …`, but its physical location
400/// depends on whether `root` is a normal checkout or a git worktree:
401///
402/// - Normal checkout: `.git/` is a directory; the config lives in
403///   `<root>/.git/config`.
404/// - Worktree: `.git` is a *file* containing `gitdir: <pointer>` to the
405///   parent repo's `.git/worktrees/<name>/` dir; the `[remote]` section lives
406///   in the parent's `.git/config`, not anywhere reachable by joining
407///   `<root>/.git/config`.
408///
409/// Issue #113: the previous implementation only handled the first case and
410/// silently dropped the `source_repo` triple in any worktree-based checkout.
411///
412/// What: First shells out to `git -C <root> config --get remote.origin.url`,
413/// which natively resolves the `.git`-file pointer for us. Falls back to a
414/// direct file scan of `<root>/.git/config` when `git` is unavailable on PATH
415/// (matters for the fixture-based unit tests, which fabricate a `.git/config`
416/// file in a tempdir without a real repo). Emits a
417/// `(subject, source_repo, url)` triple when a URL is found.
418/// Test: `scan_project_extracts_git_origin` (file fallback path),
419/// `tools::tests::kg_bootstrap_seeds_workspace_facts` (git-CLI path,
420/// exercised inside worktrees).
421fn scan_git_config(root: &Path, subject: Option<&str>, out: &mut Vec<BootstrapTriple>) {
422    let Some(subject) = subject else { return };
423    let Some(url) = read_origin_url(root) else {
424        return;
425    };
426    out.push(BootstrapTriple {
427        subject: subject.to_string(),
428        predicate: "source_repo".to_string(),
429        object: url,
430        provenance: "bootstrap:git.config".to_string(),
431    });
432}
433
/// Look up `remote.origin.url` for the repo rooted at `root`, regardless of
/// whether it is a worktree or a normal checkout.
///
/// Why: Keeps the worktree-vs-checkout indirection in one place so the
/// scanner that consumes the URL stays a few lines long.
/// What: (1) Runs `git -C <root> config --get remote.origin.url` and
/// returns its trimmed stdout when the command exits successfully with
/// non-empty output; git itself resolves a worktree's `.git`-file pointer.
/// (2) Otherwise reads `<root>/.git/config` and returns the first non-empty
/// `url = …` value found under a `[remote "origin"]` section header. This
/// second path serves environments without a `git` binary and fixture
/// directories that fabricate a `.git/config` without a real repo. Returns
/// `None` when neither strategy yields a non-empty URL.
/// Test: `read_origin_url_returns_none_for_non_git_dir`,
/// `scan_project_extracts_git_origin` (file fallback).
fn read_origin_url(root: &Path) -> Option<String> {
    // Strategy 1: ask git. A spawn failure, a non-zero exit status and
    // empty output all fall through to the file scan below.
    let from_git = std::process::Command::new("git")
        .arg("-C")
        .arg(root)
        .args(["config", "--get", "remote.origin.url"])
        .output()
        .ok()
        .filter(|output| output.status.success())
        .map(|output| String::from_utf8_lossy(&output.stdout).trim().to_string())
        .filter(|url| !url.is_empty());
    if from_git.is_some() {
        return from_git;
    }

    // Strategy 2: minimal INI scan of `<root>/.git/config`. In a worktree
    // `.git` is a file, so this read fails and the function returns `None`.
    let config = std::fs::read_to_string(root.join(".git").join("config")).ok()?;
    let mut in_origin = false;
    config.lines().map(str::trim).find_map(|line| {
        if line.starts_with('[') {
            // Every section header re-evaluates whether we are in origin.
            in_origin = line == "[remote \"origin\"]";
            return None;
        }
        if !in_origin {
            return None;
        }
        let value = line
            .strip_prefix("url")?
            .trim_start()
            .strip_prefix('=')?
            .trim();
        (!value.is_empty()).then(|| value.to_string())
    })
}
492
493#[cfg(test)]
494mod scan_tests;