Skip to main content

kovra_core/
scaffold.rs

//! `kovra scaffold` — repo scan → proposed `.env.refs` (spec §13, L12).
//!
//! Reads a repository's **source** for environment-variable *references*
//! (`os.getenv("X")`, `process.env.X`, `std::env::var("X")`, …) and proposes a
//! `.env.refs` mapping each one to a kovra coordinate. It is a code-reading
//! accelerator: the safe path (a secret contract) becomes the fast path.
//!
//! **It never reads, materializes, or writes a secret value.** It works purely
//! from source *references* — variable names, never values — so no value can
//! enter the agent's context. It deliberately skips `.env*` files (which hold
//! values) and reads only known source extensions.
//!
//! The generated coordinates use the `${ENV}` placeholder (one contract serves
//! all environments, substituted by `kovra run --env`) and follow the
//! three-segment grammar `<env>/<component>/<key>` (spec §1.2/§4.2). The output
//! is a **proposal** for a human to review — callers must not silently overwrite
//! an existing `.env.refs`.
18
19use std::collections::BTreeMap;
20use std::path::Path;
21use std::sync::LazyLock;
22
23use ignore::WalkBuilder;
24use regex::Regex;
25
26use crate::error::CoreError;
27
/// Source languages whose env-var access idioms scaffold knows how to detect.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Lang {
    Python,
    JavaScript,
    Rust,
}

impl Lang {
    /// Maps a file extension (no leading dot) to the [`Lang`] scanned for it.
    ///
    /// Matching is exact and case-sensitive. TypeScript and the module
    /// flavours of JavaScript all map to [`Lang::JavaScript`]. Any extension
    /// that is not listed yields `None`, meaning scaffold does not scan it.
    pub fn for_extension(ext: &str) -> Option<Lang> {
        // One row per language: the extensions that select it.
        const EXTENSIONS: &[(&[&str], Lang)] = &[
            (&["py", "pyi"], Lang::Python),
            (&["js", "jsx", "ts", "tsx", "mjs", "cjs"], Lang::JavaScript),
            (&["rs"], Lang::Rust),
        ];
        EXTENSIONS
            .iter()
            .find(|(exts, _)| exts.contains(&ext))
            .map(|&(_, lang)| lang)
    }
}
47
48// One capture group (group 1) per pattern: the env-var name. Names are the
49// conventional SHOUTING_SNAKE_CASE — lowercase/mixed names are not treated as
50// env vars (too noisy), matching how teams actually name them.
51static PY_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
52    vec![
53        Regex::new(r#"os\.getenv\(\s*["']([A-Z][A-Z0-9_]*)["']"#).unwrap(),
54        Regex::new(r#"os\.environ\.get\(\s*["']([A-Z][A-Z0-9_]*)["']"#).unwrap(),
55        Regex::new(r#"os\.environ\[\s*["']([A-Z][A-Z0-9_]*)["']\s*\]"#).unwrap(),
56    ]
57});
58static JS_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
59    vec![
60        Regex::new(r#"process\.env\.([A-Z][A-Z0-9_]*)"#).unwrap(),
61        Regex::new(r#"process\.env\[\s*["']([A-Z][A-Z0-9_]*)["']\s*\]"#).unwrap(),
62    ]
63});
64static RS_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
65    vec![Regex::new(r#"env::var(?:_os)?\(\s*["']([A-Z][A-Z0-9_]*)["']"#).unwrap()]
66});
67
/// Names of OS/process environment variables that are never application
/// secrets. Proposing them would only add noise, so detection drops them.
///
/// The list is deliberately short: a name that *might* hold a secret stays in
/// the proposal and is left for the human reviewer to prune.
const NEVER_SECRET: &[&str] = &[
    // Shell / session
    "PATH",
    "HOME",
    "PWD",
    "USER",
    "SHELL",
    "TERM",
    // Locale
    "LANG",
    "LC_ALL",
    // Host / temp location
    "TMPDIR",
    "HOSTNAME",
];
74
75fn patterns(lang: Lang) -> &'static [Regex] {
76    match lang {
77        Lang::Python => &PY_PATTERNS,
78        Lang::JavaScript => &JS_PATTERNS,
79        Lang::Rust => &RS_PATTERNS,
80    }
81}
82
83/// Env-var names referenced in `source` for `lang`, in **source order**
84/// (by first byte offset), deduped. Pure (no I/O) so it is exhaustively
85/// unit-tested per language. Matches across patterns are merged by position so
86/// the order reflects the code, not the pattern list.
87pub fn detect_in_source(source: &str, lang: Lang) -> Vec<String> {
88    let mut hits: Vec<(usize, String)> = Vec::new();
89    for re in patterns(lang) {
90        for caps in re.captures_iter(source) {
91            let m = caps.get(1).expect("pattern has capture group 1");
92            let name = m.as_str().to_string();
93            if !NEVER_SECRET.contains(&name.as_str()) {
94                hits.push((m.start(), name));
95            }
96        }
97    }
98    hits.sort_by_key(|(pos, _)| *pos);
99    let mut seen: Vec<String> = Vec::new();
100    for (_, name) in hits {
101        if !seen.contains(&name) {
102            seen.push(name);
103        }
104    }
105    seen
106}
107
/// One proposed `.env.refs` entry: an environment variable paired with the
/// kovra coordinate suggested for it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Proposal {
    /// Name of the environment variable exactly as it appears in source
    /// (e.g. `DATABASE_URL`).
    pub var: String,
    /// Suggested coordinate, shaped `secret:${ENV}/<component>/<key>`.
    pub coordinate: String,
}
116
/// Turns `raw` into a single kebab-case coordinate segment.
///
/// ASCII letters and digits are kept (letters lowercased). Every run of any
/// other character — punctuation, whitespace, non-ASCII — collapses into one
/// `-`, and no `-` is left at either end, so the result contains neither `/`
/// nor `${...}`. If nothing usable remains (empty or all-separator input),
/// `fallback` is returned as given.
fn slug(raw: &str, fallback: &str) -> String {
    let joined = raw
        .split(|ch: char| !ch.is_ascii_alphanumeric())
        // Adjacent or leading/trailing separators produce empty pieces;
        // dropping them is what collapses runs and trims the ends.
        .filter(|piece| !piece.is_empty())
        .map(str::to_ascii_lowercase)
        .collect::<Vec<String>>()
        .join("-");

    if joined.is_empty() {
        fallback.to_string()
    } else {
        joined
    }
}
139
140/// Infer the `component` segment from a file's path relative to the repo root:
141/// the top-level directory name (e.g. `backend/db.py` → `backend`). A file at
142/// the repo root has no directory, so it falls back to `app`.
143fn component_for(rel_path: &Path) -> String {
144    let top = rel_path.components().next().and_then(|c| {
145        // Only treat it as a component if there is a further path segment (i.e.
146        // it is a directory, not the file itself).
147        if rel_path.components().count() > 1 {
148            c.as_os_str().to_str()
149        } else {
150            None
151        }
152    });
153    match top {
154        Some(dir) => slug(dir, "app"),
155        None => "app".to_string(),
156    }
157}
158
159/// Build the coordinate for a var detected in `component`:
160/// `secret:${ENV}/<component>/<key>` with `key` the kebab-cased var name.
161pub fn coordinate_for(var: &str, component: &str) -> String {
162    format!("secret:${{ENV}}/{}/{}", component, slug(var, "value"))
163}
164
165/// Scan `root` and return the proposals, sorted by variable name and unique per
166/// variable (the lexicographically-first file path wins the component). Walks
167/// with `.gitignore` honored and `.env*` files skipped — only source files are
168/// read, and only for variable *names*.
169pub fn scan_repo(root: &Path) -> Result<Vec<Proposal>, CoreError> {
170    // var -> component, keyed so the first path (sorted) wins deterministically.
171    let mut found: BTreeMap<String, String> = BTreeMap::new();
172
173    let walker = WalkBuilder::new(root)
174        // Skip hidden trees (`.git`, `.venv`, …) and honor `.gitignore`/`.ignore`,
175        // so vendored/generated dirs (`node_modules`, `target`) aren't scanned as
176        // project source. `.env*` is hidden anyway, and also skipped by name below
177        // (belt-and-suspenders for the no-value rule).
178        .hidden(true)
179        .git_ignore(true)
180        .ignore(true)
181        // Honor `.gitignore` even when the scan root is not itself a git repo
182        // (a worktree, an exported tree); otherwise vendored dirs slip through.
183        .require_git(false)
184        .git_global(false)
185        .build();
186
187    // Collect (rel_path, lang) for deterministic ordering, then process sorted.
188    let mut files: Vec<(std::path::PathBuf, Lang)> = Vec::new();
189    for entry in walker.flatten() {
190        let path = entry.path();
191        if !path.is_file() {
192            continue;
193        }
194        let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
195        // Never read value-bearing env files — only source references.
196        if name == ".env" || name.starts_with(".env.") {
197            continue;
198        }
199        let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
200            continue;
201        };
202        let Some(lang) = Lang::for_extension(ext) else {
203            continue;
204        };
205        let rel = path.strip_prefix(root).unwrap_or(path).to_path_buf();
206        files.push((rel, lang));
207    }
208    files.sort_by(|a, b| a.0.cmp(&b.0));
209
210    for (rel, lang) in files {
211        let abs = root.join(&rel);
212        // Read as bytes → lossy UTF-8: a binary or non-UTF8 file just yields no
213        // matches; it never errors out the whole scan.
214        let Ok(bytes) = std::fs::read(&abs) else {
215            continue;
216        };
217        let source = String::from_utf8_lossy(&bytes);
218        let component = component_for(&rel);
219        for var in detect_in_source(&source, lang) {
220            found.entry(var).or_insert_with(|| component.clone());
221        }
222    }
223
224    Ok(found
225        .into_iter()
226        .map(|(var, component)| Proposal {
227            coordinate: coordinate_for(&var, &component),
228            var,
229        })
230        .collect())
231}
232
233/// Render proposals as a committable `.env.refs` body (addresses only, never
234/// values). An empty proposal set still yields the header so the output is a
235/// valid, self-explanatory file.
236pub fn render_env_refs(proposals: &[Proposal]) -> String {
237    let mut out = String::new();
238    out.push_str("# Proposed by `kovra scaffold` — REVIEW before use.\n");
239    out.push_str(
240        "# Holds only ADDRESSES, never values; safe to commit (replaces a plaintext .env).\n",
241    );
242    out.push_str("# `${ENV}` is substituted by `kovra run --env <e>`. Prune non-secret vars\n");
243    out.push_str("# (e.g. PORT, LOG_LEVEL) and adjust components/keys as needed.\n");
244    if proposals.is_empty() {
245        out.push_str("# (no environment-variable references detected)\n");
246        return out;
247    }
248    out.push('\n');
249    for p in proposals {
250        out.push_str(&p.var);
251        out.push('=');
252        out.push_str(&p.coordinate);
253        out.push('\n');
254    }
255    out
256}
257
/// Unit tests: per-language detection, coordinate/slug/component helpers,
/// rendering, and end-to-end repository scans over temporary directories.
#[cfg(test)]
mod tests {
    use super::*;

    /// All three Python access forms are detected in source order, and a
    /// lowercase name is ignored.
    #[test]
    fn detects_python_patterns() {
        let src = r#"
            db = os.getenv("DATABASE_URL")
            key = os.environ.get("STRIPE_KEY")
            tok = os.environ["API_TOKEN"]
            lower = os.getenv("not_a_secret")   # mixed-case: ignored
        "#;
        let found = detect_in_source(src, Lang::Python);
        assert_eq!(found, vec!["DATABASE_URL", "STRIPE_KEY", "API_TOKEN"]);
    }

    /// Dot and bracket access on `process.env` are both detected; `PORT` is
    /// kept because it is not in `NEVER_SECRET`.
    #[test]
    fn detects_js_ts_patterns() {
        let src = r#"
            const url = process.env.DATABASE_URL;
            const k = process.env["STRIPE_KEY"];
            const p = process.env.PORT;
        "#;
        let found = detect_in_source(src, Lang::JavaScript);
        assert_eq!(found, vec!["DATABASE_URL", "STRIPE_KEY", "PORT"]);
    }

    /// `env::var` is detected with or without the `std::` path; a
    /// `NEVER_SECRET` name read through `env::var_os` is dropped.
    #[test]
    fn detects_rust_patterns() {
        let src = r#"
            let u = std::env::var("DATABASE_URL").unwrap();
            let o = env::var_os("HOME");          // NEVER_SECRET: dropped
            let s = env::var("SECRET_KEY")?;
        "#;
        let found = detect_in_source(src, Lang::Rust);
        assert_eq!(found, vec!["DATABASE_URL", "SECRET_KEY"]);
    }

    /// The same name matched several times, by different patterns, is
    /// reported once.
    #[test]
    fn dedups_within_a_source() {
        let src = r#"os.getenv("X"); os.getenv("X"); os.environ["X"]"#;
        assert_eq!(detect_in_source(src, Lang::Python), vec!["X"]);
    }

    /// The generated coordinate has the `secret:${ENV}/<component>/<key>`
    /// shape and is accepted by the crate's `.env.refs` parser.
    #[test]
    fn coordinate_uses_three_segment_grammar_with_placeholder() {
        assert_eq!(
            coordinate_for("DATABASE_URL", "backend"),
            "secret:${ENV}/backend/database-url"
        );
        // The generated coordinate parses under the L4 grammar.
        let parsed = crate::EnvRefs::parse("X=secret:${ENV}/backend/database-url").unwrap();
        assert_eq!(parsed.vars.len(), 1);
    }

    /// `slug` lowercases, turns separators into `-`, and returns the fallback
    /// for all-separator input.
    #[test]
    fn slug_kebab_cases_and_falls_back() {
        assert_eq!(slug("DATABASE_URL", "x"), "database-url");
        assert_eq!(slug("___", "fallback"), "fallback");
        assert_eq!(slug("Mixed.Name", "x"), "mixed-name");
    }

    /// The component is the top-level directory, or `app` for a root-level
    /// file; deeper directories do not affect it.
    #[test]
    fn component_is_top_dir_or_app() {
        assert_eq!(component_for(Path::new("backend/db.py")), "backend");
        assert_eq!(component_for(Path::new("main.py")), "app");
        assert_eq!(component_for(Path::new("api/v1/handler.ts")), "api");
    }

    /// The rendered body is accepted by the crate's `.env.refs` parser and
    /// contains the expected `VAR=coordinate` line.
    #[test]
    fn render_is_valid_env_refs_and_round_trips() {
        let proposals = vec![
            Proposal {
                var: "DATABASE_URL".into(),
                coordinate: "secret:${ENV}/backend/database-url".into(),
            },
            Proposal {
                var: "STRIPE_KEY".into(),
                coordinate: "secret:${ENV}/backend/stripe-key".into(),
            },
        ];
        let body = render_env_refs(&proposals);
        // Every non-comment line parses under the shipped grammar.
        let parsed = crate::EnvRefs::parse(&body).unwrap();
        assert_eq!(parsed.vars.len(), 2);
        assert!(body.contains("DATABASE_URL=secret:${ENV}/backend/database-url"));
    }

    /// End-to-end scan of a temporary tree: source files in a subdirectory and
    /// at the root are picked up with the right components, and the value
    /// planted in `.env` never reaches the rendered output.
    #[test]
    fn scan_repo_walks_sources_and_skips_env_files() {
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        std::fs::create_dir_all(root.join("backend")).unwrap();
        std::fs::write(
            root.join("backend/app.py"),
            r#"db = os.getenv("DATABASE_URL")"#,
        )
        .unwrap();
        std::fs::write(root.join("web.ts"), r#"const k = process.env.STRIPE_KEY;"#).unwrap();
        // A value-bearing .env must NEVER be read (no value enters context).
        std::fs::write(root.join(".env"), "DATABASE_URL=super-secret-value\n").unwrap();

        let proposals = scan_repo(root).unwrap();
        let vars: Vec<&str> = proposals.iter().map(|p| p.var.as_str()).collect();
        assert_eq!(vars, vec!["DATABASE_URL", "STRIPE_KEY"]);
        // component inference: backend/ → backend, root file → app
        let by_var: std::collections::HashMap<_, _> = proposals
            .iter()
            .map(|p| (p.var.as_str(), p.coordinate.as_str()))
            .collect();
        assert_eq!(by_var["DATABASE_URL"], "secret:${ENV}/backend/database-url");
        assert_eq!(by_var["STRIPE_KEY"], "secret:${ENV}/app/stripe-key");
        // The rendered body never contains the planted .env value.
        let body = render_env_refs(&proposals);
        assert!(!body.contains("super-secret-value"));
    }

    /// Source under a hidden directory (`.venv`) or a gitignored directory
    /// (`node_modules`) contributes nothing; only the project's own file does.
    #[test]
    fn scan_repo_skips_hidden_and_vendored_dirs() {
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        // A hidden virtualenv with third-party source must be skipped.
        std::fs::create_dir_all(root.join(".venv/lib")).unwrap();
        std::fs::write(root.join(".venv/lib/dep.py"), r#"os.getenv("VENDOR_KEY")"#).unwrap();
        // A gitignored vendored dir must be skipped too.
        std::fs::write(root.join(".gitignore"), "node_modules/\n").unwrap();
        std::fs::create_dir_all(root.join("node_modules/pkg")).unwrap();
        std::fs::write(
            root.join("node_modules/pkg/i.ts"),
            r#"process.env.DEP_TOKEN"#,
        )
        .unwrap();
        // The project's own source IS scanned.
        std::fs::write(root.join("app.py"), r#"os.getenv("APP_KEY")"#).unwrap();

        let vars: Vec<String> = scan_repo(root)
            .unwrap()
            .into_iter()
            .map(|p| p.var)
            .collect();
        assert_eq!(
            vars,
            vec!["APP_KEY"],
            "hidden (.venv) and gitignored (node_modules) trees must be skipped"
        );
    }
}