Skip to main content

llm_wiki/
ingest.rs

1use std::collections::HashSet;
2use std::path::Path;
3use std::path::PathBuf;
4
5use anyhow::{Result, bail};
6use serde::{Deserialize, Serialize};
7use walkdir::WalkDir;
8
9use crate::config::{RedactConfig, ValidationConfig};
10use crate::frontmatter;
11use crate::git;
12use crate::ops::redact::{RedactionMatch, RedactionReport, redact_body};
13use crate::type_registry::SpaceTypeRegistry;
14
/// Normalize line endings: CRLF → LF, lone CR → LF.
///
/// Existing LF characters are left untouched, so the result contains no `\r`
/// at all. The conversion is done in a single pass over `input`.
pub fn normalize_line_endings(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();
    while let Some(c) = chars.next() {
        if c == '\r' {
            // A CR directly followed by LF is one line break: swallow the LF
            // so the pair collapses to a single '\n'.
            chars.next_if_eq(&'\n');
            out.push('\n');
        } else {
            out.push(c);
        }
    }
    out
}
19
20/// Options controlling an ingest run.
21#[derive(Debug, Clone, Default)]
22pub struct IngestOptions {
23    /// Validate only — do not write to disk or commit.
24    pub dry_run: bool,
25    /// Automatically commit validated files to git.
26    pub auto_commit: bool,
27    /// When `Some`, only files in this set are validated; others increment `unchanged_count`.
28    /// When `None`, all files are validated.
29    pub changed_paths: Option<HashSet<PathBuf>>,
30    /// When `Some`, run redaction pass on each file body before validation.
31    pub redact: Option<RedactConfig>,
32}
33
34/// Result of an ingest operation.
35#[derive(Debug, Clone, Serialize, Deserialize, Default)]
36pub struct IngestReport {
37    /// Number of Markdown pages that passed validation.
38    pub pages_validated: usize,
39    /// Number of non-Markdown asset files discovered.
40    pub assets_found: usize,
41    /// Validation warning messages (non-fatal).
42    pub warnings: Vec<String>,
43    /// Git commit hash produced after ingest, or empty string if no commit was made.
44    pub commit: String,
45    /// Number of files skipped because they were not in `changed_paths`.
46    #[serde(default)]
47    pub unchanged_count: usize,
48    /// Redaction reports for any files that had secrets removed.
49    #[serde(default)]
50    pub redacted: Vec<RedactionReport>,
51}
52
53/// Walk `path` (file or directory), validate, optionally redact, commit, and return a report.
54pub fn ingest(
55    path: &Path,
56    options: &IngestOptions,
57    wiki_root: &Path,
58    registry: &SpaceTypeRegistry,
59    validation: &ValidationConfig,
60) -> Result<IngestReport> {
61    let repo_root = wiki_root
62        .parent()
63        .ok_or_else(|| anyhow::anyhow!("wiki_root has no parent"))?;
64
65    let full_path = if path.is_absolute() {
66        path.to_path_buf()
67    } else {
68        wiki_root.join(path)
69    };
70
71    if !full_path.exists() {
72        bail!("path does not exist: {}", full_path.display());
73    }
74
75    // Reject path traversal
76    let canonical = full_path.canonicalize()?;
77    let canonical_root = wiki_root.canonicalize()?;
78    if !canonical.starts_with(&canonical_root) {
79        bail!("path is outside wiki root");
80    }
81
82    let mut report = IngestReport::default();
83
84    if full_path.is_file() {
85        let skip = should_skip(&full_path, wiki_root, &options.changed_paths);
86        if skip {
87            report.unchanged_count += 1;
88        } else {
89            validate_file(
90                &full_path,
91                wiki_root,
92                registry,
93                validation,
94                options.redact.as_ref(),
95                &mut report,
96            )?;
97        }
98    } else {
99        for entry in WalkDir::new(&full_path).into_iter().filter_map(|e| e.ok()) {
100            let p = entry.path();
101            if p.is_file() {
102                if p.extension().and_then(|e| e.to_str()) == Some("md") {
103                    if should_skip(p, wiki_root, &options.changed_paths) {
104                        report.unchanged_count += 1;
105                    } else {
106                        validate_file(
107                            p,
108                            wiki_root,
109                            registry,
110                            validation,
111                            options.redact.as_ref(),
112                            &mut report,
113                        )?;
114                    }
115                } else {
116                    report.assets_found += 1;
117                }
118            }
119        }
120    }
121
122    if !options.dry_run && options.auto_commit {
123        let msg = format!(
124            "ingest: {} — +{} pages, +{} assets",
125            path.display(),
126            report.pages_validated,
127            report.assets_found
128        );
129        let hash = git::commit(repo_root, &msg)?;
130        report.commit = hash;
131    }
132
133    Ok(report)
134}
135
/// Decide whether `abs_path` should be skipped because it is not listed in `changed`.
///
/// Returns `false` (never skip) when `changed` is `None` or an empty set.
/// Otherwise the path is made relative to `wiki_root` — falling back to
/// `abs_path` unchanged when it does not live under `wiki_root` — and the file
/// is skipped unless that path is a member of the set.
fn should_skip(abs_path: &Path, wiki_root: &Path, changed: &Option<HashSet<PathBuf>>) -> bool {
    match changed {
        Some(set) if !set.is_empty() => {
            let rel = abs_path.strip_prefix(wiki_root).unwrap_or(abs_path);
            !set.contains(rel)
        }
        // No filter, or an empty one: every file is processed.
        _ => false,
    }
}
144
/// Derive a page slug from a file path.
///
/// The slug is the path relative to `wiki_root` with its final extension
/// removed, e.g. `<wiki_root>/concepts/foo.md` → `concepts/foo`. If `abs_path`
/// is not under `wiki_root`, the whole path (minus extension) is used.
/// Non-UTF-8 path bytes are replaced lossily.
fn slug_from_path(abs_path: &Path, wiki_root: &Path) -> String {
    let relative = abs_path.strip_prefix(wiki_root).unwrap_or(abs_path);
    relative.with_extension("").to_string_lossy().into_owned()
}
153
154fn validate_file(
155    path: &Path,
156    wiki_root: &Path,
157    registry: &SpaceTypeRegistry,
158    validation: &ValidationConfig,
159    redact_cfg: Option<&RedactConfig>,
160    report: &mut IngestReport,
161) -> Result<()> {
162    let raw = std::fs::read_to_string(path)?;
163    let mut content = normalize_line_endings(&raw);
164
165    // Redaction pass — body only, before validation
166    if let Some(cfg) = redact_cfg {
167        let parsed = frontmatter::parse(&content);
168        let separator = "---";
169        // Find where body starts (after the closing frontmatter delimiter)
170        let body_start = if content.starts_with(separator) {
171            // skip first "---", find closing "---"
172            let after_open = &content[3..];
173            after_open
174                .find("\n---")
175                .map(|pos| 3 + pos + 4 + 1)
176                .unwrap_or(0)
177        } else {
178            0
179        };
180
181        if body_start > 0 && body_start <= content.len() {
182            let front = &content[..body_start];
183            let body = &content[body_start..];
184            let (redacted_body, matches) = redact_body(body, cfg);
185            if !matches.is_empty() {
186                let slug = slug_from_path(path, wiki_root);
187                // Adjust line numbers by frontmatter line count
188                let fm_lines = front.lines().count();
189                let adjusted: Vec<RedactionMatch> = matches
190                    .into_iter()
191                    .map(|m| RedactionMatch {
192                        pattern_name: m.pattern_name,
193                        line_number: m.line_number + fm_lines,
194                    })
195                    .collect();
196                report.redacted.push(RedactionReport {
197                    slug,
198                    matches: adjusted,
199                });
200                std::fs::write(path, format!("{front}{redacted_body}"))?;
201                content = normalize_line_endings(&std::fs::read_to_string(path)?);
202            }
203        } else {
204            // No frontmatter — redact the whole file
205            let (redacted, matches) = redact_body(&content, cfg);
206            if !matches.is_empty() {
207                let slug = slug_from_path(path, wiki_root);
208                report.redacted.push(RedactionReport { slug, matches });
209                std::fs::write(path, &redacted)?;
210                content = normalize_line_endings(&redacted);
211            }
212        }
213        let _ = parsed; // parsed only used to determine frontmatter presence above
214    }
215
216    let page = frontmatter::parse(&content);
217
218    // No frontmatter — warn but count as validated
219    if page.frontmatter.is_empty() {
220        report
221            .warnings
222            .push(format!("{}: no frontmatter found", path.display()));
223        report.pages_validated += 1;
224        return Ok(());
225    }
226
227    // Validate base fields via type registry
228    let warnings = registry.validate(&page.frontmatter, &validation.type_strictness)?;
229    for w in warnings {
230        report.warnings.push(format!("{}: {}", path.display(), w));
231    }
232
233    report.pages_validated += 1;
234    Ok(())
235}