Skip to main content

rustledger_plugin/native/plugins/
document_discovery.rs

1//! Auto-discover documents from directories.
2
3use serde::Deserialize;
4
5use crate::types::{
6    DirectiveData, DirectiveWrapper, DocumentData, PluginError, PluginInput, PluginOp, PluginOutput,
7};
8
9use super::super::{NativePlugin, SynthPlugin};
10
/// Maximum recursion depth for directory scanning to prevent denial-of-service from deeply nested structures.
///
/// Each configured root directory is scanned at depth 0 and every nested
/// directory adds one. `scan_documents` stops descending (and records a
/// warning instead of failing) once the depth is greater than this value.
const MAX_SCAN_DEPTH: usize = 32;
13
/// Plugin that auto-discovers document files from configured directories.
///
/// Scans directories specified in `option "documents"` for files matching
/// the pattern: `{Account}/YYYY-MM-DD.description.*`
///
/// For example: `documents/Assets/Bank/Checking/2024-01-15.statement.pdf`
/// generates: `2024-01-15 document Assets:Bank:Checking "documents/Assets/Bank/Checking/2024-01-15.statement.pdf"`
///
/// Only the leading `YYYY-MM-DD` part of the file name is checked (digits
/// and dashes in the right positions); the rest of the name is not
/// inspected. Files sitting directly in a configured directory, with no
/// account sub-directory, are ignored. Files whose canonical path matches
/// a `document` directive already present in the input are not emitted
/// again.
///
/// # Configuration
///
/// The plugin reads its per-load context (resolved document directories
/// and the ledger's base directory for relative-path normalization) from
/// [`PluginInput::config`] as a JSON object:
///
/// ```json
/// {"base_dir": "/path/to/ledger", "directories": ["/abs/path/docs"]}
/// ```
///
/// The loader constructs this config when populating the synth pass; if
/// `config` is `None` or `directories` is empty, the plugin returns a
/// no-op (every input directive is kept, nothing synthesized). If `config`
/// is present but malformed JSON, every input directive is still kept and
/// a `PluginError::error` is added to the output errors — the plugin
/// never silently drops directives on bad config. This lets the plugin
/// sit in the registry as a static instance and be dispatched through
/// the normal synth-pass machinery.
///
/// Configured directories that do not exist are skipped without an error.
///
/// # Security
///
/// - Symlinks are skipped to prevent infinite recursion from symlink cycles
/// - Maximum recursion depth is enforced to prevent denial-of-service from deeply nested directories
pub struct DocumentDiscoveryPlugin;
46
/// Name passed to file-declared / extra-plugin lookups and used by the
/// loader when emitting the synth-pass config entry. Kept as a constant
/// so the registry, the loader, and the rustdoc stay in sync.
///
/// This is also the value returned by `NativePlugin::name` for
/// [`DocumentDiscoveryPlugin`].
pub const DOCUMENT_DISCOVERY_NAME: &str = "document_discovery";
51
/// JSON config schema parsed from [`PluginInput::config`].
///
/// The matching JSON is produced by [`document_discovery_config`].
#[derive(Debug, Deserialize)]
struct DocumentDiscoveryConfig {
    /// Directory that relative paths of already-declared `document`
    /// directives are joined onto before they are canonicalized for
    /// duplicate detection.
    base_dir: std::path::PathBuf,
    /// Root directories to scan. Each entry is also the prefix stripped
    /// from a discovered file's path to derive its account name.
    directories: Vec<String>,
}
58
59/// Build the [`PluginInput::config`] JSON string for this plugin.
60///
61/// Centralized here so callers (the loader) don't need to know the
62/// schema — the plugin owns its own config shape.
63#[must_use]
64pub fn document_discovery_config(base_dir: &std::path::Path, directories: &[String]) -> String {
65    serde_json::json!({
66        "base_dir": base_dir,
67        "directories": directories,
68    })
69    .to_string()
70}
71
72impl NativePlugin for DocumentDiscoveryPlugin {
73    fn name(&self) -> &'static str {
74        DOCUMENT_DISCOVERY_NAME
75    }
76
77    fn description(&self) -> &'static str {
78        "Auto-discover documents from directories"
79    }
80
81    fn process(&self, input: PluginInput) -> PluginOutput {
82        use std::path::Path;
83
84        // No config → no-op pass-through. Lets the plugin sit in the
85        // registry unconditionally without doing work when the ledger
86        // hasn't declared `option "documents"`.
87        let Some(config_json) = input.config.as_deref() else {
88            return PluginOutput {
89                ops: (0..input.directives.len()).map(PluginOp::Keep).collect(),
90                errors: Vec::new(),
91            };
92        };
93
94        let config: DocumentDiscoveryConfig = match serde_json::from_str(config_json) {
95            Ok(c) => c,
96            Err(e) => {
97                return PluginOutput {
98                    ops: (0..input.directives.len()).map(PluginOp::Keep).collect(),
99                    errors: vec![PluginError::error(format!(
100                        "document_discovery: invalid config JSON: {e}"
101                    ))],
102                };
103            }
104        };
105
106        if config.directories.is_empty() {
107            return PluginOutput {
108                ops: (0..input.directives.len()).map(PluginOp::Keep).collect(),
109                errors: Vec::new(),
110            };
111        }
112
113        let mut new_directives = Vec::new();
114        let mut errors = Vec::new();
115
116        // Collect existing document paths to avoid duplicates.
117        // Normalize paths by resolving relative paths against base_dir, then canonicalizing.
118        let mut existing_docs: std::collections::HashSet<String> = std::collections::HashSet::new();
119        for wrapper in &input.directives {
120            if let DirectiveData::Document(doc) = &wrapper.data {
121                let doc_path = Path::new(&doc.path);
122                let resolved = if doc_path.is_absolute() {
123                    doc_path.to_path_buf()
124                } else {
125                    config.base_dir.join(doc_path)
126                };
127                let normalized = resolved
128                    .canonicalize()
129                    .map_or_else(|_| doc.path.clone(), |p| p.to_string_lossy().to_string());
130                existing_docs.insert(normalized);
131            }
132        }
133
134        // Scan each directory
135        for dir in &config.directories {
136            let dir_path = Path::new(dir);
137            if !dir_path.exists() {
138                continue;
139            }
140
141            if let Err(e) = scan_documents(
142                dir_path,
143                dir,
144                &existing_docs,
145                &mut new_directives,
146                &mut errors,
147                0, // Initial depth
148            ) {
149                errors.push(PluginError::error(format!(
150                    "Error scanning documents in {dir}: {e}"
151                )));
152            }
153        }
154
155        // Keep all input directives, then insert discovered documents.
156        let mut ops: Vec<PluginOp> = (0..input.directives.len()).map(PluginOp::Keep).collect();
157        for w in new_directives {
158            ops.push(PluginOp::Insert(w));
159        }
160
161        // Final ordering is the loader's responsibility — it re-sorts
162        // directives after the plugin pass.
163        PluginOutput { ops, errors }
164    }
165}
166
/// Synthesizes `Document` directives that downstream consumers expect
/// alongside user-written ones — runs in the synth pass so the early
/// validator sees them.
///
/// The impl body is empty: nothing from the trait is overridden here.
impl SynthPlugin for DocumentDiscoveryPlugin {}
171
172/// Recursively scan a directory for document files.
173///
174/// # Security
175/// - Uses `symlink_metadata` to detect and skip symlinks, preventing infinite loops
176/// - Enforces maximum recursion depth to prevent denial-of-service from deeply nested directories
177#[allow(clippy::only_used_in_recursion)]
178fn scan_documents(
179    path: &std::path::Path,
180    base_dir: &str,
181    existing: &std::collections::HashSet<String>,
182    directives: &mut Vec<DirectiveWrapper>,
183    errors: &mut Vec<PluginError>,
184    depth: usize,
185) -> std::io::Result<()> {
186    use std::fs;
187
188    // Enforce maximum recursion depth
189    if depth > MAX_SCAN_DEPTH {
190        errors.push(PluginError::warning(format!(
191            "Maximum directory depth ({MAX_SCAN_DEPTH}) exceeded at {}",
192            path.display()
193        )));
194        return Ok(());
195    }
196
197    for entry in fs::read_dir(path)? {
198        let entry = entry?;
199        let entry_path = entry.path();
200
201        // Use symlink_metadata to check file type WITHOUT following symlinks.
202        // This prevents infinite recursion from symlink cycles.
203        let metadata = match fs::symlink_metadata(&entry_path) {
204            Ok(m) => m,
205            Err(_) => continue, // Skip entries we can't stat
206        };
207
208        // Skip symlinks entirely to prevent security issues
209        if metadata.file_type().is_symlink() {
210            continue;
211        }
212
213        if metadata.is_dir() {
214            scan_documents(
215                &entry_path,
216                base_dir,
217                existing,
218                directives,
219                errors,
220                depth + 1,
221            )?;
222        } else if metadata.is_file() {
223            // Try to parse filename as YYYY-MM-DD.description.ext
224            if let Some(file_name) = entry_path.file_name().and_then(|n| n.to_str())
225                && file_name.len() >= 10
226                && file_name.chars().nth(4) == Some('-')
227                && file_name.chars().nth(7) == Some('-')
228            {
229                let date_str = &file_name[0..10];
230                // Validate date format
231                if date_str.chars().take(4).all(|c| c.is_ascii_digit())
232                    && date_str.chars().skip(5).take(2).all(|c| c.is_ascii_digit())
233                    && date_str.chars().skip(8).take(2).all(|c| c.is_ascii_digit())
234                {
235                    // Extract account from path relative to base_dir
236                    if let Ok(rel_path) = entry_path.strip_prefix(base_dir)
237                        && let Some(parent) = rel_path.parent()
238                    {
239                        let account = parent
240                            .components()
241                            .map(|c| c.as_os_str().to_string_lossy().to_string())
242                            .collect::<Vec<_>>()
243                            .join(":");
244
245                        if !account.is_empty() {
246                            let full_path = entry_path.to_string_lossy().to_string();
247
248                            // Canonicalize for consistent comparison with existing docs
249                            let canonical = entry_path.canonicalize().map_or_else(
250                                |_| full_path.clone(),
251                                |p| p.to_string_lossy().to_string(),
252                            );
253
254                            // Skip if already exists (compare canonical paths)
255                            if existing.contains(&canonical) {
256                                continue;
257                            }
258
259                            directives.push(DirectiveWrapper {
260                                directive_type: "document".to_string(),
261                                date: date_str.to_string(),
262                                filename: None, // Plugin-generated
263                                lineno: None,
264                                data: DirectiveData::Document(DocumentData {
265                                    account,
266                                    path: full_path,
267                                    tags: vec![],
268                                    links: vec![],
269                                    metadata: vec![],
270                                }),
271                            });
272                        }
273                    }
274                }
275            }
276        }
277    }
278
279    Ok(())
280}