Skip to main content

rustledger_plugin/native/plugins/
document_discovery.rs

1//! Auto-discover documents from directories.
2
3use crate::types::{
4    DirectiveData, DirectiveWrapper, DocumentData, PluginError, PluginInput, PluginOp, PluginOutput,
5};
6
7use super::super::NativePlugin;
8
/// Upper bound on how deep directory scanning may recurse; guards against
/// denial-of-service caused by deeply nested directory structures.
const MAX_SCAN_DEPTH: usize = 32;
11
/// Native plugin that turns document files found on disk into `document`
/// directives.
///
/// Each directory named in `option "documents"` is scanned for files laid
/// out as `{Account}/YYYY-MM-DD.description.*`.
///
/// A file stored at `documents/Assets/Bank/Checking/2024-01-15.statement.pdf`
/// produces the directive
/// `2024-01-15 document Assets:Bank:Checking "documents/Assets/Bank/Checking/2024-01-15.statement.pdf"`.
///
/// # Security
///
/// - Symlinks are never followed, so a symlink cycle cannot cause infinite recursion.
/// - Recursion is capped at a maximum depth, so deeply nested directories
///   cannot be used for denial-of-service.
pub struct DocumentDiscoveryPlugin {
    /// Root directories searched for documents (resolved to absolute paths).
    pub directories: Vec<String>,
    /// Directory against which relative paths of already-declared document
    /// directives are resolved.
    pub base_dir: std::path::PathBuf,
}

impl DocumentDiscoveryPlugin {
    /// Build a plugin that scans `directories`.
    ///
    /// `base_dir` is only used for duplicate detection: relative paths in
    /// existing document directives are resolved against it.
    pub const fn new(directories: Vec<String>, base_dir: std::path::PathBuf) -> Self {
        Self { directories, base_dir }
    }
}
43
44impl NativePlugin for DocumentDiscoveryPlugin {
45    fn name(&self) -> &'static str {
46        "document_discovery"
47    }
48
49    fn description(&self) -> &'static str {
50        "Auto-discover documents from directories"
51    }
52
53    fn process(&self, input: PluginInput) -> PluginOutput {
54        use std::path::Path;
55
56        let mut new_directives = Vec::new();
57        let mut errors = Vec::new();
58
59        // Collect existing document paths to avoid duplicates.
60        // Normalize paths by resolving relative paths against base_dir, then canonicalizing.
61        let mut existing_docs: std::collections::HashSet<String> = std::collections::HashSet::new();
62        for wrapper in &input.directives {
63            if let DirectiveData::Document(doc) = &wrapper.data {
64                let doc_path = Path::new(&doc.path);
65                // Resolve relative paths against base_dir
66                let resolved = if doc_path.is_absolute() {
67                    doc_path.to_path_buf()
68                } else {
69                    self.base_dir.join(doc_path)
70                };
71                // Canonicalize for consistent path comparison
72                let normalized = resolved
73                    .canonicalize()
74                    .map_or_else(|_| doc.path.clone(), |p| p.to_string_lossy().to_string());
75                existing_docs.insert(normalized);
76            }
77        }
78
79        // Scan each directory
80        for dir in &self.directories {
81            let dir_path = Path::new(dir);
82            if !dir_path.exists() {
83                continue;
84            }
85
86            if let Err(e) = scan_documents(
87                dir_path,
88                dir,
89                &existing_docs,
90                &mut new_directives,
91                &mut errors,
92                0, // Initial depth
93            ) {
94                errors.push(PluginError::error(format!(
95                    "Error scanning documents in {dir}: {e}"
96                )));
97            }
98        }
99
100        // Keep all input directives, then insert discovered documents.
101        let mut ops: Vec<PluginOp> = (0..input.directives.len()).map(PluginOp::Keep).collect();
102        for w in new_directives {
103            ops.push(PluginOp::Insert(w));
104        }
105
106        // Final ordering is the loader's responsibility — it re-sorts
107        // directives after the plugin pass.
108        PluginOutput { ops, errors }
109    }
110}
111
112/// Recursively scan a directory for document files.
113///
114/// # Security
115/// - Uses `symlink_metadata` to detect and skip symlinks, preventing infinite loops
116/// - Enforces maximum recursion depth to prevent denial-of-service from deeply nested directories
117#[allow(clippy::only_used_in_recursion)]
118fn scan_documents(
119    path: &std::path::Path,
120    base_dir: &str,
121    existing: &std::collections::HashSet<String>,
122    directives: &mut Vec<DirectiveWrapper>,
123    errors: &mut Vec<PluginError>,
124    depth: usize,
125) -> std::io::Result<()> {
126    use std::fs;
127
128    // Enforce maximum recursion depth
129    if depth > MAX_SCAN_DEPTH {
130        errors.push(PluginError::warning(format!(
131            "Maximum directory depth ({MAX_SCAN_DEPTH}) exceeded at {}",
132            path.display()
133        )));
134        return Ok(());
135    }
136
137    for entry in fs::read_dir(path)? {
138        let entry = entry?;
139        let entry_path = entry.path();
140
141        // Use symlink_metadata to check file type WITHOUT following symlinks.
142        // This prevents infinite recursion from symlink cycles.
143        let metadata = match fs::symlink_metadata(&entry_path) {
144            Ok(m) => m,
145            Err(_) => continue, // Skip entries we can't stat
146        };
147
148        // Skip symlinks entirely to prevent security issues
149        if metadata.file_type().is_symlink() {
150            continue;
151        }
152
153        if metadata.is_dir() {
154            scan_documents(
155                &entry_path,
156                base_dir,
157                existing,
158                directives,
159                errors,
160                depth + 1,
161            )?;
162        } else if metadata.is_file() {
163            // Try to parse filename as YYYY-MM-DD.description.ext
164            if let Some(file_name) = entry_path.file_name().and_then(|n| n.to_str())
165                && file_name.len() >= 10
166                && file_name.chars().nth(4) == Some('-')
167                && file_name.chars().nth(7) == Some('-')
168            {
169                let date_str = &file_name[0..10];
170                // Validate date format
171                if date_str.chars().take(4).all(|c| c.is_ascii_digit())
172                    && date_str.chars().skip(5).take(2).all(|c| c.is_ascii_digit())
173                    && date_str.chars().skip(8).take(2).all(|c| c.is_ascii_digit())
174                {
175                    // Extract account from path relative to base_dir
176                    if let Ok(rel_path) = entry_path.strip_prefix(base_dir)
177                        && let Some(parent) = rel_path.parent()
178                    {
179                        let account = parent
180                            .components()
181                            .map(|c| c.as_os_str().to_string_lossy().to_string())
182                            .collect::<Vec<_>>()
183                            .join(":");
184
185                        if !account.is_empty() {
186                            let full_path = entry_path.to_string_lossy().to_string();
187
188                            // Canonicalize for consistent comparison with existing docs
189                            let canonical = entry_path.canonicalize().map_or_else(
190                                |_| full_path.clone(),
191                                |p| p.to_string_lossy().to_string(),
192                            );
193
194                            // Skip if already exists (compare canonical paths)
195                            if existing.contains(&canonical) {
196                                continue;
197                            }
198
199                            directives.push(DirectiveWrapper {
200                                directive_type: "document".to_string(),
201                                date: date_str.to_string(),
202                                filename: None, // Plugin-generated
203                                lineno: None,
204                                data: DirectiveData::Document(DocumentData {
205                                    account,
206                                    path: full_path,
207                                    metadata: vec![],
208                                }),
209                            });
210                        }
211                    }
212                }
213            }
214        }
215    }
216
217    Ok(())
218}