Skip to main content

rustledger_plugin/native/plugins/
document_discovery.rs

1//! Auto-discover documents from directories.
2
3use crate::types::{
4    DirectiveData, DirectiveWrapper, DocumentData, PluginError, PluginInput, PluginOutput,
5    sort_directives,
6};
7
8use super::super::NativePlugin;
9
/// Upper bound on directory nesting while scanning for documents.
///
/// The scan root counts as depth 0; anything nested deeper than this value is
/// skipped with a warning, which guards against denial-of-service from
/// pathologically deep directory trees.
const MAX_SCAN_DEPTH: usize = 32;
12
/// Native plugin that turns dated files found on disk into `document` directives.
///
/// Each directory named by `option "documents"` is walked looking for files laid
/// out as `{Account}/YYYY-MM-DD.description.*`; the directory components below
/// the scanned root spell out the account name.
///
/// For example, `documents/Assets/Bank/Checking/2024-01-15.statement.pdf`
/// generates:
/// `2024-01-15 document Assets:Bank:Checking "documents/Assets/Bank/Checking/2024-01-15.statement.pdf"`
///
/// # Security
///
/// - Symlinks are skipped, so a symlink cycle cannot cause infinite recursion.
/// - A maximum recursion depth is enforced, so a deeply nested tree cannot be
///   used for denial-of-service.
pub struct DocumentDiscoveryPlugin {
    /// Root directories to scan for documents (resolved to absolute paths).
    /// The plugin uses these paths as given and does not resolve them itself.
    pub directories: Vec<String>,
    /// Directory against which relative paths of already-declared document
    /// directives are resolved before they are compared with discovered files.
    pub base_dir: std::path::PathBuf,
}

impl DocumentDiscoveryPlugin {
    /// Build a plugin that scans `directories`.
    ///
    /// `base_dir` anchors relative paths found in existing document directives
    /// so that duplicate detection compares like with like.
    pub const fn new(directories: Vec<String>, base_dir: std::path::PathBuf) -> Self {
        DocumentDiscoveryPlugin {
            base_dir,
            directories,
        }
    }
}
44
45impl NativePlugin for DocumentDiscoveryPlugin {
46    fn name(&self) -> &'static str {
47        "document_discovery"
48    }
49
50    fn description(&self) -> &'static str {
51        "Auto-discover documents from directories"
52    }
53
54    fn process(&self, input: PluginInput) -> PluginOutput {
55        use std::path::Path;
56
57        let mut new_directives = Vec::new();
58        let mut errors = Vec::new();
59
60        // Collect existing document paths to avoid duplicates.
61        // Normalize paths by resolving relative paths against base_dir, then canonicalizing.
62        let mut existing_docs: std::collections::HashSet<String> = std::collections::HashSet::new();
63        for wrapper in &input.directives {
64            if let DirectiveData::Document(doc) = &wrapper.data {
65                let doc_path = Path::new(&doc.path);
66                // Resolve relative paths against base_dir
67                let resolved = if doc_path.is_absolute() {
68                    doc_path.to_path_buf()
69                } else {
70                    self.base_dir.join(doc_path)
71                };
72                // Canonicalize for consistent path comparison
73                let normalized = resolved
74                    .canonicalize()
75                    .map_or_else(|_| doc.path.clone(), |p| p.to_string_lossy().to_string());
76                existing_docs.insert(normalized);
77            }
78        }
79
80        // Scan each directory
81        for dir in &self.directories {
82            let dir_path = Path::new(dir);
83            if !dir_path.exists() {
84                continue;
85            }
86
87            if let Err(e) = scan_documents(
88                dir_path,
89                dir,
90                &existing_docs,
91                &mut new_directives,
92                &mut errors,
93                0, // Initial depth
94            ) {
95                errors.push(PluginError::error(format!(
96                    "Error scanning documents in {dir}: {e}"
97                )));
98            }
99        }
100
101        // Add discovered documents to directives
102        let mut all_directives = input.directives;
103        all_directives.extend(new_directives);
104
105        // Sort using beancount's standard ordering
106        sort_directives(&mut all_directives);
107
108        PluginOutput {
109            directives: all_directives,
110            errors,
111        }
112    }
113}
114
115/// Recursively scan a directory for document files.
116///
117/// # Security
118/// - Uses `symlink_metadata` to detect and skip symlinks, preventing infinite loops
119/// - Enforces maximum recursion depth to prevent denial-of-service from deeply nested directories
120#[allow(clippy::only_used_in_recursion)]
121fn scan_documents(
122    path: &std::path::Path,
123    base_dir: &str,
124    existing: &std::collections::HashSet<String>,
125    directives: &mut Vec<DirectiveWrapper>,
126    errors: &mut Vec<PluginError>,
127    depth: usize,
128) -> std::io::Result<()> {
129    use std::fs;
130
131    // Enforce maximum recursion depth
132    if depth > MAX_SCAN_DEPTH {
133        errors.push(PluginError::warning(format!(
134            "Maximum directory depth ({MAX_SCAN_DEPTH}) exceeded at {}",
135            path.display()
136        )));
137        return Ok(());
138    }
139
140    for entry in fs::read_dir(path)? {
141        let entry = entry?;
142        let entry_path = entry.path();
143
144        // Use symlink_metadata to check file type WITHOUT following symlinks.
145        // This prevents infinite recursion from symlink cycles.
146        let metadata = match fs::symlink_metadata(&entry_path) {
147            Ok(m) => m,
148            Err(_) => continue, // Skip entries we can't stat
149        };
150
151        // Skip symlinks entirely to prevent security issues
152        if metadata.file_type().is_symlink() {
153            continue;
154        }
155
156        if metadata.is_dir() {
157            scan_documents(
158                &entry_path,
159                base_dir,
160                existing,
161                directives,
162                errors,
163                depth + 1,
164            )?;
165        } else if metadata.is_file() {
166            // Try to parse filename as YYYY-MM-DD.description.ext
167            if let Some(file_name) = entry_path.file_name().and_then(|n| n.to_str())
168                && file_name.len() >= 10
169                && file_name.chars().nth(4) == Some('-')
170                && file_name.chars().nth(7) == Some('-')
171            {
172                let date_str = &file_name[0..10];
173                // Validate date format
174                if date_str.chars().take(4).all(|c| c.is_ascii_digit())
175                    && date_str.chars().skip(5).take(2).all(|c| c.is_ascii_digit())
176                    && date_str.chars().skip(8).take(2).all(|c| c.is_ascii_digit())
177                {
178                    // Extract account from path relative to base_dir
179                    if let Ok(rel_path) = entry_path.strip_prefix(base_dir)
180                        && let Some(parent) = rel_path.parent()
181                    {
182                        let account = parent
183                            .components()
184                            .map(|c| c.as_os_str().to_string_lossy().to_string())
185                            .collect::<Vec<_>>()
186                            .join(":");
187
188                        if !account.is_empty() {
189                            let full_path = entry_path.to_string_lossy().to_string();
190
191                            // Canonicalize for consistent comparison with existing docs
192                            let canonical = entry_path.canonicalize().map_or_else(
193                                |_| full_path.clone(),
194                                |p| p.to_string_lossy().to_string(),
195                            );
196
197                            // Skip if already exists (compare canonical paths)
198                            if existing.contains(&canonical) {
199                                continue;
200                            }
201
202                            directives.push(DirectiveWrapper {
203                                directive_type: "document".to_string(),
204                                date: date_str.to_string(),
205                                filename: None, // Plugin-generated
206                                lineno: None,
207                                data: DirectiveData::Document(DocumentData {
208                                    account,
209                                    path: full_path,
210                                    metadata: vec![],
211                                }),
212                            });
213                        }
214                    }
215                }
216            }
217        }
218    }
219
220    Ok(())
221}