Skip to main content

plissken_core/
discover.rs

1//! Python module auto-discovery
2//!
3//! Walks the filesystem to find Python modules, converting file paths
4//! to dotted module names.
5
6use std::collections::HashMap;
7use std::path::{Path, PathBuf};
8use walkdir::WalkDir;
9
10use crate::config::ModuleSourceType;
11
12/// A discovered Python module
13#[derive(Debug, Clone)]
14pub struct DiscoveredModule {
15    /// Dotted module name (e.g., "mypackage.utils.helpers")
16    pub name: String,
17    /// Path to the Python file
18    pub path: PathBuf,
19    /// Detected module type (Python or PyO3)
20    pub module_type: ModuleSourceType,
21}
22
/// Directory names that are never descended into during discovery.
///
/// Entries are compared against a directory's exact file name.
const SKIP_DIRS: &[&str] = &[
    // Interpreter and tool caches
    "__pycache__",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    // Virtual environments and test-runner sandboxes
    ".venv",
    "venv",
    ".env",
    "env",
    ".tox",
    ".nox",
    // Foreign ecosystems and version control
    "node_modules",
    ".git",
    // Build and packaging output
    "build",
    "dist",
    "egg-info",
];
41
42/// Discover Python modules by walking the filesystem.
43///
44/// # Arguments
45/// * `source_dir` - The directory to search for Python files
46/// * `package_name` - The root package name for module path generation
47///
48/// # Returns
49/// A vector of discovered modules with their dotted names and paths.
50pub fn discover_python_modules(
51    source_dir: &Path,
52    package_name: &str,
53) -> Result<Vec<DiscoveredModule>, std::io::Error> {
54    let mut modules = Vec::new();
55
56    if !source_dir.exists() {
57        return Ok(modules);
58    }
59
60    for entry in WalkDir::new(source_dir)
61        .follow_links(true)
62        .into_iter()
63        .filter_entry(|e| !should_skip_entry(e))
64    {
65        let entry = entry?;
66        let path = entry.path();
67
68        // Only process .py files
69        if path.extension().map(|e| e == "py").unwrap_or(false)
70            && let Some(module) = path_to_module(path, source_dir, package_name)
71        {
72            modules.push(module);
73        }
74    }
75
76    // Sort modules by name for consistent ordering
77    modules.sort_by(|a, b| a.name.cmp(&b.name));
78
79    Ok(modules)
80}
81
82/// Check if an entry should be skipped during directory traversal.
83fn should_skip_entry(entry: &walkdir::DirEntry) -> bool {
84    let file_name = entry.file_name().to_string_lossy();
85
86    // Skip hidden files/directories (except the source dir itself)
87    if file_name.starts_with('.') && entry.depth() > 0 {
88        return true;
89    }
90
91    // Skip known non-module directories
92    if entry.file_type().is_dir() {
93        if SKIP_DIRS.iter().any(|&skip| file_name == skip) {
94            return true;
95        }
96        // Skip directories ending in .egg-info
97        if file_name.ends_with(".egg-info") {
98            return true;
99        }
100    }
101
102    false
103}
104
105/// Convert a file path to a Python module.
106fn path_to_module(
107    file_path: &Path,
108    source_dir: &Path,
109    package_name: &str,
110) -> Option<DiscoveredModule> {
111    // Get relative path from source directory
112    let relative = file_path.strip_prefix(source_dir).ok()?;
113
114    // Convert path to module name
115    let module_name = path_to_module_name(relative, package_name)?;
116
117    // Detect module type by scanning file content
118    let module_type = detect_module_type(file_path);
119
120    Some(DiscoveredModule {
121        name: module_name,
122        path: file_path.to_owned(),
123        module_type,
124    })
125}
126
/// Translate a path relative to the source directory into a dotted module name.
///
/// Returns `None` if the path has no normal components, if any component is
/// not valid UTF-8, or if the final component does not end in `.py`.
///
/// Examples (with `package_name = "mypackage"`):
/// - `__init__.py` → `mypackage`
/// - `utils.py` → `mypackage.utils`
/// - `sub/__init__.py` → `mypackage.sub`
/// - `mypackage/sub/helpers.py` → `mypackage.sub.helpers`
fn path_to_module_name(relative_path: &Path, package_name: &str) -> Option<String> {
    // Keep only the normal path components; a non-UTF-8 component aborts with `None`.
    let mut parts = relative_path
        .components()
        .filter_map(|component| match component {
            std::path::Component::Normal(name) => Some(name.to_str()),
            _ => None,
        })
        .collect::<Option<Vec<&str>>>()?;

    // The last component is the file name and must carry the `.py` extension.
    let file_name = parts.pop()?;
    let stem = file_name.strip_suffix(".py")?;

    // `__init__.py` stands for its containing package, so it adds no segment.
    if stem != "__init__" {
        parts.push(stem);
    }

    match parts.first() {
        // Root `__init__.py`: the package itself
        None => Some(package_name.to_string()),
        // The source dir already contains the package directory; don't duplicate it
        Some(&first) if first == package_name => Some(parts.join(".")),
        // Otherwise qualify the path with the package name
        Some(_) => Some(format!("{}.{}", package_name, parts.join("."))),
    }
}
177
178/// Detect if a Python file is a PyO3 stub module.
179///
180/// Looks for markers that indicate the module imports from a native extension:
181/// - Import from a module with underscore prefix (e.g., `from ._native import`)
182/// - Comment marker `# pyo3` or `# pyo3-stub`
183fn detect_module_type(file_path: &Path) -> ModuleSourceType {
184    // Read the first part of the file to check for markers
185    if let Ok(content) = std::fs::read_to_string(file_path) {
186        // Only check the first ~2KB for performance
187        let preview = if content.len() > 2048 {
188            &content[..2048]
189        } else {
190            &content
191        };
192
193        // Check for PyO3 markers
194        if preview.contains("# pyo3")
195            || preview.contains("#pyo3")
196            || preview.contains("# type: ignore[import]")
197        // Common in PyO3 stubs
198        {
199            return ModuleSourceType::Pyo3;
200        }
201
202        // Check for imports from native modules (underscore prefix convention)
203        for line in preview.lines() {
204            let line = line.trim();
205            if (line.starts_with("from ._") || line.starts_with("from _"))
206                && line.contains(" import ")
207            {
208                return ModuleSourceType::Pyo3;
209            }
210        }
211    }
212
213    ModuleSourceType::Python
214}
215
216/// Merge discovered modules with explicitly configured modules.
217///
218/// Explicit modules take precedence over discovered ones.
219pub fn merge_modules(
220    discovered: Vec<DiscoveredModule>,
221    explicit: &HashMap<String, ModuleSourceType>,
222) -> HashMap<String, ModuleSourceType> {
223    let mut result: HashMap<String, ModuleSourceType> = discovered
224        .into_iter()
225        .map(|m| (m.name, m.module_type))
226        .collect();
227
228    // Explicit modules override discovered ones
229    for (name, module_type) in explicit {
230        result.insert(name.clone(), module_type.clone());
231    }
232
233    result
234}
235
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Module name derived for `relative` under the package `mypackage`.
    fn name_for(relative: &str) -> Option<String> {
        path_to_module_name(Path::new(relative), "mypackage")
    }

    /// Write `contents` to a fresh `module.py` in a temporary directory and
    /// return the type detected for it.
    fn detect_with_contents(contents: &str) -> ModuleSourceType {
        let temp_dir = TempDir::new().unwrap();
        let file = temp_dir.path().join("module.py");
        std::fs::write(&file, contents).unwrap();
        detect_module_type(&file)
    }

    /// Build a discovered module with the given name, path and type.
    fn discovered(name: &str, path: &str, module_type: ModuleSourceType) -> DiscoveredModule {
        DiscoveredModule {
            name: name.to_string(),
            path: PathBuf::from(path),
            module_type,
        }
    }

    #[test]
    fn test_path_to_module_name_simple() {
        assert_eq!(name_for("utils.py"), Some("mypackage.utils".to_string()));
    }

    #[test]
    fn test_path_to_module_name_nested() {
        assert_eq!(
            name_for("sub/helpers.py"),
            Some("mypackage.sub.helpers".to_string())
        );
    }

    #[test]
    fn test_path_to_module_name_init() {
        assert_eq!(name_for("__init__.py"), Some("mypackage".to_string()));
    }

    #[test]
    fn test_path_to_module_name_subpackage_init() {
        assert_eq!(
            name_for("sub/__init__.py"),
            Some("mypackage.sub".to_string())
        );
    }

    #[test]
    fn test_path_to_module_name_with_package_in_path() {
        assert_eq!(
            name_for("mypackage/utils.py"),
            Some("mypackage.utils".to_string())
        );
    }

    #[test]
    fn test_discover_python_modules() {
        let temp_dir = TempDir::new().unwrap();
        let pkg_dir = temp_dir.path().join("mypackage");
        let sub_dir = pkg_dir.join("sub");
        let pycache = pkg_dir.join("__pycache__");

        // Package with two modules
        std::fs::create_dir(&pkg_dir).unwrap();
        std::fs::write(pkg_dir.join("__init__.py"), "").unwrap();
        std::fs::write(pkg_dir.join("utils.py"), "def helper(): pass").unwrap();
        std::fs::write(pkg_dir.join("core.py"), "class Engine: pass").unwrap();

        // Sub-package with one module
        std::fs::create_dir(&sub_dir).unwrap();
        std::fs::write(sub_dir.join("__init__.py"), "").unwrap();
        std::fs::write(sub_dir.join("helpers.py"), "").unwrap();

        // Bytecode cache, which discovery must ignore
        std::fs::create_dir(&pycache).unwrap();
        std::fs::write(pycache.join("utils.cpython-311.pyc"), "").unwrap();

        let modules = discover_python_modules(&pkg_dir, "mypackage").unwrap();
        let names: Vec<&str> = modules.iter().map(|m| m.name.as_str()).collect();

        assert!(names.contains(&"mypackage"));
        assert!(names.contains(&"mypackage.utils"));
        assert!(names.contains(&"mypackage.core"));
        assert!(names.contains(&"mypackage.sub"));
        assert!(names.contains(&"mypackage.sub.helpers"));
        // Nothing from __pycache__ may leak into the result
        assert!(!names.iter().any(|n| n.contains("pycache")));
    }

    #[test]
    fn test_detect_module_type_python() {
        assert!(matches!(
            detect_with_contents("def foo(): pass\n"),
            ModuleSourceType::Python
        ));
    }

    #[test]
    fn test_detect_module_type_pyo3_marker() {
        assert!(matches!(
            detect_with_contents("# pyo3\nfrom ._native import Foo\n"),
            ModuleSourceType::Pyo3
        ));
    }

    #[test]
    fn test_detect_module_type_native_import() {
        assert!(matches!(
            detect_with_contents("from ._impl import SomeClass\n"),
            ModuleSourceType::Pyo3
        ));
    }

    #[test]
    fn test_merge_modules() {
        let found = vec![
            discovered("pkg.a", "a.py", ModuleSourceType::Python),
            discovered("pkg.b", "b.py", ModuleSourceType::Python),
        ];

        let mut explicit = HashMap::new();
        explicit.insert("pkg.b".to_string(), ModuleSourceType::Pyo3); // overrides discovery
        explicit.insert("pkg.c".to_string(), ModuleSourceType::Python); // only configured

        let merged = merge_modules(found, &explicit);

        assert_eq!(merged.len(), 3);
        // Untouched discovered module
        assert!(matches!(
            merged.get("pkg.a"),
            Some(ModuleSourceType::Python)
        ));
        // Overridden by the explicit configuration
        assert!(matches!(merged.get("pkg.b"), Some(ModuleSourceType::Pyo3)));
        // Added by the explicit configuration
        assert!(matches!(
            merged.get("pkg.c"),
            Some(ModuleSourceType::Python)
        ));
    }
}