Skip to main content

socket_patch_core/crawlers/
python_crawler.rs

1use std::collections::{HashMap, HashSet};
2use std::path::{Path, PathBuf};
3use std::process::{Command, Stdio};
4
5use super::types::{CrawledPackage, CrawlerOptions};
6
7// ---------------------------------------------------------------------------
8// Python command discovery
9// ---------------------------------------------------------------------------
10
/// Find a working Python command on the system.
///
/// Tries `python3`, `python`, and `py` (Windows launcher) in that order and
/// returns the first candidate that can be spawned **and** exits successfully
/// when invoked with `--version`.
///
/// A command that merely spawns is not enough: a launcher stub that exits
/// with a non-zero status would otherwise be reported as a usable
/// interpreter and every later invocation through it would fail.
///
/// Returns `None` when no candidate qualifies.
pub fn find_python_command() -> Option<&'static str> {
    const CANDIDATES: [&str; 3] = ["python3", "python", "py"];

    CANDIDATES.into_iter().find(|cmd| {
        Command::new(cmd)
            .arg("--version")
            // Detach all standard streams so the probe is silent.
            .stdin(Stdio::null())
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .status()
            // Spawn failure and non-zero exit both disqualify the candidate.
            .map(|status| status.success())
            .unwrap_or(false)
    })
}
26
/// Default batch size for crawling.
///
/// Not referenced anywhere in this file; the leading underscore keeps the
/// compiler from warning about the unused constant.
const _DEFAULT_BATCH_SIZE: usize = 100;
29
30// ---------------------------------------------------------------------------
31// PEP 503 name canonicalization
32// ---------------------------------------------------------------------------
33
/// Normalize a Python distribution name following PEP 503.
///
/// The input is trimmed and lowercased, and every run of the separator
/// characters `-`, `_` and `.` is collapsed into one `-`.
pub fn canonicalize_pypi_name(name: &str) -> String {
    let lowered = name.trim().to_lowercase();

    let canonical = lowered
        .chars()
        .fold(String::with_capacity(lowered.len()), |mut out, ch| {
            if matches!(ch, '-' | '_' | '.') {
                // `out` can only end in '-' when the previous character was a
                // separator, so this emits exactly one '-' per run.
                if !out.ends_with('-') {
                    out.push('-');
                }
            } else {
                out.push(ch);
            }
            out
        });

    canonical
}
57
58// ---------------------------------------------------------------------------
59// Helpers: read Python metadata from dist-info
60// ---------------------------------------------------------------------------
61
62/// Read `Name` and `Version` from a `.dist-info/METADATA` file.
63pub async fn read_python_metadata(dist_info_path: &Path) -> Option<(String, String)> {
64    let metadata_path = dist_info_path.join("METADATA");
65    let content = tokio::fs::read_to_string(&metadata_path).await.ok()?;
66
67    let mut name: Option<String> = None;
68    let mut version: Option<String> = None;
69
70    for line in content.lines() {
71        if name.is_some() && version.is_some() {
72            break;
73        }
74        if let Some(rest) = line.strip_prefix("Name:") {
75            name = Some(rest.trim().to_string());
76        } else if let Some(rest) = line.strip_prefix("Version:") {
77            version = Some(rest.trim().to_string());
78        }
79        // Stop at first empty line (end of headers)
80        if line.trim().is_empty() && (name.is_some() || version.is_some()) {
81            break;
82        }
83    }
84
85    match (name, version) {
86        (Some(n), Some(v)) if !n.is_empty() && !v.is_empty() => Some((n, v)),
87        _ => None,
88    }
89}
90
91// ---------------------------------------------------------------------------
92// Helpers: find Python directories with wildcard matching
93// ---------------------------------------------------------------------------
94
95/// Find directories matching a path pattern with wildcard segments.
96///
97/// Supported wildcards:
98/// - `"python3.*"` — matches directory entries starting with `python3.`
99/// - `"*"` — matches any directory entry
100///
101/// All other segments are treated as literal path components.
102pub async fn find_python_dirs(base_path: &Path, segments: &[&str]) -> Vec<PathBuf> {
103    let mut results = Vec::new();
104
105    // Check that base_path is a directory
106    match tokio::fs::metadata(base_path).await {
107        Ok(m) if m.is_dir() => {}
108        _ => return results,
109    }
110
111    if segments.is_empty() {
112        results.push(base_path.to_path_buf());
113        return results;
114    }
115
116    let first = segments[0];
117    let rest = &segments[1..];
118
119    if first == "python3.*" {
120        // Wildcard: list directory and match python3.X entries
121        if let Ok(mut entries) = tokio::fs::read_dir(base_path).await {
122            while let Ok(Some(entry)) = entries.next_entry().await {
123                let ft = match entry.file_type().await {
124                    Ok(ft) => ft,
125                    Err(_) => continue,
126                };
127                if !ft.is_dir() {
128                    continue;
129                }
130                let name = entry.file_name();
131                let name_str = name.to_string_lossy();
132                if name_str.starts_with("python3.") {
133                    let sub = Box::pin(find_python_dirs(
134                        &base_path.join(entry.file_name()),
135                        rest,
136                    ))
137                    .await;
138                    results.extend(sub);
139                }
140            }
141        }
142    } else if first == "*" {
143        // Generic wildcard: match any directory entry
144        if let Ok(mut entries) = tokio::fs::read_dir(base_path).await {
145            while let Ok(Some(entry)) = entries.next_entry().await {
146                let ft = match entry.file_type().await {
147                    Ok(ft) => ft,
148                    Err(_) => continue,
149                };
150                if !ft.is_dir() {
151                    continue;
152                }
153                let sub = Box::pin(find_python_dirs(
154                    &base_path.join(entry.file_name()),
155                    rest,
156                ))
157                .await;
158                results.extend(sub);
159            }
160        }
161    } else {
162        // Literal segment: just check if it exists
163        let sub =
164            Box::pin(find_python_dirs(&base_path.join(first), rest)).await;
165        results.extend(sub);
166    }
167
168    results
169}
170
171// ---------------------------------------------------------------------------
172// Helpers: site-packages discovery
173// ---------------------------------------------------------------------------
174
175/// Find `site-packages` (or `dist-packages`) directories under a base dir.
176///
177/// Handles both Unix (`lib/python3.X/site-packages`) and macOS/Linux layouts.
178pub async fn find_site_packages_under(
179    base_dir: &Path,
180    sub_dir_type: &str, // "site-packages" or "dist-packages"
181) -> Vec<PathBuf> {
182    if cfg!(windows) {
183        find_python_dirs(base_dir, &["Lib", sub_dir_type]).await
184    } else {
185        find_python_dirs(base_dir, &["lib", "python3.*", sub_dir_type]).await
186    }
187}
188
189/// Find local virtual environment `site-packages` directories.
190///
191/// Checks (in order):
192/// 1. `VIRTUAL_ENV` environment variable
193/// 2. `.venv` directory in `cwd`
194/// 3. `venv` directory in `cwd`
195pub async fn find_local_venv_site_packages(cwd: &Path) -> Vec<PathBuf> {
196    let mut results = Vec::new();
197
198    // 1. Check VIRTUAL_ENV env var
199    if let Ok(virtual_env) = std::env::var("VIRTUAL_ENV") {
200        let venv_path = PathBuf::from(&virtual_env);
201        let matches = find_site_packages_under(&venv_path, "site-packages").await;
202        results.extend(matches);
203        if !results.is_empty() {
204            return results;
205        }
206    }
207
208    // 2. Check .venv and venv in cwd
209    for venv_dir in &[".venv", "venv"] {
210        let venv_path = cwd.join(venv_dir);
211        let matches = find_site_packages_under(&venv_path, "site-packages").await;
212        results.extend(matches);
213    }
214
215    results
216}
217
218/// Get global/system Python `site-packages` directories.
219///
220/// Queries `python3` for site-packages paths, then checks well-known system
221/// locations including Homebrew, conda, uv tools, pip --user, etc.
222pub async fn get_global_python_site_packages() -> Vec<PathBuf> {
223    let mut results = Vec::new();
224    let mut seen = HashSet::new();
225
226    let add_path = |p: PathBuf, seen: &mut HashSet<PathBuf>, results: &mut Vec<PathBuf>| {
227        let resolved = if p.is_absolute() {
228            p
229        } else {
230            std::path::absolute(&p).unwrap_or(p)
231        };
232        if seen.insert(resolved.clone()) {
233            results.push(resolved);
234        }
235    };
236
237    // 1. Ask Python for site-packages
238    if let Some(python_cmd) = find_python_command() {
239        if let Ok(output) = Command::new(python_cmd)
240            .args([
241                "-c",
242                "import site; print('\\n'.join(site.getsitepackages())); print(site.getusersitepackages())",
243            ])
244            .stdin(Stdio::null())
245            .stdout(Stdio::piped())
246            .stderr(Stdio::piped())
247            .output()
248        {
249            if output.status.success() {
250                let stdout = String::from_utf8_lossy(&output.stdout);
251                for line in stdout.lines() {
252                    let p = line.trim();
253                    if !p.is_empty() {
254                        add_path(PathBuf::from(p), &mut seen, &mut results);
255                    }
256                }
257            }
258        }
259    }
260
261    // 2. Well-known system paths
262    let home_dir = std::env::var("HOME")
263        .or_else(|_| std::env::var("USERPROFILE"))
264        .unwrap_or_else(|_| "~".to_string());
265
266    // Helper closure to scan base/lib/python3.*/[dist|site]-packages
267    async fn scan_well_known(
268        base: &Path,
269        pkg_type: &str,
270        seen: &mut HashSet<PathBuf>,
271        results: &mut Vec<PathBuf>,
272    ) {
273        let matches = find_python_dirs(base, &["lib", "python3.*", pkg_type]).await;
274        for m in matches {
275            let resolved = if m.is_absolute() {
276                m
277            } else {
278                std::path::absolute(&m).unwrap_or(m)
279            };
280            if seen.insert(resolved.clone()) {
281                results.push(resolved);
282            }
283        }
284    }
285
286    if !cfg!(windows) {
287        // Debian/Ubuntu
288        scan_well_known(Path::new("/usr"), "dist-packages", &mut seen, &mut results).await;
289        scan_well_known(Path::new("/usr"), "site-packages", &mut seen, &mut results).await;
290        // Debian pip / most distros / macOS
291        scan_well_known(
292            Path::new("/usr/local"),
293            "dist-packages",
294            &mut seen,
295            &mut results,
296        )
297        .await;
298        scan_well_known(
299            Path::new("/usr/local"),
300            "site-packages",
301            &mut seen,
302            &mut results,
303        )
304        .await;
305        // pip --user on Unix
306        let user_local = PathBuf::from(&home_dir).join(".local");
307        scan_well_known(&user_local, "site-packages", &mut seen, &mut results).await;
308    }
309
310    // macOS-specific
311    if cfg!(target_os = "macos") {
312        scan_well_known(
313            Path::new("/opt/homebrew"),
314            "site-packages",
315            &mut seen,
316            &mut results,
317        )
318        .await;
319
320        // Python.org framework
321        let fw_matches = find_python_dirs(
322            Path::new("/Library/Frameworks/Python.framework/Versions"),
323            &["python3.*", "lib", "python3.*", "site-packages"],
324        )
325        .await;
326        for m in fw_matches {
327            add_path(m, &mut seen, &mut results);
328        }
329
330        let fw_matches2 = find_python_dirs(
331            Path::new("/Library/Frameworks/Python.framework"),
332            &["Versions", "*", "lib", "python3.*", "site-packages"],
333        )
334        .await;
335        for m in fw_matches2 {
336            add_path(m, &mut seen, &mut results);
337        }
338    }
339
340    // Windows-specific
341    if cfg!(windows) {
342        // pip --user on Windows: %APPDATA%\Python\PythonXY\site-packages
343        if let Ok(appdata) = std::env::var("APPDATA") {
344            let appdata_python = PathBuf::from(&appdata).join("Python");
345            if let Ok(mut entries) = tokio::fs::read_dir(&appdata_python).await {
346                while let Ok(Some(entry)) = entries.next_entry().await {
347                    let p = appdata_python.join(entry.file_name()).join("site-packages");
348                    if tokio::fs::metadata(&p).await.is_ok() {
349                        add_path(p, &mut seen, &mut results);
350                    }
351                }
352            }
353        }
354        // Common Windows Python install locations
355        for base in &["C:\\Python", "C:\\Program Files\\Python"] {
356            if let Ok(mut entries) = tokio::fs::read_dir(base).await {
357                while let Ok(Some(entry)) = entries.next_entry().await {
358                    let sp = PathBuf::from(base)
359                        .join(entry.file_name())
360                        .join("Lib")
361                        .join("site-packages");
362                    if tokio::fs::metadata(&sp).await.is_ok() {
363                        add_path(sp, &mut seen, &mut results);
364                    }
365                }
366            }
367        }
368        // Microsoft Store / python.org via LocalAppData
369        if let Ok(local) = std::env::var("LOCALAPPDATA") {
370            let programs_python = PathBuf::from(&local).join("Programs").join("Python");
371            if let Ok(mut entries) = tokio::fs::read_dir(&programs_python).await {
372                while let Ok(Some(entry)) = entries.next_entry().await {
373                    let sp = programs_python
374                        .join(entry.file_name())
375                        .join("Lib")
376                        .join("site-packages");
377                    if tokio::fs::metadata(&sp).await.is_ok() {
378                        add_path(sp, &mut seen, &mut results);
379                    }
380                }
381            }
382        }
383    }
384
385    // pyenv (works on macOS and Linux)
386    if !cfg!(windows) {
387        let pyenv_root = std::env::var("PYENV_ROOT")
388            .map(PathBuf::from)
389            .unwrap_or_else(|_| PathBuf::from(&home_dir).join(".pyenv"));
390        let pyenv_versions = pyenv_root.join("versions");
391        let pyenv_matches = find_python_dirs(
392            &pyenv_versions,
393            &["*", "lib", "python3.*", "site-packages"],
394        )
395        .await;
396        for m in pyenv_matches {
397            add_path(m, &mut seen, &mut results);
398        }
399    }
400
401    // Conda
402    let anaconda = PathBuf::from(&home_dir).join("anaconda3");
403    scan_well_known(&anaconda, "site-packages", &mut seen, &mut results).await;
404    let miniconda = PathBuf::from(&home_dir).join("miniconda3");
405    scan_well_known(&miniconda, "site-packages", &mut seen, &mut results).await;
406
407    // uv tools
408    if cfg!(target_os = "macos") {
409        let uv_base = PathBuf::from(&home_dir)
410            .join("Library")
411            .join("Application Support")
412            .join("uv")
413            .join("tools");
414        let uv_matches =
415            find_python_dirs(&uv_base, &["*", "lib", "python3.*", "site-packages"]).await;
416        for m in uv_matches {
417            add_path(m, &mut seen, &mut results);
418        }
419    } else if cfg!(windows) {
420        // %LOCALAPPDATA%\uv\tools
421        if let Ok(local) = std::env::var("LOCALAPPDATA") {
422            let uv_base = PathBuf::from(local).join("uv").join("tools");
423            let uv_matches =
424                find_python_dirs(&uv_base, &["*", "Lib", "site-packages"]).await;
425            for m in uv_matches {
426                add_path(m, &mut seen, &mut results);
427            }
428        }
429    } else {
430        let uv_base = PathBuf::from(&home_dir)
431            .join(".local")
432            .join("share")
433            .join("uv")
434            .join("tools");
435        let uv_matches =
436            find_python_dirs(&uv_base, &["*", "lib", "python3.*", "site-packages"]).await;
437        for m in uv_matches {
438            add_path(m, &mut seen, &mut results);
439        }
440    }
441
442    results
443}
444
445// ---------------------------------------------------------------------------
446// PythonCrawler
447// ---------------------------------------------------------------------------
448
/// Python ecosystem crawler for discovering packages in `site-packages`.
///
/// The type carries no state; it only groups the crawling methods. `Debug`
/// and `Clone` are derived so the crawler can be logged and duplicated like
/// any other public value type.
#[derive(Debug, Clone)]
pub struct PythonCrawler;
451
452impl PythonCrawler {
453    /// Create a new `PythonCrawler`.
454    pub fn new() -> Self {
455        Self
456    }
457
458    /// Get `site-packages` paths based on options.
459    pub async fn get_site_packages_paths(&self, options: &CrawlerOptions) -> Result<Vec<PathBuf>, std::io::Error> {
460        if options.global || options.global_prefix.is_some() {
461            if let Some(ref custom) = options.global_prefix {
462                return Ok(vec![custom.clone()]);
463            }
464            return Ok(get_global_python_site_packages().await);
465        }
466        Ok(find_local_venv_site_packages(&options.cwd).await)
467    }
468
469    /// Crawl all discovered `site-packages` and return every package found.
470    pub async fn crawl_all(&self, options: &CrawlerOptions) -> Vec<CrawledPackage> {
471        let mut packages = Vec::new();
472        let mut seen = HashSet::new();
473
474        let sp_paths = self.get_site_packages_paths(options).await.unwrap_or_default();
475
476        for sp_path in &sp_paths {
477            let found = self.scan_site_packages(sp_path, &mut seen).await;
478            packages.extend(found);
479        }
480
481        packages
482    }
483
484    /// Find specific packages by PURL.
485    ///
486    /// Accepts base PURLs (no qualifiers) — the caller should strip qualifiers
487    /// before calling.
488    pub async fn find_by_purls(
489        &self,
490        site_packages_path: &Path,
491        purls: &[String],
492    ) -> Result<HashMap<String, CrawledPackage>, std::io::Error> {
493        let mut result = HashMap::new();
494
495        // Build lookup: canonicalized-name@version -> purl
496        let mut purl_lookup: HashMap<String, &str> = HashMap::new();
497        for purl in purls {
498            if let Some((name, version)) = Self::parse_pypi_purl(purl) {
499                let key = format!("{}@{}", canonicalize_pypi_name(&name), version);
500                purl_lookup.insert(key, purl.as_str());
501            }
502        }
503
504        if purl_lookup.is_empty() {
505            return Ok(result);
506        }
507
508        // Scan all .dist-info dirs
509        let entries = match tokio::fs::read_dir(site_packages_path).await {
510            Ok(rd) => {
511                let mut entries = rd;
512                let mut v = Vec::new();
513                while let Ok(Some(entry)) = entries.next_entry().await {
514                    v.push(entry);
515                }
516                v
517            }
518            Err(_) => return Ok(result),
519        };
520
521        for entry in entries {
522            let name = entry.file_name();
523            let name_str = name.to_string_lossy();
524            if !name_str.ends_with(".dist-info") {
525                continue;
526            }
527
528            let dist_info_path = site_packages_path.join(&*name_str);
529            if let Some((raw_name, version)) = read_python_metadata(&dist_info_path).await {
530                let canon_name = canonicalize_pypi_name(&raw_name);
531                let key = format!("{canon_name}@{version}");
532
533                if let Some(&matched_purl) = purl_lookup.get(&key) {
534                    result.insert(
535                        matched_purl.to_string(),
536                        CrawledPackage {
537                            name: canon_name,
538                            version,
539                            namespace: None,
540                            purl: matched_purl.to_string(),
541                            path: site_packages_path.to_path_buf(),
542                        },
543                    );
544                }
545            }
546        }
547
548        Ok(result)
549    }
550
551    // ------------------------------------------------------------------
552    // Private helpers
553    // ------------------------------------------------------------------
554
555    /// Scan a `site-packages` directory for `.dist-info` directories.
556    async fn scan_site_packages(
557        &self,
558        site_packages_path: &Path,
559        seen: &mut HashSet<String>,
560    ) -> Vec<CrawledPackage> {
561        let mut results = Vec::new();
562
563        let entries = match tokio::fs::read_dir(site_packages_path).await {
564            Ok(rd) => {
565                let mut entries = rd;
566                let mut v = Vec::new();
567                while let Ok(Some(entry)) = entries.next_entry().await {
568                    v.push(entry);
569                }
570                v
571            }
572            Err(_) => return results,
573        };
574
575        for entry in entries {
576            let name = entry.file_name();
577            let name_str = name.to_string_lossy();
578            if !name_str.ends_with(".dist-info") {
579                continue;
580            }
581
582            let dist_info_path = site_packages_path.join(&*name_str);
583            if let Some((raw_name, version)) = read_python_metadata(&dist_info_path).await {
584                let canon_name = canonicalize_pypi_name(&raw_name);
585                let purl = format!("pkg:pypi/{canon_name}@{version}");
586
587                if seen.contains(&purl) {
588                    continue;
589                }
590                seen.insert(purl.clone());
591
592                results.push(CrawledPackage {
593                    name: canon_name,
594                    version,
595                    namespace: None,
596                    purl,
597                    path: site_packages_path.to_path_buf(),
598                });
599            }
600        }
601
602        results
603    }
604
605    /// Parse a PyPI PURL string to extract name and version.
606    /// Strips qualifiers before parsing.
607    fn parse_pypi_purl(purl: &str) -> Option<(String, String)> {
608        // Strip qualifiers
609        let base = match purl.find('?') {
610            Some(idx) => &purl[..idx],
611            None => purl,
612        };
613
614        let rest = base.strip_prefix("pkg:pypi/")?;
615        let at_idx = rest.rfind('@')?;
616        let name = &rest[..at_idx];
617        let version = &rest[at_idx + 1..];
618
619        if name.is_empty() || version.is_empty() {
620            return None;
621        }
622
623        Some((name.to_string(), version.to_string()))
624    }
625}
626
627impl Default for PythonCrawler {
628    fn default() -> Self {
629        Self::new()
630    }
631}
632
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal METADATA payload shared by the crawl and lookup tests.
    const REQUESTS_METADATA: &str = "Metadata-Version: 2.1\nName: Requests\nVersion: 2.28.0\n";

    /// Create `root/<parts...>` (including parents) and return the full path.
    async fn make_dirs(root: &Path, parts: &[&str]) -> PathBuf {
        let target = parts
            .iter()
            .fold(root.to_path_buf(), |acc, part| acc.join(part));
        tokio::fs::create_dir_all(&target).await.unwrap();
        target
    }

    /// Create `<parent>/<dir_name>/METADATA` with `contents`; returns the dist-info path.
    async fn write_dist_info(parent: &Path, dir_name: &str, contents: &str) -> PathBuf {
        let dist_info = parent.join(dir_name);
        tokio::fs::create_dir_all(&dist_info).await.unwrap();
        tokio::fs::write(dist_info.join("METADATA"), contents)
            .await
            .unwrap();
        dist_info
    }

    #[test]
    fn test_canonicalize_pypi_name_basic() {
        let cases = [
            ("Requests", "requests"),
            ("my_package", "my-package"),
            ("My.Package", "my-package"),
            ("My-._Package", "my-package"),
        ];
        for (input, expected) in cases {
            assert_eq!(canonicalize_pypi_name(input), expected);
        }
    }

    #[test]
    fn test_canonicalize_pypi_name_runs() {
        // Every run of separators must collapse to one '-'.
        for input in ["a__b", "a-.-b", "a_._-b"] {
            assert_eq!(canonicalize_pypi_name(input), "a-b");
        }
    }

    #[test]
    fn test_canonicalize_pypi_name_trim() {
        assert_eq!(canonicalize_pypi_name("  requests  "), "requests");
    }

    #[test]
    fn test_parse_pypi_purl() {
        let parsed = PythonCrawler::parse_pypi_purl("pkg:pypi/requests@2.28.0");
        assert_eq!(
            parsed,
            Some(("requests".to_string(), "2.28.0".to_string()))
        );
    }

    #[test]
    fn test_parse_pypi_purl_with_qualifiers() {
        let parsed = PythonCrawler::parse_pypi_purl("pkg:pypi/requests@2.28.0?artifact_id=abc");
        assert_eq!(
            parsed,
            Some(("requests".to_string(), "2.28.0".to_string()))
        );
    }

    #[test]
    fn test_parse_pypi_purl_invalid() {
        for bad in ["pkg:npm/lodash@4.17.21", "not-a-purl"] {
            assert!(PythonCrawler::parse_pypi_purl(bad).is_none());
        }
    }

    #[tokio::test]
    async fn test_read_python_metadata_valid() {
        let dir = tempfile::tempdir().unwrap();
        let dist_info = write_dist_info(
            dir.path(),
            "requests-2.28.0.dist-info",
            "Metadata-Version: 2.1\nName: Requests\nVersion: 2.28.0\n\nSome description",
        )
        .await;

        let parsed = read_python_metadata(&dist_info).await;
        assert_eq!(
            parsed,
            Some(("Requests".to_string(), "2.28.0".to_string()))
        );
    }

    #[tokio::test]
    async fn test_read_python_metadata_missing() {
        let dir = tempfile::tempdir().unwrap();
        let absent = dir.path().join("nonexistent.dist-info");
        assert!(read_python_metadata(&absent).await.is_none());
    }

    #[tokio::test]
    async fn test_find_python_dirs_literal() {
        let dir = tempfile::tempdir().unwrap();
        let target = make_dirs(dir.path(), &["lib", "python3.11", "site-packages"]).await;

        let found = find_python_dirs(dir.path(), &["lib", "python3.*", "site-packages"]).await;
        assert_eq!(found.len(), 1);
        assert_eq!(found[0], target);
    }

    #[tokio::test]
    async fn test_find_python_dirs_wildcard() {
        let dir = tempfile::tempdir().unwrap();
        // Two matching interpreters plus one directory that must be ignored.
        for interpreter in ["python3.10", "python3.11", "ruby3.0"] {
            make_dirs(dir.path(), &["lib", interpreter, "site-packages"]).await;
        }

        let found = find_python_dirs(dir.path(), &["lib", "python3.*", "site-packages"]).await;
        assert_eq!(found.len(), 2);
    }

    #[tokio::test]
    async fn test_find_python_dirs_star_wildcard() {
        let dir = tempfile::tempdir().unwrap();
        let target = make_dirs(
            dir.path(),
            &["tools", "mytool", "lib", "python3.11", "site-packages"],
        )
        .await;

        let found = find_python_dirs(
            dir.path(),
            &["tools", "*", "lib", "python3.*", "site-packages"],
        )
        .await;
        assert_eq!(found.len(), 1);
        assert_eq!(found[0], target);
    }

    #[tokio::test]
    async fn test_find_python_dirs_pyenv_layout() {
        // pyenv-like layout: versions/<full version>/lib/python3.X/site-packages
        let dir = tempfile::tempdir().unwrap();
        let first = make_dirs(
            dir.path(),
            &["versions", "3.11.5", "lib", "python3.11", "site-packages"],
        )
        .await;
        let second = make_dirs(
            dir.path(),
            &["versions", "3.12.0", "lib", "python3.12", "site-packages"],
        )
        .await;

        let found = find_python_dirs(
            &dir.path().join("versions"),
            &["*", "lib", "python3.*", "site-packages"],
        )
        .await;
        assert_eq!(found.len(), 2);
        assert!(found.contains(&first));
        assert!(found.contains(&second));
    }

    #[tokio::test]
    async fn test_crawl_all_python() {
        let dir = tempfile::tempdir().unwrap();
        let layout: &[&str] = if cfg!(windows) {
            &[".venv", "Lib", "site-packages"]
        } else {
            &[".venv", "lib", "python3.11", "site-packages"]
        };
        let site_packages = make_dirs(dir.path(), layout).await;
        write_dist_info(&site_packages, "requests-2.28.0.dist-info", REQUESTS_METADATA).await;

        let options = CrawlerOptions {
            cwd: dir.path().to_path_buf(),
            global: false,
            global_prefix: None,
            batch_size: 100,
        };

        let packages = PythonCrawler::new().crawl_all(&options).await;
        assert_eq!(packages.len(), 1);

        let package = &packages[0];
        assert_eq!(package.name, "requests");
        assert_eq!(package.version, "2.28.0");
        assert_eq!(package.purl, "pkg:pypi/requests@2.28.0");
        assert!(package.namespace.is_none());
    }

    #[test]
    fn test_find_python_command() {
        // Python may legitimately be absent, so `None` is accepted; a reported
        // command must be one of the known candidates.
        if let Some(found) = find_python_command() {
            assert!(
                ["python3", "python", "py"].contains(&found),
                "unexpected command: {found}"
            );
        }
    }

    #[test]
    fn test_home_dir_detection() {
        // Mirrors the fallback chain used by the crawler: HOME -> USERPROFILE -> "~".
        let home = std::env::var("HOME")
            .or_else(|_| std::env::var("USERPROFILE"))
            .unwrap_or_else(|_| "~".to_string());
        // A CI or dev machine is expected to provide a real path, not the fallback.
        assert_ne!(home, "~", "expected a real home directory");
        assert!(!home.is_empty());
    }

    #[tokio::test]
    async fn test_find_by_purls_python() {
        let dir = tempfile::tempdir().unwrap();
        let site_packages = dir.path().to_path_buf();
        write_dist_info(&site_packages, "requests-2.28.0.dist-info", REQUESTS_METADATA).await;

        let purls = vec![
            "pkg:pypi/requests@2.28.0".to_string(),
            "pkg:pypi/flask@3.0.0".to_string(),
        ];

        let found = PythonCrawler::new()
            .find_by_purls(&site_packages, &purls)
            .await
            .unwrap();
        assert_eq!(found.len(), 1);
        assert!(found.contains_key("pkg:pypi/requests@2.28.0"));
        assert!(!found.contains_key("pkg:pypi/flask@3.0.0"));
    }
}