Skip to main content

socket_patch_core/crawlers/
python_crawler.rs

1use std::collections::{HashMap, HashSet};
2use std::path::{Path, PathBuf};
3
4use super::types::{CrawledPackage, CrawlerOptions};
5use crate::utils::process::{CommandRunner, SystemCommandRunner};
6
7// ---------------------------------------------------------------------------
8// Python command discovery
9// ---------------------------------------------------------------------------
10
11/// Find a working Python command on the system.
12///
13/// Tries `python3`, `python`, and `py` (Windows launcher) in order,
14/// returning the first one that responds to `--version`.
15pub fn find_python_command() -> Option<&'static str> {
16    find_python_command_with(&SystemCommandRunner)
17}
18
19/// Version of `find_python_command` that accepts an injected
20/// `CommandRunner`. Tests inject a `MockCommandRunner` that returns
21/// `Some(...)` for `python3 --version` to exercise the success arm
22/// without a real Python on PATH.
23pub fn find_python_command_with(runner: &dyn CommandRunner) -> Option<&'static str> {
24    ["python3", "python", "py"]
25        .into_iter()
26        .find(|cmd| runner.run(cmd, &["--version"]).is_some())
27}
28
/// Default batch size for crawling.
///
/// NOTE(review): nothing in this file reads this constant; the leading
/// underscore presumably exists to keep it from tripping the dead-code
/// lint. Confirm whether it is still needed or can be removed.
const _DEFAULT_BATCH_SIZE: usize = 100;
31
32// ---------------------------------------------------------------------------
33// PEP 503 name canonicalization
34// ---------------------------------------------------------------------------
35
/// Normalize a Python distribution name following PEP 503.
///
/// The input is trimmed and lowercased, and every run of the separator
/// characters `-`, `_`, `.` is collapsed into a single `-`. Leading and
/// trailing separator runs are kept (as one `-`), not stripped.
pub fn canonicalize_pypi_name(name: &str) -> String {
    let lowered = name.trim().to_lowercase();
    let mut canonical = String::with_capacity(lowered.len());

    for ch in lowered.chars() {
        match ch {
            '-' | '_' | '.' => {
                // `-` is only ever pushed for a separator, so a trailing `-`
                // means we are already inside a separator run.
                if !canonical.ends_with('-') {
                    canonical.push('-');
                }
            }
            other => canonical.push(other),
        }
    }

    canonical
}
59
60// ---------------------------------------------------------------------------
61// Helpers: read Python metadata from dist-info
62// ---------------------------------------------------------------------------
63
64/// Read `Name` and `Version` from a `.dist-info/METADATA` file.
65pub async fn read_python_metadata(dist_info_path: &Path) -> Option<(String, String)> {
66    let metadata_path = dist_info_path.join("METADATA");
67    let content = tokio::fs::read_to_string(&metadata_path).await.ok()?;
68
69    let mut name: Option<String> = None;
70    let mut version: Option<String> = None;
71
72    for line in content.lines() {
73        if name.is_some() && version.is_some() {
74            break;
75        }
76        if let Some(rest) = line.strip_prefix("Name:") {
77            name = Some(rest.trim().to_string());
78        } else if let Some(rest) = line.strip_prefix("Version:") {
79            version = Some(rest.trim().to_string());
80        }
81        // Stop at first empty line (end of headers)
82        if line.trim().is_empty() && (name.is_some() || version.is_some()) {
83            break;
84        }
85    }
86
87    match (name, version) {
88        (Some(n), Some(v)) if !n.is_empty() && !v.is_empty() => Some((n, v)),
89        _ => None,
90    }
91}
92
93// ---------------------------------------------------------------------------
94// Helpers: find Python directories with wildcard matching
95// ---------------------------------------------------------------------------
96
97/// Find directories matching a path pattern with wildcard segments.
98///
99/// Supported wildcards:
100/// - `"python3.*"` — matches directory entries starting with `python3.`
101/// - `"*"` — matches any directory entry
102///
103/// All other segments are treated as literal path components.
104pub async fn find_python_dirs(base_path: &Path, segments: &[&str]) -> Vec<PathBuf> {
105    let mut results = Vec::new();
106
107    // Check that base_path is a directory
108    match tokio::fs::metadata(base_path).await {
109        Ok(m) if m.is_dir() => {}
110        _ => return results,
111    }
112
113    if segments.is_empty() {
114        results.push(base_path.to_path_buf());
115        return results;
116    }
117
118    let first = segments[0];
119    let rest = &segments[1..];
120
121    if first == "python3.*" {
122        // Wildcard: list directory and match python3.X entries
123        for entry in crate::utils::fs::list_dir_entries(base_path).await {
124            if !crate::utils::fs::entry_is_dir(&entry).await {
125                continue;
126            }
127            let name = entry.file_name();
128            let name_str = name.to_string_lossy();
129            if name_str.starts_with("python3.") {
130                let sub = Box::pin(find_python_dirs(
131                    &base_path.join(entry.file_name()),
132                    rest,
133                ))
134                .await;
135                results.extend(sub);
136            }
137        }
138    } else if first == "*" {
139        // Generic wildcard: match any directory entry
140        for entry in crate::utils::fs::list_dir_entries(base_path).await {
141            if !crate::utils::fs::entry_is_dir(&entry).await {
142                continue;
143            }
144            let sub = Box::pin(find_python_dirs(
145                &base_path.join(entry.file_name()),
146                rest,
147            ))
148            .await;
149            results.extend(sub);
150        }
151    } else {
152        // Literal segment: just check if it exists
153        let sub =
154            Box::pin(find_python_dirs(&base_path.join(first), rest)).await;
155        results.extend(sub);
156    }
157
158    results
159}
160
161// ---------------------------------------------------------------------------
162// Helpers: site-packages discovery
163// ---------------------------------------------------------------------------
164
165/// Find `site-packages` (or `dist-packages`) directories under a base dir.
166///
167/// Handles both Unix (`lib/python3.X/site-packages`) and macOS/Linux layouts.
168pub async fn find_site_packages_under(
169    base_dir: &Path,
170    sub_dir_type: &str, // "site-packages" or "dist-packages"
171) -> Vec<PathBuf> {
172    #[cfg(windows)]
173    {
174        find_python_dirs(base_dir, &["Lib", sub_dir_type]).await
175    }
176    #[cfg(not(windows))]
177    {
178        find_python_dirs(base_dir, &["lib", "python3.*", sub_dir_type]).await
179    }
180}
181
182/// Find local virtual environment `site-packages` directories.
183///
184/// Checks (in order):
185/// 1. `VIRTUAL_ENV` environment variable
186/// 2. `.venv` directory in `cwd`
187/// 3. `venv` directory in `cwd`
188pub async fn find_local_venv_site_packages(cwd: &Path) -> Vec<PathBuf> {
189    let mut results = Vec::new();
190
191    // 1. Check VIRTUAL_ENV env var
192    if let Ok(virtual_env) = std::env::var("VIRTUAL_ENV") {
193        let venv_path = PathBuf::from(&virtual_env);
194        let matches = find_site_packages_under(&venv_path, "site-packages").await;
195        results.extend(matches);
196        if !results.is_empty() {
197            return results;
198        }
199    }
200
201    // 2. Check .venv and venv in cwd
202    for venv_dir in &[".venv", "venv"] {
203        let venv_path = cwd.join(venv_dir);
204        let matches = find_site_packages_under(&venv_path, "site-packages").await;
205        results.extend(matches);
206    }
207
208    results
209}
210
211/// Get global/system Python `site-packages` directories.
212///
213/// Queries `python3` for site-packages paths, then checks well-known system
214/// locations including Homebrew, conda, uv tools, pip --user, etc.
215pub async fn get_global_python_site_packages() -> Vec<PathBuf> {
216    let mut results = Vec::new();
217    let mut seen = HashSet::new();
218
219    let add_path = |p: PathBuf, seen: &mut HashSet<PathBuf>, results: &mut Vec<PathBuf>| {
220        let resolved = if p.is_absolute() {
221            p
222        } else {
223            std::path::absolute(&p).unwrap_or(p)
224        };
225        if seen.insert(resolved.clone()) {
226            results.push(resolved);
227        }
228    };
229
230    // 1. Ask Python for site-packages
231    if let Some(python_cmd) = find_python_command() {
232        let runner = SystemCommandRunner;
233        if let Some(stdout) = runner.run(
234            python_cmd,
235            &[
236                "-c",
237                "import site; print('\\n'.join(site.getsitepackages())); print(site.getusersitepackages())",
238            ],
239        ) {
240            for p in parse_python_site_packages_output(&stdout) {
241                add_path(p, &mut seen, &mut results);
242            }
243        }
244    }
245
246    // 2. Well-known system paths
247    let home_dir = std::env::var("HOME")
248        .or_else(|_| std::env::var("USERPROFILE"))
249        .unwrap_or_else(|_| "~".to_string());
250
251    // Helper closure to scan base/lib/python3.*/[dist|site]-packages
252    async fn scan_well_known(
253        base: &Path,
254        pkg_type: &str,
255        seen: &mut HashSet<PathBuf>,
256        results: &mut Vec<PathBuf>,
257    ) {
258        let matches = find_python_dirs(base, &["lib", "python3.*", pkg_type]).await;
259        for m in matches {
260            let resolved = if m.is_absolute() {
261                m
262            } else {
263                std::path::absolute(&m).unwrap_or(m)
264            };
265            if seen.insert(resolved.clone()) {
266                results.push(resolved);
267            }
268        }
269    }
270
271    #[cfg(not(windows))]
272    {
273        // Debian/Ubuntu
274        scan_well_known(Path::new("/usr"), "dist-packages", &mut seen, &mut results).await;
275        scan_well_known(Path::new("/usr"), "site-packages", &mut seen, &mut results).await;
276        // Debian pip / most distros / macOS
277        scan_well_known(
278            Path::new("/usr/local"),
279            "dist-packages",
280            &mut seen,
281            &mut results,
282        )
283        .await;
284        scan_well_known(
285            Path::new("/usr/local"),
286            "site-packages",
287            &mut seen,
288            &mut results,
289        )
290        .await;
291        // pip --user on Unix
292        let user_local = PathBuf::from(&home_dir).join(".local");
293        scan_well_known(&user_local, "site-packages", &mut seen, &mut results).await;
294    }
295
296    // macOS-specific
297    #[cfg(target_os = "macos")]
298    {
299        scan_well_known(
300            Path::new("/opt/homebrew"),
301            "site-packages",
302            &mut seen,
303            &mut results,
304        )
305        .await;
306
307        // Python.org framework
308        let fw_matches = find_python_dirs(
309            Path::new("/Library/Frameworks/Python.framework/Versions"),
310            &["python3.*", "lib", "python3.*", "site-packages"],
311        )
312        .await;
313        for m in fw_matches {
314            add_path(m, &mut seen, &mut results);
315        }
316
317        let fw_matches2 = find_python_dirs(
318            Path::new("/Library/Frameworks/Python.framework"),
319            &["Versions", "*", "lib", "python3.*", "site-packages"],
320        )
321        .await;
322        for m in fw_matches2 {
323            add_path(m, &mut seen, &mut results);
324        }
325    }
326
327    // Windows-specific
328    #[cfg(windows)]
329    {
330        // pip --user on Windows: %APPDATA%\Python\PythonXY\site-packages
331        if let Ok(appdata) = std::env::var("APPDATA") {
332            let appdata_python = PathBuf::from(&appdata).join("Python");
333            for entry in crate::utils::fs::list_dir_entries(&appdata_python).await {
334                let p = appdata_python.join(entry.file_name()).join("site-packages");
335                if tokio::fs::metadata(&p).await.is_ok() {
336                    add_path(p, &mut seen, &mut results);
337                }
338            }
339        }
340        // Common Windows Python install locations
341        for base in &["C:\\Python", "C:\\Program Files\\Python"] {
342            for entry in crate::utils::fs::list_dir_entries(Path::new(base)).await {
343                let sp = PathBuf::from(base)
344                    .join(entry.file_name())
345                    .join("Lib")
346                    .join("site-packages");
347                if tokio::fs::metadata(&sp).await.is_ok() {
348                    add_path(sp, &mut seen, &mut results);
349                }
350            }
351        }
352        // Microsoft Store / python.org via LocalAppData
353        if let Ok(local) = std::env::var("LOCALAPPDATA") {
354            let programs_python = PathBuf::from(&local).join("Programs").join("Python");
355            for entry in crate::utils::fs::list_dir_entries(&programs_python).await {
356                let sp = programs_python
357                    .join(entry.file_name())
358                    .join("Lib")
359                    .join("site-packages");
360                if tokio::fs::metadata(&sp).await.is_ok() {
361                    add_path(sp, &mut seen, &mut results);
362                }
363            }
364        }
365    }
366
367    // pyenv (works on macOS and Linux)
368    #[cfg(not(windows))]
369    {
370        let pyenv_root = std::env::var("PYENV_ROOT")
371            .map(PathBuf::from)
372            .unwrap_or_else(|_| PathBuf::from(&home_dir).join(".pyenv"));
373        let pyenv_versions = pyenv_root.join("versions");
374        let pyenv_matches = find_python_dirs(
375            &pyenv_versions,
376            &["*", "lib", "python3.*", "site-packages"],
377        )
378        .await;
379        for m in pyenv_matches {
380            add_path(m, &mut seen, &mut results);
381        }
382    }
383
384    // Conda
385    let anaconda = PathBuf::from(&home_dir).join("anaconda3");
386    scan_well_known(&anaconda, "site-packages", &mut seen, &mut results).await;
387    let miniconda = PathBuf::from(&home_dir).join("miniconda3");
388    scan_well_known(&miniconda, "site-packages", &mut seen, &mut results).await;
389
390    // uv tools — platform-specific install root.
391    #[cfg(target_os = "macos")]
392    {
393        let uv_base = PathBuf::from(&home_dir)
394            .join("Library")
395            .join("Application Support")
396            .join("uv")
397            .join("tools");
398        let uv_matches =
399            find_python_dirs(&uv_base, &["*", "lib", "python3.*", "site-packages"]).await;
400        for m in uv_matches {
401            add_path(m, &mut seen, &mut results);
402        }
403    }
404    #[cfg(windows)]
405    {
406        // %LOCALAPPDATA%\uv\tools
407        if let Ok(local) = std::env::var("LOCALAPPDATA") {
408            let uv_base = PathBuf::from(local).join("uv").join("tools");
409            let uv_matches =
410                find_python_dirs(&uv_base, &["*", "Lib", "site-packages"]).await;
411            for m in uv_matches {
412                add_path(m, &mut seen, &mut results);
413            }
414        }
415    }
416    #[cfg(all(not(target_os = "macos"), not(windows)))]
417    {
418        let uv_base = PathBuf::from(&home_dir)
419            .join(".local")
420            .join("share")
421            .join("uv")
422            .join("tools");
423        let uv_matches =
424            find_python_dirs(&uv_base, &["*", "lib", "python3.*", "site-packages"]).await;
425        for m in uv_matches {
426            add_path(m, &mut seen, &mut results);
427        }
428    }
429
430    // uv-managed Python interpreters (`uv python install 3.X`) live at:
431    //   Linux/macOS: ~/.local/share/uv/python/cpython-3.X.*/lib/python3.X/site-packages/
432    //   Windows:     %LOCALAPPDATA%\uv\python\cpython-3.X.*\Lib\site-packages\
433    // The typical flow is `uv venv` + `uv pip install`, where the venv layout
434    // is already covered by `find_local_venv_site_packages`. But power users
435    // can install packages directly into the managed interpreter (e.g. via
436    // `<uv-python>/bin/pip install ...`), and globally-discovered crawls
437    // should surface those.
438    #[cfg(not(windows))]
439    {
440        let uv_python = PathBuf::from(&home_dir)
441            .join(".local")
442            .join("share")
443            .join("uv")
444            .join("python");
445        let uv_matches =
446            find_python_dirs(&uv_python, &["*", "lib", "python3.*", "site-packages"]).await;
447        for m in uv_matches {
448            add_path(m, &mut seen, &mut results);
449        }
450    }
451    #[cfg(windows)]
452    {
453        if let Ok(local) = std::env::var("LOCALAPPDATA") {
454            let uv_python = PathBuf::from(local).join("uv").join("python");
455            let uv_matches =
456                find_python_dirs(&uv_python, &["*", "Lib", "site-packages"]).await;
457            for m in uv_matches {
458                add_path(m, &mut seen, &mut results);
459            }
460        }
461    }
462
463    results
464}
465
466/// Returns true if `cwd` looks like a Python project root.
467///
468/// Used by `PythonCrawler::get_site_packages_paths` to decide
469/// whether to fall back to the global-discovery path when no venv
470/// was found. Mirrors `is_dotnet_project` in nuget_crawler and the
471/// `has_gemfile || has_gemfile_lock` check in ruby_crawler.
472///
473/// The list intentionally covers all major Python toolchains:
474///   * `pyproject.toml` — PEP 518 / 621 (poetry, hatch, uv, flit,
475///     setuptools-PEP-517, pdm, etc. — anything modern)
476///   * `setup.py` / `setup.cfg` — legacy setuptools
477///   * `requirements.txt` — pip-compile / bare requirements
478///   * `uv.lock` — uv-managed projects (PEP 751 export sibling is
479///     `pylock.toml` but in practice `uv.lock` is what ships)
480async fn is_python_project(cwd: &Path) -> bool {
481    let markers = [
482        "pyproject.toml",
483        "setup.py",
484        "setup.cfg",
485        "requirements.txt",
486        "uv.lock",
487    ];
488    for m in &markers {
489        if tokio::fs::metadata(cwd.join(m)).await.is_ok() {
490            return true;
491        }
492    }
493    false
494}
495
496// ---------------------------------------------------------------------------
497// PythonCrawler
498// ---------------------------------------------------------------------------
499
500/// Python ecosystem crawler for discovering packages in `site-packages`.
501pub struct PythonCrawler;
502
503impl PythonCrawler {
504    /// Create a new `PythonCrawler`.
505    pub fn new() -> Self {
506        Self
507    }
508
509    /// Get `site-packages` paths based on options.
510    ///
511    /// Local-mode discovery has two stages:
512    ///   1. `find_local_venv_site_packages` — handles `VIRTUAL_ENV`,
513    ///      `.venv`, and `venv` directories (covers the common case
514    ///      of an activated or project-local venv).
515    ///   2. If no venv was found AND the cwd looks like a Python
516    ///      project (`pyproject.toml`, `setup.py`, `setup.cfg`,
517    ///      `requirements.txt`, or `uv.lock` present), fall through
518    ///      to `get_global_python_site_packages`. This mirrors the
519    ///      cargo / ruby / go pattern where a project marker
520    ///      indicates "scan this ecosystem globally for this project".
521    ///
522    /// Without the marker fallback, a fresh clone with
523    /// `pyproject.toml` + `uv.lock` but no `.venv` would silently
524    /// return zero packages.
525    pub async fn get_site_packages_paths(&self, options: &CrawlerOptions) -> Result<Vec<PathBuf>, std::io::Error> {
526        if options.global || options.global_prefix.is_some() {
527            if let Some(ref custom) = options.global_prefix {
528                return Ok(vec![custom.clone()]);
529            }
530            return Ok(get_global_python_site_packages().await);
531        }
532        let venv_paths = find_local_venv_site_packages(&options.cwd).await;
533        if !venv_paths.is_empty() {
534            return Ok(venv_paths);
535        }
536        if is_python_project(&options.cwd).await {
537            return Ok(get_global_python_site_packages().await);
538        }
539        Ok(Vec::new())
540    }
541
542    /// Crawl all discovered `site-packages` and return every package found.
543    pub async fn crawl_all(&self, options: &CrawlerOptions) -> Vec<CrawledPackage> {
544        let mut packages = Vec::new();
545        let mut seen = HashSet::new();
546
547        let sp_paths = self.get_site_packages_paths(options).await.unwrap_or_default();
548
549        for sp_path in &sp_paths {
550            let found = self.scan_site_packages(sp_path, &mut seen).await;
551            packages.extend(found);
552        }
553
554        packages
555    }
556
557    /// Find specific packages by PURL.
558    ///
559    /// Accepts base PURLs (no qualifiers) — the caller should strip qualifiers
560    /// before calling.
561    pub async fn find_by_purls(
562        &self,
563        site_packages_path: &Path,
564        purls: &[String],
565    ) -> Result<HashMap<String, CrawledPackage>, std::io::Error> {
566        let mut result = HashMap::new();
567
568        // Build lookup: canonicalized-name@version -> purl
569        let mut purl_lookup: HashMap<String, &str> = HashMap::new();
570        for purl in purls {
571            if let Some((name, version)) = Self::parse_pypi_purl(purl) {
572                let key = format!("{}@{}", canonicalize_pypi_name(&name), version);
573                purl_lookup.insert(key, purl.as_str());
574            }
575        }
576
577        if purl_lookup.is_empty() {
578            return Ok(result);
579        }
580
581        // Scan all .dist-info dirs
582        for entry in crate::utils::fs::list_dir_entries(site_packages_path).await {
583            let name = entry.file_name();
584            let name_str = name.to_string_lossy();
585            if !name_str.ends_with(".dist-info") {
586                continue;
587            }
588
589            let dist_info_path = site_packages_path.join(&*name_str);
590            if let Some((raw_name, version)) = read_python_metadata(&dist_info_path).await {
591                let canon_name = canonicalize_pypi_name(&raw_name);
592                let key = format!("{canon_name}@{version}");
593
594                if let Some(&matched_purl) = purl_lookup.get(&key) {
595                    result.insert(
596                        matched_purl.to_string(),
597                        CrawledPackage {
598                            name: canon_name,
599                            version,
600                            namespace: None,
601                            purl: matched_purl.to_string(),
602                            path: site_packages_path.to_path_buf(),
603                        },
604                    );
605                }
606            }
607        }
608
609        Ok(result)
610    }
611
612    // ------------------------------------------------------------------
613    // Private helpers
614    // ------------------------------------------------------------------
615
616    /// Scan a `site-packages` directory for `.dist-info` directories.
617    async fn scan_site_packages(
618        &self,
619        site_packages_path: &Path,
620        seen: &mut HashSet<String>,
621    ) -> Vec<CrawledPackage> {
622        let mut results = Vec::new();
623
624        for entry in crate::utils::fs::list_dir_entries(site_packages_path).await {
625            let name = entry.file_name();
626            let name_str = name.to_string_lossy();
627            if !name_str.ends_with(".dist-info") {
628                continue;
629            }
630
631            let dist_info_path = site_packages_path.join(&*name_str);
632            if let Some((raw_name, version)) = read_python_metadata(&dist_info_path).await {
633                let canon_name = canonicalize_pypi_name(&raw_name);
634                let purl = format!("pkg:pypi/{canon_name}@{version}");
635
636                if seen.contains(&purl) {
637                    continue;
638                }
639                seen.insert(purl.clone());
640
641                results.push(CrawledPackage {
642                    name: canon_name,
643                    version,
644                    namespace: None,
645                    purl,
646                    path: site_packages_path.to_path_buf(),
647                });
648            }
649        }
650
651        results
652    }
653
654    /// Parse a PyPI PURL string to extract name and version.
655    /// Strips qualifiers before parsing.
656    fn parse_pypi_purl(purl: &str) -> Option<(String, String)> {
657        // Strip qualifiers
658        let base = match purl.find('?') {
659            Some(idx) => &purl[..idx],
660            None => purl,
661        };
662
663        let rest = base.strip_prefix("pkg:pypi/")?;
664        let at_idx = rest.rfind('@')?;
665        let name = &rest[..at_idx];
666        let version = &rest[at_idx + 1..];
667
668        if name.is_empty() || version.is_empty() {
669            return None;
670        }
671
672        Some((name.to_string(), version.to_string()))
673    }
674}
675
676impl Default for PythonCrawler {
677    fn default() -> Self {
678        Self::new()
679    }
680}
681
/// Turn the stdout of the `site` query (`site.getsitepackages()` entries
/// followed by `site.getusersitepackages()`, one per line) into paths.
///
/// Every line is trimmed; lines that are empty after trimming are dropped,
/// and the rest are returned in order as `PathBuf`s. Kept as a pure function
/// so the parsing can be unit-tested without a Python interpreter.
pub fn parse_python_site_packages_output(stdout: &str) -> Vec<PathBuf> {
    stdout
        .lines()
        .filter_map(|raw| {
            let cleaned = raw.trim();
            if cleaned.is_empty() {
                None
            } else {
                Some(PathBuf::from(cleaned))
            }
        })
        .collect()
}
695
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal METADATA headers for the `Requests 2.28.0` fixture.
    const REQUESTS_METADATA: &str = "Metadata-Version: 2.1\nName: Requests\nVersion: 2.28.0\n";

    /// Create `root/<parts...>` (with all parents) and return the full path.
    async fn make_dirs(root: &Path, parts: &[&str]) -> PathBuf {
        let mut dir = root.to_path_buf();
        for part in parts {
            dir.push(part);
        }
        tokio::fs::create_dir_all(&dir).await.unwrap();
        dir
    }

    /// Create `<parent>/<dir_name>/METADATA` with the given contents and
    /// return the dist-info directory.
    async fn write_dist_info(parent: &Path, dir_name: &str, metadata: &str) -> PathBuf {
        let dist_info = make_dirs(parent, &[dir_name]).await;
        tokio::fs::write(dist_info.join("METADATA"), metadata)
            .await
            .unwrap();
        dist_info
    }

    #[test]
    fn test_canonicalize_pypi_name_basic() {
        let cases = [
            ("Requests", "requests"),
            ("my_package", "my-package"),
            ("My.Package", "my-package"),
            ("My-._Package", "my-package"),
        ];
        for (raw, expected) in cases {
            assert_eq!(canonicalize_pypi_name(raw), expected);
        }
    }

    #[test]
    fn test_canonicalize_pypi_name_runs() {
        // Runs of separators collapse to single -
        for raw in ["a__b", "a-.-b", "a_._-b"] {
            assert_eq!(canonicalize_pypi_name(raw), "a-b");
        }
    }

    #[test]
    fn test_canonicalize_pypi_name_trim() {
        assert_eq!(canonicalize_pypi_name("  requests  "), "requests");
    }

    #[test]
    fn test_parse_pypi_purl() {
        let parsed = PythonCrawler::parse_pypi_purl("pkg:pypi/requests@2.28.0");
        assert_eq!(
            parsed,
            Some(("requests".to_string(), "2.28.0".to_string()))
        );
    }

    #[test]
    fn test_parse_pypi_purl_with_qualifiers() {
        let parsed = PythonCrawler::parse_pypi_purl("pkg:pypi/requests@2.28.0?artifact_id=abc");
        assert_eq!(
            parsed,
            Some(("requests".to_string(), "2.28.0".to_string()))
        );
    }

    #[test]
    fn test_parse_pypi_purl_invalid() {
        for bad in ["pkg:npm/lodash@4.17.21", "not-a-purl"] {
            assert!(PythonCrawler::parse_pypi_purl(bad).is_none());
        }
    }

    #[tokio::test]
    async fn test_read_python_metadata_valid() {
        let tmp = tempfile::tempdir().unwrap();
        let dist_info = write_dist_info(
            tmp.path(),
            "requests-2.28.0.dist-info",
            "Metadata-Version: 2.1\nName: Requests\nVersion: 2.28.0\n\nSome description",
        )
        .await;

        let (name, version) = read_python_metadata(&dist_info)
            .await
            .expect("METADATA with Name and Version should parse");
        assert_eq!(name, "Requests");
        assert_eq!(version, "2.28.0");
    }

    #[tokio::test]
    async fn test_read_python_metadata_missing() {
        let tmp = tempfile::tempdir().unwrap();
        let absent = tmp.path().join("nonexistent.dist-info");
        assert!(read_python_metadata(&absent).await.is_none());
    }

    #[tokio::test]
    async fn test_find_python_dirs_literal() {
        let tmp = tempfile::tempdir().unwrap();
        let target = make_dirs(tmp.path(), &["lib", "python3.11", "site-packages"]).await;

        let hits = find_python_dirs(tmp.path(), &["lib", "python3.*", "site-packages"]).await;
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0], target);
    }

    #[tokio::test]
    async fn test_find_python_dirs_wildcard() {
        let tmp = tempfile::tempdir().unwrap();
        make_dirs(tmp.path(), &["lib", "python3.10", "site-packages"]).await;
        make_dirs(tmp.path(), &["lib", "python3.11", "site-packages"]).await;

        // Also create a non-matching dir
        make_dirs(tmp.path(), &["lib", "ruby3.0", "site-packages"]).await;

        let hits = find_python_dirs(tmp.path(), &["lib", "python3.*", "site-packages"]).await;
        assert_eq!(hits.len(), 2);
    }

    #[tokio::test]
    async fn test_find_python_dirs_star_wildcard() {
        let tmp = tempfile::tempdir().unwrap();
        let tool_sp = make_dirs(
            tmp.path(),
            &["tools", "mytool", "lib", "python3.11", "site-packages"],
        )
        .await;

        let hits = find_python_dirs(
            tmp.path(),
            &["tools", "*", "lib", "python3.*", "site-packages"],
        )
        .await;
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0], tool_sp);
    }

    #[tokio::test]
    async fn test_find_python_dirs_pyenv_layout() {
        // Create a pyenv-like layout: versions/3.11.5/lib/python3.11/site-packages
        let tmp = tempfile::tempdir().unwrap();
        let sp_311 = make_dirs(
            tmp.path(),
            &["versions", "3.11.5", "lib", "python3.11", "site-packages"],
        )
        .await;
        let sp_312 = make_dirs(
            tmp.path(),
            &["versions", "3.12.0", "lib", "python3.12", "site-packages"],
        )
        .await;

        let hits = find_python_dirs(
            &tmp.path().join("versions"),
            &["*", "lib", "python3.*", "site-packages"],
        )
        .await;
        assert_eq!(hits.len(), 2);
        assert!(hits.contains(&sp_311));
        assert!(hits.contains(&sp_312));
    }

    #[tokio::test]
    async fn test_crawl_all_python() {
        let tmp = tempfile::tempdir().unwrap();

        // Project-local venv using the layout of the compilation target.
        #[cfg(windows)]
        let venv_layout: &[&str] = &[".venv", "Lib", "site-packages"];
        #[cfg(not(windows))]
        let venv_layout: &[&str] = &[".venv", "lib", "python3.11", "site-packages"];
        let site_packages = make_dirs(tmp.path(), venv_layout).await;

        // Create a dist-info dir with METADATA
        write_dist_info(&site_packages, "requests-2.28.0.dist-info", REQUESTS_METADATA).await;

        let options = CrawlerOptions {
            cwd: tmp.path().to_path_buf(),
            global: false,
            global_prefix: None,
            batch_size: 100,
        };

        let packages = PythonCrawler::new().crawl_all(&options).await;
        assert_eq!(packages.len(), 1);

        let only = &packages[0];
        assert_eq!(only.name, "requests");
        assert_eq!(only.version, "2.28.0");
        assert_eq!(only.purl, "pkg:pypi/requests@2.28.0");
        assert!(only.namespace.is_none());
    }

    #[test]
    fn test_find_python_command() {
        // Python may legitimately be absent, so `None` is accepted; when a
        // command is reported it must be one of the known candidates.
        if let Some(cmd) = find_python_command() {
            assert!(
                ["python3", "python", "py"].contains(&cmd),
                "unexpected command: {cmd}"
            );
        }
    }

    #[test]
    fn test_home_dir_detection() {
        // Lookup chain: HOME, then USERPROFILE, with "~" standing in for
        // "neither is set".
        let home = match std::env::var("HOME").or_else(|_| std::env::var("USERPROFILE")) {
            Ok(dir) => dir,
            Err(_) => "~".to_string(),
        };
        // On any CI or dev machine, we should get a real path, not "~"
        assert_ne!(home, "~", "expected a real home directory");
        assert!(!home.is_empty());
    }

    #[tokio::test]
    async fn test_find_by_purls_python() {
        let tmp = tempfile::tempdir().unwrap();
        let site_packages = tmp.path().to_path_buf();

        // Create dist-info
        write_dist_info(&site_packages, "requests-2.28.0.dist-info", REQUESTS_METADATA).await;

        let purls = vec![
            "pkg:pypi/requests@2.28.0".to_string(),
            "pkg:pypi/flask@3.0.0".to_string(),
        ];

        let matched = PythonCrawler::new()
            .find_by_purls(&site_packages, &purls)
            .await
            .unwrap();
        assert_eq!(matched.len(), 1);
        assert!(matched.contains_key("pkg:pypi/requests@2.28.0"));
        assert!(!matched.contains_key("pkg:pypi/flask@3.0.0"));
    }
}