Skip to main content

socket_patch_core/crawlers/
python_crawler.rs

1use std::collections::{HashMap, HashSet};
2use std::path::{Path, PathBuf};
3
4use super::types::{CrawledPackage, CrawlerOptions};
5use crate::utils::process::{CommandRunner, SystemCommandRunner};
6
7// ---------------------------------------------------------------------------
8// Python command discovery
9// ---------------------------------------------------------------------------
10
11/// Find a working Python command on the system.
12///
13/// Tries `python3`, `python`, and `py` (Windows launcher) in order,
14/// returning the first one that responds to `--version`.
15pub fn find_python_command() -> Option<&'static str> {
16    find_python_command_with(&SystemCommandRunner)
17}
18
19/// Version of `find_python_command` that accepts an injected
20/// `CommandRunner`. Tests inject a `MockCommandRunner` that returns
21/// `Some(...)` for `python3 --version` to exercise the success arm
22/// without a real Python on PATH.
23pub fn find_python_command_with(runner: &dyn CommandRunner) -> Option<&'static str> {
24    ["python3", "python", "py"]
25        .into_iter()
26        .find(|cmd| runner.run(cmd, &["--version"]).is_some())
27}
28
/// Default batch size for crawling.
///
/// NOTE(review): nothing in the visible part of this file reads this
/// constant; the leading underscore presumably exists to keep the
/// dead-code lint quiet — confirm whether it is still needed.
const _DEFAULT_BATCH_SIZE: usize = 100;
31
32// ---------------------------------------------------------------------------
33// PEP 503 name canonicalization
34// ---------------------------------------------------------------------------
35
/// Canonicalize a Python package name per PEP 503.
///
/// The input is trimmed and lowercased, then every maximal run of the
/// separator characters `-`, `_`, and `.` is collapsed into a single
/// `-`. All other characters are copied through unchanged. Leading and
/// trailing separator runs are collapsed too, not removed.
pub fn canonicalize_pypi_name(name: &str) -> String {
    let lowered = name.trim().to_lowercase();
    let mut canonical = String::with_capacity(lowered.len());
    let mut prev_was_separator = false;

    for c in lowered.chars() {
        let is_separator = matches!(c, '-' | '_' | '.');
        match (is_separator, prev_was_separator) {
            // Ordinary character: keep it as-is.
            (false, _) => canonical.push(c),
            // First separator of a run: emit the single replacement dash.
            (true, false) => canonical.push('-'),
            // Later separators in the same run are dropped.
            (true, true) => {}
        }
        prev_was_separator = is_separator;
    }

    canonical
}
59
60// ---------------------------------------------------------------------------
61// Helpers: read Python metadata from dist-info
62// ---------------------------------------------------------------------------
63
64/// Read `Name` and `Version` for a `.dist-info` directory.
65///
66/// Primary source is the `.dist-info/METADATA` header block. When that
67/// file is missing or malformed (no usable `Name`/`Version`), fall back
68/// to the `<name>-<version>.dist-info` directory name so a corrupt or
69/// partially-written install does not make the package invisible to the
70/// crawler — a real risk for a tool whose job is to find and patch
71/// packages. The fallback only fires for an actual directory, guarding
72/// against a stray `*.dist-info` file masquerading as an install.
73pub async fn read_python_metadata(dist_info_path: &Path) -> Option<(String, String)> {
74    if let Some(found) = parse_metadata_headers(dist_info_path).await {
75        return Some(found);
76    }
77
78    let is_dir = tokio::fs::metadata(dist_info_path)
79        .await
80        .map(|m| m.is_dir())
81        .unwrap_or(false);
82    if !is_dir {
83        return None;
84    }
85    let dir_name = dist_info_path.file_name()?.to_string_lossy();
86    parse_dist_info_dir_name(&dir_name)
87}
88
89/// Parse the `Name`/`Version` headers from `<dist-info>/METADATA`.
90///
91/// Returns `None` if the file is absent, unreadable, or does not yield a
92/// non-empty `Name` and `Version` before the header/body separator.
93async fn parse_metadata_headers(dist_info_path: &Path) -> Option<(String, String)> {
94    let metadata_path = dist_info_path.join("METADATA");
95    let content = tokio::fs::read_to_string(&metadata_path).await.ok()?;
96
97    let mut name: Option<String> = None;
98    let mut version: Option<String> = None;
99
100    for line in content.lines() {
101        if name.is_some() && version.is_some() {
102            break;
103        }
104        if let Some(rest) = line.strip_prefix("Name:") {
105            name = Some(rest.trim().to_string());
106        } else if let Some(rest) = line.strip_prefix("Version:") {
107            version = Some(rest.trim().to_string());
108        }
109        // Stop at first empty line (end of headers)
110        if line.trim().is_empty() && (name.is_some() || version.is_some()) {
111            break;
112        }
113    }
114
115    match (name, version) {
116        (Some(n), Some(v)) if !n.is_empty() && !v.is_empty() => Some((n, v)),
117        _ => None,
118    }
119}
120
/// Split a `<name>-<version>.dist-info` directory name into
/// `(name, version)`.
///
/// The split happens at the *last* `-`: a PEP 440 version never contains
/// `-` (pre-release and local segments normalize to `aN`/`+local`), so
/// the final dash is the name/version boundary even when the
/// distribution name itself contains dashes (older pip kept the raw
/// name; newer pip escapes it to `_`). The name is returned verbatim —
/// canonicalization is left to the caller.
///
/// Returns `None` when the `.dist-info` suffix is missing, when there is
/// no `-` at all, or when either side of the split is empty.
fn parse_dist_info_dir_name(dir_name: &str) -> Option<(String, String)> {
    dir_name
        .strip_suffix(".dist-info")?
        .rsplit_once('-')
        .filter(|(name, version)| !name.is_empty() && !version.is_empty())
        .map(|(name, version)| (name.to_owned(), version.to_owned()))
}
138
139// ---------------------------------------------------------------------------
140// Helpers: find Python directories with wildcard matching
141// ---------------------------------------------------------------------------
142
143/// Find directories matching a path pattern with wildcard segments.
144///
145/// Supported wildcards:
146/// - `"python3.*"` — matches directory entries starting with `python3.`
147/// - `"*"` — matches any directory entry
148///
149/// All other segments are treated as literal path components.
150pub async fn find_python_dirs(base_path: &Path, segments: &[&str]) -> Vec<PathBuf> {
151    let mut results = Vec::new();
152
153    // Check that base_path is a directory
154    match tokio::fs::metadata(base_path).await {
155        Ok(m) if m.is_dir() => {}
156        _ => return results,
157    }
158
159    if segments.is_empty() {
160        results.push(base_path.to_path_buf());
161        return results;
162    }
163
164    let first = segments[0];
165    let rest = &segments[1..];
166
167    if first == "python3.*" {
168        // Wildcard: list directory and match python3.X entries
169        for entry in crate::utils::fs::list_dir_entries(base_path).await {
170            if !crate::utils::fs::entry_is_dir(&entry).await {
171                continue;
172            }
173            let name = entry.file_name();
174            let name_str = name.to_string_lossy();
175            if name_str.starts_with("python3.") {
176                let sub =
177                    Box::pin(find_python_dirs(&base_path.join(entry.file_name()), rest)).await;
178                results.extend(sub);
179            }
180        }
181    } else if first == "*" {
182        // Generic wildcard: match any directory entry
183        for entry in crate::utils::fs::list_dir_entries(base_path).await {
184            if !crate::utils::fs::entry_is_dir(&entry).await {
185                continue;
186            }
187            let sub = Box::pin(find_python_dirs(&base_path.join(entry.file_name()), rest)).await;
188            results.extend(sub);
189        }
190    } else {
191        // Literal segment: just check if it exists
192        let sub = Box::pin(find_python_dirs(&base_path.join(first), rest)).await;
193        results.extend(sub);
194    }
195
196    results
197}
198
199// ---------------------------------------------------------------------------
200// Helpers: site-packages discovery
201// ---------------------------------------------------------------------------
202
203/// Find `site-packages` (or `dist-packages`) directories under a base dir.
204///
205/// Handles both Unix (`lib/python3.X/site-packages`) and macOS/Linux layouts.
206pub async fn find_site_packages_under(
207    base_dir: &Path,
208    sub_dir_type: &str, // "site-packages" or "dist-packages"
209) -> Vec<PathBuf> {
210    #[cfg(windows)]
211    {
212        find_python_dirs(base_dir, &["Lib", sub_dir_type]).await
213    }
214    #[cfg(not(windows))]
215    {
216        find_python_dirs(base_dir, &["lib", "python3.*", sub_dir_type]).await
217    }
218}
219
220/// Find local virtual environment `site-packages` directories.
221///
222/// Checks (in order):
223/// 1. `VIRTUAL_ENV` environment variable
224/// 2. `.venv` directory in `cwd`
225/// 3. `venv` directory in `cwd`
226pub async fn find_local_venv_site_packages(cwd: &Path) -> Vec<PathBuf> {
227    let mut results = Vec::new();
228
229    // 1. Check VIRTUAL_ENV env var
230    if let Ok(virtual_env) = std::env::var("VIRTUAL_ENV") {
231        let venv_path = PathBuf::from(&virtual_env);
232        let matches = find_site_packages_under(&venv_path, "site-packages").await;
233        results.extend(matches);
234        if !results.is_empty() {
235            return results;
236        }
237    }
238
239    // 2. Check .venv and venv in cwd
240    for venv_dir in &[".venv", "venv"] {
241        let venv_path = cwd.join(venv_dir);
242        let matches = find_site_packages_under(&venv_path, "site-packages").await;
243        results.extend(matches);
244    }
245
246    results
247}
248
249/// Get global/system Python `site-packages` directories.
250///
251/// Queries `python3` for site-packages paths, then checks well-known system
252/// locations including Homebrew, conda, uv tools, pip --user, etc.
253pub async fn get_global_python_site_packages() -> Vec<PathBuf> {
254    let mut results = Vec::new();
255    let mut seen = HashSet::new();
256
257    let add_path = |p: PathBuf, seen: &mut HashSet<PathBuf>, results: &mut Vec<PathBuf>| {
258        let resolved = if p.is_absolute() {
259            p
260        } else {
261            std::path::absolute(&p).unwrap_or(p)
262        };
263        if seen.insert(resolved.clone()) {
264            results.push(resolved);
265        }
266    };
267
268    // 1. Ask Python for site-packages
269    if let Some(python_cmd) = find_python_command() {
270        let runner = SystemCommandRunner;
271        if let Some(stdout) = runner.run(
272            python_cmd,
273            &[
274                "-c",
275                "import site; print('\\n'.join(site.getsitepackages())); print(site.getusersitepackages())",
276            ],
277        ) {
278            for p in parse_python_site_packages_output(&stdout) {
279                add_path(p, &mut seen, &mut results);
280            }
281        }
282    }
283
284    // 2. Well-known system paths
285    let home_dir = std::env::var("HOME")
286        .or_else(|_| std::env::var("USERPROFILE"))
287        .unwrap_or_else(|_| "~".to_string());
288
289    // Helper closure to scan base/lib/python3.*/[dist|site]-packages
290    async fn scan_well_known(
291        base: &Path,
292        pkg_type: &str,
293        seen: &mut HashSet<PathBuf>,
294        results: &mut Vec<PathBuf>,
295    ) {
296        let matches = find_python_dirs(base, &["lib", "python3.*", pkg_type]).await;
297        for m in matches {
298            let resolved = if m.is_absolute() {
299                m
300            } else {
301                std::path::absolute(&m).unwrap_or(m)
302            };
303            if seen.insert(resolved.clone()) {
304                results.push(resolved);
305            }
306        }
307    }
308
309    #[cfg(not(windows))]
310    {
311        // Debian/Ubuntu
312        scan_well_known(Path::new("/usr"), "dist-packages", &mut seen, &mut results).await;
313        scan_well_known(Path::new("/usr"), "site-packages", &mut seen, &mut results).await;
314        // Debian pip / most distros / macOS
315        scan_well_known(
316            Path::new("/usr/local"),
317            "dist-packages",
318            &mut seen,
319            &mut results,
320        )
321        .await;
322        scan_well_known(
323            Path::new("/usr/local"),
324            "site-packages",
325            &mut seen,
326            &mut results,
327        )
328        .await;
329        // pip --user on Unix
330        let user_local = PathBuf::from(&home_dir).join(".local");
331        scan_well_known(&user_local, "site-packages", &mut seen, &mut results).await;
332    }
333
334    // macOS-specific
335    #[cfg(target_os = "macos")]
336    {
337        scan_well_known(
338            Path::new("/opt/homebrew"),
339            "site-packages",
340            &mut seen,
341            &mut results,
342        )
343        .await;
344
345        // Python.org framework: /Library/Frameworks/Python.framework/Versions/
346        // holds bare version dirs (`3.11`, `3.12`, `Current`) — NOT `python3.X`
347        // — so the version segment must be matched with `*`, not `python3.*`.
348        let fw_matches = find_python_dirs(
349            Path::new("/Library/Frameworks/Python.framework"),
350            &["Versions", "*", "lib", "python3.*", "site-packages"],
351        )
352        .await;
353        for m in fw_matches {
354            add_path(m, &mut seen, &mut results);
355        }
356    }
357
358    // Windows-specific
359    #[cfg(windows)]
360    {
361        // pip --user on Windows: %APPDATA%\Python\PythonXY\site-packages
362        if let Ok(appdata) = std::env::var("APPDATA") {
363            let appdata_python = PathBuf::from(&appdata).join("Python");
364            for entry in crate::utils::fs::list_dir_entries(&appdata_python).await {
365                let p = appdata_python.join(entry.file_name()).join("site-packages");
366                if tokio::fs::metadata(&p).await.is_ok() {
367                    add_path(p, &mut seen, &mut results);
368                }
369            }
370        }
371        // Common Windows Python install locations
372        for base in &["C:\\Python", "C:\\Program Files\\Python"] {
373            for entry in crate::utils::fs::list_dir_entries(Path::new(base)).await {
374                let sp = PathBuf::from(base)
375                    .join(entry.file_name())
376                    .join("Lib")
377                    .join("site-packages");
378                if tokio::fs::metadata(&sp).await.is_ok() {
379                    add_path(sp, &mut seen, &mut results);
380                }
381            }
382        }
383        // Microsoft Store / python.org via LocalAppData
384        if let Ok(local) = std::env::var("LOCALAPPDATA") {
385            let programs_python = PathBuf::from(&local).join("Programs").join("Python");
386            for entry in crate::utils::fs::list_dir_entries(&programs_python).await {
387                let sp = programs_python
388                    .join(entry.file_name())
389                    .join("Lib")
390                    .join("site-packages");
391                if tokio::fs::metadata(&sp).await.is_ok() {
392                    add_path(sp, &mut seen, &mut results);
393                }
394            }
395        }
396    }
397
398    // pyenv (works on macOS and Linux)
399    #[cfg(not(windows))]
400    {
401        let pyenv_root = std::env::var("PYENV_ROOT")
402            .map(PathBuf::from)
403            .unwrap_or_else(|_| PathBuf::from(&home_dir).join(".pyenv"));
404        let pyenv_versions = pyenv_root.join("versions");
405        let pyenv_matches =
406            find_python_dirs(&pyenv_versions, &["*", "lib", "python3.*", "site-packages"]).await;
407        for m in pyenv_matches {
408            add_path(m, &mut seen, &mut results);
409        }
410    }
411
412    // Conda
413    let anaconda = PathBuf::from(&home_dir).join("anaconda3");
414    scan_well_known(&anaconda, "site-packages", &mut seen, &mut results).await;
415    let miniconda = PathBuf::from(&home_dir).join("miniconda3");
416    scan_well_known(&miniconda, "site-packages", &mut seen, &mut results).await;
417
418    // uv tools — platform-specific install root.
419    #[cfg(target_os = "macos")]
420    {
421        let uv_base = PathBuf::from(&home_dir)
422            .join("Library")
423            .join("Application Support")
424            .join("uv")
425            .join("tools");
426        let uv_matches =
427            find_python_dirs(&uv_base, &["*", "lib", "python3.*", "site-packages"]).await;
428        for m in uv_matches {
429            add_path(m, &mut seen, &mut results);
430        }
431    }
432    #[cfg(windows)]
433    {
434        // %LOCALAPPDATA%\uv\tools
435        if let Ok(local) = std::env::var("LOCALAPPDATA") {
436            let uv_base = PathBuf::from(local).join("uv").join("tools");
437            let uv_matches = find_python_dirs(&uv_base, &["*", "Lib", "site-packages"]).await;
438            for m in uv_matches {
439                add_path(m, &mut seen, &mut results);
440            }
441        }
442    }
443    #[cfg(all(not(target_os = "macos"), not(windows)))]
444    {
445        let uv_base = PathBuf::from(&home_dir)
446            .join(".local")
447            .join("share")
448            .join("uv")
449            .join("tools");
450        let uv_matches =
451            find_python_dirs(&uv_base, &["*", "lib", "python3.*", "site-packages"]).await;
452        for m in uv_matches {
453            add_path(m, &mut seen, &mut results);
454        }
455    }
456
457    // uv-managed Python interpreters (`uv python install 3.X`) live at:
458    //   Linux/macOS: ~/.local/share/uv/python/cpython-3.X.*/lib/python3.X/site-packages/
459    //   Windows:     %LOCALAPPDATA%\uv\python\cpython-3.X.*\Lib\site-packages\
460    // The typical flow is `uv venv` + `uv pip install`, where the venv layout
461    // is already covered by `find_local_venv_site_packages`. But power users
462    // can install packages directly into the managed interpreter (e.g. via
463    // `<uv-python>/bin/pip install ...`), and globally-discovered crawls
464    // should surface those.
465    #[cfg(not(windows))]
466    {
467        let uv_python = PathBuf::from(&home_dir)
468            .join(".local")
469            .join("share")
470            .join("uv")
471            .join("python");
472        let uv_matches =
473            find_python_dirs(&uv_python, &["*", "lib", "python3.*", "site-packages"]).await;
474        for m in uv_matches {
475            add_path(m, &mut seen, &mut results);
476        }
477    }
478    #[cfg(windows)]
479    {
480        if let Ok(local) = std::env::var("LOCALAPPDATA") {
481            let uv_python = PathBuf::from(local).join("uv").join("python");
482            let uv_matches = find_python_dirs(&uv_python, &["*", "Lib", "site-packages"]).await;
483            for m in uv_matches {
484                add_path(m, &mut seen, &mut results);
485            }
486        }
487    }
488
489    results
490}
491
492/// Returns true if `cwd` looks like a Python project root.
493///
494/// Used by `PythonCrawler::get_site_packages_paths` to decide
495/// whether to fall back to the global-discovery path when no venv
496/// was found. Mirrors `is_dotnet_project` in nuget_crawler and the
497/// `has_gemfile || has_gemfile_lock` check in ruby_crawler.
498///
499/// The list intentionally covers all major Python toolchains:
500///   * `pyproject.toml` — PEP 518 / 621 (poetry, hatch, uv, flit,
501///     setuptools-PEP-517, pdm, etc. — anything modern)
502///   * `setup.py` / `setup.cfg` — legacy setuptools
503///   * `requirements.txt` — pip-compile / bare requirements
504///   * `uv.lock` — uv-managed projects (PEP 751 export sibling is
505///     `pylock.toml` but in practice `uv.lock` is what ships)
506async fn is_python_project(cwd: &Path) -> bool {
507    let markers = [
508        "pyproject.toml",
509        "setup.py",
510        "setup.cfg",
511        "requirements.txt",
512        "uv.lock",
513    ];
514    for m in &markers {
515        if tokio::fs::metadata(cwd.join(m)).await.is_ok() {
516            return true;
517        }
518    }
519    false
520}
521
522// ---------------------------------------------------------------------------
523// PythonCrawler
524// ---------------------------------------------------------------------------
525
/// Python ecosystem crawler for discovering packages in `site-packages`.
///
/// A field-less unit struct: it carries no state of its own, and its
/// methods take everything they need as arguments.
pub struct PythonCrawler;
528
529impl PythonCrawler {
530    /// Create a new `PythonCrawler`.
531    pub fn new() -> Self {
532        Self
533    }
534
535    /// Get `site-packages` paths based on options.
536    ///
537    /// Local-mode discovery has two stages:
538    ///   1. `find_local_venv_site_packages` — handles `VIRTUAL_ENV`,
539    ///      `.venv`, and `venv` directories (covers the common case
540    ///      of an activated or project-local venv).
541    ///   2. If no venv was found AND the cwd looks like a Python
542    ///      project (`pyproject.toml`, `setup.py`, `setup.cfg`,
543    ///      `requirements.txt`, or `uv.lock` present), fall through
544    ///      to `get_global_python_site_packages`. This mirrors the
545    ///      cargo / ruby / go pattern where a project marker
546    ///      indicates "scan this ecosystem globally for this project".
547    ///
548    /// Without the marker fallback, a fresh clone with
549    /// `pyproject.toml` + `uv.lock` but no `.venv` would silently
550    /// return zero packages.
551    pub async fn get_site_packages_paths(
552        &self,
553        options: &CrawlerOptions,
554    ) -> Result<Vec<PathBuf>, std::io::Error> {
555        if options.global || options.global_prefix.is_some() {
556            if let Some(ref custom) = options.global_prefix {
557                return Ok(vec![custom.clone()]);
558            }
559            return Ok(get_global_python_site_packages().await);
560        }
561        let venv_paths = find_local_venv_site_packages(&options.cwd).await;
562        if !venv_paths.is_empty() {
563            return Ok(venv_paths);
564        }
565        if is_python_project(&options.cwd).await {
566            return Ok(get_global_python_site_packages().await);
567        }
568        Ok(Vec::new())
569    }
570
571    /// Crawl all discovered `site-packages` and return every package found.
572    pub async fn crawl_all(&self, options: &CrawlerOptions) -> Vec<CrawledPackage> {
573        let mut packages = Vec::new();
574        let mut seen = HashSet::new();
575
576        let sp_paths = self
577            .get_site_packages_paths(options)
578            .await
579            .unwrap_or_default();
580
581        for sp_path in &sp_paths {
582            let found = self.scan_site_packages(sp_path, &mut seen).await;
583            packages.extend(found);
584        }
585
586        packages
587    }
588
589    /// Find specific packages by PURL.
590    ///
591    /// Accepts base PURLs (no qualifiers) — the caller should strip qualifiers
592    /// before calling.
593    pub async fn find_by_purls(
594        &self,
595        site_packages_path: &Path,
596        purls: &[String],
597    ) -> Result<HashMap<String, CrawledPackage>, std::io::Error> {
598        let mut result = HashMap::new();
599
600        // Build lookup: canonicalized-name@version -> purl
601        let mut purl_lookup: HashMap<String, &str> = HashMap::new();
602        for purl in purls {
603            if let Some((name, version)) = Self::parse_pypi_purl(purl) {
604                let key = format!("{}@{}", canonicalize_pypi_name(&name), version);
605                purl_lookup.insert(key, purl.as_str());
606            }
607        }
608
609        if purl_lookup.is_empty() {
610            return Ok(result);
611        }
612
613        // Scan all .dist-info dirs
614        for entry in crate::utils::fs::list_dir_entries(site_packages_path).await {
615            let name = entry.file_name();
616            let name_str = name.to_string_lossy();
617            if !name_str.ends_with(".dist-info") {
618                continue;
619            }
620
621            let dist_info_path = site_packages_path.join(&*name_str);
622            if let Some((raw_name, version)) = read_python_metadata(&dist_info_path).await {
623                let canon_name = canonicalize_pypi_name(&raw_name);
624                let key = format!("{canon_name}@{version}");
625
626                if let Some(&matched_purl) = purl_lookup.get(&key) {
627                    result.insert(
628                        matched_purl.to_string(),
629                        CrawledPackage {
630                            name: canon_name,
631                            version,
632                            namespace: None,
633                            purl: matched_purl.to_string(),
634                            path: site_packages_path.to_path_buf(),
635                        },
636                    );
637                }
638            }
639        }
640
641        Ok(result)
642    }
643
644    // ------------------------------------------------------------------
645    // Private helpers
646    // ------------------------------------------------------------------
647
648    /// Scan a `site-packages` directory for `.dist-info` directories.
649    async fn scan_site_packages(
650        &self,
651        site_packages_path: &Path,
652        seen: &mut HashSet<String>,
653    ) -> Vec<CrawledPackage> {
654        let mut results = Vec::new();
655
656        for entry in crate::utils::fs::list_dir_entries(site_packages_path).await {
657            let name = entry.file_name();
658            let name_str = name.to_string_lossy();
659            if !name_str.ends_with(".dist-info") {
660                continue;
661            }
662
663            let dist_info_path = site_packages_path.join(&*name_str);
664            if let Some((raw_name, version)) = read_python_metadata(&dist_info_path).await {
665                let canon_name = canonicalize_pypi_name(&raw_name);
666                let purl = format!("pkg:pypi/{canon_name}@{version}");
667
668                if seen.contains(&purl) {
669                    continue;
670                }
671                seen.insert(purl.clone());
672
673                results.push(CrawledPackage {
674                    name: canon_name,
675                    version,
676                    namespace: None,
677                    purl,
678                    path: site_packages_path.to_path_buf(),
679                });
680            }
681        }
682
683        results
684    }
685
686    /// Parse a PyPI PURL string to extract name and version.
687    /// Strips qualifiers before parsing.
688    fn parse_pypi_purl(purl: &str) -> Option<(String, String)> {
689        // Strip qualifiers
690        let base = match purl.find('?') {
691            Some(idx) => &purl[..idx],
692            None => purl,
693        };
694
695        let rest = base.strip_prefix("pkg:pypi/")?;
696        let at_idx = rest.rfind('@')?;
697        let name = &rest[..at_idx];
698        let version = &rest[at_idx + 1..];
699
700        if name.is_empty() || version.is_empty() {
701            return None;
702        }
703
704        Some((name.to_string(), version.to_string()))
705    }
706}
707
708impl Default for PythonCrawler {
709    fn default() -> Self {
710        Self::new()
711    }
712}
713
/// Turn the stdout of `python -c "import site; print(...);
/// print(site.getusersitepackages())"` into a list of paths.
///
/// Each output line is trimmed; lines that are empty after trimming are
/// dropped, and every remaining line becomes one `PathBuf`, in output
/// order. No filesystem access happens here, which keeps the
/// path-derivation logic unit-testable without a real Python
/// interpreter.
pub fn parse_python_site_packages_output(stdout: &str) -> Vec<PathBuf> {
    let mut paths = Vec::new();
    for raw_line in stdout.lines() {
        let line = raw_line.trim();
        if !line.is_empty() {
            paths.push(PathBuf::from(line));
        }
    }
    paths
}
727
728#[cfg(test)]
729mod tests {
730    use super::*;
731
732    #[test]
733    fn test_canonicalize_pypi_name_basic() {
734        assert_eq!(canonicalize_pypi_name("Requests"), "requests");
735        assert_eq!(canonicalize_pypi_name("my_package"), "my-package");
736        assert_eq!(canonicalize_pypi_name("My.Package"), "my-package");
737        assert_eq!(canonicalize_pypi_name("My-._Package"), "my-package");
738    }
739
740    #[test]
741    fn test_canonicalize_pypi_name_runs() {
742        // Runs of separators collapse to single -
743        assert_eq!(canonicalize_pypi_name("a__b"), "a-b");
744        assert_eq!(canonicalize_pypi_name("a-.-b"), "a-b");
745        assert_eq!(canonicalize_pypi_name("a_._-b"), "a-b");
746    }
747
748    #[test]
749    fn test_canonicalize_pypi_name_trim() {
750        assert_eq!(canonicalize_pypi_name("  requests  "), "requests");
751    }
752
753    #[test]
754    fn test_parse_pypi_purl() {
755        let (name, ver) = PythonCrawler::parse_pypi_purl("pkg:pypi/requests@2.28.0").unwrap();
756        assert_eq!(name, "requests");
757        assert_eq!(ver, "2.28.0");
758    }
759
760    #[test]
761    fn test_parse_pypi_purl_with_qualifiers() {
762        let (name, ver) =
763            PythonCrawler::parse_pypi_purl("pkg:pypi/requests@2.28.0?artifact_id=abc").unwrap();
764        assert_eq!(name, "requests");
765        assert_eq!(ver, "2.28.0");
766    }
767
768    #[test]
769    fn test_parse_pypi_purl_invalid() {
770        assert!(PythonCrawler::parse_pypi_purl("pkg:npm/lodash@4.17.21").is_none());
771        assert!(PythonCrawler::parse_pypi_purl("not-a-purl").is_none());
772    }
773
774    #[tokio::test]
775    async fn test_read_python_metadata_valid() {
776        let dir = tempfile::tempdir().unwrap();
777        let dist_info = dir.path().join("requests-2.28.0.dist-info");
778        tokio::fs::create_dir_all(&dist_info).await.unwrap();
779        tokio::fs::write(
780            dist_info.join("METADATA"),
781            "Metadata-Version: 2.1\nName: Requests\nVersion: 2.28.0\n\nSome description",
782        )
783        .await
784        .unwrap();
785
786        let result = read_python_metadata(&dist_info).await;
787        assert!(result.is_some());
788        let (name, version) = result.unwrap();
789        assert_eq!(name, "Requests");
790        assert_eq!(version, "2.28.0");
791    }
792
793    #[tokio::test]
794    async fn test_read_python_metadata_missing() {
795        let dir = tempfile::tempdir().unwrap();
796        let dist_info = dir.path().join("nonexistent.dist-info");
797        assert!(read_python_metadata(&dist_info).await.is_none());
798    }
799
/// Exercises `parse_dist_info_dir_name` against directory names it must
/// accept (yielding a name/version pair) and names it must reject.
#[test]
fn test_parse_dist_info_dir_name() {
    // (directory name, expected name, expected version)
    let accepted = [
        // Modern pip escapes `-` in the name to `_`.
        ("flask_sqlalchemy-3.0.5.dist-info", "flask_sqlalchemy", "3.0.5"),
        // Older pip kept the raw name with `-`; the final `-` is still the
        // version boundary because a normalized version never contains `-`.
        ("Flask-SQLAlchemy-3.0.5.dist-info", "Flask-SQLAlchemy", "3.0.5"),
        ("requests-2.28.0.dist-info", "requests", "2.28.0"),
    ];
    for (dir_name, name, version) in accepted {
        assert_eq!(
            parse_dist_info_dir_name(dir_name),
            Some((name.to_string(), version.to_string()))
        );
    }

    // No version segment, wrong suffix, and empty-name guards.
    let rejected = [
        "noversion.dist-info",
        "requests-2.28.0.egg-info",
        "-1.0.dist-info",
    ];
    for dir_name in rejected {
        assert!(parse_dist_info_dir_name(dir_name).is_none());
    }
}
822
823    /// A `.dist-info` directory whose `METADATA` is missing must still be
824    /// discoverable via the directory name — otherwise a corrupt/partial
825    /// install silently hides a package the crawler is meant to patch.
826    #[tokio::test]
827    async fn test_read_python_metadata_falls_back_to_dir_name() {
828        let dir = tempfile::tempdir().unwrap();
829        let dist_info = dir.path().join("requests-2.28.0.dist-info");
830        tokio::fs::create_dir_all(&dist_info).await.unwrap();
831        // No METADATA file written at all.
832        let (name, version) = read_python_metadata(&dist_info).await.unwrap();
833        assert_eq!(name, "requests");
834        assert_eq!(version, "2.28.0");
835    }
836
837    /// Malformed METADATA (present but missing the `Version` header) also
838    /// falls back to the directory name rather than dropping the package.
839    #[tokio::test]
840    async fn test_read_python_metadata_falls_back_on_malformed() {
841        let dir = tempfile::tempdir().unwrap();
842        let dist_info = dir.path().join("urllib3-2.0.7.dist-info");
843        tokio::fs::create_dir_all(&dist_info).await.unwrap();
844        tokio::fs::write(
845            dist_info.join("METADATA"),
846            "Metadata-Version: 2.1\nName: urllib3\n\nDescription body, no Version header\n",
847        )
848        .await
849        .unwrap();
850        let (name, version) = read_python_metadata(&dist_info).await.unwrap();
851        assert_eq!(name, "urllib3");
852        assert_eq!(version, "2.0.7");
853    }
854
855    /// A stray *file* named `*.dist-info` must NOT be surfaced as a package
856    /// via the directory-name fallback.
857    #[tokio::test]
858    async fn test_read_python_metadata_ignores_stray_file() {
859        let dir = tempfile::tempdir().unwrap();
860        let stray = dir.path().join("ghost-1.0.dist-info");
861        tokio::fs::write(&stray, b"not a dir").await.unwrap();
862        assert!(read_python_metadata(&stray).await.is_none());
863    }
864
865    /// `crawl_all` recovers a package whose METADATA is missing by parsing
866    /// the `.dist-info` directory name.
867    #[tokio::test]
868    async fn test_crawl_all_recovers_metadata_less_package() {
869        let dir = tempfile::tempdir().unwrap();
870        let venv = dir.path().join(".venv");
871        #[cfg(windows)]
872        let sp = venv.join("Lib").join("site-packages");
873        #[cfg(not(windows))]
874        let sp = venv.join("lib").join("python3.11").join("site-packages");
875        tokio::fs::create_dir_all(&sp).await.unwrap();
876        // dist-info dir exists but has no METADATA (partial install).
877        tokio::fs::create_dir_all(sp.join("flask_sqlalchemy-3.0.5.dist-info"))
878            .await
879            .unwrap();
880
881        let crawler = PythonCrawler::new();
882        let options = CrawlerOptions {
883            cwd: dir.path().to_path_buf(),
884            global: false,
885            global_prefix: None,
886            batch_size: 100,
887        };
888        let packages = crawler.crawl_all(&options).await;
889        assert_eq!(packages.len(), 1);
890        assert_eq!(packages[0].name, "flask-sqlalchemy");
891        assert_eq!(packages[0].version, "3.0.5");
892        assert_eq!(packages[0].purl, "pkg:pypi/flask-sqlalchemy@3.0.5");
893    }
894
895    /// Regression for the macOS Python.framework layout: the `Versions/`
896    /// directory holds bare version dirs (`3.11`), so the version segment
897    /// must be matched with `*`. A `python3.*` pattern matches nothing —
898    /// which is exactly the bug that was fixed.
899    #[tokio::test]
900    async fn test_find_python_dirs_framework_versions_layout() {
901        let dir = tempfile::tempdir().unwrap();
902        let sp = dir
903            .path()
904            .join("Versions")
905            .join("3.11")
906            .join("lib")
907            .join("python3.11")
908            .join("site-packages");
909        tokio::fs::create_dir_all(&sp).await.unwrap();
910
911        // Correct pattern (`*` for the version dir) finds it.
912        let ok = find_python_dirs(
913            &dir.path().join("Versions"),
914            &["*", "lib", "python3.*", "site-packages"],
915        )
916        .await;
917        assert_eq!(ok.len(), 1);
918        assert_eq!(ok[0], sp);
919
920        // The buggy pattern (`python3.*` for the version dir) matches nothing.
921        let buggy = find_python_dirs(
922            &dir.path().join("Versions"),
923            &["python3.*", "lib", "python3.*", "site-packages"],
924        )
925        .await;
926        assert!(buggy.is_empty());
927    }
928
929    #[tokio::test]
930    async fn test_find_python_dirs_literal() {
931        let dir = tempfile::tempdir().unwrap();
932        let target = dir
933            .path()
934            .join("lib")
935            .join("python3.11")
936            .join("site-packages");
937        tokio::fs::create_dir_all(&target).await.unwrap();
938
939        let results = find_python_dirs(dir.path(), &["lib", "python3.*", "site-packages"]).await;
940        assert_eq!(results.len(), 1);
941        assert_eq!(results[0], target);
942    }
943
944    #[tokio::test]
945    async fn test_find_python_dirs_wildcard() {
946        let dir = tempfile::tempdir().unwrap();
947        let sp1 = dir
948            .path()
949            .join("lib")
950            .join("python3.10")
951            .join("site-packages");
952        let sp2 = dir
953            .path()
954            .join("lib")
955            .join("python3.11")
956            .join("site-packages");
957        tokio::fs::create_dir_all(&sp1).await.unwrap();
958        tokio::fs::create_dir_all(&sp2).await.unwrap();
959
960        // Also create a non-matching dir
961        let non_match = dir.path().join("lib").join("ruby3.0").join("site-packages");
962        tokio::fs::create_dir_all(&non_match).await.unwrap();
963
964        let results = find_python_dirs(dir.path(), &["lib", "python3.*", "site-packages"]).await;
965        assert_eq!(results.len(), 2);
966    }
967
968    #[tokio::test]
969    async fn test_find_python_dirs_star_wildcard() {
970        let dir = tempfile::tempdir().unwrap();
971        let sp1 = dir
972            .path()
973            .join("tools")
974            .join("mytool")
975            .join("lib")
976            .join("python3.11")
977            .join("site-packages");
978        tokio::fs::create_dir_all(&sp1).await.unwrap();
979
980        let results = find_python_dirs(
981            dir.path(),
982            &["tools", "*", "lib", "python3.*", "site-packages"],
983        )
984        .await;
985        assert_eq!(results.len(), 1);
986        assert_eq!(results[0], sp1);
987    }
988
989    #[tokio::test]
990    async fn test_find_python_dirs_pyenv_layout() {
991        // Create a pyenv-like layout: versions/3.11.5/lib/python3.11/site-packages
992        let dir = tempfile::tempdir().unwrap();
993        let sp1 = dir
994            .path()
995            .join("versions")
996            .join("3.11.5")
997            .join("lib")
998            .join("python3.11")
999            .join("site-packages");
1000        let sp2 = dir
1001            .path()
1002            .join("versions")
1003            .join("3.12.0")
1004            .join("lib")
1005            .join("python3.12")
1006            .join("site-packages");
1007        tokio::fs::create_dir_all(&sp1).await.unwrap();
1008        tokio::fs::create_dir_all(&sp2).await.unwrap();
1009
1010        let results = find_python_dirs(
1011            &dir.path().join("versions"),
1012            &["*", "lib", "python3.*", "site-packages"],
1013        )
1014        .await;
1015        assert_eq!(results.len(), 2);
1016        assert!(results.contains(&sp1));
1017        assert!(results.contains(&sp2));
1018    }
1019
1020    #[tokio::test]
1021    async fn test_crawl_all_python() {
1022        let dir = tempfile::tempdir().unwrap();
1023        let venv = dir.path().join(".venv");
1024        #[cfg(windows)]
1025        let sp = venv.join("Lib").join("site-packages");
1026        #[cfg(not(windows))]
1027        let sp = venv.join("lib").join("python3.11").join("site-packages");
1028        tokio::fs::create_dir_all(&sp).await.unwrap();
1029
1030        // Create a dist-info dir with METADATA
1031        let dist_info = sp.join("requests-2.28.0.dist-info");
1032        tokio::fs::create_dir_all(&dist_info).await.unwrap();
1033        tokio::fs::write(
1034            dist_info.join("METADATA"),
1035            "Metadata-Version: 2.1\nName: Requests\nVersion: 2.28.0\n",
1036        )
1037        .await
1038        .unwrap();
1039
1040        let crawler = PythonCrawler::new();
1041        let options = CrawlerOptions {
1042            cwd: dir.path().to_path_buf(),
1043            global: false,
1044            global_prefix: None,
1045            batch_size: 100,
1046        };
1047
1048        let packages = crawler.crawl_all(&options).await;
1049        assert_eq!(packages.len(), 1);
1050        assert_eq!(packages[0].name, "requests");
1051        assert_eq!(packages[0].version, "2.28.0");
1052        assert_eq!(packages[0].purl, "pkg:pypi/requests@2.28.0");
1053        assert!(packages[0].namespace.is_none());
1054    }
1055
/// Smoke test for `find_python_command` against the real system.
///
/// Python may not be installed on the host, so a `None` result is accepted
/// without complaint; only a returned command is validated.
#[test]
fn test_find_python_command() {
    // The only command names the lookup is allowed to return.
    let candidates = ["python3", "python", "py"];
    if let Some(c) = find_python_command() {
        assert!(candidates.contains(&c), "unexpected command: {c}");
    }
}
1070
/// Checks that the home-directory fallback chain
/// (`HOME`, then `USERPROFILE`, then the literal `"~"`) resolves to a real
/// value on the machine running the tests.
#[test]
fn test_home_dir_detection() {
    // The first variable that can be read wins; "~" is the last resort.
    let home = ["HOME", "USERPROFILE"]
        .into_iter()
        .find_map(|key| std::env::var(key).ok())
        .unwrap_or_else(|| "~".to_string());

    // On any CI or dev machine at least one of the variables should be set.
    assert_ne!(home, "~", "expected a real home directory");
    assert!(!home.is_empty());
}
1081
1082    #[tokio::test]
1083    async fn test_find_by_purls_python() {
1084        let dir = tempfile::tempdir().unwrap();
1085        let sp = dir.path().to_path_buf();
1086
1087        // Create dist-info
1088        let dist_info = sp.join("requests-2.28.0.dist-info");
1089        tokio::fs::create_dir_all(&dist_info).await.unwrap();
1090        tokio::fs::write(
1091            dist_info.join("METADATA"),
1092            "Metadata-Version: 2.1\nName: Requests\nVersion: 2.28.0\n",
1093        )
1094        .await
1095        .unwrap();
1096
1097        let crawler = PythonCrawler::new();
1098        let purls = vec![
1099            "pkg:pypi/requests@2.28.0".to_string(),
1100            "pkg:pypi/flask@3.0.0".to_string(),
1101        ];
1102
1103        let result = crawler.find_by_purls(&sp, &purls).await.unwrap();
1104        assert_eq!(result.len(), 1);
1105        assert!(result.contains_key("pkg:pypi/requests@2.28.0"));
1106        assert!(!result.contains_key("pkg:pypi/flask@3.0.0"));
1107    }
1108}