1use std::collections::{HashMap, HashSet};
2use std::path::{Path, PathBuf};
3use std::process::{Command, Stdio};
4
5use super::types::{CrawledPackage, CrawlerOptions};
6
/// Locates a usable Python interpreter on `PATH`.
///
/// Probes `python3`, `python`, and `py` in that order by running
/// `<cmd> --version` with stdin, stdout and stderr all discarded, and returns
/// the first candidate that both launches *and* exits successfully.
///
/// Requiring a successful exit status (rather than merely a successful spawn)
/// skips placeholder executables that can be started but do not actually
/// provide a working interpreter.
///
/// Returns `None` when no candidate qualifies. This is a blocking call: it
/// waits for each probe process to finish.
pub fn find_python_command() -> Option<&'static str> {
    ["python3", "python", "py"].into_iter().find(|cmd| {
        Command::new(cmd)
            .arg("--version")
            .stdin(Stdio::null())
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .status()
            // A spawn failure and a non-zero exit both mean "not usable".
            .map(|status| status.success())
            .unwrap_or(false)
    })
}
26
// NOTE(review): presumably the default for `CrawlerOptions::batch_size` — confirm
// against callers; nothing in the visible part of this file reads it.
/// Default batch size (100). The leading underscore keeps the compiler from
/// warning that the constant is unused.
const _DEFAULT_BATCH_SIZE: usize = 100;
29
/// Normalizes a PyPI project name for comparison (PEP 503 style).
///
/// The input is trimmed and lowercased, then every run of one or more `-`,
/// `_` or `.` characters is collapsed into a single `-`. All other characters
/// are kept as they are.
///
/// For example `"My-._Package"` becomes `"my-package"`.
pub fn canonicalize_pypi_name(name: &str) -> String {
    let lowered = name.trim().to_lowercase();
    let mut canonical = String::with_capacity(lowered.len());
    // True while the previous character was a separator, so that only the
    // first separator of a run emits a dash.
    let mut prev_was_separator = false;

    for c in lowered.chars() {
        let is_separator = matches!(c, '-' | '_' | '.');
        if !is_separator {
            canonical.push(c);
        } else if !prev_was_separator {
            canonical.push('-');
        }
        prev_was_separator = is_separator;
    }

    canonical
}
57
58pub async fn read_python_metadata(dist_info_path: &Path) -> Option<(String, String)> {
64 let metadata_path = dist_info_path.join("METADATA");
65 let content = tokio::fs::read_to_string(&metadata_path).await.ok()?;
66
67 let mut name: Option<String> = None;
68 let mut version: Option<String> = None;
69
70 for line in content.lines() {
71 if name.is_some() && version.is_some() {
72 break;
73 }
74 if let Some(rest) = line.strip_prefix("Name:") {
75 name = Some(rest.trim().to_string());
76 } else if let Some(rest) = line.strip_prefix("Version:") {
77 version = Some(rest.trim().to_string());
78 }
79 if line.trim().is_empty() && (name.is_some() || version.is_some()) {
81 break;
82 }
83 }
84
85 match (name, version) {
86 (Some(n), Some(v)) if !n.is_empty() && !v.is_empty() => Some((n, v)),
87 _ => None,
88 }
89}
90
91pub async fn find_python_dirs(base_path: &Path, segments: &[&str]) -> Vec<PathBuf> {
103 let mut results = Vec::new();
104
105 match tokio::fs::metadata(base_path).await {
107 Ok(m) if m.is_dir() => {}
108 _ => return results,
109 }
110
111 if segments.is_empty() {
112 results.push(base_path.to_path_buf());
113 return results;
114 }
115
116 let first = segments[0];
117 let rest = &segments[1..];
118
119 if first == "python3.*" {
120 if let Ok(mut entries) = tokio::fs::read_dir(base_path).await {
122 while let Ok(Some(entry)) = entries.next_entry().await {
123 let ft = match entry.file_type().await {
124 Ok(ft) => ft,
125 Err(_) => continue,
126 };
127 if !ft.is_dir() {
128 continue;
129 }
130 let name = entry.file_name();
131 let name_str = name.to_string_lossy();
132 if name_str.starts_with("python3.") {
133 let sub = Box::pin(find_python_dirs(
134 &base_path.join(entry.file_name()),
135 rest,
136 ))
137 .await;
138 results.extend(sub);
139 }
140 }
141 }
142 } else if first == "*" {
143 if let Ok(mut entries) = tokio::fs::read_dir(base_path).await {
145 while let Ok(Some(entry)) = entries.next_entry().await {
146 let ft = match entry.file_type().await {
147 Ok(ft) => ft,
148 Err(_) => continue,
149 };
150 if !ft.is_dir() {
151 continue;
152 }
153 let sub = Box::pin(find_python_dirs(
154 &base_path.join(entry.file_name()),
155 rest,
156 ))
157 .await;
158 results.extend(sub);
159 }
160 }
161 } else {
162 let sub =
164 Box::pin(find_python_dirs(&base_path.join(first), rest)).await;
165 results.extend(sub);
166 }
167
168 results
169}
170
171pub async fn find_site_packages_under(
179 base_dir: &Path,
180 sub_dir_type: &str, ) -> Vec<PathBuf> {
182 if cfg!(windows) {
183 find_python_dirs(base_dir, &["Lib", sub_dir_type]).await
184 } else {
185 find_python_dirs(base_dir, &["lib", "python3.*", sub_dir_type]).await
186 }
187}
188
189pub async fn find_local_venv_site_packages(cwd: &Path) -> Vec<PathBuf> {
196 let mut results = Vec::new();
197
198 if let Ok(virtual_env) = std::env::var("VIRTUAL_ENV") {
200 let venv_path = PathBuf::from(&virtual_env);
201 let matches = find_site_packages_under(&venv_path, "site-packages").await;
202 results.extend(matches);
203 if !results.is_empty() {
204 return results;
205 }
206 }
207
208 for venv_dir in &[".venv", "venv"] {
210 let venv_path = cwd.join(venv_dir);
211 let matches = find_site_packages_under(&venv_path, "site-packages").await;
212 results.extend(matches);
213 }
214
215 results
216}
217
218pub async fn get_global_python_site_packages() -> Vec<PathBuf> {
223 let mut results = Vec::new();
224 let mut seen = HashSet::new();
225
226 let add_path = |p: PathBuf, seen: &mut HashSet<PathBuf>, results: &mut Vec<PathBuf>| {
227 let resolved = if p.is_absolute() {
228 p
229 } else {
230 std::path::absolute(&p).unwrap_or(p)
231 };
232 if seen.insert(resolved.clone()) {
233 results.push(resolved);
234 }
235 };
236
237 if let Some(python_cmd) = find_python_command() {
239 if let Ok(output) = Command::new(python_cmd)
240 .args([
241 "-c",
242 "import site; print('\\n'.join(site.getsitepackages())); print(site.getusersitepackages())",
243 ])
244 .stdin(Stdio::null())
245 .stdout(Stdio::piped())
246 .stderr(Stdio::piped())
247 .output()
248 {
249 if output.status.success() {
250 let stdout = String::from_utf8_lossy(&output.stdout);
251 for line in stdout.lines() {
252 let p = line.trim();
253 if !p.is_empty() {
254 add_path(PathBuf::from(p), &mut seen, &mut results);
255 }
256 }
257 }
258 }
259 }
260
261 let home_dir = std::env::var("HOME")
263 .or_else(|_| std::env::var("USERPROFILE"))
264 .unwrap_or_else(|_| "~".to_string());
265
266 async fn scan_well_known(
268 base: &Path,
269 pkg_type: &str,
270 seen: &mut HashSet<PathBuf>,
271 results: &mut Vec<PathBuf>,
272 ) {
273 let matches = find_python_dirs(base, &["lib", "python3.*", pkg_type]).await;
274 for m in matches {
275 let resolved = if m.is_absolute() {
276 m
277 } else {
278 std::path::absolute(&m).unwrap_or(m)
279 };
280 if seen.insert(resolved.clone()) {
281 results.push(resolved);
282 }
283 }
284 }
285
286 if !cfg!(windows) {
287 scan_well_known(Path::new("/usr"), "dist-packages", &mut seen, &mut results).await;
289 scan_well_known(Path::new("/usr"), "site-packages", &mut seen, &mut results).await;
290 scan_well_known(
292 Path::new("/usr/local"),
293 "dist-packages",
294 &mut seen,
295 &mut results,
296 )
297 .await;
298 scan_well_known(
299 Path::new("/usr/local"),
300 "site-packages",
301 &mut seen,
302 &mut results,
303 )
304 .await;
305 let user_local = PathBuf::from(&home_dir).join(".local");
307 scan_well_known(&user_local, "site-packages", &mut seen, &mut results).await;
308 }
309
310 if cfg!(target_os = "macos") {
312 scan_well_known(
313 Path::new("/opt/homebrew"),
314 "site-packages",
315 &mut seen,
316 &mut results,
317 )
318 .await;
319
320 let fw_matches = find_python_dirs(
322 Path::new("/Library/Frameworks/Python.framework/Versions"),
323 &["python3.*", "lib", "python3.*", "site-packages"],
324 )
325 .await;
326 for m in fw_matches {
327 add_path(m, &mut seen, &mut results);
328 }
329
330 let fw_matches2 = find_python_dirs(
331 Path::new("/Library/Frameworks/Python.framework"),
332 &["Versions", "*", "lib", "python3.*", "site-packages"],
333 )
334 .await;
335 for m in fw_matches2 {
336 add_path(m, &mut seen, &mut results);
337 }
338 }
339
340 if cfg!(windows) {
342 if let Ok(appdata) = std::env::var("APPDATA") {
344 let appdata_python = PathBuf::from(&appdata).join("Python");
345 if let Ok(mut entries) = tokio::fs::read_dir(&appdata_python).await {
346 while let Ok(Some(entry)) = entries.next_entry().await {
347 let p = appdata_python.join(entry.file_name()).join("site-packages");
348 if tokio::fs::metadata(&p).await.is_ok() {
349 add_path(p, &mut seen, &mut results);
350 }
351 }
352 }
353 }
354 for base in &["C:\\Python", "C:\\Program Files\\Python"] {
356 if let Ok(mut entries) = tokio::fs::read_dir(base).await {
357 while let Ok(Some(entry)) = entries.next_entry().await {
358 let sp = PathBuf::from(base)
359 .join(entry.file_name())
360 .join("Lib")
361 .join("site-packages");
362 if tokio::fs::metadata(&sp).await.is_ok() {
363 add_path(sp, &mut seen, &mut results);
364 }
365 }
366 }
367 }
368 if let Ok(local) = std::env::var("LOCALAPPDATA") {
370 let programs_python = PathBuf::from(&local).join("Programs").join("Python");
371 if let Ok(mut entries) = tokio::fs::read_dir(&programs_python).await {
372 while let Ok(Some(entry)) = entries.next_entry().await {
373 let sp = programs_python
374 .join(entry.file_name())
375 .join("Lib")
376 .join("site-packages");
377 if tokio::fs::metadata(&sp).await.is_ok() {
378 add_path(sp, &mut seen, &mut results);
379 }
380 }
381 }
382 }
383 }
384
385 let anaconda = PathBuf::from(&home_dir).join("anaconda3");
387 scan_well_known(&anaconda, "site-packages", &mut seen, &mut results).await;
388 let miniconda = PathBuf::from(&home_dir).join("miniconda3");
389 scan_well_known(&miniconda, "site-packages", &mut seen, &mut results).await;
390
391 if cfg!(target_os = "macos") {
393 let uv_base = PathBuf::from(&home_dir)
394 .join("Library")
395 .join("Application Support")
396 .join("uv")
397 .join("tools");
398 let uv_matches =
399 find_python_dirs(&uv_base, &["*", "lib", "python3.*", "site-packages"]).await;
400 for m in uv_matches {
401 add_path(m, &mut seen, &mut results);
402 }
403 } else if cfg!(windows) {
404 if let Ok(local) = std::env::var("LOCALAPPDATA") {
406 let uv_base = PathBuf::from(local).join("uv").join("tools");
407 let uv_matches =
408 find_python_dirs(&uv_base, &["*", "Lib", "site-packages"]).await;
409 for m in uv_matches {
410 add_path(m, &mut seen, &mut results);
411 }
412 }
413 } else {
414 let uv_base = PathBuf::from(&home_dir)
415 .join(".local")
416 .join("share")
417 .join("uv")
418 .join("tools");
419 let uv_matches =
420 find_python_dirs(&uv_base, &["*", "lib", "python3.*", "site-packages"]).await;
421 for m in uv_matches {
422 add_path(m, &mut seen, &mut results);
423 }
424 }
425
426 results
427}
428
/// Crawler for installed Python distributions.
///
/// The type is a unit struct and carries no state, so it is cheap to copy.
#[derive(Debug, Clone, Copy)]
pub struct PythonCrawler;
435
436impl PythonCrawler {
437 pub fn new() -> Self {
439 Self
440 }
441
442 pub async fn get_site_packages_paths(&self, options: &CrawlerOptions) -> Result<Vec<PathBuf>, std::io::Error> {
444 if options.global || options.global_prefix.is_some() {
445 if let Some(ref custom) = options.global_prefix {
446 return Ok(vec![custom.clone()]);
447 }
448 return Ok(get_global_python_site_packages().await);
449 }
450 Ok(find_local_venv_site_packages(&options.cwd).await)
451 }
452
453 pub async fn crawl_all(&self, options: &CrawlerOptions) -> Vec<CrawledPackage> {
455 let mut packages = Vec::new();
456 let mut seen = HashSet::new();
457
458 let sp_paths = self.get_site_packages_paths(options).await.unwrap_or_default();
459
460 for sp_path in &sp_paths {
461 let found = self.scan_site_packages(sp_path, &mut seen).await;
462 packages.extend(found);
463 }
464
465 packages
466 }
467
468 pub async fn find_by_purls(
473 &self,
474 site_packages_path: &Path,
475 purls: &[String],
476 ) -> Result<HashMap<String, CrawledPackage>, std::io::Error> {
477 let mut result = HashMap::new();
478
479 let mut purl_lookup: HashMap<String, &str> = HashMap::new();
481 for purl in purls {
482 if let Some((name, version)) = Self::parse_pypi_purl(purl) {
483 let key = format!("{}@{}", canonicalize_pypi_name(&name), version);
484 purl_lookup.insert(key, purl.as_str());
485 }
486 }
487
488 if purl_lookup.is_empty() {
489 return Ok(result);
490 }
491
492 let entries = match tokio::fs::read_dir(site_packages_path).await {
494 Ok(rd) => {
495 let mut entries = rd;
496 let mut v = Vec::new();
497 while let Ok(Some(entry)) = entries.next_entry().await {
498 v.push(entry);
499 }
500 v
501 }
502 Err(_) => return Ok(result),
503 };
504
505 for entry in entries {
506 let name = entry.file_name();
507 let name_str = name.to_string_lossy();
508 if !name_str.ends_with(".dist-info") {
509 continue;
510 }
511
512 let dist_info_path = site_packages_path.join(&*name_str);
513 if let Some((raw_name, version)) = read_python_metadata(&dist_info_path).await {
514 let canon_name = canonicalize_pypi_name(&raw_name);
515 let key = format!("{canon_name}@{version}");
516
517 if let Some(&matched_purl) = purl_lookup.get(&key) {
518 result.insert(
519 matched_purl.to_string(),
520 CrawledPackage {
521 name: canon_name,
522 version,
523 namespace: None,
524 purl: matched_purl.to_string(),
525 path: site_packages_path.to_path_buf(),
526 },
527 );
528 }
529 }
530 }
531
532 Ok(result)
533 }
534
535 async fn scan_site_packages(
541 &self,
542 site_packages_path: &Path,
543 seen: &mut HashSet<String>,
544 ) -> Vec<CrawledPackage> {
545 let mut results = Vec::new();
546
547 let entries = match tokio::fs::read_dir(site_packages_path).await {
548 Ok(rd) => {
549 let mut entries = rd;
550 let mut v = Vec::new();
551 while let Ok(Some(entry)) = entries.next_entry().await {
552 v.push(entry);
553 }
554 v
555 }
556 Err(_) => return results,
557 };
558
559 for entry in entries {
560 let name = entry.file_name();
561 let name_str = name.to_string_lossy();
562 if !name_str.ends_with(".dist-info") {
563 continue;
564 }
565
566 let dist_info_path = site_packages_path.join(&*name_str);
567 if let Some((raw_name, version)) = read_python_metadata(&dist_info_path).await {
568 let canon_name = canonicalize_pypi_name(&raw_name);
569 let purl = format!("pkg:pypi/{canon_name}@{version}");
570
571 if seen.contains(&purl) {
572 continue;
573 }
574 seen.insert(purl.clone());
575
576 results.push(CrawledPackage {
577 name: canon_name,
578 version,
579 namespace: None,
580 purl,
581 path: site_packages_path.to_path_buf(),
582 });
583 }
584 }
585
586 results
587 }
588
589 fn parse_pypi_purl(purl: &str) -> Option<(String, String)> {
592 let base = match purl.find('?') {
594 Some(idx) => &purl[..idx],
595 None => purl,
596 };
597
598 let rest = base.strip_prefix("pkg:pypi/")?;
599 let at_idx = rest.rfind('@')?;
600 let name = &rest[..at_idx];
601 let version = &rest[at_idx + 1..];
602
603 if name.is_empty() || version.is_empty() {
604 return None;
605 }
606
607 Some((name.to_string(), version.to_string()))
608 }
609}
610
611impl Default for PythonCrawler {
612 fn default() -> Self {
613 Self::new()
614 }
615}
616
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates `<root>/<parts...>` (including parents) and returns its path.
    async fn make_dir(root: &Path, parts: &[&str]) -> PathBuf {
        let mut dir = root.to_path_buf();
        for part in parts {
            dir.push(part);
        }
        tokio::fs::create_dir_all(&dir).await.unwrap();
        dir
    }

    /// Creates `<parent>/<dir_name>/METADATA` with the given content.
    async fn write_dist_info(parent: &Path, dir_name: &str, metadata: &str) -> PathBuf {
        let dist_info = make_dir(parent, &[dir_name]).await;
        tokio::fs::write(dist_info.join("METADATA"), metadata)
            .await
            .unwrap();
        dist_info
    }

    /// Asserts `canonicalize_pypi_name(input) == expected` for each pair.
    fn assert_canonical(cases: &[(&str, &str)]) {
        for (input, expected) in cases {
            assert_eq!(canonicalize_pypi_name(input), *expected);
        }
    }

    // Case folding and single separators.
    #[test]
    fn test_canonicalize_pypi_name_basic() {
        assert_canonical(&[
            ("Requests", "requests"),
            ("my_package", "my-package"),
            ("My.Package", "my-package"),
            ("My-._Package", "my-package"),
        ]);
    }

    // Runs of mixed separators collapse into one dash.
    #[test]
    fn test_canonicalize_pypi_name_runs() {
        assert_canonical(&[("a__b", "a-b"), ("a-.-b", "a-b"), ("a_._-b", "a-b")]);
    }

    // Surrounding whitespace is removed.
    #[test]
    fn test_canonicalize_pypi_name_trim() {
        assert_canonical(&[("  requests  ", "requests")]);
    }

    #[test]
    fn test_parse_pypi_purl() {
        let parsed = PythonCrawler::parse_pypi_purl("pkg:pypi/requests@2.28.0");
        let (name, ver) = parsed.unwrap();
        assert_eq!(name, "requests");
        assert_eq!(ver, "2.28.0");
    }

    // Qualifiers after `?` must not leak into the version.
    #[test]
    fn test_parse_pypi_purl_with_qualifiers() {
        let parsed = PythonCrawler::parse_pypi_purl("pkg:pypi/requests@2.28.0?artifact_id=abc");
        let (name, ver) = parsed.unwrap();
        assert_eq!(name, "requests");
        assert_eq!(ver, "2.28.0");
    }

    #[test]
    fn test_parse_pypi_purl_invalid() {
        for bad in ["pkg:npm/lodash@4.17.21", "not-a-purl"] {
            assert!(PythonCrawler::parse_pypi_purl(bad).is_none());
        }
    }

    #[tokio::test]
    async fn test_read_python_metadata_valid() {
        let tmp = tempfile::tempdir().unwrap();
        let dist_info = write_dist_info(
            tmp.path(),
            "requests-2.28.0.dist-info",
            "Metadata-Version: 2.1\nName: Requests\nVersion: 2.28.0\n\nSome description",
        )
        .await;

        let parsed = read_python_metadata(&dist_info).await;
        assert!(parsed.is_some());
        let (name, version) = parsed.unwrap();
        // The raw name is returned; canonicalization happens elsewhere.
        assert_eq!(name, "Requests");
        assert_eq!(version, "2.28.0");
    }

    #[tokio::test]
    async fn test_read_python_metadata_missing() {
        let tmp = tempfile::tempdir().unwrap();
        let missing = tmp.path().join("nonexistent.dist-info");
        assert!(read_python_metadata(&missing).await.is_none());
    }

    #[tokio::test]
    async fn test_find_python_dirs_literal() {
        let tmp = tempfile::tempdir().unwrap();
        let target = make_dir(tmp.path(), &["lib", "python3.11", "site-packages"]).await;

        let found = find_python_dirs(tmp.path(), &["lib", "python3.*", "site-packages"]).await;
        assert_eq!(found.len(), 1);
        assert_eq!(found[0], target);
    }

    #[tokio::test]
    async fn test_find_python_dirs_wildcard() {
        let tmp = tempfile::tempdir().unwrap();
        make_dir(tmp.path(), &["lib", "python3.10", "site-packages"]).await;
        make_dir(tmp.path(), &["lib", "python3.11", "site-packages"]).await;
        // A sibling that must not match the `python3.*` segment.
        make_dir(tmp.path(), &["lib", "ruby3.0", "site-packages"]).await;

        let found = find_python_dirs(tmp.path(), &["lib", "python3.*", "site-packages"]).await;
        assert_eq!(found.len(), 2);
    }

    #[tokio::test]
    async fn test_find_python_dirs_star_wildcard() {
        let tmp = tempfile::tempdir().unwrap();
        let target = make_dir(
            tmp.path(),
            &["tools", "mytool", "lib", "python3.11", "site-packages"],
        )
        .await;

        let found = find_python_dirs(
            tmp.path(),
            &["tools", "*", "lib", "python3.*", "site-packages"],
        )
        .await;
        assert_eq!(found.len(), 1);
        assert_eq!(found[0], target);
    }

    #[tokio::test]
    async fn test_crawl_all_python() {
        let tmp = tempfile::tempdir().unwrap();
        // The virtual-environment layout differs per platform.
        let layout: &[&str] = if cfg!(windows) {
            &[".venv", "Lib", "site-packages"]
        } else {
            &[".venv", "lib", "python3.11", "site-packages"]
        };
        let site_packages = make_dir(tmp.path(), layout).await;

        write_dist_info(
            &site_packages,
            "requests-2.28.0.dist-info",
            "Metadata-Version: 2.1\nName: Requests\nVersion: 2.28.0\n",
        )
        .await;

        let options = CrawlerOptions {
            cwd: tmp.path().to_path_buf(),
            global: false,
            global_prefix: None,
            batch_size: 100,
        };

        let packages = PythonCrawler::new().crawl_all(&options).await;
        assert_eq!(packages.len(), 1);
        let only = &packages[0];
        assert_eq!(only.name, "requests");
        assert_eq!(only.version, "2.28.0");
        assert_eq!(only.purl, "pkg:pypi/requests@2.28.0");
        assert!(only.namespace.is_none());
    }

    // Python may be absent on the test machine, so only a found command is
    // validated.
    #[test]
    fn test_find_python_command() {
        if let Some(c) = find_python_command() {
            assert!(
                ["python3", "python", "py"].contains(&c),
                "unexpected command: {c}"
            );
        }
    }

    // Relies on the test environment providing HOME or USERPROFILE.
    #[test]
    fn test_home_dir_detection() {
        let home = match std::env::var("HOME") {
            Ok(value) => value,
            Err(_) => std::env::var("USERPROFILE").unwrap_or_else(|_| "~".to_string()),
        };
        assert_ne!(home, "~", "expected a real home directory");
        assert!(!home.is_empty());
    }

    #[tokio::test]
    async fn test_find_by_purls_python() {
        let tmp = tempfile::tempdir().unwrap();
        let site_packages = tmp.path().to_path_buf();

        write_dist_info(
            &site_packages,
            "requests-2.28.0.dist-info",
            "Metadata-Version: 2.1\nName: Requests\nVersion: 2.28.0\n",
        )
        .await;

        let purls = vec![
            "pkg:pypi/requests@2.28.0".to_string(),
            "pkg:pypi/flask@3.0.0".to_string(),
        ];

        let matches = PythonCrawler::new()
            .find_by_purls(&site_packages, &purls)
            .await
            .unwrap();
        assert_eq!(matches.len(), 1);
        assert!(matches.contains_key("pkg:pypi/requests@2.28.0"));
        assert!(!matches.contains_key("pkg:pypi/flask@3.0.0"));
    }
}