1use crate::models::{
35 DatasourceId, Dependency, FileReference, PackageData, PackageType, Party, Sha256Digest,
36};
37use crate::parser_warn as warn;
38use crate::parsers::utils::{read_file_to_string, split_name_email};
39use base64::Engine;
40use base64::engine::general_purpose::URL_SAFE_NO_PAD;
41use bzip2::read::BzDecoder;
42use csv::ReaderBuilder;
43use flate2::read::GzDecoder;
44use liblzma::read::XzDecoder;
45use packageurl::PackageUrl;
46use regex::Regex;
47use ruff_python_ast as ast;
48use ruff_python_parser::parse_module;
49use serde_json::{Map as JsonMap, Value as JsonValue};
50use sha2::{Digest, Sha256};
51use std::collections::{HashMap, HashSet};
52use std::fs::File;
53use std::io::Read;
54use std::path::{Component, Path, PathBuf};
55use tar::Archive;
56use toml::Value as TomlValue;
57use toml::map::Map as TomlMap;
58use zip::ZipArchive;
59
60use super::PackageParser;
61use super::license_normalization::{
62 DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
63 normalize_spdx_expression,
64};
65use super::pep508::parse_pep508_requirement;
66
// String keys for project metadata tables.
// NOTE(review): the lookups that use these keys are outside this excerpt — confirm usage.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_EXTRAS: &str = "extras";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

/// Tuple of five optional strings.
///
/// NOTE(review): presumably the URLs extracted from a project's URL table; the
/// meaning of each slot is defined by code outside this excerpt — confirm.
type ProjectUrls = (
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
);

// Limits applied to `setup.py` processing.
// NOTE(review): enforced outside this excerpt — confirm exact semantics there.
const MAX_SETUP_PY_BYTES: usize = 1_048_576;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

/// Upper bound (bytes) both for an archive file on disk and for the summed
/// declared size of the entries inside it.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;
/// Upper bound (bytes) for the declared size of a single archive entry.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
/// Largest uncompressed:compressed ratio accepted before an archive (or entry)
/// is treated as suspicious.
const MAX_COMPRESSION_RATIO: f64 = 100.0;

/// Parser for Python package metadata files and archives; see its
/// `PackageParser` implementation for the recognised inputs.
pub struct PythonParser;
109
/// Container formats recognised for Python source distribution archives.
#[derive(Debug, Clone, Copy)]
enum PythonSdistArchiveFormat {
    /// File name ends with `.tar.gz`.
    TarGz,
    /// File name ends with `.tgz`.
    Tgz,
    /// File name ends with `.tar.bz2`.
    TarBz2,
    /// File name ends with `.tar.xz`.
    TarXz,
    /// File name ends with `.zip`.
    Zip,
}
118
/// A zip entry that passed the path, size and compression-ratio checks in
/// `collect_validated_zip_entries`.
#[derive(Debug, Clone)]
struct ValidatedZipEntry {
    /// Position of the entry inside the archive.
    index: usize,
    /// Normalized entry path.
    name: String,
}
124
125impl PackageParser for PythonParser {
126 const PACKAGE_TYPE: PackageType = PackageType::Pypi;
127
128 fn extract_packages(path: &Path) -> Vec<PackageData> {
129 vec![
130 if path.file_name().unwrap_or_default() == "pyproject.toml" {
131 extract_from_pyproject_toml(path)
132 } else if path.file_name().unwrap_or_default() == "setup.cfg" {
133 extract_from_setup_cfg(path)
134 } else if is_setup_py_like_path(path) {
135 return extract_setup_py_packages(path);
136 } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
137 extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
138 } else if is_installed_wheel_metadata_path(path) {
139 extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
140 } else if is_pip_cache_origin_json(path) {
141 extract_from_pip_origin_json(path)
142 } else if path.file_name().unwrap_or_default() == "pypi.json" {
143 extract_from_pypi_json(path)
144 } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
145 extract_from_pip_inspect(path)
146 } else if is_python_sdist_archive_path(path) {
147 extract_from_sdist_archive(path)
148 } else if path
149 .extension()
150 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
151 {
152 extract_from_wheel_archive(path)
153 } else if path
154 .extension()
155 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
156 {
157 extract_from_egg_archive(path)
158 } else {
159 default_package_data(path)
160 },
161 ]
162 }
163
164 fn is_match(path: &Path) -> bool {
165 if let Some(filename) = path.file_name()
166 && (filename == "pyproject.toml"
167 || filename == "setup.cfg"
168 || is_setup_py_like_path(path)
169 || filename == "PKG-INFO"
170 || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
171 || filename == "pypi.json"
172 || filename == "pip-inspect.deplock"
173 || is_pip_cache_origin_json(path))
174 {
175 return true;
176 }
177
178 if let Some(extension) = path.extension() {
179 let ext = extension.to_string_lossy().to_lowercase();
180 if (ext == "whl" && is_valid_wheel_archive_path(path))
181 || ext == "egg"
182 || is_python_sdist_archive_path(path)
183 {
184 return true;
185 }
186 }
187
188 false
189 }
190}
191
/// Returns `true` when the file name is `setup.py` or ends with `_setup.py`.
///
/// Paths without a UTF-8 file name never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(file_name) => file_name == "setup.py" || file_name.ends_with("_setup.py"),
        None => false,
    }
}
197
/// Returns `true` for a file named `METADATA` whose parent directory name
/// ends with `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    if path.file_name().and_then(|name| name.to_str()) != Some("METADATA") {
        return false;
    }

    let parent_dir_name = path
        .parent()
        .and_then(|parent| parent.file_name())
        .and_then(|name| name.to_str());

    matches!(parent_dir_name, Some(dir_name) if dir_name.ends_with(".dist-info"))
}
206
/// Values read from a `WHEEL` file by `parse_installed_wheel_metadata`.
#[derive(Clone, Debug)]
struct InstalledWheelMetadata {
    /// Every `tag` header value, in file order.
    wheel_tags: Vec<String>,
    /// First `wheel-version` header, if present.
    wheel_version: Option<String>,
    /// First `generator` header, if present.
    wheel_generator: Option<String>,
    /// `root-is-purelib` header parsed as a boolean; `None` when the header is
    /// missing or is neither `true` nor `false` (case-insensitive).
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`.
    compressed_tag: Option<String>,
}
215
216fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
217 let Some(parent) = path.parent() else {
218 return;
219 };
220
221 if !parent
222 .file_name()
223 .and_then(|name| name.to_str())
224 .is_some_and(|name| name.ends_with(".dist-info"))
225 {
226 return;
227 }
228
229 let wheel_path = parent.join("WHEEL");
230 if !wheel_path.exists() {
231 return;
232 }
233
234 let Ok(content) = read_file_to_string(&wheel_path) else {
235 warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
236 return;
237 };
238
239 let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
240 return;
241 };
242
243 apply_installed_wheel_metadata(package_data, &wheel_metadata);
244}
245
246fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
247 use super::rfc822::{get_header_all, get_header_first};
248
249 let metadata = super::rfc822::parse_rfc822_content(content);
250 let wheel_tags = get_header_all(&metadata.headers, "tag");
251 if wheel_tags.is_empty() {
252 return None;
253 }
254
255 let wheel_version = get_header_first(&metadata.headers, "wheel-version");
256 let wheel_generator = get_header_first(&metadata.headers, "generator");
257 let root_is_purelib =
258 get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
259 match value.to_ascii_lowercase().as_str() {
260 "true" => Some(true),
261 "false" => Some(false),
262 _ => None,
263 }
264 });
265
266 let compressed_tag = compress_wheel_tags(&wheel_tags);
267
268 Some(InstalledWheelMetadata {
269 wheel_tags,
270 wheel_version,
271 wheel_generator,
272 root_is_purelib,
273 compressed_tag,
274 })
275}
276
/// Collapses expanded wheel tags (`python-abi-platform`) that differ only in
/// their interpreter part into a single compressed tag such as
/// `py2.py3-none-any`.
///
/// * An empty slice yields `None`.
/// * A single tag is returned unchanged, without validation.
/// * With several tags, every tag must split into three `-`-separated parts
///   and all tags must share the same ABI and platform parts; otherwise
///   `None` is returned.
///
/// Interpreter parts are emitted once each, in first-seen order, so repeated
/// tags no longer produce output like `py3.py3-none-any`.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    let (first, rest) = tags.split_first()?;
    if rest.is_empty() {
        return Some(first.clone());
    }

    let mut python_tags: Vec<&str> = Vec::with_capacity(tags.len());
    let mut shared_suffix: Option<(&str, &str)> = None;

    for tag in tags {
        let mut parts = tag.splitn(3, '-');
        let python = parts.next()?;
        let abi = parts.next()?;
        let platform = parts.next()?;

        match shared_suffix {
            Some(existing) if existing != (abi, platform) => return None,
            _ => shared_suffix = Some((abi, platform)),
        }

        // Tag lists are tiny, so a linear membership check is sufficient.
        if !python_tags.contains(&python) {
            python_tags.push(python);
        }
    }

    let (abi, platform) = shared_suffix?;
    Some(format!("{}-{}-{}", python_tags.join("."), abi, platform))
}
314
315fn apply_installed_wheel_metadata(
316 package_data: &mut PackageData,
317 wheel_metadata: &InstalledWheelMetadata,
318) {
319 let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
320 extra_data.insert(
321 "wheel_tags".to_string(),
322 JsonValue::Array(
323 wheel_metadata
324 .wheel_tags
325 .iter()
326 .cloned()
327 .map(JsonValue::String)
328 .collect(),
329 ),
330 );
331
332 if let Some(wheel_version) = &wheel_metadata.wheel_version {
333 extra_data.insert(
334 "wheel_version".to_string(),
335 JsonValue::String(wheel_version.clone()),
336 );
337 }
338
339 if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
340 extra_data.insert(
341 "wheel_generator".to_string(),
342 JsonValue::String(wheel_generator.clone()),
343 );
344 }
345
346 if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
347 extra_data.insert(
348 "root_is_purelib".to_string(),
349 JsonValue::Bool(root_is_purelib),
350 );
351 }
352
353 if let (Some(name), Some(version), Some(extension)) = (
354 package_data.name.as_deref(),
355 package_data.version.as_deref(),
356 wheel_metadata.compressed_tag.as_deref(),
357 ) {
358 package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
359 }
360}
361
/// Returns `true` for a file named `origin.json` that has an ancestor
/// directory named `wheels` (ASCII case-insensitive).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    let is_origin_json = path.file_name().and_then(|name| name.to_str()) == Some("origin.json");
    if !is_origin_json {
        return false;
    }

    // `skip(1)` drops the path itself so only real ancestors are inspected.
    for ancestor in path.ancestors().skip(1) {
        let dir_name = ancestor.file_name().and_then(|name| name.to_str());
        if dir_name.is_some_and(|name| name.eq_ignore_ascii_case("wheels")) {
            return true;
        }
    }

    false
}
371
372fn extract_from_pip_origin_json(path: &Path) -> PackageData {
373 let content = match read_file_to_string(path) {
374 Ok(content) => content,
375 Err(e) => {
376 warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
377 return default_package_data(path);
378 }
379 };
380
381 let root: JsonValue = match serde_json::from_str(&content) {
382 Ok(root) => root,
383 Err(e) => {
384 warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
385 return default_package_data(path);
386 }
387 };
388
389 let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
390 warn!("No url found in pip cache origin.json at {:?}", path);
391 return default_package_data(path);
392 };
393
394 let sibling_wheel = find_sibling_cached_wheel(path);
395 let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
396 sibling_wheel
397 .as_ref()
398 .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
399 });
400
401 let Some((name, version)) = name_version else {
402 warn!(
403 "Failed to infer package name/version from pip cache origin.json at {:?}",
404 path
405 );
406 return default_package_data(path);
407 };
408
409 let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
410 build_pypi_urls(Some(&name), Some(&version));
411 let purl = sibling_wheel
412 .as_ref()
413 .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
414 .or(plain_purl);
415
416 PackageData {
417 package_type: Some(PythonParser::PACKAGE_TYPE),
418 primary_language: Some("Python".to_string()),
419 name: Some(name),
420 version: Some(version),
421 datasource_id: Some(DatasourceId::PypiPipOriginJson),
422 download_url: Some(download_url.to_string()),
423 sha256: extract_sha256_from_origin_json(&root)
424 .and_then(|h| Sha256Digest::from_hex(&h).ok()),
425 repository_homepage_url,
426 repository_download_url,
427 api_data_url,
428 purl,
429 ..Default::default()
430 }
431}
432
433fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
434 let parent = path.parent()?;
435 let entries = parent.read_dir().ok()?;
436
437 for entry in entries.flatten() {
438 let sibling_path = entry.path();
439 if sibling_path
440 .extension()
441 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
442 && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
443 {
444 return Some(wheel_info);
445 }
446 }
447
448 None
449}
450
451fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
452 let file_name = url.rsplit('/').next()?;
453
454 if file_name.ends_with(".whl") {
455 return parse_wheel_filename(Path::new(file_name))
456 .map(|wheel_info| (wheel_info.name, wheel_info.version));
457 }
458
459 let stem = strip_python_archive_extension(file_name)?;
460 let (name, version) = stem.rsplit_once('-')?;
461 if name.is_empty() || version.is_empty() {
462 return None;
463 }
464
465 Some((name.replace('_', "-"), version.to_string()))
466}
467
/// Removes a recognised Python archive suffix from `file_name`.
///
/// The comparison is case-sensitive. Returns `None` when no suffix matches.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] =
        [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES.iter() {
        if let Some(stem) = file_name.strip_suffix(*suffix) {
            return Some(stem);
        }
    }

    None
}
473
474fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
475 root.pointer("/archive_info/hashes/sha256")
476 .and_then(|value| value.as_str())
477 .map(ToOwned::to_owned)
478 .or_else(|| {
479 root.pointer("/archive_info/hash")
480 .and_then(|value| value.as_str())
481 .and_then(normalize_origin_hash)
482 })
483}
484
/// Extracts the digest part of a hash string.
///
/// Accepts `sha256=<value>` and `sha256:<value>` (the value is returned
/// without validation) as well as a bare 64-character hexadecimal string.
/// Anything else yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed_value = hash
        .strip_prefix("sha256=")
        .or_else(|| hash.strip_prefix("sha256:"));

    match prefixed_value {
        Some(value) => Some(value.to_string()),
        None if hash.len() == 64 && hash.chars().all(|ch| ch.is_ascii_hexdigit()) => {
            Some(hash.to_string())
        }
        None => None,
    }
}
497
498fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
499 let content = match read_file_to_string(path) {
500 Ok(content) => content,
501 Err(e) => {
502 warn!("Failed to read metadata at {:?}: {}", path, e);
503 return default_package_data(path);
504 }
505 };
506
507 let metadata = super::rfc822::parse_rfc822_content(&content);
508 let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
509 merge_sibling_metadata_dependencies(path, &mut package_data);
510 merge_sibling_metadata_file_references(path, &mut package_data);
511 if datasource_id == DatasourceId::PypiWheelMetadata {
512 merge_sibling_wheel_metadata(path, &mut package_data);
513 }
514 package_data
515}
516
517fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
518 let mut extra_dependencies = Vec::new();
519
520 if let Some(parent) = path.parent() {
521 let direct_requires = parent.join("requires.txt");
522 if direct_requires.exists()
523 && let Ok(content) = read_file_to_string(&direct_requires)
524 {
525 extra_dependencies.extend(parse_requires_txt(&content));
526 }
527
528 let sibling_egg_info_requires = parent
529 .read_dir()
530 .ok()
531 .into_iter()
532 .flatten()
533 .flatten()
534 .find_map(|entry| {
535 let child_path = entry.path();
536 if child_path.is_dir()
537 && child_path
538 .file_name()
539 .and_then(|name| name.to_str())
540 .is_some_and(|name| name.ends_with(".egg-info"))
541 {
542 let requires = child_path.join("requires.txt");
543 requires.exists().then_some(requires)
544 } else {
545 None
546 }
547 });
548
549 if let Some(requires_path) = sibling_egg_info_requires
550 && let Ok(content) = read_file_to_string(&requires_path)
551 {
552 extra_dependencies.extend(parse_requires_txt(&content));
553 }
554 }
555
556 for dependency in extra_dependencies {
557 if !package_data.dependencies.iter().any(|existing| {
558 existing.purl == dependency.purl
559 && existing.scope == dependency.scope
560 && existing.extracted_requirement == dependency.extracted_requirement
561 && existing.extra_data == dependency.extra_data
562 }) {
563 package_data.dependencies.push(dependency);
564 }
565 }
566}
567
568fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
569 let mut extra_refs = Vec::new();
570
571 if let Some(parent) = path.parent() {
572 let record_path = parent.join("RECORD");
573 if record_path.exists()
574 && let Ok(content) = read_file_to_string(&record_path)
575 {
576 extra_refs.extend(parse_record_csv(&content));
577 }
578
579 let installed_files_path = parent.join("installed-files.txt");
580 if installed_files_path.exists()
581 && let Ok(content) = read_file_to_string(&installed_files_path)
582 {
583 extra_refs.extend(parse_installed_files_txt(&content));
584 }
585
586 let sources_path = parent.join("SOURCES.txt");
587 if sources_path.exists()
588 && let Ok(content) = read_file_to_string(&sources_path)
589 {
590 extra_refs.extend(parse_sources_txt(&content));
591 }
592 }
593
594 for file_ref in extra_refs {
595 if !package_data
596 .file_references
597 .iter()
598 .any(|existing| existing.path == file_ref.path)
599 {
600 package_data.file_references.push(file_ref);
601 }
602 }
603}
604
605fn collect_validated_zip_entries<R: Read + std::io::Seek>(
606 archive: &mut ZipArchive<R>,
607 path: &Path,
608 archive_type: &str,
609) -> Result<Vec<ValidatedZipEntry>, String> {
610 let mut total_extracted = 0u64;
611 let mut entries = Vec::new();
612
613 for i in 0..archive.len() {
614 if let Ok(file) = archive.by_index_raw(i) {
615 let compressed_size = file.compressed_size();
616 let uncompressed_size = file.size();
617 let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
618 warn!(
619 "Skipping unsafe path in {} {:?}: {}",
620 archive_type,
621 path,
622 file.name()
623 );
624 continue;
625 };
626
627 if compressed_size > 0 {
628 let ratio = uncompressed_size as f64 / compressed_size as f64;
629 if ratio > MAX_COMPRESSION_RATIO {
630 warn!(
631 "Suspicious compression ratio in {} {:?}: {:.2}:1",
632 archive_type, path, ratio
633 );
634 continue;
635 }
636 }
637
638 if uncompressed_size > MAX_FILE_SIZE {
639 warn!(
640 "File too large in {} {:?}: {} bytes (limit: {} bytes)",
641 archive_type, path, uncompressed_size, MAX_FILE_SIZE
642 );
643 continue;
644 }
645
646 total_extracted += uncompressed_size;
647 if total_extracted > MAX_ARCHIVE_SIZE {
648 let msg = format!(
649 "Total extracted size exceeds limit for {} {:?}",
650 archive_type, path
651 );
652 warn!("{}", msg);
653 return Err(msg);
654 }
655
656 entries.push(ValidatedZipEntry {
657 index: i,
658 name: entry_name,
659 });
660 }
661 }
662
663 Ok(entries)
664}
665
666fn is_python_sdist_archive_path(path: &Path) -> bool {
667 detect_python_sdist_archive_format(path).is_some()
668}
669
670fn is_valid_wheel_archive_path(path: &Path) -> bool {
671 if !path.is_file() {
672 return true;
673 }
674
675 let file = match File::open(path) {
676 Ok(file) => file,
677 Err(_) => return false,
678 };
679 let mut archive = match ZipArchive::new(file) {
680 Ok(archive) => archive,
681 Err(_) => return false,
682 };
683
684 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
685 Ok(entries) => entries,
686 Err(_) => return false,
687 };
688
689 find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA").is_some()
690}
691
692fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
693 let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
694
695 if !is_likely_python_sdist_filename(&file_name) {
696 return None;
697 }
698
699 if file_name.ends_with(".tar.gz") {
700 tar_gz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarGz)
701 } else if file_name.ends_with(".tgz") {
702 tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
703 } else if file_name.ends_with(".tar.bz2") {
704 tar_bz2_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarBz2)
705 } else if file_name.ends_with(".tar.xz") {
706 tar_xz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarXz)
707 } else if file_name.ends_with(".zip") {
708 zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
709 } else {
710 None
711 }
712}
713
714fn tar_gz_sdist_contains_pkg_info(path: &Path) -> bool {
715 let Some(compressed_size) = compressed_archive_size(path) else {
716 return false;
717 };
718 let file = match File::open(path) {
719 Ok(file) => file,
720 Err(_) => return false,
721 };
722 let decoder = GzDecoder::new(file);
723 tar_sdist_contains_pkg_info(path, decoder, "tar.gz", compressed_size)
724}
725
726fn tar_bz2_sdist_contains_pkg_info(path: &Path) -> bool {
727 let Some(compressed_size) = compressed_archive_size(path) else {
728 return false;
729 };
730 let file = match File::open(path) {
731 Ok(file) => file,
732 Err(_) => return false,
733 };
734 let decoder = BzDecoder::new(file);
735 tar_sdist_contains_pkg_info(path, decoder, "tar.bz2", compressed_size)
736}
737
738fn tar_xz_sdist_contains_pkg_info(path: &Path) -> bool {
739 let Some(compressed_size) = compressed_archive_size(path) else {
740 return false;
741 };
742 let file = match File::open(path) {
743 Ok(file) => file,
744 Err(_) => return false,
745 };
746 let decoder = XzDecoder::new(file);
747 tar_sdist_contains_pkg_info(path, decoder, "tar.xz", compressed_size)
748}
749
/// Returns the on-disk size of `path` in bytes, or `None` when its metadata
/// cannot be read.
fn compressed_archive_size(path: &Path) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(metadata) => Some(metadata.len()),
        Err(_) => None,
    }
}
753
754fn tar_sdist_contains_pkg_info<R: Read>(
755 path: &Path,
756 reader: R,
757 archive_type: &str,
758 compressed_size: u64,
759) -> bool {
760 let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
761 else {
762 return false;
763 };
764
765 select_sdist_pkginfo_entry(path, &entries).is_some()
766}
767
768fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
769 if !path.is_file() {
770 return true;
771 }
772
773 let Some(compressed_size) = compressed_archive_size(path) else {
774 return false;
775 };
776 let file = match File::open(path) {
777 Ok(file) => file,
778 Err(_) => return false,
779 };
780 let decoder = GzDecoder::new(file);
781 tar_sdist_contains_pkg_info(path, decoder, "tgz", compressed_size)
782}
783
784fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
785 if !path.is_file() {
786 return true;
787 }
788
789 let file = match File::open(path) {
790 Ok(file) => file,
791 Err(_) => return false,
792 };
793 let mut archive = match ZipArchive::new(file) {
794 Ok(archive) => archive,
795 Err(_) => return false,
796 };
797
798 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
799 Ok(entries) => entries,
800 Err(_) => return false,
801 };
802 let metadata_entries: Vec<_> = validated_entries
803 .iter()
804 .filter(|entry| entry.name.ends_with("/PKG-INFO"))
805 .filter_map(|entry| {
806 read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
807 .ok()
808 .map(|content| (entry.name.clone(), content))
809 })
810 .collect();
811
812 has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
813}
814
815fn is_likely_python_sdist_filename(file_name: &str) -> bool {
816 let Some(stem) = strip_python_archive_extension(file_name) else {
817 return false;
818 };
819
820 let Some((name, version)) = stem.rsplit_once('-') else {
821 return false;
822 };
823
824 !name.is_empty()
825 && !version.is_empty()
826 && version.chars().any(|ch| ch.is_ascii_digit())
827 && name
828 .chars()
829 .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
830}
831
832fn extract_from_sdist_archive(path: &Path) -> PackageData {
833 let metadata = match std::fs::metadata(path) {
834 Ok(m) => m,
835 Err(e) => {
836 warn!(
837 "Failed to read metadata for sdist archive {:?}: {}",
838 path, e
839 );
840 return default_package_data(path);
841 }
842 };
843
844 if metadata.len() > MAX_ARCHIVE_SIZE {
845 warn!(
846 "sdist archive too large: {} bytes (limit: {} bytes)",
847 metadata.len(),
848 MAX_ARCHIVE_SIZE
849 );
850 return default_package_data(path);
851 }
852
853 let Some(format) = detect_python_sdist_archive_format(path) else {
854 return default_package_data(path);
855 };
856
857 let mut package_data = match format {
858 PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
859 let file = match File::open(path) {
860 Ok(file) => file,
861 Err(e) => {
862 warn!("Failed to open sdist archive {:?}: {}", path, e);
863 return default_package_data(path);
864 }
865 };
866 let decoder = GzDecoder::new(file);
867 extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
868 }
869 PythonSdistArchiveFormat::TarBz2 => {
870 let file = match File::open(path) {
871 Ok(file) => file,
872 Err(e) => {
873 warn!("Failed to open sdist archive {:?}: {}", path, e);
874 return default_package_data(path);
875 }
876 };
877 let decoder = BzDecoder::new(file);
878 extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
879 }
880 PythonSdistArchiveFormat::TarXz => {
881 let file = match File::open(path) {
882 Ok(file) => file,
883 Err(e) => {
884 warn!("Failed to open sdist archive {:?}: {}", path, e);
885 return default_package_data(path);
886 }
887 };
888 let decoder = XzDecoder::new(file);
889 extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
890 }
891 PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
892 };
893
894 if package_data.package_type.is_some() {
895 let (size, sha256) = calculate_file_checksums(path);
896 package_data.size = size;
897 package_data.sha256 = sha256;
898 }
899
900 package_data
901}
902
903fn extract_from_tar_sdist_archive<R: Read>(
904 path: &Path,
905 reader: R,
906 archive_type: &str,
907 compressed_size: u64,
908) -> PackageData {
909 let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
910 else {
911 return default_package_data(path);
912 };
913
914 build_sdist_package_data(path, entries)
915}
916
917fn collect_tar_sdist_entries<R: Read>(
918 path: &Path,
919 reader: R,
920 archive_type: &str,
921 compressed_size: u64,
922) -> Option<Vec<(String, String)>> {
923 let mut archive = Archive::new(reader);
924 let archive_entries = match archive.entries() {
925 Ok(entries) => entries,
926 Err(e) => {
927 warn!(
928 "Failed to read {} sdist archive {:?}: {}",
929 archive_type, path, e
930 );
931 return None;
932 }
933 };
934
935 let mut total_extracted = 0u64;
936 let mut entries = Vec::new();
937
938 for entry_result in archive_entries {
939 let mut entry = match entry_result {
940 Ok(entry) => entry,
941 Err(e) => {
942 warn!(
943 "Failed to read {} sdist entry from {:?}: {}",
944 archive_type, path, e
945 );
946 continue;
947 }
948 };
949
950 let entry_size = entry.size();
951 if entry_size > MAX_FILE_SIZE {
952 warn!(
953 "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
954 archive_type, path, entry_size, MAX_FILE_SIZE
955 );
956 continue;
957 }
958
959 total_extracted += entry_size;
960 if total_extracted > MAX_ARCHIVE_SIZE {
961 warn!(
962 "Total extracted size exceeds limit for {} sdist {:?}",
963 archive_type, path
964 );
965 return None;
966 }
967
968 if compressed_size > 0 {
969 let ratio = total_extracted as f64 / compressed_size as f64;
970 if ratio > MAX_COMPRESSION_RATIO {
971 warn!(
972 "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
973 archive_type, path, ratio
974 );
975 return None;
976 }
977 }
978
979 let entry_path = match entry.path() {
980 Ok(path) => path.to_string_lossy().replace('\\', "/"),
981 Err(e) => {
982 warn!(
983 "Failed to get {} sdist entry path from {:?}: {}",
984 archive_type, path, e
985 );
986 continue;
987 }
988 };
989
990 let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
991 warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
992 continue;
993 };
994
995 if !is_relevant_sdist_text_entry(&entry_path) {
996 continue;
997 }
998
999 if let Ok(content) = read_limited_utf8(
1000 &mut entry,
1001 MAX_FILE_SIZE,
1002 &format!("{} entry {}", archive_type, entry_path),
1003 ) {
1004 entries.push((entry_path, content));
1005 }
1006 }
1007
1008 Some(entries)
1009}
1010
1011fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
1012 let file = match File::open(path) {
1013 Ok(file) => file,
1014 Err(e) => {
1015 warn!("Failed to open zip sdist archive {:?}: {}", path, e);
1016 return default_package_data(path);
1017 }
1018 };
1019
1020 let mut archive = match ZipArchive::new(file) {
1021 Ok(archive) => archive,
1022 Err(e) => {
1023 warn!("Failed to read zip sdist archive {:?}: {}", path, e);
1024 return default_package_data(path);
1025 }
1026 };
1027
1028 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
1029 Ok(entries) => entries,
1030 Err(_) => return default_package_data(path),
1031 };
1032
1033 let mut entries = Vec::new();
1034 for entry in validated_entries.iter() {
1035 if !is_relevant_sdist_text_entry(&entry.name) {
1036 continue;
1037 }
1038
1039 if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
1040 entries.push((entry.name.clone(), content));
1041 }
1042 }
1043
1044 build_sdist_package_data(path, entries)
1045}
1046
/// Returns `true` for archive entries whose path ends with `/PKG-INFO`,
/// `/requires.txt` or `/SOURCES.txt`.
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
1052
1053fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
1054 let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
1055 warn!("No PKG-INFO file found in sdist archive {:?}", path);
1056 return default_package_data(path);
1057 };
1058
1059 let mut package_data =
1060 python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
1061 merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
1062 merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
1063 apply_sdist_name_version_fallback(path, &mut package_data);
1064 package_data.datasource_id = Some(DatasourceId::PypiSdist);
1065 package_data
1066}
1067
1068fn select_sdist_pkginfo_entry(
1069 archive_path: &Path,
1070 entries: &[(String, String)],
1071) -> Option<(String, String)> {
1072 let expected_name = sdist_archive_expected_name(archive_path);
1073
1074 entries
1075 .iter()
1076 .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
1077 .min_by_key(|(entry_path, content)| {
1078 let components: Vec<_> = entry_path
1079 .split('/')
1080 .filter(|part| !part.is_empty())
1081 .collect();
1082 let candidate_name = sdist_pkginfo_candidate_name(content);
1083 let name_rank = if candidate_name == expected_name {
1084 0
1085 } else {
1086 1
1087 };
1088 let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1089
1090 (name_rank, kind_rank, components.len(), entry_path.clone())
1091 })
1092 .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1093}
1094
1095fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1096 let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1097 return false;
1098 };
1099
1100 entries.iter().any(|(entry_path, content)| {
1101 sdist_pkginfo_kind_rank(entry_path) < 3
1102 && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1103 })
1104}
1105
1106fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1107 archive_path
1108 .file_name()
1109 .and_then(|name| name.to_str())
1110 .and_then(strip_python_archive_extension)
1111 .and_then(|stem| {
1112 stem.rsplit_once('-')
1113 .map(|(name, _)| normalize_python_package_name(name))
1114 })
1115}
1116
1117fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1118 let metadata = super::rfc822::parse_rfc822_content(content);
1119 super::rfc822::get_header_first(&metadata.headers, "name")
1120 .map(|name| normalize_python_package_name(&name))
1121}
1122
/// Ranks a `PKG-INFO` entry path by how canonical its location is
/// (lower is better):
///
/// * `0`: `<root>/<something>.egg-info/PKG-INFO`
/// * `1`: `<root>/PKG-INFO`
/// * `2`: any other path ending in `.egg-info/PKG-INFO`
/// * `3`: everything else
///
/// Empty path components are ignored when counting depth.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let components: Vec<&str> = entry_path
        .split('/')
        .filter(|part| !part.is_empty())
        .collect();

    match components.as_slice() {
        [_, egg_info_dir, "PKG-INFO"] if egg_info_dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1140
1141fn merge_sdist_archive_dependencies(
1142 entries: &[(String, String)],
1143 metadata_path: &str,
1144 package_data: &mut PackageData,
1145) {
1146 let metadata_dir = metadata_path
1147 .rsplit_once('/')
1148 .map(|(dir, _)| dir)
1149 .unwrap_or("");
1150 let archive_root = metadata_path.split('/').next().unwrap_or("");
1151 let matched_egg_info_dir =
1152 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1153 let mut extra_dependencies = Vec::new();
1154
1155 for (entry_path, content) in entries {
1156 let is_direct_requires =
1157 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1158 let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1159 entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1160 });
1161
1162 if is_direct_requires || is_egg_info_requires {
1163 extra_dependencies.extend(parse_requires_txt(content));
1164 }
1165 }
1166
1167 for dependency in extra_dependencies {
1168 if !package_data.dependencies.iter().any(|existing| {
1169 existing.purl == dependency.purl
1170 && existing.scope == dependency.scope
1171 && existing.extracted_requirement == dependency.extracted_requirement
1172 && existing.extra_data == dependency.extra_data
1173 }) {
1174 package_data.dependencies.push(dependency);
1175 }
1176 }
1177}
1178
1179fn merge_sdist_archive_file_references(
1180 entries: &[(String, String)],
1181 metadata_path: &str,
1182 package_data: &mut PackageData,
1183) {
1184 let metadata_dir = metadata_path
1185 .rsplit_once('/')
1186 .map(|(dir, _)| dir)
1187 .unwrap_or("");
1188 let archive_root = metadata_path.split('/').next().unwrap_or("");
1189 let matched_egg_info_dir =
1190 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1191 let mut extra_refs = Vec::new();
1192
1193 for (entry_path, content) in entries {
1194 let is_direct_sources =
1195 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1196 let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1197 entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1198 });
1199
1200 if is_direct_sources || is_egg_info_sources {
1201 extra_refs.extend(parse_sources_txt(content));
1202 }
1203 }
1204
1205 for file_ref in extra_refs {
1206 if !package_data
1207 .file_references
1208 .iter()
1209 .any(|existing| existing.path == file_ref.path)
1210 {
1211 package_data.file_references.push(file_ref);
1212 }
1213 }
1214}
1215
1216fn select_matching_sdist_egg_info_dir(
1217 entries: &[(String, String)],
1218 archive_root: &str,
1219 package_name: Option<&str>,
1220) -> Option<String> {
1221 let normalized_package_name = package_name.map(normalize_python_package_name);
1222
1223 entries
1224 .iter()
1225 .filter_map(|(entry_path, _)| {
1226 let components: Vec<_> = entry_path
1227 .split('/')
1228 .filter(|part| !part.is_empty())
1229 .collect();
1230 if components.len() == 3
1231 && components[0] == archive_root
1232 && components[1].ends_with(".egg-info")
1233 {
1234 Some(components[1].to_string())
1235 } else {
1236 None
1237 }
1238 })
1239 .min_by_key(|egg_info_dir| {
1240 let normalized_dir_name =
1241 normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1242 let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1243 0
1244 } else {
1245 1
1246 };
1247
1248 (name_rank, egg_info_dir.clone())
1249 })
1250}
1251
/// Normalizes a Python distribution name for comparison, following the PyPA
/// name-normalization rule (PEP 503): ASCII-lowercase the name and replace
/// every run of `-`, `_` and `.` with a single `-`.
///
/// This makes spellings such as `Zope.Interface`, `zope_interface` and
/// `zope-interface` compare equal, which matters because sdist file names,
/// egg-info directory names and the `Name` header frequently use different
/// separators for the same project.
fn normalize_python_package_name(name: &str) -> String {
    let mut normalized = String::with_capacity(name.len());
    let mut previous_was_separator = false;

    for ch in name.chars() {
        if matches!(ch, '-' | '_' | '.') {
            // Emit one `-` per run of separators.
            if !previous_was_separator {
                normalized.push('-');
            }
            previous_was_separator = true;
        } else {
            normalized.push(ch.to_ascii_lowercase());
            previous_was_separator = false;
        }
    }

    normalized
}
1255
1256fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1257 let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1258 return;
1259 };
1260
1261 let Some(stem) = strip_python_archive_extension(file_name) else {
1262 return;
1263 };
1264
1265 let Some((name, version)) = stem.rsplit_once('-') else {
1266 return;
1267 };
1268
1269 if package_data.name.is_none() {
1270 package_data.name = Some(name.replace('_', "-"));
1271 }
1272 if package_data.version.is_none() {
1273 package_data.version = Some(version.to_string());
1274 }
1275
1276 if package_data.purl.is_none()
1277 || package_data.repository_homepage_url.is_none()
1278 || package_data.repository_download_url.is_none()
1279 || package_data.api_data_url.is_none()
1280 {
1281 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1282 build_pypi_urls(
1283 package_data.name.as_deref(),
1284 package_data.version.as_deref(),
1285 );
1286
1287 if package_data.repository_homepage_url.is_none() {
1288 package_data.repository_homepage_url = repository_homepage_url;
1289 }
1290 if package_data.repository_download_url.is_none() {
1291 package_data.repository_download_url = repository_download_url;
1292 }
1293 if package_data.api_data_url.is_none() {
1294 package_data.api_data_url = api_data_url;
1295 }
1296 if package_data.purl.is_none() {
1297 package_data.purl = purl;
1298 }
1299 }
1300}
1301
1302fn extract_from_wheel_archive(path: &Path) -> PackageData {
1303 let metadata = match std::fs::metadata(path) {
1304 Ok(m) => m,
1305 Err(e) => {
1306 warn!(
1307 "Failed to read metadata for wheel archive {:?}: {}",
1308 path, e
1309 );
1310 return default_package_data(path);
1311 }
1312 };
1313
1314 if metadata.len() > MAX_ARCHIVE_SIZE {
1315 warn!(
1316 "Wheel archive too large: {} bytes (limit: {} bytes)",
1317 metadata.len(),
1318 MAX_ARCHIVE_SIZE
1319 );
1320 return default_package_data(path);
1321 }
1322
1323 let file = match File::open(path) {
1324 Ok(f) => f,
1325 Err(e) => {
1326 warn!("Failed to open wheel archive {:?}: {}", path, e);
1327 return default_package_data(path);
1328 }
1329 };
1330
1331 let mut archive = match ZipArchive::new(file) {
1332 Ok(a) => a,
1333 Err(e) => {
1334 warn!("Failed to read wheel archive {:?}: {}", path, e);
1335 return default_package_data(path);
1336 }
1337 };
1338
1339 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1340 Ok(entries) => entries,
1341 Err(_) => return default_package_data(path),
1342 };
1343
1344 let metadata_entry =
1345 match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1346 Some(entry) => entry,
1347 None => {
1348 warn!("No METADATA file found in wheel archive {:?}", path);
1349 return default_package_data(path);
1350 }
1351 };
1352
1353 let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1354 Ok(c) => c,
1355 Err(e) => {
1356 warn!("Failed to read METADATA from {:?}: {}", path, e);
1357 return default_package_data(path);
1358 }
1359 };
1360
1361 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1362
1363 let (size, sha256) = calculate_file_checksums(path);
1364 package_data.size = size;
1365 package_data.sha256 = sha256;
1366
1367 if let Some(record_entry) =
1368 find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1369 && let Ok(record_content) =
1370 read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1371 {
1372 package_data.file_references = parse_record_csv(&record_content);
1373 }
1374
1375 if let Some(wheel_info) = parse_wheel_filename(path) {
1376 if package_data.name.is_none() {
1377 package_data.name = Some(wheel_info.name.clone());
1378 }
1379 if package_data.version.is_none() {
1380 package_data.version = Some(wheel_info.version.clone());
1381 }
1382
1383 package_data.qualifiers = Some(std::collections::HashMap::from([(
1384 "extension".to_string(),
1385 format!(
1386 "{}-{}-{}",
1387 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1388 ),
1389 )]));
1390
1391 package_data.purl = build_wheel_purl(
1392 package_data.name.as_deref(),
1393 package_data.version.as_deref(),
1394 &wheel_info,
1395 );
1396
1397 let mut extra_data = package_data.extra_data.unwrap_or_default();
1398 extra_data.insert(
1399 "python_requires".to_string(),
1400 serde_json::Value::String(wheel_info.python_tag.clone()),
1401 );
1402 extra_data.insert(
1403 "abi_tag".to_string(),
1404 serde_json::Value::String(wheel_info.abi_tag.clone()),
1405 );
1406 extra_data.insert(
1407 "platform_tag".to_string(),
1408 serde_json::Value::String(wheel_info.platform_tag.clone()),
1409 );
1410 package_data.extra_data = Some(extra_data);
1411 }
1412
1413 package_data
1414}
1415
1416fn extract_from_egg_archive(path: &Path) -> PackageData {
1417 let metadata = match std::fs::metadata(path) {
1418 Ok(m) => m,
1419 Err(e) => {
1420 warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1421 return default_package_data(path);
1422 }
1423 };
1424
1425 if metadata.len() > MAX_ARCHIVE_SIZE {
1426 warn!(
1427 "Egg archive too large: {} bytes (limit: {} bytes)",
1428 metadata.len(),
1429 MAX_ARCHIVE_SIZE
1430 );
1431 return default_package_data(path);
1432 }
1433
1434 let file = match File::open(path) {
1435 Ok(f) => f,
1436 Err(e) => {
1437 warn!("Failed to open egg archive {:?}: {}", path, e);
1438 return default_package_data(path);
1439 }
1440 };
1441
1442 let mut archive = match ZipArchive::new(file) {
1443 Ok(a) => a,
1444 Err(e) => {
1445 warn!("Failed to read egg archive {:?}: {}", path, e);
1446 return default_package_data(path);
1447 }
1448 };
1449
1450 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1451 Ok(entries) => entries,
1452 Err(_) => return default_package_data(path),
1453 };
1454
1455 let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1456 &validated_entries,
1457 &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1458 ) {
1459 Some(entry) => entry,
1460 None => {
1461 warn!("No PKG-INFO file found in egg archive {:?}", path);
1462 return default_package_data(path);
1463 }
1464 };
1465
1466 let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1467 Ok(c) => c,
1468 Err(e) => {
1469 warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1470 return default_package_data(path);
1471 }
1472 };
1473
1474 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1475
1476 let (size, sha256) = calculate_file_checksums(path);
1477 package_data.size = size;
1478 package_data.sha256 = sha256;
1479
1480 if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1481 &validated_entries,
1482 &[
1483 "EGG-INFO/installed-files.txt",
1484 ".egg-info/installed-files.txt",
1485 ],
1486 ) && let Ok(installed_files_content) =
1487 read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1488 {
1489 package_data.file_references = parse_installed_files_txt(&installed_files_content);
1490 }
1491
1492 if let Some(egg_info) = parse_egg_filename(path) {
1493 if package_data.name.is_none() {
1494 package_data.name = Some(egg_info.name.clone());
1495 }
1496 if package_data.version.is_none() {
1497 package_data.version = Some(egg_info.version.clone());
1498 }
1499
1500 if let Some(python_version) = &egg_info.python_version {
1501 let mut extra_data = package_data.extra_data.unwrap_or_default();
1502 extra_data.insert(
1503 "python_version".to_string(),
1504 serde_json::Value::String(python_version.clone()),
1505 );
1506 package_data.extra_data = Some(extra_data);
1507 }
1508 }
1509
1510 package_data.purl = build_egg_purl(
1511 package_data.name.as_deref(),
1512 package_data.version.as_deref(),
1513 );
1514
1515 package_data
1516}
1517
1518fn find_validated_zip_entry_by_suffix<'a>(
1519 entries: &'a [ValidatedZipEntry],
1520 suffix: &str,
1521) -> Option<&'a ValidatedZipEntry> {
1522 entries.iter().find(|entry| entry.name.ends_with(suffix))
1523}
1524
1525fn find_validated_zip_entry_by_any_suffix<'a>(
1526 entries: &'a [ValidatedZipEntry],
1527 suffixes: &[&str],
1528) -> Option<&'a ValidatedZipEntry> {
1529 entries
1530 .iter()
1531 .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1532}
1533
1534fn read_validated_zip_entry<R: Read + std::io::Seek>(
1535 archive: &mut ZipArchive<R>,
1536 entry: &ValidatedZipEntry,
1537 path: &Path,
1538 archive_type: &str,
1539) -> Result<String, String> {
1540 let mut file = archive
1541 .by_index(entry.index)
1542 .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1543
1544 let compressed_size = file.compressed_size();
1545 let uncompressed_size = file.size();
1546
1547 if compressed_size > 0 {
1548 let ratio = uncompressed_size as f64 / compressed_size as f64;
1549 if ratio > MAX_COMPRESSION_RATIO {
1550 return Err(format!(
1551 "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1552 archive_type, path, ratio
1553 ));
1554 }
1555 }
1556
1557 if uncompressed_size > MAX_FILE_SIZE {
1558 return Err(format!(
1559 "Rejected oversized entry in {} {:?}: {} bytes",
1560 archive_type, path, uncompressed_size
1561 ));
1562 }
1563
1564 read_limited_utf8(
1565 &mut file,
1566 MAX_FILE_SIZE,
1567 &format!("{} entry {}", archive_type, entry.name),
1568 )
1569}
1570
/// Reads at most `max_bytes` bytes from `reader` and decodes them as UTF-8.
///
/// `context` is a short description of what is being read; it is embedded in
/// every error message.
///
/// # Errors
///
/// Returns an error message when the underlying read fails, when the stream
/// holds more than `max_bytes` bytes, or when the bytes are not valid UTF-8.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    // Read one byte past the limit so an oversized stream can be detected
    // without draining it. `saturating_add` keeps `max_bytes == u64::MAX`
    // from overflowing (a panic in debug builds, a zero-byte read in release).
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if bytes.len() as u64 > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1591
/// Normalizes an archive entry path to a relative, `/`-separated form.
///
/// Backslashes are treated as separators and `.` components are dropped.
/// Returns `None` for anything that could escape the extraction root: a
/// drive-letter path such as `C:/...`, an absolute path, any `..` component,
/// or a path with no normal components at all.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Drive-letter paths are checked on the raw bytes so they are rejected
    // regardless of how the host platform parses them.
    if let [drive, b':', b'/', ..] = unified.as_bytes() {
        if drive.is_ascii_alphabetic() {
            return None;
        }
    }

    let mut segments: Vec<String> = Vec::new();
    for component in Path::new(&unified).components() {
        match component {
            Component::CurDir => continue,
            Component::Normal(segment) => segments.push(segment.to_string_lossy().into_owned()),
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1613
1614pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1619 let mut reader = ReaderBuilder::new()
1620 .has_headers(false)
1621 .from_reader(content.as_bytes());
1622
1623 let mut file_references = Vec::new();
1624
1625 for result in reader.records() {
1626 match result {
1627 Ok(record) => {
1628 if record.len() < 3 {
1629 continue;
1630 }
1631
1632 let path = record.get(0).unwrap_or("").trim().to_string();
1633 if path.is_empty() {
1634 continue;
1635 }
1636
1637 let hash_field = record.get(1).unwrap_or("").trim();
1638 let size_field = record.get(2).unwrap_or("").trim();
1639
1640 let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1642 let parts: Vec<&str> = hash_field.split('=').collect();
1643 if parts.len() == 2 && parts[0] == "sha256" {
1644 match URL_SAFE_NO_PAD.decode(parts[1]) {
1645 Ok(decoded) => {
1646 let hex = decoded
1647 .iter()
1648 .map(|b| format!("{:02x}", b))
1649 .collect::<String>();
1650 Sha256Digest::from_hex(&hex).ok()
1651 }
1652 Err(_) => None,
1653 }
1654 } else {
1655 None
1656 }
1657 } else {
1658 None
1659 };
1660
1661 let size = if !size_field.is_empty() && size_field != "-" {
1663 size_field.parse::<u64>().ok()
1664 } else {
1665 None
1666 };
1667
1668 file_references.push(FileReference {
1669 path,
1670 size,
1671 sha1: None,
1672 md5: None,
1673 sha256,
1674 sha512: None,
1675 extra_data: None,
1676 });
1677 }
1678 Err(e) => {
1679 warn!("Failed to parse RECORD CSV row: {}", e);
1680 continue;
1681 }
1682 }
1683 }
1684
1685 file_references
1686}
1687
1688pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1691 content
1692 .lines()
1693 .map(|line| line.trim())
1694 .filter(|line| !line.is_empty())
1695 .map(|path| FileReference {
1696 path: path.to_string(),
1697 size: None,
1698 sha1: None,
1699 md5: None,
1700 sha256: None,
1701 sha512: None,
1702 extra_data: None,
1703 })
1704 .collect()
1705}
1706
1707pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1708 content
1709 .lines()
1710 .map(str::trim)
1711 .filter(|line| !line.is_empty())
1712 .map(|path| FileReference {
1713 path: path.to_string(),
1714 size: None,
1715 sha1: None,
1716 md5: None,
1717 sha256: None,
1718 sha512: None,
1719 extra_data: None,
1720 })
1721 .collect()
1722}
1723
/// Components parsed from a wheel file name.
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component of the file name.
    version: String,
    /// Python tag, e.g. `py3` or `cp39`.
    python_tag: String,
    /// ABI tag, e.g. `none` or `cp39`.
    abi_tag: String,
    /// Platform tag, e.g. `any` or `manylinux1_x86_64`.
    platform_tag: String,
}

/// Parses a wheel file name of the form
/// `{name}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl`.
///
/// The optional build tag is recognised by its position (third component of a
/// name with at least six components) and by starting with an ASCII digit, as
/// the wheel specification requires. It is skipped so that it is not mistaken
/// for the python tag. Any surplus trailing components are folded into the
/// platform tag, joined by `-`.
///
/// Returns `None` when the stem has fewer than five `-`-separated components.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    // Python tags never start with a digit, build tags always do.
    let has_build_tag =
        parts.len() >= 6 && parts[2].starts_with(|c: char| c.is_ascii_digit());
    let tags = if has_build_tag {
        &parts[3..]
    } else {
        &parts[2..]
    };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: tags[0].to_string(),
        abi_tag: tags[1].to_string(),
        platform_tag: tags[2..].join("-"),
    })
}
1748
/// Components parsed from an egg file name.
struct EggInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component of the file name.
    version: String,
    /// Third `-`-separated component of the file name, when present.
    python_version: Option<String>,
}

/// Parses an egg file name of the form `{name}-{version}(-{python version})?`
/// followed by the file extension.
///
/// Components after the third are ignored. Returns `None` when the path has
/// no file stem or the stem contains no `-`.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut components = stem.split('-');

    let raw_name = components.next()?;
    let raw_version = components.next()?;

    Some(EggInfo {
        name: raw_name.replace('_', "-"),
        version: raw_version.to_string(),
        python_version: components.next().map(str::to_owned),
    })
}
1769
1770fn build_wheel_purl(
1771 name: Option<&str>,
1772 version: Option<&str>,
1773 wheel_info: &WheelInfo,
1774) -> Option<String> {
1775 let name = name?;
1776 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1777
1778 if let Some(ver) = version {
1779 package_url.with_version(ver).ok()?;
1780 }
1781
1782 let extension = format!(
1783 "{}-{}-{}",
1784 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1785 );
1786 package_url.add_qualifier("extension", extension).ok()?;
1787
1788 Some(package_url.to_string())
1789}
1790
1791fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1792 let name = name?;
1793 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1794
1795 if let Some(ver) = version {
1796 package_url.with_version(ver).ok()?;
1797 }
1798
1799 package_url.add_qualifier("type", "egg").ok()?;
1800
1801 Some(package_url.to_string())
1802}
1803
1804fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1805 let metadata = super::rfc822::parse_rfc822_content(content);
1806 build_package_data_from_rfc822(&metadata, datasource_id)
1807}
1808
1809fn build_package_data_from_rfc822(
1814 metadata: &super::rfc822::Rfc822Metadata,
1815 datasource_id: DatasourceId,
1816) -> PackageData {
1817 use super::rfc822::{get_header_all, get_header_first};
1818
1819 let name = get_header_first(&metadata.headers, "name");
1820 let version = get_header_first(&metadata.headers, "version");
1821 let summary = get_header_first(&metadata.headers, "summary");
1822 let mut homepage_url = get_header_first(&metadata.headers, "home-page");
1823 let author = get_header_first(&metadata.headers, "author");
1824 let author_email = get_header_first(&metadata.headers, "author-email");
1825 let license = get_header_first(&metadata.headers, "license");
1826 let license_expression = get_header_first(&metadata.headers, "license-expression");
1827 let download_url = get_header_first(&metadata.headers, "download-url");
1828 let platform = get_header_first(&metadata.headers, "platform");
1829 let requires_python = get_header_first(&metadata.headers, "requires-python");
1830 let classifiers = get_header_all(&metadata.headers, "classifier");
1831 let license_files = get_header_all(&metadata.headers, "license-file");
1832
1833 let description_body = if metadata.body.is_empty() {
1834 get_header_first(&metadata.headers, "description").unwrap_or_default()
1835 } else {
1836 metadata.body.clone()
1837 };
1838
1839 let description = build_description(summary.as_deref(), &description_body);
1840
1841 let mut parties = Vec::new();
1842 if author.is_some() || author_email.is_some() {
1843 parties.push(Party {
1844 r#type: Some("person".to_string()),
1845 role: Some("author".to_string()),
1846 name: author,
1847 email: author_email,
1848 url: None,
1849 organization: None,
1850 organization_url: None,
1851 timezone: None,
1852 });
1853 }
1854
1855 let (keywords, license_classifiers) = split_classifiers(&classifiers);
1856 let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1857 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1858 license_expression
1859 .as_deref()
1860 .and_then(normalize_spdx_expression)
1861 .map(|normalized| {
1862 build_declared_license_data(
1863 normalized,
1864 DeclaredLicenseMatchMetadata::single_line(
1865 license_expression.as_deref().unwrap_or_default(),
1866 )
1867 .with_referenced_filenames(&referenced_license_files),
1868 )
1869 })
1870 .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1871
1872 let extracted_license_statement = license_expression
1873 .clone()
1874 .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1875
1876 let mut extra_data = HashMap::new();
1877 if let Some(platform_value) = platform
1878 && !platform_value.eq_ignore_ascii_case("unknown")
1879 && !platform_value.is_empty()
1880 {
1881 extra_data.insert(
1882 "platform".to_string(),
1883 serde_json::Value::String(platform_value),
1884 );
1885 }
1886
1887 if let Some(requires_python_value) = requires_python
1888 && !requires_python_value.is_empty()
1889 {
1890 extra_data.insert(
1891 "requires_python".to_string(),
1892 serde_json::Value::String(requires_python_value),
1893 );
1894 }
1895
1896 if !license_files.is_empty() {
1897 extra_data.insert(
1898 "license_files".to_string(),
1899 serde_json::Value::Array(
1900 license_files
1901 .iter()
1902 .cloned()
1903 .map(serde_json::Value::String)
1904 .collect(),
1905 ),
1906 );
1907 }
1908
1909 let file_references = license_files
1910 .iter()
1911 .map(|path| FileReference {
1912 path: path.clone(),
1913 size: None,
1914 sha1: None,
1915 md5: None,
1916 sha256: None,
1917 sha512: None,
1918 extra_data: None,
1919 })
1920 .collect();
1921
1922 let project_urls = get_header_all(&metadata.headers, "project-url");
1923 let dependencies = extract_rfc822_dependencies(&metadata.headers);
1924 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1925
1926 if !project_urls.is_empty() {
1927 let parsed_urls = parse_project_urls(&project_urls);
1928
1929 for (label, url) in &parsed_urls {
1930 let label_lower = label.to_lowercase();
1931
1932 if bug_tracking_url.is_none()
1933 && matches!(
1934 label_lower.as_str(),
1935 "tracker"
1936 | "bug reports"
1937 | "bug tracker"
1938 | "issues"
1939 | "issue tracker"
1940 | "github: issues"
1941 )
1942 {
1943 bug_tracking_url = Some(url.clone());
1944 } else if code_view_url.is_none()
1945 && matches!(label_lower.as_str(), "source" | "source code" | "code")
1946 {
1947 code_view_url = Some(url.clone());
1948 } else if vcs_url.is_none()
1949 && matches!(
1950 label_lower.as_str(),
1951 "github" | "gitlab" | "github: repo" | "repository"
1952 )
1953 {
1954 vcs_url = Some(url.clone());
1955 } else if homepage_url.is_none()
1956 && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1957 {
1958 homepage_url = Some(url.clone());
1959 } else if label_lower == "changelog" {
1960 extra_data.insert(
1961 "changelog_url".to_string(),
1962 serde_json::Value::String(url.clone()),
1963 );
1964 }
1965 }
1966
1967 let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
1968 .iter()
1969 .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
1970 .collect();
1971
1972 if !project_urls_json.is_empty() {
1973 extra_data.insert(
1974 "project_urls".to_string(),
1975 serde_json::Value::Object(project_urls_json),
1976 );
1977 }
1978 }
1979
1980 let extra_data = if extra_data.is_empty() {
1981 None
1982 } else {
1983 Some(extra_data)
1984 };
1985
1986 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1987 build_pypi_urls(name.as_deref(), version.as_deref());
1988
1989 PackageData {
1990 package_type: Some(PythonParser::PACKAGE_TYPE),
1991 namespace: None,
1992 name,
1993 version,
1994 qualifiers: None,
1995 subpath: None,
1996 primary_language: Some("Python".to_string()),
1997 description,
1998 release_date: None,
1999 parties,
2000 keywords,
2001 homepage_url,
2002 download_url,
2003 size: None,
2004 sha1: None,
2005 md5: None,
2006 sha256: None,
2007 sha512: None,
2008 bug_tracking_url,
2009 code_view_url,
2010 vcs_url,
2011 copyright: None,
2012 holder: None,
2013 declared_license_expression,
2014 declared_license_expression_spdx,
2015 license_detections,
2016 other_license_expression: None,
2017 other_license_expression_spdx: None,
2018 other_license_detections: Vec::new(),
2019 extracted_license_statement,
2020 notice_text: None,
2021 source_packages: Vec::new(),
2022 file_references,
2023 is_private: false,
2024 is_virtual: false,
2025 extra_data,
2026 dependencies,
2027 repository_homepage_url,
2028 repository_download_url,
2029 api_data_url,
2030 datasource_id: Some(datasource_id),
2031 purl,
2032 }
2033}
2034
/// Splits `Project-URL` header values of the form `Label, URL` into
/// `(label, url)` pairs, preserving input order.
///
/// The separator is `", "` when the entry contains one. Entries written
/// without the space (`Label,URL`) are split at their first comma instead of
/// being dropped. Entries with no comma, or whose label or URL is empty after
/// trimming, are skipped.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|url_entry| {
            // Only fall back to a bare comma when ", " is absent, so entries
            // that already parsed keep exactly the same label/url split.
            let (label, url) = url_entry
                .split_once(", ")
                .or_else(|| url_entry.split_once(','))?;

            let label = label.trim();
            let url = url.trim();
            if label.is_empty() || url.is_empty() {
                return None;
            }

            Some((label.to_string(), url.to_string()))
        })
        .collect()
}
2050
/// Combines an optional summary and a body into one description.
///
/// Both parts are trimmed; blank parts are dropped. When both remain they are
/// joined with a single newline, summary first. Returns `None` when nothing
/// remains.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let summary = summary.map(str::trim).filter(|text| !text.is_empty());
    let body = Some(body.trim()).filter(|text| !text.is_empty());

    match (summary, body) {
        (Some(summary), Some(body)) => Some(format!("{summary}\n{body}")),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
2069
/// Splits trove classifiers into `(keywords, license_classifiers)`.
///
/// Classifiers starting with `License ::` go to the second list, all others
/// to the first. Relative order is preserved within each list.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|classifier| classifier.starts_with("License ::"));

    (keywords, license_classifiers)
}
2084
/// Renders the declared license and license classifiers as a newline
/// terminated, YAML-like text block.
///
/// A non-blank `license` value produces a `license: <value>` line; a
/// non-empty classifier list produces a `classifiers:` line followed by one
/// quoted list item per classifier. Returns `None` when neither is present.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let mut statement = String::new();

    if let Some(value) = license.map(str::trim).filter(|value| !value.is_empty()) {
        statement.push_str(&format!("license: {}", value));
        statement.push('\n');
    }

    if !license_classifiers.is_empty() {
        statement.push_str("classifiers:");
        statement.push('\n');
        for classifier in license_classifiers {
            statement.push_str(&format!(" - '{}'", classifier));
            statement.push('\n');
        }
    }

    // Every emitted line ends in a newline, so an empty buffer means that
    // nothing was emitted at all.
    if statement.is_empty() {
        None
    } else {
        Some(statement)
    }
}
2110
2111pub(crate) fn build_pypi_urls(
2112 name: Option<&str>,
2113 version: Option<&str>,
2114) -> (
2115 Option<String>,
2116 Option<String>,
2117 Option<String>,
2118 Option<String>,
2119) {
2120 let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2121
2122 let repository_download_url = name.and_then(|value| {
2123 version.map(|ver| {
2124 format!(
2125 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2126 &value[..1.min(value.len())],
2127 value,
2128 value,
2129 ver
2130 )
2131 })
2132 });
2133
2134 let api_data_url = name.map(|value| {
2135 if let Some(ver) = version {
2136 format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2137 } else {
2138 format!("https://pypi.org/pypi/{}/json", value)
2139 }
2140 });
2141
2142 let purl = name.and_then(|value| {
2143 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2144 if let Some(ver) = version {
2145 package_url.with_version(ver).ok()?;
2146 }
2147 Some(package_url.to_string())
2148 });
2149
2150 (
2151 repository_homepage_url,
2152 repository_download_url,
2153 api_data_url,
2154 purl,
2155 )
2156}
2157
2158fn build_pypi_purl_with_extension(
2159 name: &str,
2160 version: Option<&str>,
2161 extension: &str,
2162) -> Option<String> {
2163 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2164 if let Some(ver) = version {
2165 package_url.with_version(ver).ok()?;
2166 }
2167 package_url.add_qualifier("extension", extension).ok()?;
2168 Some(package_url.to_string())
2169}
2170
2171fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2172 let toml_content = match read_toml_file(path) {
2173 Ok(content) => content,
2174 Err(e) => {
2175 warn!(
2176 "Failed to read or parse pyproject.toml at {:?}: {}",
2177 path, e
2178 );
2179 return default_package_data(path);
2180 }
2181 };
2182
2183 let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2184 let is_poetry_pyproject = tool_table
2185 .and_then(|tool| tool.get("poetry"))
2186 .and_then(|value| value.as_table())
2187 .is_some();
2188
2189 let project_table =
2191 if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2192 project.clone()
2194 } else if let Some(tool) = tool_table {
2195 if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2196 poetry.clone()
2198 } else {
2199 return default_package_data(path);
2200 }
2201 } else if toml_content.get(FIELD_NAME).is_some() {
2202 match toml_content.as_table() {
2204 Some(table) => table.clone(),
2205 None => {
2206 warn!("Failed to convert TOML content to table in {:?}", path);
2207 return default_package_data(path);
2208 }
2209 }
2210 } else {
2211 return default_package_data(path);
2212 };
2213
2214 let name = project_table
2215 .get(FIELD_NAME)
2216 .and_then(|v| v.as_str())
2217 .map(String::from);
2218
2219 let version = project_table
2220 .get(FIELD_VERSION)
2221 .and_then(|v| v.as_str())
2222 .map(String::from);
2223 let classifiers = project_table
2224 .get("classifiers")
2225 .and_then(|value| value.as_array())
2226 .map(|values| {
2227 values
2228 .iter()
2229 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2230 .collect::<Vec<_>>()
2231 })
2232 .unwrap_or_default();
2233 let (classifier_keywords, license_classifiers) = split_classifiers(&classifiers);
2234
2235 let extracted_license_statement = extract_raw_license_string(&project_table);
2236 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2237 normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2238
2239 let description = project_table
2240 .get(FIELD_DESCRIPTION)
2241 .and_then(|value| value.as_str())
2242 .map(|value| value.to_string());
2243 let mut keywords = project_table
2244 .get(FIELD_KEYWORDS)
2245 .and_then(|value| value.as_array())
2246 .map(|values| {
2247 values
2248 .iter()
2249 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2250 .collect::<Vec<_>>()
2251 })
2252 .unwrap_or_default();
2253 for classifier in classifier_keywords {
2254 if !keywords.contains(&classifier) {
2255 keywords.push(classifier);
2256 }
2257 }
2258
2259 let mut extra_data = extract_pyproject_extra_data(&toml_content).unwrap_or_default();
2261 let (homepage_url, download_url, bug_tracking_url, code_view_url, repository_url) =
2262 extract_urls(&project_table, &mut extra_data);
2263
2264 let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2265
2266 let purl = name.as_ref().and_then(|n| {
2268 let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2269 Ok(p) => p,
2270 Err(e) => {
2271 warn!(
2272 "Failed to create PackageUrl for Python package '{}': {}",
2273 n, e
2274 );
2275 return None;
2276 }
2277 };
2278
2279 if let Some(v) = &version
2280 && let Err(e) = package_url.with_version(v)
2281 {
2282 warn!(
2283 "Failed to set version '{}' for Python package '{}': {}",
2284 v, n, e
2285 );
2286 return None;
2287 }
2288
2289 Some(package_url.to_string())
2290 });
2291
2292 let api_data_url = name.as_ref().map(|n| {
2293 if let Some(v) = &version {
2294 format!("https://pypi.org/pypi/{}/{}/json", n, v)
2295 } else {
2296 format!("https://pypi.org/pypi/{}/json", n)
2297 }
2298 });
2299
2300 let pypi_homepage_url = name
2301 .as_ref()
2302 .map(|n| format!("https://pypi.org/project/{}", n));
2303
2304 let pypi_download_url = name.as_ref().and_then(|n| {
2305 version.as_ref().map(|v| {
2306 format!(
2307 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2308 &n[..1.min(n.len())],
2309 n,
2310 n,
2311 v
2312 )
2313 })
2314 });
2315
2316 PackageData {
2317 package_type: Some(PythonParser::PACKAGE_TYPE),
2318 namespace: None,
2319 name,
2320 version,
2321 qualifiers: None,
2322 subpath: None,
2323 primary_language: None,
2324 description,
2325 release_date: None,
2326 parties: extract_parties(&project_table),
2327 keywords,
2328 homepage_url: homepage_url.or(pypi_homepage_url),
2329 download_url: download_url
2330 .or_else(|| repository_url.clone())
2331 .or(pypi_download_url),
2332 size: None,
2333 sha1: None,
2334 md5: None,
2335 sha256: None,
2336 sha512: None,
2337 bug_tracking_url,
2338 code_view_url,
2339 vcs_url: repository_url,
2340 copyright: None,
2341 holder: None,
2342 declared_license_expression,
2343 declared_license_expression_spdx,
2344 license_detections,
2345 other_license_expression: None,
2346 other_license_expression_spdx: None,
2347 other_license_detections: Vec::new(),
2348 extracted_license_statement: extracted_license_statement
2349 .or_else(|| build_extracted_license_statement(None, &license_classifiers)),
2350 notice_text: None,
2351 source_packages: Vec::new(),
2352 file_references: Vec::new(),
2353 is_private: has_private_classifier(&classifiers),
2354 is_virtual: false,
2355 extra_data: if extra_data.is_empty() {
2356 None
2357 } else {
2358 Some(extra_data)
2359 },
2360 dependencies: [dependencies, optional_dependencies].concat(),
2361 repository_homepage_url: None,
2362 repository_download_url: None,
2363 api_data_url,
2364 datasource_id: Some(if is_poetry_pyproject {
2365 DatasourceId::PypiPoetryPyprojectToml
2366 } else {
2367 DatasourceId::PypiPyprojectToml
2368 }),
2369 purl,
2370 }
2371}
2372
2373fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2374 let path_str = path.to_string_lossy().replace('\\', "/");
2375 if path_str.contains("/EGG-INFO/PKG-INFO") {
2376 DatasourceId::PypiEggPkginfo
2377 } else if path_str.ends_with(".egg-info/PKG-INFO") {
2378 DatasourceId::PypiEditableEggPkginfo
2379 } else {
2380 DatasourceId::PypiSdistPkginfo
2381 }
2382}
2383
2384fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2385 project
2386 .get(FIELD_LICENSE)
2387 .and_then(|license_value| match license_value {
2388 TomlValue::String(license_str) => Some(license_str.clone()),
2389 TomlValue::Table(license_table) => license_table
2390 .get("text")
2391 .and_then(|v| v.as_str())
2392 .map(|s| s.to_string())
2393 .or_else(|| {
2394 license_table
2395 .get("expression")
2396 .and_then(|v| v.as_str())
2397 .map(|expr| expr.to_string())
2398 }),
2399 _ => None,
2400 })
2401}
2402
2403fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2404 match project.get(FIELD_LICENSE) {
2405 Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2406 Some(TomlValue::Table(license_table)) => license_table
2407 .get("expression")
2408 .and_then(|value| value.as_str()),
2409 _ => None,
2410 }
2411}
2412
2413fn extract_urls(
2414 project: &TomlMap<String, TomlValue>,
2415 extra_data: &mut HashMap<String, serde_json::Value>,
2416) -> ProjectUrls {
2417 let mut homepage_url = None;
2418 let mut download_url = None;
2419 let mut bug_tracking_url = None;
2420 let mut code_view_url = None;
2421 let mut repository_url = None;
2422
2423 if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2425 let parsed_urls: Vec<(String, String)> = urls
2426 .iter()
2427 .filter_map(|(label, value)| {
2428 value
2429 .as_str()
2430 .map(|url| (label.to_string(), url.to_string()))
2431 })
2432 .collect();
2433 apply_project_url_mappings(
2434 &parsed_urls,
2435 &mut homepage_url,
2436 &mut bug_tracking_url,
2437 &mut code_view_url,
2438 &mut repository_url,
2439 extra_data,
2440 );
2441
2442 download_url = urls
2443 .get("Downloads")
2444 .or_else(|| urls.get("downloads"))
2445 .and_then(|v| v.as_str())
2446 .map(String::from);
2447
2448 if homepage_url.is_none() {
2449 homepage_url = urls
2450 .get(FIELD_HOMEPAGE)
2451 .and_then(|v| v.as_str())
2452 .map(String::from);
2453 }
2454 if repository_url.is_none() {
2455 repository_url = urls
2456 .get(FIELD_REPOSITORY)
2457 .and_then(|v| v.as_str())
2458 .map(String::from);
2459 }
2460 }
2461
2462 if homepage_url.is_none() {
2464 homepage_url = project
2465 .get(FIELD_HOMEPAGE)
2466 .and_then(|v| v.as_str())
2467 .map(String::from);
2468 }
2469
2470 if repository_url.is_none() {
2471 repository_url = project
2472 .get(FIELD_REPOSITORY)
2473 .and_then(|v| v.as_str())
2474 .map(String::from);
2475 }
2476
2477 (
2478 homepage_url,
2479 download_url,
2480 bug_tracking_url,
2481 code_view_url,
2482 repository_url,
2483 )
2484}
2485
2486fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2487 let mut parties = Vec::new();
2488
2489 if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2490 for author in authors {
2491 if let Some(author_str) = author.as_str() {
2492 let (name, email) = split_name_email(author_str);
2493 parties.push(Party {
2494 r#type: None,
2495 role: Some("author".to_string()),
2496 name,
2497 email,
2498 url: None,
2499 organization: None,
2500 organization_url: None,
2501 timezone: None,
2502 });
2503 } else if let Some(author_table) = author.as_table() {
2504 let name = author_table
2505 .get("name")
2506 .and_then(|value| value.as_str())
2507 .map(|value| value.to_string());
2508 let email = author_table
2509 .get("email")
2510 .and_then(|value| value.as_str())
2511 .map(|value| value.to_string());
2512 if name.is_some() || email.is_some() {
2513 parties.push(Party {
2514 r#type: None,
2515 role: Some("author".to_string()),
2516 name,
2517 email,
2518 url: None,
2519 organization: None,
2520 organization_url: None,
2521 timezone: None,
2522 });
2523 }
2524 }
2525 }
2526 }
2527
2528 if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2529 for maintainer in maintainers {
2530 if let Some(maintainer_str) = maintainer.as_str() {
2531 let (name, email) = split_name_email(maintainer_str);
2532 parties.push(Party {
2533 r#type: None,
2534 role: Some("maintainer".to_string()),
2535 name,
2536 email,
2537 url: None,
2538 organization: None,
2539 organization_url: None,
2540 timezone: None,
2541 });
2542 } else if let Some(maintainer_table) = maintainer.as_table() {
2543 let name = maintainer_table
2544 .get("name")
2545 .and_then(|value| value.as_str())
2546 .map(|value| value.to_string());
2547 let email = maintainer_table
2548 .get("email")
2549 .and_then(|value| value.as_str())
2550 .map(|value| value.to_string());
2551 if name.is_some() || email.is_some() {
2552 parties.push(Party {
2553 r#type: None,
2554 role: Some("maintainer".to_string()),
2555 name,
2556 email,
2557 url: None,
2558 organization: None,
2559 organization_url: None,
2560 timezone: None,
2561 });
2562 }
2563 }
2564 }
2565 }
2566
2567 parties
2568}
2569
2570fn extract_dependencies(
2571 project: &TomlMap<String, TomlValue>,
2572 toml_content: &TomlValue,
2573) -> (Vec<Dependency>, Vec<Dependency>) {
2574 let mut dependencies = Vec::new();
2575 let mut optional_dependencies = Vec::new();
2576
2577 if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2579 match deps_value {
2580 TomlValue::Array(arr) => {
2581 dependencies = parse_dependency_array(arr, false, None);
2582 }
2583 TomlValue::Table(table) => {
2584 dependencies = parse_dependency_table(table, false, None);
2585 }
2586 _ => {}
2587 }
2588 }
2589
2590 if let Some(opt_deps_table) = project
2592 .get(FIELD_OPTIONAL_DEPENDENCIES)
2593 .and_then(|v| v.as_table())
2594 {
2595 for (extra_name, deps) in opt_deps_table {
2596 match deps {
2597 TomlValue::Array(arr) => {
2598 optional_dependencies.extend(parse_dependency_array(
2599 arr,
2600 true,
2601 Some(extra_name),
2602 ));
2603 }
2604 TomlValue::Table(table) => {
2605 optional_dependencies.extend(parse_dependency_table(
2606 table,
2607 true,
2608 Some(extra_name),
2609 ));
2610 }
2611 _ => {}
2612 }
2613 }
2614 }
2615
2616 if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2618 match dev_deps_value {
2619 TomlValue::Array(arr) => {
2620 optional_dependencies.extend(parse_dependency_array(
2621 arr,
2622 true,
2623 Some(FIELD_DEV_DEPENDENCIES),
2624 ));
2625 }
2626 TomlValue::Table(table) => {
2627 optional_dependencies.extend(parse_dependency_table(
2628 table,
2629 true,
2630 Some(FIELD_DEV_DEPENDENCIES),
2631 ));
2632 }
2633 _ => {}
2634 }
2635 }
2636
2637 if let Some(groups_table) = toml_content
2639 .get("tool")
2640 .and_then(|value| value.as_table())
2641 .and_then(|tool| tool.get("poetry"))
2642 .and_then(|value| value.as_table())
2643 .and_then(|poetry| poetry.get("group"))
2644 .and_then(|value| value.as_table())
2645 {
2646 for (group_name, group_data) in groups_table {
2647 if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2648 match group_deps {
2649 TomlValue::Array(arr) => {
2650 optional_dependencies.extend(parse_dependency_array(
2651 arr,
2652 true,
2653 Some(group_name),
2654 ));
2655 }
2656 TomlValue::Table(table) => {
2657 optional_dependencies.extend(parse_poetry_group_dependency_table(
2658 table,
2659 true,
2660 Some(group_name),
2661 ));
2662 }
2663 _ => {}
2664 }
2665 }
2666 }
2667 }
2668
2669 if let Some(groups_table) = toml_content
2670 .get(FIELD_DEPENDENCY_GROUPS)
2671 .and_then(|value| value.as_table())
2672 {
2673 for (group_name, deps) in groups_table {
2674 match deps {
2675 TomlValue::Array(arr) => {
2676 optional_dependencies.extend(parse_dependency_array(
2677 arr,
2678 true,
2679 Some(group_name),
2680 ));
2681 }
2682 TomlValue::Table(table) => {
2683 optional_dependencies.extend(parse_dependency_table(
2684 table,
2685 true,
2686 Some(group_name),
2687 ));
2688 }
2689 _ => {}
2690 }
2691 }
2692 }
2693
2694 if let Some(dev_deps_value) = toml_content
2695 .get("tool")
2696 .and_then(|value| value.as_table())
2697 .and_then(|tool| tool.get("uv"))
2698 .and_then(|value| value.as_table())
2699 .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2700 {
2701 match dev_deps_value {
2702 TomlValue::Array(arr) => {
2703 optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2704 }
2705 TomlValue::Table(table) => {
2706 optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2707 }
2708 _ => {}
2709 }
2710 }
2711
2712 (dependencies, optional_dependencies)
2713}
2714
2715fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2716 let mut extra_data = HashMap::new();
2717
2718 if let Some(tool_uv) = toml_content
2719 .get("tool")
2720 .and_then(|value| value.as_table())
2721 .and_then(|tool| tool.get("uv"))
2722 {
2723 extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2724 }
2725
2726 if extra_data.is_empty() {
2727 None
2728 } else {
2729 Some(extra_data)
2730 }
2731}
2732
2733fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2734 match value {
2735 TomlValue::String(value) => JsonValue::String(value.clone()),
2736 TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2737 TomlValue::Float(value) => JsonValue::String(value.to_string()),
2738 TomlValue::Boolean(value) => JsonValue::Bool(*value),
2739 TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2740 TomlValue::Array(values) => {
2741 JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2742 }
2743 TomlValue::Table(values) => JsonValue::Object(
2744 values
2745 .iter()
2746 .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2747 .collect::<JsonMap<String, JsonValue>>(),
2748 ),
2749 }
2750}
2751
2752fn parse_dependency_table(
2753 table: &TomlMap<String, TomlValue>,
2754 is_optional: bool,
2755 scope: Option<&str>,
2756) -> Vec<Dependency> {
2757 table
2758 .iter()
2759 .filter_map(|(name, version)| {
2760 let version_str = version.as_str().map(|s| s.to_string());
2761 let mut package_url =
2762 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2763
2764 if let Some(v) = &version_str {
2765 package_url.with_version(v).ok()?;
2766 }
2767
2768 Some(Dependency {
2769 purl: Some(package_url.to_string()),
2770 extracted_requirement: None,
2771 scope: scope.map(|s| s.to_string()),
2772 is_runtime: Some(!is_optional),
2773 is_optional: Some(is_optional),
2774 is_pinned: None,
2775 is_direct: Some(true),
2776 resolved_package: None,
2777 extra_data: None,
2778 })
2779 })
2780 .collect()
2781}
2782
2783fn parse_poetry_group_dependency_table(
2784 table: &TomlMap<String, TomlValue>,
2785 is_optional: bool,
2786 scope: Option<&str>,
2787) -> Vec<Dependency> {
2788 table
2789 .iter()
2790 .filter_map(|(name, value)| build_poetry_group_dependency(name, value, is_optional, scope))
2791 .collect()
2792}
2793
2794fn build_poetry_group_dependency(
2795 name: &str,
2796 value: &TomlValue,
2797 is_optional: bool,
2798 scope: Option<&str>,
2799) -> Option<Dependency> {
2800 let normalized_name = normalize_python_dependency_name(name);
2801 let (version_spec, extras, marker) = match value {
2802 TomlValue::String(spec) => (Some(spec.trim().to_string()), Vec::new(), None),
2803 TomlValue::Table(table) => {
2804 let version_spec = table
2805 .get(FIELD_VERSION)
2806 .and_then(|value| value.as_str())
2807 .map(str::trim)
2808 .filter(|value| !value.is_empty())
2809 .map(ToOwned::to_owned);
2810 let extras = table
2811 .get(FIELD_EXTRAS)
2812 .and_then(|value| value.as_array())
2813 .map(|values| {
2814 values
2815 .iter()
2816 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2817 .collect::<Vec<_>>()
2818 })
2819 .unwrap_or_default();
2820 let marker = table
2821 .get("markers")
2822 .and_then(|value| value.as_str())
2823 .map(str::trim)
2824 .filter(|value| !value.is_empty())
2825 .map(ToOwned::to_owned);
2826
2827 (version_spec, extras, marker)
2828 }
2829 _ => return None,
2830 };
2831
2832 let pinned_version = version_spec
2833 .as_deref()
2834 .and_then(extract_exact_pinned_version);
2835 let purl = build_python_dependency_purl(&normalized_name, pinned_version.as_deref())?;
2836
2837 let mut extra_data = HashMap::new();
2838 if let Some(marker) = marker {
2839 extra_data.insert("marker".to_string(), JsonValue::String(marker));
2840 }
2841 if !extras.is_empty() {
2842 extra_data.insert(
2843 "extras".to_string(),
2844 JsonValue::Array(extras.into_iter().map(JsonValue::String).collect()),
2845 );
2846 }
2847
2848 Some(Dependency {
2849 purl: Some(purl),
2850 extracted_requirement: version_spec,
2851 scope: scope.map(|value| value.to_string()),
2852 is_runtime: Some(!is_optional),
2853 is_optional: Some(is_optional),
2854 is_pinned: Some(pinned_version.is_some()),
2855 is_direct: Some(true),
2856 resolved_package: None,
2857 extra_data: if extra_data.is_empty() {
2858 None
2859 } else {
2860 Some(extra_data)
2861 },
2862 })
2863}
2864
2865fn parse_dependency_array(
2866 array: &[TomlValue],
2867 is_optional: bool,
2868 scope: Option<&str>,
2869) -> Vec<Dependency> {
2870 array
2871 .iter()
2872 .filter_map(|dep| {
2873 let dep_str = dep.as_str()?;
2874 build_pyproject_array_dependency(dep_str, is_optional, scope)
2875 })
2876 .collect()
2877}
2878
2879fn build_pyproject_array_dependency(
2880 dep_str: &str,
2881 is_optional: bool,
2882 scope: Option<&str>,
2883) -> Option<Dependency> {
2884 let parsed = parse_pep508_requirement(dep_str)?;
2885 let name = normalize_python_package_name(&parsed.name);
2886 let pinned_version = parsed
2887 .specifiers
2888 .as_deref()
2889 .and_then(extract_exact_pinned_version);
2890
2891 let purl = build_python_dependency_purl(&name, pinned_version.as_deref())?;
2892
2893 let mut extra_data = HashMap::new();
2894 if let Some(marker) = parsed.marker {
2895 extra_data.insert("marker".to_string(), JsonValue::String(marker));
2896 }
2897 if !parsed.extras.is_empty() {
2898 extra_data.insert(
2899 "extras".to_string(),
2900 JsonValue::Array(parsed.extras.into_iter().map(JsonValue::String).collect()),
2901 );
2902 }
2903
2904 let extracted_requirement = parsed.specifiers.or(parsed.url);
2905
2906 Some(Dependency {
2907 purl: Some(purl),
2908 extracted_requirement: extracted_requirement.clone(),
2909 scope: scope.map(|s| s.to_string()),
2910 is_runtime: Some(!is_optional),
2911 is_optional: Some(is_optional),
2912 is_pinned: Some(pinned_version.is_some()),
2913 is_direct: Some(true),
2914 resolved_package: None,
2915 extra_data: if extra_data.is_empty() {
2916 None
2917 } else {
2918 Some(extra_data)
2919 },
2920 })
2921}
2922
/// Returns the version a specifier string pins exactly, if it does.
///
/// A specifier counts as an exact pin when it is a single clause (no `,`)
/// using `==` or `===` followed by a non-empty version. Surrounding
/// whitespace is ignored.
///
/// `==` clauses ending in `.*` (for example `==1.2.*`) are PEP 440 *prefix
/// matches* that accept a whole release series, so they are not treated as
/// pins. `===` compares the version as an arbitrary literal string, so its
/// operand is returned unchanged.
///
/// Returns `None` for every other operator, for multi-clause specifiers and
/// for an empty version.
fn extract_exact_pinned_version(specifiers: &str) -> Option<String> {
    let trimmed = specifiers.trim();
    if trimmed.contains(',') {
        return None;
    }

    // `===` has to be tried first: `==` is a prefix of it.
    let version = if let Some(literal) = trimmed.strip_prefix("===") {
        literal.trim()
    } else {
        let candidate = trimmed.strip_prefix("==")?.trim();
        if candidate.ends_with(".*") {
            // Prefix match, not a single concrete version.
            return None;
        }
        candidate
    };

    (!version.is_empty()).then(|| version.to_string())
}
2944
/// A statically evaluated Python literal, as produced by `LiteralEvaluator`.
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// An integer or float literal; both are stored as `f64`.
    Number(f64),
    /// `True` / `False`.
    Bool(bool),
    /// Python's `None`.
    None,
    /// A list literal with every element evaluated.
    List(Vec<Value>),
    /// A tuple literal with every element evaluated.
    Tuple(Vec<Value>),
    /// A dictionary keyed by strings; stored in a `HashMap`, so the source
    /// order of entries is not retained.
    Dict(HashMap<String, Value>),
}
2955
/// Evaluates a restricted subset of Python expressions to `Value`s, with
/// limits on nesting depth and on the total number of visited nodes.
struct LiteralEvaluator {
    /// Known name -> value bindings, consulted when a bare name is evaluated.
    constants: HashMap<String, Value>,
    /// Evaluation stops (yielding `None`) once `depth` reaches this value.
    max_depth: usize,
    /// Evaluation stops (yielding `None`) once this many nodes were visited.
    max_nodes: usize,
    /// Running count of visited nodes; shared across all `evaluate_expr` calls
    /// made on this evaluator.
    nodes_visited: usize,
}
2962
impl LiteralEvaluator {
    /// Creates an evaluator seeded with `constants`, using the module-level
    /// `MAX_SETUP_PY_AST_DEPTH` / `MAX_SETUP_PY_AST_NODES` limits and a fresh
    /// node counter.
    fn new(constants: HashMap<String, Value>) -> Self {
        Self {
            constants,
            max_depth: MAX_SETUP_PY_AST_DEPTH,
            max_nodes: MAX_SETUP_PY_AST_NODES,
            nodes_visited: 0,
        }
    }

    /// Registers (or overwrites) the value bound to `name`.
    fn insert_constant(&mut self, name: String, value: Value) {
        self.constants.insert(name, value);
    }

    /// Evaluates `expr` to a `Value`, or returns `None` when it cannot.
    ///
    /// Supported forms: string / boolean / number / `None` literals, names
    /// previously registered as constants, list / tuple / dict displays,
    /// `OrderedDict(...)` / `collections.OrderedDict(...)` calls without
    /// keyword arguments, and `dict(key=value, ...)` calls without positional
    /// arguments.
    ///
    /// `None` is returned when the depth or node budget is exhausted, for any
    /// unsupported expression, for unknown names, and whenever any nested
    /// element fails to evaluate (containers are all-or-nothing).
    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
        // Budget check happens before the node is counted.
        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
            return None;
        }
        self.nodes_visited += 1;

        match expr {
            ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
                Some(Value::String(value.to_str().to_string()))
            }
            ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
                Some(Value::Bool(*value))
            }
            ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
                self.evaluate_number(value)
            }
            ast::Expr::NoneLiteral(_) => Some(Value::None),
            // Names resolve only through previously registered constants.
            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
            ast::Expr::List(ast::ExprList { elts, .. }) => {
                let mut values = Vec::new();
                for elt in elts {
                    values.push(self.evaluate_expr(elt, depth + 1)?);
                }
                Some(Value::List(values))
            }
            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
                let mut values = Vec::new();
                for elt in elts {
                    values.push(self.evaluate_expr(elt, depth + 1)?);
                }
                Some(Value::Tuple(values))
            }
            ast::Expr::Dict(ast::ExprDict { items, .. }) => {
                let mut dict = HashMap::new();
                for item in items {
                    // An item without a key expression makes the whole dict
                    // unevaluable.
                    let key_expr = item.key.as_ref()?;
                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
                    let key = value_to_string(&key_value)?;
                    let value = self.evaluate_expr(&item.value, depth + 1)?;
                    dict.insert(key, value);
                }
                Some(Value::Dict(dict))
            }
            ast::Expr::Call(ast::ExprCall {
                func, arguments, ..
            }) => {
                let args = arguments.args.as_ref();
                let keywords = arguments.keywords.as_ref();
                // OrderedDict(<pairs>) — only the positional form is handled.
                if keywords.is_empty()
                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
                {
                    return self.evaluate_ordered_dict(args, depth + 1);
                }

                // Every remaining supported call takes keyword arguments only.
                if !args.is_empty() {
                    return None;
                }

                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
                    && id == "dict"
                {
                    let mut dict = HashMap::new();
                    for keyword in keywords {
                        // A keyword without a name aborts the evaluation.
                        let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
                        dict.insert(key.to_string(), value);
                    }
                    return Some(Value::Dict(dict));
                }

                None
            }
            _ => None,
        }
    }

    /// Converts a numeric literal to `Value::Number`.
    ///
    /// Integers are rendered to text and re-parsed as `f64`; floats are used
    /// directly; complex literals are not supported and yield `None`.
    fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
        match number {
            ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
            ast::Number::Float(value) => Some(Value::Number(*value)),
            ast::Number::Complex { .. } => None,
        }
    }

    /// Evaluates the arguments of an `OrderedDict(...)` call to a `Value::Dict`.
    ///
    /// Exactly one positional argument is required, and it must evaluate to a
    /// list or tuple whose items are all 2-element tuples `(key, value)` with
    /// a key accepted by `value_to_string`. Anything else yields `None`. The
    /// result is stored in a `HashMap`.
    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
        if args.len() != 1 {
            return None;
        }

        let items = match self.evaluate_expr(&args[0], depth)? {
            Value::List(items) | Value::Tuple(items) => items,
            _ => return None,
        };

        let mut dict = HashMap::new();
        for item in items {
            let Value::Tuple(values) = item else {
                return None;
            };
            if values.len() != 2 {
                return None;
            }
            let key = value_to_string(&values[0])?;
            dict.insert(key, values[1].clone());
        }

        Some(Value::Dict(dict))
    }
}
3087
/// Names through which a setup.py can reach the `setup` function.
#[derive(Default)]
struct SetupAliases {
    /// Local names bound to `setup` itself: always `setup`, plus any
    /// `from <setup module> import setup as <alias>`.
    setup_names: HashSet<String>,
    /// Local name -> module name for `import <setup module> [as <alias>]`.
    module_aliases: HashMap<String, String>,
}
3093
3094fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
3095 extract_from_setup_py(path).into_iter().collect()
3096}
3097
3098fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
3099 let content = match read_file_to_string(path) {
3100 Ok(content) => content,
3101 Err(e) => {
3102 warn!("Failed to read setup.py at {:?}: {}", path, e);
3103 return Some(default_package_data(path));
3104 }
3105 };
3106
3107 if content.len() > MAX_SETUP_PY_BYTES {
3108 warn!("setup.py too large at {:?}: {} bytes", path, content.len());
3109 let package_data = extract_from_setup_py_regex(&content);
3110 return should_emit_setup_py_package(&package_data).then_some(package_data);
3111 }
3112
3113 let mut package_data = match extract_from_setup_py_ast(&content) {
3114 Ok(Some(data)) => data,
3115 Ok(None) => return Some(default_package_data(path)),
3116 Err(e) => {
3117 warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
3118 extract_from_setup_py_regex(&content)
3119 }
3120 };
3121
3122 if package_data.name.is_none() {
3123 package_data.name = extract_setup_value(&content, "name");
3124 }
3125
3126 if package_data.version.is_none() {
3127 package_data.version = extract_setup_value(&content, "version");
3128 }
3129
3130 if package_data
3131 .version
3132 .as_deref()
3133 .is_some_and(|version| version.trim().is_empty())
3134 {
3135 package_data.version = None;
3136 }
3137
3138 fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
3139 package_data.purl = build_setup_py_purl(
3140 package_data.name.as_deref(),
3141 package_data.version.as_deref(),
3142 );
3143
3144 if should_emit_setup_py_package(&package_data) {
3145 Some(package_data)
3146 } else {
3147 Some(default_package_data(path))
3148 }
3149}
3150
3151fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
3152 package_data.name.is_some()
3153 || package_data.version.is_some()
3154 || package_data.purl.is_some()
3155 || !package_data.dependencies.is_empty()
3156 || package_data.extracted_license_statement.is_some()
3157 || !package_data.license_detections.is_empty()
3158 || !package_data.parties.is_empty()
3159 || package_data.description.is_some()
3160 || package_data.homepage_url.is_some()
3161 || package_data.bug_tracking_url.is_some()
3162 || package_data.code_view_url.is_some()
3163 || package_data.vcs_url.is_some()
3164}
3165
3166fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
3167 if package_data.version.is_some()
3168 && package_data.extracted_license_statement.is_some()
3169 && package_data
3170 .parties
3171 .iter()
3172 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
3173 {
3174 return;
3175 }
3176
3177 let Some(root) = path.parent() else {
3178 return;
3179 };
3180
3181 let dunder_metadata = collect_sibling_dunder_metadata(root, content);
3182
3183 if package_data.version.is_none() {
3184 package_data.version = dunder_metadata.version;
3185 }
3186
3187 if package_data.extracted_license_statement.is_none() {
3188 package_data.extracted_license_statement = dunder_metadata.license;
3189 }
3190
3191 let has_author = package_data
3192 .parties
3193 .iter()
3194 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
3195
3196 if !has_author && let Some(author) = dunder_metadata.author {
3197 package_data.parties.push(Party {
3198 r#type: Some("person".to_string()),
3199 role: Some("author".to_string()),
3200 name: Some(author),
3201 email: None,
3202 url: None,
3203 organization: None,
3204 organization_url: None,
3205 timezone: None,
3206 });
3207 }
3208}
3209
/// Values of module-level dunder assignments found in sibling Python modules.
#[derive(Default)]
struct DunderMetadata {
    /// Value assigned to `__version__`, if found.
    version: Option<String>,
    /// Value assigned to `__author__`, if found.
    author: Option<String>,
    /// Value assigned to `__license__`, if found.
    license: Option<String>,
}
3216
3217fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
3218 let statements = match parse_module(content) {
3219 Ok(parsed) => parsed.into_suite(),
3220 Err(_) => return DunderMetadata::default(),
3221 };
3222
3223 let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3224 let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3225 let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3226 let mut metadata = DunderMetadata::default();
3227 let mut candidate_paths = Vec::new();
3228
3229 for module in imported_dunder_modules(&statements) {
3230 let Some(path) = resolve_imported_module_path(root, &module) else {
3231 continue;
3232 };
3233
3234 candidate_paths.push(path);
3235 }
3236
3237 candidate_paths.extend(referenced_dunder_init_paths(root, content));
3238
3239 let mut seen_paths = HashSet::new();
3240 for path in candidate_paths {
3241 if !seen_paths.insert(path.clone()) {
3242 continue;
3243 }
3244
3245 let Ok(module_content) = read_file_to_string(&path) else {
3246 continue;
3247 };
3248
3249 if metadata.version.is_none() {
3250 metadata.version = version_re
3251 .as_ref()
3252 .and_then(|regex| regex.captures(&module_content))
3253 .and_then(|captures| captures.get(1))
3254 .map(|match_| match_.as_str().to_string());
3255 }
3256
3257 if metadata.author.is_none() {
3258 metadata.author = author_re
3259 .as_ref()
3260 .and_then(|regex| regex.captures(&module_content))
3261 .and_then(|captures| captures.get(1))
3262 .map(|match_| match_.as_str().to_string());
3263 }
3264
3265 if metadata.license.is_none() {
3266 metadata.license = license_re
3267 .as_ref()
3268 .and_then(|regex| regex.captures(&module_content))
3269 .and_then(|captures| captures.get(1))
3270 .map(|match_| match_.as_str().to_string());
3271 }
3272
3273 if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
3274 return metadata;
3275 }
3276 }
3277
3278 metadata
3279}
3280
3281fn referenced_dunder_init_paths(root: &Path, content: &str) -> Vec<PathBuf> {
3282 let open_re = match Regex::new(r#"open\(\s*['\"]([^'\"]+__init__\.py)['\"]"#) {
3283 Ok(regex) => regex,
3284 Err(_) => return Vec::new(),
3285 };
3286
3287 open_re
3288 .captures_iter(content)
3289 .filter_map(|captures| captures.get(1).map(|m| m.as_str()))
3290 .filter_map(|relative| {
3291 let relative_path = PathBuf::from(relative);
3292 if relative_path.is_absolute()
3293 || relative_path.components().any(|component| {
3294 matches!(
3295 component,
3296 Component::ParentDir | Component::RootDir | Component::Prefix(_)
3297 )
3298 })
3299 {
3300 return None;
3301 }
3302
3303 let candidate = root.join(relative_path);
3304 candidate.exists().then_some(candidate)
3305 })
3306 .collect()
3307}
3308
3309fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
3310 let mut modules = Vec::new();
3311
3312 for statement in statements {
3313 let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
3314 continue;
3315 };
3316 let Some(module) = module.as_ref().map(|name| name.as_str()) else {
3317 continue;
3318 };
3319 let imports_dunder = names.iter().any(|alias| {
3320 matches!(
3321 alias.name.as_str(),
3322 "__version__" | "__author__" | "__license__"
3323 )
3324 });
3325 if imports_dunder {
3326 modules.push(module.to_string());
3327 }
3328 }
3329
3330 modules
3331}
3332
/// Resolves a dotted module name to an existing file below `root`.
///
/// The dots become path separators and the following candidates are probed in
/// order, returning the first that exists:
///
/// 1. `<root>/<module>.py`
/// 2. `<root>/<module>/__init__.py`
/// 3. `<root>/src/<module>.py`
/// 4. `<root>/src/<module>/__init__.py`
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let relative: PathBuf = module.split('.').collect();
    let as_file = relative.with_extension("py");
    let as_package = relative.join("__init__.py");

    [root.to_path_buf(), root.join("src")]
        .into_iter()
        .flat_map(|base| [base.join(&as_file), base.join(&as_package)])
        .find(|candidate| candidate.exists())
}
3344
3345fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
3361 let statements = parse_module(content)
3362 .map(|parsed| parsed.into_suite())
3363 .map_err(|e| e.to_string())?;
3364 let aliases = collect_setup_aliases(&statements);
3365 let mut evaluator = LiteralEvaluator::new(HashMap::new());
3366 build_setup_py_constants(&statements, &mut evaluator);
3367
3368 let setup_call = find_setup_call(&statements, &aliases);
3369 let Some(call_expr) = setup_call else {
3370 return Ok(None);
3371 };
3372
3373 let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3374 Ok(Some(build_setup_py_package_data(&setup_values)))
3375}
3376
3377fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3378 for stmt in statements {
3379 if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3380 if targets.len() != 1 {
3381 continue;
3382 }
3383
3384 let Some(name) = extract_assign_name(&targets[0]) else {
3385 continue;
3386 };
3387
3388 if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3389 evaluator.insert_constant(name, value);
3390 }
3391 }
3392 }
3393}
3394
3395fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3396 match target {
3397 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3398 _ => None,
3399 }
3400}
3401
3402fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3403 let mut aliases = SetupAliases::default();
3404 aliases.setup_names.insert("setup".to_string());
3405
3406 for stmt in statements {
3407 match stmt {
3408 ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3409 for alias in names {
3410 let module_name = alias.name.as_str();
3411 if !is_setup_module(module_name) {
3412 continue;
3413 }
3414 let alias_name = alias
3415 .asname
3416 .as_ref()
3417 .map(|name| name.as_str())
3418 .unwrap_or(module_name);
3419 aliases
3420 .module_aliases
3421 .insert(alias_name.to_string(), module_name.to_string());
3422 }
3423 }
3424 ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3425 let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3426 continue;
3427 };
3428 if !is_setup_module(module_name) {
3429 continue;
3430 }
3431 for alias in names {
3432 if alias.name.as_str() != "setup" {
3433 continue;
3434 }
3435 let alias_name = alias
3436 .asname
3437 .as_ref()
3438 .map(|name| name.as_str())
3439 .unwrap_or("setup");
3440 aliases.setup_names.insert(alias_name.to_string());
3441 }
3442 }
3443 _ => {}
3444 }
3445 }
3446
3447 aliases
3448}
3449
/// Returns `true` when `module_name` is one of the modules recognized as
/// providing `setup()`: `setuptools`, `distutils` or `distutils.core`.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3453
3454fn find_setup_call<'a>(
3455 statements: &'a [ast::Stmt],
3456 aliases: &'a SetupAliases,
3457) -> Option<&'a ast::Expr> {
3458 let mut finder = SetupCallFinder {
3459 aliases,
3460 called_function_names: collect_top_level_called_function_names(statements),
3461 nodes_visited: 0,
3462 };
3463 finder.find_in_statements(statements)
3464}
3465
3466fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3467 let mut called = HashSet::new();
3468 collect_called_function_names_in_statements(statements, &mut called);
3469 called
3470}
3471
3472fn collect_called_function_names_in_statements(
3473 statements: &[ast::Stmt],
3474 called: &mut HashSet<String>,
3475) {
3476 for stmt in statements {
3477 match stmt {
3478 ast::Stmt::Expr(ast::StmtExpr { value, .. })
3479 | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3480 collect_called_function_names_in_expr(value.as_ref(), called);
3481 }
3482 ast::Stmt::If(ast::StmtIf {
3483 body,
3484 elif_else_clauses,
3485 ..
3486 }) => {
3487 collect_called_function_names_in_statements(body, called);
3488 for clause in elif_else_clauses {
3489 collect_called_function_names_in_statements(&clause.body, called);
3490 }
3491 }
3492 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3493 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3494 collect_called_function_names_in_statements(body, called);
3495 collect_called_function_names_in_statements(orelse, called);
3496 }
3497 ast::Stmt::With(ast::StmtWith { body, .. }) => {
3498 collect_called_function_names_in_statements(body, called);
3499 }
3500 ast::Stmt::Try(ast::StmtTry {
3501 body,
3502 orelse,
3503 finalbody,
3504 handlers,
3505 ..
3506 }) => {
3507 collect_called_function_names_in_statements(body, called);
3508 collect_called_function_names_in_statements(orelse, called);
3509 collect_called_function_names_in_statements(finalbody, called);
3510 for handler in handlers {
3511 let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3512 body,
3513 ..
3514 }) = handler;
3515 collect_called_function_names_in_statements(body, called);
3516 }
3517 }
3518 _ => {}
3519 }
3520 }
3521}
3522
3523fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3524 if let ast::Expr::Call(ast::ExprCall {
3525 func, arguments, ..
3526 }) = expr
3527 {
3528 if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3529 called.insert(id.as_str().to_string());
3530 }
3531
3532 for arg in arguments.args.iter() {
3533 collect_called_function_names_in_expr(arg, called);
3534 }
3535 for keyword in arguments.keywords.iter() {
3536 collect_called_function_names_in_expr(&keyword.value, called);
3537 }
3538 }
3539}
3540
/// Search state used to locate a `setup(...)` call in a parsed module.
struct SetupCallFinder<'a> {
    /// Names and module aliases under which `setup` may be referenced.
    aliases: &'a SetupAliases,
    /// Names of functions that are called somewhere in the module; a function
    /// definition's body is only searched when its name is in this set.
    called_function_names: HashSet<String>,
    /// Count of statements and expressions visited so far; the search gives
    /// up once it reaches `MAX_SETUP_PY_AST_NODES`.
    nodes_visited: usize,
}
3546
3547impl<'a> SetupCallFinder<'a> {
3548 fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3549 for stmt in statements {
3550 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3551 return None;
3552 }
3553 self.nodes_visited += 1;
3554
3555 let found = match stmt {
3556 ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3557 ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3558 ast::Stmt::If(ast::StmtIf {
3559 body,
3560 elif_else_clauses,
3561 ..
3562 }) => self.find_in_statements(body).or_else(|| {
3563 for clause in elif_else_clauses {
3564 if let Some(found) = self.find_in_statements(&clause.body) {
3565 return Some(found);
3566 }
3567 }
3568 None
3569 }),
3570 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3571 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3572 .find_in_statements(body)
3573 .or_else(|| self.find_in_statements(orelse)),
3574 ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3575 .called_function_names
3576 .contains(name.as_str())
3577 .then(|| self.find_in_statements(body))
3578 .flatten(),
3579 ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3580 ast::Stmt::Try(ast::StmtTry {
3581 body,
3582 orelse,
3583 finalbody,
3584 handlers,
3585 ..
3586 }) => self
3587 .find_in_statements(body)
3588 .or_else(|| self.find_in_statements(orelse))
3589 .or_else(|| self.find_in_statements(finalbody))
3590 .or_else(|| {
3591 for handler in handlers {
3592 let ast::ExceptHandler::ExceptHandler(
3593 ast::ExceptHandlerExceptHandler { body, .. },
3594 ) = handler;
3595 if let Some(found) = self.find_in_statements(body) {
3596 return Some(found);
3597 }
3598 }
3599 None
3600 }),
3601 _ => None,
3602 };
3603
3604 if found.is_some() {
3605 return found;
3606 }
3607 }
3608
3609 None
3610 }
3611
3612 fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3613 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3614 return None;
3615 }
3616 self.nodes_visited += 1;
3617
3618 match expr {
3619 ast::Expr::Call(ast::ExprCall { func, .. })
3620 if is_setup_call(func.as_ref(), self.aliases) =>
3621 {
3622 Some(expr)
3623 }
3624 _ => None,
3625 }
3626 }
3627}
3628
3629fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3630 let Some(dotted) = dotted_name(func, 0) else {
3631 return false;
3632 };
3633
3634 if aliases.setup_names.contains(&dotted) {
3635 return true;
3636 }
3637
3638 let Some(module) = dotted.strip_suffix(".setup") else {
3639 return false;
3640 };
3641
3642 let resolved = resolve_module_alias(module, aliases);
3643 is_setup_module(&resolved)
3644}
3645
3646fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3647 if depth >= MAX_SETUP_PY_AST_DEPTH {
3648 return None;
3649 }
3650
3651 match expr {
3652 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3653 ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3654 let base = dotted_name(value.as_ref(), depth + 1)?;
3655 Some(format!("{}.{}", base, attr.as_str()))
3656 }
3657 _ => None,
3658 }
3659}
3660
3661fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3662 if let Some(mapped) = aliases.module_aliases.get(module) {
3663 return mapped.clone();
3664 }
3665
3666 let Some((base, rest)) = module.split_once('.') else {
3667 return module.to_string();
3668 };
3669
3670 if let Some(mapped) = aliases.module_aliases.get(base) {
3671 return format!("{}.{}", mapped, rest);
3672 }
3673
3674 module.to_string()
3675}
3676
3677fn extract_setup_keywords(
3678 call_expr: &ast::Expr,
3679 evaluator: &mut LiteralEvaluator,
3680) -> HashMap<String, Value> {
3681 let mut values = HashMap::new();
3682 let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3683 return values;
3684 };
3685
3686 for keyword in arguments.keywords.iter() {
3687 if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3688 if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3689 values.insert(arg.to_string(), value);
3690 }
3691 } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3692 for (key, value) in dict {
3693 values.insert(key, value);
3694 }
3695 }
3696 }
3697
3698 values
3699}
3700
3701fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3702 let name = get_value_string(values, "name");
3703 let version = get_value_string(values, "version");
3704 let description =
3705 get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
3706 let homepage_url =
3707 get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
3708 let author = get_value_string(values, "author");
3709 let author_email = get_value_string(values, "author_email");
3710 let maintainer = get_value_string(values, "maintainer");
3711 let maintainer_email = get_value_string(values, "maintainer_email");
3712 let license = get_value_string(values, "license");
3713 let classifiers = values
3714 .get("classifiers")
3715 .and_then(value_to_string_list)
3716 .unwrap_or_default();
3717
3718 let mut parties = Vec::new();
3719 if author.is_some() || author_email.is_some() {
3720 parties.push(Party {
3721 r#type: Some("person".to_string()),
3722 role: Some("author".to_string()),
3723 name: author,
3724 email: author_email,
3725 url: None,
3726 organization: None,
3727 organization_url: None,
3728 timezone: None,
3729 });
3730 }
3731
3732 if maintainer.is_some() || maintainer_email.is_some() {
3733 parties.push(Party {
3734 r#type: Some("person".to_string()),
3735 role: Some("maintainer".to_string()),
3736 name: maintainer,
3737 email: maintainer_email,
3738 url: None,
3739 organization: None,
3740 organization_url: None,
3741 timezone: None,
3742 });
3743 }
3744
3745 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3746 normalize_spdx_declared_license(license.as_deref());
3747 let extracted_license_statement = license.clone();
3748
3749 let dependencies = build_setup_py_dependencies(values);
3750 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3751 let mut homepage_from_project_urls = None;
3752 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3753 let mut extra_data = HashMap::new();
3754
3755 if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3756 apply_project_url_mappings(
3757 &parsed_project_urls,
3758 &mut homepage_from_project_urls,
3759 &mut bug_tracking_url,
3760 &mut code_view_url,
3761 &mut vcs_url,
3762 &mut extra_data,
3763 );
3764 }
3765
3766 let extra_data = if extra_data.is_empty() {
3767 None
3768 } else {
3769 Some(extra_data)
3770 };
3771
3772 PackageData {
3773 package_type: Some(PythonParser::PACKAGE_TYPE),
3774 namespace: None,
3775 name,
3776 version,
3777 qualifiers: None,
3778 subpath: None,
3779 primary_language: Some("Python".to_string()),
3780 description,
3781 release_date: None,
3782 parties,
3783 keywords: Vec::new(),
3784 homepage_url: homepage_url.or(homepage_from_project_urls),
3785 download_url: None,
3786 size: None,
3787 sha1: None,
3788 md5: None,
3789 sha256: None,
3790 sha512: None,
3791 bug_tracking_url,
3792 code_view_url,
3793 vcs_url,
3794 copyright: None,
3795 holder: None,
3796 declared_license_expression,
3797 declared_license_expression_spdx,
3798 license_detections,
3799 other_license_expression: None,
3800 other_license_expression_spdx: None,
3801 other_license_detections: Vec::new(),
3802 extracted_license_statement,
3803 notice_text: None,
3804 source_packages: Vec::new(),
3805 file_references: Vec::new(),
3806 is_private: has_private_classifier(&classifiers),
3807 is_virtual: false,
3808 extra_data,
3809 dependencies,
3810 repository_homepage_url: None,
3811 repository_download_url: None,
3812 api_data_url: None,
3813 datasource_id: Some(DatasourceId::PypiSetupPy),
3814 purl,
3815 }
3816}
3817
3818fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3819 let mut dependencies = Vec::new();
3820
3821 if let Some(reqs) = values
3822 .get("install_requires")
3823 .and_then(value_to_string_list)
3824 {
3825 dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3826 }
3827
3828 if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3829 dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3830 }
3831
3832 if let Some(Value::Dict(extras)) = values.get("extras_require") {
3833 let mut extra_items: Vec<_> = extras.iter().collect();
3834 extra_items.sort_by_key(|(name, _)| *name);
3835 for (extra_name, extra_value) in extra_items {
3836 if let Some(reqs) = value_to_string_list(extra_value) {
3837 dependencies.extend(build_setup_py_dependency_list(
3838 reqs.as_slice(),
3839 extra_name,
3840 true,
3841 ));
3842 }
3843 }
3844 }
3845
3846 dependencies
3847}
3848
3849fn build_setup_py_dependency_list(
3850 reqs: &[String],
3851 scope: &str,
3852 is_optional: bool,
3853) -> Vec<Dependency> {
3854 reqs.iter()
3855 .filter_map(|req| build_python_dependency(req, scope, is_optional, None))
3856 .collect()
3857}
3858
3859fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3860 values.get(key).and_then(value_to_string)
3861}
3862
3863fn value_to_string(value: &Value) -> Option<String> {
3864 match value {
3865 Value::String(value) => Some(value.clone()),
3866 Value::Number(value) => Some(value.to_string()),
3867 Value::Bool(value) => Some(value.to_string()),
3868 _ => None,
3869 }
3870}
3871
3872fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3873 match value {
3874 Value::String(value) => Some(vec![value.clone()]),
3875 Value::List(values) | Value::Tuple(values) => {
3876 let mut items = Vec::new();
3877 for item in values {
3878 items.push(value_to_string(item)?);
3879 }
3880 Some(items)
3881 }
3882 _ => None,
3883 }
3884}
3885
3886fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3887 let Value::Dict(dict) = value else {
3888 return None;
3889 };
3890
3891 let mut pairs: Vec<(String, String)> = dict
3892 .iter()
3893 .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3894 .collect::<Option<Vec<_>>>()?;
3895 pairs.sort_by(|left, right| left.0.cmp(&right.0));
3896 Some(pairs)
3897}
3898
3899fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3900 let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3901 extract_requires_dist_dependencies(&requires_dist)
3902}
3903
3904pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3905 requires_dist
3906 .iter()
3907 .filter_map(|entry| build_rfc822_dependency(entry))
3908 .collect()
3909}
3910
3911fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3912 build_python_dependency(entry, "install", false, None)
3913}
3914
3915fn build_python_dependency(
3916 entry: &str,
3917 default_scope: &str,
3918 default_optional: bool,
3919 marker_override: Option<&str>,
3920) -> Option<Dependency> {
3921 let (requirement_part, marker_part) = entry
3922 .split_once(';')
3923 .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3924 .unwrap_or((entry.trim(), None));
3925
3926 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3927 let requirement = normalize_rfc822_requirement(requirement_part);
3928 let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3929 marker_part.or(marker_override),
3930 default_scope,
3931 default_optional,
3932 );
3933 let purl = build_python_dependency_purl(&name, None)?;
3934
3935 let is_pinned = requirement
3936 .as_deref()
3937 .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3938 let purl = if is_pinned {
3939 requirement
3940 .as_deref()
3941 .map(|req| req.trim_start_matches('='))
3942 .and_then(|version| build_python_dependency_purl(&name, Some(version)))
3943 .unwrap_or(purl)
3944 } else {
3945 purl
3946 };
3947
3948 let mut extra_data = HashMap::new();
3949 extra_data.extend(marker_data);
3950 if let Some(marker) = marker {
3951 extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3952 }
3953
3954 Some(Dependency {
3955 purl: Some(purl),
3956 extracted_requirement: requirement,
3957 scope: Some(scope),
3958 is_runtime: Some(true),
3959 is_optional: Some(is_optional),
3960 is_pinned: Some(is_pinned),
3961 is_direct: Some(true),
3962 resolved_package: None,
3963 extra_data: if extra_data.is_empty() {
3964 None
3965 } else {
3966 Some(extra_data)
3967 },
3968 })
3969}
3970
3971fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3972 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3973 let trimmed = requirement_part.trim();
3974 let mut remainder = trimmed[name.len()..].trim();
3975
3976 if let Some(stripped) = remainder.strip_prefix('[')
3977 && let Some(end_idx) = stripped.find(']')
3978 {
3979 remainder = stripped[end_idx + 1..].trim();
3980 }
3981
3982 let remainder = remainder
3983 .strip_prefix('(')
3984 .and_then(|value| value.strip_suffix(')'))
3985 .unwrap_or(remainder)
3986 .trim();
3987
3988 if remainder.is_empty() {
3989 return None;
3990 }
3991
3992 let mut specifiers: Vec<String> = remainder
3993 .split(',')
3994 .map(|specifier| specifier.trim().replace(' ', ""))
3995 .filter(|specifier| !specifier.is_empty())
3996 .collect();
3997 specifiers.sort();
3998 Some(specifiers.join(","))
3999}
4000
/// Percent-encodes every `*` in a version as `%2A` so wildcard versions can
/// be embedded in a purl; all other characters are kept as they are.
fn encode_python_dependency_purl_version(version: &str) -> String {
    let mut encoded = String::with_capacity(version.len());
    for ch in version.chars() {
        if ch == '*' {
            encoded.push_str("%2A");
        } else {
            encoded.push(ch);
        }
    }
    encoded
}
4004
4005fn build_python_dependency_purl(name: &str, version: Option<&str>) -> Option<String> {
4006 let normalized_name = normalize_python_dependency_name(name);
4007
4008 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &normalized_name)
4009 .ok()
4010 .map(|_| match version {
4011 Some(version) => {
4012 format!(
4013 "pkg:pypi/{normalized_name}@{}",
4014 encode_python_dependency_purl_version(version)
4015 )
4016 }
4017 None => format!("pkg:pypi/{normalized_name}"),
4018 })
4019}
4020
/// Normalizes a dependency name: surrounding whitespace is trimmed, ASCII
/// letters are lowercased and underscores become hyphens.
fn normalize_python_dependency_name(name: &str) -> String {
    name.trim()
        .chars()
        .map(|ch| if ch == '_' { '-' } else { ch.to_ascii_lowercase() })
        .collect()
}
4024
4025fn parse_rfc822_marker(
4026 marker_part: Option<&str>,
4027 default_scope: &str,
4028 default_optional: bool,
4029) -> (
4030 String,
4031 bool,
4032 Option<String>,
4033 HashMap<String, serde_json::Value>,
4034) {
4035 let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
4036 return (
4037 default_scope.to_string(),
4038 default_optional,
4039 None,
4040 HashMap::new(),
4041 );
4042 };
4043
4044 let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
4045 .expect("extra marker regex should compile");
4046 let mut extra_data = HashMap::new();
4047
4048 if let Some(python_version) = extract_marker_field(marker, "python_version") {
4049 extra_data.insert(
4050 "python_version".to_string(),
4051 serde_json::Value::String(python_version),
4052 );
4053 }
4054 if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
4055 extra_data.insert(
4056 "sys_platform".to_string(),
4057 serde_json::Value::String(sys_platform),
4058 );
4059 }
4060
4061 if let Some(captures) = extra_re.captures(marker)
4062 && let Some(scope) = captures.get(1)
4063 {
4064 return (
4065 scope.as_str().to_string(),
4066 true,
4067 Some(marker.trim().to_string()),
4068 extra_data,
4069 );
4070 }
4071
4072 (
4073 default_scope.to_string(),
4074 default_optional,
4075 Some(marker.trim().to_string()),
4076 extra_data,
4077 )
4078}
4079
4080fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
4081 let re = Regex::new(&format!(
4082 r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
4083 field
4084 ))
4085 .ok()?;
4086 let captures = re.captures(marker)?;
4087 let operator = captures.get(1)?.as_str();
4088 let value = captures.get(2)?.as_str();
4089 Some(format!("{} {}", operator, value))
4090}
4091
4092fn parse_requires_txt(content: &str) -> Vec<Dependency> {
4093 let mut dependencies = Vec::new();
4094 let mut current_scope = "install".to_string();
4095 let mut current_optional = false;
4096 let mut current_marker: Option<String> = None;
4097
4098 for line in content.lines() {
4099 let trimmed = line.trim();
4100 if trimmed.is_empty() || trimmed.starts_with('#') {
4101 continue;
4102 }
4103
4104 if trimmed.starts_with('[') && trimmed.ends_with(']') {
4105 let inner = &trimmed[1..trimmed.len() - 1];
4106 if let Some(rest) = inner.strip_prefix(':') {
4107 current_scope = "install".to_string();
4108 current_optional = false;
4109 current_marker = Some(rest.trim().to_string());
4110 } else if let Some((scope, marker)) = inner.split_once(':') {
4111 current_scope = scope.trim().to_string();
4112 current_optional = true;
4113 current_marker = Some(marker.trim().to_string());
4114 } else {
4115 current_scope = inner.trim().to_string();
4116 current_optional = true;
4117 current_marker = None;
4118 }
4119 continue;
4120 }
4121
4122 if let Some(dependency) = build_python_dependency(
4123 trimmed,
4124 ¤t_scope,
4125 current_optional,
4126 current_marker.as_deref(),
4127 ) {
4128 dependencies.push(dependency);
4129 }
4130 }
4131
4132 dependencies
4133}
4134
/// Reports whether the classifiers contain `Private :: Do Not Upload`,
/// compared without regard to ASCII case.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }

    false
}
4140
4141fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
4142 let name = name?;
4143 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
4144 if let Some(version) = version {
4145 package_url.with_version(version).ok()?;
4146 }
4147 Some(package_url.to_string())
4148}
4149
4150fn extract_from_setup_py_regex(content: &str) -> PackageData {
4151 let name = extract_setup_value(content, "name");
4152 let version = extract_setup_value(content, "version");
4153 let license_expression = extract_setup_value(content, "license");
4154
4155 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4156 normalize_spdx_declared_license(license_expression.as_deref());
4157 let extracted_license_statement = license_expression.clone();
4158
4159 let dependencies = extract_setup_py_dependencies(content);
4160 let homepage_url = extract_setup_value(content, "url");
4161 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
4162
4163 PackageData {
4164 package_type: Some(PythonParser::PACKAGE_TYPE),
4165 namespace: None,
4166 name,
4167 version,
4168 qualifiers: None,
4169 subpath: None,
4170 primary_language: Some("Python".to_string()),
4171 description: None,
4172 release_date: None,
4173 parties: Vec::new(),
4174 keywords: Vec::new(),
4175 homepage_url,
4176 download_url: None,
4177 size: None,
4178 sha1: None,
4179 md5: None,
4180 sha256: None,
4181 sha512: None,
4182 bug_tracking_url: None,
4183 code_view_url: None,
4184 vcs_url: None,
4185 copyright: None,
4186 holder: None,
4187 declared_license_expression,
4188 declared_license_expression_spdx,
4189 license_detections,
4190 other_license_expression: None,
4191 other_license_expression_spdx: None,
4192 other_license_detections: Vec::new(),
4193 extracted_license_statement,
4194 notice_text: None,
4195 source_packages: Vec::new(),
4196 file_references: Vec::new(),
4197 is_private: false,
4198 is_virtual: false,
4199 extra_data: None,
4200 dependencies,
4201 repository_homepage_url: None,
4202 repository_download_url: None,
4203 api_data_url: None,
4204 datasource_id: Some(DatasourceId::PypiSetupPy),
4205 purl,
4206 }
4207}
4208
4209fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
4210 crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
4211}
4212
4213fn extract_from_pypi_json(path: &Path) -> PackageData {
4214 let default = PackageData {
4215 package_type: Some(PythonParser::PACKAGE_TYPE),
4216 datasource_id: Some(DatasourceId::PypiJson),
4217 ..Default::default()
4218 };
4219
4220 let content = match read_file_to_string(path) {
4221 Ok(content) => content,
4222 Err(error) => {
4223 warn!("Failed to read pypi.json at {:?}: {}", path, error);
4224 return default;
4225 }
4226 };
4227
4228 let root: serde_json::Value = match serde_json::from_str(&content) {
4229 Ok(value) => value,
4230 Err(error) => {
4231 warn!("Failed to parse pypi.json at {:?}: {}", path, error);
4232 return default;
4233 }
4234 };
4235
4236 let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
4237 warn!("No info object found in pypi.json at {:?}", path);
4238 return default;
4239 };
4240
4241 let name = info
4242 .get("name")
4243 .and_then(|value| value.as_str())
4244 .map(ToOwned::to_owned);
4245 let version = info
4246 .get("version")
4247 .and_then(|value| value.as_str())
4248 .map(ToOwned::to_owned);
4249 let summary = info
4250 .get("summary")
4251 .and_then(|value| value.as_str())
4252 .map(ToOwned::to_owned);
4253 let description = info
4254 .get("description")
4255 .and_then(|value| value.as_str())
4256 .filter(|value| !value.trim().is_empty())
4257 .map(ToOwned::to_owned)
4258 .or(summary);
4259 let mut homepage_url = info
4260 .get("home_page")
4261 .and_then(|value| value.as_str())
4262 .map(ToOwned::to_owned);
4263 let author = info
4264 .get("author")
4265 .and_then(|value| value.as_str())
4266 .filter(|value| !value.trim().is_empty())
4267 .map(ToOwned::to_owned);
4268 let author_email = info
4269 .get("author_email")
4270 .and_then(|value| value.as_str())
4271 .filter(|value| !value.trim().is_empty())
4272 .map(ToOwned::to_owned);
4273 let license = info
4274 .get("license")
4275 .and_then(|value| value.as_str())
4276 .filter(|value| !value.trim().is_empty())
4277 .map(ToOwned::to_owned);
4278 let keywords = parse_setup_cfg_keywords(
4279 info.get("keywords")
4280 .and_then(|value| value.as_str())
4281 .map(ToOwned::to_owned),
4282 );
4283 let classifiers = info
4284 .get("classifiers")
4285 .and_then(|value| value.as_array())
4286 .map(|values| {
4287 values
4288 .iter()
4289 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
4290 .collect::<Vec<_>>()
4291 })
4292 .unwrap_or_default();
4293
4294 let mut parties = Vec::new();
4295 if author.is_some() || author_email.is_some() {
4296 parties.push(Party {
4297 r#type: Some("person".to_string()),
4298 role: Some("author".to_string()),
4299 name: author,
4300 email: author_email,
4301 url: None,
4302 organization: None,
4303 organization_url: None,
4304 timezone: None,
4305 });
4306 }
4307
4308 let mut bug_tracking_url = None;
4309 let mut code_view_url = None;
4310 let mut vcs_url = None;
4311 let mut extra_data = HashMap::new();
4312
4313 let parsed_project_urls = info
4314 .get("project_urls")
4315 .and_then(|value| value.as_object())
4316 .map(|map| {
4317 let mut pairs: Vec<(String, String)> = map
4318 .iter()
4319 .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
4320 .collect();
4321 pairs.sort_by(|left, right| left.0.cmp(&right.0));
4322 pairs
4323 })
4324 .unwrap_or_default();
4325
4326 apply_project_url_mappings(
4327 &parsed_project_urls,
4328 &mut homepage_url,
4329 &mut bug_tracking_url,
4330 &mut code_view_url,
4331 &mut vcs_url,
4332 &mut extra_data,
4333 );
4334
4335 let (download_url, size, sha256) = root
4336 .get("urls")
4337 .and_then(|value| value.as_array())
4338 .map(|urls| select_pypi_json_artifact(urls))
4339 .unwrap_or((None, None, None));
4340
4341 let sha256 = sha256.and_then(|h| Sha256Digest::from_hex(&h).ok());
4342
4343 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4344 normalize_spdx_declared_license(license.as_deref());
4345 let dependencies = info
4346 .get("requires_dist")
4347 .and_then(|value| value.as_array())
4348 .map(|entries| {
4349 entries
4350 .iter()
4351 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4352 .collect::<Vec<_>>()
4353 })
4354 .map(|entries| extract_requires_dist_dependencies(&entries))
4355 .unwrap_or_default();
4356
4357 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
4358 build_pypi_urls(name.as_deref(), version.as_deref());
4359
4360 PackageData {
4361 package_type: Some(PythonParser::PACKAGE_TYPE),
4362 namespace: None,
4363 name,
4364 version,
4365 qualifiers: None,
4366 subpath: None,
4367 primary_language: None,
4368 description,
4369 release_date: None,
4370 parties,
4371 keywords,
4372 homepage_url: homepage_url.or(repository_homepage_url.clone()),
4373 download_url,
4374 size,
4375 sha1: None,
4376 md5: None,
4377 sha256,
4378 sha512: None,
4379 bug_tracking_url,
4380 code_view_url,
4381 vcs_url,
4382 copyright: None,
4383 holder: None,
4384 declared_license_expression,
4385 declared_license_expression_spdx,
4386 license_detections,
4387 other_license_expression: None,
4388 other_license_expression_spdx: None,
4389 other_license_detections: Vec::new(),
4390 extracted_license_statement: license,
4391 notice_text: None,
4392 source_packages: Vec::new(),
4393 file_references: Vec::new(),
4394 is_private: has_private_classifier(&classifiers),
4395 is_virtual: false,
4396 extra_data: if extra_data.is_empty() {
4397 None
4398 } else {
4399 Some(extra_data)
4400 },
4401 dependencies,
4402 repository_homepage_url,
4403 repository_download_url,
4404 api_data_url,
4405 datasource_id: Some(DatasourceId::PypiJson),
4406 purl,
4407 }
4408}
4409
4410fn select_pypi_json_artifact(
4411 urls: &[serde_json::Value],
4412) -> (Option<String>, Option<u64>, Option<String>) {
4413 let selected = urls
4414 .iter()
4415 .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4416 .or_else(|| urls.first());
4417
4418 let Some(entry) = selected else {
4419 return (None, None, None);
4420 };
4421
4422 let download_url = entry
4423 .get("url")
4424 .and_then(|value| value.as_str())
4425 .map(ToOwned::to_owned);
4426 let size = entry.get("size").and_then(|value| value.as_u64());
4427 let sha256 = entry
4428 .get("digests")
4429 .and_then(|value| value.as_object())
4430 .and_then(|digests| digests.get("sha256"))
4431 .and_then(|value| value.as_str())
4432 .map(ToOwned::to_owned);
4433
4434 (download_url, size, sha256)
4435}
4436
4437fn extract_from_pip_inspect(path: &Path) -> PackageData {
4438 let content = match read_file_to_string(path) {
4439 Ok(content) => content,
4440 Err(e) => {
4441 warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4442 return default_package_data(path);
4443 }
4444 };
4445
4446 let root: serde_json::Value = match serde_json::from_str(&content) {
4447 Ok(value) => value,
4448 Err(e) => {
4449 warn!(
4450 "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4451 path, e
4452 );
4453 return default_package_data(path);
4454 }
4455 };
4456
4457 let installed = match root.get("installed").and_then(|v| v.as_array()) {
4458 Some(arr) => arr,
4459 None => {
4460 warn!(
4461 "No 'installed' array found in pip-inspect.deplock at {:?}",
4462 path
4463 );
4464 return default_package_data(path);
4465 }
4466 };
4467
4468 let pip_version = root
4469 .get("pip_version")
4470 .and_then(|v| v.as_str())
4471 .map(String::from);
4472 let inspect_version = root
4473 .get("version")
4474 .and_then(|v| v.as_str())
4475 .map(String::from);
4476
4477 let mut main_package: Option<PackageData> = None;
4478 let mut dependencies: Vec<Dependency> = Vec::new();
4479
4480 for package_entry in installed {
4481 let metadata = match package_entry.get("metadata") {
4482 Some(m) => m,
4483 None => continue,
4484 };
4485
4486 let is_requested = package_entry
4487 .get("requested")
4488 .and_then(|v| v.as_bool())
4489 .unwrap_or(false);
4490 let has_direct_url = package_entry.get("direct_url").is_some();
4491
4492 let name = metadata
4493 .get("name")
4494 .and_then(|v| v.as_str())
4495 .map(String::from);
4496 let version = metadata
4497 .get("version")
4498 .and_then(|v| v.as_str())
4499 .map(String::from);
4500 let summary = metadata
4501 .get("summary")
4502 .and_then(|v| v.as_str())
4503 .map(String::from);
4504 let home_page = metadata
4505 .get("home_page")
4506 .and_then(|v| v.as_str())
4507 .map(String::from);
4508 let author = metadata
4509 .get("author")
4510 .and_then(|v| v.as_str())
4511 .map(String::from);
4512 let author_email = metadata
4513 .get("author_email")
4514 .and_then(|v| v.as_str())
4515 .map(String::from);
4516 let license = metadata
4517 .get("license")
4518 .and_then(|v| v.as_str())
4519 .map(String::from);
4520 let description = metadata
4521 .get("description")
4522 .and_then(|v| v.as_str())
4523 .map(String::from);
4524 let keywords = metadata
4525 .get("keywords")
4526 .and_then(|v| v.as_array())
4527 .map(|arr| {
4528 arr.iter()
4529 .filter_map(|k| k.as_str().map(String::from))
4530 .collect::<Vec<_>>()
4531 })
4532 .unwrap_or_default();
4533
4534 let mut parties = Vec::new();
4535 if author.is_some() || author_email.is_some() {
4536 parties.push(Party {
4537 r#type: Some("person".to_string()),
4538 role: Some("author".to_string()),
4539 name: author,
4540 email: author_email,
4541 url: None,
4542 organization: None,
4543 organization_url: None,
4544 timezone: None,
4545 });
4546 }
4547
4548 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4549 normalize_spdx_declared_license(license.as_deref());
4550 let extracted_license_statement = license.clone();
4551 let requires_dist = metadata
4552 .get("requires_dist")
4553 .and_then(|v| v.as_array())
4554 .map(|entries| {
4555 entries
4556 .iter()
4557 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4558 .collect::<Vec<_>>()
4559 })
4560 .unwrap_or_default();
4561 let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4562
4563 let purl = name.as_ref().and_then(|n| {
4564 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4565 if let Some(v) = &version {
4566 package_url.with_version(v).ok()?;
4567 }
4568 Some(package_url.to_string())
4569 });
4570
4571 if is_requested && has_direct_url {
4572 let mut extra_data = HashMap::new();
4573 if let Some(pv) = &pip_version {
4574 extra_data.insert(
4575 "pip_version".to_string(),
4576 serde_json::Value::String(pv.clone()),
4577 );
4578 }
4579 if let Some(iv) = &inspect_version {
4580 extra_data.insert(
4581 "inspect_version".to_string(),
4582 serde_json::Value::String(iv.clone()),
4583 );
4584 }
4585
4586 main_package = Some(PackageData {
4587 package_type: Some(PythonParser::PACKAGE_TYPE),
4588 namespace: None,
4589 name,
4590 version,
4591 qualifiers: None,
4592 subpath: None,
4593 primary_language: Some("Python".to_string()),
4594 description: description.or(summary),
4595 release_date: None,
4596 parties,
4597 keywords,
4598 homepage_url: home_page,
4599 download_url: None,
4600 size: None,
4601 sha1: None,
4602 md5: None,
4603 sha256: None,
4604 sha512: None,
4605 bug_tracking_url: None,
4606 code_view_url: None,
4607 vcs_url: None,
4608 copyright: None,
4609 holder: None,
4610 declared_license_expression,
4611 declared_license_expression_spdx,
4612 license_detections,
4613 other_license_expression: None,
4614 other_license_expression_spdx: None,
4615 other_license_detections: Vec::new(),
4616 extracted_license_statement,
4617 notice_text: None,
4618 source_packages: Vec::new(),
4619 file_references: Vec::new(),
4620 is_private: false,
4621 is_virtual: true,
4622 extra_data: if extra_data.is_empty() {
4623 None
4624 } else {
4625 Some(extra_data)
4626 },
4627 dependencies: parsed_dependencies,
4628 repository_homepage_url: None,
4629 repository_download_url: None,
4630 api_data_url: None,
4631 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4632 purl,
4633 });
4634 } else {
4635 let resolved_package = PackageData {
4636 package_type: Some(PythonParser::PACKAGE_TYPE),
4637 namespace: None,
4638 name: name.clone(),
4639 version: version.clone(),
4640 qualifiers: None,
4641 subpath: None,
4642 primary_language: Some("Python".to_string()),
4643 description: description.or(summary),
4644 release_date: None,
4645 parties,
4646 keywords,
4647 homepage_url: home_page,
4648 download_url: None,
4649 size: None,
4650 sha1: None,
4651 md5: None,
4652 sha256: None,
4653 sha512: None,
4654 bug_tracking_url: None,
4655 code_view_url: None,
4656 vcs_url: None,
4657 copyright: None,
4658 holder: None,
4659 declared_license_expression,
4660 declared_license_expression_spdx,
4661 license_detections,
4662 other_license_expression: None,
4663 other_license_expression_spdx: None,
4664 other_license_detections: Vec::new(),
4665 extracted_license_statement,
4666 notice_text: None,
4667 source_packages: Vec::new(),
4668 file_references: Vec::new(),
4669 is_private: false,
4670 is_virtual: true,
4671 extra_data: None,
4672 dependencies: parsed_dependencies,
4673 repository_homepage_url: None,
4674 repository_download_url: None,
4675 api_data_url: None,
4676 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4677 purl: purl.clone(),
4678 };
4679
4680 let resolved = package_data_to_resolved(&resolved_package);
4681 dependencies.push(Dependency {
4682 purl,
4683 extracted_requirement: None,
4684 scope: None,
4685 is_runtime: Some(true),
4686 is_optional: Some(false),
4687 is_pinned: Some(true),
4688 is_direct: Some(is_requested),
4689 resolved_package: Some(Box::new(resolved)),
4690 extra_data: None,
4691 });
4692 }
4693 }
4694
4695 if let Some(mut main_pkg) = main_package {
4696 let direct_requirement_purls: HashSet<String> = main_pkg
4697 .dependencies
4698 .iter()
4699 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4700 .collect();
4701
4702 let resolved_requirement_purls: HashSet<String> = dependencies
4703 .iter()
4704 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4705 .collect();
4706
4707 let unresolved_dependencies = main_pkg
4708 .dependencies
4709 .iter()
4710 .filter(|dep| {
4711 dep.purl.as_ref().is_some_and(|purl| {
4712 !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4713 })
4714 })
4715 .cloned()
4716 .collect::<Vec<_>>();
4717
4718 for dependency in &mut dependencies {
4719 if dependency
4720 .purl
4721 .as_ref()
4722 .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4723 {
4724 dependency.is_direct = Some(true);
4725 }
4726 }
4727
4728 main_pkg.dependencies = dependencies;
4729 main_pkg.dependencies.extend(unresolved_dependencies);
4730 main_pkg
4731 } else {
4732 default_package_data(path)
4733 }
4734}
4735
/// Returns the part of `purl` before the first `@`, or the whole string when
/// there is no `@`.
fn base_dependency_purl(purl: &str) -> String {
    match purl.split_once('@') {
        Some((base, _rest)) => base.to_owned(),
        None => purl.to_owned(),
    }
}
4741
/// Parsed INI content: section name -> key -> list of values.
///
/// `parse_setup_cfg` stores section and key names lowercased, and collects the
/// inline value plus any continuation lines of a key into the same list.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4743
4744fn extract_from_setup_cfg(path: &Path) -> PackageData {
4745 let content = match read_file_to_string(path) {
4746 Ok(content) => content,
4747 Err(e) => {
4748 warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4749 return default_package_data(path);
4750 }
4751 };
4752
4753 let sections = parse_setup_cfg(&content);
4754 let name = get_ini_value(§ions, "metadata", "name");
4755 let version = get_ini_value(§ions, "metadata", "version");
4756 let description = get_ini_value(§ions, "metadata", "description");
4757 let author = get_ini_value(§ions, "metadata", "author");
4758 let author_email = get_ini_value(§ions, "metadata", "author_email");
4759 let maintainer = get_ini_value(§ions, "metadata", "maintainer");
4760 let maintainer_email = get_ini_value(§ions, "metadata", "maintainer_email");
4761 let license = get_ini_value(§ions, "metadata", "license");
4762 let mut homepage_url = get_ini_value(§ions, "metadata", "url");
4763 let classifiers = get_ini_values(§ions, "metadata", "classifiers");
4764 let keywords = parse_setup_cfg_keywords(get_ini_value(§ions, "metadata", "keywords"));
4765 let python_requires = get_ini_value(§ions, "options", "python_requires");
4766 let parsed_project_urls =
4767 parse_setup_cfg_project_urls(&get_ini_values(§ions, "metadata", "project_urls"));
4768 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4769 let mut extra_data = HashMap::new();
4770
4771 let mut parties = Vec::new();
4772 if author.is_some() || author_email.is_some() {
4773 parties.push(Party {
4774 r#type: Some("person".to_string()),
4775 role: Some("author".to_string()),
4776 name: author,
4777 email: author_email,
4778 url: None,
4779 organization: None,
4780 organization_url: None,
4781 timezone: None,
4782 });
4783 }
4784
4785 if maintainer.is_some() || maintainer_email.is_some() {
4786 parties.push(Party {
4787 r#type: Some("person".to_string()),
4788 role: Some("maintainer".to_string()),
4789 name: maintainer,
4790 email: maintainer_email,
4791 url: None,
4792 organization: None,
4793 organization_url: None,
4794 timezone: None,
4795 });
4796 }
4797
4798 let declared_license_expression = None;
4799 let declared_license_expression_spdx = None;
4800 let license_detections = Vec::new();
4801 let extracted_license_statement = license.clone();
4802
4803 let dependencies = extract_setup_cfg_dependencies(§ions);
4804
4805 if let Some(value) = python_requires {
4806 extra_data.insert(
4807 "python_requires".to_string(),
4808 serde_json::Value::String(value),
4809 );
4810 }
4811
4812 apply_project_url_mappings(
4813 &parsed_project_urls,
4814 &mut homepage_url,
4815 &mut bug_tracking_url,
4816 &mut code_view_url,
4817 &mut vcs_url,
4818 &mut extra_data,
4819 );
4820
4821 let extra_data = if extra_data.is_empty() {
4822 None
4823 } else {
4824 Some(extra_data)
4825 };
4826
4827 let purl = name.as_ref().and_then(|n| {
4828 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4829 if let Some(v) = &version {
4830 package_url.with_version(v).ok()?;
4831 }
4832 Some(package_url.to_string())
4833 });
4834
4835 PackageData {
4836 package_type: Some(PythonParser::PACKAGE_TYPE),
4837 namespace: None,
4838 name,
4839 version,
4840 qualifiers: None,
4841 subpath: None,
4842 primary_language: Some("Python".to_string()),
4843 description,
4844 release_date: None,
4845 parties,
4846 keywords,
4847 homepage_url,
4848 download_url: None,
4849 size: None,
4850 sha1: None,
4851 md5: None,
4852 sha256: None,
4853 sha512: None,
4854 bug_tracking_url,
4855 code_view_url,
4856 vcs_url,
4857 copyright: None,
4858 holder: None,
4859 declared_license_expression,
4860 declared_license_expression_spdx,
4861 license_detections,
4862 other_license_expression: None,
4863 other_license_expression_spdx: None,
4864 other_license_detections: Vec::new(),
4865 extracted_license_statement,
4866 notice_text: None,
4867 source_packages: Vec::new(),
4868 file_references: Vec::new(),
4869 is_private: has_private_classifier(&classifiers),
4870 is_virtual: false,
4871 extra_data,
4872 dependencies,
4873 repository_homepage_url: None,
4874 repository_download_url: None,
4875 api_data_url: None,
4876 datasource_id: Some(DatasourceId::PypiSetupCfg),
4877 purl,
4878 }
4879}
4880
/// Splits a comma-separated keyword string into trimmed, non-empty keywords.
///
/// Returns an empty vector when `value` is `None`.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    value
        .map(|raw| {
            raw.split(',')
                .filter_map(|piece| {
                    let keyword = piece.trim();
                    (!keyword.is_empty()).then(|| keyword.to_owned())
                })
                .collect()
        })
        .unwrap_or_default()
}
4893
/// Turns `label = url` entries into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both sides are trimmed. Entries without
/// an `=`, or with an empty label or URL, are dropped.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };
        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }

    pairs
}
4909
4910fn apply_project_url_mappings(
4911 parsed_urls: &[(String, String)],
4912 homepage_url: &mut Option<String>,
4913 bug_tracking_url: &mut Option<String>,
4914 code_view_url: &mut Option<String>,
4915 vcs_url: &mut Option<String>,
4916 extra_data: &mut HashMap<String, serde_json::Value>,
4917) {
4918 for (label, url) in parsed_urls {
4919 let label_lower = label.to_lowercase();
4920
4921 if bug_tracking_url.is_none()
4922 && matches!(
4923 label_lower.as_str(),
4924 "tracker"
4925 | "bug reports"
4926 | "bug tracker"
4927 | "issues"
4928 | "issue tracker"
4929 | "github: issues"
4930 )
4931 {
4932 *bug_tracking_url = Some(url.clone());
4933 } else if code_view_url.is_none()
4934 && matches!(label_lower.as_str(), "source" | "source code" | "code")
4935 {
4936 *code_view_url = Some(url.clone());
4937 } else if vcs_url.is_none()
4938 && matches!(
4939 label_lower.as_str(),
4940 "github" | "gitlab" | "github: repo" | "repository"
4941 )
4942 {
4943 *vcs_url = Some(url.clone());
4944 } else if homepage_url.is_none()
4945 && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4946 {
4947 *homepage_url = Some(url.clone());
4948 } else if label_lower == "changelog" {
4949 extra_data.insert(
4950 "changelog_url".to_string(),
4951 serde_json::Value::String(url.clone()),
4952 );
4953 }
4954 }
4955
4956 let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4957 .iter()
4958 .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4959 .collect();
4960
4961 if !project_urls_json.is_empty() {
4962 extra_data.insert(
4963 "project_urls".to_string(),
4964 serde_json::Value::Object(project_urls_json),
4965 );
4966 }
4967}
4968
4969fn parse_setup_cfg(content: &str) -> IniSections {
4970 let mut sections: IniSections = HashMap::new();
4971 let mut current_section: Option<String> = None;
4972 let mut current_key: Option<String> = None;
4973
4974 for raw_line in content.lines() {
4975 let line = raw_line.trim_end_matches('\r');
4976 let trimmed = line.trim();
4977 if trimmed.is_empty() {
4978 continue;
4979 }
4980
4981 let stripped = line.trim_start();
4982 if stripped.starts_with('#') || stripped.starts_with(';') {
4983 continue;
4984 }
4985
4986 if stripped.starts_with('[') && stripped.ends_with(']') {
4987 let section_name = stripped
4988 .trim_start_matches('[')
4989 .trim_end_matches(']')
4990 .trim()
4991 .to_ascii_lowercase();
4992 current_section = if section_name.is_empty() {
4993 None
4994 } else {
4995 Some(section_name)
4996 };
4997 current_key = None;
4998 continue;
4999 }
5000
5001 if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
5002 if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
5003 let value = stripped.trim();
5004 if !value.is_empty() {
5005 sections
5006 .entry(section.clone())
5007 .or_default()
5008 .entry(key.clone())
5009 .or_default()
5010 .push(value.to_string());
5011 }
5012 }
5013 continue;
5014 }
5015
5016 if let Some((key, value)) = stripped.split_once('=')
5017 && let Some(section) = current_section.as_ref()
5018 {
5019 let key_name = key.trim().to_ascii_lowercase();
5020 let value_trimmed = value.trim();
5021 let entry = sections
5022 .entry(section.clone())
5023 .or_default()
5024 .entry(key_name.clone())
5025 .or_default();
5026 if !value_trimmed.is_empty() {
5027 entry.push(value_trimmed.to_string());
5028 }
5029 current_key = Some(key_name);
5030 }
5031 }
5032
5033 sections
5034}
5035
5036fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
5037 sections
5038 .get(§ion.to_ascii_lowercase())
5039 .and_then(|values| values.get(&key.to_ascii_lowercase()))
5040 .and_then(|entries| entries.first())
5041 .map(|value| value.trim().to_string())
5042}
5043
5044fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
5045 sections
5046 .get(§ion.to_ascii_lowercase())
5047 .and_then(|values| values.get(&key.to_ascii_lowercase()))
5048 .cloned()
5049 .unwrap_or_default()
5050}
5051
5052fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
5053 let mut dependencies = Vec::new();
5054
5055 for (sub_section, scope) in [
5056 ("install_requires", "install"),
5057 ("tests_require", "test"),
5058 ("setup_requires", "setup"),
5059 ] {
5060 let reqs = get_ini_values(sections, "options", sub_section);
5061 dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
5062 }
5063
5064 if let Some(extras) = sections.get("options.extras_require") {
5065 let mut extra_items: Vec<_> = extras.iter().collect();
5066 extra_items.sort_by_key(|(name, _)| *name);
5067 for (extra_name, reqs) in extra_items {
5068 dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
5069 }
5070 }
5071
5072 dependencies
5073}
5074
5075fn parse_setup_cfg_requirements(
5076 reqs: &[String],
5077 scope: &str,
5078 is_optional: bool,
5079) -> Vec<Dependency> {
5080 reqs.iter()
5081 .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
5082 .collect()
5083}
5084
5085fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
5086 let trimmed = req.trim();
5087 if trimmed.is_empty() || trimmed.starts_with('#') {
5088 return None;
5089 }
5090
5091 let name = extract_setup_cfg_dependency_name(trimmed)?;
5092 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5093
5094 Some(Dependency {
5095 purl: Some(purl.to_string()),
5096 extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
5097 scope: Some(scope.to_string()),
5098 is_runtime: Some(true),
5099 is_optional: Some(is_optional),
5100 is_pinned: Some(false),
5101 is_direct: Some(true),
5102 resolved_package: None,
5103 extra_data: None,
5104 })
5105}
5106
/// Extracts the distribution name from the start of a requirement string.
///
/// The name ends at the first whitespace or at the first character that begins
/// another part of a requirement: a version operator (`<`, `>`, `=`, `!`, `~`),
/// a parenthesised version spec (`(`), extras (`[`), an environment marker
/// (`;`) or a direct URL reference (`@`). Returns `None` when nothing precedes
/// that point.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();

    let end = trimmed
        .find(|c: char| {
            c.is_whitespace()
                || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    // `trimmed` has no leading whitespace and `end` stops at the first
    // whitespace or delimiter, so the slice needs no further trimming.
    let name = &trimmed[..end];
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
5123
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    req.split_whitespace().collect()
}
5127
/// Finds a quoted `key = "value"` (or single-quoted) assignment in `content` by
/// plain text search and returns the text between the quotes.
///
/// Double-quoted forms are tried before single-quoted ones; within each, the
/// spacing variants `key=`, `key =`, `key= ` and `key = ` are tried in that
/// order. The value ends at the next occurrence of the *same* quote character
/// that opened it, so a value may contain the other kind of quote. Returns
/// `None` when no variant with a closing quote is found.
///
/// This is a textual match: it does not check that `key` is a whole identifier.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    // (text between key and `=`, text between `=` and the opening quote)
    const SPACINGS: [(&str, &str); 4] = [("", ""), (" ", ""), ("", " "), (" ", " ")];

    for quote in ['"', '\''] {
        for (before_eq, after_eq) in SPACINGS {
            let pattern = format!("{}{}={}{}", key, before_eq, after_eq, quote);

            let Some(start_idx) = content.find(&pattern) else {
                continue;
            };
            // The pattern ends with an ASCII quote, so this is a char boundary.
            let remaining = &content[start_idx + pattern.len()..];

            if let Some(end_idx) = remaining.find(quote) {
                return Some(remaining[..end_idx].to_string());
            }
        }
    }

    None
}
5153
5154fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
5155 let mut dependencies = Vec::new();
5156
5157 if let Some(tests_deps) = extract_tests_require(content) {
5158 dependencies.extend(tests_deps);
5159 }
5160
5161 if let Some(extras_deps) = extract_extras_require(content) {
5162 dependencies.extend(extras_deps);
5163 }
5164
5165 dependencies
5166}
5167
5168fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
5169 let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
5170 let re = Regex::new(pattern).ok()?;
5171 let captures = re.captures(content)?;
5172 let deps_str = captures.get(1)?.as_str();
5173
5174 let deps = parse_setup_py_dep_list(deps_str, "test", true);
5175 if deps.is_empty() { None } else { Some(deps) }
5176}
5177
5178fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
5179 let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
5180 let re = Regex::new(pattern).ok()?;
5181 let captures = re.captures(content)?;
5182 let dict_content = captures.get(1)?.as_str();
5183
5184 let mut all_deps = Vec::new();
5185
5186 let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
5187 let entry_re = Regex::new(entry_pattern).ok()?;
5188
5189 for entry_cap in entry_re.captures_iter(dict_content) {
5190 if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
5191 let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
5192 all_deps.extend(deps);
5193 }
5194 }
5195
5196 if all_deps.is_empty() {
5197 None
5198 } else {
5199 Some(all_deps)
5200 }
5201}
5202
5203fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
5204 let dep_pattern = r#"['"]([^'"]+)['"]"#;
5205 let re = match Regex::new(dep_pattern) {
5206 Ok(r) => r,
5207 Err(_) => return Vec::new(),
5208 };
5209
5210 re.captures_iter(deps_str)
5211 .filter_map(|cap| {
5212 let dep_str = cap.get(1)?.as_str().trim();
5213 if dep_str.is_empty() {
5214 return None;
5215 }
5216
5217 let name = extract_setup_cfg_dependency_name(dep_str)?;
5218 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5219
5220 Some(Dependency {
5221 purl: Some(purl.to_string()),
5222 extracted_requirement: Some(dep_str.to_string()),
5223 scope: Some(scope.to_string()),
5224 is_runtime: Some(true),
5225 is_optional: Some(is_optional),
5226 is_pinned: Some(false),
5227 is_direct: Some(true),
5228 resolved_package: None,
5229 extra_data: None,
5230 })
5231 })
5232 .collect()
5233}
5234
5235pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
5237 let content = read_file_to_string(path).map_err(|e| e.to_string())?;
5238 toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
5239}
5240
5241fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<Sha256Digest>) {
5252 let mut file = match File::open(path) {
5253 Ok(f) => f,
5254 Err(_) => return (None, None),
5255 };
5256
5257 let metadata = match file.metadata() {
5258 Ok(m) => m,
5259 Err(_) => return (None, None),
5260 };
5261 let size = metadata.len();
5262
5263 let mut hasher = Sha256::new();
5264 let mut buffer = vec![0; 8192];
5265
5266 loop {
5267 match file.read(&mut buffer) {
5268 Ok(0) => break,
5269 Ok(n) => hasher.update(&buffer[..n]),
5270 Err(_) => return (Some(size), None),
5271 }
5272 }
5273
5274 let hash = Sha256Digest::from_bytes(hasher.finalize().into());
5275 (Some(size), Some(hash))
5276}
5277
5278fn default_package_data(path: &Path) -> PackageData {
5279 PackageData {
5280 package_type: Some(PythonParser::PACKAGE_TYPE),
5281 primary_language: Some("Python".to_string()),
5282 datasource_id: infer_python_datasource_id(path),
5283 ..Default::default()
5284 }
5285}
5286
5287fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
5288 let file_name = path.file_name().and_then(|name| name.to_str());
5289
5290 match file_name {
5291 Some("pyproject.toml") => {
5292 if read_toml_file(path)
5293 .ok()
5294 .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
5295 .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
5296 .is_some()
5297 {
5298 Some(DatasourceId::PypiPoetryPyprojectToml)
5299 } else {
5300 Some(DatasourceId::PypiPyprojectToml)
5301 }
5302 }
5303 Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
5304 Some(DatasourceId::PypiSetupPy)
5305 }
5306 Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
5307 Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
5308 Some("METADATA") if is_installed_wheel_metadata_path(path) => {
5309 Some(DatasourceId::PypiWheelMetadata)
5310 }
5311 Some("pypi.json") => Some(DatasourceId::PypiJson),
5312 Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
5313 Some("origin.json") if is_pip_cache_origin_json(path) => {
5314 Some(DatasourceId::PypiPipOriginJson)
5315 }
5316 _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
5317 Some(DatasourceId::PypiSdist)
5318 }
5319 _ if path
5320 .extension()
5321 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
5322 {
5323 Some(DatasourceId::PypiWheel)
5324 }
5325 _ if path
5326 .extension()
5327 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
5328 {
5329 Some(DatasourceId::PypiEgg)
5330 }
5331 _ => None,
5332 }
5333}
5334
// Registers this parser's metadata. The arguments are, in order: a
// human-readable description, the glob patterns of candidate files, the
// package type ("pypi"), the primary language, and a documentation URL.
// NOTE(review): the archive globs (`*.tar.gz`, `*.zip`, ...) are broad;
// presumably the parser narrows them down before parsing (compare the sdist
// filename check in `infer_python_datasource_id`) — confirm against the matcher.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);