1use crate::models::{
35 DatasourceId, Dependency, FileReference, PackageData, PackageType, Party, Sha256Digest,
36};
37use crate::parser_warn as warn;
38use crate::parsers::utils::{
39 MAX_ITERATION_COUNT, read_file_to_string, split_name_email, truncate_field,
40};
41use base64::Engine;
42use base64::engine::general_purpose::URL_SAFE_NO_PAD;
43use bzip2::read::BzDecoder;
44use csv::ReaderBuilder;
45use flate2::read::GzDecoder;
46use liblzma::read::XzDecoder;
47use packageurl::PackageUrl;
48use regex::Regex;
49use ruff_python_ast as ast;
50use ruff_python_parser::parse_module;
51use serde_json::{Map as JsonMap, Value as JsonValue};
52use sha2::{Digest, Sha256};
53use std::collections::{HashMap, HashSet};
54use std::fs::File;
55use std::io::Read;
56use std::path::{Component, Path, PathBuf};
57use tar::Archive;
58use toml::Value as TomlValue;
59use toml::map::Map as TomlMap;
60use zip::ZipArchive;
61
62use super::PackageParser;
63use super::license_normalization::{
64 DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
65 normalize_spdx_expression,
66};
67use super::pep508::parse_pep508_requirement;
68
// Key names used when reading project metadata tables.
// NOTE(review): the use sites are outside this excerpt; the values look like
// `pyproject.toml` table/field names — confirm against the callers.

/// Name of the top-level project table.
const FIELD_PROJECT: &str = "project";

// Core identity and descriptive fields.
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_LICENSE: &str = "license";

// People associated with the project.
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";

// URL-related fields.
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";

// Dependency-related fields.
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_EXTRAS: &str = "extras";
84
/// Five optional URL strings carried together as one tuple.
///
/// NOTE(review): the meaning of each slot is defined by the producer/consumer
/// functions, which are outside this excerpt — confirm the slot order there
/// before relying on it.
#[rustfmt::skip]
type ProjectUrls = (Option<String>, Option<String>, Option<String>, Option<String>, Option<String>);
// Additional metadata key names (use sites are outside this excerpt).
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

// Limits for `setup.py` handling.
// NOTE(review): enforced outside this excerpt; by name these bound the file
// size and the AST node count / nesting depth — confirm at the use sites.
const MAX_SETUP_PY_BYTES: usize = 1024 * 1024;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

/// Upper bound (bytes) on an archive: applied both to the on-disk size of an
/// sdist and to the summed declared size of the entries inside an archive.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;

/// Upper bound (bytes) on the declared size of a single archive entry.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;

/// Largest uncompressed:compressed ratio tolerated before an archive (or
/// entry) is treated as suspicious.
const MAX_COMPRESSION_RATIO: f64 = 100.0;

/// Parser for Python packaging metadata files and archives.
pub struct PythonParser;
111
/// Archive container formats recognised for Python source distributions.
#[derive(Debug, Clone, Copy)]
enum PythonSdistArchiveFormat {
    /// File name ends with `.tar.gz`.
    TarGz,
    /// File name ends with `.tgz`.
    Tgz,
    /// File name ends with `.tar.bz2`.
    TarBz2,
    /// File name ends with `.tar.xz`.
    TarXz,
    /// File name ends with `.zip`.
    Zip,
}
120
/// A zip entry that passed the path, size and compression-ratio checks in
/// `collect_validated_zip_entries`.
#[derive(Debug, Clone)]
struct ValidatedZipEntry {
    /// Position of the entry inside the zip archive.
    index: usize,
    /// Entry path after normalisation by `normalize_archive_entry_path`.
    name: String,
}
126
127impl PackageParser for PythonParser {
128 const PACKAGE_TYPE: PackageType = PackageType::Pypi;
129
130 fn extract_packages(path: &Path) -> Vec<PackageData> {
131 vec![
132 if path.file_name().unwrap_or_default() == "pyproject.toml" {
133 extract_from_pyproject_toml(path)
134 } else if path.file_name().unwrap_or_default() == "setup.cfg" {
135 extract_from_setup_cfg(path)
136 } else if is_setup_py_like_path(path) {
137 return extract_setup_py_packages(path);
138 } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
139 extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
140 } else if is_installed_wheel_metadata_path(path) {
141 extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
142 } else if is_pip_cache_origin_json(path) {
143 extract_from_pip_origin_json(path)
144 } else if path.file_name().unwrap_or_default() == "pypi.json" {
145 extract_from_pypi_json(path)
146 } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
147 extract_from_pip_inspect(path)
148 } else if is_python_sdist_archive_path(path) {
149 extract_from_sdist_archive(path)
150 } else if path
151 .extension()
152 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
153 {
154 extract_from_wheel_archive(path)
155 } else if path
156 .extension()
157 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
158 {
159 extract_from_egg_archive(path)
160 } else {
161 default_package_data(path)
162 },
163 ]
164 }
165
166 fn is_match(path: &Path) -> bool {
167 if let Some(filename) = path.file_name()
168 && (filename == "pyproject.toml"
169 || filename == "setup.cfg"
170 || is_setup_py_like_path(path)
171 || filename == "PKG-INFO"
172 || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
173 || filename == "pypi.json"
174 || filename == "pip-inspect.deplock"
175 || is_pip_cache_origin_json(path))
176 {
177 return true;
178 }
179
180 if let Some(extension) = path.extension() {
181 let ext = extension.to_string_lossy().to_lowercase();
182 if (ext == "whl" && is_valid_wheel_archive_path(path))
183 || ext == "egg"
184 || is_python_sdist_archive_path(path)
185 {
186 return true;
187 }
188 }
189
190 false
191 }
192}
193
/// Returns `true` when the final path component is `setup.py` or ends with
/// `_setup.py` / `-setup.py`.
///
/// Paths without a final component, or whose name is not valid UTF-8, never
/// match.
fn is_setup_py_like_path(path: &Path) -> bool {
    let Some(file_name) = path.file_name().and_then(|os_name| os_name.to_str()) else {
        return false;
    };

    match file_name {
        "setup.py" => true,
        other => other.ends_with("_setup.py") || other.ends_with("-setup.py"),
    }
}
201
/// Returns `true` for a file named exactly `METADATA` whose parent directory
/// name ends with `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    if path.file_name().and_then(|name| name.to_str()) != Some("METADATA") {
        return false;
    }

    let parent_dir_name = path
        .parent()
        .and_then(Path::file_name)
        .and_then(|dir_name| dir_name.to_str());

    matches!(parent_dir_name, Some(dir_name) if dir_name.ends_with(".dist-info"))
}
210
/// Values read from a `WHEEL` file by `parse_installed_wheel_metadata`.
#[derive(Clone, Debug)]
struct InstalledWheelMetadata {
    /// Every `tag` header value; never empty when produced by the parser.
    wheel_tags: Vec<String>,
    /// First `wheel-version` header value, if present.
    wheel_version: Option<String>,
    /// First `generator` header value, if present.
    wheel_generator: Option<String>,
    /// `root-is-purelib` header parsed as a boolean; `None` when the header is
    /// missing or is neither `true` nor `false` (ASCII case-insensitive).
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`.
    compressed_tag: Option<String>,
}
219
220fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
221 let Some(parent) = path.parent() else {
222 return;
223 };
224
225 if !parent
226 .file_name()
227 .and_then(|name| name.to_str())
228 .is_some_and(|name| name.ends_with(".dist-info"))
229 {
230 return;
231 }
232
233 let wheel_path = parent.join("WHEEL");
234 if !wheel_path.exists() {
235 return;
236 }
237
238 let Ok(content) = read_file_to_string(&wheel_path, None) else {
239 warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
240 return;
241 };
242
243 let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
244 return;
245 };
246
247 apply_installed_wheel_metadata(package_data, &wheel_metadata);
248}
249
250fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
251 use super::rfc822::{get_header_all, get_header_first};
252
253 let metadata = super::rfc822::parse_rfc822_content(content);
254 let wheel_tags = get_header_all(&metadata.headers, "tag");
255 if wheel_tags.is_empty() {
256 return None;
257 }
258
259 let wheel_version = get_header_first(&metadata.headers, "wheel-version");
260 let wheel_generator = get_header_first(&metadata.headers, "generator");
261 let root_is_purelib =
262 get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
263 match value.to_ascii_lowercase().as_str() {
264 "true" => Some(true),
265 "false" => Some(false),
266 _ => None,
267 }
268 });
269
270 let compressed_tag = compress_wheel_tags(&wheel_tags);
271
272 Some(InstalledWheelMetadata {
273 wheel_tags,
274 wheel_version,
275 wheel_generator,
276 root_is_purelib,
277 compressed_tag,
278 })
279}
280
/// Collapses several `python-abi-platform` tags into one compressed tag.
///
/// * No tags: `None`.
/// * Exactly one tag: returned unchanged, without validation.
/// * Several tags: each is split into at most three `-`-separated fields.
///   The result is `None` if any tag has fewer than three fields or if the
///   ABI or platform fields differ between tags. Otherwise the python fields
///   are joined with `.` in input order, e.g. `py2.py3-none-any`.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => return None,
        [only] => return Some(only.clone()),
        _ => {}
    }

    let mut interpreters: Vec<&str> = Vec::with_capacity(tags.len());
    let mut shared_suffix: Option<(&str, &str)> = None;

    for tag in tags {
        let mut fields = tag.splitn(3, '-');
        let python = fields.next()?;
        let abi = fields.next()?;
        let platform = fields.next()?;

        if let Some((seen_abi, seen_platform)) = shared_suffix {
            if seen_abi != abi || seen_platform != platform {
                return None;
            }
        }

        shared_suffix = Some((abi, platform));
        interpreters.push(python);
    }

    let (abi, platform) = shared_suffix?;
    Some(format!("{}-{}-{}", interpreters.join("."), abi, platform))
}
318
319fn apply_installed_wheel_metadata(
320 package_data: &mut PackageData,
321 wheel_metadata: &InstalledWheelMetadata,
322) {
323 let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
324 extra_data.insert(
325 "wheel_tags".to_string(),
326 JsonValue::Array(
327 wheel_metadata
328 .wheel_tags
329 .iter()
330 .cloned()
331 .map(JsonValue::String)
332 .collect(),
333 ),
334 );
335
336 if let Some(wheel_version) = &wheel_metadata.wheel_version {
337 extra_data.insert(
338 "wheel_version".to_string(),
339 JsonValue::String(wheel_version.clone()),
340 );
341 }
342
343 if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
344 extra_data.insert(
345 "wheel_generator".to_string(),
346 JsonValue::String(wheel_generator.clone()),
347 );
348 }
349
350 if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
351 extra_data.insert(
352 "root_is_purelib".to_string(),
353 JsonValue::Bool(root_is_purelib),
354 );
355 }
356
357 if let (Some(name), Some(version), Some(extension)) = (
358 package_data.name.as_deref(),
359 package_data.version.as_deref(),
360 wheel_metadata.compressed_tag.as_deref(),
361 ) {
362 package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
363 }
364}
365
/// Returns `true` for a file named `origin.json` that has an ancestor
/// directory named `wheels` (ASCII case-insensitive).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    let is_origin_json = path.file_name().and_then(|name| name.to_str()) == Some("origin.json");
    if !is_origin_json {
        return false;
    }

    // `ancestors()` starts with the path itself, so skip it and look only at
    // the containing directories.
    path.ancestors()
        .skip(1)
        .filter_map(|ancestor| ancestor.file_name())
        .filter_map(|dir_name| dir_name.to_str())
        .any(|dir_name| dir_name.eq_ignore_ascii_case("wheels"))
}
375
376fn extract_from_pip_origin_json(path: &Path) -> PackageData {
377 let content = match read_file_to_string(path, None) {
378 Ok(content) => content,
379 Err(e) => {
380 warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
381 return default_package_data(path);
382 }
383 };
384
385 let root: JsonValue = match serde_json::from_str(&content) {
386 Ok(root) => root,
387 Err(e) => {
388 warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
389 return default_package_data(path);
390 }
391 };
392
393 let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
394 warn!("No url found in pip cache origin.json at {:?}", path);
395 return default_package_data(path);
396 };
397
398 let sibling_wheel = find_sibling_cached_wheel(path);
399 let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
400 sibling_wheel
401 .as_ref()
402 .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
403 });
404
405 let Some((name, version)) = name_version else {
406 warn!(
407 "Failed to infer package name/version from pip cache origin.json at {:?}",
408 path
409 );
410 return default_package_data(path);
411 };
412
413 let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
414 build_pypi_urls(Some(&name), Some(&version));
415 let purl = sibling_wheel
416 .as_ref()
417 .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
418 .or(plain_purl);
419
420 PackageData {
421 package_type: Some(PythonParser::PACKAGE_TYPE),
422 primary_language: Some("Python".to_string()),
423 name: Some(truncate_field(name)),
424 version: Some(version),
425 datasource_id: Some(DatasourceId::PypiPipOriginJson),
426 download_url: Some(truncate_field(download_url.to_string())),
427 sha256: extract_sha256_from_origin_json(&root)
428 .and_then(|h| Sha256Digest::from_hex(&h).ok()),
429 repository_homepage_url,
430 repository_download_url,
431 api_data_url,
432 purl,
433 ..Default::default()
434 }
435}
436
437fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
438 let parent = path.parent()?;
439 let entries = parent.read_dir().ok()?;
440
441 for entry in entries.flatten() {
442 let sibling_path = entry.path();
443 if sibling_path
444 .extension()
445 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
446 && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
447 {
448 return Some(wheel_info);
449 }
450 }
451
452 None
453}
454
455fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
456 let file_name = url.rsplit('/').next()?;
457
458 if file_name.ends_with(".whl") {
459 return parse_wheel_filename(Path::new(file_name))
460 .map(|wheel_info| (wheel_info.name, wheel_info.version));
461 }
462
463 let stem = strip_python_archive_extension(file_name)?;
464 let (name, version) = stem.rsplit_once('-')?;
465 if name.is_empty() || version.is_empty() {
466 return None;
467 }
468
469 Some((name.replace('_', "-"), version.to_string()))
470}
471
/// Removes a known Python archive suffix (`.tar.gz`, `.tar.bz2`, `.tar.xz`,
/// `.tgz`, `.zip`, `.whl`) from `file_name`.
///
/// Matching is case-sensitive. Returns `None` when no suffix matches.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] = [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES.iter() {
        if let Some(stem) = file_name.strip_suffix(*suffix) {
            return Some(stem);
        }
    }

    None
}
477
478fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
479 root.pointer("/archive_info/hashes/sha256")
480 .and_then(|value| value.as_str())
481 .map(ToOwned::to_owned)
482 .or_else(|| {
483 root.pointer("/archive_info/hash")
484 .and_then(|value| value.as_str())
485 .and_then(normalize_origin_hash)
486 })
487}
488
/// Extracts a SHA-256 digest from a single hash string.
///
/// Accepts `sha256=<value>` and `sha256:<value>` (the value is returned
/// as-is, without validation) and bare strings of exactly 64 ASCII hex
/// digits. Anything else yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = hash
        .strip_prefix("sha256=")
        .or_else(|| hash.strip_prefix("sha256:"));
    if let Some(digest) = prefixed {
        return Some(digest.to_string());
    }

    let looks_like_bare_sha256 = hash.len() == 64 && hash.bytes().all(|b| b.is_ascii_hexdigit());
    if looks_like_bare_sha256 {
        Some(hash.to_string())
    } else {
        None
    }
}
501
502fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
503 let content = match read_file_to_string(path, None) {
504 Ok(content) => content,
505 Err(e) => {
506 warn!("Failed to read metadata at {:?}: {}", path, e);
507 return default_package_data(path);
508 }
509 };
510
511 let metadata = super::rfc822::parse_rfc822_content(&content);
512 let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
513 merge_sibling_metadata_dependencies(path, &mut package_data);
514 merge_sibling_metadata_file_references(path, &mut package_data);
515 if datasource_id == DatasourceId::PypiWheelMetadata {
516 merge_sibling_wheel_metadata(path, &mut package_data);
517 }
518 package_data
519}
520
521fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
522 let mut extra_dependencies = Vec::new();
523
524 if let Some(parent) = path.parent() {
525 let direct_requires = parent.join("requires.txt");
526 if direct_requires.exists()
527 && let Ok(content) = read_file_to_string(&direct_requires, None)
528 {
529 extra_dependencies.extend(parse_requires_txt(&content));
530 }
531
532 let sibling_egg_info_requires = parent
533 .read_dir()
534 .ok()
535 .into_iter()
536 .flatten()
537 .flatten()
538 .find_map(|entry| {
539 let child_path = entry.path();
540 if child_path.is_dir()
541 && child_path
542 .file_name()
543 .and_then(|name| name.to_str())
544 .is_some_and(|name| name.ends_with(".egg-info"))
545 {
546 let requires = child_path.join("requires.txt");
547 requires.exists().then_some(requires)
548 } else {
549 None
550 }
551 });
552
553 if let Some(requires_path) = sibling_egg_info_requires
554 && let Ok(content) = read_file_to_string(&requires_path, None)
555 {
556 extra_dependencies.extend(parse_requires_txt(&content));
557 }
558 }
559
560 for dependency in extra_dependencies {
561 if !package_data.dependencies.iter().any(|existing| {
562 existing.purl == dependency.purl
563 && existing.scope == dependency.scope
564 && existing.extracted_requirement == dependency.extracted_requirement
565 && existing.extra_data == dependency.extra_data
566 }) {
567 package_data.dependencies.push(dependency);
568 }
569 }
570}
571
572fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
573 let mut extra_refs = Vec::new();
574
575 if let Some(parent) = path.parent() {
576 let record_path = parent.join("RECORD");
577 if record_path.exists()
578 && let Ok(content) = read_file_to_string(&record_path, None)
579 {
580 extra_refs.extend(parse_record_csv(&content));
581 }
582
583 let installed_files_path = parent.join("installed-files.txt");
584 if installed_files_path.exists()
585 && let Ok(content) = read_file_to_string(&installed_files_path, None)
586 {
587 extra_refs.extend(parse_installed_files_txt(&content));
588 }
589
590 let sources_path = parent.join("SOURCES.txt");
591 if sources_path.exists()
592 && let Ok(content) = read_file_to_string(&sources_path, None)
593 {
594 extra_refs.extend(parse_sources_txt(&content));
595 }
596 }
597
598 for file_ref in extra_refs {
599 if !package_data
600 .file_references
601 .iter()
602 .any(|existing| existing.path == file_ref.path)
603 {
604 package_data.file_references.push(file_ref);
605 }
606 }
607}
608
609fn collect_validated_zip_entries<R: Read + std::io::Seek>(
610 archive: &mut ZipArchive<R>,
611 path: &Path,
612 archive_type: &str,
613) -> Result<Vec<ValidatedZipEntry>, String> {
614 let mut total_extracted = 0u64;
615 let mut entries = Vec::new();
616 let mut entry_count = 0usize;
617
618 for i in 0..archive.len() {
619 entry_count += 1;
620 if entry_count > MAX_ITERATION_COUNT {
621 warn!(
622 "Exceeded max entry count in {} {:?}; stopping at {} entries",
623 archive_type, path, MAX_ITERATION_COUNT
624 );
625 break;
626 }
627 if let Ok(file) = archive.by_index_raw(i) {
628 let compressed_size = file.compressed_size();
629 let uncompressed_size = file.size();
630 let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
631 warn!(
632 "Skipping unsafe path in {} {:?}: {}",
633 archive_type,
634 path,
635 file.name()
636 );
637 continue;
638 };
639
640 if compressed_size > 0 {
641 let ratio = uncompressed_size as f64 / compressed_size as f64;
642 if ratio > MAX_COMPRESSION_RATIO {
643 warn!(
644 "Suspicious compression ratio in {} {:?}: {:.2}:1",
645 archive_type, path, ratio
646 );
647 continue;
648 }
649 }
650
651 if uncompressed_size > MAX_FILE_SIZE {
652 warn!(
653 "File too large in {} {:?}: {} bytes (limit: {} bytes)",
654 archive_type, path, uncompressed_size, MAX_FILE_SIZE
655 );
656 continue;
657 }
658
659 total_extracted += uncompressed_size;
660 if total_extracted > MAX_ARCHIVE_SIZE {
661 let msg = format!(
662 "Total extracted size exceeds limit for {} {:?}",
663 archive_type, path
664 );
665 warn!("{}", msg);
666 return Err(msg);
667 }
668
669 entries.push(ValidatedZipEntry {
670 index: i,
671 name: entry_name,
672 });
673 }
674 }
675
676 Ok(entries)
677}
678
679fn is_python_sdist_archive_path(path: &Path) -> bool {
680 detect_python_sdist_archive_format(path).is_some()
681}
682
683fn is_valid_wheel_archive_path(path: &Path) -> bool {
684 if !path.is_file() {
685 return true;
686 }
687
688 let file = match File::open(path) {
689 Ok(file) => file,
690 Err(_) => return false,
691 };
692 let mut archive = match ZipArchive::new(file) {
693 Ok(archive) => archive,
694 Err(_) => return false,
695 };
696
697 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
698 Ok(entries) => entries,
699 Err(_) => return false,
700 };
701
702 find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA").is_some()
703}
704
705fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
706 let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
707
708 if !is_likely_python_sdist_filename(&file_name) {
709 return None;
710 }
711
712 if file_name.ends_with(".tar.gz") {
713 tar_gz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarGz)
714 } else if file_name.ends_with(".tgz") {
715 tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
716 } else if file_name.ends_with(".tar.bz2") {
717 tar_bz2_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarBz2)
718 } else if file_name.ends_with(".tar.xz") {
719 tar_xz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarXz)
720 } else if file_name.ends_with(".zip") {
721 zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
722 } else {
723 None
724 }
725}
726
727fn tar_gz_sdist_contains_pkg_info(path: &Path) -> bool {
728 let Some(compressed_size) = compressed_archive_size(path) else {
729 return false;
730 };
731 let file = match File::open(path) {
732 Ok(file) => file,
733 Err(_) => return false,
734 };
735 let decoder = GzDecoder::new(file);
736 tar_sdist_contains_pkg_info(path, decoder, "tar.gz", compressed_size)
737}
738
739fn tar_bz2_sdist_contains_pkg_info(path: &Path) -> bool {
740 let Some(compressed_size) = compressed_archive_size(path) else {
741 return false;
742 };
743 let file = match File::open(path) {
744 Ok(file) => file,
745 Err(_) => return false,
746 };
747 let decoder = BzDecoder::new(file);
748 tar_sdist_contains_pkg_info(path, decoder, "tar.bz2", compressed_size)
749}
750
751fn tar_xz_sdist_contains_pkg_info(path: &Path) -> bool {
752 let Some(compressed_size) = compressed_archive_size(path) else {
753 return false;
754 };
755 let file = match File::open(path) {
756 Ok(file) => file,
757 Err(_) => return false,
758 };
759 let decoder = XzDecoder::new(file);
760 tar_sdist_contains_pkg_info(path, decoder, "tar.xz", compressed_size)
761}
762
/// Returns the length reported by the filesystem metadata for `path`, or
/// `None` when the metadata cannot be read.
fn compressed_archive_size(path: &Path) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(metadata) => Some(metadata.len()),
        Err(_) => None,
    }
}
766
767fn tar_sdist_contains_pkg_info<R: Read>(
768 path: &Path,
769 reader: R,
770 archive_type: &str,
771 compressed_size: u64,
772) -> bool {
773 let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
774 else {
775 return false;
776 };
777
778 select_sdist_pkginfo_entry(path, &entries).is_some()
779}
780
781fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
782 if !path.is_file() {
783 return true;
784 }
785
786 let Some(compressed_size) = compressed_archive_size(path) else {
787 return false;
788 };
789 let file = match File::open(path) {
790 Ok(file) => file,
791 Err(_) => return false,
792 };
793 let decoder = GzDecoder::new(file);
794 tar_sdist_contains_pkg_info(path, decoder, "tgz", compressed_size)
795}
796
797fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
798 if !path.is_file() {
799 return true;
800 }
801
802 let file = match File::open(path) {
803 Ok(file) => file,
804 Err(_) => return false,
805 };
806 let mut archive = match ZipArchive::new(file) {
807 Ok(archive) => archive,
808 Err(_) => return false,
809 };
810
811 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
812 Ok(entries) => entries,
813 Err(_) => return false,
814 };
815 let metadata_entries: Vec<_> = validated_entries
816 .iter()
817 .filter(|entry| entry.name.ends_with("/PKG-INFO"))
818 .filter_map(|entry| {
819 read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
820 .ok()
821 .map(|content| (entry.name.clone(), content))
822 })
823 .collect();
824
825 has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
826}
827
828fn is_likely_python_sdist_filename(file_name: &str) -> bool {
829 let Some(stem) = strip_python_archive_extension(file_name) else {
830 return false;
831 };
832
833 let Some((name, version)) = stem.rsplit_once('-') else {
834 return false;
835 };
836
837 !name.is_empty()
838 && !version.is_empty()
839 && version.chars().any(|ch| ch.is_ascii_digit())
840 && name
841 .chars()
842 .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
843}
844
845fn extract_from_sdist_archive(path: &Path) -> PackageData {
846 let metadata = match std::fs::metadata(path) {
847 Ok(m) => m,
848 Err(e) => {
849 warn!(
850 "Failed to read metadata for sdist archive {:?}: {}",
851 path, e
852 );
853 return default_package_data(path);
854 }
855 };
856
857 if metadata.len() > MAX_ARCHIVE_SIZE {
858 warn!(
859 "sdist archive too large: {} bytes (limit: {} bytes)",
860 metadata.len(),
861 MAX_ARCHIVE_SIZE
862 );
863 return default_package_data(path);
864 }
865
866 let Some(format) = detect_python_sdist_archive_format(path) else {
867 return default_package_data(path);
868 };
869
870 let mut package_data = match format {
871 PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
872 let file = match File::open(path) {
873 Ok(file) => file,
874 Err(e) => {
875 warn!("Failed to open sdist archive {:?}: {}", path, e);
876 return default_package_data(path);
877 }
878 };
879 let decoder = GzDecoder::new(file);
880 extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
881 }
882 PythonSdistArchiveFormat::TarBz2 => {
883 let file = match File::open(path) {
884 Ok(file) => file,
885 Err(e) => {
886 warn!("Failed to open sdist archive {:?}: {}", path, e);
887 return default_package_data(path);
888 }
889 };
890 let decoder = BzDecoder::new(file);
891 extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
892 }
893 PythonSdistArchiveFormat::TarXz => {
894 let file = match File::open(path) {
895 Ok(file) => file,
896 Err(e) => {
897 warn!("Failed to open sdist archive {:?}: {}", path, e);
898 return default_package_data(path);
899 }
900 };
901 let decoder = XzDecoder::new(file);
902 extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
903 }
904 PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
905 };
906
907 if package_data.package_type.is_some() {
908 let (size, sha256) = calculate_file_checksums(path);
909 package_data.size = size;
910 package_data.sha256 = sha256;
911 }
912
913 package_data
914}
915
916fn extract_from_tar_sdist_archive<R: Read>(
917 path: &Path,
918 reader: R,
919 archive_type: &str,
920 compressed_size: u64,
921) -> PackageData {
922 let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
923 else {
924 return default_package_data(path);
925 };
926
927 build_sdist_package_data(path, entries)
928}
929
930fn collect_tar_sdist_entries<R: Read>(
931 path: &Path,
932 reader: R,
933 archive_type: &str,
934 compressed_size: u64,
935) -> Option<Vec<(String, String)>> {
936 let mut archive = Archive::new(reader);
937 let archive_entries = match archive.entries() {
938 Ok(entries) => entries,
939 Err(e) => {
940 warn!(
941 "Failed to read {} sdist archive {:?}: {}",
942 archive_type, path, e
943 );
944 return None;
945 }
946 };
947
948 let mut total_extracted = 0u64;
949 let mut entries = Vec::new();
950 let mut entry_count = 0usize;
951
952 for entry_result in archive_entries {
953 entry_count += 1;
954 if entry_count > MAX_ITERATION_COUNT {
955 warn!(
956 "Exceeded max entry count in {} sdist {:?}; stopping at {} entries",
957 archive_type, path, MAX_ITERATION_COUNT
958 );
959 break;
960 }
961
962 let mut entry = match entry_result {
963 Ok(entry) => entry,
964 Err(e) => {
965 warn!(
966 "Failed to read {} sdist entry from {:?}: {}",
967 archive_type, path, e
968 );
969 continue;
970 }
971 };
972
973 let entry_size = entry.size();
974 if entry_size > MAX_FILE_SIZE {
975 warn!(
976 "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
977 archive_type, path, entry_size, MAX_FILE_SIZE
978 );
979 continue;
980 }
981
982 total_extracted += entry_size;
983 if total_extracted > MAX_ARCHIVE_SIZE {
984 warn!(
985 "Total extracted size exceeds limit for {} sdist {:?}",
986 archive_type, path
987 );
988 return None;
989 }
990
991 if compressed_size > 0 {
992 let ratio = total_extracted as f64 / compressed_size as f64;
993 if ratio > MAX_COMPRESSION_RATIO {
994 warn!(
995 "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
996 archive_type, path, ratio
997 );
998 return None;
999 }
1000 }
1001
1002 let entry_path = match entry.path() {
1003 Ok(path) => path.to_string_lossy().replace('\\', "/"),
1004 Err(e) => {
1005 warn!(
1006 "Failed to get {} sdist entry path from {:?}: {}",
1007 archive_type, path, e
1008 );
1009 continue;
1010 }
1011 };
1012
1013 let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
1014 warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
1015 continue;
1016 };
1017
1018 if !is_relevant_sdist_text_entry(&entry_path) {
1019 continue;
1020 }
1021
1022 if let Ok(content) = read_limited_utf8(
1023 &mut entry,
1024 MAX_FILE_SIZE,
1025 &format!("{} entry {}", archive_type, entry_path),
1026 ) {
1027 entries.push((entry_path, content));
1028 }
1029 }
1030
1031 Some(entries)
1032}
1033
1034fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
1035 let file = match File::open(path) {
1036 Ok(file) => file,
1037 Err(e) => {
1038 warn!("Failed to open zip sdist archive {:?}: {}", path, e);
1039 return default_package_data(path);
1040 }
1041 };
1042
1043 let mut archive = match ZipArchive::new(file) {
1044 Ok(archive) => archive,
1045 Err(e) => {
1046 warn!("Failed to read zip sdist archive {:?}: {}", path, e);
1047 return default_package_data(path);
1048 }
1049 };
1050
1051 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
1052 Ok(entries) => entries,
1053 Err(_) => return default_package_data(path),
1054 };
1055
1056 let mut entries = Vec::new();
1057 for entry in validated_entries.iter() {
1058 if !is_relevant_sdist_text_entry(&entry.name) {
1059 continue;
1060 }
1061
1062 if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
1063 entries.push((entry.name.clone(), content));
1064 }
1065 }
1066
1067 build_sdist_package_data(path, entries)
1068}
1069
/// Returns `true` for archive entry paths ending in `/PKG-INFO`,
/// `/requires.txt` or `/SOURCES.txt` (case-sensitive; the leading `/` means
/// the file must sit inside at least one directory).
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
1075
1076fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
1077 let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
1078 warn!("No PKG-INFO file found in sdist archive {:?}", path);
1079 return default_package_data(path);
1080 };
1081
1082 let mut package_data =
1083 python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
1084 merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
1085 merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
1086 apply_sdist_name_version_fallback(path, &mut package_data);
1087 package_data.datasource_id = Some(DatasourceId::PypiSdist);
1088 package_data
1089}
1090
1091fn select_sdist_pkginfo_entry(
1092 archive_path: &Path,
1093 entries: &[(String, String)],
1094) -> Option<(String, String)> {
1095 let expected_name = sdist_archive_expected_name(archive_path);
1096
1097 entries
1098 .iter()
1099 .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
1100 .min_by_key(|(entry_path, content)| {
1101 let components: Vec<_> = entry_path
1102 .split('/')
1103 .filter(|part| !part.is_empty())
1104 .collect();
1105 let candidate_name = sdist_pkginfo_candidate_name(content);
1106 let name_rank = if candidate_name == expected_name {
1107 0
1108 } else {
1109 1
1110 };
1111 let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1112
1113 (name_rank, kind_rank, components.len(), entry_path.clone())
1114 })
1115 .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1116}
1117
1118fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1119 let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1120 return false;
1121 };
1122
1123 entries.iter().any(|(entry_path, content)| {
1124 sdist_pkginfo_kind_rank(entry_path) < 3
1125 && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1126 })
1127}
1128
1129fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1130 archive_path
1131 .file_name()
1132 .and_then(|name| name.to_str())
1133 .and_then(strip_python_archive_extension)
1134 .and_then(|stem| {
1135 stem.rsplit_once('-')
1136 .map(|(name, _)| normalize_python_package_name(name))
1137 })
1138}
1139
1140fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1141 let metadata = super::rfc822::parse_rfc822_content(content);
1142 super::rfc822::get_header_first(&metadata.headers, "name")
1143 .map(|name| normalize_python_package_name(&name))
1144}
1145
/// Ranks how an entry path relates to the known `PKG-INFO` layouts (lower is
/// preferred).
///
/// * `0` — `<root>/<name>.egg-info/PKG-INFO`
/// * `1` — `<root>/PKG-INFO`
/// * `2` — any other path ending in `.egg-info/PKG-INFO`
/// * `3` — everything else
///
/// Empty path segments (for example from doubled slashes) are ignored when
/// counting components.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let segments: Vec<&str> = entry_path
        .split('/')
        .filter(|segment| !segment.is_empty())
        .collect();

    match segments.as_slice() {
        [_, egg_info_dir, "PKG-INFO"] if egg_info_dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1163
1164fn merge_sdist_archive_dependencies(
1165 entries: &[(String, String)],
1166 metadata_path: &str,
1167 package_data: &mut PackageData,
1168) {
1169 let metadata_dir = metadata_path
1170 .rsplit_once('/')
1171 .map(|(dir, _)| dir)
1172 .unwrap_or("");
1173 let archive_root = metadata_path.split('/').next().unwrap_or("");
1174 let matched_egg_info_dir =
1175 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1176 let mut extra_dependencies = Vec::new();
1177
1178 for (entry_path, content) in entries {
1179 let is_direct_requires =
1180 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1181 let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1182 entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1183 });
1184
1185 if is_direct_requires || is_egg_info_requires {
1186 extra_dependencies.extend(parse_requires_txt(content));
1187 }
1188 }
1189
1190 for dependency in extra_dependencies {
1191 if !package_data.dependencies.iter().any(|existing| {
1192 existing.purl == dependency.purl
1193 && existing.scope == dependency.scope
1194 && existing.extracted_requirement == dependency.extracted_requirement
1195 && existing.extra_data == dependency.extra_data
1196 }) {
1197 package_data.dependencies.push(dependency);
1198 }
1199 }
1200}
1201
1202fn merge_sdist_archive_file_references(
1203 entries: &[(String, String)],
1204 metadata_path: &str,
1205 package_data: &mut PackageData,
1206) {
1207 let metadata_dir = metadata_path
1208 .rsplit_once('/')
1209 .map(|(dir, _)| dir)
1210 .unwrap_or("");
1211 let archive_root = metadata_path.split('/').next().unwrap_or("");
1212 let matched_egg_info_dir =
1213 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1214 let mut extra_refs = Vec::new();
1215
1216 for (entry_path, content) in entries {
1217 let is_direct_sources =
1218 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1219 let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1220 entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1221 });
1222
1223 if is_direct_sources || is_egg_info_sources {
1224 extra_refs.extend(parse_sources_txt(content));
1225 }
1226 }
1227
1228 for file_ref in extra_refs {
1229 if !package_data
1230 .file_references
1231 .iter()
1232 .any(|existing| existing.path == file_ref.path)
1233 {
1234 package_data.file_references.push(file_ref);
1235 }
1236 }
1237}
1238
1239fn select_matching_sdist_egg_info_dir(
1240 entries: &[(String, String)],
1241 archive_root: &str,
1242 package_name: Option<&str>,
1243) -> Option<String> {
1244 let normalized_package_name = package_name.map(normalize_python_package_name);
1245
1246 entries
1247 .iter()
1248 .filter_map(|(entry_path, _)| {
1249 let components: Vec<_> = entry_path
1250 .split('/')
1251 .filter(|part| !part.is_empty())
1252 .collect();
1253 if components.len() == 3
1254 && components[0] == archive_root
1255 && components[1].ends_with(".egg-info")
1256 {
1257 Some(components[1].to_string())
1258 } else {
1259 None
1260 }
1261 })
1262 .min_by_key(|egg_info_dir| {
1263 let normalized_dir_name =
1264 normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1265 let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1266 0
1267 } else {
1268 1
1269 };
1270
1271 (name_rank, egg_info_dir.clone())
1272 })
1273}
1274
/// Normalizes a Python project name for comparison, following the PEP 503 rule:
/// ASCII letters are lower-cased and every run of `-`, `_` or `.` collapses to a
/// single `-`.
///
/// This lets names such as `zope.interface`, `zope_interface` and `Zope-Interface`
/// compare equal, which matters because sdist file names, `.egg-info` directory
/// names and the `Name` header may each spell the separator differently.
fn normalize_python_package_name(name: &str) -> String {
    let mut normalized = String::with_capacity(name.len());
    let mut previous_was_separator = false;

    for ch in name.chars() {
        if matches!(ch, '-' | '_' | '.') {
            // Emit one dash per run of separators.
            if !previous_was_separator {
                normalized.push('-');
            }
            previous_was_separator = true;
        } else {
            normalized.push(ch.to_ascii_lowercase());
            previous_was_separator = false;
        }
    }

    normalized
}
1278
1279fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1280 let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1281 return;
1282 };
1283
1284 let Some(stem) = strip_python_archive_extension(file_name) else {
1285 return;
1286 };
1287
1288 let Some((name, version)) = stem.rsplit_once('-') else {
1289 return;
1290 };
1291
1292 if package_data.name.is_none() {
1293 package_data.name = Some(name.replace('_', "-"));
1294 }
1295 if package_data.version.is_none() {
1296 package_data.version = Some(version.to_string());
1297 }
1298
1299 if package_data.purl.is_none()
1300 || package_data.repository_homepage_url.is_none()
1301 || package_data.repository_download_url.is_none()
1302 || package_data.api_data_url.is_none()
1303 {
1304 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1305 build_pypi_urls(
1306 package_data.name.as_deref(),
1307 package_data.version.as_deref(),
1308 );
1309
1310 if package_data.repository_homepage_url.is_none() {
1311 package_data.repository_homepage_url = repository_homepage_url;
1312 }
1313 if package_data.repository_download_url.is_none() {
1314 package_data.repository_download_url = repository_download_url;
1315 }
1316 if package_data.api_data_url.is_none() {
1317 package_data.api_data_url = api_data_url;
1318 }
1319 if package_data.purl.is_none() {
1320 package_data.purl = purl;
1321 }
1322 }
1323}
1324
1325fn extract_from_wheel_archive(path: &Path) -> PackageData {
1326 let metadata = match std::fs::metadata(path) {
1327 Ok(m) => m,
1328 Err(e) => {
1329 warn!(
1330 "Failed to read metadata for wheel archive {:?}: {}",
1331 path, e
1332 );
1333 return default_package_data(path);
1334 }
1335 };
1336
1337 if metadata.len() > MAX_ARCHIVE_SIZE {
1338 warn!(
1339 "Wheel archive too large: {} bytes (limit: {} bytes)",
1340 metadata.len(),
1341 MAX_ARCHIVE_SIZE
1342 );
1343 return default_package_data(path);
1344 }
1345
1346 let file = match File::open(path) {
1347 Ok(f) => f,
1348 Err(e) => {
1349 warn!("Failed to open wheel archive {:?}: {}", path, e);
1350 return default_package_data(path);
1351 }
1352 };
1353
1354 let mut archive = match ZipArchive::new(file) {
1355 Ok(a) => a,
1356 Err(e) => {
1357 warn!("Failed to read wheel archive {:?}: {}", path, e);
1358 return default_package_data(path);
1359 }
1360 };
1361
1362 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1363 Ok(entries) => entries,
1364 Err(_) => return default_package_data(path),
1365 };
1366
1367 let metadata_entry =
1368 match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1369 Some(entry) => entry,
1370 None => {
1371 warn!("No METADATA file found in wheel archive {:?}", path);
1372 return default_package_data(path);
1373 }
1374 };
1375
1376 let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1377 Ok(c) => c,
1378 Err(e) => {
1379 warn!("Failed to read METADATA from {:?}: {}", path, e);
1380 return default_package_data(path);
1381 }
1382 };
1383
1384 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1385
1386 let (size, sha256) = calculate_file_checksums(path);
1387 package_data.size = size;
1388 package_data.sha256 = sha256;
1389
1390 if let Some(record_entry) =
1391 find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1392 && let Ok(record_content) =
1393 read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1394 {
1395 package_data.file_references = parse_record_csv(&record_content);
1396 }
1397
1398 if let Some(wheel_info) = parse_wheel_filename(path) {
1399 if package_data.name.is_none() {
1400 package_data.name = Some(wheel_info.name.clone());
1401 }
1402 if package_data.version.is_none() {
1403 package_data.version = Some(wheel_info.version.clone());
1404 }
1405
1406 package_data.qualifiers = Some(std::collections::HashMap::from([(
1407 "extension".to_string(),
1408 format!(
1409 "{}-{}-{}",
1410 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1411 ),
1412 )]));
1413
1414 package_data.purl = build_wheel_purl(
1415 package_data.name.as_deref(),
1416 package_data.version.as_deref(),
1417 &wheel_info,
1418 );
1419
1420 let mut extra_data = package_data.extra_data.unwrap_or_default();
1421 extra_data.insert(
1422 "python_requires".to_string(),
1423 serde_json::Value::String(wheel_info.python_tag.clone()),
1424 );
1425 extra_data.insert(
1426 "abi_tag".to_string(),
1427 serde_json::Value::String(wheel_info.abi_tag.clone()),
1428 );
1429 extra_data.insert(
1430 "platform_tag".to_string(),
1431 serde_json::Value::String(wheel_info.platform_tag.clone()),
1432 );
1433 package_data.extra_data = Some(extra_data);
1434 }
1435
1436 package_data
1437}
1438
1439fn extract_from_egg_archive(path: &Path) -> PackageData {
1440 let metadata = match std::fs::metadata(path) {
1441 Ok(m) => m,
1442 Err(e) => {
1443 warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1444 return default_package_data(path);
1445 }
1446 };
1447
1448 if metadata.len() > MAX_ARCHIVE_SIZE {
1449 warn!(
1450 "Egg archive too large: {} bytes (limit: {} bytes)",
1451 metadata.len(),
1452 MAX_ARCHIVE_SIZE
1453 );
1454 return default_package_data(path);
1455 }
1456
1457 let file = match File::open(path) {
1458 Ok(f) => f,
1459 Err(e) => {
1460 warn!("Failed to open egg archive {:?}: {}", path, e);
1461 return default_package_data(path);
1462 }
1463 };
1464
1465 let mut archive = match ZipArchive::new(file) {
1466 Ok(a) => a,
1467 Err(e) => {
1468 warn!("Failed to read egg archive {:?}: {}", path, e);
1469 return default_package_data(path);
1470 }
1471 };
1472
1473 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1474 Ok(entries) => entries,
1475 Err(_) => return default_package_data(path),
1476 };
1477
1478 let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1479 &validated_entries,
1480 &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1481 ) {
1482 Some(entry) => entry,
1483 None => {
1484 warn!("No PKG-INFO file found in egg archive {:?}", path);
1485 return default_package_data(path);
1486 }
1487 };
1488
1489 let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1490 Ok(c) => c,
1491 Err(e) => {
1492 warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1493 return default_package_data(path);
1494 }
1495 };
1496
1497 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1498
1499 let (size, sha256) = calculate_file_checksums(path);
1500 package_data.size = size;
1501 package_data.sha256 = sha256;
1502
1503 if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1504 &validated_entries,
1505 &[
1506 "EGG-INFO/installed-files.txt",
1507 ".egg-info/installed-files.txt",
1508 ],
1509 ) && let Ok(installed_files_content) =
1510 read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1511 {
1512 package_data.file_references = parse_installed_files_txt(&installed_files_content);
1513 }
1514
1515 if let Some(egg_info) = parse_egg_filename(path) {
1516 if package_data.name.is_none() {
1517 package_data.name = Some(egg_info.name.clone());
1518 }
1519 if package_data.version.is_none() {
1520 package_data.version = Some(egg_info.version.clone());
1521 }
1522
1523 if let Some(python_version) = &egg_info.python_version {
1524 let mut extra_data = package_data.extra_data.unwrap_or_default();
1525 extra_data.insert(
1526 "python_version".to_string(),
1527 serde_json::Value::String(python_version.clone()),
1528 );
1529 package_data.extra_data = Some(extra_data);
1530 }
1531 }
1532
1533 package_data.purl = build_egg_purl(
1534 package_data.name.as_deref(),
1535 package_data.version.as_deref(),
1536 );
1537
1538 package_data
1539}
1540
1541fn find_validated_zip_entry_by_suffix<'a>(
1542 entries: &'a [ValidatedZipEntry],
1543 suffix: &str,
1544) -> Option<&'a ValidatedZipEntry> {
1545 entries.iter().find(|entry| entry.name.ends_with(suffix))
1546}
1547
1548fn find_validated_zip_entry_by_any_suffix<'a>(
1549 entries: &'a [ValidatedZipEntry],
1550 suffixes: &[&str],
1551) -> Option<&'a ValidatedZipEntry> {
1552 entries
1553 .iter()
1554 .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1555}
1556
1557fn read_validated_zip_entry<R: Read + std::io::Seek>(
1558 archive: &mut ZipArchive<R>,
1559 entry: &ValidatedZipEntry,
1560 path: &Path,
1561 archive_type: &str,
1562) -> Result<String, String> {
1563 let mut file = archive
1564 .by_index(entry.index)
1565 .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1566
1567 let compressed_size = file.compressed_size();
1568 let uncompressed_size = file.size();
1569
1570 if compressed_size > 0 {
1571 let ratio = uncompressed_size as f64 / compressed_size as f64;
1572 if ratio > MAX_COMPRESSION_RATIO {
1573 return Err(format!(
1574 "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1575 archive_type, path, ratio
1576 ));
1577 }
1578 }
1579
1580 if uncompressed_size > MAX_FILE_SIZE {
1581 return Err(format!(
1582 "Rejected oversized entry in {} {:?}: {} bytes",
1583 archive_type, path, uncompressed_size
1584 ));
1585 }
1586
1587 read_limited_utf8(
1588 &mut file,
1589 MAX_FILE_SIZE,
1590 &format!("{} entry {}", archive_type, entry.name),
1591 )
1592}
1593
1594fn read_limited_utf8<R: Read>(
1595 reader: &mut R,
1596 max_bytes: u64,
1597 context: &str,
1598) -> Result<String, String> {
1599 let mut limited = reader.take(max_bytes + 1);
1600 let mut bytes = Vec::new();
1601 limited
1602 .read_to_end(&mut bytes)
1603 .map_err(|e| format!("Failed to read {}: {}", context, e))?;
1604
1605 if bytes.len() as u64 > max_bytes {
1606 return Err(format!(
1607 "{} exceeded {} byte limit while reading",
1608 context, max_bytes
1609 ));
1610 }
1611
1612 match String::from_utf8(bytes) {
1613 Ok(s) => Ok(s),
1614 Err(err) => {
1615 let bytes = err.into_bytes();
1616 warn!("Invalid UTF-8 in archive entry; using lossy conversion");
1617 Ok(String::from_utf8_lossy(&bytes).into_owned())
1618 }
1619 }
1620}
1621
/// Normalizes an archive entry path to a relative, `/`-separated form.
///
/// Backslashes are treated as separators and `.` components are dropped. The path
/// is rejected (`None`) when it starts with a drive prefix such as `C:/`, contains
/// a root, `..` or prefix component, or has no normal component left.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Reject `X:/...` explicitly: on platforms without drive prefixes `Path`
    // would parse the drive as an ordinary component.
    if matches!(unified.as_bytes(), [drive, b':', b'/', ..] if drive.is_ascii_alphabetic()) {
        return None;
    }

    let mut segments: Vec<String> = Vec::new();
    for component in Path::new(&unified).components() {
        match component {
            Component::Normal(segment) => segments.push(segment.to_string_lossy().into_owned()),
            Component::CurDir => {}
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1643
1644pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1649 let mut reader = ReaderBuilder::new()
1650 .has_headers(false)
1651 .from_reader(content.as_bytes());
1652
1653 let mut file_references = Vec::new();
1654 let mut record_count = 0usize;
1655
1656 for result in reader.records() {
1657 record_count += 1;
1658 if record_count > MAX_ITERATION_COUNT {
1659 warn!(
1660 "Exceeded max record count in RECORD CSV; stopping at {} records",
1661 MAX_ITERATION_COUNT
1662 );
1663 break;
1664 }
1665 match result {
1666 Ok(record) => {
1667 if record.len() < 3 {
1668 continue;
1669 }
1670
1671 let path = record.get(0).unwrap_or("").trim().to_string();
1672 if path.is_empty() {
1673 continue;
1674 }
1675
1676 let hash_field = record.get(1).unwrap_or("").trim();
1677 let size_field = record.get(2).unwrap_or("").trim();
1678
1679 let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1681 let parts: Vec<&str> = hash_field.split('=').collect();
1682 if parts.len() == 2 && parts[0] == "sha256" {
1683 match URL_SAFE_NO_PAD.decode(parts[1]) {
1684 Ok(decoded) => {
1685 let hex = decoded
1686 .iter()
1687 .map(|b| format!("{:02x}", b))
1688 .collect::<String>();
1689 Sha256Digest::from_hex(&hex).ok()
1690 }
1691 Err(_) => None,
1692 }
1693 } else {
1694 None
1695 }
1696 } else {
1697 None
1698 };
1699
1700 let size = if !size_field.is_empty() && size_field != "-" {
1702 size_field.parse::<u64>().ok()
1703 } else {
1704 None
1705 };
1706
1707 file_references.push(FileReference {
1708 path,
1709 size,
1710 sha1: None,
1711 md5: None,
1712 sha256,
1713 sha512: None,
1714 extra_data: None,
1715 });
1716 }
1717 Err(e) => {
1718 warn!("Failed to parse RECORD CSV row: {}", e);
1719 continue;
1720 }
1721 }
1722 }
1723
1724 file_references
1725}
1726
1727pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1730 content
1731 .lines()
1732 .take(MAX_ITERATION_COUNT)
1733 .map(|line| line.trim())
1734 .filter(|line| !line.is_empty())
1735 .map(|path| FileReference {
1736 path: path.to_string(),
1737 size: None,
1738 sha1: None,
1739 md5: None,
1740 sha256: None,
1741 sha512: None,
1742 extra_data: None,
1743 })
1744 .collect()
1745}
1746
1747pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1748 content
1749 .lines()
1750 .take(MAX_ITERATION_COUNT)
1751 .map(str::trim)
1752 .filter(|line| !line.is_empty())
1753 .map(|path| FileReference {
1754 path: path.to_string(),
1755 size: None,
1756 sha1: None,
1757 md5: None,
1758 sha256: None,
1759 sha512: None,
1760 extra_data: None,
1761 })
1762 .collect()
1763}
1764
/// Components parsed from a wheel file name.
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version segment, as written in the file name.
    version: String,
    /// Python tag, e.g. `py3` or `cp311`.
    python_tag: String,
    /// ABI tag, e.g. `none` or `cp311`.
    abi_tag: String,
    /// Platform tag, e.g. `any` or `linux_x86_64`.
    platform_tag: String,
}

/// Parses a wheel file name of the form
/// `{name}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl`.
///
/// The optional build tag is recognised by its position (six `-`-separated parts)
/// together with its leading digit, and is skipped so that it is not mistaken for
/// the Python tag. Any other stem with at least five parts keeps the lenient
/// legacy interpretation: parts three and four are the Python and ABI tags, and
/// everything after them is joined with `-` as the platform tag.
///
/// Returns `None` when the path has no file stem or the stem has fewer than five
/// parts.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    let (python_tag, abi_tag, platform_tag) = match parts.as_slice() {
        [_, _, build, python, abi, platform]
            if build.starts_with(|c: char| c.is_ascii_digit()) =>
        {
            (
                (*python).to_string(),
                (*abi).to_string(),
                (*platform).to_string(),
            )
        }
        [_, _, python, abi, platform @ ..] if !platform.is_empty() => (
            (*python).to_string(),
            (*abi).to_string(),
            platform.join("-"),
        ),
        _ => return None,
    };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag,
        abi_tag,
        platform_tag,
    })
}
1789
/// Components parsed from an egg file name.
struct EggInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version segment, as written in the file name.
    version: String,
    /// Third `-`-separated segment of the file stem, when present.
    python_version: Option<String>,
}

/// Parses an egg file name by splitting its stem on `-`.
///
/// The first segment is the name, the second the version, and the third (when
/// present) the Python version; further segments are ignored. Returns `None` when
/// the path has no file stem or the stem has fewer than two segments.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut segments = stem.split('-');

    let raw_name = segments.next()?;
    let raw_version = segments.next()?;
    let python_version = segments.next().map(str::to_owned);

    Some(EggInfo {
        name: raw_name.replace('_', "-"),
        version: raw_version.to_owned(),
        python_version,
    })
}
1810
1811fn build_wheel_purl(
1812 name: Option<&str>,
1813 version: Option<&str>,
1814 wheel_info: &WheelInfo,
1815) -> Option<String> {
1816 let name = name?;
1817 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1818
1819 if let Some(ver) = version {
1820 package_url.with_version(ver).ok()?;
1821 }
1822
1823 let extension = format!(
1824 "{}-{}-{}",
1825 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1826 );
1827 package_url.add_qualifier("extension", extension).ok()?;
1828
1829 Some(package_url.to_string())
1830}
1831
1832fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1833 let name = name?;
1834 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1835
1836 if let Some(ver) = version {
1837 package_url.with_version(ver).ok()?;
1838 }
1839
1840 package_url.add_qualifier("type", "egg").ok()?;
1841
1842 Some(package_url.to_string())
1843}
1844
1845fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1846 let metadata = super::rfc822::parse_rfc822_content(content);
1847 build_package_data_from_rfc822(&metadata, datasource_id)
1848}
1849
/// Builds `PackageData` from already-parsed RFC 822 style Python metadata.
///
/// Reads the well-known headers (`name`, `version`, `summary`, `home-page`,
/// `author`, `author-email`, `license`, `license-expression`, `download-url`,
/// `platform`, `requires-python`, `classifier`, `license-file`, `project-url`),
/// derives license data, parties, keywords, URLs, file references and
/// dependencies from them, and tags the result with `datasource_id`.
fn build_package_data_from_rfc822(
    metadata: &super::rfc822::Rfc822Metadata,
    datasource_id: DatasourceId,
) -> PackageData {
    use super::rfc822::{get_header_all, get_header_first};

    let name = get_header_first(&metadata.headers, "name").map(truncate_field);
    let version = get_header_first(&metadata.headers, "version").map(truncate_field);
    let summary = get_header_first(&metadata.headers, "summary").map(truncate_field);
    // Mutable: a matching `project-url` label may fill this in below.
    let mut homepage_url = get_header_first(&metadata.headers, "home-page").map(truncate_field);
    let author = get_header_first(&metadata.headers, "author").map(truncate_field);
    let author_email = get_header_first(&metadata.headers, "author-email").map(truncate_field);
    let license = get_header_first(&metadata.headers, "license").map(truncate_field);
    let license_expression = get_header_first(&metadata.headers, "license-expression");
    let download_url = get_header_first(&metadata.headers, "download-url");
    let platform = get_header_first(&metadata.headers, "platform");
    let requires_python = get_header_first(&metadata.headers, "requires-python");
    let classifiers = get_header_all(&metadata.headers, "classifier");
    let license_files = get_header_all(&metadata.headers, "license-file");

    // The message body is the long description; the `description` header is only
    // consulted when the body is empty.
    let description_body = if metadata.body.is_empty() {
        get_header_first(&metadata.headers, "description").unwrap_or_default()
    } else {
        metadata.body.clone()
    };

    let description = build_description(summary.as_deref(), &description_body).map(truncate_field);

    // A single author party is emitted when either the name or the email is known.
    let mut parties = Vec::new();
    if author.is_some() || author_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: author,
            email: author_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    // `License ::` classifiers feed the license statement; all others are keywords.
    let (keywords, license_classifiers) = split_classifiers(&classifiers);
    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
    // Prefer the normalized `license-expression` (with the license files recorded as
    // referenced filenames); otherwise fall back to
    // `normalize_spdx_declared_license` on the raw header value.
    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
        license_expression
            .as_deref()
            .and_then(normalize_spdx_expression)
            .map(|normalized| {
                build_declared_license_data(
                    normalized,
                    DeclaredLicenseMatchMetadata::single_line(
                        license_expression.as_deref().unwrap_or_default(),
                    )
                    .with_referenced_filenames(&referenced_license_files),
                )
            })
            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));

    // The raw `license-expression` wins over the `license` header plus classifiers.
    let extracted_license_statement = license_expression
        .clone()
        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));

    let mut extra_data = HashMap::new();
    // A platform of "unknown" (any ASCII case) or an empty value is not recorded.
    if let Some(platform_value) = platform
        && !platform_value.eq_ignore_ascii_case("unknown")
        && !platform_value.is_empty()
    {
        extra_data.insert(
            "platform".to_string(),
            serde_json::Value::String(platform_value),
        );
    }

    if let Some(requires_python_value) = requires_python
        && !requires_python_value.is_empty()
    {
        extra_data.insert(
            "requires_python".to_string(),
            serde_json::Value::String(requires_python_value),
        );
    }

    if !license_files.is_empty() {
        extra_data.insert(
            "license_files".to_string(),
            serde_json::Value::Array(
                license_files
                    .iter()
                    .cloned()
                    .map(serde_json::Value::String)
                    .collect(),
            ),
        );
    }

    // Every `license-file` header also becomes a path-only file reference.
    let file_references = license_files
        .iter()
        .map(|path| FileReference {
            path: path.clone(),
            size: None,
            sha1: None,
            md5: None,
            sha256: None,
            sha512: None,
            extra_data: None,
        })
        .collect();

    let project_urls = get_header_all(&metadata.headers, "project-url");
    let dependencies = extract_rfc822_dependencies(&metadata.headers);
    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);

    if !project_urls.is_empty() {
        let parsed_urls = parse_project_urls(&project_urls);

        // Labels are matched case-insensitively. For each URL slot the first
        // matching label wins; a changelog URL is overwritten by later entries.
        for (label, url) in &parsed_urls {
            let label_lower = label.to_lowercase();

            if bug_tracking_url.is_none()
                && matches!(
                    label_lower.as_str(),
                    "tracker"
                        | "bug reports"
                        | "bug tracker"
                        | "issues"
                        | "issue tracker"
                        | "github: issues"
                )
            {
                bug_tracking_url = Some(url.clone());
            } else if code_view_url.is_none()
                && matches!(label_lower.as_str(), "source" | "source code" | "code")
            {
                code_view_url = Some(url.clone());
            } else if vcs_url.is_none()
                && matches!(
                    label_lower.as_str(),
                    "github" | "gitlab" | "github: repo" | "repository"
                )
            {
                vcs_url = Some(url.clone());
            } else if homepage_url.is_none()
                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
            {
                // Only used when the `home-page` header did not provide a value.
                homepage_url = Some(url.clone());
            } else if label_lower == "changelog" {
                extra_data.insert(
                    "changelog_url".to_string(),
                    serde_json::Value::String(url.clone()),
                );
            }
        }

        // Keep the complete label -> URL mapping, with labels in their original case.
        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
            .iter()
            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
            .collect();

        if !project_urls_json.is_empty() {
            extra_data.insert(
                "project_urls".to_string(),
                serde_json::Value::Object(project_urls_json),
            );
        }
    }

    // An empty map is reported as `None` rather than `Some({})`.
    let extra_data = if extra_data.is_empty() {
        None
    } else {
        Some(extra_data)
    };

    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
        build_pypi_urls(name.as_deref(), version.as_deref());

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: Some("Python".to_string()),
        description,
        release_date: None,
        parties,
        keywords,
        homepage_url,
        download_url,
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url,
        code_view_url,
        vcs_url,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references,
        is_private: false,
        is_virtual: false,
        extra_data,
        dependencies,
        repository_homepage_url,
        repository_download_url,
        api_data_url,
        datasource_id: Some(datasource_id),
        purl,
    }
}
2075
/// Splits `Project-URL` header values of the form `Label, URL` into
/// `(label, url)` pairs.
///
/// The separator is the first `", "`; when an entry has none, the first bare `,`
/// is accepted instead so that `Label,URL` is not silently dropped. Label and URL
/// are trimmed, and entries with an empty label or URL, or without any comma, are
/// skipped. Input order is preserved.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|url_entry| {
            let (label, url) = url_entry
                .split_once(", ")
                .or_else(|| url_entry.split_once(','))?;
            let (label, url) = (label.trim(), url.trim());

            (!label.is_empty() && !url.is_empty()).then(|| (label.to_string(), url.to_string()))
        })
        .collect()
}
2091
/// Combines the short summary and the long description body into one text.
///
/// Both parts are trimmed; blank parts are dropped. When both are present they
/// are joined with a single newline, summary first. Returns `None` when neither
/// part has content.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let summary_text = summary.map(str::trim).filter(|text| !text.is_empty());
    let body_text = Some(body.trim()).filter(|text| !text.is_empty());

    match (summary_text, body_text) {
        (Some(short), Some(long)) => Some([short, long].join("\n")),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
2110
/// Separates classifiers into `(keywords, license_classifiers)`.
///
/// A classifier that starts with `License ::` is a license classifier; every
/// other classifier is a keyword. Relative order is preserved within each list.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|classifier| classifier.starts_with("License ::"));

    (keywords, license_classifiers)
}
2125
2126fn build_extracted_license_statement(
2127 license: Option<&str>,
2128 license_classifiers: &[String],
2129) -> Option<String> {
2130 let mut lines = Vec::new();
2131
2132 if let Some(value) = license
2133 && !value.trim().is_empty()
2134 {
2135 lines.push(format!("license: {}", value.trim()));
2136 }
2137
2138 if !license_classifiers.is_empty() {
2139 lines.push("classifiers:".to_string());
2140 for classifier in license_classifiers {
2141 lines.push(format!(" - '{}'", classifier));
2142 }
2143 }
2144
2145 if lines.is_empty() {
2146 None
2147 } else {
2148 Some(format!("{}\n", lines.join("\n")))
2149 }
2150}
2151
2152pub(crate) fn build_pypi_urls(
2153 name: Option<&str>,
2154 version: Option<&str>,
2155) -> (
2156 Option<String>,
2157 Option<String>,
2158 Option<String>,
2159 Option<String>,
2160) {
2161 let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2162
2163 let repository_download_url = name.and_then(|value| {
2164 version.map(|ver| {
2165 format!(
2166 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2167 &value[..1.min(value.len())],
2168 value,
2169 value,
2170 ver
2171 )
2172 })
2173 });
2174
2175 let api_data_url = name.map(|value| {
2176 if let Some(ver) = version {
2177 format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2178 } else {
2179 format!("https://pypi.org/pypi/{}/json", value)
2180 }
2181 });
2182
2183 let purl = name.and_then(|value| {
2184 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2185 if let Some(ver) = version {
2186 package_url.with_version(ver).ok()?;
2187 }
2188 Some(package_url.to_string())
2189 });
2190
2191 (
2192 repository_homepage_url,
2193 repository_download_url,
2194 api_data_url,
2195 purl,
2196 )
2197}
2198
2199fn build_pypi_purl_with_extension(
2200 name: &str,
2201 version: Option<&str>,
2202 extension: &str,
2203) -> Option<String> {
2204 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2205 if let Some(ver) = version {
2206 package_url.with_version(ver).ok()?;
2207 }
2208 package_url.add_qualifier("extension", extension).ok()?;
2209 Some(package_url.to_string())
2210}
2211
2212fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2213 let toml_content = match read_toml_file(path) {
2214 Ok(content) => content,
2215 Err(e) => {
2216 warn!(
2217 "Failed to read or parse pyproject.toml at {:?}: {}",
2218 path, e
2219 );
2220 return default_package_data(path);
2221 }
2222 };
2223
2224 let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2225 let is_poetry_pyproject = tool_table
2226 .and_then(|tool| tool.get("poetry"))
2227 .and_then(|value| value.as_table())
2228 .is_some();
2229
2230 let project_table =
2232 if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2233 project.clone()
2235 } else if let Some(tool) = tool_table {
2236 if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2237 poetry.clone()
2239 } else {
2240 return default_package_data(path);
2241 }
2242 } else if toml_content.get(FIELD_NAME).is_some() {
2243 match toml_content.as_table() {
2245 Some(table) => table.clone(),
2246 None => {
2247 warn!("Failed to convert TOML content to table in {:?}", path);
2248 return default_package_data(path);
2249 }
2250 }
2251 } else {
2252 return default_package_data(path);
2253 };
2254
2255 let name = project_table
2256 .get(FIELD_NAME)
2257 .and_then(|v| v.as_str())
2258 .map(|v| truncate_field(v.to_string()));
2259
2260 let version = project_table
2261 .get(FIELD_VERSION)
2262 .and_then(|v| v.as_str())
2263 .map(String::from);
2264 let classifiers = project_table
2265 .get("classifiers")
2266 .and_then(|value| value.as_array())
2267 .map(|values| {
2268 values
2269 .iter()
2270 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2271 .collect::<Vec<_>>()
2272 })
2273 .unwrap_or_default();
2274 let (classifier_keywords, license_classifiers) = split_classifiers(&classifiers);
2275
2276 let extracted_license_statement = extract_raw_license_string(&project_table);
2277 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2278 normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2279
2280 let description = project_table
2281 .get(FIELD_DESCRIPTION)
2282 .and_then(|value| value.as_str())
2283 .map(|value| truncate_field(value.to_string()));
2284 let mut keywords = project_table
2285 .get(FIELD_KEYWORDS)
2286 .and_then(|value| value.as_array())
2287 .map(|values| {
2288 values
2289 .iter()
2290 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2291 .collect::<Vec<_>>()
2292 })
2293 .unwrap_or_default();
2294 for classifier in classifier_keywords {
2295 if !keywords.contains(&classifier) {
2296 keywords.push(classifier);
2297 }
2298 }
2299
2300 let mut extra_data = extract_pyproject_extra_data(&toml_content).unwrap_or_default();
2302 let (homepage_url, download_url, bug_tracking_url, code_view_url, repository_url) =
2303 extract_urls(&project_table, &mut extra_data);
2304
2305 let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2306
2307 let purl = name.as_ref().and_then(|n| {
2309 let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2310 Ok(p) => p,
2311 Err(e) => {
2312 warn!(
2313 "Failed to create PackageUrl for Python package '{}': {}",
2314 n, e
2315 );
2316 return None;
2317 }
2318 };
2319
2320 if let Some(v) = &version
2321 && let Err(e) = package_url.with_version(v)
2322 {
2323 warn!(
2324 "Failed to set version '{}' for Python package '{}': {}",
2325 v, n, e
2326 );
2327 return None;
2328 }
2329
2330 Some(package_url.to_string())
2331 });
2332
2333 let api_data_url = name.as_ref().map(|n| {
2334 if let Some(v) = &version {
2335 format!("https://pypi.org/pypi/{}/{}/json", n, v)
2336 } else {
2337 format!("https://pypi.org/pypi/{}/json", n)
2338 }
2339 });
2340
2341 let pypi_homepage_url = name
2342 .as_ref()
2343 .map(|n| format!("https://pypi.org/project/{}", n));
2344
2345 let pypi_download_url = name.as_ref().and_then(|n| {
2346 version.as_ref().map(|v| {
2347 format!(
2348 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2349 &n[..1.min(n.len())],
2350 n,
2351 n,
2352 v
2353 )
2354 })
2355 });
2356
2357 PackageData {
2358 package_type: Some(PythonParser::PACKAGE_TYPE),
2359 namespace: None,
2360 name,
2361 version,
2362 qualifiers: None,
2363 subpath: None,
2364 primary_language: None,
2365 description,
2366 release_date: None,
2367 parties: extract_parties(&project_table),
2368 keywords,
2369 homepage_url: homepage_url.or(pypi_homepage_url),
2370 download_url: download_url
2371 .or_else(|| repository_url.clone())
2372 .or(pypi_download_url),
2373 size: None,
2374 sha1: None,
2375 md5: None,
2376 sha256: None,
2377 sha512: None,
2378 bug_tracking_url,
2379 code_view_url,
2380 vcs_url: repository_url,
2381 copyright: None,
2382 holder: None,
2383 declared_license_expression,
2384 declared_license_expression_spdx,
2385 license_detections,
2386 other_license_expression: None,
2387 other_license_expression_spdx: None,
2388 other_license_detections: Vec::new(),
2389 extracted_license_statement: extracted_license_statement
2390 .or_else(|| build_extracted_license_statement(None, &license_classifiers)),
2391 notice_text: None,
2392 source_packages: Vec::new(),
2393 file_references: Vec::new(),
2394 is_private: has_private_classifier(&classifiers),
2395 is_virtual: false,
2396 extra_data: if extra_data.is_empty() {
2397 None
2398 } else {
2399 Some(extra_data)
2400 },
2401 dependencies: [dependencies, optional_dependencies].concat(),
2402 repository_homepage_url: None,
2403 repository_download_url: None,
2404 api_data_url,
2405 datasource_id: Some(if is_poetry_pyproject {
2406 DatasourceId::PypiPoetryPyprojectToml
2407 } else {
2408 DatasourceId::PypiPyprojectToml
2409 }),
2410 purl,
2411 }
2412}
2413
2414fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2415 let path_str = path.to_string_lossy().replace('\\', "/");
2416 if path_str.contains("/EGG-INFO/PKG-INFO") {
2417 DatasourceId::PypiEggPkginfo
2418 } else if path_str.ends_with(".egg-info/PKG-INFO") {
2419 DatasourceId::PypiEditableEggPkginfo
2420 } else {
2421 DatasourceId::PypiSdistPkginfo
2422 }
2423}
2424
2425fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2426 project
2427 .get(FIELD_LICENSE)
2428 .and_then(|license_value| match license_value {
2429 TomlValue::String(license_str) => Some(license_str.clone()),
2430 TomlValue::Table(license_table) => license_table
2431 .get("text")
2432 .and_then(|v| v.as_str())
2433 .map(|s| s.to_string())
2434 .or_else(|| {
2435 license_table
2436 .get("expression")
2437 .and_then(|v| v.as_str())
2438 .map(|expr| expr.to_string())
2439 }),
2440 _ => None,
2441 })
2442}
2443
2444fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2445 match project.get(FIELD_LICENSE) {
2446 Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2447 Some(TomlValue::Table(license_table)) => license_table
2448 .get("expression")
2449 .and_then(|value| value.as_str()),
2450 _ => None,
2451 }
2452}
2453
2454fn extract_urls(
2455 project: &TomlMap<String, TomlValue>,
2456 extra_data: &mut HashMap<String, serde_json::Value>,
2457) -> ProjectUrls {
2458 let mut homepage_url = None;
2459 let mut download_url = None;
2460 let mut bug_tracking_url = None;
2461 let mut code_view_url = None;
2462 let mut repository_url = None;
2463
2464 if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2466 let parsed_urls: Vec<(String, String)> = urls
2467 .iter()
2468 .filter_map(|(label, value)| {
2469 value
2470 .as_str()
2471 .map(|url| (label.to_string(), url.to_string()))
2472 })
2473 .collect();
2474 apply_project_url_mappings(
2475 &parsed_urls,
2476 &mut homepage_url,
2477 &mut bug_tracking_url,
2478 &mut code_view_url,
2479 &mut repository_url,
2480 extra_data,
2481 );
2482
2483 download_url = urls
2484 .get("Downloads")
2485 .or_else(|| urls.get("downloads"))
2486 .and_then(|v| v.as_str())
2487 .map(String::from);
2488
2489 if homepage_url.is_none() {
2490 homepage_url = urls
2491 .get(FIELD_HOMEPAGE)
2492 .and_then(|v| v.as_str())
2493 .map(String::from);
2494 }
2495 if repository_url.is_none() {
2496 repository_url = urls
2497 .get(FIELD_REPOSITORY)
2498 .and_then(|v| v.as_str())
2499 .map(String::from);
2500 }
2501 }
2502
2503 if homepage_url.is_none() {
2505 homepage_url = project
2506 .get(FIELD_HOMEPAGE)
2507 .and_then(|v| v.as_str())
2508 .map(String::from);
2509 }
2510
2511 if repository_url.is_none() {
2512 repository_url = project
2513 .get(FIELD_REPOSITORY)
2514 .and_then(|v| v.as_str())
2515 .map(String::from);
2516 }
2517
2518 (
2519 homepage_url,
2520 download_url,
2521 bug_tracking_url,
2522 code_view_url,
2523 repository_url,
2524 )
2525}
2526
2527fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2528 let mut parties = Vec::new();
2529
2530 if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2531 for author in authors {
2532 if let Some(author_str) = author.as_str() {
2533 let (name, email) = split_name_email(author_str);
2534 parties.push(Party {
2535 r#type: None,
2536 role: Some("author".to_string()),
2537 name,
2538 email,
2539 url: None,
2540 organization: None,
2541 organization_url: None,
2542 timezone: None,
2543 });
2544 } else if let Some(author_table) = author.as_table() {
2545 let name = author_table
2546 .get("name")
2547 .and_then(|value| value.as_str())
2548 .map(|value| value.to_string());
2549 let email = author_table
2550 .get("email")
2551 .and_then(|value| value.as_str())
2552 .map(|value| value.to_string());
2553 if name.is_some() || email.is_some() {
2554 parties.push(Party {
2555 r#type: None,
2556 role: Some("author".to_string()),
2557 name,
2558 email,
2559 url: None,
2560 organization: None,
2561 organization_url: None,
2562 timezone: None,
2563 });
2564 }
2565 }
2566 }
2567 }
2568
2569 if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2570 for maintainer in maintainers {
2571 if let Some(maintainer_str) = maintainer.as_str() {
2572 let (name, email) = split_name_email(maintainer_str);
2573 parties.push(Party {
2574 r#type: None,
2575 role: Some("maintainer".to_string()),
2576 name,
2577 email,
2578 url: None,
2579 organization: None,
2580 organization_url: None,
2581 timezone: None,
2582 });
2583 } else if let Some(maintainer_table) = maintainer.as_table() {
2584 let name = maintainer_table
2585 .get("name")
2586 .and_then(|value| value.as_str())
2587 .map(|value| value.to_string());
2588 let email = maintainer_table
2589 .get("email")
2590 .and_then(|value| value.as_str())
2591 .map(|value| value.to_string());
2592 if name.is_some() || email.is_some() {
2593 parties.push(Party {
2594 r#type: None,
2595 role: Some("maintainer".to_string()),
2596 name,
2597 email,
2598 url: None,
2599 organization: None,
2600 organization_url: None,
2601 timezone: None,
2602 });
2603 }
2604 }
2605 }
2606 }
2607
2608 parties
2609}
2610
2611fn extract_dependencies(
2612 project: &TomlMap<String, TomlValue>,
2613 toml_content: &TomlValue,
2614) -> (Vec<Dependency>, Vec<Dependency>) {
2615 let mut dependencies = Vec::new();
2616 let mut optional_dependencies = Vec::new();
2617
2618 if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2620 match deps_value {
2621 TomlValue::Array(arr) => {
2622 dependencies = parse_dependency_array(arr, false, None);
2623 }
2624 TomlValue::Table(table) => {
2625 dependencies = parse_dependency_table(table, false, None);
2626 }
2627 _ => {}
2628 }
2629 }
2630
2631 if let Some(opt_deps_table) = project
2633 .get(FIELD_OPTIONAL_DEPENDENCIES)
2634 .and_then(|v| v.as_table())
2635 {
2636 for (extra_name, deps) in opt_deps_table {
2637 match deps {
2638 TomlValue::Array(arr) => {
2639 optional_dependencies.extend(parse_dependency_array(
2640 arr,
2641 true,
2642 Some(extra_name),
2643 ));
2644 }
2645 TomlValue::Table(table) => {
2646 optional_dependencies.extend(parse_dependency_table(
2647 table,
2648 true,
2649 Some(extra_name),
2650 ));
2651 }
2652 _ => {}
2653 }
2654 }
2655 }
2656
2657 if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2659 match dev_deps_value {
2660 TomlValue::Array(arr) => {
2661 optional_dependencies.extend(parse_dependency_array(
2662 arr,
2663 true,
2664 Some(FIELD_DEV_DEPENDENCIES),
2665 ));
2666 }
2667 TomlValue::Table(table) => {
2668 optional_dependencies.extend(parse_dependency_table(
2669 table,
2670 true,
2671 Some(FIELD_DEV_DEPENDENCIES),
2672 ));
2673 }
2674 _ => {}
2675 }
2676 }
2677
2678 if let Some(groups_table) = toml_content
2680 .get("tool")
2681 .and_then(|value| value.as_table())
2682 .and_then(|tool| tool.get("poetry"))
2683 .and_then(|value| value.as_table())
2684 .and_then(|poetry| poetry.get("group"))
2685 .and_then(|value| value.as_table())
2686 {
2687 for (group_name, group_data) in groups_table {
2688 if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2689 match group_deps {
2690 TomlValue::Array(arr) => {
2691 optional_dependencies.extend(parse_dependency_array(
2692 arr,
2693 true,
2694 Some(group_name),
2695 ));
2696 }
2697 TomlValue::Table(table) => {
2698 optional_dependencies.extend(parse_poetry_group_dependency_table(
2699 table,
2700 true,
2701 Some(group_name),
2702 ));
2703 }
2704 _ => {}
2705 }
2706 }
2707 }
2708 }
2709
2710 if let Some(groups_table) = toml_content
2711 .get(FIELD_DEPENDENCY_GROUPS)
2712 .and_then(|value| value.as_table())
2713 {
2714 for (group_name, deps) in groups_table {
2715 match deps {
2716 TomlValue::Array(arr) => {
2717 optional_dependencies.extend(parse_dependency_array(
2718 arr,
2719 true,
2720 Some(group_name),
2721 ));
2722 }
2723 TomlValue::Table(table) => {
2724 optional_dependencies.extend(parse_dependency_table(
2725 table,
2726 true,
2727 Some(group_name),
2728 ));
2729 }
2730 _ => {}
2731 }
2732 }
2733 }
2734
2735 if let Some(dev_deps_value) = toml_content
2736 .get("tool")
2737 .and_then(|value| value.as_table())
2738 .and_then(|tool| tool.get("uv"))
2739 .and_then(|value| value.as_table())
2740 .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2741 {
2742 match dev_deps_value {
2743 TomlValue::Array(arr) => {
2744 optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2745 }
2746 TomlValue::Table(table) => {
2747 optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2748 }
2749 _ => {}
2750 }
2751 }
2752
2753 (dependencies, optional_dependencies)
2754}
2755
2756fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2757 let mut extra_data = HashMap::new();
2758
2759 if let Some(tool_uv) = toml_content
2760 .get("tool")
2761 .and_then(|value| value.as_table())
2762 .and_then(|tool| tool.get("uv"))
2763 {
2764 extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2765 }
2766
2767 if extra_data.is_empty() {
2768 None
2769 } else {
2770 Some(extra_data)
2771 }
2772}
2773
2774fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2775 match value {
2776 TomlValue::String(value) => JsonValue::String(value.clone()),
2777 TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2778 TomlValue::Float(value) => JsonValue::String(value.to_string()),
2779 TomlValue::Boolean(value) => JsonValue::Bool(*value),
2780 TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2781 TomlValue::Array(values) => {
2782 JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2783 }
2784 TomlValue::Table(values) => JsonValue::Object(
2785 values
2786 .iter()
2787 .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2788 .collect::<JsonMap<String, JsonValue>>(),
2789 ),
2790 }
2791}
2792
2793fn parse_dependency_table(
2794 table: &TomlMap<String, TomlValue>,
2795 is_optional: bool,
2796 scope: Option<&str>,
2797) -> Vec<Dependency> {
2798 table
2799 .iter()
2800 .filter_map(|(name, version)| {
2801 let version_str = version.as_str().map(|s| s.to_string());
2802 let mut package_url =
2803 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2804
2805 if let Some(v) = &version_str {
2806 package_url.with_version(v).ok()?;
2807 }
2808
2809 Some(Dependency {
2810 purl: Some(package_url.to_string()),
2811 extracted_requirement: None,
2812 scope: scope.map(|s| s.to_string()),
2813 is_runtime: Some(!is_optional),
2814 is_optional: Some(is_optional),
2815 is_pinned: None,
2816 is_direct: Some(true),
2817 resolved_package: None,
2818 extra_data: None,
2819 })
2820 })
2821 .collect()
2822}
2823
2824fn parse_poetry_group_dependency_table(
2825 table: &TomlMap<String, TomlValue>,
2826 is_optional: bool,
2827 scope: Option<&str>,
2828) -> Vec<Dependency> {
2829 table
2830 .iter()
2831 .filter_map(|(name, value)| build_poetry_group_dependency(name, value, is_optional, scope))
2832 .collect()
2833}
2834
2835fn build_poetry_group_dependency(
2836 name: &str,
2837 value: &TomlValue,
2838 is_optional: bool,
2839 scope: Option<&str>,
2840) -> Option<Dependency> {
2841 let normalized_name = normalize_python_dependency_name(name);
2842 let (version_spec, extras, marker) = match value {
2843 TomlValue::String(spec) => (Some(spec.trim().to_string()), Vec::new(), None),
2844 TomlValue::Table(table) => {
2845 let version_spec = table
2846 .get(FIELD_VERSION)
2847 .and_then(|value| value.as_str())
2848 .map(str::trim)
2849 .filter(|value| !value.is_empty())
2850 .map(ToOwned::to_owned);
2851 let extras = table
2852 .get(FIELD_EXTRAS)
2853 .and_then(|value| value.as_array())
2854 .map(|values| {
2855 values
2856 .iter()
2857 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2858 .collect::<Vec<_>>()
2859 })
2860 .unwrap_or_default();
2861 let marker = table
2862 .get("markers")
2863 .and_then(|value| value.as_str())
2864 .map(str::trim)
2865 .filter(|value| !value.is_empty())
2866 .map(ToOwned::to_owned);
2867
2868 (version_spec, extras, marker)
2869 }
2870 _ => return None,
2871 };
2872
2873 let pinned_version = version_spec
2874 .as_deref()
2875 .and_then(extract_exact_pinned_version);
2876 let purl = build_python_dependency_purl(&normalized_name, pinned_version.as_deref())?;
2877
2878 let mut extra_data = HashMap::new();
2879 if let Some(marker) = marker {
2880 extra_data.insert("marker".to_string(), JsonValue::String(marker));
2881 }
2882 if !extras.is_empty() {
2883 extra_data.insert(
2884 "extras".to_string(),
2885 JsonValue::Array(extras.into_iter().map(JsonValue::String).collect()),
2886 );
2887 }
2888
2889 Some(Dependency {
2890 purl: Some(purl),
2891 extracted_requirement: version_spec,
2892 scope: scope.map(|value| value.to_string()),
2893 is_runtime: Some(!is_optional),
2894 is_optional: Some(is_optional),
2895 is_pinned: Some(pinned_version.is_some()),
2896 is_direct: Some(true),
2897 resolved_package: None,
2898 extra_data: if extra_data.is_empty() {
2899 None
2900 } else {
2901 Some(extra_data)
2902 },
2903 })
2904}
2905
2906fn parse_dependency_array(
2907 array: &[TomlValue],
2908 is_optional: bool,
2909 scope: Option<&str>,
2910) -> Vec<Dependency> {
2911 array
2912 .iter()
2913 .filter_map(|dep| {
2914 let dep_str = dep.as_str()?;
2915 build_pyproject_array_dependency(dep_str, is_optional, scope)
2916 })
2917 .collect()
2918}
2919
2920fn build_pyproject_array_dependency(
2921 dep_str: &str,
2922 is_optional: bool,
2923 scope: Option<&str>,
2924) -> Option<Dependency> {
2925 let parsed = parse_pep508_requirement(dep_str)?;
2926 let name = normalize_python_package_name(&parsed.name);
2927 let pinned_version = parsed
2928 .specifiers
2929 .as_deref()
2930 .and_then(extract_exact_pinned_version);
2931
2932 let purl = build_python_dependency_purl(&name, pinned_version.as_deref())?;
2933
2934 let mut extra_data = HashMap::new();
2935 if let Some(marker) = parsed.marker {
2936 extra_data.insert("marker".to_string(), JsonValue::String(marker));
2937 }
2938 if !parsed.extras.is_empty() {
2939 extra_data.insert(
2940 "extras".to_string(),
2941 JsonValue::Array(parsed.extras.into_iter().map(JsonValue::String).collect()),
2942 );
2943 }
2944
2945 let extracted_requirement = parsed.specifiers.or(parsed.url);
2946
2947 Some(Dependency {
2948 purl: Some(purl),
2949 extracted_requirement: extracted_requirement.clone(),
2950 scope: scope.map(|s| s.to_string()),
2951 is_runtime: Some(!is_optional),
2952 is_optional: Some(is_optional),
2953 is_pinned: Some(pinned_version.is_some()),
2954 is_direct: Some(true),
2955 resolved_package: None,
2956 extra_data: if extra_data.is_empty() {
2957 None
2958 } else {
2959 Some(extra_data)
2960 },
2961 })
2962}
2963
/// Returns the version named by an exact-pin specifier, if `specifiers` is one.
///
/// A specifier counts as an exact pin when it is a single clause (no comma)
/// using `==` or `===` followed by a non-empty version. Surrounding whitespace
/// is ignored.
///
/// A `==` clause containing a wildcard (for example `==1.*`) is a PEP 440
/// prefix match covering many versions, so it is not treated as a pin. The
/// `===` operator compares the version text literally, so its operand is
/// returned unchanged.
fn extract_exact_pinned_version(specifiers: &str) -> Option<String> {
    let trimmed = specifiers.trim();
    // Several clauses (e.g. `==1.0,<2`) never describe a single version.
    if trimmed.contains(',') {
        return None;
    }

    // `===` must be tested first because it also starts with `==`.
    let version = if let Some(rest) = trimmed.strip_prefix("===") {
        rest.trim()
    } else {
        let rest = trimmed.strip_prefix("==")?.trim();
        if rest.contains('*') {
            return None;
        }
        rest
    };

    (!version.is_empty()).then(|| version.to_string())
}
2985
/// Literal value produced by [`LiteralEvaluator`] when it evaluates a Python
/// expression.
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// A numeric literal; integer literals are converted to `f64` as well.
    Number(f64),
    /// A boolean literal.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list whose elements could all be evaluated.
    List(Vec<Value>),
    /// A tuple whose elements could all be evaluated.
    Tuple(Vec<Value>),
    /// A dictionary; keys are stored in their string form.
    Dict(HashMap<String, Value>),
}
2996
/// Evaluates a restricted set of Python expressions to [`Value`]s, with limits
/// on nesting depth and on the total number of nodes visited.
struct LiteralEvaluator {
    /// Known names and their values, used to resolve name expressions.
    constants: HashMap<String, Value>,
    /// Evaluation fails once the recursion depth reaches this limit.
    max_depth: usize,
    /// Evaluation fails once this many nodes have been visited.
    max_nodes: usize,
    /// Number of nodes visited so far across all evaluations.
    nodes_visited: usize,
}
3003
3004impl LiteralEvaluator {
3005 fn new(constants: HashMap<String, Value>) -> Self {
3006 Self {
3007 constants,
3008 max_depth: MAX_SETUP_PY_AST_DEPTH,
3009 max_nodes: MAX_SETUP_PY_AST_NODES,
3010 nodes_visited: 0,
3011 }
3012 }
3013
3014 fn insert_constant(&mut self, name: String, value: Value) {
3015 self.constants.insert(name, value);
3016 }
3017
3018 fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
3019 if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
3020 return None;
3021 }
3022 self.nodes_visited += 1;
3023
3024 match expr {
3025 ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
3026 Some(Value::String(value.to_str().to_string()))
3027 }
3028 ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
3029 Some(Value::Bool(*value))
3030 }
3031 ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
3032 self.evaluate_number(value)
3033 }
3034 ast::Expr::NoneLiteral(_) => Some(Value::None),
3035 ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
3036 ast::Expr::List(ast::ExprList { elts, .. }) => {
3037 let mut values = Vec::new();
3038 for elt in elts {
3039 values.push(self.evaluate_expr(elt, depth + 1)?);
3040 }
3041 Some(Value::List(values))
3042 }
3043 ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
3044 let mut values = Vec::new();
3045 for elt in elts {
3046 values.push(self.evaluate_expr(elt, depth + 1)?);
3047 }
3048 Some(Value::Tuple(values))
3049 }
3050 ast::Expr::Dict(ast::ExprDict { items, .. }) => {
3051 let mut dict = HashMap::new();
3052 for item in items {
3053 let key_expr = item.key.as_ref()?;
3054 let key_value = self.evaluate_expr(key_expr, depth + 1)?;
3055 let key = value_to_string(&key_value)?;
3056 let value = self.evaluate_expr(&item.value, depth + 1)?;
3057 dict.insert(key, value);
3058 }
3059 Some(Value::Dict(dict))
3060 }
3061 ast::Expr::Call(ast::ExprCall {
3062 func, arguments, ..
3063 }) => {
3064 let args = arguments.args.as_ref();
3065 let keywords = arguments.keywords.as_ref();
3066 if keywords.is_empty()
3067 && let Some(name) = dotted_name(func.as_ref(), depth + 1)
3068 && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
3069 {
3070 return self.evaluate_ordered_dict(args, depth + 1);
3071 }
3072
3073 if !args.is_empty() {
3074 return None;
3075 }
3076
3077 if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
3078 && id == "dict"
3079 {
3080 let mut dict = HashMap::new();
3081 for keyword in keywords {
3082 let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
3083 let value = self.evaluate_expr(&keyword.value, depth + 1)?;
3084 dict.insert(key.to_string(), value);
3085 }
3086 return Some(Value::Dict(dict));
3087 }
3088
3089 None
3090 }
3091 _ => None,
3092 }
3093 }
3094
3095 fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
3096 match number {
3097 ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
3098 ast::Number::Float(value) => Some(Value::Number(*value)),
3099 ast::Number::Complex { .. } => None,
3100 }
3101 }
3102
3103 fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
3104 if args.len() != 1 {
3105 return None;
3106 }
3107
3108 let items = match self.evaluate_expr(&args[0], depth)? {
3109 Value::List(items) | Value::Tuple(items) => items,
3110 _ => return None,
3111 };
3112
3113 let mut dict = HashMap::new();
3114 for item in items {
3115 let Value::Tuple(values) = item else {
3116 return None;
3117 };
3118 if values.len() != 2 {
3119 return None;
3120 }
3121 let key = value_to_string(&values[0])?;
3122 dict.insert(key, values[1].clone());
3123 }
3124
3125 Some(Value::Dict(dict))
3126 }
3127}
3128
/// Names under which `setup` may be referenced in a setup.py module.
// NOTE(review): the code that populates and reads these fields is not visible
// here; the field descriptions below are inferred from the names. Confirm
// against `collect_setup_aliases` and `find_setup_call`.
#[derive(Default)]
struct SetupAliases {
    /// Presumably the local names bound directly to the `setup` function.
    setup_names: HashSet<String>,
    /// Presumably a mapping from local module aliases to the imported module.
    module_aliases: HashMap<String, String>,
}
3134
3135fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
3136 extract_from_setup_py(path).into_iter().collect()
3137}
3138
3139fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
3140 let content = match read_file_to_string(path, None) {
3141 Ok(content) => content,
3142 Err(e) => {
3143 warn!("Failed to read setup.py at {:?}: {}", path, e);
3144 return Some(default_package_data(path));
3145 }
3146 };
3147
3148 if content.len() > MAX_SETUP_PY_BYTES {
3149 warn!("setup.py too large at {:?}: {} bytes", path, content.len());
3150 let package_data = extract_from_setup_py_regex(&content);
3151 return should_emit_setup_py_package(&package_data).then_some(package_data);
3152 }
3153
3154 let mut package_data = match extract_from_setup_py_ast(&content) {
3155 Ok(Some(data)) => data,
3156 Ok(None) => return Some(default_package_data(path)),
3157 Err(e) => {
3158 warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
3159 extract_from_setup_py_regex(&content)
3160 }
3161 };
3162
3163 if package_data.name.is_none() {
3164 package_data.name = extract_setup_value(&content, "name");
3165 }
3166
3167 if package_data.version.is_none() {
3168 package_data.version = extract_setup_value(&content, "version");
3169 }
3170
3171 if package_data
3172 .version
3173 .as_deref()
3174 .is_some_and(|version| version.trim().is_empty())
3175 {
3176 package_data.version = None;
3177 }
3178
3179 fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
3180 package_data.purl = build_setup_py_purl(
3181 package_data.name.as_deref(),
3182 package_data.version.as_deref(),
3183 );
3184
3185 if should_emit_setup_py_package(&package_data) {
3186 Some(package_data)
3187 } else {
3188 Some(default_package_data(path))
3189 }
3190}
3191
3192fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
3193 package_data.name.is_some()
3194 || package_data.version.is_some()
3195 || package_data.purl.is_some()
3196 || !package_data.dependencies.is_empty()
3197 || package_data.extracted_license_statement.is_some()
3198 || !package_data.license_detections.is_empty()
3199 || !package_data.parties.is_empty()
3200 || package_data.description.is_some()
3201 || package_data.homepage_url.is_some()
3202 || package_data.bug_tracking_url.is_some()
3203 || package_data.code_view_url.is_some()
3204 || package_data.vcs_url.is_some()
3205}
3206
3207fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
3208 if package_data.version.is_some()
3209 && package_data.extracted_license_statement.is_some()
3210 && package_data
3211 .parties
3212 .iter()
3213 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
3214 {
3215 return;
3216 }
3217
3218 let Some(root) = path.parent() else {
3219 return;
3220 };
3221
3222 let dunder_metadata = collect_sibling_dunder_metadata(root, content);
3223
3224 if package_data.version.is_none() {
3225 package_data.version = dunder_metadata.version;
3226 }
3227
3228 if package_data.extracted_license_statement.is_none() {
3229 package_data.extracted_license_statement = dunder_metadata.license;
3230 }
3231
3232 let has_author = package_data
3233 .parties
3234 .iter()
3235 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
3236
3237 if !has_author && let Some(author) = dunder_metadata.author {
3238 package_data.parties.push(Party {
3239 r#type: Some("person".to_string()),
3240 role: Some("author".to_string()),
3241 name: Some(author),
3242 email: None,
3243 url: None,
3244 organization: None,
3245 organization_url: None,
3246 timezone: None,
3247 });
3248 }
3249}
3250
/// Values of module-level dunder assignments collected from modules that a
/// setup.py refers to.
#[derive(Default)]
struct DunderMetadata {
    /// Value assigned to `__version__`, if found.
    version: Option<String>,
    /// Value assigned to `__author__`, if found.
    author: Option<String>,
    /// Value assigned to `__license__`, if found.
    license: Option<String>,
}
3257
3258fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
3259 let statements = match parse_module(content) {
3260 Ok(parsed) => parsed.into_suite(),
3261 Err(_) => return DunderMetadata::default(),
3262 };
3263
3264 let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3265 let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3266 let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3267 let mut metadata = DunderMetadata::default();
3268 let mut candidate_paths = Vec::new();
3269
3270 for module in imported_dunder_modules(&statements) {
3271 let Some(path) = resolve_imported_module_path(root, &module) else {
3272 continue;
3273 };
3274
3275 candidate_paths.push(path);
3276 }
3277
3278 candidate_paths.extend(referenced_dunder_attribute_paths(root, content));
3279 candidate_paths.extend(referenced_dunder_init_paths(root, content));
3280
3281 let mut seen_paths = HashSet::new();
3282 for path in candidate_paths {
3283 if !seen_paths.insert(path.clone()) {
3284 continue;
3285 }
3286
3287 let Ok(module_content) = read_file_to_string(&path, None) else {
3288 continue;
3289 };
3290
3291 if metadata.version.is_none() {
3292 metadata.version = version_re
3293 .as_ref()
3294 .and_then(|regex| regex.captures(&module_content))
3295 .and_then(|captures| captures.get(1))
3296 .map(|match_| match_.as_str().to_string());
3297 }
3298
3299 if metadata.author.is_none() {
3300 metadata.author = author_re
3301 .as_ref()
3302 .and_then(|regex| regex.captures(&module_content))
3303 .and_then(|captures| captures.get(1))
3304 .map(|match_| match_.as_str().to_string());
3305 }
3306
3307 if metadata.license.is_none() {
3308 metadata.license = license_re
3309 .as_ref()
3310 .and_then(|regex| regex.captures(&module_content))
3311 .and_then(|captures| captures.get(1))
3312 .map(|match_| match_.as_str().to_string());
3313 }
3314
3315 if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
3316 return metadata;
3317 }
3318 }
3319
3320 metadata
3321}
3322
3323fn referenced_dunder_init_paths(root: &Path, content: &str) -> Vec<PathBuf> {
3324 let open_re = match Regex::new(r#"open\(\s*['\"]([^'\"]+__init__\.py)['\"]"#) {
3325 Ok(regex) => regex,
3326 Err(_) => return Vec::new(),
3327 };
3328
3329 open_re
3330 .captures_iter(content)
3331 .filter_map(|captures| captures.get(1).map(|m| m.as_str()))
3332 .filter_map(|relative| {
3333 let relative_path = PathBuf::from(relative);
3334 if relative_path.is_absolute()
3335 || relative_path.components().any(|component| {
3336 matches!(
3337 component,
3338 Component::ParentDir | Component::RootDir | Component::Prefix(_)
3339 )
3340 })
3341 {
3342 return None;
3343 }
3344
3345 let candidate = root.join(relative_path);
3346 candidate.exists().then_some(candidate)
3347 })
3348 .collect()
3349}
3350
3351fn referenced_dunder_attribute_paths(root: &Path, content: &str) -> Vec<PathBuf> {
3352 let attr_re =
3353 match Regex::new(r#"\b([A-Za-z_][A-Za-z0-9_]*)\s*\.\s*__(?:version|author|license)__\b"#) {
3354 Ok(regex) => regex,
3355 Err(_) => return Vec::new(),
3356 };
3357
3358 let mut seen_modules = HashSet::new();
3359 attr_re
3360 .captures_iter(content)
3361 .filter_map(|captures| captures.get(1).map(|m| m.as_str().to_string()))
3362 .filter(|module| seen_modules.insert(module.clone()))
3363 .filter_map(|module| resolve_imported_module_path(root, &module))
3364 .collect()
3365}
3366
3367fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
3368 let mut modules = Vec::new();
3369
3370 for statement in statements {
3371 let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
3372 continue;
3373 };
3374 let Some(module) = module.as_ref().map(|name| name.as_str()) else {
3375 continue;
3376 };
3377 let imports_dunder = names.iter().any(|alias| {
3378 matches!(
3379 alias.name.as_str(),
3380 "__version__" | "__author__" | "__license__"
3381 )
3382 });
3383 if imports_dunder {
3384 modules.push(module.to_string());
3385 }
3386 }
3387
3388 modules
3389}
3390
/// Maps a dotted Python module name to the first existing source file under `root`.
///
/// Candidates are probed in this order: `<root>/<mod>.py`,
/// `<root>/<mod>/__init__.py`, `<root>/src/<mod>.py`, `<root>/src/<mod>/__init__.py`.
/// Returns `None` when none of them exists.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let module_dir: PathBuf = module.split('.').collect();
    let module_file = module_dir.with_extension("py");
    let src_root = root.join("src");

    [
        root.join(&module_file),
        root.join(&module_dir).join("__init__.py"),
        src_root.join(&module_file),
        src_root.join(&module_dir).join("__init__.py"),
    ]
    .into_iter()
    .find(|probe| probe.exists())
}
3402
3403fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
3419 let statements = parse_module(content)
3420 .map(|parsed| parsed.into_suite())
3421 .map_err(|e| e.to_string())?;
3422 let aliases = collect_setup_aliases(&statements);
3423 let mut evaluator = LiteralEvaluator::new(HashMap::new());
3424 build_setup_py_constants(&statements, &mut evaluator);
3425
3426 let setup_call = find_setup_call(&statements, &aliases);
3427 let Some(call_expr) = setup_call else {
3428 return Ok(None);
3429 };
3430
3431 let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3432 Ok(Some(build_setup_py_package_data(&setup_values)))
3433}
3434
3435fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3436 for stmt in statements {
3437 if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3438 if targets.len() != 1 {
3439 continue;
3440 }
3441
3442 let Some(name) = extract_assign_name(&targets[0]) else {
3443 continue;
3444 };
3445
3446 if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3447 evaluator.insert_constant(name, value);
3448 }
3449 }
3450 }
3451}
3452
3453fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3454 match target {
3455 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3456 _ => None,
3457 }
3458}
3459
3460fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3461 let mut aliases = SetupAliases::default();
3462 aliases.setup_names.insert("setup".to_string());
3463
3464 for stmt in statements {
3465 match stmt {
3466 ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3467 for alias in names {
3468 let module_name = alias.name.as_str();
3469 if !is_setup_module(module_name) {
3470 continue;
3471 }
3472 let alias_name = alias
3473 .asname
3474 .as_ref()
3475 .map(|name| name.as_str())
3476 .unwrap_or(module_name);
3477 aliases
3478 .module_aliases
3479 .insert(alias_name.to_string(), module_name.to_string());
3480 }
3481 }
3482 ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3483 let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3484 continue;
3485 };
3486 if !is_setup_module(module_name) {
3487 continue;
3488 }
3489 for alias in names {
3490 if alias.name.as_str() != "setup" {
3491 continue;
3492 }
3493 let alias_name = alias
3494 .asname
3495 .as_ref()
3496 .map(|name| name.as_str())
3497 .unwrap_or("setup");
3498 aliases.setup_names.insert(alias_name.to_string());
3499 }
3500 }
3501 _ => {}
3502 }
3503 }
3504
3505 aliases
3506}
3507
/// Reports whether `module_name` is one of the modules recognised as providing
/// `setup`: `setuptools`, `distutils`, or `distutils.core` (exact match).
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3511
3512fn find_setup_call<'a>(
3513 statements: &'a [ast::Stmt],
3514 aliases: &'a SetupAliases,
3515) -> Option<&'a ast::Expr> {
3516 let mut finder = SetupCallFinder {
3517 aliases,
3518 called_function_names: collect_top_level_called_function_names(statements),
3519 nodes_visited: 0,
3520 };
3521 finder.find_in_statements(statements)
3522}
3523
3524fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3525 let mut called = HashSet::new();
3526 collect_called_function_names_in_statements(statements, &mut called);
3527 called
3528}
3529
3530fn collect_called_function_names_in_statements(
3531 statements: &[ast::Stmt],
3532 called: &mut HashSet<String>,
3533) {
3534 for stmt in statements {
3535 match stmt {
3536 ast::Stmt::Expr(ast::StmtExpr { value, .. })
3537 | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3538 collect_called_function_names_in_expr(value.as_ref(), called);
3539 }
3540 ast::Stmt::If(ast::StmtIf {
3541 body,
3542 elif_else_clauses,
3543 ..
3544 }) => {
3545 collect_called_function_names_in_statements(body, called);
3546 for clause in elif_else_clauses {
3547 collect_called_function_names_in_statements(&clause.body, called);
3548 }
3549 }
3550 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3551 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3552 collect_called_function_names_in_statements(body, called);
3553 collect_called_function_names_in_statements(orelse, called);
3554 }
3555 ast::Stmt::With(ast::StmtWith { body, .. }) => {
3556 collect_called_function_names_in_statements(body, called);
3557 }
3558 ast::Stmt::Try(ast::StmtTry {
3559 body,
3560 orelse,
3561 finalbody,
3562 handlers,
3563 ..
3564 }) => {
3565 collect_called_function_names_in_statements(body, called);
3566 collect_called_function_names_in_statements(orelse, called);
3567 collect_called_function_names_in_statements(finalbody, called);
3568 for handler in handlers {
3569 let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3570 body,
3571 ..
3572 }) = handler;
3573 collect_called_function_names_in_statements(body, called);
3574 }
3575 }
3576 _ => {}
3577 }
3578 }
3579}
3580
3581fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3582 if let ast::Expr::Call(ast::ExprCall {
3583 func, arguments, ..
3584 }) = expr
3585 {
3586 if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3587 called.insert(id.as_str().to_string());
3588 }
3589
3590 for arg in arguments.args.iter() {
3591 collect_called_function_names_in_expr(arg, called);
3592 }
3593 for keyword in arguments.keywords.iter() {
3594 collect_called_function_names_in_expr(&keyword.value, called);
3595 }
3596 }
3597}
3598
3599struct SetupCallFinder<'a> {
3600 aliases: &'a SetupAliases,
3601 called_function_names: HashSet<String>,
3602 nodes_visited: usize,
3603}
3604
3605impl<'a> SetupCallFinder<'a> {
3606 fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3607 for stmt in statements {
3608 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3609 return None;
3610 }
3611 self.nodes_visited += 1;
3612
3613 let found = match stmt {
3614 ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3615 ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3616 ast::Stmt::If(ast::StmtIf {
3617 body,
3618 elif_else_clauses,
3619 ..
3620 }) => self.find_in_statements(body).or_else(|| {
3621 for clause in elif_else_clauses {
3622 if let Some(found) = self.find_in_statements(&clause.body) {
3623 return Some(found);
3624 }
3625 }
3626 None
3627 }),
3628 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3629 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3630 .find_in_statements(body)
3631 .or_else(|| self.find_in_statements(orelse)),
3632 ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3633 .called_function_names
3634 .contains(name.as_str())
3635 .then(|| self.find_in_statements(body))
3636 .flatten(),
3637 ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3638 ast::Stmt::Try(ast::StmtTry {
3639 body,
3640 orelse,
3641 finalbody,
3642 handlers,
3643 ..
3644 }) => self
3645 .find_in_statements(body)
3646 .or_else(|| self.find_in_statements(orelse))
3647 .or_else(|| self.find_in_statements(finalbody))
3648 .or_else(|| {
3649 for handler in handlers {
3650 let ast::ExceptHandler::ExceptHandler(
3651 ast::ExceptHandlerExceptHandler { body, .. },
3652 ) = handler;
3653 if let Some(found) = self.find_in_statements(body) {
3654 return Some(found);
3655 }
3656 }
3657 None
3658 }),
3659 _ => None,
3660 };
3661
3662 if found.is_some() {
3663 return found;
3664 }
3665 }
3666
3667 None
3668 }
3669
3670 fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3671 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3672 return None;
3673 }
3674 self.nodes_visited += 1;
3675
3676 match expr {
3677 ast::Expr::Call(ast::ExprCall { func, .. })
3678 if is_setup_call(func.as_ref(), self.aliases) =>
3679 {
3680 Some(expr)
3681 }
3682 _ => None,
3683 }
3684 }
3685}
3686
3687fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3688 let Some(dotted) = dotted_name(func, 0) else {
3689 return false;
3690 };
3691
3692 if aliases.setup_names.contains(&dotted) {
3693 return true;
3694 }
3695
3696 let Some(module) = dotted.strip_suffix(".setup") else {
3697 return false;
3698 };
3699
3700 let resolved = resolve_module_alias(module, aliases);
3701 is_setup_module(&resolved)
3702}
3703
3704fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3705 if depth >= MAX_SETUP_PY_AST_DEPTH {
3706 return None;
3707 }
3708
3709 match expr {
3710 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3711 ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3712 let base = dotted_name(value.as_ref(), depth + 1)?;
3713 Some(format!("{}.{}", base, attr.as_str()))
3714 }
3715 _ => None,
3716 }
3717}
3718
3719fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3720 if let Some(mapped) = aliases.module_aliases.get(module) {
3721 return mapped.clone();
3722 }
3723
3724 let Some((base, rest)) = module.split_once('.') else {
3725 return module.to_string();
3726 };
3727
3728 if let Some(mapped) = aliases.module_aliases.get(base) {
3729 return format!("{}.{}", mapped, rest);
3730 }
3731
3732 module.to_string()
3733}
3734
3735fn extract_setup_keywords(
3736 call_expr: &ast::Expr,
3737 evaluator: &mut LiteralEvaluator,
3738) -> HashMap<String, Value> {
3739 let mut values = HashMap::new();
3740 let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3741 return values;
3742 };
3743
3744 for keyword in arguments.keywords.iter() {
3745 if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3746 if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3747 values.insert(arg.to_string(), value);
3748 }
3749 } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3750 for (key, value) in dict {
3751 values.insert(key, value);
3752 }
3753 }
3754 }
3755
3756 values
3757}
3758
3759fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3760 let name = get_value_string(values, "name").map(truncate_field);
3761 let version = get_value_string(values, "version").map(truncate_field);
3762 let description = get_value_string(values, "description")
3763 .or_else(|| get_value_string(values, "summary"))
3764 .map(truncate_field);
3765 let homepage_url = get_value_string(values, "url")
3766 .or_else(|| get_value_string(values, "home_page"))
3767 .map(truncate_field);
3768 let author = get_value_string(values, "author").map(truncate_field);
3769 let author_email = get_value_string(values, "author_email");
3770 let maintainer = get_value_string(values, "maintainer").map(truncate_field);
3771 let maintainer_email = get_value_string(values, "maintainer_email");
3772 let license = get_value_string(values, "license").map(truncate_field);
3773 let classifiers = values
3774 .get("classifiers")
3775 .and_then(value_to_string_list)
3776 .unwrap_or_default();
3777
3778 let mut parties = Vec::new();
3779 if author.is_some() || author_email.is_some() {
3780 parties.push(Party {
3781 r#type: Some("person".to_string()),
3782 role: Some("author".to_string()),
3783 name: author,
3784 email: author_email,
3785 url: None,
3786 organization: None,
3787 organization_url: None,
3788 timezone: None,
3789 });
3790 }
3791
3792 if maintainer.is_some() || maintainer_email.is_some() {
3793 parties.push(Party {
3794 r#type: Some("person".to_string()),
3795 role: Some("maintainer".to_string()),
3796 name: maintainer,
3797 email: maintainer_email,
3798 url: None,
3799 organization: None,
3800 organization_url: None,
3801 timezone: None,
3802 });
3803 }
3804
3805 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3806 normalize_spdx_declared_license(license.as_deref());
3807 let extracted_license_statement = license.clone();
3808
3809 let dependencies = build_setup_py_dependencies(values);
3810 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3811 let mut homepage_from_project_urls = None;
3812 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3813 let mut extra_data = HashMap::new();
3814
3815 if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3816 apply_project_url_mappings(
3817 &parsed_project_urls,
3818 &mut homepage_from_project_urls,
3819 &mut bug_tracking_url,
3820 &mut code_view_url,
3821 &mut vcs_url,
3822 &mut extra_data,
3823 );
3824 }
3825
3826 let extra_data = if extra_data.is_empty() {
3827 None
3828 } else {
3829 Some(extra_data)
3830 };
3831
3832 PackageData {
3833 package_type: Some(PythonParser::PACKAGE_TYPE),
3834 namespace: None,
3835 name,
3836 version,
3837 qualifiers: None,
3838 subpath: None,
3839 primary_language: Some("Python".to_string()),
3840 description,
3841 release_date: None,
3842 parties,
3843 keywords: Vec::new(),
3844 homepage_url: homepage_url.or(homepage_from_project_urls),
3845 download_url: None,
3846 size: None,
3847 sha1: None,
3848 md5: None,
3849 sha256: None,
3850 sha512: None,
3851 bug_tracking_url,
3852 code_view_url,
3853 vcs_url,
3854 copyright: None,
3855 holder: None,
3856 declared_license_expression,
3857 declared_license_expression_spdx,
3858 license_detections,
3859 other_license_expression: None,
3860 other_license_expression_spdx: None,
3861 other_license_detections: Vec::new(),
3862 extracted_license_statement,
3863 notice_text: None,
3864 source_packages: Vec::new(),
3865 file_references: Vec::new(),
3866 is_private: has_private_classifier(&classifiers),
3867 is_virtual: false,
3868 extra_data,
3869 dependencies,
3870 repository_homepage_url: None,
3871 repository_download_url: None,
3872 api_data_url: None,
3873 datasource_id: Some(DatasourceId::PypiSetupPy),
3874 purl,
3875 }
3876}
3877
3878fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3879 let mut dependencies = Vec::new();
3880
3881 if let Some(reqs) = values
3882 .get("install_requires")
3883 .and_then(value_to_string_list)
3884 {
3885 dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3886 }
3887
3888 if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3889 dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3890 }
3891
3892 if let Some(Value::Dict(extras)) = values.get("extras_require") {
3893 let mut extra_items: Vec<_> = extras.iter().collect();
3894 extra_items.sort_by_key(|(name, _)| *name);
3895 for (extra_name, extra_value) in extra_items {
3896 if let Some(reqs) = value_to_string_list(extra_value) {
3897 dependencies.extend(build_setup_py_dependency_list(
3898 reqs.as_slice(),
3899 extra_name,
3900 true,
3901 ));
3902 }
3903 }
3904 }
3905
3906 dependencies
3907}
3908
3909fn build_setup_py_dependency_list(
3910 reqs: &[String],
3911 scope: &str,
3912 is_optional: bool,
3913) -> Vec<Dependency> {
3914 reqs.iter()
3915 .filter_map(|req| build_python_dependency(req, scope, is_optional, None))
3916 .collect()
3917}
3918
3919fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3920 values.get(key).and_then(value_to_string)
3921}
3922
3923fn value_to_string(value: &Value) -> Option<String> {
3924 match value {
3925 Value::String(value) => Some(value.clone()),
3926 Value::Number(value) => Some(value.to_string()),
3927 Value::Bool(value) => Some(value.to_string()),
3928 _ => None,
3929 }
3930}
3931
3932fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3933 match value {
3934 Value::String(value) => Some(vec![value.clone()]),
3935 Value::List(values) | Value::Tuple(values) => {
3936 let mut items = Vec::new();
3937 for item in values {
3938 items.push(value_to_string(item)?);
3939 }
3940 Some(items)
3941 }
3942 _ => None,
3943 }
3944}
3945
3946fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3947 let Value::Dict(dict) = value else {
3948 return None;
3949 };
3950
3951 let mut pairs: Vec<(String, String)> = dict
3952 .iter()
3953 .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3954 .collect::<Option<Vec<_>>>()?;
3955 pairs.sort_by(|left, right| left.0.cmp(&right.0));
3956 Some(pairs)
3957}
3958
3959fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3960 let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3961 extract_requires_dist_dependencies(&requires_dist)
3962}
3963
3964pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3965 requires_dist
3966 .iter()
3967 .filter_map(|entry| build_rfc822_dependency(entry))
3968 .collect()
3969}
3970
3971fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3972 build_python_dependency(entry, "install", false, None)
3973}
3974
3975fn build_python_dependency(
3976 entry: &str,
3977 default_scope: &str,
3978 default_optional: bool,
3979 marker_override: Option<&str>,
3980) -> Option<Dependency> {
3981 let (requirement_part, marker_part) = entry
3982 .split_once(';')
3983 .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3984 .unwrap_or((entry.trim(), None));
3985
3986 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3987 let requirement = normalize_rfc822_requirement(requirement_part);
3988 let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3989 marker_part.or(marker_override),
3990 default_scope,
3991 default_optional,
3992 );
3993 let purl = build_python_dependency_purl(&name, None)?;
3994
3995 let is_pinned = requirement
3996 .as_deref()
3997 .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3998 let purl = if is_pinned {
3999 requirement
4000 .as_deref()
4001 .map(|req| req.trim_start_matches('='))
4002 .and_then(|version| build_python_dependency_purl(&name, Some(version)))
4003 .unwrap_or(purl)
4004 } else {
4005 purl
4006 };
4007
4008 let mut extra_data = HashMap::new();
4009 extra_data.extend(marker_data);
4010 if let Some(marker) = marker {
4011 extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
4012 }
4013
4014 Some(Dependency {
4015 purl: Some(purl),
4016 extracted_requirement: requirement,
4017 scope: Some(scope),
4018 is_runtime: Some(true),
4019 is_optional: Some(is_optional),
4020 is_pinned: Some(is_pinned),
4021 is_direct: Some(true),
4022 resolved_package: None,
4023 extra_data: if extra_data.is_empty() {
4024 None
4025 } else {
4026 Some(extra_data)
4027 },
4028 })
4029}
4030
4031fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
4032 let name = extract_setup_cfg_dependency_name(requirement_part)?;
4033 let trimmed = requirement_part.trim();
4034 let mut remainder = trimmed[name.len()..].trim();
4035
4036 if let Some(stripped) = remainder.strip_prefix('[')
4037 && let Some(end_idx) = stripped.find(']')
4038 {
4039 remainder = stripped[end_idx + 1..].trim();
4040 }
4041
4042 let remainder = remainder
4043 .strip_prefix('(')
4044 .and_then(|value| value.strip_suffix(')'))
4045 .unwrap_or(remainder)
4046 .trim();
4047
4048 if remainder.is_empty() {
4049 return None;
4050 }
4051
4052 let mut specifiers: Vec<String> = remainder
4053 .split(',')
4054 .map(|specifier| specifier.trim().replace(' ', ""))
4055 .filter(|specifier| !specifier.is_empty())
4056 .collect();
4057 specifiers.sort();
4058 Some(specifiers.join(","))
4059}
4060
/// Percent-encodes every `*` in `version` as `%2A`; all other characters are
/// left untouched.
fn encode_python_dependency_purl_version(version: &str) -> String {
    let segments: Vec<&str> = version.split('*').collect();
    segments.join("%2A")
}
4064
4065fn build_python_dependency_purl(name: &str, version: Option<&str>) -> Option<String> {
4066 let normalized_name = normalize_python_dependency_name(name);
4067
4068 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &normalized_name)
4069 .ok()
4070 .map(|_| match version {
4071 Some(version) => {
4072 format!(
4073 "pkg:pypi/{normalized_name}@{}",
4074 encode_python_dependency_purl_version(version)
4075 )
4076 }
4077 None => format!("pkg:pypi/{normalized_name}"),
4078 })
4079}
4080
/// Normalises a dependency name: surrounding whitespace is trimmed, ASCII
/// letters are lower-cased, and every `_` becomes `-`.
fn normalize_python_dependency_name(name: &str) -> String {
    name.trim()
        .chars()
        .map(|ch| if ch == '_' { '-' } else { ch.to_ascii_lowercase() })
        .collect()
}
4084
4085fn parse_rfc822_marker(
4086 marker_part: Option<&str>,
4087 default_scope: &str,
4088 default_optional: bool,
4089) -> (
4090 String,
4091 bool,
4092 Option<String>,
4093 HashMap<String, serde_json::Value>,
4094) {
4095 let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
4096 return (
4097 default_scope.to_string(),
4098 default_optional,
4099 None,
4100 HashMap::new(),
4101 );
4102 };
4103
4104 let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
4105 .expect("extra marker regex should compile");
4106 let mut extra_data = HashMap::new();
4107
4108 if let Some(python_version) = extract_marker_field(marker, "python_version") {
4109 extra_data.insert(
4110 "python_version".to_string(),
4111 serde_json::Value::String(python_version),
4112 );
4113 }
4114 if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
4115 extra_data.insert(
4116 "sys_platform".to_string(),
4117 serde_json::Value::String(sys_platform),
4118 );
4119 }
4120
4121 if let Some(captures) = extra_re.captures(marker)
4122 && let Some(scope) = captures.get(1)
4123 {
4124 return (
4125 scope.as_str().to_string(),
4126 true,
4127 Some(marker.trim().to_string()),
4128 extra_data,
4129 );
4130 }
4131
4132 (
4133 default_scope.to_string(),
4134 default_optional,
4135 Some(marker.trim().to_string()),
4136 extra_data,
4137 )
4138}
4139
4140fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
4141 let re = Regex::new(&format!(
4142 r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
4143 field
4144 ))
4145 .ok()?;
4146 let captures = re.captures(marker)?;
4147 let operator = captures.get(1)?.as_str();
4148 let value = captures.get(2)?.as_str();
4149 Some(format!("{} {}", operator, value))
4150}
4151
4152fn parse_requires_txt(content: &str) -> Vec<Dependency> {
4153 let mut dependencies = Vec::new();
4154 let mut current_scope = "install".to_string();
4155 let mut current_optional = false;
4156 let mut current_marker: Option<String> = None;
4157 let mut line_count = 0usize;
4158
4159 for line in content.lines() {
4160 line_count += 1;
4161 if line_count > MAX_ITERATION_COUNT {
4162 warn!(
4163 "Exceeded max line count in requires.txt; stopping at {} lines",
4164 MAX_ITERATION_COUNT
4165 );
4166 break;
4167 }
4168 let trimmed = line.trim();
4169 if trimmed.is_empty() || trimmed.starts_with('#') {
4170 continue;
4171 }
4172
4173 if trimmed.starts_with('[') && trimmed.ends_with(']') {
4174 let inner = &trimmed[1..trimmed.len() - 1];
4175 if let Some(rest) = inner.strip_prefix(':') {
4176 current_scope = "install".to_string();
4177 current_optional = false;
4178 current_marker = Some(rest.trim().to_string());
4179 } else if let Some((scope, marker)) = inner.split_once(':') {
4180 current_scope = scope.trim().to_string();
4181 current_optional = true;
4182 current_marker = Some(marker.trim().to_string());
4183 } else {
4184 current_scope = inner.trim().to_string();
4185 current_optional = true;
4186 current_marker = None;
4187 }
4188 continue;
4189 }
4190
4191 if let Some(dependency) = build_python_dependency(
4192 trimmed,
4193 ¤t_scope,
4194 current_optional,
4195 current_marker.as_deref(),
4196 ) {
4197 dependencies.push(dependency);
4198 }
4199 }
4200
4201 dependencies
4202}
4203
/// Reports whether any classifier equals `Private :: Do Not Upload`, compared
/// ASCII case-insensitively and without trimming.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
4209
4210fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
4211 let name = name?;
4212 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
4213 if let Some(version) = version {
4214 package_url.with_version(version).ok()?;
4215 }
4216 Some(package_url.to_string())
4217}
4218
4219fn extract_from_setup_py_regex(content: &str) -> PackageData {
4220 let name = extract_setup_value(content, "name").map(truncate_field);
4221 let version = extract_setup_value(content, "version").map(truncate_field);
4222 let license_expression = extract_setup_value(content, "license").map(truncate_field);
4223
4224 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4225 normalize_spdx_declared_license(license_expression.as_deref());
4226 let extracted_license_statement = license_expression.clone();
4227
4228 let dependencies = extract_setup_py_dependencies(content);
4229 let homepage_url = extract_setup_value(content, "url").map(truncate_field);
4230 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
4231
4232 PackageData {
4233 package_type: Some(PythonParser::PACKAGE_TYPE),
4234 namespace: None,
4235 name,
4236 version,
4237 qualifiers: None,
4238 subpath: None,
4239 primary_language: Some("Python".to_string()),
4240 description: None,
4241 release_date: None,
4242 parties: Vec::new(),
4243 keywords: Vec::new(),
4244 homepage_url,
4245 download_url: None,
4246 size: None,
4247 sha1: None,
4248 md5: None,
4249 sha256: None,
4250 sha512: None,
4251 bug_tracking_url: None,
4252 code_view_url: None,
4253 vcs_url: None,
4254 copyright: None,
4255 holder: None,
4256 declared_license_expression,
4257 declared_license_expression_spdx,
4258 license_detections,
4259 other_license_expression: None,
4260 other_license_expression_spdx: None,
4261 other_license_detections: Vec::new(),
4262 extracted_license_statement,
4263 notice_text: None,
4264 source_packages: Vec::new(),
4265 file_references: Vec::new(),
4266 is_private: false,
4267 is_virtual: false,
4268 extra_data: None,
4269 dependencies,
4270 repository_homepage_url: None,
4271 repository_download_url: None,
4272 api_data_url: None,
4273 datasource_id: Some(DatasourceId::PypiSetupPy),
4274 purl,
4275 }
4276}
4277
4278fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
4279 crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
4280}
4281
4282fn extract_from_pypi_json(path: &Path) -> PackageData {
4283 let default = PackageData {
4284 package_type: Some(PythonParser::PACKAGE_TYPE),
4285 datasource_id: Some(DatasourceId::PypiJson),
4286 ..Default::default()
4287 };
4288
4289 let content = match read_file_to_string(path, None) {
4290 Ok(content) => content,
4291 Err(error) => {
4292 warn!("Failed to read pypi.json at {:?}: {}", path, error);
4293 return default;
4294 }
4295 };
4296
4297 let root: serde_json::Value = match serde_json::from_str(&content) {
4298 Ok(value) => value,
4299 Err(error) => {
4300 warn!("Failed to parse pypi.json at {:?}: {}", path, error);
4301 return default;
4302 }
4303 };
4304
4305 let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
4306 warn!("No info object found in pypi.json at {:?}", path);
4307 return default;
4308 };
4309
4310 let name = info
4311 .get("name")
4312 .and_then(|value| value.as_str())
4313 .map(|v| truncate_field(v.to_owned()));
4314 let version = info
4315 .get("version")
4316 .and_then(|value| value.as_str())
4317 .map(ToOwned::to_owned);
4318 let summary = info
4319 .get("summary")
4320 .and_then(|value| value.as_str())
4321 .map(|v| truncate_field(v.to_owned()));
4322 let description = info
4323 .get("description")
4324 .and_then(|value| value.as_str())
4325 .filter(|value| !value.trim().is_empty())
4326 .map(|v| truncate_field(v.to_owned()))
4327 .or(summary);
4328 let mut homepage_url = info
4329 .get("home_page")
4330 .and_then(|value| value.as_str())
4331 .map(|v| truncate_field(v.to_owned()));
4332 let author = info
4333 .get("author")
4334 .and_then(|value| value.as_str())
4335 .filter(|value| !value.trim().is_empty())
4336 .map(|v| truncate_field(v.to_owned()));
4337 let author_email = info
4338 .get("author_email")
4339 .and_then(|value| value.as_str())
4340 .filter(|value| !value.trim().is_empty())
4341 .map(ToOwned::to_owned);
4342 let license = info
4343 .get("license")
4344 .and_then(|value| value.as_str())
4345 .filter(|value| !value.trim().is_empty())
4346 .map(ToOwned::to_owned);
4347 let keywords = parse_setup_cfg_keywords(
4348 info.get("keywords")
4349 .and_then(|value| value.as_str())
4350 .map(ToOwned::to_owned),
4351 );
4352 let classifiers = info
4353 .get("classifiers")
4354 .and_then(|value| value.as_array())
4355 .map(|values| {
4356 values
4357 .iter()
4358 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
4359 .collect::<Vec<_>>()
4360 })
4361 .unwrap_or_default();
4362
4363 let mut parties = Vec::new();
4364 if author.is_some() || author_email.is_some() {
4365 parties.push(Party {
4366 r#type: Some("person".to_string()),
4367 role: Some("author".to_string()),
4368 name: author,
4369 email: author_email,
4370 url: None,
4371 organization: None,
4372 organization_url: None,
4373 timezone: None,
4374 });
4375 }
4376
4377 let mut bug_tracking_url = None;
4378 let mut code_view_url = None;
4379 let mut vcs_url = None;
4380 let mut extra_data = HashMap::new();
4381
4382 let parsed_project_urls = info
4383 .get("project_urls")
4384 .and_then(|value| value.as_object())
4385 .map(|map| {
4386 let mut pairs: Vec<(String, String)> = map
4387 .iter()
4388 .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
4389 .collect();
4390 pairs.sort_by(|left, right| left.0.cmp(&right.0));
4391 pairs
4392 })
4393 .unwrap_or_default();
4394
4395 apply_project_url_mappings(
4396 &parsed_project_urls,
4397 &mut homepage_url,
4398 &mut bug_tracking_url,
4399 &mut code_view_url,
4400 &mut vcs_url,
4401 &mut extra_data,
4402 );
4403
4404 let (download_url, size, sha256) = root
4405 .get("urls")
4406 .and_then(|value| value.as_array())
4407 .map(|urls| select_pypi_json_artifact(urls))
4408 .unwrap_or((None, None, None));
4409
4410 let sha256 = sha256.and_then(|h| Sha256Digest::from_hex(&h).ok());
4411
4412 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4413 normalize_spdx_declared_license(license.as_deref());
4414 let dependencies = info
4415 .get("requires_dist")
4416 .and_then(|value| value.as_array())
4417 .map(|entries| {
4418 entries
4419 .iter()
4420 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4421 .collect::<Vec<_>>()
4422 })
4423 .map(|entries| extract_requires_dist_dependencies(&entries))
4424 .unwrap_or_default();
4425
4426 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
4427 build_pypi_urls(name.as_deref(), version.as_deref());
4428
4429 PackageData {
4430 package_type: Some(PythonParser::PACKAGE_TYPE),
4431 namespace: None,
4432 name,
4433 version,
4434 qualifiers: None,
4435 subpath: None,
4436 primary_language: None,
4437 description,
4438 release_date: None,
4439 parties,
4440 keywords,
4441 homepage_url: homepage_url.or(repository_homepage_url.clone()),
4442 download_url,
4443 size,
4444 sha1: None,
4445 md5: None,
4446 sha256,
4447 sha512: None,
4448 bug_tracking_url,
4449 code_view_url,
4450 vcs_url,
4451 copyright: None,
4452 holder: None,
4453 declared_license_expression,
4454 declared_license_expression_spdx,
4455 license_detections,
4456 other_license_expression: None,
4457 other_license_expression_spdx: None,
4458 other_license_detections: Vec::new(),
4459 extracted_license_statement: license,
4460 notice_text: None,
4461 source_packages: Vec::new(),
4462 file_references: Vec::new(),
4463 is_private: has_private_classifier(&classifiers),
4464 is_virtual: false,
4465 extra_data: if extra_data.is_empty() {
4466 None
4467 } else {
4468 Some(extra_data)
4469 },
4470 dependencies,
4471 repository_homepage_url,
4472 repository_download_url,
4473 api_data_url,
4474 datasource_id: Some(DatasourceId::PypiJson),
4475 purl,
4476 }
4477}
4478
4479fn select_pypi_json_artifact(
4480 urls: &[serde_json::Value],
4481) -> (Option<String>, Option<u64>, Option<String>) {
4482 let selected = urls
4483 .iter()
4484 .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4485 .or_else(|| urls.first());
4486
4487 let Some(entry) = selected else {
4488 return (None, None, None);
4489 };
4490
4491 let download_url = entry
4492 .get("url")
4493 .and_then(|value| value.as_str())
4494 .map(ToOwned::to_owned);
4495 let size = entry.get("size").and_then(|value| value.as_u64());
4496 let sha256 = entry
4497 .get("digests")
4498 .and_then(|value| value.as_object())
4499 .and_then(|digests| digests.get("sha256"))
4500 .and_then(|value| value.as_str())
4501 .map(ToOwned::to_owned);
4502
4503 (download_url, size, sha256)
4504}
4505
4506fn extract_from_pip_inspect(path: &Path) -> PackageData {
4507 let content = match read_file_to_string(path, None) {
4508 Ok(content) => content,
4509 Err(e) => {
4510 warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4511 return default_package_data(path);
4512 }
4513 };
4514
4515 let root: serde_json::Value = match serde_json::from_str(&content) {
4516 Ok(value) => value,
4517 Err(e) => {
4518 warn!(
4519 "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4520 path, e
4521 );
4522 return default_package_data(path);
4523 }
4524 };
4525
4526 let installed = match root.get("installed").and_then(|v| v.as_array()) {
4527 Some(arr) => arr,
4528 None => {
4529 warn!(
4530 "No 'installed' array found in pip-inspect.deplock at {:?}",
4531 path
4532 );
4533 return default_package_data(path);
4534 }
4535 };
4536
4537 let pip_version = root
4538 .get("pip_version")
4539 .and_then(|v| v.as_str())
4540 .map(String::from);
4541 let inspect_version = root
4542 .get("version")
4543 .and_then(|v| v.as_str())
4544 .map(String::from);
4545
4546 let mut main_package: Option<PackageData> = None;
4547 let mut dependencies: Vec<Dependency> = Vec::new();
4548
4549 for package_entry in installed {
4550 let metadata = match package_entry.get("metadata") {
4551 Some(m) => m,
4552 None => continue,
4553 };
4554
4555 let is_requested = package_entry
4556 .get("requested")
4557 .and_then(|v| v.as_bool())
4558 .unwrap_or(false);
4559 let has_direct_url = package_entry.get("direct_url").is_some();
4560
4561 let name = metadata
4562 .get("name")
4563 .and_then(|v| v.as_str())
4564 .map(|v| truncate_field(v.to_string()));
4565 let version = metadata
4566 .get("version")
4567 .and_then(|v| v.as_str())
4568 .map(String::from);
4569 let summary = metadata
4570 .get("summary")
4571 .and_then(|v| v.as_str())
4572 .map(|v| truncate_field(v.to_string()));
4573 let home_page = metadata
4574 .get("home_page")
4575 .and_then(|v| v.as_str())
4576 .map(|v| truncate_field(v.to_string()));
4577 let author = metadata
4578 .get("author")
4579 .and_then(|v| v.as_str())
4580 .map(|v| truncate_field(v.to_string()));
4581 let author_email = metadata
4582 .get("author_email")
4583 .and_then(|v| v.as_str())
4584 .map(String::from);
4585 let license = metadata
4586 .get("license")
4587 .and_then(|v| v.as_str())
4588 .map(|v| truncate_field(v.to_string()));
4589 let description = metadata
4590 .get("description")
4591 .and_then(|v| v.as_str())
4592 .map(|v| truncate_field(v.to_string()));
4593 let keywords = metadata
4594 .get("keywords")
4595 .and_then(|v| v.as_array())
4596 .map(|arr| {
4597 arr.iter()
4598 .filter_map(|k| k.as_str().map(String::from))
4599 .collect::<Vec<_>>()
4600 })
4601 .unwrap_or_default();
4602
4603 let mut parties = Vec::new();
4604 if author.is_some() || author_email.is_some() {
4605 parties.push(Party {
4606 r#type: Some("person".to_string()),
4607 role: Some("author".to_string()),
4608 name: author,
4609 email: author_email,
4610 url: None,
4611 organization: None,
4612 organization_url: None,
4613 timezone: None,
4614 });
4615 }
4616
4617 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4618 normalize_spdx_declared_license(license.as_deref());
4619 let extracted_license_statement = license.clone();
4620 let requires_dist = metadata
4621 .get("requires_dist")
4622 .and_then(|v| v.as_array())
4623 .map(|entries| {
4624 entries
4625 .iter()
4626 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4627 .collect::<Vec<_>>()
4628 })
4629 .unwrap_or_default();
4630 let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4631
4632 let purl = name.as_ref().and_then(|n| {
4633 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4634 if let Some(v) = &version {
4635 package_url.with_version(v).ok()?;
4636 }
4637 Some(package_url.to_string())
4638 });
4639
4640 if is_requested && has_direct_url {
4641 let mut extra_data = HashMap::new();
4642 if let Some(pv) = &pip_version {
4643 extra_data.insert(
4644 "pip_version".to_string(),
4645 serde_json::Value::String(pv.clone()),
4646 );
4647 }
4648 if let Some(iv) = &inspect_version {
4649 extra_data.insert(
4650 "inspect_version".to_string(),
4651 serde_json::Value::String(iv.clone()),
4652 );
4653 }
4654
4655 main_package = Some(PackageData {
4656 package_type: Some(PythonParser::PACKAGE_TYPE),
4657 namespace: None,
4658 name,
4659 version,
4660 qualifiers: None,
4661 subpath: None,
4662 primary_language: Some("Python".to_string()),
4663 description: description.or(summary),
4664 release_date: None,
4665 parties,
4666 keywords,
4667 homepage_url: home_page,
4668 download_url: None,
4669 size: None,
4670 sha1: None,
4671 md5: None,
4672 sha256: None,
4673 sha512: None,
4674 bug_tracking_url: None,
4675 code_view_url: None,
4676 vcs_url: None,
4677 copyright: None,
4678 holder: None,
4679 declared_license_expression,
4680 declared_license_expression_spdx,
4681 license_detections,
4682 other_license_expression: None,
4683 other_license_expression_spdx: None,
4684 other_license_detections: Vec::new(),
4685 extracted_license_statement,
4686 notice_text: None,
4687 source_packages: Vec::new(),
4688 file_references: Vec::new(),
4689 is_private: false,
4690 is_virtual: true,
4691 extra_data: if extra_data.is_empty() {
4692 None
4693 } else {
4694 Some(extra_data)
4695 },
4696 dependencies: parsed_dependencies,
4697 repository_homepage_url: None,
4698 repository_download_url: None,
4699 api_data_url: None,
4700 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4701 purl,
4702 });
4703 } else {
4704 let resolved_package = PackageData {
4705 package_type: Some(PythonParser::PACKAGE_TYPE),
4706 namespace: None,
4707 name: name.clone(),
4708 version: version.clone(),
4709 qualifiers: None,
4710 subpath: None,
4711 primary_language: Some("Python".to_string()),
4712 description: description.or(summary),
4713 release_date: None,
4714 parties,
4715 keywords,
4716 homepage_url: home_page,
4717 download_url: None,
4718 size: None,
4719 sha1: None,
4720 md5: None,
4721 sha256: None,
4722 sha512: None,
4723 bug_tracking_url: None,
4724 code_view_url: None,
4725 vcs_url: None,
4726 copyright: None,
4727 holder: None,
4728 declared_license_expression,
4729 declared_license_expression_spdx,
4730 license_detections,
4731 other_license_expression: None,
4732 other_license_expression_spdx: None,
4733 other_license_detections: Vec::new(),
4734 extracted_license_statement,
4735 notice_text: None,
4736 source_packages: Vec::new(),
4737 file_references: Vec::new(),
4738 is_private: false,
4739 is_virtual: true,
4740 extra_data: None,
4741 dependencies: parsed_dependencies,
4742 repository_homepage_url: None,
4743 repository_download_url: None,
4744 api_data_url: None,
4745 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4746 purl: purl.clone(),
4747 };
4748
4749 let resolved = package_data_to_resolved(&resolved_package);
4750 dependencies.push(Dependency {
4751 purl,
4752 extracted_requirement: None,
4753 scope: None,
4754 is_runtime: Some(true),
4755 is_optional: Some(false),
4756 is_pinned: Some(true),
4757 is_direct: Some(is_requested),
4758 resolved_package: Some(Box::new(resolved)),
4759 extra_data: None,
4760 });
4761 }
4762 }
4763
4764 if let Some(mut main_pkg) = main_package {
4765 let direct_requirement_purls: HashSet<String> = main_pkg
4766 .dependencies
4767 .iter()
4768 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4769 .collect();
4770
4771 let resolved_requirement_purls: HashSet<String> = dependencies
4772 .iter()
4773 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4774 .collect();
4775
4776 let unresolved_dependencies = main_pkg
4777 .dependencies
4778 .iter()
4779 .filter(|dep| {
4780 dep.purl.as_ref().is_some_and(|purl| {
4781 !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4782 })
4783 })
4784 .cloned()
4785 .collect::<Vec<_>>();
4786
4787 for dependency in &mut dependencies {
4788 if dependency
4789 .purl
4790 .as_ref()
4791 .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4792 {
4793 dependency.is_direct = Some(true);
4794 }
4795 }
4796
4797 main_pkg.dependencies = dependencies;
4798 main_pkg.dependencies.extend(unresolved_dependencies);
4799 main_pkg
4800 } else {
4801 default_package_data(path)
4802 }
4803}
4804
/// Returns `purl` cut off just before its first `@`, or the whole string
/// when it contains no `@`.
fn base_dependency_purl(purl: &str) -> String {
    match purl.find('@') {
        Some(at) => purl[..at].to_owned(),
        None => purl.to_owned(),
    }
}
4810
/// Parsed `setup.cfg` contents: lower-cased section name -> lower-cased
/// option name -> the option's values in file order (one element per value
/// line, so a multi-line option contributes several elements).
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4812
4813fn extract_from_setup_cfg(path: &Path) -> PackageData {
4814 let content = match read_file_to_string(path, None) {
4815 Ok(content) => content,
4816 Err(e) => {
4817 warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4818 return default_package_data(path);
4819 }
4820 };
4821
4822 let sections = parse_setup_cfg(&content);
4823 let name = get_ini_value(§ions, "metadata", "name").map(truncate_field);
4824 let version = get_ini_value(§ions, "metadata", "version").map(truncate_field);
4825 let description = get_ini_value(§ions, "metadata", "description").map(truncate_field);
4826 let author = get_ini_value(§ions, "metadata", "author").map(truncate_field);
4827 let author_email = get_ini_value(§ions, "metadata", "author_email");
4828 let maintainer = get_ini_value(§ions, "metadata", "maintainer").map(truncate_field);
4829 let maintainer_email = get_ini_value(§ions, "metadata", "maintainer_email");
4830 let license = get_ini_value(§ions, "metadata", "license").map(truncate_field);
4831 let mut homepage_url = get_ini_value(§ions, "metadata", "url").map(truncate_field);
4832 let classifiers = get_ini_values(§ions, "metadata", "classifiers");
4833 let keywords = parse_setup_cfg_keywords(get_ini_value(§ions, "metadata", "keywords"));
4834 let python_requires = get_ini_value(§ions, "options", "python_requires");
4835 let parsed_project_urls =
4836 parse_setup_cfg_project_urls(&get_ini_values(§ions, "metadata", "project_urls"));
4837 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4838 let mut extra_data = HashMap::new();
4839
4840 let mut parties = Vec::new();
4841 if author.is_some() || author_email.is_some() {
4842 parties.push(Party {
4843 r#type: Some("person".to_string()),
4844 role: Some("author".to_string()),
4845 name: author,
4846 email: author_email,
4847 url: None,
4848 organization: None,
4849 organization_url: None,
4850 timezone: None,
4851 });
4852 }
4853
4854 if maintainer.is_some() || maintainer_email.is_some() {
4855 parties.push(Party {
4856 r#type: Some("person".to_string()),
4857 role: Some("maintainer".to_string()),
4858 name: maintainer,
4859 email: maintainer_email,
4860 url: None,
4861 organization: None,
4862 organization_url: None,
4863 timezone: None,
4864 });
4865 }
4866
4867 let declared_license_expression = None;
4868 let declared_license_expression_spdx = None;
4869 let license_detections = Vec::new();
4870 let extracted_license_statement = license.clone();
4871
4872 let dependencies = extract_setup_cfg_dependencies(§ions);
4873
4874 if let Some(value) = python_requires {
4875 extra_data.insert(
4876 "python_requires".to_string(),
4877 serde_json::Value::String(value),
4878 );
4879 }
4880
4881 apply_project_url_mappings(
4882 &parsed_project_urls,
4883 &mut homepage_url,
4884 &mut bug_tracking_url,
4885 &mut code_view_url,
4886 &mut vcs_url,
4887 &mut extra_data,
4888 );
4889
4890 let extra_data = if extra_data.is_empty() {
4891 None
4892 } else {
4893 Some(extra_data)
4894 };
4895
4896 let purl = name.as_ref().and_then(|n| {
4897 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4898 if let Some(v) = &version {
4899 package_url.with_version(v).ok()?;
4900 }
4901 Some(package_url.to_string())
4902 });
4903
4904 PackageData {
4905 package_type: Some(PythonParser::PACKAGE_TYPE),
4906 namespace: None,
4907 name,
4908 version,
4909 qualifiers: None,
4910 subpath: None,
4911 primary_language: Some("Python".to_string()),
4912 description,
4913 release_date: None,
4914 parties,
4915 keywords,
4916 homepage_url,
4917 download_url: None,
4918 size: None,
4919 sha1: None,
4920 md5: None,
4921 sha256: None,
4922 sha512: None,
4923 bug_tracking_url,
4924 code_view_url,
4925 vcs_url,
4926 copyright: None,
4927 holder: None,
4928 declared_license_expression,
4929 declared_license_expression_spdx,
4930 license_detections,
4931 other_license_expression: None,
4932 other_license_expression_spdx: None,
4933 other_license_detections: Vec::new(),
4934 extracted_license_statement,
4935 notice_text: None,
4936 source_packages: Vec::new(),
4937 file_references: Vec::new(),
4938 is_private: has_private_classifier(&classifiers),
4939 is_virtual: false,
4940 extra_data,
4941 dependencies,
4942 repository_homepage_url: None,
4943 repository_download_url: None,
4944 api_data_url: None,
4945 datasource_id: Some(DatasourceId::PypiSetupCfg),
4946 purl,
4947 }
4948}
4949
/// Splits a comma-separated `keywords` value into trimmed, non-empty items.
///
/// Returns an empty vector when `value` is `None`.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut parsed = Vec::new();

    if let Some(raw) = value {
        for piece in raw.split(',') {
            let piece = piece.trim();
            if !piece.is_empty() {
                parsed.push(piece.to_string());
            }
        }
    }

    parsed
}
4962
/// Turns `label = url` lines into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries
/// without `=`, or with an empty label or URL, are dropped.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };

        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }

    pairs
}
4978
4979fn apply_project_url_mappings(
4980 parsed_urls: &[(String, String)],
4981 homepage_url: &mut Option<String>,
4982 bug_tracking_url: &mut Option<String>,
4983 code_view_url: &mut Option<String>,
4984 vcs_url: &mut Option<String>,
4985 extra_data: &mut HashMap<String, serde_json::Value>,
4986) {
4987 for (label, url) in parsed_urls {
4988 let label_lower = label.to_lowercase();
4989
4990 if bug_tracking_url.is_none()
4991 && matches!(
4992 label_lower.as_str(),
4993 "tracker"
4994 | "bug reports"
4995 | "bug tracker"
4996 | "issues"
4997 | "issue tracker"
4998 | "github: issues"
4999 )
5000 {
5001 *bug_tracking_url = Some(url.clone());
5002 } else if code_view_url.is_none()
5003 && matches!(label_lower.as_str(), "source" | "source code" | "code")
5004 {
5005 *code_view_url = Some(url.clone());
5006 } else if vcs_url.is_none()
5007 && matches!(
5008 label_lower.as_str(),
5009 "github" | "gitlab" | "github: repo" | "repository"
5010 )
5011 {
5012 *vcs_url = Some(url.clone());
5013 } else if homepage_url.is_none()
5014 && matches!(label_lower.as_str(), "website" | "homepage" | "home")
5015 {
5016 *homepage_url = Some(url.clone());
5017 } else if label_lower == "changelog" {
5018 extra_data.insert(
5019 "changelog_url".to_string(),
5020 serde_json::Value::String(url.clone()),
5021 );
5022 }
5023 }
5024
5025 let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
5026 .iter()
5027 .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
5028 .collect();
5029
5030 if !project_urls_json.is_empty() {
5031 extra_data.insert(
5032 "project_urls".to_string(),
5033 serde_json::Value::Object(project_urls_json),
5034 );
5035 }
5036}
5037
5038fn parse_setup_cfg(content: &str) -> IniSections {
5039 let mut sections: IniSections = HashMap::new();
5040 let mut current_section: Option<String> = None;
5041 let mut current_key: Option<String> = None;
5042
5043 for raw_line in content.lines() {
5044 let line = raw_line.trim_end_matches('\r');
5045 let trimmed = line.trim();
5046 if trimmed.is_empty() {
5047 continue;
5048 }
5049
5050 let stripped = line.trim_start();
5051 if stripped.starts_with('#') || stripped.starts_with(';') {
5052 continue;
5053 }
5054
5055 if stripped.starts_with('[') && stripped.ends_with(']') {
5056 let section_name = stripped
5057 .trim_start_matches('[')
5058 .trim_end_matches(']')
5059 .trim()
5060 .to_ascii_lowercase();
5061 current_section = if section_name.is_empty() {
5062 None
5063 } else {
5064 Some(section_name)
5065 };
5066 current_key = None;
5067 continue;
5068 }
5069
5070 if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
5071 if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
5072 let value = stripped.trim();
5073 if !value.is_empty() {
5074 sections
5075 .entry(section.clone())
5076 .or_default()
5077 .entry(key.clone())
5078 .or_default()
5079 .push(value.to_string());
5080 }
5081 }
5082 continue;
5083 }
5084
5085 if let Some((key, value)) = stripped.split_once('=')
5086 && let Some(section) = current_section.as_ref()
5087 {
5088 let key_name = key.trim().to_ascii_lowercase();
5089 let value_trimmed = value.trim();
5090 let entry = sections
5091 .entry(section.clone())
5092 .or_default()
5093 .entry(key_name.clone())
5094 .or_default();
5095 if !value_trimmed.is_empty() {
5096 entry.push(value_trimmed.to_string());
5097 }
5098 current_key = Some(key_name);
5099 }
5100 }
5101
5102 sections
5103}
5104
5105fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
5106 sections
5107 .get(§ion.to_ascii_lowercase())
5108 .and_then(|values| values.get(&key.to_ascii_lowercase()))
5109 .and_then(|entries| entries.first())
5110 .map(|value| value.trim().to_string())
5111}
5112
5113fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
5114 sections
5115 .get(§ion.to_ascii_lowercase())
5116 .and_then(|values| values.get(&key.to_ascii_lowercase()))
5117 .cloned()
5118 .unwrap_or_default()
5119}
5120
5121fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
5122 let mut dependencies = Vec::new();
5123
5124 for (sub_section, scope) in [
5125 ("install_requires", "install"),
5126 ("tests_require", "test"),
5127 ("setup_requires", "setup"),
5128 ] {
5129 let reqs = get_ini_values(sections, "options", sub_section);
5130 dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
5131 }
5132
5133 if let Some(extras) = sections.get("options.extras_require") {
5134 let mut extra_items: Vec<_> = extras.iter().collect();
5135 extra_items.sort_by_key(|(name, _)| *name);
5136 for (extra_name, reqs) in extra_items {
5137 dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
5138 }
5139 }
5140
5141 dependencies
5142}
5143
5144fn parse_setup_cfg_requirements(
5145 reqs: &[String],
5146 scope: &str,
5147 is_optional: bool,
5148) -> Vec<Dependency> {
5149 reqs.iter()
5150 .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
5151 .collect()
5152}
5153
5154fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
5155 let trimmed = req.trim();
5156 if trimmed.is_empty() || trimmed.starts_with('#') {
5157 return None;
5158 }
5159
5160 let name = extract_setup_cfg_dependency_name(trimmed)?;
5161 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5162
5163 Some(Dependency {
5164 purl: Some(purl.to_string()),
5165 extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
5166 scope: Some(scope.to_string()),
5167 is_runtime: Some(true),
5168 is_optional: Some(is_optional),
5169 is_pinned: Some(false),
5170 is_direct: Some(true),
5171 resolved_package: None,
5172 extra_data: None,
5173 })
5174}
5175
/// Extracts the distribution name from a requirement string.
///
/// The name is the leading run of characters up to the first whitespace or
/// the first of `<`, `>`, `=`, `!`, `~`, `;`, `[`, `(` or `@`. The last two
/// cover parenthesised version specifiers (`name(>=1.0)`) and direct
/// references (`name@ https://...`), which would otherwise leak into the
/// name. Returns `None` when nothing precedes the first delimiter.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let requirement = req.trim();

    let name_end = requirement
        .find(|c: char| {
            c.is_whitespace()
                || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(requirement.len());

    // The slice starts at a non-blank character and stops before the first
    // blank or delimiter, so it needs no further trimming.
    let name = &requirement[..name_end];
    (!name.is_empty()).then(|| name.to_string())
}
5192
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    let mut compact = String::with_capacity(req.len());
    for piece in req.split_whitespace() {
        compact.push_str(piece);
    }
    compact
}
5196
/// Finds a quoted `key = "value"` assignment in `setup.py` source text and
/// returns the text between the quotes.
///
/// Double-quoted forms are tried before single-quoted ones, and for each
/// quote the spacings `key=`, `key =`, `key= ` and `key = ` are tried in that
/// order. Two rules keep the match honest:
/// - an occurrence is skipped when the character before `key` is
///   alphanumeric or `_`, so `description` does not match inside
///   `long_description`;
/// - the value ends at the next occurrence of the *same* quote that opened
///   it, so `"it's"` is returned whole.
///
/// Returns `None` when no such assignment with a closing quote exists.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    const SPACINGS: [(&str, &str); 4] = [("", ""), (" ", ""), ("", " "), (" ", " ")];

    for quote in ['"', '\''] {
        for (before, after) in SPACINGS {
            let pattern = format!("{}{}={}{}", key, before, after, quote);

            for (start, _) in content.match_indices(pattern.as_str()) {
                let continues_identifier = content[..start]
                    .chars()
                    .next_back()
                    .is_some_and(|c| c.is_alphanumeric() || c == '_');
                if continues_identifier {
                    continue;
                }

                let rest = &content[start + pattern.len()..];
                if let Some(end) = rest.find(quote) {
                    return Some(rest[..end].to_string());
                }
            }
        }
    }

    None
}
5222
5223fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
5224 let mut dependencies = Vec::new();
5225
5226 if let Some(tests_deps) = extract_tests_require(content) {
5227 dependencies.extend(tests_deps);
5228 }
5229
5230 if let Some(extras_deps) = extract_extras_require(content) {
5231 dependencies.extend(extras_deps);
5232 }
5233
5234 dependencies
5235}
5236
5237fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
5238 let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
5239 let re = Regex::new(pattern).ok()?;
5240 let captures = re.captures(content)?;
5241 let deps_str = captures.get(1)?.as_str();
5242
5243 let deps = parse_setup_py_dep_list(deps_str, "test", true);
5244 if deps.is_empty() { None } else { Some(deps) }
5245}
5246
5247fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
5248 let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
5249 let re = Regex::new(pattern).ok()?;
5250 let captures = re.captures(content)?;
5251 let dict_content = captures.get(1)?.as_str();
5252
5253 let mut all_deps = Vec::new();
5254
5255 let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
5256 let entry_re = Regex::new(entry_pattern).ok()?;
5257
5258 for entry_cap in entry_re.captures_iter(dict_content) {
5259 if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
5260 let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
5261 all_deps.extend(deps);
5262 }
5263 }
5264
5265 if all_deps.is_empty() {
5266 None
5267 } else {
5268 Some(all_deps)
5269 }
5270}
5271
5272fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
5273 let dep_pattern = r#"['"]([^'"]+)['"]"#;
5274 let re = match Regex::new(dep_pattern) {
5275 Ok(r) => r,
5276 Err(_) => return Vec::new(),
5277 };
5278
5279 re.captures_iter(deps_str)
5280 .filter_map(|cap| {
5281 let dep_str = cap.get(1)?.as_str().trim();
5282 if dep_str.is_empty() {
5283 return None;
5284 }
5285
5286 let name = extract_setup_cfg_dependency_name(dep_str)?;
5287 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5288
5289 Some(Dependency {
5290 purl: Some(purl.to_string()),
5291 extracted_requirement: Some(dep_str.to_string()),
5292 scope: Some(scope.to_string()),
5293 is_runtime: Some(true),
5294 is_optional: Some(is_optional),
5295 is_pinned: Some(false),
5296 is_direct: Some(true),
5297 resolved_package: None,
5298 extra_data: None,
5299 })
5300 })
5301 .collect()
5302}
5303
5304pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
5306 let content = read_file_to_string(path, None).map_err(|e| e.to_string())?;
5307 toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
5308}
5309
5310fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<Sha256Digest>) {
5321 let mut file = match File::open(path) {
5322 Ok(f) => f,
5323 Err(_) => return (None, None),
5324 };
5325
5326 let metadata = match file.metadata() {
5327 Ok(m) => m,
5328 Err(_) => return (None, None),
5329 };
5330 let size = metadata.len();
5331
5332 let mut hasher = Sha256::new();
5333 let mut buffer = vec![0; 8192];
5334
5335 loop {
5336 match file.read(&mut buffer) {
5337 Ok(0) => break,
5338 Ok(n) => hasher.update(&buffer[..n]),
5339 Err(_) => return (Some(size), None),
5340 }
5341 }
5342
5343 let hash = Sha256Digest::from_bytes(hasher.finalize().into());
5344 (Some(size), Some(hash))
5345}
5346
5347fn default_package_data(path: &Path) -> PackageData {
5348 PackageData {
5349 package_type: Some(PythonParser::PACKAGE_TYPE),
5350 primary_language: Some("Python".to_string()),
5351 datasource_id: infer_python_datasource_id(path),
5352 ..Default::default()
5353 }
5354}
5355
5356fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
5357 let file_name = path.file_name().and_then(|name| name.to_str());
5358
5359 match file_name {
5360 Some("pyproject.toml") => {
5361 if read_toml_file(path)
5362 .ok()
5363 .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
5364 .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
5365 .is_some()
5366 {
5367 Some(DatasourceId::PypiPoetryPyprojectToml)
5368 } else {
5369 Some(DatasourceId::PypiPyprojectToml)
5370 }
5371 }
5372 Some(name)
5373 if name == "setup.py" || name.ends_with("_setup.py") || name.ends_with("-setup.py") =>
5374 {
5375 Some(DatasourceId::PypiSetupPy)
5376 }
5377 Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
5378 Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
5379 Some("METADATA") if is_installed_wheel_metadata_path(path) => {
5380 Some(DatasourceId::PypiWheelMetadata)
5381 }
5382 Some("pypi.json") => Some(DatasourceId::PypiJson),
5383 Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
5384 Some("origin.json") if is_pip_cache_origin_json(path) => {
5385 Some(DatasourceId::PypiPipOriginJson)
5386 }
5387 _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
5388 Some(DatasourceId::PypiSdist)
5389 }
5390 _ if path
5391 .extension()
5392 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
5393 {
5394 Some(DatasourceId::PypiWheel)
5395 }
5396 _ if path
5397 .extension()
5398 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
5399 {
5400 Some(DatasourceId::PypiEgg)
5401 }
5402 _ => None,
5403 }
5404}
5405
// Registers this parser with the crate-wide parser registry.
// NOTE(review): the arguments appear to be, in order: a human-readable
// description, the glob patterns of files the parser claims, the package
// type ("pypi"), the primary language, and a documentation URL. Confirm
// against the `register_parser!` macro definition.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, suffixed setup.py variants, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/*-setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);