1use crate::models::{
35 DatasourceId, Dependency, FileReference, PackageData, PackageType, Party, Sha256Digest,
36};
37use crate::parser_warn as warn;
38use crate::parsers::utils::{
39 MAX_ITERATION_COUNT, read_file_to_string, split_name_email, truncate_field,
40};
41use base64::Engine;
42use base64::engine::general_purpose::URL_SAFE_NO_PAD;
43use bzip2::read::BzDecoder;
44use csv::ReaderBuilder;
45use flate2::read::GzDecoder;
46use liblzma::read::XzDecoder;
47use packageurl::PackageUrl;
48use regex::Regex;
49use ruff_python_ast as ast;
50use ruff_python_parser::parse_module;
51use serde_json::{Map as JsonMap, Value as JsonValue};
52use sha2::{Digest, Sha256};
53use std::collections::{HashMap, HashSet};
54use std::fs::File;
55use std::io::Read;
56use std::path::{Component, Path, PathBuf};
57use tar::Archive;
58use toml::Value as TomlValue;
59use toml::map::Map as TomlMap;
60use zip::ZipArchive;
61
62use super::PackageParser;
63use super::license_normalization::{
64 DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
65 normalize_spdx_expression,
66};
67use super::pep508::parse_pep508_requirement;
68
// Table and key names used when reading Python project metadata.
// NOTE(review): the lookups themselves are outside this part of the file; the names match
// `pyproject.toml` keys, which is presumably where they are used — confirm at the use sites.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_EXTRAS: &str = "extras";
84
// Five optional URL strings bundled into one tuple.
// NOTE(review): the meaning of each slot is fixed by code outside this part of the file;
// document the slot order next to the function that builds the tuple.
type ProjectUrls = (
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
);
/// Key name `dependency-groups`; presumably a project-metadata table — confirm at use sites.
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
/// Key name `dev-dependencies`; presumably a project-metadata table — confirm at use sites.
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

/// Byte budget (1 MiB) for `setup.py` input; the check itself is not in this part of the file.
const MAX_SETUP_PY_BYTES: usize = 1_048_576;
/// Node budget for a `setup.py` syntax tree; the check itself is not in this part of the file.
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
/// Depth budget for a `setup.py` syntax tree; the check itself is not in this part of the file.
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

/// Ceiling (100 MiB) on an sdist archive's on-disk size and on the summed declared entry sizes
/// of any archive that is scanned.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;
/// Ceiling (50 MiB) on the declared size of a single archive entry.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
/// Highest uncompressed-to-compressed size ratio accepted before an archive is distrusted.
const MAX_COMPRESSION_RATIO: f64 = 100.0;

/// Unit type that carries the `PackageParser` implementation for Python packaging files.
pub struct PythonParser;
111
/// Archive flavours accepted as Python source distributions.
///
/// `detect_python_sdist_archive_format` picks the variant from the lower-cased file-name suffix.
#[derive(Clone, Copy, Debug)]
enum PythonSdistArchiveFormat {
    /// File name ends in `.tar.gz`.
    TarGz,
    /// File name ends in `.tgz`.
    Tgz,
    /// File name ends in `.tar.bz2`.
    TarBz2,
    /// File name ends in `.tar.xz`.
    TarXz,
    /// File name ends in `.zip`.
    Zip,
}
120
/// A zip entry that passed the checks in `collect_validated_zip_entries`.
#[derive(Clone, Debug)]
struct ValidatedZipEntry {
    /// Position of the entry inside the zip archive.
    index: usize,
    /// Entry path as returned by `normalize_archive_entry_path`.
    name: String,
}
126
127impl PackageParser for PythonParser {
128 const PACKAGE_TYPE: PackageType = PackageType::Pypi;
129
130 fn extract_packages(path: &Path) -> Vec<PackageData> {
131 vec![
132 if path.file_name().unwrap_or_default() == "pyproject.toml" {
133 extract_from_pyproject_toml(path)
134 } else if path.file_name().unwrap_or_default() == "setup.cfg" {
135 extract_from_setup_cfg(path)
136 } else if is_setup_py_like_path(path) {
137 return extract_setup_py_packages(path);
138 } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
139 extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
140 } else if is_installed_wheel_metadata_path(path) {
141 extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
142 } else if is_pip_cache_origin_json(path) {
143 extract_from_pip_origin_json(path)
144 } else if path.file_name().unwrap_or_default() == "pypi.json" {
145 extract_from_pypi_json(path)
146 } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
147 extract_from_pip_inspect(path)
148 } else if is_python_sdist_archive_path(path) {
149 extract_from_sdist_archive(path)
150 } else if path
151 .extension()
152 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
153 {
154 extract_from_wheel_archive(path)
155 } else if path
156 .extension()
157 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
158 {
159 extract_from_egg_archive(path)
160 } else {
161 default_package_data(path)
162 },
163 ]
164 }
165
166 fn is_match(path: &Path) -> bool {
167 if let Some(filename) = path.file_name()
168 && (filename == "pyproject.toml"
169 || filename == "setup.cfg"
170 || is_setup_py_like_path(path)
171 || filename == "PKG-INFO"
172 || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
173 || filename == "pypi.json"
174 || filename == "pip-inspect.deplock"
175 || is_pip_cache_origin_json(path))
176 {
177 return true;
178 }
179
180 if let Some(extension) = path.extension() {
181 let ext = extension.to_string_lossy().to_lowercase();
182 if (ext == "whl" && is_valid_wheel_archive_path(path))
183 || ext == "egg"
184 || is_python_sdist_archive_path(path)
185 {
186 return true;
187 }
188 }
189
190 false
191 }
192}
193
/// Returns `true` when the file name is `setup.py` or ends with `_setup.py`.
///
/// Paths without a UTF-8 file name never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
        return false;
    };

    file_name == "setup.py" || file_name.ends_with("_setup.py")
}
199
/// Returns `true` for a file named `METADATA` whose parent directory name ends in `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    if path.file_name().and_then(|name| name.to_str()) != Some("METADATA") {
        return false;
    }

    let parent_dir_name = path
        .parent()
        .and_then(|parent| parent.file_name())
        .and_then(|name| name.to_str());

    matches!(parent_dir_name, Some(dir) if dir.ends_with(".dist-info"))
}
208
/// Header values read from a `WHEEL` file by `parse_installed_wheel_metadata`.
#[derive(Debug, Clone)]
struct InstalledWheelMetadata {
    /// Every `tag` header value; never empty when produced by the parser.
    wheel_tags: Vec<String>,
    /// First `wheel-version` header value, if present.
    wheel_version: Option<String>,
    /// First `generator` header value, if present.
    wheel_generator: Option<String>,
    /// `root-is-purelib` header parsed as a boolean; `None` when absent or not `true`/`false`.
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`.
    compressed_tag: Option<String>,
}
217
218fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
219 let Some(parent) = path.parent() else {
220 return;
221 };
222
223 if !parent
224 .file_name()
225 .and_then(|name| name.to_str())
226 .is_some_and(|name| name.ends_with(".dist-info"))
227 {
228 return;
229 }
230
231 let wheel_path = parent.join("WHEEL");
232 if !wheel_path.exists() {
233 return;
234 }
235
236 let Ok(content) = read_file_to_string(&wheel_path, None) else {
237 warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
238 return;
239 };
240
241 let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
242 return;
243 };
244
245 apply_installed_wheel_metadata(package_data, &wheel_metadata);
246}
247
248fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
249 use super::rfc822::{get_header_all, get_header_first};
250
251 let metadata = super::rfc822::parse_rfc822_content(content);
252 let wheel_tags = get_header_all(&metadata.headers, "tag");
253 if wheel_tags.is_empty() {
254 return None;
255 }
256
257 let wheel_version = get_header_first(&metadata.headers, "wheel-version");
258 let wheel_generator = get_header_first(&metadata.headers, "generator");
259 let root_is_purelib =
260 get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
261 match value.to_ascii_lowercase().as_str() {
262 "true" => Some(true),
263 "false" => Some(false),
264 _ => None,
265 }
266 });
267
268 let compressed_tag = compress_wheel_tags(&wheel_tags);
269
270 Some(InstalledWheelMetadata {
271 wheel_tags,
272 wheel_version,
273 wheel_generator,
274 root_is_purelib,
275 compressed_tag,
276 })
277}
278
/// Collapses several `python-abi-platform` tags into a single tag string.
///
/// * no tags: `None`;
/// * one tag: that tag, returned verbatim without validation;
/// * several tags: `py1.py2-abi-platform`, provided every tag has three dash-separated parts and
///   all tags share the same abi and platform parts; otherwise `None`.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => None,
        [only] => Some(only.clone()),
        _ => {
            let mut interpreters = Vec::with_capacity(tags.len());
            let mut shared: Option<(&str, &str)> = None;

            for tag in tags {
                // Everything after the second dash belongs to the platform part.
                let (python, rest) = tag.split_once('-')?;
                let (abi, platform) = rest.split_once('-')?;

                match shared {
                    Some(seen) if seen != (abi, platform) => return None,
                    _ => shared = Some((abi, platform)),
                }
                interpreters.push(python);
            }

            let (abi, platform) = shared?;
            Some(format!("{}-{}-{}", interpreters.join("."), abi, platform))
        }
    }
}
316
317fn apply_installed_wheel_metadata(
318 package_data: &mut PackageData,
319 wheel_metadata: &InstalledWheelMetadata,
320) {
321 let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
322 extra_data.insert(
323 "wheel_tags".to_string(),
324 JsonValue::Array(
325 wheel_metadata
326 .wheel_tags
327 .iter()
328 .cloned()
329 .map(JsonValue::String)
330 .collect(),
331 ),
332 );
333
334 if let Some(wheel_version) = &wheel_metadata.wheel_version {
335 extra_data.insert(
336 "wheel_version".to_string(),
337 JsonValue::String(wheel_version.clone()),
338 );
339 }
340
341 if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
342 extra_data.insert(
343 "wheel_generator".to_string(),
344 JsonValue::String(wheel_generator.clone()),
345 );
346 }
347
348 if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
349 extra_data.insert(
350 "root_is_purelib".to_string(),
351 JsonValue::Bool(root_is_purelib),
352 );
353 }
354
355 if let (Some(name), Some(version), Some(extension)) = (
356 package_data.name.as_deref(),
357 package_data.version.as_deref(),
358 wheel_metadata.compressed_tag.as_deref(),
359 ) {
360 package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
361 }
362}
363
/// Returns `true` for a file named `origin.json` that has an ancestor directory called `wheels`
/// (compared ASCII case-insensitively).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    if path.file_name().and_then(|name| name.to_str()) != Some("origin.json") {
        return false;
    }

    // `skip(1)` leaves out the path itself so only enclosing directories are inspected.
    for ancestor in path.ancestors().skip(1) {
        let dir_name = ancestor.file_name().and_then(|name| name.to_str());
        if dir_name.is_some_and(|name| name.eq_ignore_ascii_case("wheels")) {
            return true;
        }
    }

    false
}
373
374fn extract_from_pip_origin_json(path: &Path) -> PackageData {
375 let content = match read_file_to_string(path, None) {
376 Ok(content) => content,
377 Err(e) => {
378 warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
379 return default_package_data(path);
380 }
381 };
382
383 let root: JsonValue = match serde_json::from_str(&content) {
384 Ok(root) => root,
385 Err(e) => {
386 warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
387 return default_package_data(path);
388 }
389 };
390
391 let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
392 warn!("No url found in pip cache origin.json at {:?}", path);
393 return default_package_data(path);
394 };
395
396 let sibling_wheel = find_sibling_cached_wheel(path);
397 let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
398 sibling_wheel
399 .as_ref()
400 .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
401 });
402
403 let Some((name, version)) = name_version else {
404 warn!(
405 "Failed to infer package name/version from pip cache origin.json at {:?}",
406 path
407 );
408 return default_package_data(path);
409 };
410
411 let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
412 build_pypi_urls(Some(&name), Some(&version));
413 let purl = sibling_wheel
414 .as_ref()
415 .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
416 .or(plain_purl);
417
418 PackageData {
419 package_type: Some(PythonParser::PACKAGE_TYPE),
420 primary_language: Some("Python".to_string()),
421 name: Some(truncate_field(name)),
422 version: Some(version),
423 datasource_id: Some(DatasourceId::PypiPipOriginJson),
424 download_url: Some(truncate_field(download_url.to_string())),
425 sha256: extract_sha256_from_origin_json(&root)
426 .and_then(|h| Sha256Digest::from_hex(&h).ok()),
427 repository_homepage_url,
428 repository_download_url,
429 api_data_url,
430 purl,
431 ..Default::default()
432 }
433}
434
435fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
436 let parent = path.parent()?;
437 let entries = parent.read_dir().ok()?;
438
439 for entry in entries.flatten() {
440 let sibling_path = entry.path();
441 if sibling_path
442 .extension()
443 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
444 && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
445 {
446 return Some(wheel_info);
447 }
448 }
449
450 None
451}
452
453fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
454 let file_name = url.rsplit('/').next()?;
455
456 if file_name.ends_with(".whl") {
457 return parse_wheel_filename(Path::new(file_name))
458 .map(|wheel_info| (wheel_info.name, wheel_info.version));
459 }
460
461 let stem = strip_python_archive_extension(file_name)?;
462 let (name, version) = stem.rsplit_once('-')?;
463 if name.is_empty() || version.is_empty() {
464 return None;
465 }
466
467 Some((name.replace('_', "-"), version.to_string()))
468}
469
/// Removes a recognised Python archive extension from `file_name`.
///
/// Recognised extensions are `.tar.gz`, `.tar.bz2`, `.tar.xz`, `.tgz`, `.zip` and `.whl`,
/// matched ASCII case-insensitively so that `Foo-1.0.ZIP` is handled like `foo-1.0.zip`.
/// The returned stem keeps the original casing. Returns `None` when no extension matches.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] =
        [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    ARCHIVE_SUFFIXES.iter().find_map(|suffix| {
        let split_at = file_name.len().checked_sub(suffix.len())?;
        // `get` yields `None` instead of panicking when the split point falls inside a
        // multi-byte character, in which case this suffix cannot match anyway.
        let stem = file_name.get(..split_at)?;
        let tail = file_name.get(split_at..)?;
        tail.eq_ignore_ascii_case(suffix).then_some(stem)
    })
}
475
476fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
477 root.pointer("/archive_info/hashes/sha256")
478 .and_then(|value| value.as_str())
479 .map(ToOwned::to_owned)
480 .or_else(|| {
481 root.pointer("/archive_info/hash")
482 .and_then(|value| value.as_str())
483 .and_then(normalize_origin_hash)
484 })
485}
486
/// Extracts the digest part of a hash string.
///
/// Accepts `sha256=<value>` and `sha256:<value>` (the value is returned unchecked) as well as a
/// bare string of exactly 64 ASCII hex digits. Anything else yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = ["sha256=", "sha256:"]
        .iter()
        .find_map(|prefix| hash.strip_prefix(*prefix));
    if let Some(digest) = prefixed {
        return Some(digest.to_string());
    }

    let is_bare_hex = hash.len() == 64 && hash.bytes().all(|byte| byte.is_ascii_hexdigit());
    is_bare_hex.then(|| hash.to_string())
}
499
500fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
501 let content = match read_file_to_string(path, None) {
502 Ok(content) => content,
503 Err(e) => {
504 warn!("Failed to read metadata at {:?}: {}", path, e);
505 return default_package_data(path);
506 }
507 };
508
509 let metadata = super::rfc822::parse_rfc822_content(&content);
510 let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
511 merge_sibling_metadata_dependencies(path, &mut package_data);
512 merge_sibling_metadata_file_references(path, &mut package_data);
513 if datasource_id == DatasourceId::PypiWheelMetadata {
514 merge_sibling_wheel_metadata(path, &mut package_data);
515 }
516 package_data
517}
518
519fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
520 let mut extra_dependencies = Vec::new();
521
522 if let Some(parent) = path.parent() {
523 let direct_requires = parent.join("requires.txt");
524 if direct_requires.exists()
525 && let Ok(content) = read_file_to_string(&direct_requires, None)
526 {
527 extra_dependencies.extend(parse_requires_txt(&content));
528 }
529
530 let sibling_egg_info_requires = parent
531 .read_dir()
532 .ok()
533 .into_iter()
534 .flatten()
535 .flatten()
536 .find_map(|entry| {
537 let child_path = entry.path();
538 if child_path.is_dir()
539 && child_path
540 .file_name()
541 .and_then(|name| name.to_str())
542 .is_some_and(|name| name.ends_with(".egg-info"))
543 {
544 let requires = child_path.join("requires.txt");
545 requires.exists().then_some(requires)
546 } else {
547 None
548 }
549 });
550
551 if let Some(requires_path) = sibling_egg_info_requires
552 && let Ok(content) = read_file_to_string(&requires_path, None)
553 {
554 extra_dependencies.extend(parse_requires_txt(&content));
555 }
556 }
557
558 for dependency in extra_dependencies {
559 if !package_data.dependencies.iter().any(|existing| {
560 existing.purl == dependency.purl
561 && existing.scope == dependency.scope
562 && existing.extracted_requirement == dependency.extracted_requirement
563 && existing.extra_data == dependency.extra_data
564 }) {
565 package_data.dependencies.push(dependency);
566 }
567 }
568}
569
570fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
571 let mut extra_refs = Vec::new();
572
573 if let Some(parent) = path.parent() {
574 let record_path = parent.join("RECORD");
575 if record_path.exists()
576 && let Ok(content) = read_file_to_string(&record_path, None)
577 {
578 extra_refs.extend(parse_record_csv(&content));
579 }
580
581 let installed_files_path = parent.join("installed-files.txt");
582 if installed_files_path.exists()
583 && let Ok(content) = read_file_to_string(&installed_files_path, None)
584 {
585 extra_refs.extend(parse_installed_files_txt(&content));
586 }
587
588 let sources_path = parent.join("SOURCES.txt");
589 if sources_path.exists()
590 && let Ok(content) = read_file_to_string(&sources_path, None)
591 {
592 extra_refs.extend(parse_sources_txt(&content));
593 }
594 }
595
596 for file_ref in extra_refs {
597 if !package_data
598 .file_references
599 .iter()
600 .any(|existing| existing.path == file_ref.path)
601 {
602 package_data.file_references.push(file_ref);
603 }
604 }
605}
606
607fn collect_validated_zip_entries<R: Read + std::io::Seek>(
608 archive: &mut ZipArchive<R>,
609 path: &Path,
610 archive_type: &str,
611) -> Result<Vec<ValidatedZipEntry>, String> {
612 let mut total_extracted = 0u64;
613 let mut entries = Vec::new();
614 let mut entry_count = 0usize;
615
616 for i in 0..archive.len() {
617 entry_count += 1;
618 if entry_count > MAX_ITERATION_COUNT {
619 warn!(
620 "Exceeded max entry count in {} {:?}; stopping at {} entries",
621 archive_type, path, MAX_ITERATION_COUNT
622 );
623 break;
624 }
625 if let Ok(file) = archive.by_index_raw(i) {
626 let compressed_size = file.compressed_size();
627 let uncompressed_size = file.size();
628 let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
629 warn!(
630 "Skipping unsafe path in {} {:?}: {}",
631 archive_type,
632 path,
633 file.name()
634 );
635 continue;
636 };
637
638 if compressed_size > 0 {
639 let ratio = uncompressed_size as f64 / compressed_size as f64;
640 if ratio > MAX_COMPRESSION_RATIO {
641 warn!(
642 "Suspicious compression ratio in {} {:?}: {:.2}:1",
643 archive_type, path, ratio
644 );
645 continue;
646 }
647 }
648
649 if uncompressed_size > MAX_FILE_SIZE {
650 warn!(
651 "File too large in {} {:?}: {} bytes (limit: {} bytes)",
652 archive_type, path, uncompressed_size, MAX_FILE_SIZE
653 );
654 continue;
655 }
656
657 total_extracted += uncompressed_size;
658 if total_extracted > MAX_ARCHIVE_SIZE {
659 let msg = format!(
660 "Total extracted size exceeds limit for {} {:?}",
661 archive_type, path
662 );
663 warn!("{}", msg);
664 return Err(msg);
665 }
666
667 entries.push(ValidatedZipEntry {
668 index: i,
669 name: entry_name,
670 });
671 }
672 }
673
674 Ok(entries)
675}
676
677fn is_python_sdist_archive_path(path: &Path) -> bool {
678 detect_python_sdist_archive_format(path).is_some()
679}
680
681fn is_valid_wheel_archive_path(path: &Path) -> bool {
682 if !path.is_file() {
683 return true;
684 }
685
686 let file = match File::open(path) {
687 Ok(file) => file,
688 Err(_) => return false,
689 };
690 let mut archive = match ZipArchive::new(file) {
691 Ok(archive) => archive,
692 Err(_) => return false,
693 };
694
695 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
696 Ok(entries) => entries,
697 Err(_) => return false,
698 };
699
700 find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA").is_some()
701}
702
703fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
704 let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
705
706 if !is_likely_python_sdist_filename(&file_name) {
707 return None;
708 }
709
710 if file_name.ends_with(".tar.gz") {
711 tar_gz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarGz)
712 } else if file_name.ends_with(".tgz") {
713 tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
714 } else if file_name.ends_with(".tar.bz2") {
715 tar_bz2_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarBz2)
716 } else if file_name.ends_with(".tar.xz") {
717 tar_xz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarXz)
718 } else if file_name.ends_with(".zip") {
719 zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
720 } else {
721 None
722 }
723}
724
725fn tar_gz_sdist_contains_pkg_info(path: &Path) -> bool {
726 let Some(compressed_size) = compressed_archive_size(path) else {
727 return false;
728 };
729 let file = match File::open(path) {
730 Ok(file) => file,
731 Err(_) => return false,
732 };
733 let decoder = GzDecoder::new(file);
734 tar_sdist_contains_pkg_info(path, decoder, "tar.gz", compressed_size)
735}
736
737fn tar_bz2_sdist_contains_pkg_info(path: &Path) -> bool {
738 let Some(compressed_size) = compressed_archive_size(path) else {
739 return false;
740 };
741 let file = match File::open(path) {
742 Ok(file) => file,
743 Err(_) => return false,
744 };
745 let decoder = BzDecoder::new(file);
746 tar_sdist_contains_pkg_info(path, decoder, "tar.bz2", compressed_size)
747}
748
749fn tar_xz_sdist_contains_pkg_info(path: &Path) -> bool {
750 let Some(compressed_size) = compressed_archive_size(path) else {
751 return false;
752 };
753 let file = match File::open(path) {
754 Ok(file) => file,
755 Err(_) => return false,
756 };
757 let decoder = XzDecoder::new(file);
758 tar_sdist_contains_pkg_info(path, decoder, "tar.xz", compressed_size)
759}
760
/// Returns the on-disk size of `path` in bytes, or `None` when its metadata cannot be read.
fn compressed_archive_size(path: &Path) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(metadata) => Some(metadata.len()),
        Err(_) => None,
    }
}
764
765fn tar_sdist_contains_pkg_info<R: Read>(
766 path: &Path,
767 reader: R,
768 archive_type: &str,
769 compressed_size: u64,
770) -> bool {
771 let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
772 else {
773 return false;
774 };
775
776 select_sdist_pkginfo_entry(path, &entries).is_some()
777}
778
779fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
780 if !path.is_file() {
781 return true;
782 }
783
784 let Some(compressed_size) = compressed_archive_size(path) else {
785 return false;
786 };
787 let file = match File::open(path) {
788 Ok(file) => file,
789 Err(_) => return false,
790 };
791 let decoder = GzDecoder::new(file);
792 tar_sdist_contains_pkg_info(path, decoder, "tgz", compressed_size)
793}
794
795fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
796 if !path.is_file() {
797 return true;
798 }
799
800 let file = match File::open(path) {
801 Ok(file) => file,
802 Err(_) => return false,
803 };
804 let mut archive = match ZipArchive::new(file) {
805 Ok(archive) => archive,
806 Err(_) => return false,
807 };
808
809 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
810 Ok(entries) => entries,
811 Err(_) => return false,
812 };
813 let metadata_entries: Vec<_> = validated_entries
814 .iter()
815 .filter(|entry| entry.name.ends_with("/PKG-INFO"))
816 .filter_map(|entry| {
817 read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
818 .ok()
819 .map(|content| (entry.name.clone(), content))
820 })
821 .collect();
822
823 has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
824}
825
826fn is_likely_python_sdist_filename(file_name: &str) -> bool {
827 let Some(stem) = strip_python_archive_extension(file_name) else {
828 return false;
829 };
830
831 let Some((name, version)) = stem.rsplit_once('-') else {
832 return false;
833 };
834
835 !name.is_empty()
836 && !version.is_empty()
837 && version.chars().any(|ch| ch.is_ascii_digit())
838 && name
839 .chars()
840 .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
841}
842
843fn extract_from_sdist_archive(path: &Path) -> PackageData {
844 let metadata = match std::fs::metadata(path) {
845 Ok(m) => m,
846 Err(e) => {
847 warn!(
848 "Failed to read metadata for sdist archive {:?}: {}",
849 path, e
850 );
851 return default_package_data(path);
852 }
853 };
854
855 if metadata.len() > MAX_ARCHIVE_SIZE {
856 warn!(
857 "sdist archive too large: {} bytes (limit: {} bytes)",
858 metadata.len(),
859 MAX_ARCHIVE_SIZE
860 );
861 return default_package_data(path);
862 }
863
864 let Some(format) = detect_python_sdist_archive_format(path) else {
865 return default_package_data(path);
866 };
867
868 let mut package_data = match format {
869 PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
870 let file = match File::open(path) {
871 Ok(file) => file,
872 Err(e) => {
873 warn!("Failed to open sdist archive {:?}: {}", path, e);
874 return default_package_data(path);
875 }
876 };
877 let decoder = GzDecoder::new(file);
878 extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
879 }
880 PythonSdistArchiveFormat::TarBz2 => {
881 let file = match File::open(path) {
882 Ok(file) => file,
883 Err(e) => {
884 warn!("Failed to open sdist archive {:?}: {}", path, e);
885 return default_package_data(path);
886 }
887 };
888 let decoder = BzDecoder::new(file);
889 extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
890 }
891 PythonSdistArchiveFormat::TarXz => {
892 let file = match File::open(path) {
893 Ok(file) => file,
894 Err(e) => {
895 warn!("Failed to open sdist archive {:?}: {}", path, e);
896 return default_package_data(path);
897 }
898 };
899 let decoder = XzDecoder::new(file);
900 extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
901 }
902 PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
903 };
904
905 if package_data.package_type.is_some() {
906 let (size, sha256) = calculate_file_checksums(path);
907 package_data.size = size;
908 package_data.sha256 = sha256;
909 }
910
911 package_data
912}
913
914fn extract_from_tar_sdist_archive<R: Read>(
915 path: &Path,
916 reader: R,
917 archive_type: &str,
918 compressed_size: u64,
919) -> PackageData {
920 let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
921 else {
922 return default_package_data(path);
923 };
924
925 build_sdist_package_data(path, entries)
926}
927
928fn collect_tar_sdist_entries<R: Read>(
929 path: &Path,
930 reader: R,
931 archive_type: &str,
932 compressed_size: u64,
933) -> Option<Vec<(String, String)>> {
934 let mut archive = Archive::new(reader);
935 let archive_entries = match archive.entries() {
936 Ok(entries) => entries,
937 Err(e) => {
938 warn!(
939 "Failed to read {} sdist archive {:?}: {}",
940 archive_type, path, e
941 );
942 return None;
943 }
944 };
945
946 let mut total_extracted = 0u64;
947 let mut entries = Vec::new();
948 let mut entry_count = 0usize;
949
950 for entry_result in archive_entries {
951 entry_count += 1;
952 if entry_count > MAX_ITERATION_COUNT {
953 warn!(
954 "Exceeded max entry count in {} sdist {:?}; stopping at {} entries",
955 archive_type, path, MAX_ITERATION_COUNT
956 );
957 break;
958 }
959
960 let mut entry = match entry_result {
961 Ok(entry) => entry,
962 Err(e) => {
963 warn!(
964 "Failed to read {} sdist entry from {:?}: {}",
965 archive_type, path, e
966 );
967 continue;
968 }
969 };
970
971 let entry_size = entry.size();
972 if entry_size > MAX_FILE_SIZE {
973 warn!(
974 "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
975 archive_type, path, entry_size, MAX_FILE_SIZE
976 );
977 continue;
978 }
979
980 total_extracted += entry_size;
981 if total_extracted > MAX_ARCHIVE_SIZE {
982 warn!(
983 "Total extracted size exceeds limit for {} sdist {:?}",
984 archive_type, path
985 );
986 return None;
987 }
988
989 if compressed_size > 0 {
990 let ratio = total_extracted as f64 / compressed_size as f64;
991 if ratio > MAX_COMPRESSION_RATIO {
992 warn!(
993 "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
994 archive_type, path, ratio
995 );
996 return None;
997 }
998 }
999
1000 let entry_path = match entry.path() {
1001 Ok(path) => path.to_string_lossy().replace('\\', "/"),
1002 Err(e) => {
1003 warn!(
1004 "Failed to get {} sdist entry path from {:?}: {}",
1005 archive_type, path, e
1006 );
1007 continue;
1008 }
1009 };
1010
1011 let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
1012 warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
1013 continue;
1014 };
1015
1016 if !is_relevant_sdist_text_entry(&entry_path) {
1017 continue;
1018 }
1019
1020 if let Ok(content) = read_limited_utf8(
1021 &mut entry,
1022 MAX_FILE_SIZE,
1023 &format!("{} entry {}", archive_type, entry_path),
1024 ) {
1025 entries.push((entry_path, content));
1026 }
1027 }
1028
1029 Some(entries)
1030}
1031
1032fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
1033 let file = match File::open(path) {
1034 Ok(file) => file,
1035 Err(e) => {
1036 warn!("Failed to open zip sdist archive {:?}: {}", path, e);
1037 return default_package_data(path);
1038 }
1039 };
1040
1041 let mut archive = match ZipArchive::new(file) {
1042 Ok(archive) => archive,
1043 Err(e) => {
1044 warn!("Failed to read zip sdist archive {:?}: {}", path, e);
1045 return default_package_data(path);
1046 }
1047 };
1048
1049 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
1050 Ok(entries) => entries,
1051 Err(_) => return default_package_data(path),
1052 };
1053
1054 let mut entries = Vec::new();
1055 for entry in validated_entries.iter() {
1056 if !is_relevant_sdist_text_entry(&entry.name) {
1057 continue;
1058 }
1059
1060 if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
1061 entries.push((entry.name.clone(), content));
1062 }
1063 }
1064
1065 build_sdist_package_data(path, entries)
1066}
1067
/// Returns `true` for archive entries named `PKG-INFO`, `requires.txt` or `SOURCES.txt` that sit
/// inside at least one directory (the path must contain a `/` before the file name).
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
1073
1074fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
1075 let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
1076 warn!("No PKG-INFO file found in sdist archive {:?}", path);
1077 return default_package_data(path);
1078 };
1079
1080 let mut package_data =
1081 python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
1082 merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
1083 merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
1084 apply_sdist_name_version_fallback(path, &mut package_data);
1085 package_data.datasource_id = Some(DatasourceId::PypiSdist);
1086 package_data
1087}
1088
1089fn select_sdist_pkginfo_entry(
1090 archive_path: &Path,
1091 entries: &[(String, String)],
1092) -> Option<(String, String)> {
1093 let expected_name = sdist_archive_expected_name(archive_path);
1094
1095 entries
1096 .iter()
1097 .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
1098 .min_by_key(|(entry_path, content)| {
1099 let components: Vec<_> = entry_path
1100 .split('/')
1101 .filter(|part| !part.is_empty())
1102 .collect();
1103 let candidate_name = sdist_pkginfo_candidate_name(content);
1104 let name_rank = if candidate_name == expected_name {
1105 0
1106 } else {
1107 1
1108 };
1109 let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1110
1111 (name_rank, kind_rank, components.len(), entry_path.clone())
1112 })
1113 .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1114}
1115
1116fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1117 let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1118 return false;
1119 };
1120
1121 entries.iter().any(|(entry_path, content)| {
1122 sdist_pkginfo_kind_rank(entry_path) < 3
1123 && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1124 })
1125}
1126
1127fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1128 archive_path
1129 .file_name()
1130 .and_then(|name| name.to_str())
1131 .and_then(strip_python_archive_extension)
1132 .and_then(|stem| {
1133 stem.rsplit_once('-')
1134 .map(|(name, _)| normalize_python_package_name(name))
1135 })
1136}
1137
1138fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1139 let metadata = super::rfc822::parse_rfc822_content(content);
1140 super::rfc822::get_header_first(&metadata.headers, "name")
1141 .map(|name| normalize_python_package_name(&name))
1142}
1143
/// Ranks a `PKG-INFO` entry path by where it sits in the archive (lower is preferred).
///
/// * `0` — `<root>/<name>.egg-info/PKG-INFO`
/// * `1` — `<root>/PKG-INFO`
/// * `2` — any other path ending in `.egg-info/PKG-INFO`
/// * `3` — everything else
///
/// Empty path components (for example from doubled slashes) are ignored when counting depth.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let parts: Vec<&str> = entry_path
        .split('/')
        .filter(|part| !part.is_empty())
        .collect();

    match parts.as_slice() {
        [_, dir, "PKG-INFO"] if dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1161
1162fn merge_sdist_archive_dependencies(
1163 entries: &[(String, String)],
1164 metadata_path: &str,
1165 package_data: &mut PackageData,
1166) {
1167 let metadata_dir = metadata_path
1168 .rsplit_once('/')
1169 .map(|(dir, _)| dir)
1170 .unwrap_or("");
1171 let archive_root = metadata_path.split('/').next().unwrap_or("");
1172 let matched_egg_info_dir =
1173 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1174 let mut extra_dependencies = Vec::new();
1175
1176 for (entry_path, content) in entries {
1177 let is_direct_requires =
1178 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1179 let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1180 entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1181 });
1182
1183 if is_direct_requires || is_egg_info_requires {
1184 extra_dependencies.extend(parse_requires_txt(content));
1185 }
1186 }
1187
1188 for dependency in extra_dependencies {
1189 if !package_data.dependencies.iter().any(|existing| {
1190 existing.purl == dependency.purl
1191 && existing.scope == dependency.scope
1192 && existing.extracted_requirement == dependency.extracted_requirement
1193 && existing.extra_data == dependency.extra_data
1194 }) {
1195 package_data.dependencies.push(dependency);
1196 }
1197 }
1198}
1199
1200fn merge_sdist_archive_file_references(
1201 entries: &[(String, String)],
1202 metadata_path: &str,
1203 package_data: &mut PackageData,
1204) {
1205 let metadata_dir = metadata_path
1206 .rsplit_once('/')
1207 .map(|(dir, _)| dir)
1208 .unwrap_or("");
1209 let archive_root = metadata_path.split('/').next().unwrap_or("");
1210 let matched_egg_info_dir =
1211 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1212 let mut extra_refs = Vec::new();
1213
1214 for (entry_path, content) in entries {
1215 let is_direct_sources =
1216 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1217 let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1218 entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1219 });
1220
1221 if is_direct_sources || is_egg_info_sources {
1222 extra_refs.extend(parse_sources_txt(content));
1223 }
1224 }
1225
1226 for file_ref in extra_refs {
1227 if !package_data
1228 .file_references
1229 .iter()
1230 .any(|existing| existing.path == file_ref.path)
1231 {
1232 package_data.file_references.push(file_ref);
1233 }
1234 }
1235}
1236
1237fn select_matching_sdist_egg_info_dir(
1238 entries: &[(String, String)],
1239 archive_root: &str,
1240 package_name: Option<&str>,
1241) -> Option<String> {
1242 let normalized_package_name = package_name.map(normalize_python_package_name);
1243
1244 entries
1245 .iter()
1246 .filter_map(|(entry_path, _)| {
1247 let components: Vec<_> = entry_path
1248 .split('/')
1249 .filter(|part| !part.is_empty())
1250 .collect();
1251 if components.len() == 3
1252 && components[0] == archive_root
1253 && components[1].ends_with(".egg-info")
1254 {
1255 Some(components[1].to_string())
1256 } else {
1257 None
1258 }
1259 })
1260 .min_by_key(|egg_info_dir| {
1261 let normalized_dir_name =
1262 normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1263 let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1264 0
1265 } else {
1266 1
1267 };
1268
1269 (name_rank, egg_info_dir.clone())
1270 })
1271}
1272
/// Normalizes a Python distribution name for comparison, following PEP 503.
///
/// ASCII letters are lowercased and every run of `-`, `_` and `.` collapses into a single `-`,
/// so `Zope.Interface`, `zope_interface` and `zope-interface` all compare equal. This matters
/// for sdists whose file name uses the `_`-escaped form while `PKG-INFO` carries the dotted name.
fn normalize_python_package_name(name: &str) -> String {
    let mut normalized = String::with_capacity(name.len());
    let mut in_separator_run = false;

    for ch in name.chars() {
        if matches!(ch, '-' | '_' | '.') {
            // Emit one `-` per run of separators.
            if !in_separator_run {
                normalized.push('-');
            }
            in_separator_run = true;
        } else {
            normalized.push(ch.to_ascii_lowercase());
            in_separator_run = false;
        }
    }

    normalized
}
1276
1277fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1278 let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1279 return;
1280 };
1281
1282 let Some(stem) = strip_python_archive_extension(file_name) else {
1283 return;
1284 };
1285
1286 let Some((name, version)) = stem.rsplit_once('-') else {
1287 return;
1288 };
1289
1290 if package_data.name.is_none() {
1291 package_data.name = Some(name.replace('_', "-"));
1292 }
1293 if package_data.version.is_none() {
1294 package_data.version = Some(version.to_string());
1295 }
1296
1297 if package_data.purl.is_none()
1298 || package_data.repository_homepage_url.is_none()
1299 || package_data.repository_download_url.is_none()
1300 || package_data.api_data_url.is_none()
1301 {
1302 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1303 build_pypi_urls(
1304 package_data.name.as_deref(),
1305 package_data.version.as_deref(),
1306 );
1307
1308 if package_data.repository_homepage_url.is_none() {
1309 package_data.repository_homepage_url = repository_homepage_url;
1310 }
1311 if package_data.repository_download_url.is_none() {
1312 package_data.repository_download_url = repository_download_url;
1313 }
1314 if package_data.api_data_url.is_none() {
1315 package_data.api_data_url = api_data_url;
1316 }
1317 if package_data.purl.is_none() {
1318 package_data.purl = purl;
1319 }
1320 }
1321}
1322
1323fn extract_from_wheel_archive(path: &Path) -> PackageData {
1324 let metadata = match std::fs::metadata(path) {
1325 Ok(m) => m,
1326 Err(e) => {
1327 warn!(
1328 "Failed to read metadata for wheel archive {:?}: {}",
1329 path, e
1330 );
1331 return default_package_data(path);
1332 }
1333 };
1334
1335 if metadata.len() > MAX_ARCHIVE_SIZE {
1336 warn!(
1337 "Wheel archive too large: {} bytes (limit: {} bytes)",
1338 metadata.len(),
1339 MAX_ARCHIVE_SIZE
1340 );
1341 return default_package_data(path);
1342 }
1343
1344 let file = match File::open(path) {
1345 Ok(f) => f,
1346 Err(e) => {
1347 warn!("Failed to open wheel archive {:?}: {}", path, e);
1348 return default_package_data(path);
1349 }
1350 };
1351
1352 let mut archive = match ZipArchive::new(file) {
1353 Ok(a) => a,
1354 Err(e) => {
1355 warn!("Failed to read wheel archive {:?}: {}", path, e);
1356 return default_package_data(path);
1357 }
1358 };
1359
1360 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1361 Ok(entries) => entries,
1362 Err(_) => return default_package_data(path),
1363 };
1364
1365 let metadata_entry =
1366 match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1367 Some(entry) => entry,
1368 None => {
1369 warn!("No METADATA file found in wheel archive {:?}", path);
1370 return default_package_data(path);
1371 }
1372 };
1373
1374 let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1375 Ok(c) => c,
1376 Err(e) => {
1377 warn!("Failed to read METADATA from {:?}: {}", path, e);
1378 return default_package_data(path);
1379 }
1380 };
1381
1382 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1383
1384 let (size, sha256) = calculate_file_checksums(path);
1385 package_data.size = size;
1386 package_data.sha256 = sha256;
1387
1388 if let Some(record_entry) =
1389 find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1390 && let Ok(record_content) =
1391 read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1392 {
1393 package_data.file_references = parse_record_csv(&record_content);
1394 }
1395
1396 if let Some(wheel_info) = parse_wheel_filename(path) {
1397 if package_data.name.is_none() {
1398 package_data.name = Some(wheel_info.name.clone());
1399 }
1400 if package_data.version.is_none() {
1401 package_data.version = Some(wheel_info.version.clone());
1402 }
1403
1404 package_data.qualifiers = Some(std::collections::HashMap::from([(
1405 "extension".to_string(),
1406 format!(
1407 "{}-{}-{}",
1408 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1409 ),
1410 )]));
1411
1412 package_data.purl = build_wheel_purl(
1413 package_data.name.as_deref(),
1414 package_data.version.as_deref(),
1415 &wheel_info,
1416 );
1417
1418 let mut extra_data = package_data.extra_data.unwrap_or_default();
1419 extra_data.insert(
1420 "python_requires".to_string(),
1421 serde_json::Value::String(wheel_info.python_tag.clone()),
1422 );
1423 extra_data.insert(
1424 "abi_tag".to_string(),
1425 serde_json::Value::String(wheel_info.abi_tag.clone()),
1426 );
1427 extra_data.insert(
1428 "platform_tag".to_string(),
1429 serde_json::Value::String(wheel_info.platform_tag.clone()),
1430 );
1431 package_data.extra_data = Some(extra_data);
1432 }
1433
1434 package_data
1435}
1436
1437fn extract_from_egg_archive(path: &Path) -> PackageData {
1438 let metadata = match std::fs::metadata(path) {
1439 Ok(m) => m,
1440 Err(e) => {
1441 warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1442 return default_package_data(path);
1443 }
1444 };
1445
1446 if metadata.len() > MAX_ARCHIVE_SIZE {
1447 warn!(
1448 "Egg archive too large: {} bytes (limit: {} bytes)",
1449 metadata.len(),
1450 MAX_ARCHIVE_SIZE
1451 );
1452 return default_package_data(path);
1453 }
1454
1455 let file = match File::open(path) {
1456 Ok(f) => f,
1457 Err(e) => {
1458 warn!("Failed to open egg archive {:?}: {}", path, e);
1459 return default_package_data(path);
1460 }
1461 };
1462
1463 let mut archive = match ZipArchive::new(file) {
1464 Ok(a) => a,
1465 Err(e) => {
1466 warn!("Failed to read egg archive {:?}: {}", path, e);
1467 return default_package_data(path);
1468 }
1469 };
1470
1471 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1472 Ok(entries) => entries,
1473 Err(_) => return default_package_data(path),
1474 };
1475
1476 let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1477 &validated_entries,
1478 &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1479 ) {
1480 Some(entry) => entry,
1481 None => {
1482 warn!("No PKG-INFO file found in egg archive {:?}", path);
1483 return default_package_data(path);
1484 }
1485 };
1486
1487 let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1488 Ok(c) => c,
1489 Err(e) => {
1490 warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1491 return default_package_data(path);
1492 }
1493 };
1494
1495 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1496
1497 let (size, sha256) = calculate_file_checksums(path);
1498 package_data.size = size;
1499 package_data.sha256 = sha256;
1500
1501 if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1502 &validated_entries,
1503 &[
1504 "EGG-INFO/installed-files.txt",
1505 ".egg-info/installed-files.txt",
1506 ],
1507 ) && let Ok(installed_files_content) =
1508 read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1509 {
1510 package_data.file_references = parse_installed_files_txt(&installed_files_content);
1511 }
1512
1513 if let Some(egg_info) = parse_egg_filename(path) {
1514 if package_data.name.is_none() {
1515 package_data.name = Some(egg_info.name.clone());
1516 }
1517 if package_data.version.is_none() {
1518 package_data.version = Some(egg_info.version.clone());
1519 }
1520
1521 if let Some(python_version) = &egg_info.python_version {
1522 let mut extra_data = package_data.extra_data.unwrap_or_default();
1523 extra_data.insert(
1524 "python_version".to_string(),
1525 serde_json::Value::String(python_version.clone()),
1526 );
1527 package_data.extra_data = Some(extra_data);
1528 }
1529 }
1530
1531 package_data.purl = build_egg_purl(
1532 package_data.name.as_deref(),
1533 package_data.version.as_deref(),
1534 );
1535
1536 package_data
1537}
1538
1539fn find_validated_zip_entry_by_suffix<'a>(
1540 entries: &'a [ValidatedZipEntry],
1541 suffix: &str,
1542) -> Option<&'a ValidatedZipEntry> {
1543 entries.iter().find(|entry| entry.name.ends_with(suffix))
1544}
1545
1546fn find_validated_zip_entry_by_any_suffix<'a>(
1547 entries: &'a [ValidatedZipEntry],
1548 suffixes: &[&str],
1549) -> Option<&'a ValidatedZipEntry> {
1550 entries
1551 .iter()
1552 .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1553}
1554
1555fn read_validated_zip_entry<R: Read + std::io::Seek>(
1556 archive: &mut ZipArchive<R>,
1557 entry: &ValidatedZipEntry,
1558 path: &Path,
1559 archive_type: &str,
1560) -> Result<String, String> {
1561 let mut file = archive
1562 .by_index(entry.index)
1563 .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1564
1565 let compressed_size = file.compressed_size();
1566 let uncompressed_size = file.size();
1567
1568 if compressed_size > 0 {
1569 let ratio = uncompressed_size as f64 / compressed_size as f64;
1570 if ratio > MAX_COMPRESSION_RATIO {
1571 return Err(format!(
1572 "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1573 archive_type, path, ratio
1574 ));
1575 }
1576 }
1577
1578 if uncompressed_size > MAX_FILE_SIZE {
1579 return Err(format!(
1580 "Rejected oversized entry in {} {:?}: {} bytes",
1581 archive_type, path, uncompressed_size
1582 ));
1583 }
1584
1585 read_limited_utf8(
1586 &mut file,
1587 MAX_FILE_SIZE,
1588 &format!("{} entry {}", archive_type, entry.name),
1589 )
1590}
1591
1592fn read_limited_utf8<R: Read>(
1593 reader: &mut R,
1594 max_bytes: u64,
1595 context: &str,
1596) -> Result<String, String> {
1597 let mut limited = reader.take(max_bytes + 1);
1598 let mut bytes = Vec::new();
1599 limited
1600 .read_to_end(&mut bytes)
1601 .map_err(|e| format!("Failed to read {}: {}", context, e))?;
1602
1603 if bytes.len() as u64 > max_bytes {
1604 return Err(format!(
1605 "{} exceeded {} byte limit while reading",
1606 context, max_bytes
1607 ));
1608 }
1609
1610 match String::from_utf8(bytes) {
1611 Ok(s) => Ok(s),
1612 Err(err) => {
1613 let bytes = err.into_bytes();
1614 warn!("Invalid UTF-8 in archive entry; using lossy conversion");
1615 Ok(String::from_utf8_lossy(&bytes).into_owned())
1616 }
1617 }
1618}
1619
/// Normalizes an archive entry path to a relative, `/`-separated form.
///
/// Backslashes are treated as separators and `.` components are dropped. Returns `None` for
/// paths that start with a drive letter (`X:/`), are absolute, contain a `..` component, or
/// have no normal component left.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Checked on the text because on non-Windows hosts `Path` would treat `C:` as a normal
    // component rather than a prefix.
    let has_drive_prefix = matches!(
        unified.as_bytes(),
        [drive, b':', b'/', ..] if drive.is_ascii_alphabetic()
    );
    if has_drive_prefix {
        return None;
    }

    // Collecting into `Option<Vec<_>>` stops at the first rejected component.
    let segments = Path::new(&unified)
        .components()
        .filter(|component| !matches!(component, Component::CurDir))
        .map(|component| match component {
            Component::Normal(segment) => Some(segment.to_string_lossy().into_owned()),
            _ => None,
        })
        .collect::<Option<Vec<String>>>()?;

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1641
1642pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1647 let mut reader = ReaderBuilder::new()
1648 .has_headers(false)
1649 .from_reader(content.as_bytes());
1650
1651 let mut file_references = Vec::new();
1652 let mut record_count = 0usize;
1653
1654 for result in reader.records() {
1655 record_count += 1;
1656 if record_count > MAX_ITERATION_COUNT {
1657 warn!(
1658 "Exceeded max record count in RECORD CSV; stopping at {} records",
1659 MAX_ITERATION_COUNT
1660 );
1661 break;
1662 }
1663 match result {
1664 Ok(record) => {
1665 if record.len() < 3 {
1666 continue;
1667 }
1668
1669 let path = record.get(0).unwrap_or("").trim().to_string();
1670 if path.is_empty() {
1671 continue;
1672 }
1673
1674 let hash_field = record.get(1).unwrap_or("").trim();
1675 let size_field = record.get(2).unwrap_or("").trim();
1676
1677 let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1679 let parts: Vec<&str> = hash_field.split('=').collect();
1680 if parts.len() == 2 && parts[0] == "sha256" {
1681 match URL_SAFE_NO_PAD.decode(parts[1]) {
1682 Ok(decoded) => {
1683 let hex = decoded
1684 .iter()
1685 .map(|b| format!("{:02x}", b))
1686 .collect::<String>();
1687 Sha256Digest::from_hex(&hex).ok()
1688 }
1689 Err(_) => None,
1690 }
1691 } else {
1692 None
1693 }
1694 } else {
1695 None
1696 };
1697
1698 let size = if !size_field.is_empty() && size_field != "-" {
1700 size_field.parse::<u64>().ok()
1701 } else {
1702 None
1703 };
1704
1705 file_references.push(FileReference {
1706 path,
1707 size,
1708 sha1: None,
1709 md5: None,
1710 sha256,
1711 sha512: None,
1712 extra_data: None,
1713 });
1714 }
1715 Err(e) => {
1716 warn!("Failed to parse RECORD CSV row: {}", e);
1717 continue;
1718 }
1719 }
1720 }
1721
1722 file_references
1723}
1724
1725pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1728 content
1729 .lines()
1730 .take(MAX_ITERATION_COUNT)
1731 .map(|line| line.trim())
1732 .filter(|line| !line.is_empty())
1733 .map(|path| FileReference {
1734 path: path.to_string(),
1735 size: None,
1736 sha1: None,
1737 md5: None,
1738 sha256: None,
1739 sha512: None,
1740 extra_data: None,
1741 })
1742 .collect()
1743}
1744
1745pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1746 content
1747 .lines()
1748 .take(MAX_ITERATION_COUNT)
1749 .map(str::trim)
1750 .filter(|line| !line.is_empty())
1751 .map(|path| FileReference {
1752 path: path.to_string(),
1753 size: None,
1754 sha1: None,
1755 md5: None,
1756 sha256: None,
1757 sha512: None,
1758 extra_data: None,
1759 })
1760 .collect()
1761}
1762
/// Components parsed from a wheel file name.
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component, as written in the file name.
    version: String,
    /// Python tag, e.g. `py3` or `cp39`.
    python_tag: String,
    /// ABI tag, e.g. `none` or `cp39`.
    abi_tag: String,
    /// Platform tag; any trailing components are joined with `-`.
    platform_tag: String,
}

/// Parses `{name}-{version}(-{build})?-{python}-{abi}-{platform}.whl` from the file stem.
///
/// Returns `None` when the path has no file stem or the stem has fewer than five `-`-separated
/// components. The optional build tag is recognised when there are at least six components and
/// the third one starts with an ASCII digit (build tags must start with a digit, Python tags
/// never do); it is skipped so that it is not mistaken for the Python tag.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    let has_build_tag =
        parts.len() >= 6 && parts[2].starts_with(|c: char| c.is_ascii_digit());
    let tags_start = if has_build_tag { 3 } else { 2 };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: parts[tags_start].to_string(),
        abi_tag: parts[tags_start + 1].to_string(),
        platform_tag: parts[tags_start + 2..].join("-"),
    })
}
1787
/// Components parsed from an egg file name.
struct EggInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component, as written in the file name.
    version: String,
    /// Third `-`-separated component of the stem (e.g. `py2.7`), when present.
    python_version: Option<String>,
}

/// Parses `{name}-{version}(-{python version})?` from the file stem of an egg path.
///
/// Returns `None` when the path has no file stem or the stem has no `-`. Components after the
/// third one are ignored.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut components = stem.split('-');

    let name = components.next()?;
    let version = components.next()?;
    let python_version = components.next();

    Some(EggInfo {
        name: name.replace('_', "-"),
        version: version.to_string(),
        python_version: python_version.map(str::to_string),
    })
}
1808
1809fn build_wheel_purl(
1810 name: Option<&str>,
1811 version: Option<&str>,
1812 wheel_info: &WheelInfo,
1813) -> Option<String> {
1814 let name = name?;
1815 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1816
1817 if let Some(ver) = version {
1818 package_url.with_version(ver).ok()?;
1819 }
1820
1821 let extension = format!(
1822 "{}-{}-{}",
1823 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1824 );
1825 package_url.add_qualifier("extension", extension).ok()?;
1826
1827 Some(package_url.to_string())
1828}
1829
1830fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1831 let name = name?;
1832 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1833
1834 if let Some(ver) = version {
1835 package_url.with_version(ver).ok()?;
1836 }
1837
1838 package_url.add_qualifier("type", "egg").ok()?;
1839
1840 Some(package_url.to_string())
1841}
1842
1843fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1844 let metadata = super::rfc822::parse_rfc822_content(content);
1845 build_package_data_from_rfc822(&metadata, datasource_id)
1846}
1847
1848fn build_package_data_from_rfc822(
1853 metadata: &super::rfc822::Rfc822Metadata,
1854 datasource_id: DatasourceId,
1855) -> PackageData {
1856 use super::rfc822::{get_header_all, get_header_first};
1857
1858 let name = get_header_first(&metadata.headers, "name").map(truncate_field);
1859 let version = get_header_first(&metadata.headers, "version").map(truncate_field);
1860 let summary = get_header_first(&metadata.headers, "summary").map(truncate_field);
1861 let mut homepage_url = get_header_first(&metadata.headers, "home-page").map(truncate_field);
1862 let author = get_header_first(&metadata.headers, "author").map(truncate_field);
1863 let author_email = get_header_first(&metadata.headers, "author-email").map(truncate_field);
1864 let license = get_header_first(&metadata.headers, "license").map(truncate_field);
1865 let license_expression = get_header_first(&metadata.headers, "license-expression");
1866 let download_url = get_header_first(&metadata.headers, "download-url");
1867 let platform = get_header_first(&metadata.headers, "platform");
1868 let requires_python = get_header_first(&metadata.headers, "requires-python");
1869 let classifiers = get_header_all(&metadata.headers, "classifier");
1870 let license_files = get_header_all(&metadata.headers, "license-file");
1871
1872 let description_body = if metadata.body.is_empty() {
1873 get_header_first(&metadata.headers, "description").unwrap_or_default()
1874 } else {
1875 metadata.body.clone()
1876 };
1877
1878 let description = build_description(summary.as_deref(), &description_body).map(truncate_field);
1879
1880 let mut parties = Vec::new();
1881 if author.is_some() || author_email.is_some() {
1882 parties.push(Party {
1883 r#type: Some("person".to_string()),
1884 role: Some("author".to_string()),
1885 name: author,
1886 email: author_email,
1887 url: None,
1888 organization: None,
1889 organization_url: None,
1890 timezone: None,
1891 });
1892 }
1893
1894 let (keywords, license_classifiers) = split_classifiers(&classifiers);
1895 let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1896 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1897 license_expression
1898 .as_deref()
1899 .and_then(normalize_spdx_expression)
1900 .map(|normalized| {
1901 build_declared_license_data(
1902 normalized,
1903 DeclaredLicenseMatchMetadata::single_line(
1904 license_expression.as_deref().unwrap_or_default(),
1905 )
1906 .with_referenced_filenames(&referenced_license_files),
1907 )
1908 })
1909 .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1910
1911 let extracted_license_statement = license_expression
1912 .clone()
1913 .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1914
1915 let mut extra_data = HashMap::new();
1916 if let Some(platform_value) = platform
1917 && !platform_value.eq_ignore_ascii_case("unknown")
1918 && !platform_value.is_empty()
1919 {
1920 extra_data.insert(
1921 "platform".to_string(),
1922 serde_json::Value::String(platform_value),
1923 );
1924 }
1925
1926 if let Some(requires_python_value) = requires_python
1927 && !requires_python_value.is_empty()
1928 {
1929 extra_data.insert(
1930 "requires_python".to_string(),
1931 serde_json::Value::String(requires_python_value),
1932 );
1933 }
1934
1935 if !license_files.is_empty() {
1936 extra_data.insert(
1937 "license_files".to_string(),
1938 serde_json::Value::Array(
1939 license_files
1940 .iter()
1941 .cloned()
1942 .map(serde_json::Value::String)
1943 .collect(),
1944 ),
1945 );
1946 }
1947
1948 let file_references = license_files
1949 .iter()
1950 .map(|path| FileReference {
1951 path: path.clone(),
1952 size: None,
1953 sha1: None,
1954 md5: None,
1955 sha256: None,
1956 sha512: None,
1957 extra_data: None,
1958 })
1959 .collect();
1960
1961 let project_urls = get_header_all(&metadata.headers, "project-url");
1962 let dependencies = extract_rfc822_dependencies(&metadata.headers);
1963 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1964
1965 if !project_urls.is_empty() {
1966 let parsed_urls = parse_project_urls(&project_urls);
1967
1968 for (label, url) in &parsed_urls {
1969 let label_lower = label.to_lowercase();
1970
1971 if bug_tracking_url.is_none()
1972 && matches!(
1973 label_lower.as_str(),
1974 "tracker"
1975 | "bug reports"
1976 | "bug tracker"
1977 | "issues"
1978 | "issue tracker"
1979 | "github: issues"
1980 )
1981 {
1982 bug_tracking_url = Some(url.clone());
1983 } else if code_view_url.is_none()
1984 && matches!(label_lower.as_str(), "source" | "source code" | "code")
1985 {
1986 code_view_url = Some(url.clone());
1987 } else if vcs_url.is_none()
1988 && matches!(
1989 label_lower.as_str(),
1990 "github" | "gitlab" | "github: repo" | "repository"
1991 )
1992 {
1993 vcs_url = Some(url.clone());
1994 } else if homepage_url.is_none()
1995 && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1996 {
1997 homepage_url = Some(url.clone());
1998 } else if label_lower == "changelog" {
1999 extra_data.insert(
2000 "changelog_url".to_string(),
2001 serde_json::Value::String(url.clone()),
2002 );
2003 }
2004 }
2005
2006 let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
2007 .iter()
2008 .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
2009 .collect();
2010
2011 if !project_urls_json.is_empty() {
2012 extra_data.insert(
2013 "project_urls".to_string(),
2014 serde_json::Value::Object(project_urls_json),
2015 );
2016 }
2017 }
2018
2019 let extra_data = if extra_data.is_empty() {
2020 None
2021 } else {
2022 Some(extra_data)
2023 };
2024
2025 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
2026 build_pypi_urls(name.as_deref(), version.as_deref());
2027
2028 PackageData {
2029 package_type: Some(PythonParser::PACKAGE_TYPE),
2030 namespace: None,
2031 name,
2032 version,
2033 qualifiers: None,
2034 subpath: None,
2035 primary_language: Some("Python".to_string()),
2036 description,
2037 release_date: None,
2038 parties,
2039 keywords,
2040 homepage_url,
2041 download_url,
2042 size: None,
2043 sha1: None,
2044 md5: None,
2045 sha256: None,
2046 sha512: None,
2047 bug_tracking_url,
2048 code_view_url,
2049 vcs_url,
2050 copyright: None,
2051 holder: None,
2052 declared_license_expression,
2053 declared_license_expression_spdx,
2054 license_detections,
2055 other_license_expression: None,
2056 other_license_expression_spdx: None,
2057 other_license_detections: Vec::new(),
2058 extracted_license_statement,
2059 notice_text: None,
2060 source_packages: Vec::new(),
2061 file_references,
2062 is_private: false,
2063 is_virtual: false,
2064 extra_data,
2065 dependencies,
2066 repository_homepage_url,
2067 repository_download_url,
2068 api_data_url,
2069 datasource_id: Some(datasource_id),
2070 purl,
2071 }
2072}
2073
/// Splits `Project-URL` header values into `(label, url)` pairs.
///
/// Each value is expected to be `<label>, <url>`. The split happens at the first comma, so the
/// space after it is optional and commas inside the URL are preserved. Both parts are trimmed;
/// values without a comma, or with an empty label or URL, are dropped.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|url_entry| {
            let (label, url) = url_entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());
            if label.is_empty() || url.is_empty() {
                return None;
            }
            Some((label.to_string(), url.to_string()))
        })
        .collect()
}
2089
/// Combines the summary and the long description into one text.
///
/// Both parts are trimmed and blank parts are dropped; the remaining parts are joined with a
/// newline, summary first. Returns `None` when both are blank.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let sections: Vec<&str> = summary
        .into_iter()
        .chain(Some(body))
        .map(str::trim)
        .filter(|section| !section.is_empty())
        .collect();

    if sections.is_empty() {
        None
    } else {
        Some(sections.join("\n"))
    }
}
2108
/// Separates trove classifiers into `(keywords, license_classifiers)`.
///
/// Classifiers starting with `License ::` go to the second list, all others to the first.
/// The relative order within each list is preserved.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|classifier| classifier.starts_with("License ::"));

    (keywords, license_classifiers)
}
2123
/// Renders the license header and license classifiers as a small multi-line statement.
///
/// A non-blank `license` produces a `license: <value>` line; a non-empty classifier list
/// produces a `classifiers:` line followed by one quoted list item per classifier. Every line
/// ends with a newline. Returns `None` when neither part yields any text.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let mut statement = String::new();

    if let Some(value) = license.map(str::trim).filter(|value| !value.is_empty()) {
        statement.push_str("license: ");
        statement.push_str(value);
        statement.push('\n');
    }

    if !license_classifiers.is_empty() {
        statement.push_str("classifiers:\n");
        for classifier in license_classifiers {
            statement.push_str(" - '");
            statement.push_str(classifier);
            statement.push_str("'\n");
        }
    }

    if statement.is_empty() {
        None
    } else {
        Some(statement)
    }
}
2149
2150pub(crate) fn build_pypi_urls(
2151 name: Option<&str>,
2152 version: Option<&str>,
2153) -> (
2154 Option<String>,
2155 Option<String>,
2156 Option<String>,
2157 Option<String>,
2158) {
2159 let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2160
2161 let repository_download_url = name.and_then(|value| {
2162 version.map(|ver| {
2163 format!(
2164 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2165 &value[..1.min(value.len())],
2166 value,
2167 value,
2168 ver
2169 )
2170 })
2171 });
2172
2173 let api_data_url = name.map(|value| {
2174 if let Some(ver) = version {
2175 format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2176 } else {
2177 format!("https://pypi.org/pypi/{}/json", value)
2178 }
2179 });
2180
2181 let purl = name.and_then(|value| {
2182 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2183 if let Some(ver) = version {
2184 package_url.with_version(ver).ok()?;
2185 }
2186 Some(package_url.to_string())
2187 });
2188
2189 (
2190 repository_homepage_url,
2191 repository_download_url,
2192 api_data_url,
2193 purl,
2194 )
2195}
2196
2197fn build_pypi_purl_with_extension(
2198 name: &str,
2199 version: Option<&str>,
2200 extension: &str,
2201) -> Option<String> {
2202 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2203 if let Some(ver) = version {
2204 package_url.with_version(ver).ok()?;
2205 }
2206 package_url.add_qualifier("extension", extension).ok()?;
2207 Some(package_url.to_string())
2208}
2209
2210fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2211 let toml_content = match read_toml_file(path) {
2212 Ok(content) => content,
2213 Err(e) => {
2214 warn!(
2215 "Failed to read or parse pyproject.toml at {:?}: {}",
2216 path, e
2217 );
2218 return default_package_data(path);
2219 }
2220 };
2221
2222 let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2223 let is_poetry_pyproject = tool_table
2224 .and_then(|tool| tool.get("poetry"))
2225 .and_then(|value| value.as_table())
2226 .is_some();
2227
2228 let project_table =
2230 if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2231 project.clone()
2233 } else if let Some(tool) = tool_table {
2234 if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2235 poetry.clone()
2237 } else {
2238 return default_package_data(path);
2239 }
2240 } else if toml_content.get(FIELD_NAME).is_some() {
2241 match toml_content.as_table() {
2243 Some(table) => table.clone(),
2244 None => {
2245 warn!("Failed to convert TOML content to table in {:?}", path);
2246 return default_package_data(path);
2247 }
2248 }
2249 } else {
2250 return default_package_data(path);
2251 };
2252
2253 let name = project_table
2254 .get(FIELD_NAME)
2255 .and_then(|v| v.as_str())
2256 .map(|v| truncate_field(v.to_string()));
2257
2258 let version = project_table
2259 .get(FIELD_VERSION)
2260 .and_then(|v| v.as_str())
2261 .map(String::from);
2262 let classifiers = project_table
2263 .get("classifiers")
2264 .and_then(|value| value.as_array())
2265 .map(|values| {
2266 values
2267 .iter()
2268 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2269 .collect::<Vec<_>>()
2270 })
2271 .unwrap_or_default();
2272 let (classifier_keywords, license_classifiers) = split_classifiers(&classifiers);
2273
2274 let extracted_license_statement = extract_raw_license_string(&project_table);
2275 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2276 normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2277
2278 let description = project_table
2279 .get(FIELD_DESCRIPTION)
2280 .and_then(|value| value.as_str())
2281 .map(|value| truncate_field(value.to_string()));
2282 let mut keywords = project_table
2283 .get(FIELD_KEYWORDS)
2284 .and_then(|value| value.as_array())
2285 .map(|values| {
2286 values
2287 .iter()
2288 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2289 .collect::<Vec<_>>()
2290 })
2291 .unwrap_or_default();
2292 for classifier in classifier_keywords {
2293 if !keywords.contains(&classifier) {
2294 keywords.push(classifier);
2295 }
2296 }
2297
2298 let mut extra_data = extract_pyproject_extra_data(&toml_content).unwrap_or_default();
2300 let (homepage_url, download_url, bug_tracking_url, code_view_url, repository_url) =
2301 extract_urls(&project_table, &mut extra_data);
2302
2303 let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2304
2305 let purl = name.as_ref().and_then(|n| {
2307 let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2308 Ok(p) => p,
2309 Err(e) => {
2310 warn!(
2311 "Failed to create PackageUrl for Python package '{}': {}",
2312 n, e
2313 );
2314 return None;
2315 }
2316 };
2317
2318 if let Some(v) = &version
2319 && let Err(e) = package_url.with_version(v)
2320 {
2321 warn!(
2322 "Failed to set version '{}' for Python package '{}': {}",
2323 v, n, e
2324 );
2325 return None;
2326 }
2327
2328 Some(package_url.to_string())
2329 });
2330
2331 let api_data_url = name.as_ref().map(|n| {
2332 if let Some(v) = &version {
2333 format!("https://pypi.org/pypi/{}/{}/json", n, v)
2334 } else {
2335 format!("https://pypi.org/pypi/{}/json", n)
2336 }
2337 });
2338
2339 let pypi_homepage_url = name
2340 .as_ref()
2341 .map(|n| format!("https://pypi.org/project/{}", n));
2342
2343 let pypi_download_url = name.as_ref().and_then(|n| {
2344 version.as_ref().map(|v| {
2345 format!(
2346 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2347 &n[..1.min(n.len())],
2348 n,
2349 n,
2350 v
2351 )
2352 })
2353 });
2354
2355 PackageData {
2356 package_type: Some(PythonParser::PACKAGE_TYPE),
2357 namespace: None,
2358 name,
2359 version,
2360 qualifiers: None,
2361 subpath: None,
2362 primary_language: None,
2363 description,
2364 release_date: None,
2365 parties: extract_parties(&project_table),
2366 keywords,
2367 homepage_url: homepage_url.or(pypi_homepage_url),
2368 download_url: download_url
2369 .or_else(|| repository_url.clone())
2370 .or(pypi_download_url),
2371 size: None,
2372 sha1: None,
2373 md5: None,
2374 sha256: None,
2375 sha512: None,
2376 bug_tracking_url,
2377 code_view_url,
2378 vcs_url: repository_url,
2379 copyright: None,
2380 holder: None,
2381 declared_license_expression,
2382 declared_license_expression_spdx,
2383 license_detections,
2384 other_license_expression: None,
2385 other_license_expression_spdx: None,
2386 other_license_detections: Vec::new(),
2387 extracted_license_statement: extracted_license_statement
2388 .or_else(|| build_extracted_license_statement(None, &license_classifiers)),
2389 notice_text: None,
2390 source_packages: Vec::new(),
2391 file_references: Vec::new(),
2392 is_private: has_private_classifier(&classifiers),
2393 is_virtual: false,
2394 extra_data: if extra_data.is_empty() {
2395 None
2396 } else {
2397 Some(extra_data)
2398 },
2399 dependencies: [dependencies, optional_dependencies].concat(),
2400 repository_homepage_url: None,
2401 repository_download_url: None,
2402 api_data_url,
2403 datasource_id: Some(if is_poetry_pyproject {
2404 DatasourceId::PypiPoetryPyprojectToml
2405 } else {
2406 DatasourceId::PypiPyprojectToml
2407 }),
2408 purl,
2409 }
2410}
2411
2412fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2413 let path_str = path.to_string_lossy().replace('\\', "/");
2414 if path_str.contains("/EGG-INFO/PKG-INFO") {
2415 DatasourceId::PypiEggPkginfo
2416 } else if path_str.ends_with(".egg-info/PKG-INFO") {
2417 DatasourceId::PypiEditableEggPkginfo
2418 } else {
2419 DatasourceId::PypiSdistPkginfo
2420 }
2421}
2422
2423fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2424 project
2425 .get(FIELD_LICENSE)
2426 .and_then(|license_value| match license_value {
2427 TomlValue::String(license_str) => Some(license_str.clone()),
2428 TomlValue::Table(license_table) => license_table
2429 .get("text")
2430 .and_then(|v| v.as_str())
2431 .map(|s| s.to_string())
2432 .or_else(|| {
2433 license_table
2434 .get("expression")
2435 .and_then(|v| v.as_str())
2436 .map(|expr| expr.to_string())
2437 }),
2438 _ => None,
2439 })
2440}
2441
2442fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2443 match project.get(FIELD_LICENSE) {
2444 Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2445 Some(TomlValue::Table(license_table)) => license_table
2446 .get("expression")
2447 .and_then(|value| value.as_str()),
2448 _ => None,
2449 }
2450}
2451
2452fn extract_urls(
2453 project: &TomlMap<String, TomlValue>,
2454 extra_data: &mut HashMap<String, serde_json::Value>,
2455) -> ProjectUrls {
2456 let mut homepage_url = None;
2457 let mut download_url = None;
2458 let mut bug_tracking_url = None;
2459 let mut code_view_url = None;
2460 let mut repository_url = None;
2461
2462 if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2464 let parsed_urls: Vec<(String, String)> = urls
2465 .iter()
2466 .filter_map(|(label, value)| {
2467 value
2468 .as_str()
2469 .map(|url| (label.to_string(), url.to_string()))
2470 })
2471 .collect();
2472 apply_project_url_mappings(
2473 &parsed_urls,
2474 &mut homepage_url,
2475 &mut bug_tracking_url,
2476 &mut code_view_url,
2477 &mut repository_url,
2478 extra_data,
2479 );
2480
2481 download_url = urls
2482 .get("Downloads")
2483 .or_else(|| urls.get("downloads"))
2484 .and_then(|v| v.as_str())
2485 .map(String::from);
2486
2487 if homepage_url.is_none() {
2488 homepage_url = urls
2489 .get(FIELD_HOMEPAGE)
2490 .and_then(|v| v.as_str())
2491 .map(String::from);
2492 }
2493 if repository_url.is_none() {
2494 repository_url = urls
2495 .get(FIELD_REPOSITORY)
2496 .and_then(|v| v.as_str())
2497 .map(String::from);
2498 }
2499 }
2500
2501 if homepage_url.is_none() {
2503 homepage_url = project
2504 .get(FIELD_HOMEPAGE)
2505 .and_then(|v| v.as_str())
2506 .map(String::from);
2507 }
2508
2509 if repository_url.is_none() {
2510 repository_url = project
2511 .get(FIELD_REPOSITORY)
2512 .and_then(|v| v.as_str())
2513 .map(String::from);
2514 }
2515
2516 (
2517 homepage_url,
2518 download_url,
2519 bug_tracking_url,
2520 code_view_url,
2521 repository_url,
2522 )
2523}
2524
2525fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2526 let mut parties = Vec::new();
2527
2528 if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2529 for author in authors {
2530 if let Some(author_str) = author.as_str() {
2531 let (name, email) = split_name_email(author_str);
2532 parties.push(Party {
2533 r#type: None,
2534 role: Some("author".to_string()),
2535 name,
2536 email,
2537 url: None,
2538 organization: None,
2539 organization_url: None,
2540 timezone: None,
2541 });
2542 } else if let Some(author_table) = author.as_table() {
2543 let name = author_table
2544 .get("name")
2545 .and_then(|value| value.as_str())
2546 .map(|value| value.to_string());
2547 let email = author_table
2548 .get("email")
2549 .and_then(|value| value.as_str())
2550 .map(|value| value.to_string());
2551 if name.is_some() || email.is_some() {
2552 parties.push(Party {
2553 r#type: None,
2554 role: Some("author".to_string()),
2555 name,
2556 email,
2557 url: None,
2558 organization: None,
2559 organization_url: None,
2560 timezone: None,
2561 });
2562 }
2563 }
2564 }
2565 }
2566
2567 if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2568 for maintainer in maintainers {
2569 if let Some(maintainer_str) = maintainer.as_str() {
2570 let (name, email) = split_name_email(maintainer_str);
2571 parties.push(Party {
2572 r#type: None,
2573 role: Some("maintainer".to_string()),
2574 name,
2575 email,
2576 url: None,
2577 organization: None,
2578 organization_url: None,
2579 timezone: None,
2580 });
2581 } else if let Some(maintainer_table) = maintainer.as_table() {
2582 let name = maintainer_table
2583 .get("name")
2584 .and_then(|value| value.as_str())
2585 .map(|value| value.to_string());
2586 let email = maintainer_table
2587 .get("email")
2588 .and_then(|value| value.as_str())
2589 .map(|value| value.to_string());
2590 if name.is_some() || email.is_some() {
2591 parties.push(Party {
2592 r#type: None,
2593 role: Some("maintainer".to_string()),
2594 name,
2595 email,
2596 url: None,
2597 organization: None,
2598 organization_url: None,
2599 timezone: None,
2600 });
2601 }
2602 }
2603 }
2604 }
2605
2606 parties
2607}
2608
2609fn extract_dependencies(
2610 project: &TomlMap<String, TomlValue>,
2611 toml_content: &TomlValue,
2612) -> (Vec<Dependency>, Vec<Dependency>) {
2613 let mut dependencies = Vec::new();
2614 let mut optional_dependencies = Vec::new();
2615
2616 if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2618 match deps_value {
2619 TomlValue::Array(arr) => {
2620 dependencies = parse_dependency_array(arr, false, None);
2621 }
2622 TomlValue::Table(table) => {
2623 dependencies = parse_dependency_table(table, false, None);
2624 }
2625 _ => {}
2626 }
2627 }
2628
2629 if let Some(opt_deps_table) = project
2631 .get(FIELD_OPTIONAL_DEPENDENCIES)
2632 .and_then(|v| v.as_table())
2633 {
2634 for (extra_name, deps) in opt_deps_table {
2635 match deps {
2636 TomlValue::Array(arr) => {
2637 optional_dependencies.extend(parse_dependency_array(
2638 arr,
2639 true,
2640 Some(extra_name),
2641 ));
2642 }
2643 TomlValue::Table(table) => {
2644 optional_dependencies.extend(parse_dependency_table(
2645 table,
2646 true,
2647 Some(extra_name),
2648 ));
2649 }
2650 _ => {}
2651 }
2652 }
2653 }
2654
2655 if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2657 match dev_deps_value {
2658 TomlValue::Array(arr) => {
2659 optional_dependencies.extend(parse_dependency_array(
2660 arr,
2661 true,
2662 Some(FIELD_DEV_DEPENDENCIES),
2663 ));
2664 }
2665 TomlValue::Table(table) => {
2666 optional_dependencies.extend(parse_dependency_table(
2667 table,
2668 true,
2669 Some(FIELD_DEV_DEPENDENCIES),
2670 ));
2671 }
2672 _ => {}
2673 }
2674 }
2675
2676 if let Some(groups_table) = toml_content
2678 .get("tool")
2679 .and_then(|value| value.as_table())
2680 .and_then(|tool| tool.get("poetry"))
2681 .and_then(|value| value.as_table())
2682 .and_then(|poetry| poetry.get("group"))
2683 .and_then(|value| value.as_table())
2684 {
2685 for (group_name, group_data) in groups_table {
2686 if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2687 match group_deps {
2688 TomlValue::Array(arr) => {
2689 optional_dependencies.extend(parse_dependency_array(
2690 arr,
2691 true,
2692 Some(group_name),
2693 ));
2694 }
2695 TomlValue::Table(table) => {
2696 optional_dependencies.extend(parse_poetry_group_dependency_table(
2697 table,
2698 true,
2699 Some(group_name),
2700 ));
2701 }
2702 _ => {}
2703 }
2704 }
2705 }
2706 }
2707
2708 if let Some(groups_table) = toml_content
2709 .get(FIELD_DEPENDENCY_GROUPS)
2710 .and_then(|value| value.as_table())
2711 {
2712 for (group_name, deps) in groups_table {
2713 match deps {
2714 TomlValue::Array(arr) => {
2715 optional_dependencies.extend(parse_dependency_array(
2716 arr,
2717 true,
2718 Some(group_name),
2719 ));
2720 }
2721 TomlValue::Table(table) => {
2722 optional_dependencies.extend(parse_dependency_table(
2723 table,
2724 true,
2725 Some(group_name),
2726 ));
2727 }
2728 _ => {}
2729 }
2730 }
2731 }
2732
2733 if let Some(dev_deps_value) = toml_content
2734 .get("tool")
2735 .and_then(|value| value.as_table())
2736 .and_then(|tool| tool.get("uv"))
2737 .and_then(|value| value.as_table())
2738 .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2739 {
2740 match dev_deps_value {
2741 TomlValue::Array(arr) => {
2742 optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2743 }
2744 TomlValue::Table(table) => {
2745 optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2746 }
2747 _ => {}
2748 }
2749 }
2750
2751 (dependencies, optional_dependencies)
2752}
2753
2754fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2755 let mut extra_data = HashMap::new();
2756
2757 if let Some(tool_uv) = toml_content
2758 .get("tool")
2759 .and_then(|value| value.as_table())
2760 .and_then(|tool| tool.get("uv"))
2761 {
2762 extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2763 }
2764
2765 if extra_data.is_empty() {
2766 None
2767 } else {
2768 Some(extra_data)
2769 }
2770}
2771
2772fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2773 match value {
2774 TomlValue::String(value) => JsonValue::String(value.clone()),
2775 TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2776 TomlValue::Float(value) => JsonValue::String(value.to_string()),
2777 TomlValue::Boolean(value) => JsonValue::Bool(*value),
2778 TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2779 TomlValue::Array(values) => {
2780 JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2781 }
2782 TomlValue::Table(values) => JsonValue::Object(
2783 values
2784 .iter()
2785 .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2786 .collect::<JsonMap<String, JsonValue>>(),
2787 ),
2788 }
2789}
2790
2791fn parse_dependency_table(
2792 table: &TomlMap<String, TomlValue>,
2793 is_optional: bool,
2794 scope: Option<&str>,
2795) -> Vec<Dependency> {
2796 table
2797 .iter()
2798 .filter_map(|(name, version)| {
2799 let version_str = version.as_str().map(|s| s.to_string());
2800 let mut package_url =
2801 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2802
2803 if let Some(v) = &version_str {
2804 package_url.with_version(v).ok()?;
2805 }
2806
2807 Some(Dependency {
2808 purl: Some(package_url.to_string()),
2809 extracted_requirement: None,
2810 scope: scope.map(|s| s.to_string()),
2811 is_runtime: Some(!is_optional),
2812 is_optional: Some(is_optional),
2813 is_pinned: None,
2814 is_direct: Some(true),
2815 resolved_package: None,
2816 extra_data: None,
2817 })
2818 })
2819 .collect()
2820}
2821
2822fn parse_poetry_group_dependency_table(
2823 table: &TomlMap<String, TomlValue>,
2824 is_optional: bool,
2825 scope: Option<&str>,
2826) -> Vec<Dependency> {
2827 table
2828 .iter()
2829 .filter_map(|(name, value)| build_poetry_group_dependency(name, value, is_optional, scope))
2830 .collect()
2831}
2832
2833fn build_poetry_group_dependency(
2834 name: &str,
2835 value: &TomlValue,
2836 is_optional: bool,
2837 scope: Option<&str>,
2838) -> Option<Dependency> {
2839 let normalized_name = normalize_python_dependency_name(name);
2840 let (version_spec, extras, marker) = match value {
2841 TomlValue::String(spec) => (Some(spec.trim().to_string()), Vec::new(), None),
2842 TomlValue::Table(table) => {
2843 let version_spec = table
2844 .get(FIELD_VERSION)
2845 .and_then(|value| value.as_str())
2846 .map(str::trim)
2847 .filter(|value| !value.is_empty())
2848 .map(ToOwned::to_owned);
2849 let extras = table
2850 .get(FIELD_EXTRAS)
2851 .and_then(|value| value.as_array())
2852 .map(|values| {
2853 values
2854 .iter()
2855 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2856 .collect::<Vec<_>>()
2857 })
2858 .unwrap_or_default();
2859 let marker = table
2860 .get("markers")
2861 .and_then(|value| value.as_str())
2862 .map(str::trim)
2863 .filter(|value| !value.is_empty())
2864 .map(ToOwned::to_owned);
2865
2866 (version_spec, extras, marker)
2867 }
2868 _ => return None,
2869 };
2870
2871 let pinned_version = version_spec
2872 .as_deref()
2873 .and_then(extract_exact_pinned_version);
2874 let purl = build_python_dependency_purl(&normalized_name, pinned_version.as_deref())?;
2875
2876 let mut extra_data = HashMap::new();
2877 if let Some(marker) = marker {
2878 extra_data.insert("marker".to_string(), JsonValue::String(marker));
2879 }
2880 if !extras.is_empty() {
2881 extra_data.insert(
2882 "extras".to_string(),
2883 JsonValue::Array(extras.into_iter().map(JsonValue::String).collect()),
2884 );
2885 }
2886
2887 Some(Dependency {
2888 purl: Some(purl),
2889 extracted_requirement: version_spec,
2890 scope: scope.map(|value| value.to_string()),
2891 is_runtime: Some(!is_optional),
2892 is_optional: Some(is_optional),
2893 is_pinned: Some(pinned_version.is_some()),
2894 is_direct: Some(true),
2895 resolved_package: None,
2896 extra_data: if extra_data.is_empty() {
2897 None
2898 } else {
2899 Some(extra_data)
2900 },
2901 })
2902}
2903
2904fn parse_dependency_array(
2905 array: &[TomlValue],
2906 is_optional: bool,
2907 scope: Option<&str>,
2908) -> Vec<Dependency> {
2909 array
2910 .iter()
2911 .filter_map(|dep| {
2912 let dep_str = dep.as_str()?;
2913 build_pyproject_array_dependency(dep_str, is_optional, scope)
2914 })
2915 .collect()
2916}
2917
2918fn build_pyproject_array_dependency(
2919 dep_str: &str,
2920 is_optional: bool,
2921 scope: Option<&str>,
2922) -> Option<Dependency> {
2923 let parsed = parse_pep508_requirement(dep_str)?;
2924 let name = normalize_python_package_name(&parsed.name);
2925 let pinned_version = parsed
2926 .specifiers
2927 .as_deref()
2928 .and_then(extract_exact_pinned_version);
2929
2930 let purl = build_python_dependency_purl(&name, pinned_version.as_deref())?;
2931
2932 let mut extra_data = HashMap::new();
2933 if let Some(marker) = parsed.marker {
2934 extra_data.insert("marker".to_string(), JsonValue::String(marker));
2935 }
2936 if !parsed.extras.is_empty() {
2937 extra_data.insert(
2938 "extras".to_string(),
2939 JsonValue::Array(parsed.extras.into_iter().map(JsonValue::String).collect()),
2940 );
2941 }
2942
2943 let extracted_requirement = parsed.specifiers.or(parsed.url);
2944
2945 Some(Dependency {
2946 purl: Some(purl),
2947 extracted_requirement: extracted_requirement.clone(),
2948 scope: scope.map(|s| s.to_string()),
2949 is_runtime: Some(!is_optional),
2950 is_optional: Some(is_optional),
2951 is_pinned: Some(pinned_version.is_some()),
2952 is_direct: Some(true),
2953 resolved_package: None,
2954 extra_data: if extra_data.is_empty() {
2955 None
2956 } else {
2957 Some(extra_data)
2958 },
2959 })
2960}
2961
/// Returns the version a specifier string pins exactly, if it does so.
///
/// Only a single `==<version>` or `===<version>` clause counts as a pin:
/// - specifier lists containing a comma are never pins;
/// - any other operator (`>=`, `~=`, `!=`, ...) is not a pin;
/// - `==<prefix>.*` is PEP 440 *prefix matching*, which accepts a whole
///   range of releases, so it is not a pin either. The `===` operator
///   compares strings verbatim and is therefore left untouched.
///
/// Whitespace around the specifier and around the version is ignored.
fn extract_exact_pinned_version(specifiers: &str) -> Option<String> {
    let trimmed = specifiers.trim();
    if trimmed.contains(',') {
        return None;
    }

    // `===` must be tested first because `==` is a prefix of it.
    let (stripped, is_version_matching) = if let Some(version) = trimmed.strip_prefix("===") {
        (version, false)
    } else if let Some(version) = trimmed.strip_prefix("==") {
        (version, true)
    } else {
        return None;
    };

    let version = stripped.trim();
    let is_prefix_match = is_version_matching && version.ends_with(".*");
    if version.is_empty() || is_prefix_match {
        None
    } else {
        Some(version.to_string())
    }
}
2983
/// A Python literal value produced by `LiteralEvaluator` while statically
/// evaluating expressions.
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// A numeric literal; integers are stored as `f64` as well.
    Number(f64),
    /// A boolean literal.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list display whose elements could all be evaluated.
    List(Vec<Value>),
    /// A tuple display whose elements could all be evaluated.
    Tuple(Vec<Value>),
    /// A dictionary with string keys (from a dict display, `dict(...)`
    /// keywords, or an `OrderedDict` call). Insertion order is not kept.
    Dict(HashMap<String, Value>),
}
2994
/// Evaluates a restricted set of literal-like expressions into [`Value`]s,
/// bounded by a recursion depth and a total node budget.
struct LiteralEvaluator {
    /// Named values that a bare name expression resolves to.
    constants: HashMap<String, Value>,
    /// Depth at which `evaluate_expr` stops and returns `None`.
    max_depth: usize,
    /// Number of visited nodes at which `evaluate_expr` stops and returns `None`.
    max_nodes: usize,
    /// Running count of `evaluate_expr` calls that passed the budget check.
    nodes_visited: usize,
}
3001
3002impl LiteralEvaluator {
3003 fn new(constants: HashMap<String, Value>) -> Self {
3004 Self {
3005 constants,
3006 max_depth: MAX_SETUP_PY_AST_DEPTH,
3007 max_nodes: MAX_SETUP_PY_AST_NODES,
3008 nodes_visited: 0,
3009 }
3010 }
3011
3012 fn insert_constant(&mut self, name: String, value: Value) {
3013 self.constants.insert(name, value);
3014 }
3015
3016 fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
3017 if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
3018 return None;
3019 }
3020 self.nodes_visited += 1;
3021
3022 match expr {
3023 ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
3024 Some(Value::String(value.to_str().to_string()))
3025 }
3026 ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
3027 Some(Value::Bool(*value))
3028 }
3029 ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
3030 self.evaluate_number(value)
3031 }
3032 ast::Expr::NoneLiteral(_) => Some(Value::None),
3033 ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
3034 ast::Expr::List(ast::ExprList { elts, .. }) => {
3035 let mut values = Vec::new();
3036 for elt in elts {
3037 values.push(self.evaluate_expr(elt, depth + 1)?);
3038 }
3039 Some(Value::List(values))
3040 }
3041 ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
3042 let mut values = Vec::new();
3043 for elt in elts {
3044 values.push(self.evaluate_expr(elt, depth + 1)?);
3045 }
3046 Some(Value::Tuple(values))
3047 }
3048 ast::Expr::Dict(ast::ExprDict { items, .. }) => {
3049 let mut dict = HashMap::new();
3050 for item in items {
3051 let key_expr = item.key.as_ref()?;
3052 let key_value = self.evaluate_expr(key_expr, depth + 1)?;
3053 let key = value_to_string(&key_value)?;
3054 let value = self.evaluate_expr(&item.value, depth + 1)?;
3055 dict.insert(key, value);
3056 }
3057 Some(Value::Dict(dict))
3058 }
3059 ast::Expr::Call(ast::ExprCall {
3060 func, arguments, ..
3061 }) => {
3062 let args = arguments.args.as_ref();
3063 let keywords = arguments.keywords.as_ref();
3064 if keywords.is_empty()
3065 && let Some(name) = dotted_name(func.as_ref(), depth + 1)
3066 && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
3067 {
3068 return self.evaluate_ordered_dict(args, depth + 1);
3069 }
3070
3071 if !args.is_empty() {
3072 return None;
3073 }
3074
3075 if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
3076 && id == "dict"
3077 {
3078 let mut dict = HashMap::new();
3079 for keyword in keywords {
3080 let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
3081 let value = self.evaluate_expr(&keyword.value, depth + 1)?;
3082 dict.insert(key.to_string(), value);
3083 }
3084 return Some(Value::Dict(dict));
3085 }
3086
3087 None
3088 }
3089 _ => None,
3090 }
3091 }
3092
3093 fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
3094 match number {
3095 ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
3096 ast::Number::Float(value) => Some(Value::Number(*value)),
3097 ast::Number::Complex { .. } => None,
3098 }
3099 }
3100
3101 fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
3102 if args.len() != 1 {
3103 return None;
3104 }
3105
3106 let items = match self.evaluate_expr(&args[0], depth)? {
3107 Value::List(items) | Value::Tuple(items) => items,
3108 _ => return None,
3109 };
3110
3111 let mut dict = HashMap::new();
3112 for item in items {
3113 let Value::Tuple(values) = item else {
3114 return None;
3115 };
3116 if values.len() != 2 {
3117 return None;
3118 }
3119 let key = value_to_string(&values[0])?;
3120 dict.insert(key, values[1].clone());
3121 }
3122
3123 Some(Value::Dict(dict))
3124 }
3125}
3126
/// Names under which the setup function and its providing modules are
/// reachable, as collected by `collect_setup_aliases`.
#[derive(Default)]
struct SetupAliases {
    /// Names treated as the setup function; `collect_setup_aliases` always
    /// includes `setup`.
    setup_names: HashSet<String>,
    /// Maps the local name of an imported setup module (its `as` alias, or the
    /// module name itself) to the module name.
    module_aliases: HashMap<String, String>,
}
3132
3133fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
3134 extract_from_setup_py(path).into_iter().collect()
3135}
3136
3137fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
3138 let content = match read_file_to_string(path, None) {
3139 Ok(content) => content,
3140 Err(e) => {
3141 warn!("Failed to read setup.py at {:?}: {}", path, e);
3142 return Some(default_package_data(path));
3143 }
3144 };
3145
3146 if content.len() > MAX_SETUP_PY_BYTES {
3147 warn!("setup.py too large at {:?}: {} bytes", path, content.len());
3148 let package_data = extract_from_setup_py_regex(&content);
3149 return should_emit_setup_py_package(&package_data).then_some(package_data);
3150 }
3151
3152 let mut package_data = match extract_from_setup_py_ast(&content) {
3153 Ok(Some(data)) => data,
3154 Ok(None) => return Some(default_package_data(path)),
3155 Err(e) => {
3156 warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
3157 extract_from_setup_py_regex(&content)
3158 }
3159 };
3160
3161 if package_data.name.is_none() {
3162 package_data.name = extract_setup_value(&content, "name");
3163 }
3164
3165 if package_data.version.is_none() {
3166 package_data.version = extract_setup_value(&content, "version");
3167 }
3168
3169 if package_data
3170 .version
3171 .as_deref()
3172 .is_some_and(|version| version.trim().is_empty())
3173 {
3174 package_data.version = None;
3175 }
3176
3177 fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
3178 package_data.purl = build_setup_py_purl(
3179 package_data.name.as_deref(),
3180 package_data.version.as_deref(),
3181 );
3182
3183 if should_emit_setup_py_package(&package_data) {
3184 Some(package_data)
3185 } else {
3186 Some(default_package_data(path))
3187 }
3188}
3189
3190fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
3191 package_data.name.is_some()
3192 || package_data.version.is_some()
3193 || package_data.purl.is_some()
3194 || !package_data.dependencies.is_empty()
3195 || package_data.extracted_license_statement.is_some()
3196 || !package_data.license_detections.is_empty()
3197 || !package_data.parties.is_empty()
3198 || package_data.description.is_some()
3199 || package_data.homepage_url.is_some()
3200 || package_data.bug_tracking_url.is_some()
3201 || package_data.code_view_url.is_some()
3202 || package_data.vcs_url.is_some()
3203}
3204
3205fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
3206 if package_data.version.is_some()
3207 && package_data.extracted_license_statement.is_some()
3208 && package_data
3209 .parties
3210 .iter()
3211 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
3212 {
3213 return;
3214 }
3215
3216 let Some(root) = path.parent() else {
3217 return;
3218 };
3219
3220 let dunder_metadata = collect_sibling_dunder_metadata(root, content);
3221
3222 if package_data.version.is_none() {
3223 package_data.version = dunder_metadata.version;
3224 }
3225
3226 if package_data.extracted_license_statement.is_none() {
3227 package_data.extracted_license_statement = dunder_metadata.license;
3228 }
3229
3230 let has_author = package_data
3231 .parties
3232 .iter()
3233 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
3234
3235 if !has_author && let Some(author) = dunder_metadata.author {
3236 package_data.parties.push(Party {
3237 r#type: Some("person".to_string()),
3238 role: Some("author".to_string()),
3239 name: Some(author),
3240 email: None,
3241 url: None,
3242 organization: None,
3243 organization_url: None,
3244 timezone: None,
3245 });
3246 }
3247}
3248
/// Values of module-level dunder assignments gathered by
/// `collect_sibling_dunder_metadata`.
#[derive(Default)]
struct DunderMetadata {
    /// Quoted value assigned to `__version__`, if found.
    version: Option<String>,
    /// Quoted value assigned to `__author__`, if found.
    author: Option<String>,
    /// Quoted value assigned to `__license__`, if found.
    license: Option<String>,
}
3255
3256fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
3257 let statements = match parse_module(content) {
3258 Ok(parsed) => parsed.into_suite(),
3259 Err(_) => return DunderMetadata::default(),
3260 };
3261
3262 let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3263 let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3264 let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3265 let mut metadata = DunderMetadata::default();
3266 let mut candidate_paths = Vec::new();
3267
3268 for module in imported_dunder_modules(&statements) {
3269 let Some(path) = resolve_imported_module_path(root, &module) else {
3270 continue;
3271 };
3272
3273 candidate_paths.push(path);
3274 }
3275
3276 candidate_paths.extend(referenced_dunder_init_paths(root, content));
3277
3278 let mut seen_paths = HashSet::new();
3279 for path in candidate_paths {
3280 if !seen_paths.insert(path.clone()) {
3281 continue;
3282 }
3283
3284 let Ok(module_content) = read_file_to_string(&path, None) else {
3285 continue;
3286 };
3287
3288 if metadata.version.is_none() {
3289 metadata.version = version_re
3290 .as_ref()
3291 .and_then(|regex| regex.captures(&module_content))
3292 .and_then(|captures| captures.get(1))
3293 .map(|match_| match_.as_str().to_string());
3294 }
3295
3296 if metadata.author.is_none() {
3297 metadata.author = author_re
3298 .as_ref()
3299 .and_then(|regex| regex.captures(&module_content))
3300 .and_then(|captures| captures.get(1))
3301 .map(|match_| match_.as_str().to_string());
3302 }
3303
3304 if metadata.license.is_none() {
3305 metadata.license = license_re
3306 .as_ref()
3307 .and_then(|regex| regex.captures(&module_content))
3308 .and_then(|captures| captures.get(1))
3309 .map(|match_| match_.as_str().to_string());
3310 }
3311
3312 if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
3313 return metadata;
3314 }
3315 }
3316
3317 metadata
3318}
3319
3320fn referenced_dunder_init_paths(root: &Path, content: &str) -> Vec<PathBuf> {
3321 let open_re = match Regex::new(r#"open\(\s*['\"]([^'\"]+__init__\.py)['\"]"#) {
3322 Ok(regex) => regex,
3323 Err(_) => return Vec::new(),
3324 };
3325
3326 open_re
3327 .captures_iter(content)
3328 .filter_map(|captures| captures.get(1).map(|m| m.as_str()))
3329 .filter_map(|relative| {
3330 let relative_path = PathBuf::from(relative);
3331 if relative_path.is_absolute()
3332 || relative_path.components().any(|component| {
3333 matches!(
3334 component,
3335 Component::ParentDir | Component::RootDir | Component::Prefix(_)
3336 )
3337 })
3338 {
3339 return None;
3340 }
3341
3342 let candidate = root.join(relative_path);
3343 candidate.exists().then_some(candidate)
3344 })
3345 .collect()
3346}
3347
3348fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
3349 let mut modules = Vec::new();
3350
3351 for statement in statements {
3352 let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
3353 continue;
3354 };
3355 let Some(module) = module.as_ref().map(|name| name.as_str()) else {
3356 continue;
3357 };
3358 let imports_dunder = names.iter().any(|alias| {
3359 matches!(
3360 alias.name.as_str(),
3361 "__version__" | "__author__" | "__license__"
3362 )
3363 });
3364 if imports_dunder {
3365 modules.push(module.to_string());
3366 }
3367 }
3368
3369 modules
3370}
3371
/// Resolves a dotted module name to an existing file below `root`.
///
/// Candidates are probed in this order and the first existing one wins:
/// `<root>/<module>.py`, `<root>/<module>/__init__.py`,
/// `<root>/src/<module>.py`, `<root>/src/<module>/__init__.py`, where the
/// dots in `module` become path separators.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let mut relative = PathBuf::new();
    for segment in module.split('.') {
        relative.push(segment);
    }

    let module_file = relative.with_extension("py");
    let src_root = root.join("src");

    for base in [root, src_root.as_path()] {
        let as_module = base.join(&module_file);
        if as_module.exists() {
            return Some(as_module);
        }

        let as_package = base.join(&relative).join("__init__.py");
        if as_package.exists() {
            return Some(as_package);
        }
    }

    None
}
3383
3384fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
3400 let statements = parse_module(content)
3401 .map(|parsed| parsed.into_suite())
3402 .map_err(|e| e.to_string())?;
3403 let aliases = collect_setup_aliases(&statements);
3404 let mut evaluator = LiteralEvaluator::new(HashMap::new());
3405 build_setup_py_constants(&statements, &mut evaluator);
3406
3407 let setup_call = find_setup_call(&statements, &aliases);
3408 let Some(call_expr) = setup_call else {
3409 return Ok(None);
3410 };
3411
3412 let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3413 Ok(Some(build_setup_py_package_data(&setup_values)))
3414}
3415
3416fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3417 for stmt in statements {
3418 if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3419 if targets.len() != 1 {
3420 continue;
3421 }
3422
3423 let Some(name) = extract_assign_name(&targets[0]) else {
3424 continue;
3425 };
3426
3427 if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3428 evaluator.insert_constant(name, value);
3429 }
3430 }
3431 }
3432}
3433
3434fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3435 match target {
3436 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3437 _ => None,
3438 }
3439}
3440
3441fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3442 let mut aliases = SetupAliases::default();
3443 aliases.setup_names.insert("setup".to_string());
3444
3445 for stmt in statements {
3446 match stmt {
3447 ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3448 for alias in names {
3449 let module_name = alias.name.as_str();
3450 if !is_setup_module(module_name) {
3451 continue;
3452 }
3453 let alias_name = alias
3454 .asname
3455 .as_ref()
3456 .map(|name| name.as_str())
3457 .unwrap_or(module_name);
3458 aliases
3459 .module_aliases
3460 .insert(alias_name.to_string(), module_name.to_string());
3461 }
3462 }
3463 ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3464 let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3465 continue;
3466 };
3467 if !is_setup_module(module_name) {
3468 continue;
3469 }
3470 for alias in names {
3471 if alias.name.as_str() != "setup" {
3472 continue;
3473 }
3474 let alias_name = alias
3475 .asname
3476 .as_ref()
3477 .map(|name| name.as_str())
3478 .unwrap_or("setup");
3479 aliases.setup_names.insert(alias_name.to_string());
3480 }
3481 }
3482 _ => {}
3483 }
3484 }
3485
3486 aliases
3487}
3488
/// Tells whether `module_name` is one of the modules that provide `setup()`:
/// `setuptools`, `distutils` or `distutils.core` (exact, case-sensitive match).
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3492
3493fn find_setup_call<'a>(
3494 statements: &'a [ast::Stmt],
3495 aliases: &'a SetupAliases,
3496) -> Option<&'a ast::Expr> {
3497 let mut finder = SetupCallFinder {
3498 aliases,
3499 called_function_names: collect_top_level_called_function_names(statements),
3500 nodes_visited: 0,
3501 };
3502 finder.find_in_statements(statements)
3503}
3504
3505fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3506 let mut called = HashSet::new();
3507 collect_called_function_names_in_statements(statements, &mut called);
3508 called
3509}
3510
3511fn collect_called_function_names_in_statements(
3512 statements: &[ast::Stmt],
3513 called: &mut HashSet<String>,
3514) {
3515 for stmt in statements {
3516 match stmt {
3517 ast::Stmt::Expr(ast::StmtExpr { value, .. })
3518 | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3519 collect_called_function_names_in_expr(value.as_ref(), called);
3520 }
3521 ast::Stmt::If(ast::StmtIf {
3522 body,
3523 elif_else_clauses,
3524 ..
3525 }) => {
3526 collect_called_function_names_in_statements(body, called);
3527 for clause in elif_else_clauses {
3528 collect_called_function_names_in_statements(&clause.body, called);
3529 }
3530 }
3531 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3532 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3533 collect_called_function_names_in_statements(body, called);
3534 collect_called_function_names_in_statements(orelse, called);
3535 }
3536 ast::Stmt::With(ast::StmtWith { body, .. }) => {
3537 collect_called_function_names_in_statements(body, called);
3538 }
3539 ast::Stmt::Try(ast::StmtTry {
3540 body,
3541 orelse,
3542 finalbody,
3543 handlers,
3544 ..
3545 }) => {
3546 collect_called_function_names_in_statements(body, called);
3547 collect_called_function_names_in_statements(orelse, called);
3548 collect_called_function_names_in_statements(finalbody, called);
3549 for handler in handlers {
3550 let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3551 body,
3552 ..
3553 }) = handler;
3554 collect_called_function_names_in_statements(body, called);
3555 }
3556 }
3557 _ => {}
3558 }
3559 }
3560}
3561
3562fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3563 if let ast::Expr::Call(ast::ExprCall {
3564 func, arguments, ..
3565 }) = expr
3566 {
3567 if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3568 called.insert(id.as_str().to_string());
3569 }
3570
3571 for arg in arguments.args.iter() {
3572 collect_called_function_names_in_expr(arg, called);
3573 }
3574 for keyword in arguments.keywords.iter() {
3575 collect_called_function_names_in_expr(&keyword.value, called);
3576 }
3577 }
3578}
3579
3580struct SetupCallFinder<'a> {
3581 aliases: &'a SetupAliases,
3582 called_function_names: HashSet<String>,
3583 nodes_visited: usize,
3584}
3585
3586impl<'a> SetupCallFinder<'a> {
3587 fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3588 for stmt in statements {
3589 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3590 return None;
3591 }
3592 self.nodes_visited += 1;
3593
3594 let found = match stmt {
3595 ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3596 ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3597 ast::Stmt::If(ast::StmtIf {
3598 body,
3599 elif_else_clauses,
3600 ..
3601 }) => self.find_in_statements(body).or_else(|| {
3602 for clause in elif_else_clauses {
3603 if let Some(found) = self.find_in_statements(&clause.body) {
3604 return Some(found);
3605 }
3606 }
3607 None
3608 }),
3609 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3610 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3611 .find_in_statements(body)
3612 .or_else(|| self.find_in_statements(orelse)),
3613 ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3614 .called_function_names
3615 .contains(name.as_str())
3616 .then(|| self.find_in_statements(body))
3617 .flatten(),
3618 ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3619 ast::Stmt::Try(ast::StmtTry {
3620 body,
3621 orelse,
3622 finalbody,
3623 handlers,
3624 ..
3625 }) => self
3626 .find_in_statements(body)
3627 .or_else(|| self.find_in_statements(orelse))
3628 .or_else(|| self.find_in_statements(finalbody))
3629 .or_else(|| {
3630 for handler in handlers {
3631 let ast::ExceptHandler::ExceptHandler(
3632 ast::ExceptHandlerExceptHandler { body, .. },
3633 ) = handler;
3634 if let Some(found) = self.find_in_statements(body) {
3635 return Some(found);
3636 }
3637 }
3638 None
3639 }),
3640 _ => None,
3641 };
3642
3643 if found.is_some() {
3644 return found;
3645 }
3646 }
3647
3648 None
3649 }
3650
3651 fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3652 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3653 return None;
3654 }
3655 self.nodes_visited += 1;
3656
3657 match expr {
3658 ast::Expr::Call(ast::ExprCall { func, .. })
3659 if is_setup_call(func.as_ref(), self.aliases) =>
3660 {
3661 Some(expr)
3662 }
3663 _ => None,
3664 }
3665 }
3666}
3667
3668fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3669 let Some(dotted) = dotted_name(func, 0) else {
3670 return false;
3671 };
3672
3673 if aliases.setup_names.contains(&dotted) {
3674 return true;
3675 }
3676
3677 let Some(module) = dotted.strip_suffix(".setup") else {
3678 return false;
3679 };
3680
3681 let resolved = resolve_module_alias(module, aliases);
3682 is_setup_module(&resolved)
3683}
3684
3685fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3686 if depth >= MAX_SETUP_PY_AST_DEPTH {
3687 return None;
3688 }
3689
3690 match expr {
3691 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3692 ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3693 let base = dotted_name(value.as_ref(), depth + 1)?;
3694 Some(format!("{}.{}", base, attr.as_str()))
3695 }
3696 _ => None,
3697 }
3698}
3699
3700fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3701 if let Some(mapped) = aliases.module_aliases.get(module) {
3702 return mapped.clone();
3703 }
3704
3705 let Some((base, rest)) = module.split_once('.') else {
3706 return module.to_string();
3707 };
3708
3709 if let Some(mapped) = aliases.module_aliases.get(base) {
3710 return format!("{}.{}", mapped, rest);
3711 }
3712
3713 module.to_string()
3714}
3715
3716fn extract_setup_keywords(
3717 call_expr: &ast::Expr,
3718 evaluator: &mut LiteralEvaluator,
3719) -> HashMap<String, Value> {
3720 let mut values = HashMap::new();
3721 let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3722 return values;
3723 };
3724
3725 for keyword in arguments.keywords.iter() {
3726 if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3727 if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3728 values.insert(arg.to_string(), value);
3729 }
3730 } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3731 for (key, value) in dict {
3732 values.insert(key, value);
3733 }
3734 }
3735 }
3736
3737 values
3738}
3739
3740fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3741 let name = get_value_string(values, "name").map(truncate_field);
3742 let version = get_value_string(values, "version").map(truncate_field);
3743 let description = get_value_string(values, "description")
3744 .or_else(|| get_value_string(values, "summary"))
3745 .map(truncate_field);
3746 let homepage_url = get_value_string(values, "url")
3747 .or_else(|| get_value_string(values, "home_page"))
3748 .map(truncate_field);
3749 let author = get_value_string(values, "author").map(truncate_field);
3750 let author_email = get_value_string(values, "author_email");
3751 let maintainer = get_value_string(values, "maintainer").map(truncate_field);
3752 let maintainer_email = get_value_string(values, "maintainer_email");
3753 let license = get_value_string(values, "license").map(truncate_field);
3754 let classifiers = values
3755 .get("classifiers")
3756 .and_then(value_to_string_list)
3757 .unwrap_or_default();
3758
3759 let mut parties = Vec::new();
3760 if author.is_some() || author_email.is_some() {
3761 parties.push(Party {
3762 r#type: Some("person".to_string()),
3763 role: Some("author".to_string()),
3764 name: author,
3765 email: author_email,
3766 url: None,
3767 organization: None,
3768 organization_url: None,
3769 timezone: None,
3770 });
3771 }
3772
3773 if maintainer.is_some() || maintainer_email.is_some() {
3774 parties.push(Party {
3775 r#type: Some("person".to_string()),
3776 role: Some("maintainer".to_string()),
3777 name: maintainer,
3778 email: maintainer_email,
3779 url: None,
3780 organization: None,
3781 organization_url: None,
3782 timezone: None,
3783 });
3784 }
3785
3786 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3787 normalize_spdx_declared_license(license.as_deref());
3788 let extracted_license_statement = license.clone();
3789
3790 let dependencies = build_setup_py_dependencies(values);
3791 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3792 let mut homepage_from_project_urls = None;
3793 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3794 let mut extra_data = HashMap::new();
3795
3796 if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3797 apply_project_url_mappings(
3798 &parsed_project_urls,
3799 &mut homepage_from_project_urls,
3800 &mut bug_tracking_url,
3801 &mut code_view_url,
3802 &mut vcs_url,
3803 &mut extra_data,
3804 );
3805 }
3806
3807 let extra_data = if extra_data.is_empty() {
3808 None
3809 } else {
3810 Some(extra_data)
3811 };
3812
3813 PackageData {
3814 package_type: Some(PythonParser::PACKAGE_TYPE),
3815 namespace: None,
3816 name,
3817 version,
3818 qualifiers: None,
3819 subpath: None,
3820 primary_language: Some("Python".to_string()),
3821 description,
3822 release_date: None,
3823 parties,
3824 keywords: Vec::new(),
3825 homepage_url: homepage_url.or(homepage_from_project_urls),
3826 download_url: None,
3827 size: None,
3828 sha1: None,
3829 md5: None,
3830 sha256: None,
3831 sha512: None,
3832 bug_tracking_url,
3833 code_view_url,
3834 vcs_url,
3835 copyright: None,
3836 holder: None,
3837 declared_license_expression,
3838 declared_license_expression_spdx,
3839 license_detections,
3840 other_license_expression: None,
3841 other_license_expression_spdx: None,
3842 other_license_detections: Vec::new(),
3843 extracted_license_statement,
3844 notice_text: None,
3845 source_packages: Vec::new(),
3846 file_references: Vec::new(),
3847 is_private: has_private_classifier(&classifiers),
3848 is_virtual: false,
3849 extra_data,
3850 dependencies,
3851 repository_homepage_url: None,
3852 repository_download_url: None,
3853 api_data_url: None,
3854 datasource_id: Some(DatasourceId::PypiSetupPy),
3855 purl,
3856 }
3857}
3858
3859fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3860 let mut dependencies = Vec::new();
3861
3862 if let Some(reqs) = values
3863 .get("install_requires")
3864 .and_then(value_to_string_list)
3865 {
3866 dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3867 }
3868
3869 if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3870 dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3871 }
3872
3873 if let Some(Value::Dict(extras)) = values.get("extras_require") {
3874 let mut extra_items: Vec<_> = extras.iter().collect();
3875 extra_items.sort_by_key(|(name, _)| *name);
3876 for (extra_name, extra_value) in extra_items {
3877 if let Some(reqs) = value_to_string_list(extra_value) {
3878 dependencies.extend(build_setup_py_dependency_list(
3879 reqs.as_slice(),
3880 extra_name,
3881 true,
3882 ));
3883 }
3884 }
3885 }
3886
3887 dependencies
3888}
3889
3890fn build_setup_py_dependency_list(
3891 reqs: &[String],
3892 scope: &str,
3893 is_optional: bool,
3894) -> Vec<Dependency> {
3895 reqs.iter()
3896 .filter_map(|req| build_python_dependency(req, scope, is_optional, None))
3897 .collect()
3898}
3899
3900fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3901 values.get(key).and_then(value_to_string)
3902}
3903
3904fn value_to_string(value: &Value) -> Option<String> {
3905 match value {
3906 Value::String(value) => Some(value.clone()),
3907 Value::Number(value) => Some(value.to_string()),
3908 Value::Bool(value) => Some(value.to_string()),
3909 _ => None,
3910 }
3911}
3912
3913fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3914 match value {
3915 Value::String(value) => Some(vec![value.clone()]),
3916 Value::List(values) | Value::Tuple(values) => {
3917 let mut items = Vec::new();
3918 for item in values {
3919 items.push(value_to_string(item)?);
3920 }
3921 Some(items)
3922 }
3923 _ => None,
3924 }
3925}
3926
3927fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3928 let Value::Dict(dict) = value else {
3929 return None;
3930 };
3931
3932 let mut pairs: Vec<(String, String)> = dict
3933 .iter()
3934 .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3935 .collect::<Option<Vec<_>>>()?;
3936 pairs.sort_by(|left, right| left.0.cmp(&right.0));
3937 Some(pairs)
3938}
3939
3940fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3941 let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3942 extract_requires_dist_dependencies(&requires_dist)
3943}
3944
3945pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3946 requires_dist
3947 .iter()
3948 .filter_map(|entry| build_rfc822_dependency(entry))
3949 .collect()
3950}
3951
3952fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3953 build_python_dependency(entry, "install", false, None)
3954}
3955
3956fn build_python_dependency(
3957 entry: &str,
3958 default_scope: &str,
3959 default_optional: bool,
3960 marker_override: Option<&str>,
3961) -> Option<Dependency> {
3962 let (requirement_part, marker_part) = entry
3963 .split_once(';')
3964 .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3965 .unwrap_or((entry.trim(), None));
3966
3967 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3968 let requirement = normalize_rfc822_requirement(requirement_part);
3969 let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3970 marker_part.or(marker_override),
3971 default_scope,
3972 default_optional,
3973 );
3974 let purl = build_python_dependency_purl(&name, None)?;
3975
3976 let is_pinned = requirement
3977 .as_deref()
3978 .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3979 let purl = if is_pinned {
3980 requirement
3981 .as_deref()
3982 .map(|req| req.trim_start_matches('='))
3983 .and_then(|version| build_python_dependency_purl(&name, Some(version)))
3984 .unwrap_or(purl)
3985 } else {
3986 purl
3987 };
3988
3989 let mut extra_data = HashMap::new();
3990 extra_data.extend(marker_data);
3991 if let Some(marker) = marker {
3992 extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3993 }
3994
3995 Some(Dependency {
3996 purl: Some(purl),
3997 extracted_requirement: requirement,
3998 scope: Some(scope),
3999 is_runtime: Some(true),
4000 is_optional: Some(is_optional),
4001 is_pinned: Some(is_pinned),
4002 is_direct: Some(true),
4003 resolved_package: None,
4004 extra_data: if extra_data.is_empty() {
4005 None
4006 } else {
4007 Some(extra_data)
4008 },
4009 })
4010}
4011
4012fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
4013 let name = extract_setup_cfg_dependency_name(requirement_part)?;
4014 let trimmed = requirement_part.trim();
4015 let mut remainder = trimmed[name.len()..].trim();
4016
4017 if let Some(stripped) = remainder.strip_prefix('[')
4018 && let Some(end_idx) = stripped.find(']')
4019 {
4020 remainder = stripped[end_idx + 1..].trim();
4021 }
4022
4023 let remainder = remainder
4024 .strip_prefix('(')
4025 .and_then(|value| value.strip_suffix(')'))
4026 .unwrap_or(remainder)
4027 .trim();
4028
4029 if remainder.is_empty() {
4030 return None;
4031 }
4032
4033 let mut specifiers: Vec<String> = remainder
4034 .split(',')
4035 .map(|specifier| specifier.trim().replace(' ', ""))
4036 .filter(|specifier| !specifier.is_empty())
4037 .collect();
4038 specifiers.sort();
4039 Some(specifiers.join(","))
4040}
4041
/// Percent-encodes every `*` in a version as `%2A` so wildcard versions such as
/// `1.*` can be embedded in a purl. All other characters are copied unchanged.
fn encode_python_dependency_purl_version(version: &str) -> String {
    let mut encoded = String::with_capacity(version.len());
    for ch in version.chars() {
        if ch == '*' {
            encoded.push_str("%2A");
        } else {
            encoded.push(ch);
        }
    }
    encoded
}
4045
4046fn build_python_dependency_purl(name: &str, version: Option<&str>) -> Option<String> {
4047 let normalized_name = normalize_python_dependency_name(name);
4048
4049 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &normalized_name)
4050 .ok()
4051 .map(|_| match version {
4052 Some(version) => {
4053 format!(
4054 "pkg:pypi/{normalized_name}@{}",
4055 encode_python_dependency_purl_version(version)
4056 )
4057 }
4058 None => format!("pkg:pypi/{normalized_name}"),
4059 })
4060}
4061
/// Normalizes a distribution name for use in a purl: surrounding whitespace is
/// trimmed, ASCII letters are lower-cased and every `_` becomes `-`.
fn normalize_python_dependency_name(name: &str) -> String {
    name.trim()
        .chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
4065
4066fn parse_rfc822_marker(
4067 marker_part: Option<&str>,
4068 default_scope: &str,
4069 default_optional: bool,
4070) -> (
4071 String,
4072 bool,
4073 Option<String>,
4074 HashMap<String, serde_json::Value>,
4075) {
4076 let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
4077 return (
4078 default_scope.to_string(),
4079 default_optional,
4080 None,
4081 HashMap::new(),
4082 );
4083 };
4084
4085 let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
4086 .expect("extra marker regex should compile");
4087 let mut extra_data = HashMap::new();
4088
4089 if let Some(python_version) = extract_marker_field(marker, "python_version") {
4090 extra_data.insert(
4091 "python_version".to_string(),
4092 serde_json::Value::String(python_version),
4093 );
4094 }
4095 if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
4096 extra_data.insert(
4097 "sys_platform".to_string(),
4098 serde_json::Value::String(sys_platform),
4099 );
4100 }
4101
4102 if let Some(captures) = extra_re.captures(marker)
4103 && let Some(scope) = captures.get(1)
4104 {
4105 return (
4106 scope.as_str().to_string(),
4107 true,
4108 Some(marker.trim().to_string()),
4109 extra_data,
4110 );
4111 }
4112
4113 (
4114 default_scope.to_string(),
4115 default_optional,
4116 Some(marker.trim().to_string()),
4117 extra_data,
4118 )
4119}
4120
4121fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
4122 let re = Regex::new(&format!(
4123 r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
4124 field
4125 ))
4126 .ok()?;
4127 let captures = re.captures(marker)?;
4128 let operator = captures.get(1)?.as_str();
4129 let value = captures.get(2)?.as_str();
4130 Some(format!("{} {}", operator, value))
4131}
4132
4133fn parse_requires_txt(content: &str) -> Vec<Dependency> {
4134 let mut dependencies = Vec::new();
4135 let mut current_scope = "install".to_string();
4136 let mut current_optional = false;
4137 let mut current_marker: Option<String> = None;
4138 let mut line_count = 0usize;
4139
4140 for line in content.lines() {
4141 line_count += 1;
4142 if line_count > MAX_ITERATION_COUNT {
4143 warn!(
4144 "Exceeded max line count in requires.txt; stopping at {} lines",
4145 MAX_ITERATION_COUNT
4146 );
4147 break;
4148 }
4149 let trimmed = line.trim();
4150 if trimmed.is_empty() || trimmed.starts_with('#') {
4151 continue;
4152 }
4153
4154 if trimmed.starts_with('[') && trimmed.ends_with(']') {
4155 let inner = &trimmed[1..trimmed.len() - 1];
4156 if let Some(rest) = inner.strip_prefix(':') {
4157 current_scope = "install".to_string();
4158 current_optional = false;
4159 current_marker = Some(rest.trim().to_string());
4160 } else if let Some((scope, marker)) = inner.split_once(':') {
4161 current_scope = scope.trim().to_string();
4162 current_optional = true;
4163 current_marker = Some(marker.trim().to_string());
4164 } else {
4165 current_scope = inner.trim().to_string();
4166 current_optional = true;
4167 current_marker = None;
4168 }
4169 continue;
4170 }
4171
4172 if let Some(dependency) = build_python_dependency(
4173 trimmed,
4174 ¤t_scope,
4175 current_optional,
4176 current_marker.as_deref(),
4177 ) {
4178 dependencies.push(dependency);
4179 }
4180 }
4181
4182 dependencies
4183}
4184
/// Reports whether the classifiers include `Private :: Do Not Upload`
/// (compared ASCII case-insensitively, without trimming).
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
4190
4191fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
4192 let name = name?;
4193 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
4194 if let Some(version) = version {
4195 package_url.with_version(version).ok()?;
4196 }
4197 Some(package_url.to_string())
4198}
4199
4200fn extract_from_setup_py_regex(content: &str) -> PackageData {
4201 let name = extract_setup_value(content, "name").map(truncate_field);
4202 let version = extract_setup_value(content, "version").map(truncate_field);
4203 let license_expression = extract_setup_value(content, "license").map(truncate_field);
4204
4205 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4206 normalize_spdx_declared_license(license_expression.as_deref());
4207 let extracted_license_statement = license_expression.clone();
4208
4209 let dependencies = extract_setup_py_dependencies(content);
4210 let homepage_url = extract_setup_value(content, "url").map(truncate_field);
4211 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
4212
4213 PackageData {
4214 package_type: Some(PythonParser::PACKAGE_TYPE),
4215 namespace: None,
4216 name,
4217 version,
4218 qualifiers: None,
4219 subpath: None,
4220 primary_language: Some("Python".to_string()),
4221 description: None,
4222 release_date: None,
4223 parties: Vec::new(),
4224 keywords: Vec::new(),
4225 homepage_url,
4226 download_url: None,
4227 size: None,
4228 sha1: None,
4229 md5: None,
4230 sha256: None,
4231 sha512: None,
4232 bug_tracking_url: None,
4233 code_view_url: None,
4234 vcs_url: None,
4235 copyright: None,
4236 holder: None,
4237 declared_license_expression,
4238 declared_license_expression_spdx,
4239 license_detections,
4240 other_license_expression: None,
4241 other_license_expression_spdx: None,
4242 other_license_detections: Vec::new(),
4243 extracted_license_statement,
4244 notice_text: None,
4245 source_packages: Vec::new(),
4246 file_references: Vec::new(),
4247 is_private: false,
4248 is_virtual: false,
4249 extra_data: None,
4250 dependencies,
4251 repository_homepage_url: None,
4252 repository_download_url: None,
4253 api_data_url: None,
4254 datasource_id: Some(DatasourceId::PypiSetupPy),
4255 purl,
4256 }
4257}
4258
4259fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
4260 crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
4261}
4262
4263fn extract_from_pypi_json(path: &Path) -> PackageData {
4264 let default = PackageData {
4265 package_type: Some(PythonParser::PACKAGE_TYPE),
4266 datasource_id: Some(DatasourceId::PypiJson),
4267 ..Default::default()
4268 };
4269
4270 let content = match read_file_to_string(path, None) {
4271 Ok(content) => content,
4272 Err(error) => {
4273 warn!("Failed to read pypi.json at {:?}: {}", path, error);
4274 return default;
4275 }
4276 };
4277
4278 let root: serde_json::Value = match serde_json::from_str(&content) {
4279 Ok(value) => value,
4280 Err(error) => {
4281 warn!("Failed to parse pypi.json at {:?}: {}", path, error);
4282 return default;
4283 }
4284 };
4285
4286 let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
4287 warn!("No info object found in pypi.json at {:?}", path);
4288 return default;
4289 };
4290
4291 let name = info
4292 .get("name")
4293 .and_then(|value| value.as_str())
4294 .map(|v| truncate_field(v.to_owned()));
4295 let version = info
4296 .get("version")
4297 .and_then(|value| value.as_str())
4298 .map(ToOwned::to_owned);
4299 let summary = info
4300 .get("summary")
4301 .and_then(|value| value.as_str())
4302 .map(|v| truncate_field(v.to_owned()));
4303 let description = info
4304 .get("description")
4305 .and_then(|value| value.as_str())
4306 .filter(|value| !value.trim().is_empty())
4307 .map(|v| truncate_field(v.to_owned()))
4308 .or(summary);
4309 let mut homepage_url = info
4310 .get("home_page")
4311 .and_then(|value| value.as_str())
4312 .map(|v| truncate_field(v.to_owned()));
4313 let author = info
4314 .get("author")
4315 .and_then(|value| value.as_str())
4316 .filter(|value| !value.trim().is_empty())
4317 .map(|v| truncate_field(v.to_owned()));
4318 let author_email = info
4319 .get("author_email")
4320 .and_then(|value| value.as_str())
4321 .filter(|value| !value.trim().is_empty())
4322 .map(ToOwned::to_owned);
4323 let license = info
4324 .get("license")
4325 .and_then(|value| value.as_str())
4326 .filter(|value| !value.trim().is_empty())
4327 .map(ToOwned::to_owned);
4328 let keywords = parse_setup_cfg_keywords(
4329 info.get("keywords")
4330 .and_then(|value| value.as_str())
4331 .map(ToOwned::to_owned),
4332 );
4333 let classifiers = info
4334 .get("classifiers")
4335 .and_then(|value| value.as_array())
4336 .map(|values| {
4337 values
4338 .iter()
4339 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
4340 .collect::<Vec<_>>()
4341 })
4342 .unwrap_or_default();
4343
4344 let mut parties = Vec::new();
4345 if author.is_some() || author_email.is_some() {
4346 parties.push(Party {
4347 r#type: Some("person".to_string()),
4348 role: Some("author".to_string()),
4349 name: author,
4350 email: author_email,
4351 url: None,
4352 organization: None,
4353 organization_url: None,
4354 timezone: None,
4355 });
4356 }
4357
4358 let mut bug_tracking_url = None;
4359 let mut code_view_url = None;
4360 let mut vcs_url = None;
4361 let mut extra_data = HashMap::new();
4362
4363 let parsed_project_urls = info
4364 .get("project_urls")
4365 .and_then(|value| value.as_object())
4366 .map(|map| {
4367 let mut pairs: Vec<(String, String)> = map
4368 .iter()
4369 .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
4370 .collect();
4371 pairs.sort_by(|left, right| left.0.cmp(&right.0));
4372 pairs
4373 })
4374 .unwrap_or_default();
4375
4376 apply_project_url_mappings(
4377 &parsed_project_urls,
4378 &mut homepage_url,
4379 &mut bug_tracking_url,
4380 &mut code_view_url,
4381 &mut vcs_url,
4382 &mut extra_data,
4383 );
4384
4385 let (download_url, size, sha256) = root
4386 .get("urls")
4387 .and_then(|value| value.as_array())
4388 .map(|urls| select_pypi_json_artifact(urls))
4389 .unwrap_or((None, None, None));
4390
4391 let sha256 = sha256.and_then(|h| Sha256Digest::from_hex(&h).ok());
4392
4393 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4394 normalize_spdx_declared_license(license.as_deref());
4395 let dependencies = info
4396 .get("requires_dist")
4397 .and_then(|value| value.as_array())
4398 .map(|entries| {
4399 entries
4400 .iter()
4401 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4402 .collect::<Vec<_>>()
4403 })
4404 .map(|entries| extract_requires_dist_dependencies(&entries))
4405 .unwrap_or_default();
4406
4407 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
4408 build_pypi_urls(name.as_deref(), version.as_deref());
4409
4410 PackageData {
4411 package_type: Some(PythonParser::PACKAGE_TYPE),
4412 namespace: None,
4413 name,
4414 version,
4415 qualifiers: None,
4416 subpath: None,
4417 primary_language: None,
4418 description,
4419 release_date: None,
4420 parties,
4421 keywords,
4422 homepage_url: homepage_url.or(repository_homepage_url.clone()),
4423 download_url,
4424 size,
4425 sha1: None,
4426 md5: None,
4427 sha256,
4428 sha512: None,
4429 bug_tracking_url,
4430 code_view_url,
4431 vcs_url,
4432 copyright: None,
4433 holder: None,
4434 declared_license_expression,
4435 declared_license_expression_spdx,
4436 license_detections,
4437 other_license_expression: None,
4438 other_license_expression_spdx: None,
4439 other_license_detections: Vec::new(),
4440 extracted_license_statement: license,
4441 notice_text: None,
4442 source_packages: Vec::new(),
4443 file_references: Vec::new(),
4444 is_private: has_private_classifier(&classifiers),
4445 is_virtual: false,
4446 extra_data: if extra_data.is_empty() {
4447 None
4448 } else {
4449 Some(extra_data)
4450 },
4451 dependencies,
4452 repository_homepage_url,
4453 repository_download_url,
4454 api_data_url,
4455 datasource_id: Some(DatasourceId::PypiJson),
4456 purl,
4457 }
4458}
4459
4460fn select_pypi_json_artifact(
4461 urls: &[serde_json::Value],
4462) -> (Option<String>, Option<u64>, Option<String>) {
4463 let selected = urls
4464 .iter()
4465 .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4466 .or_else(|| urls.first());
4467
4468 let Some(entry) = selected else {
4469 return (None, None, None);
4470 };
4471
4472 let download_url = entry
4473 .get("url")
4474 .and_then(|value| value.as_str())
4475 .map(ToOwned::to_owned);
4476 let size = entry.get("size").and_then(|value| value.as_u64());
4477 let sha256 = entry
4478 .get("digests")
4479 .and_then(|value| value.as_object())
4480 .and_then(|digests| digests.get("sha256"))
4481 .and_then(|value| value.as_str())
4482 .map(ToOwned::to_owned);
4483
4484 (download_url, size, sha256)
4485}
4486
4487fn extract_from_pip_inspect(path: &Path) -> PackageData {
4488 let content = match read_file_to_string(path, None) {
4489 Ok(content) => content,
4490 Err(e) => {
4491 warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4492 return default_package_data(path);
4493 }
4494 };
4495
4496 let root: serde_json::Value = match serde_json::from_str(&content) {
4497 Ok(value) => value,
4498 Err(e) => {
4499 warn!(
4500 "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4501 path, e
4502 );
4503 return default_package_data(path);
4504 }
4505 };
4506
4507 let installed = match root.get("installed").and_then(|v| v.as_array()) {
4508 Some(arr) => arr,
4509 None => {
4510 warn!(
4511 "No 'installed' array found in pip-inspect.deplock at {:?}",
4512 path
4513 );
4514 return default_package_data(path);
4515 }
4516 };
4517
4518 let pip_version = root
4519 .get("pip_version")
4520 .and_then(|v| v.as_str())
4521 .map(String::from);
4522 let inspect_version = root
4523 .get("version")
4524 .and_then(|v| v.as_str())
4525 .map(String::from);
4526
4527 let mut main_package: Option<PackageData> = None;
4528 let mut dependencies: Vec<Dependency> = Vec::new();
4529
4530 for package_entry in installed {
4531 let metadata = match package_entry.get("metadata") {
4532 Some(m) => m,
4533 None => continue,
4534 };
4535
4536 let is_requested = package_entry
4537 .get("requested")
4538 .and_then(|v| v.as_bool())
4539 .unwrap_or(false);
4540 let has_direct_url = package_entry.get("direct_url").is_some();
4541
4542 let name = metadata
4543 .get("name")
4544 .and_then(|v| v.as_str())
4545 .map(|v| truncate_field(v.to_string()));
4546 let version = metadata
4547 .get("version")
4548 .and_then(|v| v.as_str())
4549 .map(String::from);
4550 let summary = metadata
4551 .get("summary")
4552 .and_then(|v| v.as_str())
4553 .map(|v| truncate_field(v.to_string()));
4554 let home_page = metadata
4555 .get("home_page")
4556 .and_then(|v| v.as_str())
4557 .map(|v| truncate_field(v.to_string()));
4558 let author = metadata
4559 .get("author")
4560 .and_then(|v| v.as_str())
4561 .map(|v| truncate_field(v.to_string()));
4562 let author_email = metadata
4563 .get("author_email")
4564 .and_then(|v| v.as_str())
4565 .map(String::from);
4566 let license = metadata
4567 .get("license")
4568 .and_then(|v| v.as_str())
4569 .map(|v| truncate_field(v.to_string()));
4570 let description = metadata
4571 .get("description")
4572 .and_then(|v| v.as_str())
4573 .map(|v| truncate_field(v.to_string()));
4574 let keywords = metadata
4575 .get("keywords")
4576 .and_then(|v| v.as_array())
4577 .map(|arr| {
4578 arr.iter()
4579 .filter_map(|k| k.as_str().map(String::from))
4580 .collect::<Vec<_>>()
4581 })
4582 .unwrap_or_default();
4583
4584 let mut parties = Vec::new();
4585 if author.is_some() || author_email.is_some() {
4586 parties.push(Party {
4587 r#type: Some("person".to_string()),
4588 role: Some("author".to_string()),
4589 name: author,
4590 email: author_email,
4591 url: None,
4592 organization: None,
4593 organization_url: None,
4594 timezone: None,
4595 });
4596 }
4597
4598 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4599 normalize_spdx_declared_license(license.as_deref());
4600 let extracted_license_statement = license.clone();
4601 let requires_dist = metadata
4602 .get("requires_dist")
4603 .and_then(|v| v.as_array())
4604 .map(|entries| {
4605 entries
4606 .iter()
4607 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4608 .collect::<Vec<_>>()
4609 })
4610 .unwrap_or_default();
4611 let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4612
4613 let purl = name.as_ref().and_then(|n| {
4614 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4615 if let Some(v) = &version {
4616 package_url.with_version(v).ok()?;
4617 }
4618 Some(package_url.to_string())
4619 });
4620
4621 if is_requested && has_direct_url {
4622 let mut extra_data = HashMap::new();
4623 if let Some(pv) = &pip_version {
4624 extra_data.insert(
4625 "pip_version".to_string(),
4626 serde_json::Value::String(pv.clone()),
4627 );
4628 }
4629 if let Some(iv) = &inspect_version {
4630 extra_data.insert(
4631 "inspect_version".to_string(),
4632 serde_json::Value::String(iv.clone()),
4633 );
4634 }
4635
4636 main_package = Some(PackageData {
4637 package_type: Some(PythonParser::PACKAGE_TYPE),
4638 namespace: None,
4639 name,
4640 version,
4641 qualifiers: None,
4642 subpath: None,
4643 primary_language: Some("Python".to_string()),
4644 description: description.or(summary),
4645 release_date: None,
4646 parties,
4647 keywords,
4648 homepage_url: home_page,
4649 download_url: None,
4650 size: None,
4651 sha1: None,
4652 md5: None,
4653 sha256: None,
4654 sha512: None,
4655 bug_tracking_url: None,
4656 code_view_url: None,
4657 vcs_url: None,
4658 copyright: None,
4659 holder: None,
4660 declared_license_expression,
4661 declared_license_expression_spdx,
4662 license_detections,
4663 other_license_expression: None,
4664 other_license_expression_spdx: None,
4665 other_license_detections: Vec::new(),
4666 extracted_license_statement,
4667 notice_text: None,
4668 source_packages: Vec::new(),
4669 file_references: Vec::new(),
4670 is_private: false,
4671 is_virtual: true,
4672 extra_data: if extra_data.is_empty() {
4673 None
4674 } else {
4675 Some(extra_data)
4676 },
4677 dependencies: parsed_dependencies,
4678 repository_homepage_url: None,
4679 repository_download_url: None,
4680 api_data_url: None,
4681 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4682 purl,
4683 });
4684 } else {
4685 let resolved_package = PackageData {
4686 package_type: Some(PythonParser::PACKAGE_TYPE),
4687 namespace: None,
4688 name: name.clone(),
4689 version: version.clone(),
4690 qualifiers: None,
4691 subpath: None,
4692 primary_language: Some("Python".to_string()),
4693 description: description.or(summary),
4694 release_date: None,
4695 parties,
4696 keywords,
4697 homepage_url: home_page,
4698 download_url: None,
4699 size: None,
4700 sha1: None,
4701 md5: None,
4702 sha256: None,
4703 sha512: None,
4704 bug_tracking_url: None,
4705 code_view_url: None,
4706 vcs_url: None,
4707 copyright: None,
4708 holder: None,
4709 declared_license_expression,
4710 declared_license_expression_spdx,
4711 license_detections,
4712 other_license_expression: None,
4713 other_license_expression_spdx: None,
4714 other_license_detections: Vec::new(),
4715 extracted_license_statement,
4716 notice_text: None,
4717 source_packages: Vec::new(),
4718 file_references: Vec::new(),
4719 is_private: false,
4720 is_virtual: true,
4721 extra_data: None,
4722 dependencies: parsed_dependencies,
4723 repository_homepage_url: None,
4724 repository_download_url: None,
4725 api_data_url: None,
4726 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4727 purl: purl.clone(),
4728 };
4729
4730 let resolved = package_data_to_resolved(&resolved_package);
4731 dependencies.push(Dependency {
4732 purl,
4733 extracted_requirement: None,
4734 scope: None,
4735 is_runtime: Some(true),
4736 is_optional: Some(false),
4737 is_pinned: Some(true),
4738 is_direct: Some(is_requested),
4739 resolved_package: Some(Box::new(resolved)),
4740 extra_data: None,
4741 });
4742 }
4743 }
4744
4745 if let Some(mut main_pkg) = main_package {
4746 let direct_requirement_purls: HashSet<String> = main_pkg
4747 .dependencies
4748 .iter()
4749 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4750 .collect();
4751
4752 let resolved_requirement_purls: HashSet<String> = dependencies
4753 .iter()
4754 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4755 .collect();
4756
4757 let unresolved_dependencies = main_pkg
4758 .dependencies
4759 .iter()
4760 .filter(|dep| {
4761 dep.purl.as_ref().is_some_and(|purl| {
4762 !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4763 })
4764 })
4765 .cloned()
4766 .collect::<Vec<_>>();
4767
4768 for dependency in &mut dependencies {
4769 if dependency
4770 .purl
4771 .as_ref()
4772 .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4773 {
4774 dependency.is_direct = Some(true);
4775 }
4776 }
4777
4778 main_pkg.dependencies = dependencies;
4779 main_pkg.dependencies.extend(unresolved_dependencies);
4780 main_pkg
4781 } else {
4782 default_package_data(path)
4783 }
4784}
4785
/// Returns `purl` cut off before its first `@`, i.e. without the version and
/// anything following it. A purl that contains no `@` is returned unchanged.
fn base_dependency_purl(purl: &str) -> String {
    match purl.find('@') {
        Some(at) => purl[..at].to_owned(),
        None => purl.to_owned(),
    }
}
4791
/// Parsed `setup.cfg` contents: section name → key → values in file order.
///
/// `parse_setup_cfg` stores section and key names ASCII-lowercased; a key may
/// map to an empty list when it was declared without any value.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4793
4794fn extract_from_setup_cfg(path: &Path) -> PackageData {
4795 let content = match read_file_to_string(path, None) {
4796 Ok(content) => content,
4797 Err(e) => {
4798 warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4799 return default_package_data(path);
4800 }
4801 };
4802
4803 let sections = parse_setup_cfg(&content);
4804 let name = get_ini_value(§ions, "metadata", "name").map(truncate_field);
4805 let version = get_ini_value(§ions, "metadata", "version").map(truncate_field);
4806 let description = get_ini_value(§ions, "metadata", "description").map(truncate_field);
4807 let author = get_ini_value(§ions, "metadata", "author").map(truncate_field);
4808 let author_email = get_ini_value(§ions, "metadata", "author_email");
4809 let maintainer = get_ini_value(§ions, "metadata", "maintainer").map(truncate_field);
4810 let maintainer_email = get_ini_value(§ions, "metadata", "maintainer_email");
4811 let license = get_ini_value(§ions, "metadata", "license").map(truncate_field);
4812 let mut homepage_url = get_ini_value(§ions, "metadata", "url").map(truncate_field);
4813 let classifiers = get_ini_values(§ions, "metadata", "classifiers");
4814 let keywords = parse_setup_cfg_keywords(get_ini_value(§ions, "metadata", "keywords"));
4815 let python_requires = get_ini_value(§ions, "options", "python_requires");
4816 let parsed_project_urls =
4817 parse_setup_cfg_project_urls(&get_ini_values(§ions, "metadata", "project_urls"));
4818 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4819 let mut extra_data = HashMap::new();
4820
4821 let mut parties = Vec::new();
4822 if author.is_some() || author_email.is_some() {
4823 parties.push(Party {
4824 r#type: Some("person".to_string()),
4825 role: Some("author".to_string()),
4826 name: author,
4827 email: author_email,
4828 url: None,
4829 organization: None,
4830 organization_url: None,
4831 timezone: None,
4832 });
4833 }
4834
4835 if maintainer.is_some() || maintainer_email.is_some() {
4836 parties.push(Party {
4837 r#type: Some("person".to_string()),
4838 role: Some("maintainer".to_string()),
4839 name: maintainer,
4840 email: maintainer_email,
4841 url: None,
4842 organization: None,
4843 organization_url: None,
4844 timezone: None,
4845 });
4846 }
4847
4848 let declared_license_expression = None;
4849 let declared_license_expression_spdx = None;
4850 let license_detections = Vec::new();
4851 let extracted_license_statement = license.clone();
4852
4853 let dependencies = extract_setup_cfg_dependencies(§ions);
4854
4855 if let Some(value) = python_requires {
4856 extra_data.insert(
4857 "python_requires".to_string(),
4858 serde_json::Value::String(value),
4859 );
4860 }
4861
4862 apply_project_url_mappings(
4863 &parsed_project_urls,
4864 &mut homepage_url,
4865 &mut bug_tracking_url,
4866 &mut code_view_url,
4867 &mut vcs_url,
4868 &mut extra_data,
4869 );
4870
4871 let extra_data = if extra_data.is_empty() {
4872 None
4873 } else {
4874 Some(extra_data)
4875 };
4876
4877 let purl = name.as_ref().and_then(|n| {
4878 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4879 if let Some(v) = &version {
4880 package_url.with_version(v).ok()?;
4881 }
4882 Some(package_url.to_string())
4883 });
4884
4885 PackageData {
4886 package_type: Some(PythonParser::PACKAGE_TYPE),
4887 namespace: None,
4888 name,
4889 version,
4890 qualifiers: None,
4891 subpath: None,
4892 primary_language: Some("Python".to_string()),
4893 description,
4894 release_date: None,
4895 parties,
4896 keywords,
4897 homepage_url,
4898 download_url: None,
4899 size: None,
4900 sha1: None,
4901 md5: None,
4902 sha256: None,
4903 sha512: None,
4904 bug_tracking_url,
4905 code_view_url,
4906 vcs_url,
4907 copyright: None,
4908 holder: None,
4909 declared_license_expression,
4910 declared_license_expression_spdx,
4911 license_detections,
4912 other_license_expression: None,
4913 other_license_expression_spdx: None,
4914 other_license_detections: Vec::new(),
4915 extracted_license_statement,
4916 notice_text: None,
4917 source_packages: Vec::new(),
4918 file_references: Vec::new(),
4919 is_private: has_private_classifier(&classifiers),
4920 is_virtual: false,
4921 extra_data,
4922 dependencies,
4923 repository_homepage_url: None,
4924 repository_download_url: None,
4925 api_data_url: None,
4926 datasource_id: Some(DatasourceId::PypiSetupCfg),
4927 purl,
4928 }
4929}
4930
/// Splits a comma-separated `keywords` value into trimmed, non-empty entries.
///
/// Returns an empty vector when `value` is `None`.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut keywords = Vec::new();

    if let Some(raw) = value {
        for piece in raw.split(',') {
            let keyword = piece.trim();
            if !keyword.is_empty() {
                keywords.push(keyword.to_string());
            }
        }
    }

    keywords
}
4943
/// Parses `label = url` lines of a `project_urls` value into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries
/// without `=`, or with an empty label or URL, are dropped. Input order is kept.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };

        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }

    pairs
}
4959
4960fn apply_project_url_mappings(
4961 parsed_urls: &[(String, String)],
4962 homepage_url: &mut Option<String>,
4963 bug_tracking_url: &mut Option<String>,
4964 code_view_url: &mut Option<String>,
4965 vcs_url: &mut Option<String>,
4966 extra_data: &mut HashMap<String, serde_json::Value>,
4967) {
4968 for (label, url) in parsed_urls {
4969 let label_lower = label.to_lowercase();
4970
4971 if bug_tracking_url.is_none()
4972 && matches!(
4973 label_lower.as_str(),
4974 "tracker"
4975 | "bug reports"
4976 | "bug tracker"
4977 | "issues"
4978 | "issue tracker"
4979 | "github: issues"
4980 )
4981 {
4982 *bug_tracking_url = Some(url.clone());
4983 } else if code_view_url.is_none()
4984 && matches!(label_lower.as_str(), "source" | "source code" | "code")
4985 {
4986 *code_view_url = Some(url.clone());
4987 } else if vcs_url.is_none()
4988 && matches!(
4989 label_lower.as_str(),
4990 "github" | "gitlab" | "github: repo" | "repository"
4991 )
4992 {
4993 *vcs_url = Some(url.clone());
4994 } else if homepage_url.is_none()
4995 && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4996 {
4997 *homepage_url = Some(url.clone());
4998 } else if label_lower == "changelog" {
4999 extra_data.insert(
5000 "changelog_url".to_string(),
5001 serde_json::Value::String(url.clone()),
5002 );
5003 }
5004 }
5005
5006 let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
5007 .iter()
5008 .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
5009 .collect();
5010
5011 if !project_urls_json.is_empty() {
5012 extra_data.insert(
5013 "project_urls".to_string(),
5014 serde_json::Value::Object(project_urls_json),
5015 );
5016 }
5017}
5018
5019fn parse_setup_cfg(content: &str) -> IniSections {
5020 let mut sections: IniSections = HashMap::new();
5021 let mut current_section: Option<String> = None;
5022 let mut current_key: Option<String> = None;
5023
5024 for raw_line in content.lines() {
5025 let line = raw_line.trim_end_matches('\r');
5026 let trimmed = line.trim();
5027 if trimmed.is_empty() {
5028 continue;
5029 }
5030
5031 let stripped = line.trim_start();
5032 if stripped.starts_with('#') || stripped.starts_with(';') {
5033 continue;
5034 }
5035
5036 if stripped.starts_with('[') && stripped.ends_with(']') {
5037 let section_name = stripped
5038 .trim_start_matches('[')
5039 .trim_end_matches(']')
5040 .trim()
5041 .to_ascii_lowercase();
5042 current_section = if section_name.is_empty() {
5043 None
5044 } else {
5045 Some(section_name)
5046 };
5047 current_key = None;
5048 continue;
5049 }
5050
5051 if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
5052 if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
5053 let value = stripped.trim();
5054 if !value.is_empty() {
5055 sections
5056 .entry(section.clone())
5057 .or_default()
5058 .entry(key.clone())
5059 .or_default()
5060 .push(value.to_string());
5061 }
5062 }
5063 continue;
5064 }
5065
5066 if let Some((key, value)) = stripped.split_once('=')
5067 && let Some(section) = current_section.as_ref()
5068 {
5069 let key_name = key.trim().to_ascii_lowercase();
5070 let value_trimmed = value.trim();
5071 let entry = sections
5072 .entry(section.clone())
5073 .or_default()
5074 .entry(key_name.clone())
5075 .or_default();
5076 if !value_trimmed.is_empty() {
5077 entry.push(value_trimmed.to_string());
5078 }
5079 current_key = Some(key_name);
5080 }
5081 }
5082
5083 sections
5084}
5085
5086fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
5087 sections
5088 .get(§ion.to_ascii_lowercase())
5089 .and_then(|values| values.get(&key.to_ascii_lowercase()))
5090 .and_then(|entries| entries.first())
5091 .map(|value| value.trim().to_string())
5092}
5093
5094fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
5095 sections
5096 .get(§ion.to_ascii_lowercase())
5097 .and_then(|values| values.get(&key.to_ascii_lowercase()))
5098 .cloned()
5099 .unwrap_or_default()
5100}
5101
5102fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
5103 let mut dependencies = Vec::new();
5104
5105 for (sub_section, scope) in [
5106 ("install_requires", "install"),
5107 ("tests_require", "test"),
5108 ("setup_requires", "setup"),
5109 ] {
5110 let reqs = get_ini_values(sections, "options", sub_section);
5111 dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
5112 }
5113
5114 if let Some(extras) = sections.get("options.extras_require") {
5115 let mut extra_items: Vec<_> = extras.iter().collect();
5116 extra_items.sort_by_key(|(name, _)| *name);
5117 for (extra_name, reqs) in extra_items {
5118 dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
5119 }
5120 }
5121
5122 dependencies
5123}
5124
5125fn parse_setup_cfg_requirements(
5126 reqs: &[String],
5127 scope: &str,
5128 is_optional: bool,
5129) -> Vec<Dependency> {
5130 reqs.iter()
5131 .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
5132 .collect()
5133}
5134
5135fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
5136 let trimmed = req.trim();
5137 if trimmed.is_empty() || trimmed.starts_with('#') {
5138 return None;
5139 }
5140
5141 let name = extract_setup_cfg_dependency_name(trimmed)?;
5142 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5143
5144 Some(Dependency {
5145 purl: Some(purl.to_string()),
5146 extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
5147 scope: Some(scope.to_string()),
5148 is_runtime: Some(true),
5149 is_optional: Some(is_optional),
5150 is_pinned: Some(false),
5151 is_direct: Some(true),
5152 resolved_package: None,
5153 extra_data: None,
5154 })
5155}
5156
/// Extracts the leading package name from a requirement string.
///
/// The name is the trimmed input up to the first whitespace character or the
/// first of `<`, `>`, `=`, `!`, `~`, `;`, `[`. Returns `None` when that
/// prefix is empty.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let ends_name =
        |c: char| c.is_whitespace() || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[');

    req.trim()
        .split(ends_name)
        .next()
        .filter(|name| !name.is_empty())
        .map(str::to_owned)
}
5173
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    req.split_whitespace().collect()
}
5177
/// Finds a quoted string assigned to `key` in `setup.py`-like source text.
///
/// Looks for `key`, an `=` with optional single spaces around it, and an
/// opening quote. Double-quoted forms are tried before single-quoted ones,
/// each in the order `key=`, `key =`, `key= `, `key = `. The value runs up to
/// the next occurrence of the *same* quote character, so an apostrophe inside
/// a double-quoted value (or vice versa) does not cut the value short.
///
/// For each form only its first occurrence in `content` is considered; if
/// that occurrence has no closing quote the next form is tried. Returns
/// `None` when no form yields a value.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    const SEPARATORS: [&str; 4] = ["=", " =", "= ", " = "];

    for quote in ['"', '\''] {
        for separator in SEPARATORS {
            let opener = format!("{}{}{}", key, separator, quote);
            let Some(start) = content.find(&opener) else {
                continue;
            };

            let rest = &content[start + opener.len()..];
            if let Some(end) = rest.find(quote) {
                return Some(rest[..end].to_string());
            }
        }
    }

    None
}
5203
5204fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
5205 let mut dependencies = Vec::new();
5206
5207 if let Some(tests_deps) = extract_tests_require(content) {
5208 dependencies.extend(tests_deps);
5209 }
5210
5211 if let Some(extras_deps) = extract_extras_require(content) {
5212 dependencies.extend(extras_deps);
5213 }
5214
5215 dependencies
5216}
5217
5218fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
5219 let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
5220 let re = Regex::new(pattern).ok()?;
5221 let captures = re.captures(content)?;
5222 let deps_str = captures.get(1)?.as_str();
5223
5224 let deps = parse_setup_py_dep_list(deps_str, "test", true);
5225 if deps.is_empty() { None } else { Some(deps) }
5226}
5227
5228fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
5229 let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
5230 let re = Regex::new(pattern).ok()?;
5231 let captures = re.captures(content)?;
5232 let dict_content = captures.get(1)?.as_str();
5233
5234 let mut all_deps = Vec::new();
5235
5236 let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
5237 let entry_re = Regex::new(entry_pattern).ok()?;
5238
5239 for entry_cap in entry_re.captures_iter(dict_content) {
5240 if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
5241 let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
5242 all_deps.extend(deps);
5243 }
5244 }
5245
5246 if all_deps.is_empty() {
5247 None
5248 } else {
5249 Some(all_deps)
5250 }
5251}
5252
5253fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
5254 let dep_pattern = r#"['"]([^'"]+)['"]"#;
5255 let re = match Regex::new(dep_pattern) {
5256 Ok(r) => r,
5257 Err(_) => return Vec::new(),
5258 };
5259
5260 re.captures_iter(deps_str)
5261 .filter_map(|cap| {
5262 let dep_str = cap.get(1)?.as_str().trim();
5263 if dep_str.is_empty() {
5264 return None;
5265 }
5266
5267 let name = extract_setup_cfg_dependency_name(dep_str)?;
5268 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5269
5270 Some(Dependency {
5271 purl: Some(purl.to_string()),
5272 extracted_requirement: Some(dep_str.to_string()),
5273 scope: Some(scope.to_string()),
5274 is_runtime: Some(true),
5275 is_optional: Some(is_optional),
5276 is_pinned: Some(false),
5277 is_direct: Some(true),
5278 resolved_package: None,
5279 extra_data: None,
5280 })
5281 })
5282 .collect()
5283}
5284
5285pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
5287 let content = read_file_to_string(path, None).map_err(|e| e.to_string())?;
5288 toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
5289}
5290
5291fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<Sha256Digest>) {
5302 let mut file = match File::open(path) {
5303 Ok(f) => f,
5304 Err(_) => return (None, None),
5305 };
5306
5307 let metadata = match file.metadata() {
5308 Ok(m) => m,
5309 Err(_) => return (None, None),
5310 };
5311 let size = metadata.len();
5312
5313 let mut hasher = Sha256::new();
5314 let mut buffer = vec![0; 8192];
5315
5316 loop {
5317 match file.read(&mut buffer) {
5318 Ok(0) => break,
5319 Ok(n) => hasher.update(&buffer[..n]),
5320 Err(_) => return (Some(size), None),
5321 }
5322 }
5323
5324 let hash = Sha256Digest::from_bytes(hasher.finalize().into());
5325 (Some(size), Some(hash))
5326}
5327
5328fn default_package_data(path: &Path) -> PackageData {
5329 PackageData {
5330 package_type: Some(PythonParser::PACKAGE_TYPE),
5331 primary_language: Some("Python".to_string()),
5332 datasource_id: infer_python_datasource_id(path),
5333 ..Default::default()
5334 }
5335}
5336
5337fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
5338 let file_name = path.file_name().and_then(|name| name.to_str());
5339
5340 match file_name {
5341 Some("pyproject.toml") => {
5342 if read_toml_file(path)
5343 .ok()
5344 .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
5345 .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
5346 .is_some()
5347 {
5348 Some(DatasourceId::PypiPoetryPyprojectToml)
5349 } else {
5350 Some(DatasourceId::PypiPyprojectToml)
5351 }
5352 }
5353 Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
5354 Some(DatasourceId::PypiSetupPy)
5355 }
5356 Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
5357 Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
5358 Some("METADATA") if is_installed_wheel_metadata_path(path) => {
5359 Some(DatasourceId::PypiWheelMetadata)
5360 }
5361 Some("pypi.json") => Some(DatasourceId::PypiJson),
5362 Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
5363 Some("origin.json") if is_pip_cache_origin_json(path) => {
5364 Some(DatasourceId::PypiPipOriginJson)
5365 }
5366 _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
5367 Some(DatasourceId::PypiSdist)
5368 }
5369 _ if path
5370 .extension()
5371 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
5372 {
5373 Some(DatasourceId::PypiWheel)
5374 }
5375 _ if path
5376 .extension()
5377 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
5378 {
5379 Some(DatasourceId::PypiEgg)
5380 }
5381 _ => None,
5382 }
5383}
5384
// Registers this module's Python parser with the crate-level parser registry.
// NOTE(review): the argument roles below are inferred from their values
// (description, file glob patterns, package type, language, docs URL) —
// confirm against the `register_parser!` macro definition.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);