1use crate::models::{
35 DatasourceId, Dependency, FileReference, PackageData, PackageType, Party, Sha256Digest,
36};
37use crate::parser_warn as warn;
38use crate::parsers::utils::{
39 MAX_ITERATION_COUNT, read_file_to_string, split_name_email, truncate_field,
40};
41use base64::Engine;
42use base64::engine::general_purpose::URL_SAFE_NO_PAD;
43use bzip2::read::BzDecoder;
44use csv::ReaderBuilder;
45use flate2::read::GzDecoder;
46use liblzma::read::XzDecoder;
47use packageurl::PackageUrl;
48use regex::Regex;
49use ruff_python_ast as ast;
50use ruff_python_parser::parse_module;
51use serde_json::{Map as JsonMap, Value as JsonValue};
52use sha2::{Digest, Sha256};
53use std::collections::{HashMap, HashSet};
54use std::fs::File;
55use std::io::Read;
56use std::path::{Component, Path, PathBuf};
57use tar::Archive;
58use toml::Value as TomlValue;
59use toml::map::Map as TomlMap;
60use zip::ZipArchive;
61
62use super::PackageParser;
63use super::license_normalization::{
64 DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
65 normalize_spdx_expression,
66};
67use super::pep508::parse_pep508_requirement;
68
// Key names used when reading Python packaging metadata.
// NOTE(review): presumably keys of pyproject.toml tables — the code that
// consumes them is not visible in this part of the file; confirm there.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_EXTRAS: &str = "extras";

/// Five optional URL strings bundled into one tuple.
// NOTE(review): the meaning of each slot is fixed by the code that builds the
// tuple, which is not visible here — check it before relying on positions.
type ProjectUrls = (
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
);
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";
// Limits for setup.py analysis.
// NOTE(review): the enforcing code is outside this chunk; names suggest a byte
// cap on the file and node/depth caps on its AST — confirm at the use sites.
const MAX_SETUP_PY_BYTES: usize = 1_048_576;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;
/// Cap (100 MiB) applied both to an sdist archive's on-disk size and to the
/// running total of declared uncompressed entry sizes inside an archive.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;
/// Cap (50 MiB) on the declared uncompressed size of a single archive entry.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
/// Largest uncompressed:compressed ratio accepted before an entry or archive
/// is reported as suspicious.
const MAX_COMPRESSION_RATIO: f64 = 100.0;
/// Unit type that carries the `PackageParser` implementation for Python
/// package metadata files and archives.
pub struct PythonParser;
111
/// Archive container formats recognised by `detect_python_sdist_archive_format`.
#[derive(Clone, Copy, Debug)]
enum PythonSdistArchiveFormat {
    /// File name ends with `.tar.gz`.
    TarGz,
    /// File name ends with `.tgz`.
    Tgz,
    /// File name ends with `.tar.bz2`.
    TarBz2,
    /// File name ends with `.tar.xz`.
    TarXz,
    /// File name ends with `.zip`.
    Zip,
}
120
/// A zip entry accepted by `collect_validated_zip_entries`.
#[derive(Clone, Debug)]
struct ValidatedZipEntry {
    /// Position of the entry inside the zip archive.
    index: usize,
    /// Entry path as returned by `normalize_archive_entry_path`.
    name: String,
}
126
127impl PackageParser for PythonParser {
128 const PACKAGE_TYPE: PackageType = PackageType::Pypi;
129
130 fn extract_packages(path: &Path) -> Vec<PackageData> {
131 vec![
132 if path.file_name().unwrap_or_default() == "pyproject.toml" {
133 extract_from_pyproject_toml(path)
134 } else if path.file_name().unwrap_or_default() == "setup.cfg" {
135 extract_from_setup_cfg(path)
136 } else if is_setup_py_like_path(path) {
137 return extract_setup_py_packages(path);
138 } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
139 extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
140 } else if is_installed_wheel_metadata_path(path) {
141 extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
142 } else if is_pip_cache_origin_json(path) {
143 extract_from_pip_origin_json(path)
144 } else if path.file_name().unwrap_or_default() == "pypi.json" {
145 extract_from_pypi_json(path)
146 } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
147 extract_from_pip_inspect(path)
148 } else if is_python_sdist_archive_path(path) {
149 extract_from_sdist_archive(path)
150 } else if path
151 .extension()
152 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
153 {
154 extract_from_wheel_archive(path)
155 } else if path
156 .extension()
157 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
158 {
159 extract_from_egg_archive(path)
160 } else {
161 default_package_data(path)
162 },
163 ]
164 }
165
166 fn is_match(path: &Path) -> bool {
167 if let Some(filename) = path.file_name()
168 && (filename == "pyproject.toml"
169 || filename == "setup.cfg"
170 || is_setup_py_like_path(path)
171 || filename == "PKG-INFO"
172 || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
173 || filename == "pypi.json"
174 || filename == "pip-inspect.deplock"
175 || is_pip_cache_origin_json(path))
176 {
177 return true;
178 }
179
180 if let Some(extension) = path.extension() {
181 let ext = extension.to_string_lossy().to_lowercase();
182 if (ext == "whl" && is_valid_wheel_archive_path(path))
183 || ext == "egg"
184 || is_python_sdist_archive_path(path)
185 {
186 return true;
187 }
188 }
189
190 false
191 }
192}
193
/// Returns `true` when the final path component is `setup.py` or ends with
/// `_setup.py`. Non-UTF-8 file names never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
        return false;
    };

    match file_name {
        "setup.py" => true,
        other => other.ends_with("_setup.py"),
    }
}
199
/// Returns `true` for a file named `METADATA` whose parent directory name
/// ends with `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    let file_name = path.file_name().and_then(|name| name.to_str());
    if file_name != Some("METADATA") {
        return false;
    }

    let parent_name = path
        .parent()
        .and_then(|parent| parent.file_name())
        .and_then(|name| name.to_str());

    match parent_name {
        Some(directory) => directory.ends_with(".dist-info"),
        None => false,
    }
}
208
/// Values read from a `WHEEL` file by `parse_installed_wheel_metadata`.
#[derive(Debug, Clone)]
struct InstalledWheelMetadata {
    /// Every value of the `tag` header; never empty for a parsed instance.
    wheel_tags: Vec<String>,
    /// First value of the `wheel-version` header, if present.
    wheel_version: Option<String>,
    /// First value of the `generator` header, if present.
    wheel_generator: Option<String>,
    /// `root-is-purelib` header parsed as a boolean; `None` when the header is
    /// absent or is neither `true` nor `false` (case-insensitive).
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`.
    compressed_tag: Option<String>,
}
217
218fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
219 let Some(parent) = path.parent() else {
220 return;
221 };
222
223 if !parent
224 .file_name()
225 .and_then(|name| name.to_str())
226 .is_some_and(|name| name.ends_with(".dist-info"))
227 {
228 return;
229 }
230
231 let wheel_path = parent.join("WHEEL");
232 if !wheel_path.exists() {
233 return;
234 }
235
236 let Ok(content) = read_file_to_string(&wheel_path, None) else {
237 warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
238 return;
239 };
240
241 let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
242 return;
243 };
244
245 apply_installed_wheel_metadata(package_data, &wheel_metadata);
246}
247
248fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
249 use super::rfc822::{get_header_all, get_header_first};
250
251 let metadata = super::rfc822::parse_rfc822_content(content);
252 let wheel_tags = get_header_all(&metadata.headers, "tag");
253 if wheel_tags.is_empty() {
254 return None;
255 }
256
257 let wheel_version = get_header_first(&metadata.headers, "wheel-version");
258 let wheel_generator = get_header_first(&metadata.headers, "generator");
259 let root_is_purelib =
260 get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
261 match value.to_ascii_lowercase().as_str() {
262 "true" => Some(true),
263 "false" => Some(false),
264 _ => None,
265 }
266 });
267
268 let compressed_tag = compress_wheel_tags(&wheel_tags);
269
270 Some(InstalledWheelMetadata {
271 wheel_tags,
272 wheel_version,
273 wheel_generator,
274 root_is_purelib,
275 compressed_tag,
276 })
277}
278
/// Collapses a list of `python-abi-platform` tags into one compressed tag.
///
/// * No tags: `None`.
/// * Exactly one tag: that tag, returned unchanged.
/// * Several tags: the python parts joined with `.` followed by the shared
///   abi and platform, or `None` when any tag has fewer than three
///   `-`-separated parts or the abi/platform parts are not all identical.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => return None,
        [only] => return Some(only.clone()),
        _ => {}
    }

    let mut pythons: Vec<&str> = Vec::with_capacity(tags.len());
    let mut shared: Option<(&str, &str)> = None;

    for tag in tags {
        let mut fields = tag.splitn(3, '-');
        let python = fields.next()?;
        let abi = fields.next()?;
        let platform = fields.next()?;

        match shared {
            Some(seen) if seen != (abi, platform) => return None,
            _ => shared = Some((abi, platform)),
        }
        pythons.push(python);
    }

    let (abi, platform) = shared?;
    Some(format!("{}-{}-{}", pythons.join("."), abi, platform))
}
316
317fn apply_installed_wheel_metadata(
318 package_data: &mut PackageData,
319 wheel_metadata: &InstalledWheelMetadata,
320) {
321 let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
322 extra_data.insert(
323 "wheel_tags".to_string(),
324 JsonValue::Array(
325 wheel_metadata
326 .wheel_tags
327 .iter()
328 .cloned()
329 .map(JsonValue::String)
330 .collect(),
331 ),
332 );
333
334 if let Some(wheel_version) = &wheel_metadata.wheel_version {
335 extra_data.insert(
336 "wheel_version".to_string(),
337 JsonValue::String(wheel_version.clone()),
338 );
339 }
340
341 if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
342 extra_data.insert(
343 "wheel_generator".to_string(),
344 JsonValue::String(wheel_generator.clone()),
345 );
346 }
347
348 if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
349 extra_data.insert(
350 "root_is_purelib".to_string(),
351 JsonValue::Bool(root_is_purelib),
352 );
353 }
354
355 if let (Some(name), Some(version), Some(extension)) = (
356 package_data.name.as_deref(),
357 package_data.version.as_deref(),
358 wheel_metadata.compressed_tag.as_deref(),
359 ) {
360 package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
361 }
362}
363
/// Returns `true` for a file named `origin.json` that has an ancestor
/// directory named `wheels` (compared ASCII case-insensitively).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    let is_origin_json = path.file_name().and_then(|name| name.to_str()) == Some("origin.json");
    if !is_origin_json {
        return false;
    }

    // The first ancestor is the path itself, so start from its parent.
    for ancestor in path.ancestors().skip(1) {
        if let Some(directory) = ancestor.file_name().and_then(|name| name.to_str()) {
            if directory.eq_ignore_ascii_case("wheels") {
                return true;
            }
        }
    }

    false
}
373
374fn extract_from_pip_origin_json(path: &Path) -> PackageData {
375 let content = match read_file_to_string(path, None) {
376 Ok(content) => content,
377 Err(e) => {
378 warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
379 return default_package_data(path);
380 }
381 };
382
383 let root: JsonValue = match serde_json::from_str(&content) {
384 Ok(root) => root,
385 Err(e) => {
386 warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
387 return default_package_data(path);
388 }
389 };
390
391 let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
392 warn!("No url found in pip cache origin.json at {:?}", path);
393 return default_package_data(path);
394 };
395
396 let sibling_wheel = find_sibling_cached_wheel(path);
397 let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
398 sibling_wheel
399 .as_ref()
400 .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
401 });
402
403 let Some((name, version)) = name_version else {
404 warn!(
405 "Failed to infer package name/version from pip cache origin.json at {:?}",
406 path
407 );
408 return default_package_data(path);
409 };
410
411 let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
412 build_pypi_urls(Some(&name), Some(&version));
413 let purl = sibling_wheel
414 .as_ref()
415 .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
416 .or(plain_purl);
417
418 PackageData {
419 package_type: Some(PythonParser::PACKAGE_TYPE),
420 primary_language: Some("Python".to_string()),
421 name: Some(truncate_field(name)),
422 version: Some(version),
423 datasource_id: Some(DatasourceId::PypiPipOriginJson),
424 download_url: Some(truncate_field(download_url.to_string())),
425 sha256: extract_sha256_from_origin_json(&root)
426 .and_then(|h| Sha256Digest::from_hex(&h).ok()),
427 repository_homepage_url,
428 repository_download_url,
429 api_data_url,
430 purl,
431 ..Default::default()
432 }
433}
434
435fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
436 let parent = path.parent()?;
437 let entries = parent.read_dir().ok()?;
438
439 for entry in entries.flatten() {
440 let sibling_path = entry.path();
441 if sibling_path
442 .extension()
443 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
444 && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
445 {
446 return Some(wheel_info);
447 }
448 }
449
450 None
451}
452
453fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
454 let file_name = url.rsplit('/').next()?;
455
456 if file_name.ends_with(".whl") {
457 return parse_wheel_filename(Path::new(file_name))
458 .map(|wheel_info| (wheel_info.name, wheel_info.version));
459 }
460
461 let stem = strip_python_archive_extension(file_name)?;
462 let (name, version) = stem.rsplit_once('-')?;
463 if name.is_empty() || version.is_empty() {
464 return None;
465 }
466
467 Some((name.replace('_', "-"), version.to_string()))
468}
469
/// Removes the first matching archive suffix from `file_name`.
///
/// Suffixes are tried in this order: `.tar.gz`, `.tar.bz2`, `.tar.xz`,
/// `.tgz`, `.zip`, `.whl`. Matching is case-sensitive. Returns `None` when
/// none of them matches.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] = [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES {
        if let Some(stem) = file_name.strip_suffix(suffix) {
            return Some(stem);
        }
    }

    None
}
475
476fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
477 root.pointer("/archive_info/hashes/sha256")
478 .and_then(|value| value.as_str())
479 .map(ToOwned::to_owned)
480 .or_else(|| {
481 root.pointer("/archive_info/hash")
482 .and_then(|value| value.as_str())
483 .and_then(normalize_origin_hash)
484 })
485}
486
/// Extracts the digest part of a hash string.
///
/// * `sha256=<value>` or `sha256:<value>`: returns `<value>` without
///   validating it.
/// * A bare string of exactly 64 ASCII hex digits: returned unchanged.
/// * Anything else: `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = hash
        .strip_prefix("sha256=")
        .or_else(|| hash.strip_prefix("sha256:"));

    match prefixed {
        Some(digest) => Some(digest.to_string()),
        None if hash.len() == 64 && hash.bytes().all(|byte| byte.is_ascii_hexdigit()) => {
            Some(hash.to_string())
        }
        None => None,
    }
}
499
500fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
501 let content = match read_file_to_string(path, None) {
502 Ok(content) => content,
503 Err(e) => {
504 warn!("Failed to read metadata at {:?}: {}", path, e);
505 return default_package_data(path);
506 }
507 };
508
509 let metadata = super::rfc822::parse_rfc822_content(&content);
510 let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
511 merge_sibling_metadata_dependencies(path, &mut package_data);
512 merge_sibling_metadata_file_references(path, &mut package_data);
513 if datasource_id == DatasourceId::PypiWheelMetadata {
514 merge_sibling_wheel_metadata(path, &mut package_data);
515 }
516 package_data
517}
518
519fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
520 let mut extra_dependencies = Vec::new();
521
522 if let Some(parent) = path.parent() {
523 let direct_requires = parent.join("requires.txt");
524 if direct_requires.exists()
525 && let Ok(content) = read_file_to_string(&direct_requires, None)
526 {
527 extra_dependencies.extend(parse_requires_txt(&content));
528 }
529
530 let sibling_egg_info_requires = parent
531 .read_dir()
532 .ok()
533 .into_iter()
534 .flatten()
535 .flatten()
536 .find_map(|entry| {
537 let child_path = entry.path();
538 if child_path.is_dir()
539 && child_path
540 .file_name()
541 .and_then(|name| name.to_str())
542 .is_some_and(|name| name.ends_with(".egg-info"))
543 {
544 let requires = child_path.join("requires.txt");
545 requires.exists().then_some(requires)
546 } else {
547 None
548 }
549 });
550
551 if let Some(requires_path) = sibling_egg_info_requires
552 && let Ok(content) = read_file_to_string(&requires_path, None)
553 {
554 extra_dependencies.extend(parse_requires_txt(&content));
555 }
556 }
557
558 for dependency in extra_dependencies {
559 if !package_data.dependencies.iter().any(|existing| {
560 existing.purl == dependency.purl
561 && existing.scope == dependency.scope
562 && existing.extracted_requirement == dependency.extracted_requirement
563 && existing.extra_data == dependency.extra_data
564 }) {
565 package_data.dependencies.push(dependency);
566 }
567 }
568}
569
570fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
571 let mut extra_refs = Vec::new();
572
573 if let Some(parent) = path.parent() {
574 let record_path = parent.join("RECORD");
575 if record_path.exists()
576 && let Ok(content) = read_file_to_string(&record_path, None)
577 {
578 extra_refs.extend(parse_record_csv(&content));
579 }
580
581 let installed_files_path = parent.join("installed-files.txt");
582 if installed_files_path.exists()
583 && let Ok(content) = read_file_to_string(&installed_files_path, None)
584 {
585 extra_refs.extend(parse_installed_files_txt(&content));
586 }
587
588 let sources_path = parent.join("SOURCES.txt");
589 if sources_path.exists()
590 && let Ok(content) = read_file_to_string(&sources_path, None)
591 {
592 extra_refs.extend(parse_sources_txt(&content));
593 }
594 }
595
596 for file_ref in extra_refs {
597 if !package_data
598 .file_references
599 .iter()
600 .any(|existing| existing.path == file_ref.path)
601 {
602 package_data.file_references.push(file_ref);
603 }
604 }
605}
606
607fn collect_validated_zip_entries<R: Read + std::io::Seek>(
608 archive: &mut ZipArchive<R>,
609 path: &Path,
610 archive_type: &str,
611) -> Result<Vec<ValidatedZipEntry>, String> {
612 let mut total_extracted = 0u64;
613 let mut entries = Vec::new();
614 let mut entry_count = 0usize;
615
616 for i in 0..archive.len() {
617 entry_count += 1;
618 if entry_count > MAX_ITERATION_COUNT {
619 warn!(
620 "Exceeded max entry count in {} {:?}; stopping at {} entries",
621 archive_type, path, MAX_ITERATION_COUNT
622 );
623 break;
624 }
625 if let Ok(file) = archive.by_index_raw(i) {
626 let compressed_size = file.compressed_size();
627 let uncompressed_size = file.size();
628 let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
629 warn!(
630 "Skipping unsafe path in {} {:?}: {}",
631 archive_type,
632 path,
633 file.name()
634 );
635 continue;
636 };
637
638 if compressed_size > 0 {
639 let ratio = uncompressed_size as f64 / compressed_size as f64;
640 if ratio > MAX_COMPRESSION_RATIO {
641 warn!(
642 "Suspicious compression ratio in {} {:?}: {:.2}:1",
643 archive_type, path, ratio
644 );
645 continue;
646 }
647 }
648
649 if uncompressed_size > MAX_FILE_SIZE {
650 warn!(
651 "File too large in {} {:?}: {} bytes (limit: {} bytes)",
652 archive_type, path, uncompressed_size, MAX_FILE_SIZE
653 );
654 continue;
655 }
656
657 total_extracted += uncompressed_size;
658 if total_extracted > MAX_ARCHIVE_SIZE {
659 let msg = format!(
660 "Total extracted size exceeds limit for {} {:?}",
661 archive_type, path
662 );
663 warn!("{}", msg);
664 return Err(msg);
665 }
666
667 entries.push(ValidatedZipEntry {
668 index: i,
669 name: entry_name,
670 });
671 }
672 }
673
674 Ok(entries)
675}
676
677fn is_python_sdist_archive_path(path: &Path) -> bool {
678 detect_python_sdist_archive_format(path).is_some()
679}
680
681fn is_valid_wheel_archive_path(path: &Path) -> bool {
682 if !path.is_file() {
683 return true;
684 }
685
686 let file = match File::open(path) {
687 Ok(file) => file,
688 Err(_) => return false,
689 };
690 let mut archive = match ZipArchive::new(file) {
691 Ok(archive) => archive,
692 Err(_) => return false,
693 };
694
695 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
696 Ok(entries) => entries,
697 Err(_) => return false,
698 };
699
700 find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA").is_some()
701}
702
703fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
704 let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
705
706 if !is_likely_python_sdist_filename(&file_name) {
707 return None;
708 }
709
710 if file_name.ends_with(".tar.gz") {
711 tar_gz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarGz)
712 } else if file_name.ends_with(".tgz") {
713 tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
714 } else if file_name.ends_with(".tar.bz2") {
715 tar_bz2_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarBz2)
716 } else if file_name.ends_with(".tar.xz") {
717 tar_xz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarXz)
718 } else if file_name.ends_with(".zip") {
719 zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
720 } else {
721 None
722 }
723}
724
725fn tar_gz_sdist_contains_pkg_info(path: &Path) -> bool {
726 let Some(compressed_size) = compressed_archive_size(path) else {
727 return false;
728 };
729 let file = match File::open(path) {
730 Ok(file) => file,
731 Err(_) => return false,
732 };
733 let decoder = GzDecoder::new(file);
734 tar_sdist_contains_pkg_info(path, decoder, "tar.gz", compressed_size)
735}
736
737fn tar_bz2_sdist_contains_pkg_info(path: &Path) -> bool {
738 let Some(compressed_size) = compressed_archive_size(path) else {
739 return false;
740 };
741 let file = match File::open(path) {
742 Ok(file) => file,
743 Err(_) => return false,
744 };
745 let decoder = BzDecoder::new(file);
746 tar_sdist_contains_pkg_info(path, decoder, "tar.bz2", compressed_size)
747}
748
749fn tar_xz_sdist_contains_pkg_info(path: &Path) -> bool {
750 let Some(compressed_size) = compressed_archive_size(path) else {
751 return false;
752 };
753 let file = match File::open(path) {
754 Ok(file) => file,
755 Err(_) => return false,
756 };
757 let decoder = XzDecoder::new(file);
758 tar_sdist_contains_pkg_info(path, decoder, "tar.xz", compressed_size)
759}
760
/// Returns the length reported by the filesystem metadata for `path`, or
/// `None` when the metadata cannot be read.
fn compressed_archive_size(path: &Path) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(metadata) => Some(metadata.len()),
        Err(_) => None,
    }
}
764
765fn tar_sdist_contains_pkg_info<R: Read>(
766 path: &Path,
767 reader: R,
768 archive_type: &str,
769 compressed_size: u64,
770) -> bool {
771 let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
772 else {
773 return false;
774 };
775
776 select_sdist_pkginfo_entry(path, &entries).is_some()
777}
778
779fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
780 if !path.is_file() {
781 return true;
782 }
783
784 let Some(compressed_size) = compressed_archive_size(path) else {
785 return false;
786 };
787 let file = match File::open(path) {
788 Ok(file) => file,
789 Err(_) => return false,
790 };
791 let decoder = GzDecoder::new(file);
792 tar_sdist_contains_pkg_info(path, decoder, "tgz", compressed_size)
793}
794
795fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
796 if !path.is_file() {
797 return true;
798 }
799
800 let file = match File::open(path) {
801 Ok(file) => file,
802 Err(_) => return false,
803 };
804 let mut archive = match ZipArchive::new(file) {
805 Ok(archive) => archive,
806 Err(_) => return false,
807 };
808
809 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
810 Ok(entries) => entries,
811 Err(_) => return false,
812 };
813 let metadata_entries: Vec<_> = validated_entries
814 .iter()
815 .filter(|entry| entry.name.ends_with("/PKG-INFO"))
816 .filter_map(|entry| {
817 read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
818 .ok()
819 .map(|content| (entry.name.clone(), content))
820 })
821 .collect();
822
823 has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
824}
825
826fn is_likely_python_sdist_filename(file_name: &str) -> bool {
827 let Some(stem) = strip_python_archive_extension(file_name) else {
828 return false;
829 };
830
831 let Some((name, version)) = stem.rsplit_once('-') else {
832 return false;
833 };
834
835 !name.is_empty()
836 && !version.is_empty()
837 && version.chars().any(|ch| ch.is_ascii_digit())
838 && name
839 .chars()
840 .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
841}
842
843fn extract_from_sdist_archive(path: &Path) -> PackageData {
844 let metadata = match std::fs::metadata(path) {
845 Ok(m) => m,
846 Err(e) => {
847 warn!(
848 "Failed to read metadata for sdist archive {:?}: {}",
849 path, e
850 );
851 return default_package_data(path);
852 }
853 };
854
855 if metadata.len() > MAX_ARCHIVE_SIZE {
856 warn!(
857 "sdist archive too large: {} bytes (limit: {} bytes)",
858 metadata.len(),
859 MAX_ARCHIVE_SIZE
860 );
861 return default_package_data(path);
862 }
863
864 let Some(format) = detect_python_sdist_archive_format(path) else {
865 return default_package_data(path);
866 };
867
868 let mut package_data = match format {
869 PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
870 let file = match File::open(path) {
871 Ok(file) => file,
872 Err(e) => {
873 warn!("Failed to open sdist archive {:?}: {}", path, e);
874 return default_package_data(path);
875 }
876 };
877 let decoder = GzDecoder::new(file);
878 extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
879 }
880 PythonSdistArchiveFormat::TarBz2 => {
881 let file = match File::open(path) {
882 Ok(file) => file,
883 Err(e) => {
884 warn!("Failed to open sdist archive {:?}: {}", path, e);
885 return default_package_data(path);
886 }
887 };
888 let decoder = BzDecoder::new(file);
889 extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
890 }
891 PythonSdistArchiveFormat::TarXz => {
892 let file = match File::open(path) {
893 Ok(file) => file,
894 Err(e) => {
895 warn!("Failed to open sdist archive {:?}: {}", path, e);
896 return default_package_data(path);
897 }
898 };
899 let decoder = XzDecoder::new(file);
900 extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
901 }
902 PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
903 };
904
905 if package_data.package_type.is_some() {
906 let (size, sha256) = calculate_file_checksums(path);
907 package_data.size = size;
908 package_data.sha256 = sha256;
909 }
910
911 package_data
912}
913
914fn extract_from_tar_sdist_archive<R: Read>(
915 path: &Path,
916 reader: R,
917 archive_type: &str,
918 compressed_size: u64,
919) -> PackageData {
920 let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
921 else {
922 return default_package_data(path);
923 };
924
925 build_sdist_package_data(path, entries)
926}
927
928fn collect_tar_sdist_entries<R: Read>(
929 path: &Path,
930 reader: R,
931 archive_type: &str,
932 compressed_size: u64,
933) -> Option<Vec<(String, String)>> {
934 let mut archive = Archive::new(reader);
935 let archive_entries = match archive.entries() {
936 Ok(entries) => entries,
937 Err(e) => {
938 warn!(
939 "Failed to read {} sdist archive {:?}: {}",
940 archive_type, path, e
941 );
942 return None;
943 }
944 };
945
946 let mut total_extracted = 0u64;
947 let mut entries = Vec::new();
948 let mut entry_count = 0usize;
949
950 for entry_result in archive_entries {
951 entry_count += 1;
952 if entry_count > MAX_ITERATION_COUNT {
953 warn!(
954 "Exceeded max entry count in {} sdist {:?}; stopping at {} entries",
955 archive_type, path, MAX_ITERATION_COUNT
956 );
957 break;
958 }
959
960 let mut entry = match entry_result {
961 Ok(entry) => entry,
962 Err(e) => {
963 warn!(
964 "Failed to read {} sdist entry from {:?}: {}",
965 archive_type, path, e
966 );
967 continue;
968 }
969 };
970
971 let entry_size = entry.size();
972 if entry_size > MAX_FILE_SIZE {
973 warn!(
974 "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
975 archive_type, path, entry_size, MAX_FILE_SIZE
976 );
977 continue;
978 }
979
980 total_extracted += entry_size;
981 if total_extracted > MAX_ARCHIVE_SIZE {
982 warn!(
983 "Total extracted size exceeds limit for {} sdist {:?}",
984 archive_type, path
985 );
986 return None;
987 }
988
989 if compressed_size > 0 {
990 let ratio = total_extracted as f64 / compressed_size as f64;
991 if ratio > MAX_COMPRESSION_RATIO {
992 warn!(
993 "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
994 archive_type, path, ratio
995 );
996 return None;
997 }
998 }
999
1000 let entry_path = match entry.path() {
1001 Ok(path) => path.to_string_lossy().replace('\\', "/"),
1002 Err(e) => {
1003 warn!(
1004 "Failed to get {} sdist entry path from {:?}: {}",
1005 archive_type, path, e
1006 );
1007 continue;
1008 }
1009 };
1010
1011 let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
1012 warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
1013 continue;
1014 };
1015
1016 if !is_relevant_sdist_text_entry(&entry_path) {
1017 continue;
1018 }
1019
1020 if let Ok(content) = read_limited_utf8(
1021 &mut entry,
1022 MAX_FILE_SIZE,
1023 &format!("{} entry {}", archive_type, entry_path),
1024 ) {
1025 entries.push((entry_path, content));
1026 }
1027 }
1028
1029 Some(entries)
1030}
1031
1032fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
1033 let file = match File::open(path) {
1034 Ok(file) => file,
1035 Err(e) => {
1036 warn!("Failed to open zip sdist archive {:?}: {}", path, e);
1037 return default_package_data(path);
1038 }
1039 };
1040
1041 let mut archive = match ZipArchive::new(file) {
1042 Ok(archive) => archive,
1043 Err(e) => {
1044 warn!("Failed to read zip sdist archive {:?}: {}", path, e);
1045 return default_package_data(path);
1046 }
1047 };
1048
1049 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
1050 Ok(entries) => entries,
1051 Err(_) => return default_package_data(path),
1052 };
1053
1054 let mut entries = Vec::new();
1055 for entry in validated_entries.iter() {
1056 if !is_relevant_sdist_text_entry(&entry.name) {
1057 continue;
1058 }
1059
1060 if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
1061 entries.push((entry.name.clone(), content));
1062 }
1063 }
1064
1065 build_sdist_package_data(path, entries)
1066}
1067
/// Returns `true` when `entry_path` ends with `/PKG-INFO`, `/requires.txt`
/// or `/SOURCES.txt`.
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(suffix))
}
1073
1074fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
1075 let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
1076 warn!("No PKG-INFO file found in sdist archive {:?}", path);
1077 return default_package_data(path);
1078 };
1079
1080 let mut package_data =
1081 python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
1082 merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
1083 merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
1084 apply_sdist_name_version_fallback(path, &mut package_data);
1085 package_data.datasource_id = Some(DatasourceId::PypiSdist);
1086 package_data
1087}
1088
1089fn select_sdist_pkginfo_entry(
1090 archive_path: &Path,
1091 entries: &[(String, String)],
1092) -> Option<(String, String)> {
1093 let expected_name = sdist_archive_expected_name(archive_path);
1094
1095 entries
1096 .iter()
1097 .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
1098 .min_by_key(|(entry_path, content)| {
1099 let components: Vec<_> = entry_path
1100 .split('/')
1101 .filter(|part| !part.is_empty())
1102 .collect();
1103 let candidate_name = sdist_pkginfo_candidate_name(content);
1104 let name_rank = if candidate_name == expected_name {
1105 0
1106 } else {
1107 1
1108 };
1109 let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1110
1111 (name_rank, kind_rank, components.len(), entry_path.clone())
1112 })
1113 .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1114}
1115
1116fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1117 let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1118 return false;
1119 };
1120
1121 entries.iter().any(|(entry_path, content)| {
1122 sdist_pkginfo_kind_rank(entry_path) < 3
1123 && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1124 })
1125}
1126
1127fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1128 archive_path
1129 .file_name()
1130 .and_then(|name| name.to_str())
1131 .and_then(strip_python_archive_extension)
1132 .and_then(|stem| {
1133 stem.rsplit_once('-')
1134 .map(|(name, _)| normalize_python_package_name(name))
1135 })
1136}
1137
1138fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1139 let metadata = super::rfc822::parse_rfc822_content(content);
1140 super::rfc822::get_header_first(&metadata.headers, "name")
1141 .map(|name| normalize_python_package_name(&name))
1142}
1143
/// Ranks a `PKG-INFO` entry path by how canonical its location is
/// (lower is better):
///
/// * `0` – `<root>/<name>.egg-info/PKG-INFO`
/// * `1` – `<root>/PKG-INFO`
/// * `2` – any other path ending in `.egg-info/PKG-INFO`
/// * `3` – everything else
///
/// Empty path components (doubled or trailing slashes) are ignored when
/// counting components.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let segments: Vec<&str> = entry_path
        .split('/')
        .filter(|segment| !segment.is_empty())
        .collect();

    match segments.as_slice() {
        [_, dir, "PKG-INFO"] if dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1161
1162fn merge_sdist_archive_dependencies(
1163 entries: &[(String, String)],
1164 metadata_path: &str,
1165 package_data: &mut PackageData,
1166) {
1167 let metadata_dir = metadata_path
1168 .rsplit_once('/')
1169 .map(|(dir, _)| dir)
1170 .unwrap_or("");
1171 let archive_root = metadata_path.split('/').next().unwrap_or("");
1172 let matched_egg_info_dir =
1173 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1174 let mut extra_dependencies = Vec::new();
1175
1176 for (entry_path, content) in entries {
1177 let is_direct_requires =
1178 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1179 let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1180 entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1181 });
1182
1183 if is_direct_requires || is_egg_info_requires {
1184 extra_dependencies.extend(parse_requires_txt(content));
1185 }
1186 }
1187
1188 for dependency in extra_dependencies {
1189 if !package_data.dependencies.iter().any(|existing| {
1190 existing.purl == dependency.purl
1191 && existing.scope == dependency.scope
1192 && existing.extracted_requirement == dependency.extracted_requirement
1193 && existing.extra_data == dependency.extra_data
1194 }) {
1195 package_data.dependencies.push(dependency);
1196 }
1197 }
1198}
1199
1200fn merge_sdist_archive_file_references(
1201 entries: &[(String, String)],
1202 metadata_path: &str,
1203 package_data: &mut PackageData,
1204) {
1205 let metadata_dir = metadata_path
1206 .rsplit_once('/')
1207 .map(|(dir, _)| dir)
1208 .unwrap_or("");
1209 let archive_root = metadata_path.split('/').next().unwrap_or("");
1210 let matched_egg_info_dir =
1211 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1212 let mut extra_refs = Vec::new();
1213
1214 for (entry_path, content) in entries {
1215 let is_direct_sources =
1216 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1217 let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1218 entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1219 });
1220
1221 if is_direct_sources || is_egg_info_sources {
1222 extra_refs.extend(parse_sources_txt(content));
1223 }
1224 }
1225
1226 for file_ref in extra_refs {
1227 if !package_data
1228 .file_references
1229 .iter()
1230 .any(|existing| existing.path == file_ref.path)
1231 {
1232 package_data.file_references.push(file_ref);
1233 }
1234 }
1235}
1236
1237fn select_matching_sdist_egg_info_dir(
1238 entries: &[(String, String)],
1239 archive_root: &str,
1240 package_name: Option<&str>,
1241) -> Option<String> {
1242 let normalized_package_name = package_name.map(normalize_python_package_name);
1243
1244 entries
1245 .iter()
1246 .filter_map(|(entry_path, _)| {
1247 let components: Vec<_> = entry_path
1248 .split('/')
1249 .filter(|part| !part.is_empty())
1250 .collect();
1251 if components.len() == 3
1252 && components[0] == archive_root
1253 && components[1].ends_with(".egg-info")
1254 {
1255 Some(components[1].to_string())
1256 } else {
1257 None
1258 }
1259 })
1260 .min_by_key(|egg_info_dir| {
1261 let normalized_dir_name =
1262 normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1263 let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1264 0
1265 } else {
1266 1
1267 };
1268
1269 (name_rank, egg_info_dir.clone())
1270 })
1271}
1272
/// Normalizes a Python distribution name for comparison, following PEP 503:
/// ASCII letters are lowercased and every run of `-`, `_` or `.` collapses
/// into a single `-`.
///
/// This makes spellings such as `zope.interface`, `zope_interface` and
/// `Zope--Interface` compare equal, which matters because archive file names
/// and metadata `Name` headers often use different separators.
fn normalize_python_package_name(name: &str) -> String {
    let mut normalized = String::with_capacity(name.len());
    let mut in_separator_run = false;

    for ch in name.chars() {
        if matches!(ch, '-' | '_' | '.') {
            // Emit one `-` per run of separators.
            if !in_separator_run {
                normalized.push('-');
            }
            in_separator_run = true;
        } else {
            normalized.push(ch.to_ascii_lowercase());
            in_separator_run = false;
        }
    }

    normalized
}
1276
1277fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1278 let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1279 return;
1280 };
1281
1282 let Some(stem) = strip_python_archive_extension(file_name) else {
1283 return;
1284 };
1285
1286 let Some((name, version)) = stem.rsplit_once('-') else {
1287 return;
1288 };
1289
1290 if package_data.name.is_none() {
1291 package_data.name = Some(name.replace('_', "-"));
1292 }
1293 if package_data.version.is_none() {
1294 package_data.version = Some(version.to_string());
1295 }
1296
1297 if package_data.purl.is_none()
1298 || package_data.repository_homepage_url.is_none()
1299 || package_data.repository_download_url.is_none()
1300 || package_data.api_data_url.is_none()
1301 {
1302 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1303 build_pypi_urls(
1304 package_data.name.as_deref(),
1305 package_data.version.as_deref(),
1306 );
1307
1308 if package_data.repository_homepage_url.is_none() {
1309 package_data.repository_homepage_url = repository_homepage_url;
1310 }
1311 if package_data.repository_download_url.is_none() {
1312 package_data.repository_download_url = repository_download_url;
1313 }
1314 if package_data.api_data_url.is_none() {
1315 package_data.api_data_url = api_data_url;
1316 }
1317 if package_data.purl.is_none() {
1318 package_data.purl = purl;
1319 }
1320 }
1321}
1322
/// Extracts package data from a wheel (`.whl`) zip archive at `path`.
///
/// Reads the `.dist-info/METADATA` entry as RFC822 metadata, then enriches
/// the result with the archive's size/SHA-256, the file list from
/// `.dist-info/RECORD`, and the tags parsed from the wheel file name.
///
/// Every failure (stat, oversized archive, open, zip parsing, entry
/// validation, missing or unreadable METADATA) is logged where a message is
/// available and results in `default_package_data(path)`.
fn extract_from_wheel_archive(path: &Path) -> PackageData {
    let metadata = match std::fs::metadata(path) {
        Ok(m) => m,
        Err(e) => {
            warn!(
                "Failed to read metadata for wheel archive {:?}: {}",
                path, e
            );
            return default_package_data(path);
        }
    };

    // Refuse to open archives larger than the configured limit.
    if metadata.len() > MAX_ARCHIVE_SIZE {
        warn!(
            "Wheel archive too large: {} bytes (limit: {} bytes)",
            metadata.len(),
            MAX_ARCHIVE_SIZE
        );
        return default_package_data(path);
    }

    let file = match File::open(path) {
        Ok(f) => f,
        Err(e) => {
            warn!("Failed to open wheel archive {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    let mut archive = match ZipArchive::new(file) {
        Ok(a) => a,
        Err(e) => {
            warn!("Failed to read wheel archive {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    // NOTE(review): validation failures are not logged here; presumably
    // `collect_validated_zip_entries` reports them itself — confirm.
    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
        Ok(entries) => entries,
        Err(_) => return default_package_data(path),
    };

    // The first validated entry whose name ends with `.dist-info/METADATA`.
    let metadata_entry =
        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
            Some(entry) => entry,
            None => {
                warn!("No METADATA file found in wheel archive {:?}", path);
                return default_package_data(path);
            }
        };

    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
        Ok(c) => c,
        Err(e) => {
            warn!("Failed to read METADATA from {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);

    let (size, sha256) = calculate_file_checksums(path);
    package_data.size = size;
    package_data.sha256 = sha256;

    // RECORD is optional: when it is present and readable, its rows replace
    // any file references produced from the metadata headers.
    if let Some(record_entry) =
        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
        && let Ok(record_content) =
            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
    {
        package_data.file_references = parse_record_csv(&record_content);
    }

    if let Some(wheel_info) = parse_wheel_filename(path) {
        // The file name only supplies name/version when METADATA lacks them.
        if package_data.name.is_none() {
            package_data.name = Some(wheel_info.name.clone());
        }
        if package_data.version.is_none() {
            package_data.version = Some(wheel_info.version.clone());
        }

        package_data.qualifiers = Some(std::collections::HashMap::from([(
            "extension".to_string(),
            format!(
                "{}-{}-{}",
                wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
            ),
        )]));

        // The purl is rebuilt so that it carries the `extension` qualifier.
        package_data.purl = build_wheel_purl(
            package_data.name.as_deref(),
            package_data.version.as_deref(),
            &wheel_info,
        );

        let mut extra_data = package_data.extra_data.unwrap_or_default();
        // NOTE(review): the wheel's python *tag* is stored under the
        // `python_requires` key — confirm this key name is intentional.
        extra_data.insert(
            "python_requires".to_string(),
            serde_json::Value::String(wheel_info.python_tag.clone()),
        );
        extra_data.insert(
            "abi_tag".to_string(),
            serde_json::Value::String(wheel_info.abi_tag.clone()),
        );
        extra_data.insert(
            "platform_tag".to_string(),
            serde_json::Value::String(wheel_info.platform_tag.clone()),
        );
        package_data.extra_data = Some(extra_data);
    }

    package_data
}
1436
/// Extracts package data from an egg zip archive at `path`.
///
/// Reads the `EGG-INFO/PKG-INFO` (or `.egg-info/PKG-INFO`) entry as RFC822
/// metadata, then enriches the result with the archive's size/SHA-256, the
/// file list from `installed-files.txt`, and the fields parsed from the egg
/// file name. The purl is always rebuilt with `build_egg_purl`.
///
/// Every failure (stat, oversized archive, open, zip parsing, entry
/// validation, missing or unreadable PKG-INFO) is logged where a message is
/// available and results in `default_package_data(path)`.
fn extract_from_egg_archive(path: &Path) -> PackageData {
    let metadata = match std::fs::metadata(path) {
        Ok(m) => m,
        Err(e) => {
            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    // Refuse to open archives larger than the configured limit.
    if metadata.len() > MAX_ARCHIVE_SIZE {
        warn!(
            "Egg archive too large: {} bytes (limit: {} bytes)",
            metadata.len(),
            MAX_ARCHIVE_SIZE
        );
        return default_package_data(path);
    }

    let file = match File::open(path) {
        Ok(f) => f,
        Err(e) => {
            warn!("Failed to open egg archive {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    let mut archive = match ZipArchive::new(file) {
        Ok(a) => a,
        Err(e) => {
            warn!("Failed to read egg archive {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    // NOTE(review): validation failures are not logged here; presumably
    // `collect_validated_zip_entries` reports them itself — confirm.
    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
        Ok(entries) => entries,
        Err(_) => return default_package_data(path),
    };

    // The first validated entry matching either PKG-INFO suffix.
    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
        &validated_entries,
        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
    ) {
        Some(entry) => entry,
        None => {
            warn!("No PKG-INFO file found in egg archive {:?}", path);
            return default_package_data(path);
        }
    };

    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
        Ok(c) => c,
        Err(e) => {
            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);

    let (size, sha256) = calculate_file_checksums(path);
    package_data.size = size;
    package_data.sha256 = sha256;

    // installed-files.txt is optional: when it is present and readable, its
    // lines replace any file references produced from the metadata headers.
    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
        &validated_entries,
        &[
            "EGG-INFO/installed-files.txt",
            ".egg-info/installed-files.txt",
        ],
    ) && let Ok(installed_files_content) =
        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
    {
        package_data.file_references = parse_installed_files_txt(&installed_files_content);
    }

    if let Some(egg_info) = parse_egg_filename(path) {
        // The file name only supplies name/version when PKG-INFO lacks them.
        if package_data.name.is_none() {
            package_data.name = Some(egg_info.name.clone());
        }
        if package_data.version.is_none() {
            package_data.version = Some(egg_info.version.clone());
        }

        if let Some(python_version) = &egg_info.python_version {
            let mut extra_data = package_data.extra_data.unwrap_or_default();
            extra_data.insert(
                "python_version".to_string(),
                serde_json::Value::String(python_version.clone()),
            );
            package_data.extra_data = Some(extra_data);
        }
    }

    // Rebuilt unconditionally so the purl carries the `type=egg` qualifier.
    package_data.purl = build_egg_purl(
        package_data.name.as_deref(),
        package_data.version.as_deref(),
    );

    package_data
}
1538
1539fn find_validated_zip_entry_by_suffix<'a>(
1540 entries: &'a [ValidatedZipEntry],
1541 suffix: &str,
1542) -> Option<&'a ValidatedZipEntry> {
1543 entries.iter().find(|entry| entry.name.ends_with(suffix))
1544}
1545
1546fn find_validated_zip_entry_by_any_suffix<'a>(
1547 entries: &'a [ValidatedZipEntry],
1548 suffixes: &[&str],
1549) -> Option<&'a ValidatedZipEntry> {
1550 entries
1551 .iter()
1552 .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1553}
1554
1555fn read_validated_zip_entry<R: Read + std::io::Seek>(
1556 archive: &mut ZipArchive<R>,
1557 entry: &ValidatedZipEntry,
1558 path: &Path,
1559 archive_type: &str,
1560) -> Result<String, String> {
1561 let mut file = archive
1562 .by_index(entry.index)
1563 .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1564
1565 let compressed_size = file.compressed_size();
1566 let uncompressed_size = file.size();
1567
1568 if compressed_size > 0 {
1569 let ratio = uncompressed_size as f64 / compressed_size as f64;
1570 if ratio > MAX_COMPRESSION_RATIO {
1571 return Err(format!(
1572 "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1573 archive_type, path, ratio
1574 ));
1575 }
1576 }
1577
1578 if uncompressed_size > MAX_FILE_SIZE {
1579 return Err(format!(
1580 "Rejected oversized entry in {} {:?}: {} bytes",
1581 archive_type, path, uncompressed_size
1582 ));
1583 }
1584
1585 read_limited_utf8(
1586 &mut file,
1587 MAX_FILE_SIZE,
1588 &format!("{} entry {}", archive_type, entry.name),
1589 )
1590}
1591
1592fn read_limited_utf8<R: Read>(
1593 reader: &mut R,
1594 max_bytes: u64,
1595 context: &str,
1596) -> Result<String, String> {
1597 let mut limited = reader.take(max_bytes + 1);
1598 let mut bytes = Vec::new();
1599 limited
1600 .read_to_end(&mut bytes)
1601 .map_err(|e| format!("Failed to read {}: {}", context, e))?;
1602
1603 if bytes.len() as u64 > max_bytes {
1604 return Err(format!(
1605 "{} exceeded {} byte limit while reading",
1606 context, max_bytes
1607 ));
1608 }
1609
1610 match String::from_utf8(bytes) {
1611 Ok(s) => Ok(s),
1612 Err(err) => {
1613 let bytes = err.into_bytes();
1614 warn!("Invalid UTF-8 in archive entry; using lossy conversion");
1615 Ok(String::from_utf8_lossy(&bytes).into_owned())
1616 }
1617 }
1618}
1619
/// Converts an archive entry path into a safe, `/`-separated relative path.
///
/// Backslashes are treated as separators. The path is rejected (`None`) when
/// it starts with a Windows drive prefix such as `C:/`, is absolute, contains
/// a `..` component, or has no normal components at all. `.` components are
/// dropped.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Reject `X:/...` explicitly; on non-Windows hosts `Path` would treat
    // the drive prefix as an ordinary component.
    if let [drive, b':', b'/', ..] = unified.as_bytes() {
        if drive.is_ascii_alphabetic() {
            return None;
        }
    }

    let mut segments: Vec<String> = Vec::new();
    for part in Path::new(&unified).components() {
        if matches!(part, Component::CurDir) {
            continue;
        }
        // Anything other than a plain name (root, `..`, prefix) is unsafe.
        let Component::Normal(segment) = part else {
            return None;
        };
        segments.push(segment.to_string_lossy().to_string());
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1641
1642pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1647 let mut reader = ReaderBuilder::new()
1648 .has_headers(false)
1649 .from_reader(content.as_bytes());
1650
1651 let mut file_references = Vec::new();
1652 let mut record_count = 0usize;
1653
1654 for result in reader.records() {
1655 record_count += 1;
1656 if record_count > MAX_ITERATION_COUNT {
1657 warn!(
1658 "Exceeded max record count in RECORD CSV; stopping at {} records",
1659 MAX_ITERATION_COUNT
1660 );
1661 break;
1662 }
1663 match result {
1664 Ok(record) => {
1665 if record.len() < 3 {
1666 continue;
1667 }
1668
1669 let path = record.get(0).unwrap_or("").trim().to_string();
1670 if path.is_empty() {
1671 continue;
1672 }
1673
1674 let hash_field = record.get(1).unwrap_or("").trim();
1675 let size_field = record.get(2).unwrap_or("").trim();
1676
1677 let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1679 let parts: Vec<&str> = hash_field.split('=').collect();
1680 if parts.len() == 2 && parts[0] == "sha256" {
1681 match URL_SAFE_NO_PAD.decode(parts[1]) {
1682 Ok(decoded) => {
1683 let hex = decoded
1684 .iter()
1685 .map(|b| format!("{:02x}", b))
1686 .collect::<String>();
1687 Sha256Digest::from_hex(&hex).ok()
1688 }
1689 Err(_) => None,
1690 }
1691 } else {
1692 None
1693 }
1694 } else {
1695 None
1696 };
1697
1698 let size = if !size_field.is_empty() && size_field != "-" {
1700 size_field.parse::<u64>().ok()
1701 } else {
1702 None
1703 };
1704
1705 file_references.push(FileReference {
1706 path,
1707 size,
1708 sha1: None,
1709 md5: None,
1710 sha256,
1711 sha512: None,
1712 extra_data: None,
1713 });
1714 }
1715 Err(e) => {
1716 warn!("Failed to parse RECORD CSV row: {}", e);
1717 continue;
1718 }
1719 }
1720 }
1721
1722 file_references
1723}
1724
1725pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1728 content
1729 .lines()
1730 .map(|line| line.trim())
1731 .filter(|line| !line.is_empty())
1732 .map(|path| FileReference {
1733 path: path.to_string(),
1734 size: None,
1735 sha1: None,
1736 md5: None,
1737 sha256: None,
1738 sha512: None,
1739 extra_data: None,
1740 })
1741 .collect()
1742}
1743
1744pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1745 content
1746 .lines()
1747 .map(str::trim)
1748 .filter(|line| !line.is_empty())
1749 .map(|path| FileReference {
1750 path: path.to_string(),
1751 size: None,
1752 sha1: None,
1753 md5: None,
1754 sha256: None,
1755 sha512: None,
1756 extra_data: None,
1757 })
1758 .collect()
1759}
1760
/// Components parsed from a wheel file name.
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component, taken verbatim from the file name.
    version: String,
    /// Python tag, e.g. `py3` or `cp311`.
    python_tag: String,
    /// ABI tag, e.g. `none` or `cp311`.
    abi_tag: String,
    /// Platform tag, e.g. `any` or `manylinux_2_17_x86_64`.
    platform_tag: String,
}

/// Parses a wheel file name of the PEP 427 form
/// `{name}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl`.
///
/// Only the file stem of `path` is inspected. Returns `None` when the stem
/// has fewer than five `-`-separated parts.
///
/// The optional build tag must start with a digit, which no python tag does.
/// When the third part starts with a digit and at least six parts exist, it
/// is skipped so the python/abi/platform tags are not shifted by one. Any
/// parts beyond the platform position are joined back with `-`.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    let has_build_tag =
        parts.len() >= 6 && parts[2].starts_with(|c: char| c.is_ascii_digit());
    // `tags` always holds at least three elements: python, abi, platform.
    let tags = if has_build_tag {
        &parts[3..]
    } else {
        &parts[2..]
    };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: tags[0].to_string(),
        abi_tag: tags[1].to_string(),
        platform_tag: tags[2..].join("-"),
    })
}
1785
/// Components parsed from an egg file name.
struct EggInfo {
    /// Distribution name.
    name: String,
    /// Version component.
    version: String,
    /// Optional third file-name component; absent when the name has only
    /// two parts.
    python_version: Option<String>,
}
1791
1792fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
1793 let stem = path.file_stem()?.to_string_lossy();
1794 let parts: Vec<&str> = stem.split('-').collect();
1795
1796 if parts.len() >= 2 {
1797 Some(EggInfo {
1798 name: parts[0].replace('_', "-"),
1799 version: parts[1].to_string(),
1800 python_version: parts.get(2).map(|s| s.to_string()),
1801 })
1802 } else {
1803 None
1804 }
1805}
1806
1807fn build_wheel_purl(
1808 name: Option<&str>,
1809 version: Option<&str>,
1810 wheel_info: &WheelInfo,
1811) -> Option<String> {
1812 let name = name?;
1813 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1814
1815 if let Some(ver) = version {
1816 package_url.with_version(ver).ok()?;
1817 }
1818
1819 let extension = format!(
1820 "{}-{}-{}",
1821 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1822 );
1823 package_url.add_qualifier("extension", extension).ok()?;
1824
1825 Some(package_url.to_string())
1826}
1827
1828fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1829 let name = name?;
1830 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1831
1832 if let Some(ver) = version {
1833 package_url.with_version(ver).ok()?;
1834 }
1835
1836 package_url.add_qualifier("type", "egg").ok()?;
1837
1838 Some(package_url.to_string())
1839}
1840
1841fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1842 let metadata = super::rfc822::parse_rfc822_content(content);
1843 build_package_data_from_rfc822(&metadata, datasource_id)
1844}
1845
/// Converts parsed RFC822-style Python metadata (PKG-INFO / METADATA) into
/// `PackageData`.
///
/// Header values are mapped onto package fields; the message body (or, when
/// the body is empty, the `description` header) supplies the long
/// description. `project-url` headers are classified by label into bug
/// tracker, code view, VCS and homepage URLs. PyPI URLs and the purl are
/// derived from name and version. The result is tagged with `datasource_id`.
fn build_package_data_from_rfc822(
    metadata: &super::rfc822::Rfc822Metadata,
    datasource_id: DatasourceId,
) -> PackageData {
    use super::rfc822::{get_header_all, get_header_first};

    // Single-valued headers. Only some of them pass through `truncate_field`.
    let name = get_header_first(&metadata.headers, "name").map(truncate_field);
    let version = get_header_first(&metadata.headers, "version").map(truncate_field);
    let summary = get_header_first(&metadata.headers, "summary").map(truncate_field);
    let mut homepage_url = get_header_first(&metadata.headers, "home-page").map(truncate_field);
    let author = get_header_first(&metadata.headers, "author").map(truncate_field);
    let author_email = get_header_first(&metadata.headers, "author-email").map(truncate_field);
    let license = get_header_first(&metadata.headers, "license").map(truncate_field);
    let license_expression = get_header_first(&metadata.headers, "license-expression");
    let download_url = get_header_first(&metadata.headers, "download-url");
    let platform = get_header_first(&metadata.headers, "platform");
    let requires_python = get_header_first(&metadata.headers, "requires-python");
    // Multi-valued headers.
    let classifiers = get_header_all(&metadata.headers, "classifier");
    let license_files = get_header_all(&metadata.headers, "license-file");

    // Prefer the message body; fall back to the `description` header.
    let description_body = if metadata.body.is_empty() {
        get_header_first(&metadata.headers, "description").unwrap_or_default()
    } else {
        metadata.body.clone()
    };

    let description = build_description(summary.as_deref(), &description_body).map(truncate_field);

    // A single author party is recorded when either the name or the email
    // header is present.
    let mut parties = Vec::new();
    if author.is_some() || author_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: author,
            email: author_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    // `License ::` classifiers feed the license statement; the rest become
    // keywords.
    let (keywords, license_classifiers) = split_classifiers(&classifiers);
    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
    // When the `license-expression` header normalizes as an SPDX expression,
    // build the declared-license data from it (with the license files as
    // referenced filenames); otherwise fall back to
    // `normalize_spdx_declared_license` on the raw header value.
    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
        license_expression
            .as_deref()
            .and_then(normalize_spdx_expression)
            .map(|normalized| {
                build_declared_license_data(
                    normalized,
                    DeclaredLicenseMatchMetadata::single_line(
                        license_expression.as_deref().unwrap_or_default(),
                    )
                    .with_referenced_filenames(&referenced_license_files),
                )
            })
            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));

    // The raw `license-expression` header wins over the statement assembled
    // from the `license` header and license classifiers.
    let extracted_license_statement = license_expression
        .clone()
        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));

    let mut extra_data = HashMap::new();
    // A platform of "unknown" (any case) or an empty value is not recorded.
    if let Some(platform_value) = platform
        && !platform_value.eq_ignore_ascii_case("unknown")
        && !platform_value.is_empty()
    {
        extra_data.insert(
            "platform".to_string(),
            serde_json::Value::String(platform_value),
        );
    }

    if let Some(requires_python_value) = requires_python
        && !requires_python_value.is_empty()
    {
        extra_data.insert(
            "requires_python".to_string(),
            serde_json::Value::String(requires_python_value),
        );
    }

    if !license_files.is_empty() {
        extra_data.insert(
            "license_files".to_string(),
            serde_json::Value::Array(
                license_files
                    .iter()
                    .cloned()
                    .map(serde_json::Value::String)
                    .collect(),
            ),
        );
    }

    // Each declared license file is also exposed as a file reference.
    let file_references = license_files
        .iter()
        .map(|path| FileReference {
            path: path.clone(),
            size: None,
            sha1: None,
            md5: None,
            sha256: None,
            sha512: None,
            extra_data: None,
        })
        .collect();

    let project_urls = get_header_all(&metadata.headers, "project-url");
    let dependencies = extract_rfc822_dependencies(&metadata.headers);
    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);

    if !project_urls.is_empty() {
        let parsed_urls = parse_project_urls(&project_urls);

        // Classify each URL by its lowercased label. For every category the
        // first matching label wins; `home-page` already set above is never
        // overridden. Because this is an `else if` chain, a label whose
        // category is already filled falls through to the later branches,
        // none of which it can match.
        for (label, url) in &parsed_urls {
            let label_lower = label.to_lowercase();

            if bug_tracking_url.is_none()
                && matches!(
                    label_lower.as_str(),
                    "tracker"
                        | "bug reports"
                        | "bug tracker"
                        | "issues"
                        | "issue tracker"
                        | "github: issues"
                )
            {
                bug_tracking_url = Some(url.clone());
            } else if code_view_url.is_none()
                && matches!(label_lower.as_str(), "source" | "source code" | "code")
            {
                code_view_url = Some(url.clone());
            } else if vcs_url.is_none()
                && matches!(
                    label_lower.as_str(),
                    "github" | "gitlab" | "github: repo" | "repository"
                )
            {
                vcs_url = Some(url.clone());
            } else if homepage_url.is_none()
                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
            {
                homepage_url = Some(url.clone());
            } else if label_lower == "changelog" {
                // A later changelog entry overwrites an earlier one.
                extra_data.insert(
                    "changelog_url".to_string(),
                    serde_json::Value::String(url.clone()),
                );
            }
        }

        // All parsed project URLs are also preserved, keyed by their
        // original label.
        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
            .iter()
            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
            .collect();

        if !project_urls_json.is_empty() {
            extra_data.insert(
                "project_urls".to_string(),
                serde_json::Value::Object(project_urls_json),
            );
        }
    }

    // An empty map is reported as no extra data at all.
    let extra_data = if extra_data.is_empty() {
        None
    } else {
        Some(extra_data)
    };

    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
        build_pypi_urls(name.as_deref(), version.as_deref());

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: Some("Python".to_string()),
        description,
        release_date: None,
        parties,
        keywords,
        homepage_url,
        download_url,
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url,
        code_view_url,
        vcs_url,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references,
        is_private: false,
        is_virtual: false,
        extra_data,
        dependencies,
        repository_homepage_url,
        repository_download_url,
        api_data_url,
        datasource_id: Some(datasource_id),
        purl,
    }
}
2071
/// Splits `Project-URL` header values into `(label, url)` pairs.
///
/// Each value is split at its first comma and both halves are trimmed, so
/// `Label, URL` and `Label,URL` are both accepted; commas inside the URL are
/// preserved. Values without a comma, or with an empty label or URL, are
/// dropped. Input order is kept.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|url_entry| {
            let (label, url) = url_entry.split_once(',')?;
            let label = label.trim();
            let url = url.trim();

            (!label.is_empty() && !url.is_empty()).then(|| (label.to_string(), url.to_string()))
        })
        .collect()
}
2087
/// Combines an optional summary and a body into one description.
///
/// Both parts are trimmed; blank parts are ignored. When both are present
/// they are joined with a newline, summary first. Returns `None` when
/// neither part has content.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let headline = summary.map(str::trim).filter(|text| !text.is_empty());
    let details = Some(body.trim()).filter(|text| !text.is_empty());

    match (headline, details) {
        (Some(headline), Some(details)) => Some(format!("{headline}\n{details}")),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
2106
/// Separates trove classifiers into `(keywords, license_classifiers)`.
///
/// Classifiers starting with `License ::` go to the second list; all others
/// go to the first. Relative order is preserved within each list.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    // `partition` returns the matching items first, so swap on the way out.
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|classifier| classifier.starts_with("License ::"));

    (keywords, license_classifiers)
}
2121
/// Renders the `license` header and license classifiers as a small
/// newline-terminated text block.
///
/// A non-blank license produces a `license: <value>` line (value trimmed).
/// A non-empty classifier list produces a `classifiers:` line followed by
/// one quoted list item per classifier. Returns `None` when there is nothing
/// to render.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let mut statement = String::new();

    if let Some(text) = license.map(str::trim).filter(|text| !text.is_empty()) {
        statement.push_str(&format!("license: {}", text));
        statement.push('\n');
    }

    if !license_classifiers.is_empty() {
        statement.push_str("classifiers:");
        statement.push('\n');
        for classifier in license_classifiers {
            statement.push_str(&format!(" - '{}'", classifier));
            statement.push('\n');
        }
    }

    if statement.is_empty() {
        None
    } else {
        Some(statement)
    }
}
2147
2148pub(crate) fn build_pypi_urls(
2149 name: Option<&str>,
2150 version: Option<&str>,
2151) -> (
2152 Option<String>,
2153 Option<String>,
2154 Option<String>,
2155 Option<String>,
2156) {
2157 let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2158
2159 let repository_download_url = name.and_then(|value| {
2160 version.map(|ver| {
2161 format!(
2162 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2163 &value[..1.min(value.len())],
2164 value,
2165 value,
2166 ver
2167 )
2168 })
2169 });
2170
2171 let api_data_url = name.map(|value| {
2172 if let Some(ver) = version {
2173 format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2174 } else {
2175 format!("https://pypi.org/pypi/{}/json", value)
2176 }
2177 });
2178
2179 let purl = name.and_then(|value| {
2180 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2181 if let Some(ver) = version {
2182 package_url.with_version(ver).ok()?;
2183 }
2184 Some(package_url.to_string())
2185 });
2186
2187 (
2188 repository_homepage_url,
2189 repository_download_url,
2190 api_data_url,
2191 purl,
2192 )
2193}
2194
2195fn build_pypi_purl_with_extension(
2196 name: &str,
2197 version: Option<&str>,
2198 extension: &str,
2199) -> Option<String> {
2200 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2201 if let Some(ver) = version {
2202 package_url.with_version(ver).ok()?;
2203 }
2204 package_url.add_qualifier("extension", extension).ok()?;
2205 Some(package_url.to_string())
2206}
2207
2208fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2209 let toml_content = match read_toml_file(path) {
2210 Ok(content) => content,
2211 Err(e) => {
2212 warn!(
2213 "Failed to read or parse pyproject.toml at {:?}: {}",
2214 path, e
2215 );
2216 return default_package_data(path);
2217 }
2218 };
2219
2220 let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2221 let is_poetry_pyproject = tool_table
2222 .and_then(|tool| tool.get("poetry"))
2223 .and_then(|value| value.as_table())
2224 .is_some();
2225
2226 let project_table =
2228 if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2229 project.clone()
2231 } else if let Some(tool) = tool_table {
2232 if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2233 poetry.clone()
2235 } else {
2236 return default_package_data(path);
2237 }
2238 } else if toml_content.get(FIELD_NAME).is_some() {
2239 match toml_content.as_table() {
2241 Some(table) => table.clone(),
2242 None => {
2243 warn!("Failed to convert TOML content to table in {:?}", path);
2244 return default_package_data(path);
2245 }
2246 }
2247 } else {
2248 return default_package_data(path);
2249 };
2250
2251 let name = project_table
2252 .get(FIELD_NAME)
2253 .and_then(|v| v.as_str())
2254 .map(|v| truncate_field(v.to_string()));
2255
2256 let version = project_table
2257 .get(FIELD_VERSION)
2258 .and_then(|v| v.as_str())
2259 .map(String::from);
2260 let classifiers = project_table
2261 .get("classifiers")
2262 .and_then(|value| value.as_array())
2263 .map(|values| {
2264 values
2265 .iter()
2266 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2267 .collect::<Vec<_>>()
2268 })
2269 .unwrap_or_default();
2270 let (classifier_keywords, license_classifiers) = split_classifiers(&classifiers);
2271
2272 let extracted_license_statement = extract_raw_license_string(&project_table);
2273 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2274 normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2275
2276 let description = project_table
2277 .get(FIELD_DESCRIPTION)
2278 .and_then(|value| value.as_str())
2279 .map(|value| truncate_field(value.to_string()));
2280 let mut keywords = project_table
2281 .get(FIELD_KEYWORDS)
2282 .and_then(|value| value.as_array())
2283 .map(|values| {
2284 values
2285 .iter()
2286 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2287 .collect::<Vec<_>>()
2288 })
2289 .unwrap_or_default();
2290 for classifier in classifier_keywords {
2291 if !keywords.contains(&classifier) {
2292 keywords.push(classifier);
2293 }
2294 }
2295
2296 let mut extra_data = extract_pyproject_extra_data(&toml_content).unwrap_or_default();
2298 let (homepage_url, download_url, bug_tracking_url, code_view_url, repository_url) =
2299 extract_urls(&project_table, &mut extra_data);
2300
2301 let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2302
2303 let purl = name.as_ref().and_then(|n| {
2305 let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2306 Ok(p) => p,
2307 Err(e) => {
2308 warn!(
2309 "Failed to create PackageUrl for Python package '{}': {}",
2310 n, e
2311 );
2312 return None;
2313 }
2314 };
2315
2316 if let Some(v) = &version
2317 && let Err(e) = package_url.with_version(v)
2318 {
2319 warn!(
2320 "Failed to set version '{}' for Python package '{}': {}",
2321 v, n, e
2322 );
2323 return None;
2324 }
2325
2326 Some(package_url.to_string())
2327 });
2328
2329 let api_data_url = name.as_ref().map(|n| {
2330 if let Some(v) = &version {
2331 format!("https://pypi.org/pypi/{}/{}/json", n, v)
2332 } else {
2333 format!("https://pypi.org/pypi/{}/json", n)
2334 }
2335 });
2336
2337 let pypi_homepage_url = name
2338 .as_ref()
2339 .map(|n| format!("https://pypi.org/project/{}", n));
2340
2341 let pypi_download_url = name.as_ref().and_then(|n| {
2342 version.as_ref().map(|v| {
2343 format!(
2344 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2345 &n[..1.min(n.len())],
2346 n,
2347 n,
2348 v
2349 )
2350 })
2351 });
2352
2353 PackageData {
2354 package_type: Some(PythonParser::PACKAGE_TYPE),
2355 namespace: None,
2356 name,
2357 version,
2358 qualifiers: None,
2359 subpath: None,
2360 primary_language: None,
2361 description,
2362 release_date: None,
2363 parties: extract_parties(&project_table),
2364 keywords,
2365 homepage_url: homepage_url.or(pypi_homepage_url),
2366 download_url: download_url
2367 .or_else(|| repository_url.clone())
2368 .or(pypi_download_url),
2369 size: None,
2370 sha1: None,
2371 md5: None,
2372 sha256: None,
2373 sha512: None,
2374 bug_tracking_url,
2375 code_view_url,
2376 vcs_url: repository_url,
2377 copyright: None,
2378 holder: None,
2379 declared_license_expression,
2380 declared_license_expression_spdx,
2381 license_detections,
2382 other_license_expression: None,
2383 other_license_expression_spdx: None,
2384 other_license_detections: Vec::new(),
2385 extracted_license_statement: extracted_license_statement
2386 .or_else(|| build_extracted_license_statement(None, &license_classifiers)),
2387 notice_text: None,
2388 source_packages: Vec::new(),
2389 file_references: Vec::new(),
2390 is_private: has_private_classifier(&classifiers),
2391 is_virtual: false,
2392 extra_data: if extra_data.is_empty() {
2393 None
2394 } else {
2395 Some(extra_data)
2396 },
2397 dependencies: [dependencies, optional_dependencies].concat(),
2398 repository_homepage_url: None,
2399 repository_download_url: None,
2400 api_data_url,
2401 datasource_id: Some(if is_poetry_pyproject {
2402 DatasourceId::PypiPoetryPyprojectToml
2403 } else {
2404 DatasourceId::PypiPyprojectToml
2405 }),
2406 purl,
2407 }
2408}
2409
2410fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2411 let path_str = path.to_string_lossy().replace('\\', "/");
2412 if path_str.contains("/EGG-INFO/PKG-INFO") {
2413 DatasourceId::PypiEggPkginfo
2414 } else if path_str.ends_with(".egg-info/PKG-INFO") {
2415 DatasourceId::PypiEditableEggPkginfo
2416 } else {
2417 DatasourceId::PypiSdistPkginfo
2418 }
2419}
2420
2421fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2422 project
2423 .get(FIELD_LICENSE)
2424 .and_then(|license_value| match license_value {
2425 TomlValue::String(license_str) => Some(license_str.clone()),
2426 TomlValue::Table(license_table) => license_table
2427 .get("text")
2428 .and_then(|v| v.as_str())
2429 .map(|s| s.to_string())
2430 .or_else(|| {
2431 license_table
2432 .get("expression")
2433 .and_then(|v| v.as_str())
2434 .map(|expr| expr.to_string())
2435 }),
2436 _ => None,
2437 })
2438}
2439
2440fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2441 match project.get(FIELD_LICENSE) {
2442 Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2443 Some(TomlValue::Table(license_table)) => license_table
2444 .get("expression")
2445 .and_then(|value| value.as_str()),
2446 _ => None,
2447 }
2448}
2449
2450fn extract_urls(
2451 project: &TomlMap<String, TomlValue>,
2452 extra_data: &mut HashMap<String, serde_json::Value>,
2453) -> ProjectUrls {
2454 let mut homepage_url = None;
2455 let mut download_url = None;
2456 let mut bug_tracking_url = None;
2457 let mut code_view_url = None;
2458 let mut repository_url = None;
2459
2460 if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2462 let parsed_urls: Vec<(String, String)> = urls
2463 .iter()
2464 .filter_map(|(label, value)| {
2465 value
2466 .as_str()
2467 .map(|url| (label.to_string(), url.to_string()))
2468 })
2469 .collect();
2470 apply_project_url_mappings(
2471 &parsed_urls,
2472 &mut homepage_url,
2473 &mut bug_tracking_url,
2474 &mut code_view_url,
2475 &mut repository_url,
2476 extra_data,
2477 );
2478
2479 download_url = urls
2480 .get("Downloads")
2481 .or_else(|| urls.get("downloads"))
2482 .and_then(|v| v.as_str())
2483 .map(String::from);
2484
2485 if homepage_url.is_none() {
2486 homepage_url = urls
2487 .get(FIELD_HOMEPAGE)
2488 .and_then(|v| v.as_str())
2489 .map(String::from);
2490 }
2491 if repository_url.is_none() {
2492 repository_url = urls
2493 .get(FIELD_REPOSITORY)
2494 .and_then(|v| v.as_str())
2495 .map(String::from);
2496 }
2497 }
2498
2499 if homepage_url.is_none() {
2501 homepage_url = project
2502 .get(FIELD_HOMEPAGE)
2503 .and_then(|v| v.as_str())
2504 .map(String::from);
2505 }
2506
2507 if repository_url.is_none() {
2508 repository_url = project
2509 .get(FIELD_REPOSITORY)
2510 .and_then(|v| v.as_str())
2511 .map(String::from);
2512 }
2513
2514 (
2515 homepage_url,
2516 download_url,
2517 bug_tracking_url,
2518 code_view_url,
2519 repository_url,
2520 )
2521}
2522
2523fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2524 let mut parties = Vec::new();
2525
2526 if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2527 for author in authors {
2528 if let Some(author_str) = author.as_str() {
2529 let (name, email) = split_name_email(author_str);
2530 parties.push(Party {
2531 r#type: None,
2532 role: Some("author".to_string()),
2533 name,
2534 email,
2535 url: None,
2536 organization: None,
2537 organization_url: None,
2538 timezone: None,
2539 });
2540 } else if let Some(author_table) = author.as_table() {
2541 let name = author_table
2542 .get("name")
2543 .and_then(|value| value.as_str())
2544 .map(|value| value.to_string());
2545 let email = author_table
2546 .get("email")
2547 .and_then(|value| value.as_str())
2548 .map(|value| value.to_string());
2549 if name.is_some() || email.is_some() {
2550 parties.push(Party {
2551 r#type: None,
2552 role: Some("author".to_string()),
2553 name,
2554 email,
2555 url: None,
2556 organization: None,
2557 organization_url: None,
2558 timezone: None,
2559 });
2560 }
2561 }
2562 }
2563 }
2564
2565 if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2566 for maintainer in maintainers {
2567 if let Some(maintainer_str) = maintainer.as_str() {
2568 let (name, email) = split_name_email(maintainer_str);
2569 parties.push(Party {
2570 r#type: None,
2571 role: Some("maintainer".to_string()),
2572 name,
2573 email,
2574 url: None,
2575 organization: None,
2576 organization_url: None,
2577 timezone: None,
2578 });
2579 } else if let Some(maintainer_table) = maintainer.as_table() {
2580 let name = maintainer_table
2581 .get("name")
2582 .and_then(|value| value.as_str())
2583 .map(|value| value.to_string());
2584 let email = maintainer_table
2585 .get("email")
2586 .and_then(|value| value.as_str())
2587 .map(|value| value.to_string());
2588 if name.is_some() || email.is_some() {
2589 parties.push(Party {
2590 r#type: None,
2591 role: Some("maintainer".to_string()),
2592 name,
2593 email,
2594 url: None,
2595 organization: None,
2596 organization_url: None,
2597 timezone: None,
2598 });
2599 }
2600 }
2601 }
2602 }
2603
2604 parties
2605}
2606
2607fn extract_dependencies(
2608 project: &TomlMap<String, TomlValue>,
2609 toml_content: &TomlValue,
2610) -> (Vec<Dependency>, Vec<Dependency>) {
2611 let mut dependencies = Vec::new();
2612 let mut optional_dependencies = Vec::new();
2613
2614 if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2616 match deps_value {
2617 TomlValue::Array(arr) => {
2618 dependencies = parse_dependency_array(arr, false, None);
2619 }
2620 TomlValue::Table(table) => {
2621 dependencies = parse_dependency_table(table, false, None);
2622 }
2623 _ => {}
2624 }
2625 }
2626
2627 if let Some(opt_deps_table) = project
2629 .get(FIELD_OPTIONAL_DEPENDENCIES)
2630 .and_then(|v| v.as_table())
2631 {
2632 for (extra_name, deps) in opt_deps_table {
2633 match deps {
2634 TomlValue::Array(arr) => {
2635 optional_dependencies.extend(parse_dependency_array(
2636 arr,
2637 true,
2638 Some(extra_name),
2639 ));
2640 }
2641 TomlValue::Table(table) => {
2642 optional_dependencies.extend(parse_dependency_table(
2643 table,
2644 true,
2645 Some(extra_name),
2646 ));
2647 }
2648 _ => {}
2649 }
2650 }
2651 }
2652
2653 if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2655 match dev_deps_value {
2656 TomlValue::Array(arr) => {
2657 optional_dependencies.extend(parse_dependency_array(
2658 arr,
2659 true,
2660 Some(FIELD_DEV_DEPENDENCIES),
2661 ));
2662 }
2663 TomlValue::Table(table) => {
2664 optional_dependencies.extend(parse_dependency_table(
2665 table,
2666 true,
2667 Some(FIELD_DEV_DEPENDENCIES),
2668 ));
2669 }
2670 _ => {}
2671 }
2672 }
2673
2674 if let Some(groups_table) = toml_content
2676 .get("tool")
2677 .and_then(|value| value.as_table())
2678 .and_then(|tool| tool.get("poetry"))
2679 .and_then(|value| value.as_table())
2680 .and_then(|poetry| poetry.get("group"))
2681 .and_then(|value| value.as_table())
2682 {
2683 for (group_name, group_data) in groups_table {
2684 if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2685 match group_deps {
2686 TomlValue::Array(arr) => {
2687 optional_dependencies.extend(parse_dependency_array(
2688 arr,
2689 true,
2690 Some(group_name),
2691 ));
2692 }
2693 TomlValue::Table(table) => {
2694 optional_dependencies.extend(parse_poetry_group_dependency_table(
2695 table,
2696 true,
2697 Some(group_name),
2698 ));
2699 }
2700 _ => {}
2701 }
2702 }
2703 }
2704 }
2705
2706 if let Some(groups_table) = toml_content
2707 .get(FIELD_DEPENDENCY_GROUPS)
2708 .and_then(|value| value.as_table())
2709 {
2710 for (group_name, deps) in groups_table {
2711 match deps {
2712 TomlValue::Array(arr) => {
2713 optional_dependencies.extend(parse_dependency_array(
2714 arr,
2715 true,
2716 Some(group_name),
2717 ));
2718 }
2719 TomlValue::Table(table) => {
2720 optional_dependencies.extend(parse_dependency_table(
2721 table,
2722 true,
2723 Some(group_name),
2724 ));
2725 }
2726 _ => {}
2727 }
2728 }
2729 }
2730
2731 if let Some(dev_deps_value) = toml_content
2732 .get("tool")
2733 .and_then(|value| value.as_table())
2734 .and_then(|tool| tool.get("uv"))
2735 .and_then(|value| value.as_table())
2736 .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2737 {
2738 match dev_deps_value {
2739 TomlValue::Array(arr) => {
2740 optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2741 }
2742 TomlValue::Table(table) => {
2743 optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2744 }
2745 _ => {}
2746 }
2747 }
2748
2749 (dependencies, optional_dependencies)
2750}
2751
2752fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2753 let mut extra_data = HashMap::new();
2754
2755 if let Some(tool_uv) = toml_content
2756 .get("tool")
2757 .and_then(|value| value.as_table())
2758 .and_then(|tool| tool.get("uv"))
2759 {
2760 extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2761 }
2762
2763 if extra_data.is_empty() {
2764 None
2765 } else {
2766 Some(extra_data)
2767 }
2768}
2769
2770fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2771 match value {
2772 TomlValue::String(value) => JsonValue::String(value.clone()),
2773 TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2774 TomlValue::Float(value) => JsonValue::String(value.to_string()),
2775 TomlValue::Boolean(value) => JsonValue::Bool(*value),
2776 TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2777 TomlValue::Array(values) => {
2778 JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2779 }
2780 TomlValue::Table(values) => JsonValue::Object(
2781 values
2782 .iter()
2783 .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2784 .collect::<JsonMap<String, JsonValue>>(),
2785 ),
2786 }
2787}
2788
2789fn parse_dependency_table(
2790 table: &TomlMap<String, TomlValue>,
2791 is_optional: bool,
2792 scope: Option<&str>,
2793) -> Vec<Dependency> {
2794 table
2795 .iter()
2796 .filter_map(|(name, version)| {
2797 let version_str = version.as_str().map(|s| s.to_string());
2798 let mut package_url =
2799 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2800
2801 if let Some(v) = &version_str {
2802 package_url.with_version(v).ok()?;
2803 }
2804
2805 Some(Dependency {
2806 purl: Some(package_url.to_string()),
2807 extracted_requirement: None,
2808 scope: scope.map(|s| s.to_string()),
2809 is_runtime: Some(!is_optional),
2810 is_optional: Some(is_optional),
2811 is_pinned: None,
2812 is_direct: Some(true),
2813 resolved_package: None,
2814 extra_data: None,
2815 })
2816 })
2817 .collect()
2818}
2819
2820fn parse_poetry_group_dependency_table(
2821 table: &TomlMap<String, TomlValue>,
2822 is_optional: bool,
2823 scope: Option<&str>,
2824) -> Vec<Dependency> {
2825 table
2826 .iter()
2827 .filter_map(|(name, value)| build_poetry_group_dependency(name, value, is_optional, scope))
2828 .collect()
2829}
2830
2831fn build_poetry_group_dependency(
2832 name: &str,
2833 value: &TomlValue,
2834 is_optional: bool,
2835 scope: Option<&str>,
2836) -> Option<Dependency> {
2837 let normalized_name = normalize_python_dependency_name(name);
2838 let (version_spec, extras, marker) = match value {
2839 TomlValue::String(spec) => (Some(spec.trim().to_string()), Vec::new(), None),
2840 TomlValue::Table(table) => {
2841 let version_spec = table
2842 .get(FIELD_VERSION)
2843 .and_then(|value| value.as_str())
2844 .map(str::trim)
2845 .filter(|value| !value.is_empty())
2846 .map(ToOwned::to_owned);
2847 let extras = table
2848 .get(FIELD_EXTRAS)
2849 .and_then(|value| value.as_array())
2850 .map(|values| {
2851 values
2852 .iter()
2853 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2854 .collect::<Vec<_>>()
2855 })
2856 .unwrap_or_default();
2857 let marker = table
2858 .get("markers")
2859 .and_then(|value| value.as_str())
2860 .map(str::trim)
2861 .filter(|value| !value.is_empty())
2862 .map(ToOwned::to_owned);
2863
2864 (version_spec, extras, marker)
2865 }
2866 _ => return None,
2867 };
2868
2869 let pinned_version = version_spec
2870 .as_deref()
2871 .and_then(extract_exact_pinned_version);
2872 let purl = build_python_dependency_purl(&normalized_name, pinned_version.as_deref())?;
2873
2874 let mut extra_data = HashMap::new();
2875 if let Some(marker) = marker {
2876 extra_data.insert("marker".to_string(), JsonValue::String(marker));
2877 }
2878 if !extras.is_empty() {
2879 extra_data.insert(
2880 "extras".to_string(),
2881 JsonValue::Array(extras.into_iter().map(JsonValue::String).collect()),
2882 );
2883 }
2884
2885 Some(Dependency {
2886 purl: Some(purl),
2887 extracted_requirement: version_spec,
2888 scope: scope.map(|value| value.to_string()),
2889 is_runtime: Some(!is_optional),
2890 is_optional: Some(is_optional),
2891 is_pinned: Some(pinned_version.is_some()),
2892 is_direct: Some(true),
2893 resolved_package: None,
2894 extra_data: if extra_data.is_empty() {
2895 None
2896 } else {
2897 Some(extra_data)
2898 },
2899 })
2900}
2901
2902fn parse_dependency_array(
2903 array: &[TomlValue],
2904 is_optional: bool,
2905 scope: Option<&str>,
2906) -> Vec<Dependency> {
2907 array
2908 .iter()
2909 .filter_map(|dep| {
2910 let dep_str = dep.as_str()?;
2911 build_pyproject_array_dependency(dep_str, is_optional, scope)
2912 })
2913 .collect()
2914}
2915
2916fn build_pyproject_array_dependency(
2917 dep_str: &str,
2918 is_optional: bool,
2919 scope: Option<&str>,
2920) -> Option<Dependency> {
2921 let parsed = parse_pep508_requirement(dep_str)?;
2922 let name = normalize_python_package_name(&parsed.name);
2923 let pinned_version = parsed
2924 .specifiers
2925 .as_deref()
2926 .and_then(extract_exact_pinned_version);
2927
2928 let purl = build_python_dependency_purl(&name, pinned_version.as_deref())?;
2929
2930 let mut extra_data = HashMap::new();
2931 if let Some(marker) = parsed.marker {
2932 extra_data.insert("marker".to_string(), JsonValue::String(marker));
2933 }
2934 if !parsed.extras.is_empty() {
2935 extra_data.insert(
2936 "extras".to_string(),
2937 JsonValue::Array(parsed.extras.into_iter().map(JsonValue::String).collect()),
2938 );
2939 }
2940
2941 let extracted_requirement = parsed.specifiers.or(parsed.url);
2942
2943 Some(Dependency {
2944 purl: Some(purl),
2945 extracted_requirement: extracted_requirement.clone(),
2946 scope: scope.map(|s| s.to_string()),
2947 is_runtime: Some(!is_optional),
2948 is_optional: Some(is_optional),
2949 is_pinned: Some(pinned_version.is_some()),
2950 is_direct: Some(true),
2951 resolved_package: None,
2952 extra_data: if extra_data.is_empty() {
2953 None
2954 } else {
2955 Some(extra_data)
2956 },
2957 })
2958}
2959
/// Returns the version of a specifier that pins exactly one release.
///
/// Only a single `==<version>` or `===<version>` clause qualifies. Specifier
/// sets with several comma-separated clauses, other operators, empty versions,
/// and `==` prefix matches such as `==1.2.*` all yield `None`, because none of
/// them identifies a single concrete version.
fn extract_exact_pinned_version(specifiers: &str) -> Option<String> {
    let trimmed = specifiers.trim();
    if trimmed.contains(',') {
        return None;
    }

    // `===` must be tested first because `==` is a prefix of it.
    let version = if let Some(arbitrary) = trimmed.strip_prefix("===") {
        arbitrary.trim()
    } else {
        let matched = trimmed.strip_prefix("==")?.trim();
        // A trailing `.*` turns `==` into a prefix match over many releases.
        if matched.ends_with(".*") {
            return None;
        }
        matched
    };

    if version.is_empty() {
        None
    } else {
        Some(version.to_string())
    }
}
2981
/// A Python literal value as produced by `LiteralEvaluator`.
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// An integer or float literal, stored as `f64`.
    Number(f64),
    /// A `True` / `False` literal.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list display whose elements all evaluated successfully.
    List(Vec<Value>),
    /// A tuple display whose elements all evaluated successfully.
    Tuple(Vec<Value>),
    /// A mapping with keys converted to strings.
    Dict(HashMap<String, Value>),
}
2992
/// Evaluates literal Python expressions under depth and node-count limits.
struct LiteralEvaluator {
    /// Named values that `Name` expressions resolve to.
    constants: HashMap<String, Value>,
    /// Evaluation stops once the recursion depth reaches this value.
    max_depth: usize,
    /// Evaluation stops once this many nodes have been visited.
    max_nodes: usize,
    /// Number of expression nodes visited so far across all evaluations.
    nodes_visited: usize,
}
2999
3000impl LiteralEvaluator {
3001 fn new(constants: HashMap<String, Value>) -> Self {
3002 Self {
3003 constants,
3004 max_depth: MAX_SETUP_PY_AST_DEPTH,
3005 max_nodes: MAX_SETUP_PY_AST_NODES,
3006 nodes_visited: 0,
3007 }
3008 }
3009
3010 fn insert_constant(&mut self, name: String, value: Value) {
3011 self.constants.insert(name, value);
3012 }
3013
3014 fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
3015 if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
3016 return None;
3017 }
3018 self.nodes_visited += 1;
3019
3020 match expr {
3021 ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
3022 Some(Value::String(value.to_str().to_string()))
3023 }
3024 ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
3025 Some(Value::Bool(*value))
3026 }
3027 ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
3028 self.evaluate_number(value)
3029 }
3030 ast::Expr::NoneLiteral(_) => Some(Value::None),
3031 ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
3032 ast::Expr::List(ast::ExprList { elts, .. }) => {
3033 let mut values = Vec::new();
3034 for elt in elts {
3035 values.push(self.evaluate_expr(elt, depth + 1)?);
3036 }
3037 Some(Value::List(values))
3038 }
3039 ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
3040 let mut values = Vec::new();
3041 for elt in elts {
3042 values.push(self.evaluate_expr(elt, depth + 1)?);
3043 }
3044 Some(Value::Tuple(values))
3045 }
3046 ast::Expr::Dict(ast::ExprDict { items, .. }) => {
3047 let mut dict = HashMap::new();
3048 for item in items {
3049 let key_expr = item.key.as_ref()?;
3050 let key_value = self.evaluate_expr(key_expr, depth + 1)?;
3051 let key = value_to_string(&key_value)?;
3052 let value = self.evaluate_expr(&item.value, depth + 1)?;
3053 dict.insert(key, value);
3054 }
3055 Some(Value::Dict(dict))
3056 }
3057 ast::Expr::Call(ast::ExprCall {
3058 func, arguments, ..
3059 }) => {
3060 let args = arguments.args.as_ref();
3061 let keywords = arguments.keywords.as_ref();
3062 if keywords.is_empty()
3063 && let Some(name) = dotted_name(func.as_ref(), depth + 1)
3064 && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
3065 {
3066 return self.evaluate_ordered_dict(args, depth + 1);
3067 }
3068
3069 if !args.is_empty() {
3070 return None;
3071 }
3072
3073 if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
3074 && id == "dict"
3075 {
3076 let mut dict = HashMap::new();
3077 for keyword in keywords {
3078 let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
3079 let value = self.evaluate_expr(&keyword.value, depth + 1)?;
3080 dict.insert(key.to_string(), value);
3081 }
3082 return Some(Value::Dict(dict));
3083 }
3084
3085 None
3086 }
3087 _ => None,
3088 }
3089 }
3090
3091 fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
3092 match number {
3093 ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
3094 ast::Number::Float(value) => Some(Value::Number(*value)),
3095 ast::Number::Complex { .. } => None,
3096 }
3097 }
3098
3099 fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
3100 if args.len() != 1 {
3101 return None;
3102 }
3103
3104 let items = match self.evaluate_expr(&args[0], depth)? {
3105 Value::List(items) | Value::Tuple(items) => items,
3106 _ => return None,
3107 };
3108
3109 let mut dict = HashMap::new();
3110 for item in items {
3111 let Value::Tuple(values) = item else {
3112 return None;
3113 };
3114 if values.len() != 2 {
3115 return None;
3116 }
3117 let key = value_to_string(&values[0])?;
3118 dict.insert(key, values[1].clone());
3119 }
3120
3121 Some(Value::Dict(dict))
3122 }
3123}
3124
/// Names under which the `setup` function and its modules are visible in a
/// setup.py file.
#[derive(Default)]
struct SetupAliases {
    /// Names that refer to the setup function itself.
    setup_names: HashSet<String>,
    /// Maps a local module alias to the setup module name it was imported from.
    module_aliases: HashMap<String, String>,
}
3130
3131fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
3132 extract_from_setup_py(path).into_iter().collect()
3133}
3134
3135fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
3136 let content = match read_file_to_string(path, None) {
3137 Ok(content) => content,
3138 Err(e) => {
3139 warn!("Failed to read setup.py at {:?}: {}", path, e);
3140 return Some(default_package_data(path));
3141 }
3142 };
3143
3144 if content.len() > MAX_SETUP_PY_BYTES {
3145 warn!("setup.py too large at {:?}: {} bytes", path, content.len());
3146 let package_data = extract_from_setup_py_regex(&content);
3147 return should_emit_setup_py_package(&package_data).then_some(package_data);
3148 }
3149
3150 let mut package_data = match extract_from_setup_py_ast(&content) {
3151 Ok(Some(data)) => data,
3152 Ok(None) => return Some(default_package_data(path)),
3153 Err(e) => {
3154 warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
3155 extract_from_setup_py_regex(&content)
3156 }
3157 };
3158
3159 if package_data.name.is_none() {
3160 package_data.name = extract_setup_value(&content, "name");
3161 }
3162
3163 if package_data.version.is_none() {
3164 package_data.version = extract_setup_value(&content, "version");
3165 }
3166
3167 if package_data
3168 .version
3169 .as_deref()
3170 .is_some_and(|version| version.trim().is_empty())
3171 {
3172 package_data.version = None;
3173 }
3174
3175 fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
3176 package_data.purl = build_setup_py_purl(
3177 package_data.name.as_deref(),
3178 package_data.version.as_deref(),
3179 );
3180
3181 if should_emit_setup_py_package(&package_data) {
3182 Some(package_data)
3183 } else {
3184 Some(default_package_data(path))
3185 }
3186}
3187
3188fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
3189 package_data.name.is_some()
3190 || package_data.version.is_some()
3191 || package_data.purl.is_some()
3192 || !package_data.dependencies.is_empty()
3193 || package_data.extracted_license_statement.is_some()
3194 || !package_data.license_detections.is_empty()
3195 || !package_data.parties.is_empty()
3196 || package_data.description.is_some()
3197 || package_data.homepage_url.is_some()
3198 || package_data.bug_tracking_url.is_some()
3199 || package_data.code_view_url.is_some()
3200 || package_data.vcs_url.is_some()
3201}
3202
3203fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
3204 if package_data.version.is_some()
3205 && package_data.extracted_license_statement.is_some()
3206 && package_data
3207 .parties
3208 .iter()
3209 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
3210 {
3211 return;
3212 }
3213
3214 let Some(root) = path.parent() else {
3215 return;
3216 };
3217
3218 let dunder_metadata = collect_sibling_dunder_metadata(root, content);
3219
3220 if package_data.version.is_none() {
3221 package_data.version = dunder_metadata.version;
3222 }
3223
3224 if package_data.extracted_license_statement.is_none() {
3225 package_data.extracted_license_statement = dunder_metadata.license;
3226 }
3227
3228 let has_author = package_data
3229 .parties
3230 .iter()
3231 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
3232
3233 if !has_author && let Some(author) = dunder_metadata.author {
3234 package_data.parties.push(Party {
3235 r#type: Some("person".to_string()),
3236 role: Some("author".to_string()),
3237 name: Some(author),
3238 email: None,
3239 url: None,
3240 organization: None,
3241 organization_url: None,
3242 timezone: None,
3243 });
3244 }
3245}
3246
/// Values of module-level dunder assignments found in sibling Python modules.
#[derive(Default)]
struct DunderMetadata {
    /// Value of the first `__version__ = "..."` assignment found.
    version: Option<String>,
    /// Value of the first `__author__ = "..."` assignment found.
    author: Option<String>,
    /// Value of the first `__license__ = "..."` assignment found.
    license: Option<String>,
}
3253
3254fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
3255 let statements = match parse_module(content) {
3256 Ok(parsed) => parsed.into_suite(),
3257 Err(_) => return DunderMetadata::default(),
3258 };
3259
3260 let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3261 let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3262 let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3263 let mut metadata = DunderMetadata::default();
3264 let mut candidate_paths = Vec::new();
3265
3266 for module in imported_dunder_modules(&statements) {
3267 let Some(path) = resolve_imported_module_path(root, &module) else {
3268 continue;
3269 };
3270
3271 candidate_paths.push(path);
3272 }
3273
3274 candidate_paths.extend(referenced_dunder_init_paths(root, content));
3275
3276 let mut seen_paths = HashSet::new();
3277 for path in candidate_paths {
3278 if !seen_paths.insert(path.clone()) {
3279 continue;
3280 }
3281
3282 let Ok(module_content) = read_file_to_string(&path, None) else {
3283 continue;
3284 };
3285
3286 if metadata.version.is_none() {
3287 metadata.version = version_re
3288 .as_ref()
3289 .and_then(|regex| regex.captures(&module_content))
3290 .and_then(|captures| captures.get(1))
3291 .map(|match_| match_.as_str().to_string());
3292 }
3293
3294 if metadata.author.is_none() {
3295 metadata.author = author_re
3296 .as_ref()
3297 .and_then(|regex| regex.captures(&module_content))
3298 .and_then(|captures| captures.get(1))
3299 .map(|match_| match_.as_str().to_string());
3300 }
3301
3302 if metadata.license.is_none() {
3303 metadata.license = license_re
3304 .as_ref()
3305 .and_then(|regex| regex.captures(&module_content))
3306 .and_then(|captures| captures.get(1))
3307 .map(|match_| match_.as_str().to_string());
3308 }
3309
3310 if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
3311 return metadata;
3312 }
3313 }
3314
3315 metadata
3316}
3317
3318fn referenced_dunder_init_paths(root: &Path, content: &str) -> Vec<PathBuf> {
3319 let open_re = match Regex::new(r#"open\(\s*['\"]([^'\"]+__init__\.py)['\"]"#) {
3320 Ok(regex) => regex,
3321 Err(_) => return Vec::new(),
3322 };
3323
3324 open_re
3325 .captures_iter(content)
3326 .filter_map(|captures| captures.get(1).map(|m| m.as_str()))
3327 .filter_map(|relative| {
3328 let relative_path = PathBuf::from(relative);
3329 if relative_path.is_absolute()
3330 || relative_path.components().any(|component| {
3331 matches!(
3332 component,
3333 Component::ParentDir | Component::RootDir | Component::Prefix(_)
3334 )
3335 })
3336 {
3337 return None;
3338 }
3339
3340 let candidate = root.join(relative_path);
3341 candidate.exists().then_some(candidate)
3342 })
3343 .collect()
3344}
3345
3346fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
3347 let mut modules = Vec::new();
3348
3349 for statement in statements {
3350 let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
3351 continue;
3352 };
3353 let Some(module) = module.as_ref().map(|name| name.as_str()) else {
3354 continue;
3355 };
3356 let imports_dunder = names.iter().any(|alias| {
3357 matches!(
3358 alias.name.as_str(),
3359 "__version__" | "__author__" | "__license__"
3360 )
3361 });
3362 if imports_dunder {
3363 modules.push(module.to_string());
3364 }
3365 }
3366
3367 modules
3368}
3369
/// Resolves a dotted module name to an existing file below `root`.
///
/// Candidates are tried in this order: `<root>/<module>.py`,
/// `<root>/<module>/__init__.py`, then the same two below `<root>/src`.
/// Returns `None` when none of them exists.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let mut module_dir = PathBuf::new();
    for part in module.split('.') {
        module_dir.push(part);
    }
    let module_file = module_dir.with_extension("py");

    let src_root = root.join("src");
    for base in [root, src_root.as_path()] {
        let as_file = base.join(&module_file);
        if as_file.exists() {
            return Some(as_file);
        }

        let as_package = base.join(&module_dir).join("__init__.py");
        if as_package.exists() {
            return Some(as_package);
        }
    }

    None
}
3381
3382fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
3398 let statements = parse_module(content)
3399 .map(|parsed| parsed.into_suite())
3400 .map_err(|e| e.to_string())?;
3401 let aliases = collect_setup_aliases(&statements);
3402 let mut evaluator = LiteralEvaluator::new(HashMap::new());
3403 build_setup_py_constants(&statements, &mut evaluator);
3404
3405 let setup_call = find_setup_call(&statements, &aliases);
3406 let Some(call_expr) = setup_call else {
3407 return Ok(None);
3408 };
3409
3410 let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3411 Ok(Some(build_setup_py_package_data(&setup_values)))
3412}
3413
3414fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3415 for stmt in statements {
3416 if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3417 if targets.len() != 1 {
3418 continue;
3419 }
3420
3421 let Some(name) = extract_assign_name(&targets[0]) else {
3422 continue;
3423 };
3424
3425 if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3426 evaluator.insert_constant(name, value);
3427 }
3428 }
3429 }
3430}
3431
3432fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3433 match target {
3434 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3435 _ => None,
3436 }
3437}
3438
3439fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3440 let mut aliases = SetupAliases::default();
3441 aliases.setup_names.insert("setup".to_string());
3442
3443 for stmt in statements {
3444 match stmt {
3445 ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3446 for alias in names {
3447 let module_name = alias.name.as_str();
3448 if !is_setup_module(module_name) {
3449 continue;
3450 }
3451 let alias_name = alias
3452 .asname
3453 .as_ref()
3454 .map(|name| name.as_str())
3455 .unwrap_or(module_name);
3456 aliases
3457 .module_aliases
3458 .insert(alias_name.to_string(), module_name.to_string());
3459 }
3460 }
3461 ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3462 let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3463 continue;
3464 };
3465 if !is_setup_module(module_name) {
3466 continue;
3467 }
3468 for alias in names {
3469 if alias.name.as_str() != "setup" {
3470 continue;
3471 }
3472 let alias_name = alias
3473 .asname
3474 .as_ref()
3475 .map(|name| name.as_str())
3476 .unwrap_or("setup");
3477 aliases.setup_names.insert(alias_name.to_string());
3478 }
3479 }
3480 _ => {}
3481 }
3482 }
3483
3484 aliases
3485}
3486
/// Reports whether `module_name` is one of the modules recognised as
/// providing `setup()`: `setuptools`, `distutils` or `distutils.core`.
/// The comparison is exact and case-sensitive.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3490
3491fn find_setup_call<'a>(
3492 statements: &'a [ast::Stmt],
3493 aliases: &'a SetupAliases,
3494) -> Option<&'a ast::Expr> {
3495 let mut finder = SetupCallFinder {
3496 aliases,
3497 called_function_names: collect_top_level_called_function_names(statements),
3498 nodes_visited: 0,
3499 };
3500 finder.find_in_statements(statements)
3501}
3502
3503fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3504 let mut called = HashSet::new();
3505 collect_called_function_names_in_statements(statements, &mut called);
3506 called
3507}
3508
3509fn collect_called_function_names_in_statements(
3510 statements: &[ast::Stmt],
3511 called: &mut HashSet<String>,
3512) {
3513 for stmt in statements {
3514 match stmt {
3515 ast::Stmt::Expr(ast::StmtExpr { value, .. })
3516 | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3517 collect_called_function_names_in_expr(value.as_ref(), called);
3518 }
3519 ast::Stmt::If(ast::StmtIf {
3520 body,
3521 elif_else_clauses,
3522 ..
3523 }) => {
3524 collect_called_function_names_in_statements(body, called);
3525 for clause in elif_else_clauses {
3526 collect_called_function_names_in_statements(&clause.body, called);
3527 }
3528 }
3529 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3530 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3531 collect_called_function_names_in_statements(body, called);
3532 collect_called_function_names_in_statements(orelse, called);
3533 }
3534 ast::Stmt::With(ast::StmtWith { body, .. }) => {
3535 collect_called_function_names_in_statements(body, called);
3536 }
3537 ast::Stmt::Try(ast::StmtTry {
3538 body,
3539 orelse,
3540 finalbody,
3541 handlers,
3542 ..
3543 }) => {
3544 collect_called_function_names_in_statements(body, called);
3545 collect_called_function_names_in_statements(orelse, called);
3546 collect_called_function_names_in_statements(finalbody, called);
3547 for handler in handlers {
3548 let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3549 body,
3550 ..
3551 }) = handler;
3552 collect_called_function_names_in_statements(body, called);
3553 }
3554 }
3555 _ => {}
3556 }
3557 }
3558}
3559
3560fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3561 if let ast::Expr::Call(ast::ExprCall {
3562 func, arguments, ..
3563 }) = expr
3564 {
3565 if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3566 called.insert(id.as_str().to_string());
3567 }
3568
3569 for arg in arguments.args.iter() {
3570 collect_called_function_names_in_expr(arg, called);
3571 }
3572 for keyword in arguments.keywords.iter() {
3573 collect_called_function_names_in_expr(&keyword.value, called);
3574 }
3575 }
3576}
3577
/// Traversal state used while searching a parsed setup.py for its `setup(...)` call.
struct SetupCallFinder<'a> {
    /// Import aliases consulted when deciding whether a callee refers to `setup`.
    aliases: &'a SetupAliases,
    /// Function names whose definitions the search is allowed to descend into.
    called_function_names: HashSet<String>,
    /// Number of statements and expressions visited so far; the search stops
    /// once this reaches `MAX_SETUP_PY_AST_NODES`.
    nodes_visited: usize,
}
3583
3584impl<'a> SetupCallFinder<'a> {
3585 fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3586 for stmt in statements {
3587 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3588 return None;
3589 }
3590 self.nodes_visited += 1;
3591
3592 let found = match stmt {
3593 ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3594 ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3595 ast::Stmt::If(ast::StmtIf {
3596 body,
3597 elif_else_clauses,
3598 ..
3599 }) => self.find_in_statements(body).or_else(|| {
3600 for clause in elif_else_clauses {
3601 if let Some(found) = self.find_in_statements(&clause.body) {
3602 return Some(found);
3603 }
3604 }
3605 None
3606 }),
3607 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3608 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3609 .find_in_statements(body)
3610 .or_else(|| self.find_in_statements(orelse)),
3611 ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3612 .called_function_names
3613 .contains(name.as_str())
3614 .then(|| self.find_in_statements(body))
3615 .flatten(),
3616 ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3617 ast::Stmt::Try(ast::StmtTry {
3618 body,
3619 orelse,
3620 finalbody,
3621 handlers,
3622 ..
3623 }) => self
3624 .find_in_statements(body)
3625 .or_else(|| self.find_in_statements(orelse))
3626 .or_else(|| self.find_in_statements(finalbody))
3627 .or_else(|| {
3628 for handler in handlers {
3629 let ast::ExceptHandler::ExceptHandler(
3630 ast::ExceptHandlerExceptHandler { body, .. },
3631 ) = handler;
3632 if let Some(found) = self.find_in_statements(body) {
3633 return Some(found);
3634 }
3635 }
3636 None
3637 }),
3638 _ => None,
3639 };
3640
3641 if found.is_some() {
3642 return found;
3643 }
3644 }
3645
3646 None
3647 }
3648
3649 fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3650 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3651 return None;
3652 }
3653 self.nodes_visited += 1;
3654
3655 match expr {
3656 ast::Expr::Call(ast::ExprCall { func, .. })
3657 if is_setup_call(func.as_ref(), self.aliases) =>
3658 {
3659 Some(expr)
3660 }
3661 _ => None,
3662 }
3663 }
3664}
3665
3666fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3667 let Some(dotted) = dotted_name(func, 0) else {
3668 return false;
3669 };
3670
3671 if aliases.setup_names.contains(&dotted) {
3672 return true;
3673 }
3674
3675 let Some(module) = dotted.strip_suffix(".setup") else {
3676 return false;
3677 };
3678
3679 let resolved = resolve_module_alias(module, aliases);
3680 is_setup_module(&resolved)
3681}
3682
3683fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3684 if depth >= MAX_SETUP_PY_AST_DEPTH {
3685 return None;
3686 }
3687
3688 match expr {
3689 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3690 ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3691 let base = dotted_name(value.as_ref(), depth + 1)?;
3692 Some(format!("{}.{}", base, attr.as_str()))
3693 }
3694 _ => None,
3695 }
3696}
3697
3698fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3699 if let Some(mapped) = aliases.module_aliases.get(module) {
3700 return mapped.clone();
3701 }
3702
3703 let Some((base, rest)) = module.split_once('.') else {
3704 return module.to_string();
3705 };
3706
3707 if let Some(mapped) = aliases.module_aliases.get(base) {
3708 return format!("{}.{}", mapped, rest);
3709 }
3710
3711 module.to_string()
3712}
3713
3714fn extract_setup_keywords(
3715 call_expr: &ast::Expr,
3716 evaluator: &mut LiteralEvaluator,
3717) -> HashMap<String, Value> {
3718 let mut values = HashMap::new();
3719 let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3720 return values;
3721 };
3722
3723 for keyword in arguments.keywords.iter() {
3724 if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3725 if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3726 values.insert(arg.to_string(), value);
3727 }
3728 } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3729 for (key, value) in dict {
3730 values.insert(key, value);
3731 }
3732 }
3733 }
3734
3735 values
3736}
3737
3738fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3739 let name = get_value_string(values, "name").map(truncate_field);
3740 let version = get_value_string(values, "version").map(truncate_field);
3741 let description = get_value_string(values, "description")
3742 .or_else(|| get_value_string(values, "summary"))
3743 .map(truncate_field);
3744 let homepage_url = get_value_string(values, "url")
3745 .or_else(|| get_value_string(values, "home_page"))
3746 .map(truncate_field);
3747 let author = get_value_string(values, "author").map(truncate_field);
3748 let author_email = get_value_string(values, "author_email");
3749 let maintainer = get_value_string(values, "maintainer").map(truncate_field);
3750 let maintainer_email = get_value_string(values, "maintainer_email");
3751 let license = get_value_string(values, "license").map(truncate_field);
3752 let classifiers = values
3753 .get("classifiers")
3754 .and_then(value_to_string_list)
3755 .unwrap_or_default();
3756
3757 let mut parties = Vec::new();
3758 if author.is_some() || author_email.is_some() {
3759 parties.push(Party {
3760 r#type: Some("person".to_string()),
3761 role: Some("author".to_string()),
3762 name: author,
3763 email: author_email,
3764 url: None,
3765 organization: None,
3766 organization_url: None,
3767 timezone: None,
3768 });
3769 }
3770
3771 if maintainer.is_some() || maintainer_email.is_some() {
3772 parties.push(Party {
3773 r#type: Some("person".to_string()),
3774 role: Some("maintainer".to_string()),
3775 name: maintainer,
3776 email: maintainer_email,
3777 url: None,
3778 organization: None,
3779 organization_url: None,
3780 timezone: None,
3781 });
3782 }
3783
3784 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3785 normalize_spdx_declared_license(license.as_deref());
3786 let extracted_license_statement = license.clone();
3787
3788 let dependencies = build_setup_py_dependencies(values);
3789 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3790 let mut homepage_from_project_urls = None;
3791 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3792 let mut extra_data = HashMap::new();
3793
3794 if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3795 apply_project_url_mappings(
3796 &parsed_project_urls,
3797 &mut homepage_from_project_urls,
3798 &mut bug_tracking_url,
3799 &mut code_view_url,
3800 &mut vcs_url,
3801 &mut extra_data,
3802 );
3803 }
3804
3805 let extra_data = if extra_data.is_empty() {
3806 None
3807 } else {
3808 Some(extra_data)
3809 };
3810
3811 PackageData {
3812 package_type: Some(PythonParser::PACKAGE_TYPE),
3813 namespace: None,
3814 name,
3815 version,
3816 qualifiers: None,
3817 subpath: None,
3818 primary_language: Some("Python".to_string()),
3819 description,
3820 release_date: None,
3821 parties,
3822 keywords: Vec::new(),
3823 homepage_url: homepage_url.or(homepage_from_project_urls),
3824 download_url: None,
3825 size: None,
3826 sha1: None,
3827 md5: None,
3828 sha256: None,
3829 sha512: None,
3830 bug_tracking_url,
3831 code_view_url,
3832 vcs_url,
3833 copyright: None,
3834 holder: None,
3835 declared_license_expression,
3836 declared_license_expression_spdx,
3837 license_detections,
3838 other_license_expression: None,
3839 other_license_expression_spdx: None,
3840 other_license_detections: Vec::new(),
3841 extracted_license_statement,
3842 notice_text: None,
3843 source_packages: Vec::new(),
3844 file_references: Vec::new(),
3845 is_private: has_private_classifier(&classifiers),
3846 is_virtual: false,
3847 extra_data,
3848 dependencies,
3849 repository_homepage_url: None,
3850 repository_download_url: None,
3851 api_data_url: None,
3852 datasource_id: Some(DatasourceId::PypiSetupPy),
3853 purl,
3854 }
3855}
3856
3857fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3858 let mut dependencies = Vec::new();
3859
3860 if let Some(reqs) = values
3861 .get("install_requires")
3862 .and_then(value_to_string_list)
3863 {
3864 dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3865 }
3866
3867 if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3868 dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3869 }
3870
3871 if let Some(Value::Dict(extras)) = values.get("extras_require") {
3872 let mut extra_items: Vec<_> = extras.iter().collect();
3873 extra_items.sort_by_key(|(name, _)| *name);
3874 for (extra_name, extra_value) in extra_items {
3875 if let Some(reqs) = value_to_string_list(extra_value) {
3876 dependencies.extend(build_setup_py_dependency_list(
3877 reqs.as_slice(),
3878 extra_name,
3879 true,
3880 ));
3881 }
3882 }
3883 }
3884
3885 dependencies
3886}
3887
3888fn build_setup_py_dependency_list(
3889 reqs: &[String],
3890 scope: &str,
3891 is_optional: bool,
3892) -> Vec<Dependency> {
3893 reqs.iter()
3894 .filter_map(|req| build_python_dependency(req, scope, is_optional, None))
3895 .collect()
3896}
3897
3898fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3899 values.get(key).and_then(value_to_string)
3900}
3901
3902fn value_to_string(value: &Value) -> Option<String> {
3903 match value {
3904 Value::String(value) => Some(value.clone()),
3905 Value::Number(value) => Some(value.to_string()),
3906 Value::Bool(value) => Some(value.to_string()),
3907 _ => None,
3908 }
3909}
3910
3911fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3912 match value {
3913 Value::String(value) => Some(vec![value.clone()]),
3914 Value::List(values) | Value::Tuple(values) => {
3915 let mut items = Vec::new();
3916 for item in values {
3917 items.push(value_to_string(item)?);
3918 }
3919 Some(items)
3920 }
3921 _ => None,
3922 }
3923}
3924
3925fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3926 let Value::Dict(dict) = value else {
3927 return None;
3928 };
3929
3930 let mut pairs: Vec<(String, String)> = dict
3931 .iter()
3932 .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3933 .collect::<Option<Vec<_>>>()?;
3934 pairs.sort_by(|left, right| left.0.cmp(&right.0));
3935 Some(pairs)
3936}
3937
3938fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3939 let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3940 extract_requires_dist_dependencies(&requires_dist)
3941}
3942
3943pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3944 requires_dist
3945 .iter()
3946 .filter_map(|entry| build_rfc822_dependency(entry))
3947 .collect()
3948}
3949
3950fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3951 build_python_dependency(entry, "install", false, None)
3952}
3953
3954fn build_python_dependency(
3955 entry: &str,
3956 default_scope: &str,
3957 default_optional: bool,
3958 marker_override: Option<&str>,
3959) -> Option<Dependency> {
3960 let (requirement_part, marker_part) = entry
3961 .split_once(';')
3962 .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3963 .unwrap_or((entry.trim(), None));
3964
3965 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3966 let requirement = normalize_rfc822_requirement(requirement_part);
3967 let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3968 marker_part.or(marker_override),
3969 default_scope,
3970 default_optional,
3971 );
3972 let purl = build_python_dependency_purl(&name, None)?;
3973
3974 let is_pinned = requirement
3975 .as_deref()
3976 .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3977 let purl = if is_pinned {
3978 requirement
3979 .as_deref()
3980 .map(|req| req.trim_start_matches('='))
3981 .and_then(|version| build_python_dependency_purl(&name, Some(version)))
3982 .unwrap_or(purl)
3983 } else {
3984 purl
3985 };
3986
3987 let mut extra_data = HashMap::new();
3988 extra_data.extend(marker_data);
3989 if let Some(marker) = marker {
3990 extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3991 }
3992
3993 Some(Dependency {
3994 purl: Some(purl),
3995 extracted_requirement: requirement,
3996 scope: Some(scope),
3997 is_runtime: Some(true),
3998 is_optional: Some(is_optional),
3999 is_pinned: Some(is_pinned),
4000 is_direct: Some(true),
4001 resolved_package: None,
4002 extra_data: if extra_data.is_empty() {
4003 None
4004 } else {
4005 Some(extra_data)
4006 },
4007 })
4008}
4009
4010fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
4011 let name = extract_setup_cfg_dependency_name(requirement_part)?;
4012 let trimmed = requirement_part.trim();
4013 let mut remainder = trimmed[name.len()..].trim();
4014
4015 if let Some(stripped) = remainder.strip_prefix('[')
4016 && let Some(end_idx) = stripped.find(']')
4017 {
4018 remainder = stripped[end_idx + 1..].trim();
4019 }
4020
4021 let remainder = remainder
4022 .strip_prefix('(')
4023 .and_then(|value| value.strip_suffix(')'))
4024 .unwrap_or(remainder)
4025 .trim();
4026
4027 if remainder.is_empty() {
4028 return None;
4029 }
4030
4031 let mut specifiers: Vec<String> = remainder
4032 .split(',')
4033 .map(|specifier| specifier.trim().replace(' ', ""))
4034 .filter(|specifier| !specifier.is_empty())
4035 .collect();
4036 specifiers.sort();
4037 Some(specifiers.join(","))
4038}
4039
/// Percent-encodes every `*` in a version as `%2A` so wildcard versions can
/// be embedded in a purl. All other characters are left untouched.
fn encode_python_dependency_purl_version(version: &str) -> String {
    version.split('*').collect::<Vec<_>>().join("%2A")
}
4043
4044fn build_python_dependency_purl(name: &str, version: Option<&str>) -> Option<String> {
4045 let normalized_name = normalize_python_dependency_name(name);
4046
4047 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &normalized_name)
4048 .ok()
4049 .map(|_| match version {
4050 Some(version) => {
4051 format!(
4052 "pkg:pypi/{normalized_name}@{}",
4053 encode_python_dependency_purl_version(version)
4054 )
4055 }
4056 None => format!("pkg:pypi/{normalized_name}"),
4057 })
4058}
4059
/// Normalises a package name for use in a purl: surrounding whitespace is
/// removed, ASCII letters are lower-cased and `_` becomes `-`.
fn normalize_python_dependency_name(name: &str) -> String {
    name.trim()
        .chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
4063
4064fn parse_rfc822_marker(
4065 marker_part: Option<&str>,
4066 default_scope: &str,
4067 default_optional: bool,
4068) -> (
4069 String,
4070 bool,
4071 Option<String>,
4072 HashMap<String, serde_json::Value>,
4073) {
4074 let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
4075 return (
4076 default_scope.to_string(),
4077 default_optional,
4078 None,
4079 HashMap::new(),
4080 );
4081 };
4082
4083 let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
4084 .expect("extra marker regex should compile");
4085 let mut extra_data = HashMap::new();
4086
4087 if let Some(python_version) = extract_marker_field(marker, "python_version") {
4088 extra_data.insert(
4089 "python_version".to_string(),
4090 serde_json::Value::String(python_version),
4091 );
4092 }
4093 if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
4094 extra_data.insert(
4095 "sys_platform".to_string(),
4096 serde_json::Value::String(sys_platform),
4097 );
4098 }
4099
4100 if let Some(captures) = extra_re.captures(marker)
4101 && let Some(scope) = captures.get(1)
4102 {
4103 return (
4104 scope.as_str().to_string(),
4105 true,
4106 Some(marker.trim().to_string()),
4107 extra_data,
4108 );
4109 }
4110
4111 (
4112 default_scope.to_string(),
4113 default_optional,
4114 Some(marker.trim().to_string()),
4115 extra_data,
4116 )
4117}
4118
4119fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
4120 let re = Regex::new(&format!(
4121 r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
4122 field
4123 ))
4124 .ok()?;
4125 let captures = re.captures(marker)?;
4126 let operator = captures.get(1)?.as_str();
4127 let value = captures.get(2)?.as_str();
4128 Some(format!("{} {}", operator, value))
4129}
4130
4131fn parse_requires_txt(content: &str) -> Vec<Dependency> {
4132 let mut dependencies = Vec::new();
4133 let mut current_scope = "install".to_string();
4134 let mut current_optional = false;
4135 let mut current_marker: Option<String> = None;
4136 let mut line_count = 0usize;
4137
4138 for line in content.lines() {
4139 line_count += 1;
4140 if line_count > MAX_ITERATION_COUNT {
4141 warn!(
4142 "Exceeded max line count in requires.txt; stopping at {} lines",
4143 MAX_ITERATION_COUNT
4144 );
4145 break;
4146 }
4147 let trimmed = line.trim();
4148 if trimmed.is_empty() || trimmed.starts_with('#') {
4149 continue;
4150 }
4151
4152 if trimmed.starts_with('[') && trimmed.ends_with(']') {
4153 let inner = &trimmed[1..trimmed.len() - 1];
4154 if let Some(rest) = inner.strip_prefix(':') {
4155 current_scope = "install".to_string();
4156 current_optional = false;
4157 current_marker = Some(rest.trim().to_string());
4158 } else if let Some((scope, marker)) = inner.split_once(':') {
4159 current_scope = scope.trim().to_string();
4160 current_optional = true;
4161 current_marker = Some(marker.trim().to_string());
4162 } else {
4163 current_scope = inner.trim().to_string();
4164 current_optional = true;
4165 current_marker = None;
4166 }
4167 continue;
4168 }
4169
4170 if let Some(dependency) = build_python_dependency(
4171 trimmed,
4172 ¤t_scope,
4173 current_optional,
4174 current_marker.as_deref(),
4175 ) {
4176 dependencies.push(dependency);
4177 }
4178 }
4179
4180 dependencies
4181}
4182
/// Reports whether any classifier equals `Private :: Do Not Upload`,
/// ignoring ASCII case. The whole classifier must match.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
4188
4189fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
4190 let name = name?;
4191 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
4192 if let Some(version) = version {
4193 package_url.with_version(version).ok()?;
4194 }
4195 Some(package_url.to_string())
4196}
4197
4198fn extract_from_setup_py_regex(content: &str) -> PackageData {
4199 let name = extract_setup_value(content, "name").map(truncate_field);
4200 let version = extract_setup_value(content, "version").map(truncate_field);
4201 let license_expression = extract_setup_value(content, "license").map(truncate_field);
4202
4203 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4204 normalize_spdx_declared_license(license_expression.as_deref());
4205 let extracted_license_statement = license_expression.clone();
4206
4207 let dependencies = extract_setup_py_dependencies(content);
4208 let homepage_url = extract_setup_value(content, "url").map(truncate_field);
4209 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
4210
4211 PackageData {
4212 package_type: Some(PythonParser::PACKAGE_TYPE),
4213 namespace: None,
4214 name,
4215 version,
4216 qualifiers: None,
4217 subpath: None,
4218 primary_language: Some("Python".to_string()),
4219 description: None,
4220 release_date: None,
4221 parties: Vec::new(),
4222 keywords: Vec::new(),
4223 homepage_url,
4224 download_url: None,
4225 size: None,
4226 sha1: None,
4227 md5: None,
4228 sha256: None,
4229 sha512: None,
4230 bug_tracking_url: None,
4231 code_view_url: None,
4232 vcs_url: None,
4233 copyright: None,
4234 holder: None,
4235 declared_license_expression,
4236 declared_license_expression_spdx,
4237 license_detections,
4238 other_license_expression: None,
4239 other_license_expression_spdx: None,
4240 other_license_detections: Vec::new(),
4241 extracted_license_statement,
4242 notice_text: None,
4243 source_packages: Vec::new(),
4244 file_references: Vec::new(),
4245 is_private: false,
4246 is_virtual: false,
4247 extra_data: None,
4248 dependencies,
4249 repository_homepage_url: None,
4250 repository_download_url: None,
4251 api_data_url: None,
4252 datasource_id: Some(DatasourceId::PypiSetupPy),
4253 purl,
4254 }
4255}
4256
4257fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
4258 crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
4259}
4260
4261fn extract_from_pypi_json(path: &Path) -> PackageData {
4262 let default = PackageData {
4263 package_type: Some(PythonParser::PACKAGE_TYPE),
4264 datasource_id: Some(DatasourceId::PypiJson),
4265 ..Default::default()
4266 };
4267
4268 let content = match read_file_to_string(path, None) {
4269 Ok(content) => content,
4270 Err(error) => {
4271 warn!("Failed to read pypi.json at {:?}: {}", path, error);
4272 return default;
4273 }
4274 };
4275
4276 let root: serde_json::Value = match serde_json::from_str(&content) {
4277 Ok(value) => value,
4278 Err(error) => {
4279 warn!("Failed to parse pypi.json at {:?}: {}", path, error);
4280 return default;
4281 }
4282 };
4283
4284 let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
4285 warn!("No info object found in pypi.json at {:?}", path);
4286 return default;
4287 };
4288
4289 let name = info
4290 .get("name")
4291 .and_then(|value| value.as_str())
4292 .map(|v| truncate_field(v.to_owned()));
4293 let version = info
4294 .get("version")
4295 .and_then(|value| value.as_str())
4296 .map(ToOwned::to_owned);
4297 let summary = info
4298 .get("summary")
4299 .and_then(|value| value.as_str())
4300 .map(|v| truncate_field(v.to_owned()));
4301 let description = info
4302 .get("description")
4303 .and_then(|value| value.as_str())
4304 .filter(|value| !value.trim().is_empty())
4305 .map(|v| truncate_field(v.to_owned()))
4306 .or(summary);
4307 let mut homepage_url = info
4308 .get("home_page")
4309 .and_then(|value| value.as_str())
4310 .map(|v| truncate_field(v.to_owned()));
4311 let author = info
4312 .get("author")
4313 .and_then(|value| value.as_str())
4314 .filter(|value| !value.trim().is_empty())
4315 .map(|v| truncate_field(v.to_owned()));
4316 let author_email = info
4317 .get("author_email")
4318 .and_then(|value| value.as_str())
4319 .filter(|value| !value.trim().is_empty())
4320 .map(ToOwned::to_owned);
4321 let license = info
4322 .get("license")
4323 .and_then(|value| value.as_str())
4324 .filter(|value| !value.trim().is_empty())
4325 .map(ToOwned::to_owned);
4326 let keywords = parse_setup_cfg_keywords(
4327 info.get("keywords")
4328 .and_then(|value| value.as_str())
4329 .map(ToOwned::to_owned),
4330 );
4331 let classifiers = info
4332 .get("classifiers")
4333 .and_then(|value| value.as_array())
4334 .map(|values| {
4335 values
4336 .iter()
4337 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
4338 .collect::<Vec<_>>()
4339 })
4340 .unwrap_or_default();
4341
4342 let mut parties = Vec::new();
4343 if author.is_some() || author_email.is_some() {
4344 parties.push(Party {
4345 r#type: Some("person".to_string()),
4346 role: Some("author".to_string()),
4347 name: author,
4348 email: author_email,
4349 url: None,
4350 organization: None,
4351 organization_url: None,
4352 timezone: None,
4353 });
4354 }
4355
4356 let mut bug_tracking_url = None;
4357 let mut code_view_url = None;
4358 let mut vcs_url = None;
4359 let mut extra_data = HashMap::new();
4360
4361 let parsed_project_urls = info
4362 .get("project_urls")
4363 .and_then(|value| value.as_object())
4364 .map(|map| {
4365 let mut pairs: Vec<(String, String)> = map
4366 .iter()
4367 .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
4368 .collect();
4369 pairs.sort_by(|left, right| left.0.cmp(&right.0));
4370 pairs
4371 })
4372 .unwrap_or_default();
4373
4374 apply_project_url_mappings(
4375 &parsed_project_urls,
4376 &mut homepage_url,
4377 &mut bug_tracking_url,
4378 &mut code_view_url,
4379 &mut vcs_url,
4380 &mut extra_data,
4381 );
4382
4383 let (download_url, size, sha256) = root
4384 .get("urls")
4385 .and_then(|value| value.as_array())
4386 .map(|urls| select_pypi_json_artifact(urls))
4387 .unwrap_or((None, None, None));
4388
4389 let sha256 = sha256.and_then(|h| Sha256Digest::from_hex(&h).ok());
4390
4391 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4392 normalize_spdx_declared_license(license.as_deref());
4393 let dependencies = info
4394 .get("requires_dist")
4395 .and_then(|value| value.as_array())
4396 .map(|entries| {
4397 entries
4398 .iter()
4399 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4400 .collect::<Vec<_>>()
4401 })
4402 .map(|entries| extract_requires_dist_dependencies(&entries))
4403 .unwrap_or_default();
4404
4405 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
4406 build_pypi_urls(name.as_deref(), version.as_deref());
4407
4408 PackageData {
4409 package_type: Some(PythonParser::PACKAGE_TYPE),
4410 namespace: None,
4411 name,
4412 version,
4413 qualifiers: None,
4414 subpath: None,
4415 primary_language: None,
4416 description,
4417 release_date: None,
4418 parties,
4419 keywords,
4420 homepage_url: homepage_url.or(repository_homepage_url.clone()),
4421 download_url,
4422 size,
4423 sha1: None,
4424 md5: None,
4425 sha256,
4426 sha512: None,
4427 bug_tracking_url,
4428 code_view_url,
4429 vcs_url,
4430 copyright: None,
4431 holder: None,
4432 declared_license_expression,
4433 declared_license_expression_spdx,
4434 license_detections,
4435 other_license_expression: None,
4436 other_license_expression_spdx: None,
4437 other_license_detections: Vec::new(),
4438 extracted_license_statement: license,
4439 notice_text: None,
4440 source_packages: Vec::new(),
4441 file_references: Vec::new(),
4442 is_private: has_private_classifier(&classifiers),
4443 is_virtual: false,
4444 extra_data: if extra_data.is_empty() {
4445 None
4446 } else {
4447 Some(extra_data)
4448 },
4449 dependencies,
4450 repository_homepage_url,
4451 repository_download_url,
4452 api_data_url,
4453 datasource_id: Some(DatasourceId::PypiJson),
4454 purl,
4455 }
4456}
4457
4458fn select_pypi_json_artifact(
4459 urls: &[serde_json::Value],
4460) -> (Option<String>, Option<u64>, Option<String>) {
4461 let selected = urls
4462 .iter()
4463 .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4464 .or_else(|| urls.first());
4465
4466 let Some(entry) = selected else {
4467 return (None, None, None);
4468 };
4469
4470 let download_url = entry
4471 .get("url")
4472 .and_then(|value| value.as_str())
4473 .map(ToOwned::to_owned);
4474 let size = entry.get("size").and_then(|value| value.as_u64());
4475 let sha256 = entry
4476 .get("digests")
4477 .and_then(|value| value.as_object())
4478 .and_then(|digests| digests.get("sha256"))
4479 .and_then(|value| value.as_str())
4480 .map(ToOwned::to_owned);
4481
4482 (download_url, size, sha256)
4483}
4484
4485fn extract_from_pip_inspect(path: &Path) -> PackageData {
4486 let content = match read_file_to_string(path, None) {
4487 Ok(content) => content,
4488 Err(e) => {
4489 warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4490 return default_package_data(path);
4491 }
4492 };
4493
4494 let root: serde_json::Value = match serde_json::from_str(&content) {
4495 Ok(value) => value,
4496 Err(e) => {
4497 warn!(
4498 "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4499 path, e
4500 );
4501 return default_package_data(path);
4502 }
4503 };
4504
4505 let installed = match root.get("installed").and_then(|v| v.as_array()) {
4506 Some(arr) => arr,
4507 None => {
4508 warn!(
4509 "No 'installed' array found in pip-inspect.deplock at {:?}",
4510 path
4511 );
4512 return default_package_data(path);
4513 }
4514 };
4515
4516 let pip_version = root
4517 .get("pip_version")
4518 .and_then(|v| v.as_str())
4519 .map(String::from);
4520 let inspect_version = root
4521 .get("version")
4522 .and_then(|v| v.as_str())
4523 .map(String::from);
4524
4525 let mut main_package: Option<PackageData> = None;
4526 let mut dependencies: Vec<Dependency> = Vec::new();
4527
4528 for package_entry in installed {
4529 let metadata = match package_entry.get("metadata") {
4530 Some(m) => m,
4531 None => continue,
4532 };
4533
4534 let is_requested = package_entry
4535 .get("requested")
4536 .and_then(|v| v.as_bool())
4537 .unwrap_or(false);
4538 let has_direct_url = package_entry.get("direct_url").is_some();
4539
4540 let name = metadata
4541 .get("name")
4542 .and_then(|v| v.as_str())
4543 .map(|v| truncate_field(v.to_string()));
4544 let version = metadata
4545 .get("version")
4546 .and_then(|v| v.as_str())
4547 .map(String::from);
4548 let summary = metadata
4549 .get("summary")
4550 .and_then(|v| v.as_str())
4551 .map(|v| truncate_field(v.to_string()));
4552 let home_page = metadata
4553 .get("home_page")
4554 .and_then(|v| v.as_str())
4555 .map(|v| truncate_field(v.to_string()));
4556 let author = metadata
4557 .get("author")
4558 .and_then(|v| v.as_str())
4559 .map(|v| truncate_field(v.to_string()));
4560 let author_email = metadata
4561 .get("author_email")
4562 .and_then(|v| v.as_str())
4563 .map(String::from);
4564 let license = metadata
4565 .get("license")
4566 .and_then(|v| v.as_str())
4567 .map(|v| truncate_field(v.to_string()));
4568 let description = metadata
4569 .get("description")
4570 .and_then(|v| v.as_str())
4571 .map(|v| truncate_field(v.to_string()));
4572 let keywords = metadata
4573 .get("keywords")
4574 .and_then(|v| v.as_array())
4575 .map(|arr| {
4576 arr.iter()
4577 .filter_map(|k| k.as_str().map(String::from))
4578 .collect::<Vec<_>>()
4579 })
4580 .unwrap_or_default();
4581
4582 let mut parties = Vec::new();
4583 if author.is_some() || author_email.is_some() {
4584 parties.push(Party {
4585 r#type: Some("person".to_string()),
4586 role: Some("author".to_string()),
4587 name: author,
4588 email: author_email,
4589 url: None,
4590 organization: None,
4591 organization_url: None,
4592 timezone: None,
4593 });
4594 }
4595
4596 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4597 normalize_spdx_declared_license(license.as_deref());
4598 let extracted_license_statement = license.clone();
4599 let requires_dist = metadata
4600 .get("requires_dist")
4601 .and_then(|v| v.as_array())
4602 .map(|entries| {
4603 entries
4604 .iter()
4605 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4606 .collect::<Vec<_>>()
4607 })
4608 .unwrap_or_default();
4609 let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4610
4611 let purl = name.as_ref().and_then(|n| {
4612 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4613 if let Some(v) = &version {
4614 package_url.with_version(v).ok()?;
4615 }
4616 Some(package_url.to_string())
4617 });
4618
4619 if is_requested && has_direct_url {
4620 let mut extra_data = HashMap::new();
4621 if let Some(pv) = &pip_version {
4622 extra_data.insert(
4623 "pip_version".to_string(),
4624 serde_json::Value::String(pv.clone()),
4625 );
4626 }
4627 if let Some(iv) = &inspect_version {
4628 extra_data.insert(
4629 "inspect_version".to_string(),
4630 serde_json::Value::String(iv.clone()),
4631 );
4632 }
4633
4634 main_package = Some(PackageData {
4635 package_type: Some(PythonParser::PACKAGE_TYPE),
4636 namespace: None,
4637 name,
4638 version,
4639 qualifiers: None,
4640 subpath: None,
4641 primary_language: Some("Python".to_string()),
4642 description: description.or(summary),
4643 release_date: None,
4644 parties,
4645 keywords,
4646 homepage_url: home_page,
4647 download_url: None,
4648 size: None,
4649 sha1: None,
4650 md5: None,
4651 sha256: None,
4652 sha512: None,
4653 bug_tracking_url: None,
4654 code_view_url: None,
4655 vcs_url: None,
4656 copyright: None,
4657 holder: None,
4658 declared_license_expression,
4659 declared_license_expression_spdx,
4660 license_detections,
4661 other_license_expression: None,
4662 other_license_expression_spdx: None,
4663 other_license_detections: Vec::new(),
4664 extracted_license_statement,
4665 notice_text: None,
4666 source_packages: Vec::new(),
4667 file_references: Vec::new(),
4668 is_private: false,
4669 is_virtual: true,
4670 extra_data: if extra_data.is_empty() {
4671 None
4672 } else {
4673 Some(extra_data)
4674 },
4675 dependencies: parsed_dependencies,
4676 repository_homepage_url: None,
4677 repository_download_url: None,
4678 api_data_url: None,
4679 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4680 purl,
4681 });
4682 } else {
4683 let resolved_package = PackageData {
4684 package_type: Some(PythonParser::PACKAGE_TYPE),
4685 namespace: None,
4686 name: name.clone(),
4687 version: version.clone(),
4688 qualifiers: None,
4689 subpath: None,
4690 primary_language: Some("Python".to_string()),
4691 description: description.or(summary),
4692 release_date: None,
4693 parties,
4694 keywords,
4695 homepage_url: home_page,
4696 download_url: None,
4697 size: None,
4698 sha1: None,
4699 md5: None,
4700 sha256: None,
4701 sha512: None,
4702 bug_tracking_url: None,
4703 code_view_url: None,
4704 vcs_url: None,
4705 copyright: None,
4706 holder: None,
4707 declared_license_expression,
4708 declared_license_expression_spdx,
4709 license_detections,
4710 other_license_expression: None,
4711 other_license_expression_spdx: None,
4712 other_license_detections: Vec::new(),
4713 extracted_license_statement,
4714 notice_text: None,
4715 source_packages: Vec::new(),
4716 file_references: Vec::new(),
4717 is_private: false,
4718 is_virtual: true,
4719 extra_data: None,
4720 dependencies: parsed_dependencies,
4721 repository_homepage_url: None,
4722 repository_download_url: None,
4723 api_data_url: None,
4724 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4725 purl: purl.clone(),
4726 };
4727
4728 let resolved = package_data_to_resolved(&resolved_package);
4729 dependencies.push(Dependency {
4730 purl,
4731 extracted_requirement: None,
4732 scope: None,
4733 is_runtime: Some(true),
4734 is_optional: Some(false),
4735 is_pinned: Some(true),
4736 is_direct: Some(is_requested),
4737 resolved_package: Some(Box::new(resolved)),
4738 extra_data: None,
4739 });
4740 }
4741 }
4742
4743 if let Some(mut main_pkg) = main_package {
4744 let direct_requirement_purls: HashSet<String> = main_pkg
4745 .dependencies
4746 .iter()
4747 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4748 .collect();
4749
4750 let resolved_requirement_purls: HashSet<String> = dependencies
4751 .iter()
4752 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4753 .collect();
4754
4755 let unresolved_dependencies = main_pkg
4756 .dependencies
4757 .iter()
4758 .filter(|dep| {
4759 dep.purl.as_ref().is_some_and(|purl| {
4760 !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4761 })
4762 })
4763 .cloned()
4764 .collect::<Vec<_>>();
4765
4766 for dependency in &mut dependencies {
4767 if dependency
4768 .purl
4769 .as_ref()
4770 .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4771 {
4772 dependency.is_direct = Some(true);
4773 }
4774 }
4775
4776 main_pkg.dependencies = dependencies;
4777 main_pkg.dependencies.extend(unresolved_dependencies);
4778 main_pkg
4779 } else {
4780 default_package_data(path)
4781 }
4782}
4783
/// Returns `purl` with everything from the first `@` onward removed.
///
/// A string without `@` is returned unchanged.
fn base_dependency_purl(purl: &str) -> String {
    match purl.find('@') {
        Some(at) => purl[..at].to_owned(),
        None => purl.to_owned(),
    }
}
4789
/// Values recorded for each key of a single INI section, in the order they were read.
type IniSectionEntries = HashMap<String, Vec<String>>;

/// Parsed INI document: section name mapped to that section's key/value entries.
type IniSections = HashMap<String, IniSectionEntries>;
4791
4792fn extract_from_setup_cfg(path: &Path) -> PackageData {
4793 let content = match read_file_to_string(path, None) {
4794 Ok(content) => content,
4795 Err(e) => {
4796 warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4797 return default_package_data(path);
4798 }
4799 };
4800
4801 let sections = parse_setup_cfg(&content);
4802 let name = get_ini_value(§ions, "metadata", "name").map(truncate_field);
4803 let version = get_ini_value(§ions, "metadata", "version").map(truncate_field);
4804 let description = get_ini_value(§ions, "metadata", "description").map(truncate_field);
4805 let author = get_ini_value(§ions, "metadata", "author").map(truncate_field);
4806 let author_email = get_ini_value(§ions, "metadata", "author_email");
4807 let maintainer = get_ini_value(§ions, "metadata", "maintainer").map(truncate_field);
4808 let maintainer_email = get_ini_value(§ions, "metadata", "maintainer_email");
4809 let license = get_ini_value(§ions, "metadata", "license").map(truncate_field);
4810 let mut homepage_url = get_ini_value(§ions, "metadata", "url").map(truncate_field);
4811 let classifiers = get_ini_values(§ions, "metadata", "classifiers");
4812 let keywords = parse_setup_cfg_keywords(get_ini_value(§ions, "metadata", "keywords"));
4813 let python_requires = get_ini_value(§ions, "options", "python_requires");
4814 let parsed_project_urls =
4815 parse_setup_cfg_project_urls(&get_ini_values(§ions, "metadata", "project_urls"));
4816 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4817 let mut extra_data = HashMap::new();
4818
4819 let mut parties = Vec::new();
4820 if author.is_some() || author_email.is_some() {
4821 parties.push(Party {
4822 r#type: Some("person".to_string()),
4823 role: Some("author".to_string()),
4824 name: author,
4825 email: author_email,
4826 url: None,
4827 organization: None,
4828 organization_url: None,
4829 timezone: None,
4830 });
4831 }
4832
4833 if maintainer.is_some() || maintainer_email.is_some() {
4834 parties.push(Party {
4835 r#type: Some("person".to_string()),
4836 role: Some("maintainer".to_string()),
4837 name: maintainer,
4838 email: maintainer_email,
4839 url: None,
4840 organization: None,
4841 organization_url: None,
4842 timezone: None,
4843 });
4844 }
4845
4846 let declared_license_expression = None;
4847 let declared_license_expression_spdx = None;
4848 let license_detections = Vec::new();
4849 let extracted_license_statement = license.clone();
4850
4851 let dependencies = extract_setup_cfg_dependencies(§ions);
4852
4853 if let Some(value) = python_requires {
4854 extra_data.insert(
4855 "python_requires".to_string(),
4856 serde_json::Value::String(value),
4857 );
4858 }
4859
4860 apply_project_url_mappings(
4861 &parsed_project_urls,
4862 &mut homepage_url,
4863 &mut bug_tracking_url,
4864 &mut code_view_url,
4865 &mut vcs_url,
4866 &mut extra_data,
4867 );
4868
4869 let extra_data = if extra_data.is_empty() {
4870 None
4871 } else {
4872 Some(extra_data)
4873 };
4874
4875 let purl = name.as_ref().and_then(|n| {
4876 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4877 if let Some(v) = &version {
4878 package_url.with_version(v).ok()?;
4879 }
4880 Some(package_url.to_string())
4881 });
4882
4883 PackageData {
4884 package_type: Some(PythonParser::PACKAGE_TYPE),
4885 namespace: None,
4886 name,
4887 version,
4888 qualifiers: None,
4889 subpath: None,
4890 primary_language: Some("Python".to_string()),
4891 description,
4892 release_date: None,
4893 parties,
4894 keywords,
4895 homepage_url,
4896 download_url: None,
4897 size: None,
4898 sha1: None,
4899 md5: None,
4900 sha256: None,
4901 sha512: None,
4902 bug_tracking_url,
4903 code_view_url,
4904 vcs_url,
4905 copyright: None,
4906 holder: None,
4907 declared_license_expression,
4908 declared_license_expression_spdx,
4909 license_detections,
4910 other_license_expression: None,
4911 other_license_expression_spdx: None,
4912 other_license_detections: Vec::new(),
4913 extracted_license_statement,
4914 notice_text: None,
4915 source_packages: Vec::new(),
4916 file_references: Vec::new(),
4917 is_private: has_private_classifier(&classifiers),
4918 is_virtual: false,
4919 extra_data,
4920 dependencies,
4921 repository_homepage_url: None,
4922 repository_download_url: None,
4923 api_data_url: None,
4924 datasource_id: Some(DatasourceId::PypiSetupCfg),
4925 purl,
4926 }
4927}
4928
/// Splits a comma-separated `keywords` value into trimmed, non-empty keywords.
///
/// Returns an empty vector when `value` is `None`.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut keywords = Vec::new();
    if let Some(raw) = value {
        for piece in raw.split(',') {
            let keyword = piece.trim();
            if !keyword.is_empty() {
                keywords.push(keyword.to_string());
            }
        }
    }
    keywords
}
4941
/// Parses `label = url` entries into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries without `=` or with
/// an empty label or URL are skipped.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();
    for entry in entries {
        let Some(separator) = entry.find('=') else {
            continue;
        };
        let label = entry[..separator].trim();
        // `=` is a single byte, so the URL starts right after it.
        let url = entry[separator + 1..].trim();
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_owned(), url.to_owned()));
        }
    }
    pairs
}
4957
4958fn apply_project_url_mappings(
4959 parsed_urls: &[(String, String)],
4960 homepage_url: &mut Option<String>,
4961 bug_tracking_url: &mut Option<String>,
4962 code_view_url: &mut Option<String>,
4963 vcs_url: &mut Option<String>,
4964 extra_data: &mut HashMap<String, serde_json::Value>,
4965) {
4966 for (label, url) in parsed_urls {
4967 let label_lower = label.to_lowercase();
4968
4969 if bug_tracking_url.is_none()
4970 && matches!(
4971 label_lower.as_str(),
4972 "tracker"
4973 | "bug reports"
4974 | "bug tracker"
4975 | "issues"
4976 | "issue tracker"
4977 | "github: issues"
4978 )
4979 {
4980 *bug_tracking_url = Some(url.clone());
4981 } else if code_view_url.is_none()
4982 && matches!(label_lower.as_str(), "source" | "source code" | "code")
4983 {
4984 *code_view_url = Some(url.clone());
4985 } else if vcs_url.is_none()
4986 && matches!(
4987 label_lower.as_str(),
4988 "github" | "gitlab" | "github: repo" | "repository"
4989 )
4990 {
4991 *vcs_url = Some(url.clone());
4992 } else if homepage_url.is_none()
4993 && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4994 {
4995 *homepage_url = Some(url.clone());
4996 } else if label_lower == "changelog" {
4997 extra_data.insert(
4998 "changelog_url".to_string(),
4999 serde_json::Value::String(url.clone()),
5000 );
5001 }
5002 }
5003
5004 let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
5005 .iter()
5006 .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
5007 .collect();
5008
5009 if !project_urls_json.is_empty() {
5010 extra_data.insert(
5011 "project_urls".to_string(),
5012 serde_json::Value::Object(project_urls_json),
5013 );
5014 }
5015}
5016
5017fn parse_setup_cfg(content: &str) -> IniSections {
5018 let mut sections: IniSections = HashMap::new();
5019 let mut current_section: Option<String> = None;
5020 let mut current_key: Option<String> = None;
5021
5022 for raw_line in content.lines() {
5023 let line = raw_line.trim_end_matches('\r');
5024 let trimmed = line.trim();
5025 if trimmed.is_empty() {
5026 continue;
5027 }
5028
5029 let stripped = line.trim_start();
5030 if stripped.starts_with('#') || stripped.starts_with(';') {
5031 continue;
5032 }
5033
5034 if stripped.starts_with('[') && stripped.ends_with(']') {
5035 let section_name = stripped
5036 .trim_start_matches('[')
5037 .trim_end_matches(']')
5038 .trim()
5039 .to_ascii_lowercase();
5040 current_section = if section_name.is_empty() {
5041 None
5042 } else {
5043 Some(section_name)
5044 };
5045 current_key = None;
5046 continue;
5047 }
5048
5049 if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
5050 if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
5051 let value = stripped.trim();
5052 if !value.is_empty() {
5053 sections
5054 .entry(section.clone())
5055 .or_default()
5056 .entry(key.clone())
5057 .or_default()
5058 .push(value.to_string());
5059 }
5060 }
5061 continue;
5062 }
5063
5064 if let Some((key, value)) = stripped.split_once('=')
5065 && let Some(section) = current_section.as_ref()
5066 {
5067 let key_name = key.trim().to_ascii_lowercase();
5068 let value_trimmed = value.trim();
5069 let entry = sections
5070 .entry(section.clone())
5071 .or_default()
5072 .entry(key_name.clone())
5073 .or_default();
5074 if !value_trimmed.is_empty() {
5075 entry.push(value_trimmed.to_string());
5076 }
5077 current_key = Some(key_name);
5078 }
5079 }
5080
5081 sections
5082}
5083
5084fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
5085 sections
5086 .get(§ion.to_ascii_lowercase())
5087 .and_then(|values| values.get(&key.to_ascii_lowercase()))
5088 .and_then(|entries| entries.first())
5089 .map(|value| value.trim().to_string())
5090}
5091
5092fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
5093 sections
5094 .get(§ion.to_ascii_lowercase())
5095 .and_then(|values| values.get(&key.to_ascii_lowercase()))
5096 .cloned()
5097 .unwrap_or_default()
5098}
5099
5100fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
5101 let mut dependencies = Vec::new();
5102
5103 for (sub_section, scope) in [
5104 ("install_requires", "install"),
5105 ("tests_require", "test"),
5106 ("setup_requires", "setup"),
5107 ] {
5108 let reqs = get_ini_values(sections, "options", sub_section);
5109 dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
5110 }
5111
5112 if let Some(extras) = sections.get("options.extras_require") {
5113 let mut extra_items: Vec<_> = extras.iter().collect();
5114 extra_items.sort_by_key(|(name, _)| *name);
5115 for (extra_name, reqs) in extra_items {
5116 dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
5117 }
5118 }
5119
5120 dependencies
5121}
5122
5123fn parse_setup_cfg_requirements(
5124 reqs: &[String],
5125 scope: &str,
5126 is_optional: bool,
5127) -> Vec<Dependency> {
5128 reqs.iter()
5129 .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
5130 .collect()
5131}
5132
5133fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
5134 let trimmed = req.trim();
5135 if trimmed.is_empty() || trimmed.starts_with('#') {
5136 return None;
5137 }
5138
5139 let name = extract_setup_cfg_dependency_name(trimmed)?;
5140 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5141
5142 Some(Dependency {
5143 purl: Some(purl.to_string()),
5144 extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
5145 scope: Some(scope.to_string()),
5146 is_runtime: Some(true),
5147 is_optional: Some(is_optional),
5148 is_pinned: Some(false),
5149 is_direct: Some(true),
5150 resolved_package: None,
5151 extra_data: None,
5152 })
5153}
5154
/// Extracts the distribution name from the start of a requirement string.
///
/// The name ends at the first whitespace character or at the first character that begins a
/// version specifier, marker, extras list, parenthesised version list or direct URL reference
/// (`<`, `>`, `=`, `!`, `~`, `;`, `[`, `(`, `@`). This covers compact PEP 508 forms such as
/// `pip@https://…` and `foo(>=1.0)`.
///
/// Returns `None` when the trimmed input is empty or starts with one of those delimiters.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();

    let end = trimmed
        .find(|c: char| {
            c.is_whitespace() || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    // The prefix cannot contain whitespace, so no further trimming is needed.
    let name = &trimmed[..end];
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
5171
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    let mut compact = String::with_capacity(req.len());
    for ch in req.chars() {
        if !ch.is_whitespace() {
            compact.push(ch);
        }
    }
    compact
}
5175
/// Extracts the quoted string assigned to `key` in `setup.py`-style source text.
///
/// Looks for `key="…"` first and `key='…'` second, each with optional single spaces around
/// `=`. The search is a plain substring search, so only the first occurrence of each spelling
/// is considered. The value runs up to the next occurrence of the *same* quote character that
/// opened it, so the other quote character may appear inside the value (`"It's great"`).
///
/// Returns `None` when no spelling is found with a matching closing quote.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    for quote in ['"', '\''] {
        for assignment in ["=", " =", "= ", " = "] {
            let pattern = format!("{key}{assignment}{quote}");
            let Some(start) = content.find(&pattern) else {
                continue;
            };

            let rest = &content[start + pattern.len()..];
            if let Some(end) = rest.find(quote) {
                return Some(rest[..end].to_string());
            }
        }
    }

    None
}
5201
5202fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
5203 let mut dependencies = Vec::new();
5204
5205 if let Some(tests_deps) = extract_tests_require(content) {
5206 dependencies.extend(tests_deps);
5207 }
5208
5209 if let Some(extras_deps) = extract_extras_require(content) {
5210 dependencies.extend(extras_deps);
5211 }
5212
5213 dependencies
5214}
5215
5216fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
5217 let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
5218 let re = Regex::new(pattern).ok()?;
5219 let captures = re.captures(content)?;
5220 let deps_str = captures.get(1)?.as_str();
5221
5222 let deps = parse_setup_py_dep_list(deps_str, "test", true);
5223 if deps.is_empty() { None } else { Some(deps) }
5224}
5225
5226fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
5227 let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
5228 let re = Regex::new(pattern).ok()?;
5229 let captures = re.captures(content)?;
5230 let dict_content = captures.get(1)?.as_str();
5231
5232 let mut all_deps = Vec::new();
5233
5234 let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
5235 let entry_re = Regex::new(entry_pattern).ok()?;
5236
5237 for entry_cap in entry_re.captures_iter(dict_content) {
5238 if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
5239 let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
5240 all_deps.extend(deps);
5241 }
5242 }
5243
5244 if all_deps.is_empty() {
5245 None
5246 } else {
5247 Some(all_deps)
5248 }
5249}
5250
5251fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
5252 let dep_pattern = r#"['"]([^'"]+)['"]"#;
5253 let re = match Regex::new(dep_pattern) {
5254 Ok(r) => r,
5255 Err(_) => return Vec::new(),
5256 };
5257
5258 re.captures_iter(deps_str)
5259 .filter_map(|cap| {
5260 let dep_str = cap.get(1)?.as_str().trim();
5261 if dep_str.is_empty() {
5262 return None;
5263 }
5264
5265 let name = extract_setup_cfg_dependency_name(dep_str)?;
5266 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5267
5268 Some(Dependency {
5269 purl: Some(purl.to_string()),
5270 extracted_requirement: Some(dep_str.to_string()),
5271 scope: Some(scope.to_string()),
5272 is_runtime: Some(true),
5273 is_optional: Some(is_optional),
5274 is_pinned: Some(false),
5275 is_direct: Some(true),
5276 resolved_package: None,
5277 extra_data: None,
5278 })
5279 })
5280 .collect()
5281}
5282
5283pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
5285 let content = read_file_to_string(path, None).map_err(|e| e.to_string())?;
5286 toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
5287}
5288
5289fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<Sha256Digest>) {
5300 let mut file = match File::open(path) {
5301 Ok(f) => f,
5302 Err(_) => return (None, None),
5303 };
5304
5305 let metadata = match file.metadata() {
5306 Ok(m) => m,
5307 Err(_) => return (None, None),
5308 };
5309 let size = metadata.len();
5310
5311 let mut hasher = Sha256::new();
5312 let mut buffer = vec![0; 8192];
5313
5314 loop {
5315 match file.read(&mut buffer) {
5316 Ok(0) => break,
5317 Ok(n) => hasher.update(&buffer[..n]),
5318 Err(_) => return (Some(size), None),
5319 }
5320 }
5321
5322 let hash = Sha256Digest::from_bytes(hasher.finalize().into());
5323 (Some(size), Some(hash))
5324}
5325
5326fn default_package_data(path: &Path) -> PackageData {
5327 PackageData {
5328 package_type: Some(PythonParser::PACKAGE_TYPE),
5329 primary_language: Some("Python".to_string()),
5330 datasource_id: infer_python_datasource_id(path),
5331 ..Default::default()
5332 }
5333}
5334
5335fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
5336 let file_name = path.file_name().and_then(|name| name.to_str());
5337
5338 match file_name {
5339 Some("pyproject.toml") => {
5340 if read_toml_file(path)
5341 .ok()
5342 .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
5343 .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
5344 .is_some()
5345 {
5346 Some(DatasourceId::PypiPoetryPyprojectToml)
5347 } else {
5348 Some(DatasourceId::PypiPyprojectToml)
5349 }
5350 }
5351 Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
5352 Some(DatasourceId::PypiSetupPy)
5353 }
5354 Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
5355 Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
5356 Some("METADATA") if is_installed_wheel_metadata_path(path) => {
5357 Some(DatasourceId::PypiWheelMetadata)
5358 }
5359 Some("pypi.json") => Some(DatasourceId::PypiJson),
5360 Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
5361 Some("origin.json") if is_pip_cache_origin_json(path) => {
5362 Some(DatasourceId::PypiPipOriginJson)
5363 }
5364 _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
5365 Some(DatasourceId::PypiSdist)
5366 }
5367 _ if path
5368 .extension()
5369 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
5370 {
5371 Some(DatasourceId::PypiWheel)
5372 }
5373 _ if path
5374 .extension()
5375 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
5376 {
5377 Some(DatasourceId::PypiEgg)
5378 }
5379 _ => None,
5380 }
5381}
5382
5383crate::register_parser!(
5384 "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
5385 &[
5386 "**/pyproject.toml",
5387 "**/setup.py",
5388 "**/*_setup.py",
5389 "**/setup.cfg",
5390 "**/pypi.json",
5391 "**/PKG-INFO",
5392 "**/*.dist-info/METADATA",
5393 "**/origin.json",
5394 "**/*.tar.gz",
5395 "**/*.tgz",
5396 "**/*.tar.bz2",
5397 "**/*.tar.xz",
5398 "**/*.zip",
5399 "**/*.whl",
5400 "**/*.egg"
5401 ],
5402 "pypi",
5403 "Python",
5404 Some("https://packaging.python.org/"),
5405);