1use crate::models::{
35 DatasourceId, Dependency, FileReference, PackageData, PackageType, Party, Sha256Digest,
36};
37use crate::parser_warn as warn;
38use crate::parsers::utils::{read_file_to_string, split_name_email};
39use base64::Engine;
40use base64::engine::general_purpose::URL_SAFE_NO_PAD;
41use bzip2::read::BzDecoder;
42use csv::ReaderBuilder;
43use flate2::read::GzDecoder;
44use liblzma::read::XzDecoder;
45use packageurl::PackageUrl;
46use regex::Regex;
47use ruff_python_ast as ast;
48use ruff_python_parser::parse_module;
49use serde_json::{Map as JsonMap, Value as JsonValue};
50use sha2::{Digest, Sha256};
51use std::collections::{HashMap, HashSet};
52use std::fs::File;
53use std::io::Read;
54use std::path::{Component, Path, PathBuf};
55use tar::Archive;
56use toml::Value as TomlValue;
57use toml::map::Map as TomlMap;
58use zip::ZipArchive;
59
60use super::PackageParser;
61use super::license_normalization::{
62 DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
63 normalize_spdx_expression,
64};
65use super::pep508::parse_pep508_requirement;
66
// Key names used when reading project metadata tables.
// NOTE(review): none of the FIELD_* constants are referenced in the visible part of
// this file; presumably they are pyproject.toml keys — confirm against their users.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";

/// Five optional URL strings bundled as one tuple.
// NOTE(review): the meaning of each position is not visible here — confirm at the
// function that produces this tuple.
type ProjectUrls = (
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
);
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";
// NOTE(review): the MAX_SETUP_PY_* limits are not used in the visible part of this
// file; presumably they bound setup.py parsing — confirm against their users.
const MAX_SETUP_PY_BYTES: usize = 1_048_576;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;
/// Upper bound (100 MiB) applied both to the on-disk size of an sdist archive and to
/// the running total of declared entry sizes while scanning an archive.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;
/// Upper bound (50 MiB) on the declared size of a single archive entry.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
/// Largest uncompressed:compressed ratio tolerated before an entry or archive is
/// treated as suspicious.
const MAX_COMPRESSION_RATIO: f64 = 100.0;
/// Unit type carrying the `PackageParser` implementation for Python/PyPI inputs.
pub struct PythonParser;
108
/// Container/compression format of a Python source distribution archive, as detected
/// from the (lowercased) file name suffix.
#[derive(Clone, Copy, Debug)]
enum PythonSdistArchiveFormat {
    /// File name ends with `.tar.gz`.
    TarGz,
    /// File name ends with `.tgz`.
    Tgz,
    /// File name ends with `.tar.bz2`.
    TarBz2,
    /// File name ends with `.tar.xz`.
    TarXz,
    /// File name ends with `.zip`.
    Zip,
}
117
/// A zip entry that passed the path-safety, compression-ratio and size checks in
/// `collect_validated_zip_entries`.
#[derive(Clone, Debug)]
struct ValidatedZipEntry {
    /// Position of the entry inside the zip archive.
    index: usize,
    /// Entry path after normalization by `normalize_archive_entry_path`.
    name: String,
}
123
124impl PackageParser for PythonParser {
125 const PACKAGE_TYPE: PackageType = PackageType::Pypi;
126
127 fn extract_packages(path: &Path) -> Vec<PackageData> {
128 vec![
129 if path.file_name().unwrap_or_default() == "pyproject.toml" {
130 extract_from_pyproject_toml(path)
131 } else if path.file_name().unwrap_or_default() == "setup.cfg" {
132 extract_from_setup_cfg(path)
133 } else if is_setup_py_like_path(path) {
134 return extract_setup_py_packages(path);
135 } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
136 extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
137 } else if is_installed_wheel_metadata_path(path) {
138 extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
139 } else if is_pip_cache_origin_json(path) {
140 extract_from_pip_origin_json(path)
141 } else if path.file_name().unwrap_or_default() == "pypi.json" {
142 extract_from_pypi_json(path)
143 } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
144 extract_from_pip_inspect(path)
145 } else if is_python_sdist_archive_path(path) {
146 extract_from_sdist_archive(path)
147 } else if path
148 .extension()
149 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
150 {
151 extract_from_wheel_archive(path)
152 } else if path
153 .extension()
154 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
155 {
156 extract_from_egg_archive(path)
157 } else {
158 default_package_data(path)
159 },
160 ]
161 }
162
163 fn is_match(path: &Path) -> bool {
164 if let Some(filename) = path.file_name()
165 && (filename == "pyproject.toml"
166 || filename == "setup.cfg"
167 || is_setup_py_like_path(path)
168 || filename == "PKG-INFO"
169 || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
170 || filename == "pypi.json"
171 || filename == "pip-inspect.deplock"
172 || is_pip_cache_origin_json(path))
173 {
174 return true;
175 }
176
177 if let Some(extension) = path.extension() {
178 let ext = extension.to_string_lossy().to_lowercase();
179 if (ext == "whl" && is_valid_wheel_archive_path(path))
180 || ext == "egg"
181 || is_python_sdist_archive_path(path)
182 {
183 return true;
184 }
185 }
186
187 false
188 }
189}
190
/// Returns `true` when the final path component is `setup.py` or ends with
/// `_setup.py`. Non-UTF-8 or missing file names never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    match path.file_name().and_then(|raw| raw.to_str()) {
        Some(file_name) => file_name == "setup.py" || file_name.ends_with("_setup.py"),
        None => false,
    }
}
196
/// Returns `true` for a file named `METADATA` whose immediate parent directory name
/// ends with `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    let is_metadata_file = path.file_name().and_then(|raw| raw.to_str()) == Some("METADATA");
    if !is_metadata_file {
        return false;
    }

    let parent_dir_name = path
        .parent()
        .and_then(|dir| dir.file_name())
        .and_then(|raw| raw.to_str());
    matches!(parent_dir_name, Some(dir_name) if dir_name.ends_with(".dist-info"))
}
205
/// Values read from a `WHEEL` file that sits next to an installed `METADATA` file.
#[derive(Debug, Clone)]
struct InstalledWheelMetadata {
    /// Every value of the `tag` header, in file order; never empty once parsed.
    wheel_tags: Vec<String>,
    /// First value of the `wheel-version` header, if present.
    wheel_version: Option<String>,
    /// First value of the `generator` header, if present.
    wheel_generator: Option<String>,
    /// `root-is-purelib` header parsed as a boolean; `None` when absent or not
    /// `true`/`false` (compared case-insensitively).
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`; used as the extension when
    /// rebuilding the package URL.
    compressed_tag: Option<String>,
}
214
215fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
216 let Some(parent) = path.parent() else {
217 return;
218 };
219
220 if !parent
221 .file_name()
222 .and_then(|name| name.to_str())
223 .is_some_and(|name| name.ends_with(".dist-info"))
224 {
225 return;
226 }
227
228 let wheel_path = parent.join("WHEEL");
229 if !wheel_path.exists() {
230 return;
231 }
232
233 let Ok(content) = read_file_to_string(&wheel_path) else {
234 warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
235 return;
236 };
237
238 let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
239 return;
240 };
241
242 apply_installed_wheel_metadata(package_data, &wheel_metadata);
243}
244
245fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
246 use super::rfc822::{get_header_all, get_header_first};
247
248 let metadata = super::rfc822::parse_rfc822_content(content);
249 let wheel_tags = get_header_all(&metadata.headers, "tag");
250 if wheel_tags.is_empty() {
251 return None;
252 }
253
254 let wheel_version = get_header_first(&metadata.headers, "wheel-version");
255 let wheel_generator = get_header_first(&metadata.headers, "generator");
256 let root_is_purelib =
257 get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
258 match value.to_ascii_lowercase().as_str() {
259 "true" => Some(true),
260 "false" => Some(false),
261 _ => None,
262 }
263 });
264
265 let compressed_tag = compress_wheel_tags(&wheel_tags);
266
267 Some(InstalledWheelMetadata {
268 wheel_tags,
269 wheel_version,
270 wheel_generator,
271 root_is_purelib,
272 compressed_tag,
273 })
274}
275
/// Collapses several `python-abi-platform` tags into one compressed tag.
///
/// * no tags: `None`
/// * exactly one tag: that tag, returned verbatim (it is not validated)
/// * several tags: the python parts joined with `.` followed by the shared abi and
///   platform; `None` if any tag lacks three `-`-separated parts or if the abi or
///   platform differ between tags
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => return None,
        [only] => return Some(only.clone()),
        _ => {}
    }

    let mut pythons: Vec<&str> = Vec::with_capacity(tags.len());
    let mut shared: Option<(&str, &str)> = None;

    for tag in tags {
        // The platform part keeps any further dashes, hence at most three pieces.
        let mut pieces = tag.splitn(3, '-');
        let (python, abi, platform) = (pieces.next()?, pieces.next()?, pieces.next()?);

        match shared {
            Some((seen_abi, seen_platform)) if seen_abi != abi || seen_platform != platform => {
                return None;
            }
            _ => shared = Some((abi, platform)),
        }
        pythons.push(python);
    }

    let (abi, platform) = shared?;
    Some(format!("{}-{}-{}", pythons.join("."), abi, platform))
}
313
314fn apply_installed_wheel_metadata(
315 package_data: &mut PackageData,
316 wheel_metadata: &InstalledWheelMetadata,
317) {
318 let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
319 extra_data.insert(
320 "wheel_tags".to_string(),
321 JsonValue::Array(
322 wheel_metadata
323 .wheel_tags
324 .iter()
325 .cloned()
326 .map(JsonValue::String)
327 .collect(),
328 ),
329 );
330
331 if let Some(wheel_version) = &wheel_metadata.wheel_version {
332 extra_data.insert(
333 "wheel_version".to_string(),
334 JsonValue::String(wheel_version.clone()),
335 );
336 }
337
338 if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
339 extra_data.insert(
340 "wheel_generator".to_string(),
341 JsonValue::String(wheel_generator.clone()),
342 );
343 }
344
345 if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
346 extra_data.insert(
347 "root_is_purelib".to_string(),
348 JsonValue::Bool(root_is_purelib),
349 );
350 }
351
352 if let (Some(name), Some(version), Some(extension)) = (
353 package_data.name.as_deref(),
354 package_data.version.as_deref(),
355 wheel_metadata.compressed_tag.as_deref(),
356 ) {
357 package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
358 }
359}
360
/// Returns `true` for a file named `origin.json` that has some ancestor directory
/// called `wheels` (compared ASCII case-insensitively).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    if path.file_name().and_then(|raw| raw.to_str()) != Some("origin.json") {
        return false;
    }

    // `skip(1)` drops the path itself so only its parent directories are inspected.
    path.ancestors()
        .skip(1)
        .filter_map(|ancestor| ancestor.file_name())
        .filter_map(|raw| raw.to_str())
        .any(|dir_name| dir_name.eq_ignore_ascii_case("wheels"))
}
370
371fn extract_from_pip_origin_json(path: &Path) -> PackageData {
372 let content = match read_file_to_string(path) {
373 Ok(content) => content,
374 Err(e) => {
375 warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
376 return default_package_data(path);
377 }
378 };
379
380 let root: JsonValue = match serde_json::from_str(&content) {
381 Ok(root) => root,
382 Err(e) => {
383 warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
384 return default_package_data(path);
385 }
386 };
387
388 let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
389 warn!("No url found in pip cache origin.json at {:?}", path);
390 return default_package_data(path);
391 };
392
393 let sibling_wheel = find_sibling_cached_wheel(path);
394 let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
395 sibling_wheel
396 .as_ref()
397 .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
398 });
399
400 let Some((name, version)) = name_version else {
401 warn!(
402 "Failed to infer package name/version from pip cache origin.json at {:?}",
403 path
404 );
405 return default_package_data(path);
406 };
407
408 let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
409 build_pypi_urls(Some(&name), Some(&version));
410 let purl = sibling_wheel
411 .as_ref()
412 .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
413 .or(plain_purl);
414
415 PackageData {
416 package_type: Some(PythonParser::PACKAGE_TYPE),
417 primary_language: Some("Python".to_string()),
418 name: Some(name),
419 version: Some(version),
420 datasource_id: Some(DatasourceId::PypiPipOriginJson),
421 download_url: Some(download_url.to_string()),
422 sha256: extract_sha256_from_origin_json(&root)
423 .and_then(|h| Sha256Digest::from_hex(&h).ok()),
424 repository_homepage_url,
425 repository_download_url,
426 api_data_url,
427 purl,
428 ..Default::default()
429 }
430}
431
432fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
433 let parent = path.parent()?;
434 let entries = parent.read_dir().ok()?;
435
436 for entry in entries.flatten() {
437 let sibling_path = entry.path();
438 if sibling_path
439 .extension()
440 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
441 && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
442 {
443 return Some(wheel_info);
444 }
445 }
446
447 None
448}
449
450fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
451 let file_name = url.rsplit('/').next()?;
452
453 if file_name.ends_with(".whl") {
454 return parse_wheel_filename(Path::new(file_name))
455 .map(|wheel_info| (wheel_info.name, wheel_info.version));
456 }
457
458 let stem = strip_python_archive_extension(file_name)?;
459 let (name, version) = stem.rsplit_once('-')?;
460 if name.is_empty() || version.is_empty() {
461 return None;
462 }
463
464 Some((name.replace('_', "-"), version.to_string()))
465}
466
/// Removes a trailing Python archive extension (`.tar.gz`, `.tar.bz2`, `.tar.xz`,
/// `.tgz`, `.zip`, `.whl`) from `file_name`.
///
/// The extension is matched ASCII case-insensitively, so `Pkg-1.0.TAR.GZ` yields
/// `Pkg-1.0`; the returned stem keeps its original casing. Returns `None` when the
/// name ends with none of the known extensions.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] =
        [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    ARCHIVE_SUFFIXES.iter().find_map(|suffix| {
        let split_at = file_name.len().checked_sub(suffix.len())?;
        // `get` yields `None` if `split_at` falls inside a multi-byte character, in
        // which case the tail cannot equal an all-ASCII suffix anyway.
        let tail = file_name.get(split_at..)?;
        tail.eq_ignore_ascii_case(suffix)
            .then(|| &file_name[..split_at])
    })
}
472
473fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
474 root.pointer("/archive_info/hashes/sha256")
475 .and_then(|value| value.as_str())
476 .map(ToOwned::to_owned)
477 .or_else(|| {
478 root.pointer("/archive_info/hash")
479 .and_then(|value| value.as_str())
480 .and_then(normalize_origin_hash)
481 })
482}
483
/// Extracts the digest from a hash string.
///
/// Accepts `sha256=<digest>` and `sha256:<digest>` (the remainder is returned without
/// validation) as well as a bare string of exactly 64 hexadecimal characters.
/// Anything else yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed_digest = ["sha256=", "sha256:"]
        .iter()
        .find_map(|prefix| hash.strip_prefix(*prefix));
    if let Some(digest) = prefixed_digest {
        return Some(digest.to_string());
    }

    let is_bare_hex_digest = hash.len() == 64 && hash.bytes().all(|byte| byte.is_ascii_hexdigit());
    is_bare_hex_digest.then(|| hash.to_string())
}
496
497fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
498 let content = match read_file_to_string(path) {
499 Ok(content) => content,
500 Err(e) => {
501 warn!("Failed to read metadata at {:?}: {}", path, e);
502 return default_package_data(path);
503 }
504 };
505
506 let metadata = super::rfc822::parse_rfc822_content(&content);
507 let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
508 merge_sibling_metadata_dependencies(path, &mut package_data);
509 merge_sibling_metadata_file_references(path, &mut package_data);
510 if datasource_id == DatasourceId::PypiWheelMetadata {
511 merge_sibling_wheel_metadata(path, &mut package_data);
512 }
513 package_data
514}
515
516fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
517 let mut extra_dependencies = Vec::new();
518
519 if let Some(parent) = path.parent() {
520 let direct_requires = parent.join("requires.txt");
521 if direct_requires.exists()
522 && let Ok(content) = read_file_to_string(&direct_requires)
523 {
524 extra_dependencies.extend(parse_requires_txt(&content));
525 }
526
527 let sibling_egg_info_requires = parent
528 .read_dir()
529 .ok()
530 .into_iter()
531 .flatten()
532 .flatten()
533 .find_map(|entry| {
534 let child_path = entry.path();
535 if child_path.is_dir()
536 && child_path
537 .file_name()
538 .and_then(|name| name.to_str())
539 .is_some_and(|name| name.ends_with(".egg-info"))
540 {
541 let requires = child_path.join("requires.txt");
542 requires.exists().then_some(requires)
543 } else {
544 None
545 }
546 });
547
548 if let Some(requires_path) = sibling_egg_info_requires
549 && let Ok(content) = read_file_to_string(&requires_path)
550 {
551 extra_dependencies.extend(parse_requires_txt(&content));
552 }
553 }
554
555 for dependency in extra_dependencies {
556 if !package_data.dependencies.iter().any(|existing| {
557 existing.purl == dependency.purl
558 && existing.scope == dependency.scope
559 && existing.extracted_requirement == dependency.extracted_requirement
560 && existing.extra_data == dependency.extra_data
561 }) {
562 package_data.dependencies.push(dependency);
563 }
564 }
565}
566
567fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
568 let mut extra_refs = Vec::new();
569
570 if let Some(parent) = path.parent() {
571 let record_path = parent.join("RECORD");
572 if record_path.exists()
573 && let Ok(content) = read_file_to_string(&record_path)
574 {
575 extra_refs.extend(parse_record_csv(&content));
576 }
577
578 let installed_files_path = parent.join("installed-files.txt");
579 if installed_files_path.exists()
580 && let Ok(content) = read_file_to_string(&installed_files_path)
581 {
582 extra_refs.extend(parse_installed_files_txt(&content));
583 }
584
585 let sources_path = parent.join("SOURCES.txt");
586 if sources_path.exists()
587 && let Ok(content) = read_file_to_string(&sources_path)
588 {
589 extra_refs.extend(parse_sources_txt(&content));
590 }
591 }
592
593 for file_ref in extra_refs {
594 if !package_data
595 .file_references
596 .iter()
597 .any(|existing| existing.path == file_ref.path)
598 {
599 package_data.file_references.push(file_ref);
600 }
601 }
602}
603
604fn collect_validated_zip_entries<R: Read + std::io::Seek>(
605 archive: &mut ZipArchive<R>,
606 path: &Path,
607 archive_type: &str,
608) -> Result<Vec<ValidatedZipEntry>, String> {
609 let mut total_extracted = 0u64;
610 let mut entries = Vec::new();
611
612 for i in 0..archive.len() {
613 if let Ok(file) = archive.by_index_raw(i) {
614 let compressed_size = file.compressed_size();
615 let uncompressed_size = file.size();
616 let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
617 warn!(
618 "Skipping unsafe path in {} {:?}: {}",
619 archive_type,
620 path,
621 file.name()
622 );
623 continue;
624 };
625
626 if compressed_size > 0 {
627 let ratio = uncompressed_size as f64 / compressed_size as f64;
628 if ratio > MAX_COMPRESSION_RATIO {
629 warn!(
630 "Suspicious compression ratio in {} {:?}: {:.2}:1",
631 archive_type, path, ratio
632 );
633 continue;
634 }
635 }
636
637 if uncompressed_size > MAX_FILE_SIZE {
638 warn!(
639 "File too large in {} {:?}: {} bytes (limit: {} bytes)",
640 archive_type, path, uncompressed_size, MAX_FILE_SIZE
641 );
642 continue;
643 }
644
645 total_extracted += uncompressed_size;
646 if total_extracted > MAX_ARCHIVE_SIZE {
647 let msg = format!(
648 "Total extracted size exceeds limit for {} {:?}",
649 archive_type, path
650 );
651 warn!("{}", msg);
652 return Err(msg);
653 }
654
655 entries.push(ValidatedZipEntry {
656 index: i,
657 name: entry_name,
658 });
659 }
660 }
661
662 Ok(entries)
663}
664
665fn is_python_sdist_archive_path(path: &Path) -> bool {
666 detect_python_sdist_archive_format(path).is_some()
667}
668
669fn is_valid_wheel_archive_path(path: &Path) -> bool {
670 if !path.is_file() {
671 return true;
672 }
673
674 let file = match File::open(path) {
675 Ok(file) => file,
676 Err(_) => return false,
677 };
678 let mut archive = match ZipArchive::new(file) {
679 Ok(archive) => archive,
680 Err(_) => return false,
681 };
682
683 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
684 Ok(entries) => entries,
685 Err(_) => return false,
686 };
687
688 find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA").is_some()
689}
690
691fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
692 let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
693
694 if !is_likely_python_sdist_filename(&file_name) {
695 return None;
696 }
697
698 if file_name.ends_with(".tar.gz") {
699 tar_gz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarGz)
700 } else if file_name.ends_with(".tgz") {
701 tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
702 } else if file_name.ends_with(".tar.bz2") {
703 tar_bz2_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarBz2)
704 } else if file_name.ends_with(".tar.xz") {
705 tar_xz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarXz)
706 } else if file_name.ends_with(".zip") {
707 zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
708 } else {
709 None
710 }
711}
712
713fn tar_gz_sdist_contains_pkg_info(path: &Path) -> bool {
714 let Some(compressed_size) = compressed_archive_size(path) else {
715 return false;
716 };
717 let file = match File::open(path) {
718 Ok(file) => file,
719 Err(_) => return false,
720 };
721 let decoder = GzDecoder::new(file);
722 tar_sdist_contains_pkg_info(path, decoder, "tar.gz", compressed_size)
723}
724
725fn tar_bz2_sdist_contains_pkg_info(path: &Path) -> bool {
726 let Some(compressed_size) = compressed_archive_size(path) else {
727 return false;
728 };
729 let file = match File::open(path) {
730 Ok(file) => file,
731 Err(_) => return false,
732 };
733 let decoder = BzDecoder::new(file);
734 tar_sdist_contains_pkg_info(path, decoder, "tar.bz2", compressed_size)
735}
736
737fn tar_xz_sdist_contains_pkg_info(path: &Path) -> bool {
738 let Some(compressed_size) = compressed_archive_size(path) else {
739 return false;
740 };
741 let file = match File::open(path) {
742 Ok(file) => file,
743 Err(_) => return false,
744 };
745 let decoder = XzDecoder::new(file);
746 tar_sdist_contains_pkg_info(path, decoder, "tar.xz", compressed_size)
747}
748
/// Returns the on-disk size of `path` in bytes, or `None` when its metadata cannot be
/// read.
fn compressed_archive_size(path: &Path) -> Option<u64> {
    let file_metadata = std::fs::metadata(path).ok()?;
    Some(file_metadata.len())
}
752
753fn tar_sdist_contains_pkg_info<R: Read>(
754 path: &Path,
755 reader: R,
756 archive_type: &str,
757 compressed_size: u64,
758) -> bool {
759 let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
760 else {
761 return false;
762 };
763
764 select_sdist_pkginfo_entry(path, &entries).is_some()
765}
766
767fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
768 if !path.is_file() {
769 return true;
770 }
771
772 let Some(compressed_size) = compressed_archive_size(path) else {
773 return false;
774 };
775 let file = match File::open(path) {
776 Ok(file) => file,
777 Err(_) => return false,
778 };
779 let decoder = GzDecoder::new(file);
780 tar_sdist_contains_pkg_info(path, decoder, "tgz", compressed_size)
781}
782
783fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
784 if !path.is_file() {
785 return true;
786 }
787
788 let file = match File::open(path) {
789 Ok(file) => file,
790 Err(_) => return false,
791 };
792 let mut archive = match ZipArchive::new(file) {
793 Ok(archive) => archive,
794 Err(_) => return false,
795 };
796
797 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
798 Ok(entries) => entries,
799 Err(_) => return false,
800 };
801 let metadata_entries: Vec<_> = validated_entries
802 .iter()
803 .filter(|entry| entry.name.ends_with("/PKG-INFO"))
804 .filter_map(|entry| {
805 read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
806 .ok()
807 .map(|content| (entry.name.clone(), content))
808 })
809 .collect();
810
811 has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
812}
813
814fn is_likely_python_sdist_filename(file_name: &str) -> bool {
815 let Some(stem) = strip_python_archive_extension(file_name) else {
816 return false;
817 };
818
819 let Some((name, version)) = stem.rsplit_once('-') else {
820 return false;
821 };
822
823 !name.is_empty()
824 && !version.is_empty()
825 && version.chars().any(|ch| ch.is_ascii_digit())
826 && name
827 .chars()
828 .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
829}
830
831fn extract_from_sdist_archive(path: &Path) -> PackageData {
832 let metadata = match std::fs::metadata(path) {
833 Ok(m) => m,
834 Err(e) => {
835 warn!(
836 "Failed to read metadata for sdist archive {:?}: {}",
837 path, e
838 );
839 return default_package_data(path);
840 }
841 };
842
843 if metadata.len() > MAX_ARCHIVE_SIZE {
844 warn!(
845 "sdist archive too large: {} bytes (limit: {} bytes)",
846 metadata.len(),
847 MAX_ARCHIVE_SIZE
848 );
849 return default_package_data(path);
850 }
851
852 let Some(format) = detect_python_sdist_archive_format(path) else {
853 return default_package_data(path);
854 };
855
856 let mut package_data = match format {
857 PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
858 let file = match File::open(path) {
859 Ok(file) => file,
860 Err(e) => {
861 warn!("Failed to open sdist archive {:?}: {}", path, e);
862 return default_package_data(path);
863 }
864 };
865 let decoder = GzDecoder::new(file);
866 extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
867 }
868 PythonSdistArchiveFormat::TarBz2 => {
869 let file = match File::open(path) {
870 Ok(file) => file,
871 Err(e) => {
872 warn!("Failed to open sdist archive {:?}: {}", path, e);
873 return default_package_data(path);
874 }
875 };
876 let decoder = BzDecoder::new(file);
877 extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
878 }
879 PythonSdistArchiveFormat::TarXz => {
880 let file = match File::open(path) {
881 Ok(file) => file,
882 Err(e) => {
883 warn!("Failed to open sdist archive {:?}: {}", path, e);
884 return default_package_data(path);
885 }
886 };
887 let decoder = XzDecoder::new(file);
888 extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
889 }
890 PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
891 };
892
893 if package_data.package_type.is_some() {
894 let (size, sha256) = calculate_file_checksums(path);
895 package_data.size = size;
896 package_data.sha256 = sha256;
897 }
898
899 package_data
900}
901
902fn extract_from_tar_sdist_archive<R: Read>(
903 path: &Path,
904 reader: R,
905 archive_type: &str,
906 compressed_size: u64,
907) -> PackageData {
908 let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
909 else {
910 return default_package_data(path);
911 };
912
913 build_sdist_package_data(path, entries)
914}
915
/// Streams a tar archive and collects the text of its relevant sdist entries as
/// `(normalized entry path, content)` pairs.
///
/// Returns `None` (abandoning the whole archive) when the entry list cannot be
/// obtained, when the running total of declared entry sizes exceeds
/// `MAX_ARCHIVE_SIZE`, or when that total divided by `compressed_size` exceeds
/// `MAX_COMPRESSION_RATIO`. Individual entries are skipped when they cannot be read,
/// are larger than `MAX_FILE_SIZE`, have an unreadable or unsafe path, are not a
/// relevant text entry, or fail the bounded UTF-8 read.
fn collect_tar_sdist_entries<R: Read>(
    path: &Path,
    reader: R,
    archive_type: &str,
    compressed_size: u64,
) -> Option<Vec<(String, String)>> {
    let mut archive = Archive::new(reader);
    let archive_entries = match archive.entries() {
        Ok(entries) => entries,
        Err(e) => {
            warn!(
                "Failed to read {} sdist archive {:?}: {}",
                archive_type, path, e
            );
            return None;
        }
    };

    // Sum of the declared sizes of all entries that were not rejected as oversized.
    let mut total_extracted = 0u64;
    let mut entries = Vec::new();

    for entry_result in archive_entries {
        let mut entry = match entry_result {
            Ok(entry) => entry,
            Err(e) => {
                warn!(
                    "Failed to read {} sdist entry from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        // Oversized entries are skipped before they are counted towards the total.
        let entry_size = entry.size();
        if entry_size > MAX_FILE_SIZE {
            warn!(
                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
                archive_type, path, entry_size, MAX_FILE_SIZE
            );
            continue;
        }

        // Every remaining entry counts, whether or not its content is kept below.
        total_extracted += entry_size;
        if total_extracted > MAX_ARCHIVE_SIZE {
            warn!(
                "Total extracted size exceeds limit for {} sdist {:?}",
                archive_type, path
            );
            return None;
        }

        // The ratio compares the running total against the whole compressed file.
        if compressed_size > 0 {
            let ratio = total_extracted as f64 / compressed_size as f64;
            if ratio > MAX_COMPRESSION_RATIO {
                warn!(
                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
                    archive_type, path, ratio
                );
                return None;
            }
        }

        // Backslashes are turned into forward slashes before normalization.
        let entry_path = match entry.path() {
            Ok(path) => path.to_string_lossy().replace('\\', "/"),
            Err(e) => {
                warn!(
                    "Failed to get {} sdist entry path from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
            continue;
        };

        if !is_relevant_sdist_text_entry(&entry_path) {
            continue;
        }

        // Entries whose content cannot be read as bounded UTF-8 are dropped.
        if let Ok(content) = read_limited_utf8(
            &mut entry,
            MAX_FILE_SIZE,
            &format!("{} entry {}", archive_type, entry_path),
        ) {
            entries.push((entry_path, content));
        }
    }

    Some(entries)
}
1009
1010fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
1011 let file = match File::open(path) {
1012 Ok(file) => file,
1013 Err(e) => {
1014 warn!("Failed to open zip sdist archive {:?}: {}", path, e);
1015 return default_package_data(path);
1016 }
1017 };
1018
1019 let mut archive = match ZipArchive::new(file) {
1020 Ok(archive) => archive,
1021 Err(e) => {
1022 warn!("Failed to read zip sdist archive {:?}: {}", path, e);
1023 return default_package_data(path);
1024 }
1025 };
1026
1027 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
1028 Ok(entries) => entries,
1029 Err(_) => return default_package_data(path),
1030 };
1031
1032 let mut entries = Vec::new();
1033 for entry in validated_entries.iter() {
1034 if !is_relevant_sdist_text_entry(&entry.name) {
1035 continue;
1036 }
1037
1038 if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
1039 entries.push((entry.name.clone(), content));
1040 }
1041 }
1042
1043 build_sdist_package_data(path, entries)
1044}
1045
/// Returns `true` for archive entries named `PKG-INFO`, `requires.txt` or
/// `SOURCES.txt` that sit inside some directory (the path must contain the leading
/// `/` of the file name).
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"]
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
1051
1052fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
1053 let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
1054 warn!("No PKG-INFO file found in sdist archive {:?}", path);
1055 return default_package_data(path);
1056 };
1057
1058 let mut package_data =
1059 python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
1060 merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
1061 merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
1062 apply_sdist_name_version_fallback(path, &mut package_data);
1063 package_data.datasource_id = Some(DatasourceId::PypiSdist);
1064 package_data
1065}
1066
1067fn select_sdist_pkginfo_entry(
1068 archive_path: &Path,
1069 entries: &[(String, String)],
1070) -> Option<(String, String)> {
1071 let expected_name = sdist_archive_expected_name(archive_path);
1072
1073 entries
1074 .iter()
1075 .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
1076 .min_by_key(|(entry_path, content)| {
1077 let components: Vec<_> = entry_path
1078 .split('/')
1079 .filter(|part| !part.is_empty())
1080 .collect();
1081 let candidate_name = sdist_pkginfo_candidate_name(content);
1082 let name_rank = if candidate_name == expected_name {
1083 0
1084 } else {
1085 1
1086 };
1087 let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1088
1089 (name_rank, kind_rank, components.len(), entry_path.clone())
1090 })
1091 .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1092}
1093
1094fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1095 let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1096 return false;
1097 };
1098
1099 entries.iter().any(|(entry_path, content)| {
1100 sdist_pkginfo_kind_rank(entry_path) < 3
1101 && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1102 })
1103}
1104
1105fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1106 archive_path
1107 .file_name()
1108 .and_then(|name| name.to_str())
1109 .and_then(strip_python_archive_extension)
1110 .and_then(|stem| {
1111 stem.rsplit_once('-')
1112 .map(|(name, _)| normalize_python_package_name(name))
1113 })
1114}
1115
1116fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1117 let metadata = super::rfc822::parse_rfc822_content(content);
1118 super::rfc822::get_header_first(&metadata.headers, "name")
1119 .map(|name| normalize_python_package_name(&name))
1120}
1121
/// Ranks a `PKG-INFO` entry path by how canonical its location is (lower is better).
///
/// * `0`: `<root>/<x>.egg-info/PKG-INFO` (exactly three components)
/// * `1`: `<root>/PKG-INFO` (exactly two components)
/// * `2`: any other path ending in `.egg-info/PKG-INFO`
/// * `3`: everything else
///
/// Empty components (from doubled or leading slashes) are ignored when counting.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let parts: Vec<&str> = entry_path
        .split('/')
        .filter(|part| !part.is_empty())
        .collect();

    match parts.as_slice() {
        [_, egg_info_dir, "PKG-INFO"] if egg_info_dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1139
1140fn merge_sdist_archive_dependencies(
1141 entries: &[(String, String)],
1142 metadata_path: &str,
1143 package_data: &mut PackageData,
1144) {
1145 let metadata_dir = metadata_path
1146 .rsplit_once('/')
1147 .map(|(dir, _)| dir)
1148 .unwrap_or("");
1149 let archive_root = metadata_path.split('/').next().unwrap_or("");
1150 let matched_egg_info_dir =
1151 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1152 let mut extra_dependencies = Vec::new();
1153
1154 for (entry_path, content) in entries {
1155 let is_direct_requires =
1156 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1157 let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1158 entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1159 });
1160
1161 if is_direct_requires || is_egg_info_requires {
1162 extra_dependencies.extend(parse_requires_txt(content));
1163 }
1164 }
1165
1166 for dependency in extra_dependencies {
1167 if !package_data.dependencies.iter().any(|existing| {
1168 existing.purl == dependency.purl
1169 && existing.scope == dependency.scope
1170 && existing.extracted_requirement == dependency.extracted_requirement
1171 && existing.extra_data == dependency.extra_data
1172 }) {
1173 package_data.dependencies.push(dependency);
1174 }
1175 }
1176}
1177
1178fn merge_sdist_archive_file_references(
1179 entries: &[(String, String)],
1180 metadata_path: &str,
1181 package_data: &mut PackageData,
1182) {
1183 let metadata_dir = metadata_path
1184 .rsplit_once('/')
1185 .map(|(dir, _)| dir)
1186 .unwrap_or("");
1187 let archive_root = metadata_path.split('/').next().unwrap_or("");
1188 let matched_egg_info_dir =
1189 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1190 let mut extra_refs = Vec::new();
1191
1192 for (entry_path, content) in entries {
1193 let is_direct_sources =
1194 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1195 let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1196 entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1197 });
1198
1199 if is_direct_sources || is_egg_info_sources {
1200 extra_refs.extend(parse_sources_txt(content));
1201 }
1202 }
1203
1204 for file_ref in extra_refs {
1205 if !package_data
1206 .file_references
1207 .iter()
1208 .any(|existing| existing.path == file_ref.path)
1209 {
1210 package_data.file_references.push(file_ref);
1211 }
1212 }
1213}
1214
1215fn select_matching_sdist_egg_info_dir(
1216 entries: &[(String, String)],
1217 archive_root: &str,
1218 package_name: Option<&str>,
1219) -> Option<String> {
1220 let normalized_package_name = package_name.map(normalize_python_package_name);
1221
1222 entries
1223 .iter()
1224 .filter_map(|(entry_path, _)| {
1225 let components: Vec<_> = entry_path
1226 .split('/')
1227 .filter(|part| !part.is_empty())
1228 .collect();
1229 if components.len() == 3
1230 && components[0] == archive_root
1231 && components[1].ends_with(".egg-info")
1232 {
1233 Some(components[1].to_string())
1234 } else {
1235 None
1236 }
1237 })
1238 .min_by_key(|egg_info_dir| {
1239 let normalized_dir_name =
1240 normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1241 let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1242 0
1243 } else {
1244 1
1245 };
1246
1247 (name_rank, egg_info_dir.clone())
1248 })
1249}
1250
/// Normalizes a Python package name for comparison: ASCII letters are
/// lowercased and every `_` becomes `-`. All other characters are kept as-is.
fn normalize_python_package_name(name: &str) -> String {
    name.chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
1254
1255fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1256 let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1257 return;
1258 };
1259
1260 let Some(stem) = strip_python_archive_extension(file_name) else {
1261 return;
1262 };
1263
1264 let Some((name, version)) = stem.rsplit_once('-') else {
1265 return;
1266 };
1267
1268 if package_data.name.is_none() {
1269 package_data.name = Some(name.replace('_', "-"));
1270 }
1271 if package_data.version.is_none() {
1272 package_data.version = Some(version.to_string());
1273 }
1274
1275 if package_data.purl.is_none()
1276 || package_data.repository_homepage_url.is_none()
1277 || package_data.repository_download_url.is_none()
1278 || package_data.api_data_url.is_none()
1279 {
1280 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1281 build_pypi_urls(
1282 package_data.name.as_deref(),
1283 package_data.version.as_deref(),
1284 );
1285
1286 if package_data.repository_homepage_url.is_none() {
1287 package_data.repository_homepage_url = repository_homepage_url;
1288 }
1289 if package_data.repository_download_url.is_none() {
1290 package_data.repository_download_url = repository_download_url;
1291 }
1292 if package_data.api_data_url.is_none() {
1293 package_data.api_data_url = api_data_url;
1294 }
1295 if package_data.purl.is_none() {
1296 package_data.purl = purl;
1297 }
1298 }
1299}
1300
1301fn extract_from_wheel_archive(path: &Path) -> PackageData {
1302 let metadata = match std::fs::metadata(path) {
1303 Ok(m) => m,
1304 Err(e) => {
1305 warn!(
1306 "Failed to read metadata for wheel archive {:?}: {}",
1307 path, e
1308 );
1309 return default_package_data(path);
1310 }
1311 };
1312
1313 if metadata.len() > MAX_ARCHIVE_SIZE {
1314 warn!(
1315 "Wheel archive too large: {} bytes (limit: {} bytes)",
1316 metadata.len(),
1317 MAX_ARCHIVE_SIZE
1318 );
1319 return default_package_data(path);
1320 }
1321
1322 let file = match File::open(path) {
1323 Ok(f) => f,
1324 Err(e) => {
1325 warn!("Failed to open wheel archive {:?}: {}", path, e);
1326 return default_package_data(path);
1327 }
1328 };
1329
1330 let mut archive = match ZipArchive::new(file) {
1331 Ok(a) => a,
1332 Err(e) => {
1333 warn!("Failed to read wheel archive {:?}: {}", path, e);
1334 return default_package_data(path);
1335 }
1336 };
1337
1338 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1339 Ok(entries) => entries,
1340 Err(_) => return default_package_data(path),
1341 };
1342
1343 let metadata_entry =
1344 match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1345 Some(entry) => entry,
1346 None => {
1347 warn!("No METADATA file found in wheel archive {:?}", path);
1348 return default_package_data(path);
1349 }
1350 };
1351
1352 let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1353 Ok(c) => c,
1354 Err(e) => {
1355 warn!("Failed to read METADATA from {:?}: {}", path, e);
1356 return default_package_data(path);
1357 }
1358 };
1359
1360 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1361
1362 let (size, sha256) = calculate_file_checksums(path);
1363 package_data.size = size;
1364 package_data.sha256 = sha256;
1365
1366 if let Some(record_entry) =
1367 find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1368 && let Ok(record_content) =
1369 read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1370 {
1371 package_data.file_references = parse_record_csv(&record_content);
1372 }
1373
1374 if let Some(wheel_info) = parse_wheel_filename(path) {
1375 if package_data.name.is_none() {
1376 package_data.name = Some(wheel_info.name.clone());
1377 }
1378 if package_data.version.is_none() {
1379 package_data.version = Some(wheel_info.version.clone());
1380 }
1381
1382 package_data.qualifiers = Some(std::collections::HashMap::from([(
1383 "extension".to_string(),
1384 format!(
1385 "{}-{}-{}",
1386 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1387 ),
1388 )]));
1389
1390 package_data.purl = build_wheel_purl(
1391 package_data.name.as_deref(),
1392 package_data.version.as_deref(),
1393 &wheel_info,
1394 );
1395
1396 let mut extra_data = package_data.extra_data.unwrap_or_default();
1397 extra_data.insert(
1398 "python_requires".to_string(),
1399 serde_json::Value::String(wheel_info.python_tag.clone()),
1400 );
1401 extra_data.insert(
1402 "abi_tag".to_string(),
1403 serde_json::Value::String(wheel_info.abi_tag.clone()),
1404 );
1405 extra_data.insert(
1406 "platform_tag".to_string(),
1407 serde_json::Value::String(wheel_info.platform_tag.clone()),
1408 );
1409 package_data.extra_data = Some(extra_data);
1410 }
1411
1412 package_data
1413}
1414
1415fn extract_from_egg_archive(path: &Path) -> PackageData {
1416 let metadata = match std::fs::metadata(path) {
1417 Ok(m) => m,
1418 Err(e) => {
1419 warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1420 return default_package_data(path);
1421 }
1422 };
1423
1424 if metadata.len() > MAX_ARCHIVE_SIZE {
1425 warn!(
1426 "Egg archive too large: {} bytes (limit: {} bytes)",
1427 metadata.len(),
1428 MAX_ARCHIVE_SIZE
1429 );
1430 return default_package_data(path);
1431 }
1432
1433 let file = match File::open(path) {
1434 Ok(f) => f,
1435 Err(e) => {
1436 warn!("Failed to open egg archive {:?}: {}", path, e);
1437 return default_package_data(path);
1438 }
1439 };
1440
1441 let mut archive = match ZipArchive::new(file) {
1442 Ok(a) => a,
1443 Err(e) => {
1444 warn!("Failed to read egg archive {:?}: {}", path, e);
1445 return default_package_data(path);
1446 }
1447 };
1448
1449 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1450 Ok(entries) => entries,
1451 Err(_) => return default_package_data(path),
1452 };
1453
1454 let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1455 &validated_entries,
1456 &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1457 ) {
1458 Some(entry) => entry,
1459 None => {
1460 warn!("No PKG-INFO file found in egg archive {:?}", path);
1461 return default_package_data(path);
1462 }
1463 };
1464
1465 let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1466 Ok(c) => c,
1467 Err(e) => {
1468 warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1469 return default_package_data(path);
1470 }
1471 };
1472
1473 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1474
1475 let (size, sha256) = calculate_file_checksums(path);
1476 package_data.size = size;
1477 package_data.sha256 = sha256;
1478
1479 if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1480 &validated_entries,
1481 &[
1482 "EGG-INFO/installed-files.txt",
1483 ".egg-info/installed-files.txt",
1484 ],
1485 ) && let Ok(installed_files_content) =
1486 read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1487 {
1488 package_data.file_references = parse_installed_files_txt(&installed_files_content);
1489 }
1490
1491 if let Some(egg_info) = parse_egg_filename(path) {
1492 if package_data.name.is_none() {
1493 package_data.name = Some(egg_info.name.clone());
1494 }
1495 if package_data.version.is_none() {
1496 package_data.version = Some(egg_info.version.clone());
1497 }
1498
1499 if let Some(python_version) = &egg_info.python_version {
1500 let mut extra_data = package_data.extra_data.unwrap_or_default();
1501 extra_data.insert(
1502 "python_version".to_string(),
1503 serde_json::Value::String(python_version.clone()),
1504 );
1505 package_data.extra_data = Some(extra_data);
1506 }
1507 }
1508
1509 package_data.purl = build_egg_purl(
1510 package_data.name.as_deref(),
1511 package_data.version.as_deref(),
1512 );
1513
1514 package_data
1515}
1516
1517fn find_validated_zip_entry_by_suffix<'a>(
1518 entries: &'a [ValidatedZipEntry],
1519 suffix: &str,
1520) -> Option<&'a ValidatedZipEntry> {
1521 entries.iter().find(|entry| entry.name.ends_with(suffix))
1522}
1523
1524fn find_validated_zip_entry_by_any_suffix<'a>(
1525 entries: &'a [ValidatedZipEntry],
1526 suffixes: &[&str],
1527) -> Option<&'a ValidatedZipEntry> {
1528 entries
1529 .iter()
1530 .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1531}
1532
1533fn read_validated_zip_entry<R: Read + std::io::Seek>(
1534 archive: &mut ZipArchive<R>,
1535 entry: &ValidatedZipEntry,
1536 path: &Path,
1537 archive_type: &str,
1538) -> Result<String, String> {
1539 let mut file = archive
1540 .by_index(entry.index)
1541 .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1542
1543 let compressed_size = file.compressed_size();
1544 let uncompressed_size = file.size();
1545
1546 if compressed_size > 0 {
1547 let ratio = uncompressed_size as f64 / compressed_size as f64;
1548 if ratio > MAX_COMPRESSION_RATIO {
1549 return Err(format!(
1550 "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1551 archive_type, path, ratio
1552 ));
1553 }
1554 }
1555
1556 if uncompressed_size > MAX_FILE_SIZE {
1557 return Err(format!(
1558 "Rejected oversized entry in {} {:?}: {} bytes",
1559 archive_type, path, uncompressed_size
1560 ));
1561 }
1562
1563 read_limited_utf8(
1564 &mut file,
1565 MAX_FILE_SIZE,
1566 &format!("{} entry {}", archive_type, entry.name),
1567 )
1568}
1569
/// Reads at most `max_bytes` bytes from `reader` and decodes them as UTF-8.
///
/// `context` names the thing being read and is used only in error messages.
///
/// # Errors
///
/// Returns a message when the underlying read fails, when the stream holds
/// more than `max_bytes` bytes, or when the bytes are not valid UTF-8.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    // Read one byte past the limit so an oversized stream can be told apart
    // from one that is exactly `max_bytes` long. The addition saturates so a
    // limit of `u64::MAX` neither panics (debug) nor wraps to zero (release).
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if bytes.len() as u64 > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1590
/// Normalizes an archive entry path to a relative, `/`-separated form.
///
/// Backslashes are treated as separators and `.` segments are dropped.
/// Returns `None` for paths that could escape the extraction root or are
/// otherwise unusable: a drive-letter prefix such as `C:/`, an absolute path,
/// any `..` segment, or a path with no normal segments at all.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Reject `X:/...` explicitly; `Path` only recognises drive prefixes on
    // Windows.
    if let [drive, b':', b'/', ..] = unified.as_bytes() {
        if drive.is_ascii_alphabetic() {
            return None;
        }
    }

    let mut segments: Vec<String> = Vec::new();
    for part in Path::new(&unified).components() {
        match part {
            Component::Normal(segment) => segments.push(segment.to_string_lossy().to_string()),
            Component::CurDir => {}
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1612
1613pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1618 let mut reader = ReaderBuilder::new()
1619 .has_headers(false)
1620 .from_reader(content.as_bytes());
1621
1622 let mut file_references = Vec::new();
1623
1624 for result in reader.records() {
1625 match result {
1626 Ok(record) => {
1627 if record.len() < 3 {
1628 continue;
1629 }
1630
1631 let path = record.get(0).unwrap_or("").trim().to_string();
1632 if path.is_empty() {
1633 continue;
1634 }
1635
1636 let hash_field = record.get(1).unwrap_or("").trim();
1637 let size_field = record.get(2).unwrap_or("").trim();
1638
1639 let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1641 let parts: Vec<&str> = hash_field.split('=').collect();
1642 if parts.len() == 2 && parts[0] == "sha256" {
1643 match URL_SAFE_NO_PAD.decode(parts[1]) {
1644 Ok(decoded) => {
1645 let hex = decoded
1646 .iter()
1647 .map(|b| format!("{:02x}", b))
1648 .collect::<String>();
1649 Sha256Digest::from_hex(&hex).ok()
1650 }
1651 Err(_) => None,
1652 }
1653 } else {
1654 None
1655 }
1656 } else {
1657 None
1658 };
1659
1660 let size = if !size_field.is_empty() && size_field != "-" {
1662 size_field.parse::<u64>().ok()
1663 } else {
1664 None
1665 };
1666
1667 file_references.push(FileReference {
1668 path,
1669 size,
1670 sha1: None,
1671 md5: None,
1672 sha256,
1673 sha512: None,
1674 extra_data: None,
1675 });
1676 }
1677 Err(e) => {
1678 warn!("Failed to parse RECORD CSV row: {}", e);
1679 continue;
1680 }
1681 }
1682 }
1683
1684 file_references
1685}
1686
1687pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1690 content
1691 .lines()
1692 .map(|line| line.trim())
1693 .filter(|line| !line.is_empty())
1694 .map(|path| FileReference {
1695 path: path.to_string(),
1696 size: None,
1697 sha1: None,
1698 md5: None,
1699 sha256: None,
1700 sha512: None,
1701 extra_data: None,
1702 })
1703 .collect()
1704}
1705
1706pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1707 content
1708 .lines()
1709 .map(str::trim)
1710 .filter(|line| !line.is_empty())
1711 .map(|path| FileReference {
1712 path: path.to_string(),
1713 size: None,
1714 sha1: None,
1715 md5: None,
1716 sha256: None,
1717 sha512: None,
1718 extra_data: None,
1719 })
1720 .collect()
1721}
1722
/// Components extracted from a wheel file name.
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component, taken verbatim from the file name.
    version: String,
    /// Python tag, e.g. `py3` or `cp39`.
    python_tag: String,
    /// ABI tag, e.g. `none` or `cp39`.
    abi_tag: String,
    /// Platform tag, e.g. `any` or `linux_x86_64`.
    platform_tag: String,
}

/// Splits a wheel file name of the form
/// `{name}-{version}[-{build}]-{python}-{abi}-{platform}.whl` into its parts.
///
/// The optional build tag is recognised by its leading digit (python tags
/// never start with one) and is skipped, so the python/abi/platform tags are
/// not shifted by it. Any extra `-`-separated components after the ABI tag
/// are joined back into the platform tag.
///
/// Returns `None` when the path has no file stem or the stem has fewer than
/// five `-`-separated components.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    // With six or more components, a third component starting with a digit is
    // the build tag; the compatibility tags then start one position later.
    let has_build_tag = parts.len() >= 6
        && parts[2]
            .chars()
            .next()
            .is_some_and(|first| first.is_ascii_digit());
    let tags_start = if has_build_tag { 3 } else { 2 };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: parts[tags_start].to_string(),
        abi_tag: parts[tags_start + 1].to_string(),
        platform_tag: parts[tags_start + 2..].join("-"),
    })
}
1747
/// Components extracted from an egg file name.
struct EggInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component, taken verbatim from the file name.
    version: String,
    /// Third `-`-separated component of the file stem, when present.
    python_version: Option<String>,
}

/// Splits an egg file stem on `-` into name, version and an optional third
/// component stored as the Python version.
///
/// Returns `None` when the path has no file stem or the stem contains no `-`.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut segments = stem.split('-');

    let name = segments.next()?;
    let version = segments.next()?;
    let python_version = segments.next().map(str::to_string);

    Some(EggInfo {
        name: name.replace('_', "-"),
        version: version.to_string(),
        python_version,
    })
}
1768
1769fn build_wheel_purl(
1770 name: Option<&str>,
1771 version: Option<&str>,
1772 wheel_info: &WheelInfo,
1773) -> Option<String> {
1774 let name = name?;
1775 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1776
1777 if let Some(ver) = version {
1778 package_url.with_version(ver).ok()?;
1779 }
1780
1781 let extension = format!(
1782 "{}-{}-{}",
1783 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1784 );
1785 package_url.add_qualifier("extension", extension).ok()?;
1786
1787 Some(package_url.to_string())
1788}
1789
1790fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1791 let name = name?;
1792 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1793
1794 if let Some(ver) = version {
1795 package_url.with_version(ver).ok()?;
1796 }
1797
1798 package_url.add_qualifier("type", "egg").ok()?;
1799
1800 Some(package_url.to_string())
1801}
1802
1803fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1804 let metadata = super::rfc822::parse_rfc822_content(content);
1805 build_package_data_from_rfc822(&metadata, datasource_id)
1806}
1807
1808fn build_package_data_from_rfc822(
1813 metadata: &super::rfc822::Rfc822Metadata,
1814 datasource_id: DatasourceId,
1815) -> PackageData {
1816 use super::rfc822::{get_header_all, get_header_first};
1817
1818 let name = get_header_first(&metadata.headers, "name");
1819 let version = get_header_first(&metadata.headers, "version");
1820 let summary = get_header_first(&metadata.headers, "summary");
1821 let mut homepage_url = get_header_first(&metadata.headers, "home-page");
1822 let author = get_header_first(&metadata.headers, "author");
1823 let author_email = get_header_first(&metadata.headers, "author-email");
1824 let license = get_header_first(&metadata.headers, "license");
1825 let license_expression = get_header_first(&metadata.headers, "license-expression");
1826 let download_url = get_header_first(&metadata.headers, "download-url");
1827 let platform = get_header_first(&metadata.headers, "platform");
1828 let requires_python = get_header_first(&metadata.headers, "requires-python");
1829 let classifiers = get_header_all(&metadata.headers, "classifier");
1830 let license_files = get_header_all(&metadata.headers, "license-file");
1831
1832 let description_body = if metadata.body.is_empty() {
1833 get_header_first(&metadata.headers, "description").unwrap_or_default()
1834 } else {
1835 metadata.body.clone()
1836 };
1837
1838 let description = build_description(summary.as_deref(), &description_body);
1839
1840 let mut parties = Vec::new();
1841 if author.is_some() || author_email.is_some() {
1842 parties.push(Party {
1843 r#type: Some("person".to_string()),
1844 role: Some("author".to_string()),
1845 name: author,
1846 email: author_email,
1847 url: None,
1848 organization: None,
1849 organization_url: None,
1850 timezone: None,
1851 });
1852 }
1853
1854 let (keywords, license_classifiers) = split_classifiers(&classifiers);
1855 let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1856 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1857 license_expression
1858 .as_deref()
1859 .and_then(normalize_spdx_expression)
1860 .map(|normalized| {
1861 build_declared_license_data(
1862 normalized,
1863 DeclaredLicenseMatchMetadata::single_line(
1864 license_expression.as_deref().unwrap_or_default(),
1865 )
1866 .with_referenced_filenames(&referenced_license_files),
1867 )
1868 })
1869 .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1870
1871 let extracted_license_statement = license_expression
1872 .clone()
1873 .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1874
1875 let mut extra_data = HashMap::new();
1876 if let Some(platform_value) = platform
1877 && !platform_value.eq_ignore_ascii_case("unknown")
1878 && !platform_value.is_empty()
1879 {
1880 extra_data.insert(
1881 "platform".to_string(),
1882 serde_json::Value::String(platform_value),
1883 );
1884 }
1885
1886 if let Some(requires_python_value) = requires_python
1887 && !requires_python_value.is_empty()
1888 {
1889 extra_data.insert(
1890 "requires_python".to_string(),
1891 serde_json::Value::String(requires_python_value),
1892 );
1893 }
1894
1895 if !license_files.is_empty() {
1896 extra_data.insert(
1897 "license_files".to_string(),
1898 serde_json::Value::Array(
1899 license_files
1900 .iter()
1901 .cloned()
1902 .map(serde_json::Value::String)
1903 .collect(),
1904 ),
1905 );
1906 }
1907
1908 let file_references = license_files
1909 .iter()
1910 .map(|path| FileReference {
1911 path: path.clone(),
1912 size: None,
1913 sha1: None,
1914 md5: None,
1915 sha256: None,
1916 sha512: None,
1917 extra_data: None,
1918 })
1919 .collect();
1920
1921 let project_urls = get_header_all(&metadata.headers, "project-url");
1922 let dependencies = extract_rfc822_dependencies(&metadata.headers);
1923 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1924
1925 if !project_urls.is_empty() {
1926 let parsed_urls = parse_project_urls(&project_urls);
1927
1928 for (label, url) in &parsed_urls {
1929 let label_lower = label.to_lowercase();
1930
1931 if bug_tracking_url.is_none()
1932 && matches!(
1933 label_lower.as_str(),
1934 "tracker"
1935 | "bug reports"
1936 | "bug tracker"
1937 | "issues"
1938 | "issue tracker"
1939 | "github: issues"
1940 )
1941 {
1942 bug_tracking_url = Some(url.clone());
1943 } else if code_view_url.is_none()
1944 && matches!(label_lower.as_str(), "source" | "source code" | "code")
1945 {
1946 code_view_url = Some(url.clone());
1947 } else if vcs_url.is_none()
1948 && matches!(
1949 label_lower.as_str(),
1950 "github" | "gitlab" | "github: repo" | "repository"
1951 )
1952 {
1953 vcs_url = Some(url.clone());
1954 } else if homepage_url.is_none()
1955 && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1956 {
1957 homepage_url = Some(url.clone());
1958 } else if label_lower == "changelog" {
1959 extra_data.insert(
1960 "changelog_url".to_string(),
1961 serde_json::Value::String(url.clone()),
1962 );
1963 }
1964 }
1965
1966 let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
1967 .iter()
1968 .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
1969 .collect();
1970
1971 if !project_urls_json.is_empty() {
1972 extra_data.insert(
1973 "project_urls".to_string(),
1974 serde_json::Value::Object(project_urls_json),
1975 );
1976 }
1977 }
1978
1979 let extra_data = if extra_data.is_empty() {
1980 None
1981 } else {
1982 Some(extra_data)
1983 };
1984
1985 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1986 build_pypi_urls(name.as_deref(), version.as_deref());
1987
1988 PackageData {
1989 package_type: Some(PythonParser::PACKAGE_TYPE),
1990 namespace: None,
1991 name,
1992 version,
1993 qualifiers: None,
1994 subpath: None,
1995 primary_language: Some("Python".to_string()),
1996 description,
1997 release_date: None,
1998 parties,
1999 keywords,
2000 homepage_url,
2001 download_url,
2002 size: None,
2003 sha1: None,
2004 md5: None,
2005 sha256: None,
2006 sha512: None,
2007 bug_tracking_url,
2008 code_view_url,
2009 vcs_url,
2010 copyright: None,
2011 holder: None,
2012 declared_license_expression,
2013 declared_license_expression_spdx,
2014 license_detections,
2015 other_license_expression: None,
2016 other_license_expression_spdx: None,
2017 other_license_detections: Vec::new(),
2018 extracted_license_statement,
2019 notice_text: None,
2020 source_packages: Vec::new(),
2021 file_references,
2022 is_private: false,
2023 is_virtual: false,
2024 extra_data,
2025 dependencies,
2026 repository_homepage_url,
2027 repository_download_url,
2028 api_data_url,
2029 datasource_id: Some(datasource_id),
2030 purl,
2031 }
2032}
2033
/// Splits `Project-URL` header values of the form `Label, URL` into
/// `(label, url)` pairs.
///
/// Each value is split at its first comma and both halves are trimmed, so
/// `Label,URL` (no space after the comma) is accepted as well and commas
/// inside the URL are preserved. Values without a comma, or with an empty
/// label or URL, are dropped.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|url_entry| {
            let (label, url) = url_entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());
            if label.is_empty() || url.is_empty() {
                return None;
            }
            Some((label.to_string(), url.to_string()))
        })
        .collect()
}
2049
/// Combines an optional summary and a body into one description.
///
/// Both parts are trimmed; blank parts are left out. When both remain they
/// are joined with a single newline, summary first. Returns `None` when
/// neither part has content.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let summary_text = summary.map(str::trim).filter(|text| !text.is_empty());
    let body_text = Some(body.trim()).filter(|text| !text.is_empty());

    match (summary_text, body_text) {
        (Some(head), Some(rest)) => Some(format!("{}\n{}", head, rest)),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
2068
/// Splits trove classifiers into `(keywords, license_classifiers)`.
///
/// Classifiers starting with `License ::` go into the second list; all
/// others go into the first. Input order is preserved within each list.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|classifier| classifier.starts_with("License ::"));

    (keywords, license_classifiers)
}
2083
/// Renders the raw license information as a small YAML-like text block.
///
/// A non-blank `license` value produces a `license: <value>` line (trimmed).
/// A non-empty `license_classifiers` list produces a `classifiers:` line
/// followed by one quoted bullet per classifier. Every line, including the
/// last, ends with a newline. Returns `None` when there is nothing to render.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let license_line = license
        .map(str::trim)
        .filter(|value| !value.is_empty())
        .map(|value| format!("license: {}", value));

    let mut lines: Vec<String> = license_line.into_iter().collect();

    if !license_classifiers.is_empty() {
        lines.push("classifiers:".to_string());
        lines.extend(
            license_classifiers
                .iter()
                .map(|classifier| format!(" - '{}'", classifier)),
        );
    }

    if lines.is_empty() {
        return None;
    }

    Some(format!("{}\n", lines.join("\n")))
}
2109
2110pub(crate) fn build_pypi_urls(
2111 name: Option<&str>,
2112 version: Option<&str>,
2113) -> (
2114 Option<String>,
2115 Option<String>,
2116 Option<String>,
2117 Option<String>,
2118) {
2119 let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2120
2121 let repository_download_url = name.and_then(|value| {
2122 version.map(|ver| {
2123 format!(
2124 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2125 &value[..1.min(value.len())],
2126 value,
2127 value,
2128 ver
2129 )
2130 })
2131 });
2132
2133 let api_data_url = name.map(|value| {
2134 if let Some(ver) = version {
2135 format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2136 } else {
2137 format!("https://pypi.org/pypi/{}/json", value)
2138 }
2139 });
2140
2141 let purl = name.and_then(|value| {
2142 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2143 if let Some(ver) = version {
2144 package_url.with_version(ver).ok()?;
2145 }
2146 Some(package_url.to_string())
2147 });
2148
2149 (
2150 repository_homepage_url,
2151 repository_download_url,
2152 api_data_url,
2153 purl,
2154 )
2155}
2156
2157fn build_pypi_purl_with_extension(
2158 name: &str,
2159 version: Option<&str>,
2160 extension: &str,
2161) -> Option<String> {
2162 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2163 if let Some(ver) = version {
2164 package_url.with_version(ver).ok()?;
2165 }
2166 package_url.add_qualifier("extension", extension).ok()?;
2167 Some(package_url.to_string())
2168}
2169
2170fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2171 let toml_content = match read_toml_file(path) {
2172 Ok(content) => content,
2173 Err(e) => {
2174 warn!(
2175 "Failed to read or parse pyproject.toml at {:?}: {}",
2176 path, e
2177 );
2178 return default_package_data(path);
2179 }
2180 };
2181
2182 let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2183 let is_poetry_pyproject = tool_table
2184 .and_then(|tool| tool.get("poetry"))
2185 .and_then(|value| value.as_table())
2186 .is_some();
2187
2188 let project_table =
2190 if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2191 project.clone()
2193 } else if let Some(tool) = tool_table {
2194 if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2195 poetry.clone()
2197 } else {
2198 return default_package_data(path);
2199 }
2200 } else if toml_content.get(FIELD_NAME).is_some() {
2201 match toml_content.as_table() {
2203 Some(table) => table.clone(),
2204 None => {
2205 warn!("Failed to convert TOML content to table in {:?}", path);
2206 return default_package_data(path);
2207 }
2208 }
2209 } else {
2210 return default_package_data(path);
2211 };
2212
2213 let name = project_table
2214 .get(FIELD_NAME)
2215 .and_then(|v| v.as_str())
2216 .map(String::from);
2217
2218 let version = project_table
2219 .get(FIELD_VERSION)
2220 .and_then(|v| v.as_str())
2221 .map(String::from);
2222 let classifiers = project_table
2223 .get("classifiers")
2224 .and_then(|value| value.as_array())
2225 .map(|values| {
2226 values
2227 .iter()
2228 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2229 .collect::<Vec<_>>()
2230 })
2231 .unwrap_or_default();
2232 let (classifier_keywords, license_classifiers) = split_classifiers(&classifiers);
2233
2234 let extracted_license_statement = extract_raw_license_string(&project_table);
2235 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2236 normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2237
2238 let description = project_table
2239 .get(FIELD_DESCRIPTION)
2240 .and_then(|value| value.as_str())
2241 .map(|value| value.to_string());
2242 let mut keywords = project_table
2243 .get(FIELD_KEYWORDS)
2244 .and_then(|value| value.as_array())
2245 .map(|values| {
2246 values
2247 .iter()
2248 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2249 .collect::<Vec<_>>()
2250 })
2251 .unwrap_or_default();
2252 for classifier in classifier_keywords {
2253 if !keywords.contains(&classifier) {
2254 keywords.push(classifier);
2255 }
2256 }
2257
2258 let mut extra_data = extract_pyproject_extra_data(&toml_content).unwrap_or_default();
2260 let (homepage_url, download_url, bug_tracking_url, code_view_url, repository_url) =
2261 extract_urls(&project_table, &mut extra_data);
2262
2263 let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2264
2265 let purl = name.as_ref().and_then(|n| {
2267 let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2268 Ok(p) => p,
2269 Err(e) => {
2270 warn!(
2271 "Failed to create PackageUrl for Python package '{}': {}",
2272 n, e
2273 );
2274 return None;
2275 }
2276 };
2277
2278 if let Some(v) = &version
2279 && let Err(e) = package_url.with_version(v)
2280 {
2281 warn!(
2282 "Failed to set version '{}' for Python package '{}': {}",
2283 v, n, e
2284 );
2285 return None;
2286 }
2287
2288 Some(package_url.to_string())
2289 });
2290
2291 let api_data_url = name.as_ref().map(|n| {
2292 if let Some(v) = &version {
2293 format!("https://pypi.org/pypi/{}/{}/json", n, v)
2294 } else {
2295 format!("https://pypi.org/pypi/{}/json", n)
2296 }
2297 });
2298
2299 let pypi_homepage_url = name
2300 .as_ref()
2301 .map(|n| format!("https://pypi.org/project/{}", n));
2302
2303 let pypi_download_url = name.as_ref().and_then(|n| {
2304 version.as_ref().map(|v| {
2305 format!(
2306 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2307 &n[..1.min(n.len())],
2308 n,
2309 n,
2310 v
2311 )
2312 })
2313 });
2314
2315 PackageData {
2316 package_type: Some(PythonParser::PACKAGE_TYPE),
2317 namespace: None,
2318 name,
2319 version,
2320 qualifiers: None,
2321 subpath: None,
2322 primary_language: None,
2323 description,
2324 release_date: None,
2325 parties: extract_parties(&project_table),
2326 keywords,
2327 homepage_url: homepage_url.or(pypi_homepage_url),
2328 download_url: download_url
2329 .or_else(|| repository_url.clone())
2330 .or(pypi_download_url),
2331 size: None,
2332 sha1: None,
2333 md5: None,
2334 sha256: None,
2335 sha512: None,
2336 bug_tracking_url,
2337 code_view_url,
2338 vcs_url: repository_url,
2339 copyright: None,
2340 holder: None,
2341 declared_license_expression,
2342 declared_license_expression_spdx,
2343 license_detections,
2344 other_license_expression: None,
2345 other_license_expression_spdx: None,
2346 other_license_detections: Vec::new(),
2347 extracted_license_statement: extracted_license_statement
2348 .or_else(|| build_extracted_license_statement(None, &license_classifiers)),
2349 notice_text: None,
2350 source_packages: Vec::new(),
2351 file_references: Vec::new(),
2352 is_private: has_private_classifier(&classifiers),
2353 is_virtual: false,
2354 extra_data: if extra_data.is_empty() {
2355 None
2356 } else {
2357 Some(extra_data)
2358 },
2359 dependencies: [dependencies, optional_dependencies].concat(),
2360 repository_homepage_url: None,
2361 repository_download_url: None,
2362 api_data_url,
2363 datasource_id: Some(if is_poetry_pyproject {
2364 DatasourceId::PypiPoetryPyprojectToml
2365 } else {
2366 DatasourceId::PypiPyprojectToml
2367 }),
2368 purl,
2369 }
2370}
2371
2372fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2373 let path_str = path.to_string_lossy().replace('\\', "/");
2374 if path_str.contains("/EGG-INFO/PKG-INFO") {
2375 DatasourceId::PypiEggPkginfo
2376 } else if path_str.ends_with(".egg-info/PKG-INFO") {
2377 DatasourceId::PypiEditableEggPkginfo
2378 } else {
2379 DatasourceId::PypiSdistPkginfo
2380 }
2381}
2382
2383fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2384 project
2385 .get(FIELD_LICENSE)
2386 .and_then(|license_value| match license_value {
2387 TomlValue::String(license_str) => Some(license_str.clone()),
2388 TomlValue::Table(license_table) => license_table
2389 .get("text")
2390 .and_then(|v| v.as_str())
2391 .map(|s| s.to_string())
2392 .or_else(|| {
2393 license_table
2394 .get("expression")
2395 .and_then(|v| v.as_str())
2396 .map(|expr| expr.to_string())
2397 }),
2398 _ => None,
2399 })
2400}
2401
2402fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2403 match project.get(FIELD_LICENSE) {
2404 Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2405 Some(TomlValue::Table(license_table)) => license_table
2406 .get("expression")
2407 .and_then(|value| value.as_str()),
2408 _ => None,
2409 }
2410}
2411
2412fn extract_urls(
2413 project: &TomlMap<String, TomlValue>,
2414 extra_data: &mut HashMap<String, serde_json::Value>,
2415) -> ProjectUrls {
2416 let mut homepage_url = None;
2417 let mut download_url = None;
2418 let mut bug_tracking_url = None;
2419 let mut code_view_url = None;
2420 let mut repository_url = None;
2421
2422 if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2424 let parsed_urls: Vec<(String, String)> = urls
2425 .iter()
2426 .filter_map(|(label, value)| {
2427 value
2428 .as_str()
2429 .map(|url| (label.to_string(), url.to_string()))
2430 })
2431 .collect();
2432 apply_project_url_mappings(
2433 &parsed_urls,
2434 &mut homepage_url,
2435 &mut bug_tracking_url,
2436 &mut code_view_url,
2437 &mut repository_url,
2438 extra_data,
2439 );
2440
2441 download_url = urls
2442 .get("Downloads")
2443 .or_else(|| urls.get("downloads"))
2444 .and_then(|v| v.as_str())
2445 .map(String::from);
2446
2447 if homepage_url.is_none() {
2448 homepage_url = urls
2449 .get(FIELD_HOMEPAGE)
2450 .and_then(|v| v.as_str())
2451 .map(String::from);
2452 }
2453 if repository_url.is_none() {
2454 repository_url = urls
2455 .get(FIELD_REPOSITORY)
2456 .and_then(|v| v.as_str())
2457 .map(String::from);
2458 }
2459 }
2460
2461 if homepage_url.is_none() {
2463 homepage_url = project
2464 .get(FIELD_HOMEPAGE)
2465 .and_then(|v| v.as_str())
2466 .map(String::from);
2467 }
2468
2469 if repository_url.is_none() {
2470 repository_url = project
2471 .get(FIELD_REPOSITORY)
2472 .and_then(|v| v.as_str())
2473 .map(String::from);
2474 }
2475
2476 (
2477 homepage_url,
2478 download_url,
2479 bug_tracking_url,
2480 code_view_url,
2481 repository_url,
2482 )
2483}
2484
2485fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2486 let mut parties = Vec::new();
2487
2488 if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2489 for author in authors {
2490 if let Some(author_str) = author.as_str() {
2491 let (name, email) = split_name_email(author_str);
2492 parties.push(Party {
2493 r#type: None,
2494 role: Some("author".to_string()),
2495 name,
2496 email,
2497 url: None,
2498 organization: None,
2499 organization_url: None,
2500 timezone: None,
2501 });
2502 } else if let Some(author_table) = author.as_table() {
2503 let name = author_table
2504 .get("name")
2505 .and_then(|value| value.as_str())
2506 .map(|value| value.to_string());
2507 let email = author_table
2508 .get("email")
2509 .and_then(|value| value.as_str())
2510 .map(|value| value.to_string());
2511 if name.is_some() || email.is_some() {
2512 parties.push(Party {
2513 r#type: None,
2514 role: Some("author".to_string()),
2515 name,
2516 email,
2517 url: None,
2518 organization: None,
2519 organization_url: None,
2520 timezone: None,
2521 });
2522 }
2523 }
2524 }
2525 }
2526
2527 if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2528 for maintainer in maintainers {
2529 if let Some(maintainer_str) = maintainer.as_str() {
2530 let (name, email) = split_name_email(maintainer_str);
2531 parties.push(Party {
2532 r#type: None,
2533 role: Some("maintainer".to_string()),
2534 name,
2535 email,
2536 url: None,
2537 organization: None,
2538 organization_url: None,
2539 timezone: None,
2540 });
2541 } else if let Some(maintainer_table) = maintainer.as_table() {
2542 let name = maintainer_table
2543 .get("name")
2544 .and_then(|value| value.as_str())
2545 .map(|value| value.to_string());
2546 let email = maintainer_table
2547 .get("email")
2548 .and_then(|value| value.as_str())
2549 .map(|value| value.to_string());
2550 if name.is_some() || email.is_some() {
2551 parties.push(Party {
2552 r#type: None,
2553 role: Some("maintainer".to_string()),
2554 name,
2555 email,
2556 url: None,
2557 organization: None,
2558 organization_url: None,
2559 timezone: None,
2560 });
2561 }
2562 }
2563 }
2564 }
2565
2566 parties
2567}
2568
2569fn extract_dependencies(
2570 project: &TomlMap<String, TomlValue>,
2571 toml_content: &TomlValue,
2572) -> (Vec<Dependency>, Vec<Dependency>) {
2573 let mut dependencies = Vec::new();
2574 let mut optional_dependencies = Vec::new();
2575
2576 if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2578 match deps_value {
2579 TomlValue::Array(arr) => {
2580 dependencies = parse_dependency_array(arr, false, None);
2581 }
2582 TomlValue::Table(table) => {
2583 dependencies = parse_dependency_table(table, false, None);
2584 }
2585 _ => {}
2586 }
2587 }
2588
2589 if let Some(opt_deps_table) = project
2591 .get(FIELD_OPTIONAL_DEPENDENCIES)
2592 .and_then(|v| v.as_table())
2593 {
2594 for (extra_name, deps) in opt_deps_table {
2595 match deps {
2596 TomlValue::Array(arr) => {
2597 optional_dependencies.extend(parse_dependency_array(
2598 arr,
2599 true,
2600 Some(extra_name),
2601 ));
2602 }
2603 TomlValue::Table(table) => {
2604 optional_dependencies.extend(parse_dependency_table(
2605 table,
2606 true,
2607 Some(extra_name),
2608 ));
2609 }
2610 _ => {}
2611 }
2612 }
2613 }
2614
2615 if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2617 match dev_deps_value {
2618 TomlValue::Array(arr) => {
2619 optional_dependencies.extend(parse_dependency_array(
2620 arr,
2621 true,
2622 Some(FIELD_DEV_DEPENDENCIES),
2623 ));
2624 }
2625 TomlValue::Table(table) => {
2626 optional_dependencies.extend(parse_dependency_table(
2627 table,
2628 true,
2629 Some(FIELD_DEV_DEPENDENCIES),
2630 ));
2631 }
2632 _ => {}
2633 }
2634 }
2635
2636 if let Some(groups_table) = project.get("group").and_then(|v| v.as_table()) {
2638 for (group_name, group_data) in groups_table {
2639 if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2640 match group_deps {
2641 TomlValue::Array(arr) => {
2642 optional_dependencies.extend(parse_dependency_array(
2643 arr,
2644 true,
2645 Some(group_name),
2646 ));
2647 }
2648 TomlValue::Table(table) => {
2649 optional_dependencies.extend(parse_dependency_table(
2650 table,
2651 true,
2652 Some(group_name),
2653 ));
2654 }
2655 _ => {}
2656 }
2657 }
2658 }
2659 }
2660
2661 if let Some(groups_table) = toml_content
2662 .get(FIELD_DEPENDENCY_GROUPS)
2663 .and_then(|value| value.as_table())
2664 {
2665 for (group_name, deps) in groups_table {
2666 match deps {
2667 TomlValue::Array(arr) => {
2668 optional_dependencies.extend(parse_dependency_array(
2669 arr,
2670 true,
2671 Some(group_name),
2672 ));
2673 }
2674 TomlValue::Table(table) => {
2675 optional_dependencies.extend(parse_dependency_table(
2676 table,
2677 true,
2678 Some(group_name),
2679 ));
2680 }
2681 _ => {}
2682 }
2683 }
2684 }
2685
2686 if let Some(dev_deps_value) = toml_content
2687 .get("tool")
2688 .and_then(|value| value.as_table())
2689 .and_then(|tool| tool.get("uv"))
2690 .and_then(|value| value.as_table())
2691 .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2692 {
2693 match dev_deps_value {
2694 TomlValue::Array(arr) => {
2695 optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2696 }
2697 TomlValue::Table(table) => {
2698 optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2699 }
2700 _ => {}
2701 }
2702 }
2703
2704 (dependencies, optional_dependencies)
2705}
2706
2707fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2708 let mut extra_data = HashMap::new();
2709
2710 if let Some(tool_uv) = toml_content
2711 .get("tool")
2712 .and_then(|value| value.as_table())
2713 .and_then(|tool| tool.get("uv"))
2714 {
2715 extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2716 }
2717
2718 if extra_data.is_empty() {
2719 None
2720 } else {
2721 Some(extra_data)
2722 }
2723}
2724
2725fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2726 match value {
2727 TomlValue::String(value) => JsonValue::String(value.clone()),
2728 TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2729 TomlValue::Float(value) => JsonValue::String(value.to_string()),
2730 TomlValue::Boolean(value) => JsonValue::Bool(*value),
2731 TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2732 TomlValue::Array(values) => {
2733 JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2734 }
2735 TomlValue::Table(values) => JsonValue::Object(
2736 values
2737 .iter()
2738 .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2739 .collect::<JsonMap<String, JsonValue>>(),
2740 ),
2741 }
2742}
2743
2744fn parse_dependency_table(
2745 table: &TomlMap<String, TomlValue>,
2746 is_optional: bool,
2747 scope: Option<&str>,
2748) -> Vec<Dependency> {
2749 table
2750 .iter()
2751 .filter_map(|(name, version)| {
2752 let version_str = version.as_str().map(|s| s.to_string());
2753 let mut package_url =
2754 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2755
2756 if let Some(v) = &version_str {
2757 package_url.with_version(v).ok()?;
2758 }
2759
2760 Some(Dependency {
2761 purl: Some(package_url.to_string()),
2762 extracted_requirement: None,
2763 scope: scope.map(|s| s.to_string()),
2764 is_runtime: Some(!is_optional),
2765 is_optional: Some(is_optional),
2766 is_pinned: None,
2767 is_direct: Some(true),
2768 resolved_package: None,
2769 extra_data: None,
2770 })
2771 })
2772 .collect()
2773}
2774
2775fn parse_dependency_array(
2776 array: &[TomlValue],
2777 is_optional: bool,
2778 scope: Option<&str>,
2779) -> Vec<Dependency> {
2780 array
2781 .iter()
2782 .filter_map(|dep| {
2783 let dep_str = dep.as_str()?;
2784 build_pyproject_array_dependency(dep_str, is_optional, scope)
2785 })
2786 .collect()
2787}
2788
2789fn build_pyproject_array_dependency(
2790 dep_str: &str,
2791 is_optional: bool,
2792 scope: Option<&str>,
2793) -> Option<Dependency> {
2794 let parsed = parse_pep508_requirement(dep_str)?;
2795 let name = normalize_python_package_name(&parsed.name);
2796 let pinned_version = parsed
2797 .specifiers
2798 .as_deref()
2799 .and_then(extract_exact_pinned_version);
2800
2801 let purl = build_python_dependency_purl(&name, pinned_version.as_deref())?;
2802
2803 let mut extra_data = HashMap::new();
2804 if let Some(marker) = parsed.marker {
2805 extra_data.insert("marker".to_string(), JsonValue::String(marker));
2806 }
2807 if !parsed.extras.is_empty() {
2808 extra_data.insert(
2809 "extras".to_string(),
2810 JsonValue::Array(parsed.extras.into_iter().map(JsonValue::String).collect()),
2811 );
2812 }
2813
2814 let extracted_requirement = parsed.specifiers.or(parsed.url);
2815
2816 Some(Dependency {
2817 purl: Some(purl),
2818 extracted_requirement: extracted_requirement.clone(),
2819 scope: scope.map(|s| s.to_string()),
2820 is_runtime: Some(!is_optional),
2821 is_optional: Some(is_optional),
2822 is_pinned: Some(pinned_version.is_some()),
2823 is_direct: Some(true),
2824 resolved_package: None,
2825 extra_data: if extra_data.is_empty() {
2826 None
2827 } else {
2828 Some(extra_data)
2829 },
2830 })
2831}
2832
/// Returns the version a specifier string pins to, if it is an exact pin.
///
/// A specifier counts as an exact pin when it is a single clause (no `,`)
/// using `==` or `===` with a non-empty version. Surrounding whitespace and
/// whitespace between the operator and the version are ignored.
///
/// `==` with a trailing `.*` (for example `==1.4.*`) is a PEP 440 prefix
/// match that accepts a whole family of releases, so it is *not* treated as a
/// pin. `===` compares strings literally, so its operand is returned
/// unchanged.
fn extract_exact_pinned_version(specifiers: &str) -> Option<String> {
    let specifier = specifiers.trim();
    if specifier.contains(',') {
        return None;
    }

    // `===` has to be tried first: `==` is a prefix of it and would leave a stray `=`.
    let (version, is_arbitrary_equality) = if let Some(rest) = specifier.strip_prefix("===") {
        (rest.trim(), true)
    } else if let Some(rest) = specifier.strip_prefix("==") {
        (rest.trim(), false)
    } else {
        return None;
    };

    if version.is_empty() {
        return None;
    }

    // `==X.*` matches every release starting with `X.`; it does not name one version.
    if !is_arbitrary_equality && version.ends_with(".*") {
        return None;
    }

    Some(version.to_string())
}
2854
/// A Python literal value as produced by `LiteralEvaluator`.
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// A numeric literal; integers and floats are both stored as `f64`.
    Number(f64),
    /// `True` or `False`.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list whose elements all evaluated successfully.
    List(Vec<Value>),
    /// A tuple whose elements all evaluated successfully.
    Tuple(Vec<Value>),
    /// A dictionary; keys are stored in their string form.
    Dict(HashMap<String, Value>),
}
2865
/// Evaluates a restricted subset of Python expressions to `Value`s, with
/// limits on nesting depth and on the number of nodes visited.
struct LiteralEvaluator {
    /// Known names and their values, used to resolve `Name` expressions.
    constants: HashMap<String, Value>,
    /// Evaluation gives up once the recursion depth reaches this value.
    max_depth: usize,
    /// Evaluation gives up once this many nodes have been visited.
    max_nodes: usize,
    /// Running count of nodes visited over the evaluator's lifetime.
    nodes_visited: usize,
}
2872
impl LiteralEvaluator {
    /// Creates an evaluator seeded with `constants`, using the file-level
    /// `MAX_SETUP_PY_AST_DEPTH` and `MAX_SETUP_PY_AST_NODES` limits.
    fn new(constants: HashMap<String, Value>) -> Self {
        Self {
            constants,
            max_depth: MAX_SETUP_PY_AST_DEPTH,
            max_nodes: MAX_SETUP_PY_AST_NODES,
            nodes_visited: 0,
        }
    }

    /// Registers (or overwrites) the constant `name`.
    fn insert_constant(&mut self, name: String, value: Value) {
        self.constants.insert(name, value);
    }

    /// Evaluates `expr` to a `Value`, or returns `None` when the expression is
    /// not supported or a limit has been hit.
    ///
    /// Supported forms: string, boolean, number and `None` literals; names
    /// found in `constants`; lists, tuples and dict displays; `dict(k=v, ...)`
    /// calls; and `OrderedDict(...)` / `collections.OrderedDict(...)` calls.
    /// Containers evaluate to `None` as soon as any element fails.
    ///
    /// `depth` is the current nesting level; callers start at `0`.
    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
        // Enforce both budgets before doing any work on this node.
        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
            return None;
        }
        self.nodes_visited += 1;

        match expr {
            ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
                Some(Value::String(value.to_str().to_string()))
            }
            ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
                Some(Value::Bool(*value))
            }
            ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
                self.evaluate_number(value)
            }
            ast::Expr::NoneLiteral(_) => Some(Value::None),
            // Names resolve only through previously registered constants.
            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
            ast::Expr::List(ast::ExprList { elts, .. }) => {
                let mut values = Vec::new();
                for elt in elts {
                    values.push(self.evaluate_expr(elt, depth + 1)?);
                }
                Some(Value::List(values))
            }
            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
                let mut values = Vec::new();
                for elt in elts {
                    values.push(self.evaluate_expr(elt, depth + 1)?);
                }
                Some(Value::Tuple(values))
            }
            ast::Expr::Dict(ast::ExprDict { items, .. }) => {
                let mut dict = HashMap::new();
                for item in items {
                    // An item without a key (presumably `**mapping` unpacking) aborts the dict.
                    let key_expr = item.key.as_ref()?;
                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
                    let key = value_to_string(&key_value)?;
                    let value = self.evaluate_expr(&item.value, depth + 1)?;
                    dict.insert(key, value);
                }
                Some(Value::Dict(dict))
            }
            ast::Expr::Call(ast::ExprCall {
                func, arguments, ..
            }) => {
                let args = arguments.args.as_ref();
                let keywords = arguments.keywords.as_ref();
                // `OrderedDict(...)` without keyword arguments gets dedicated handling.
                if keywords.is_empty()
                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
                {
                    return self.evaluate_ordered_dict(args, depth + 1);
                }

                // Any other call with positional arguments is unsupported.
                if !args.is_empty() {
                    return None;
                }

                // `dict(key=value, ...)`: every keyword becomes one entry.
                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
                    && id == "dict"
                {
                    let mut dict = HashMap::new();
                    for keyword in keywords {
                        // A keyword without a name (presumably `**kwargs`) aborts the dict.
                        let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
                        dict.insert(key.to_string(), value);
                    }
                    return Some(Value::Dict(dict));
                }

                None
            }
            _ => None,
        }
    }

    /// Converts a numeric literal to `Value::Number`.
    ///
    /// Integers go through their decimal string form and are parsed as `f64`;
    /// complex literals are not supported and give `None`.
    fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
        match number {
            ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
            ast::Number::Float(value) => Some(Value::Number(*value)),
            ast::Number::Complex { .. } => None,
        }
    }

    /// Evaluates the arguments of an `OrderedDict(...)` call to a `Value::Dict`.
    ///
    /// Exactly one positional argument is accepted, and it must evaluate to a
    /// list or tuple whose items are all two-element tuples `(key, value)`.
    /// Anything else gives `None`. Insertion order is not preserved, because
    /// the result is a `HashMap`.
    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
        if args.len() != 1 {
            return None;
        }

        let items = match self.evaluate_expr(&args[0], depth)? {
            Value::List(items) | Value::Tuple(items) => items,
            _ => return None,
        };

        let mut dict = HashMap::new();
        for item in items {
            // Only tuple pairs are accepted; a pair written as a list is rejected.
            let Value::Tuple(values) = item else {
                return None;
            };
            if values.len() != 2 {
                return None;
            }
            let key = value_to_string(&values[0])?;
            dict.insert(key, values[1].clone());
        }

        Some(Value::Dict(dict))
    }
}
2997
/// Names under which `setup` and its providing modules are visible in a
/// `setup.py`, as gathered by `collect_setup_aliases`.
#[derive(Default)]
struct SetupAliases {
    /// Local names bound to the `setup` function; always includes `setup`.
    setup_names: HashSet<String>,
    /// Maps a local module name (alias or plain name) to the imported module.
    module_aliases: HashMap<String, String>,
}
3003
3004fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
3005 extract_from_setup_py(path).into_iter().collect()
3006}
3007
3008fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
3009 let content = match read_file_to_string(path) {
3010 Ok(content) => content,
3011 Err(e) => {
3012 warn!("Failed to read setup.py at {:?}: {}", path, e);
3013 return Some(default_package_data(path));
3014 }
3015 };
3016
3017 if content.len() > MAX_SETUP_PY_BYTES {
3018 warn!("setup.py too large at {:?}: {} bytes", path, content.len());
3019 let package_data = extract_from_setup_py_regex(&content);
3020 return should_emit_setup_py_package(&package_data).then_some(package_data);
3021 }
3022
3023 let mut package_data = match extract_from_setup_py_ast(&content) {
3024 Ok(Some(data)) => data,
3025 Ok(None) => return Some(default_package_data(path)),
3026 Err(e) => {
3027 warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
3028 extract_from_setup_py_regex(&content)
3029 }
3030 };
3031
3032 if package_data.name.is_none() {
3033 package_data.name = extract_setup_value(&content, "name");
3034 }
3035
3036 if package_data.version.is_none() {
3037 package_data.version = extract_setup_value(&content, "version");
3038 }
3039
3040 fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
3041
3042 if package_data.purl.is_none() {
3043 package_data.purl = build_setup_py_purl(
3044 package_data.name.as_deref(),
3045 package_data.version.as_deref(),
3046 );
3047 }
3048
3049 if should_emit_setup_py_package(&package_data) {
3050 Some(package_data)
3051 } else {
3052 Some(default_package_data(path))
3053 }
3054}
3055
3056fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
3057 package_data.name.is_some()
3058 || package_data.version.is_some()
3059 || package_data.purl.is_some()
3060 || !package_data.dependencies.is_empty()
3061 || package_data.extracted_license_statement.is_some()
3062 || !package_data.license_detections.is_empty()
3063 || !package_data.parties.is_empty()
3064 || package_data.description.is_some()
3065 || package_data.homepage_url.is_some()
3066 || package_data.bug_tracking_url.is_some()
3067 || package_data.code_view_url.is_some()
3068 || package_data.vcs_url.is_some()
3069}
3070
3071fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
3072 if package_data.version.is_some()
3073 && package_data.extracted_license_statement.is_some()
3074 && package_data
3075 .parties
3076 .iter()
3077 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
3078 {
3079 return;
3080 }
3081
3082 let Some(root) = path.parent() else {
3083 return;
3084 };
3085
3086 let dunder_metadata = collect_sibling_dunder_metadata(root, content);
3087
3088 if package_data.version.is_none() {
3089 package_data.version = dunder_metadata.version;
3090 }
3091
3092 if package_data.extracted_license_statement.is_none() {
3093 package_data.extracted_license_statement = dunder_metadata.license;
3094 }
3095
3096 let has_author = package_data
3097 .parties
3098 .iter()
3099 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
3100
3101 if !has_author && let Some(author) = dunder_metadata.author {
3102 package_data.parties.push(Party {
3103 r#type: Some("person".to_string()),
3104 role: Some("author".to_string()),
3105 name: Some(author),
3106 email: None,
3107 url: None,
3108 organization: None,
3109 organization_url: None,
3110 timezone: None,
3111 });
3112 }
3113}
3114
/// Values of module-level dunder assignments found in imported modules.
#[derive(Default)]
struct DunderMetadata {
    /// Value of the first `__version__ = "..."` assignment found.
    version: Option<String>,
    /// Value of the first `__author__ = "..."` assignment found.
    author: Option<String>,
    /// Value of the first `__license__ = "..."` assignment found.
    license: Option<String>,
}
3121
3122fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
3123 let statements = match parse_module(content) {
3124 Ok(parsed) => parsed.into_suite(),
3125 Err(_) => return DunderMetadata::default(),
3126 };
3127
3128 let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3129 let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3130 let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3131 let mut metadata = DunderMetadata::default();
3132
3133 for module in imported_dunder_modules(&statements) {
3134 let Some(path) = resolve_imported_module_path(root, &module) else {
3135 continue;
3136 };
3137 let Ok(module_content) = read_file_to_string(&path) else {
3138 continue;
3139 };
3140
3141 if metadata.version.is_none() {
3142 metadata.version = version_re
3143 .as_ref()
3144 .and_then(|regex| regex.captures(&module_content))
3145 .and_then(|captures| captures.get(1))
3146 .map(|match_| match_.as_str().to_string());
3147 }
3148
3149 if metadata.author.is_none() {
3150 metadata.author = author_re
3151 .as_ref()
3152 .and_then(|regex| regex.captures(&module_content))
3153 .and_then(|captures| captures.get(1))
3154 .map(|match_| match_.as_str().to_string());
3155 }
3156
3157 if metadata.license.is_none() {
3158 metadata.license = license_re
3159 .as_ref()
3160 .and_then(|regex| regex.captures(&module_content))
3161 .and_then(|captures| captures.get(1))
3162 .map(|match_| match_.as_str().to_string());
3163 }
3164
3165 if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
3166 return metadata;
3167 }
3168 }
3169
3170 metadata
3171}
3172
3173fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
3174 let mut modules = Vec::new();
3175
3176 for statement in statements {
3177 let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
3178 continue;
3179 };
3180 let Some(module) = module.as_ref().map(|name| name.as_str()) else {
3181 continue;
3182 };
3183 let imports_dunder = names.iter().any(|alias| {
3184 matches!(
3185 alias.name.as_str(),
3186 "__version__" | "__author__" | "__license__"
3187 )
3188 });
3189 if imports_dunder {
3190 modules.push(module.to_string());
3191 }
3192 }
3193
3194 modules
3195}
3196
/// Resolves a dotted module name to an existing file below `root`.
///
/// The dots are turned into path separators and these candidates are probed
/// in order, returning the first one that exists:
///
/// 1. `<root>/<module>.py`
/// 2. `<root>/<module>/__init__.py`
/// 3. `<root>/src/<module>.py`
/// 4. `<root>/src/<module>/__init__.py`
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let mut relative = PathBuf::new();
    for segment in module.split('.') {
        relative.push(segment);
    }
    let module_file = relative.with_extension("py");

    let search_roots = [root.to_path_buf(), root.join("src")];
    search_roots
        .iter()
        .flat_map(|base| {
            [
                base.join(&module_file),
                base.join(&relative).join("__init__.py"),
            ]
        })
        .find(|candidate| candidate.exists())
}
3208
3209fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
3225 let statements = parse_module(content)
3226 .map(|parsed| parsed.into_suite())
3227 .map_err(|e| e.to_string())?;
3228 let aliases = collect_setup_aliases(&statements);
3229 let mut evaluator = LiteralEvaluator::new(HashMap::new());
3230 build_setup_py_constants(&statements, &mut evaluator);
3231
3232 let setup_call = find_setup_call(&statements, &aliases);
3233 let Some(call_expr) = setup_call else {
3234 return Ok(None);
3235 };
3236
3237 let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3238 Ok(Some(build_setup_py_package_data(&setup_values)))
3239}
3240
3241fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3242 for stmt in statements {
3243 if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3244 if targets.len() != 1 {
3245 continue;
3246 }
3247
3248 let Some(name) = extract_assign_name(&targets[0]) else {
3249 continue;
3250 };
3251
3252 if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3253 evaluator.insert_constant(name, value);
3254 }
3255 }
3256 }
3257}
3258
3259fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3260 match target {
3261 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3262 _ => None,
3263 }
3264}
3265
3266fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3267 let mut aliases = SetupAliases::default();
3268 aliases.setup_names.insert("setup".to_string());
3269
3270 for stmt in statements {
3271 match stmt {
3272 ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3273 for alias in names {
3274 let module_name = alias.name.as_str();
3275 if !is_setup_module(module_name) {
3276 continue;
3277 }
3278 let alias_name = alias
3279 .asname
3280 .as_ref()
3281 .map(|name| name.as_str())
3282 .unwrap_or(module_name);
3283 aliases
3284 .module_aliases
3285 .insert(alias_name.to_string(), module_name.to_string());
3286 }
3287 }
3288 ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3289 let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3290 continue;
3291 };
3292 if !is_setup_module(module_name) {
3293 continue;
3294 }
3295 for alias in names {
3296 if alias.name.as_str() != "setup" {
3297 continue;
3298 }
3299 let alias_name = alias
3300 .asname
3301 .as_ref()
3302 .map(|name| name.as_str())
3303 .unwrap_or("setup");
3304 aliases.setup_names.insert(alias_name.to_string());
3305 }
3306 }
3307 _ => {}
3308 }
3309 }
3310
3311 aliases
3312}
3313
/// Reports whether `module_name` is one of the modules recognised as
/// providing `setup`: `setuptools`, `distutils` or `distutils.core`.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3317
3318fn find_setup_call<'a>(
3319 statements: &'a [ast::Stmt],
3320 aliases: &'a SetupAliases,
3321) -> Option<&'a ast::Expr> {
3322 let mut finder = SetupCallFinder {
3323 aliases,
3324 called_function_names: collect_top_level_called_function_names(statements),
3325 nodes_visited: 0,
3326 };
3327 finder.find_in_statements(statements)
3328}
3329
3330fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3331 let mut called = HashSet::new();
3332 collect_called_function_names_in_statements(statements, &mut called);
3333 called
3334}
3335
3336fn collect_called_function_names_in_statements(
3337 statements: &[ast::Stmt],
3338 called: &mut HashSet<String>,
3339) {
3340 for stmt in statements {
3341 match stmt {
3342 ast::Stmt::Expr(ast::StmtExpr { value, .. })
3343 | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3344 collect_called_function_names_in_expr(value.as_ref(), called);
3345 }
3346 ast::Stmt::If(ast::StmtIf {
3347 body,
3348 elif_else_clauses,
3349 ..
3350 }) => {
3351 collect_called_function_names_in_statements(body, called);
3352 for clause in elif_else_clauses {
3353 collect_called_function_names_in_statements(&clause.body, called);
3354 }
3355 }
3356 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3357 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3358 collect_called_function_names_in_statements(body, called);
3359 collect_called_function_names_in_statements(orelse, called);
3360 }
3361 ast::Stmt::With(ast::StmtWith { body, .. }) => {
3362 collect_called_function_names_in_statements(body, called);
3363 }
3364 ast::Stmt::Try(ast::StmtTry {
3365 body,
3366 orelse,
3367 finalbody,
3368 handlers,
3369 ..
3370 }) => {
3371 collect_called_function_names_in_statements(body, called);
3372 collect_called_function_names_in_statements(orelse, called);
3373 collect_called_function_names_in_statements(finalbody, called);
3374 for handler in handlers {
3375 let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3376 body,
3377 ..
3378 }) = handler;
3379 collect_called_function_names_in_statements(body, called);
3380 }
3381 }
3382 _ => {}
3383 }
3384 }
3385}
3386
3387fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3388 if let ast::Expr::Call(ast::ExprCall {
3389 func, arguments, ..
3390 }) = expr
3391 {
3392 if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3393 called.insert(id.as_str().to_string());
3394 }
3395
3396 for arg in arguments.args.iter() {
3397 collect_called_function_names_in_expr(arg, called);
3398 }
3399 for keyword in arguments.keywords.iter() {
3400 collect_called_function_names_in_expr(&keyword.value, called);
3401 }
3402 }
3403}
3404
3405struct SetupCallFinder<'a> {
3406 aliases: &'a SetupAliases,
3407 called_function_names: HashSet<String>,
3408 nodes_visited: usize,
3409}
3410
3411impl<'a> SetupCallFinder<'a> {
3412 fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3413 for stmt in statements {
3414 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3415 return None;
3416 }
3417 self.nodes_visited += 1;
3418
3419 let found = match stmt {
3420 ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3421 ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3422 ast::Stmt::If(ast::StmtIf {
3423 body,
3424 elif_else_clauses,
3425 ..
3426 }) => self.find_in_statements(body).or_else(|| {
3427 for clause in elif_else_clauses {
3428 if let Some(found) = self.find_in_statements(&clause.body) {
3429 return Some(found);
3430 }
3431 }
3432 None
3433 }),
3434 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3435 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3436 .find_in_statements(body)
3437 .or_else(|| self.find_in_statements(orelse)),
3438 ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3439 .called_function_names
3440 .contains(name.as_str())
3441 .then(|| self.find_in_statements(body))
3442 .flatten(),
3443 ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3444 ast::Stmt::Try(ast::StmtTry {
3445 body,
3446 orelse,
3447 finalbody,
3448 handlers,
3449 ..
3450 }) => self
3451 .find_in_statements(body)
3452 .or_else(|| self.find_in_statements(orelse))
3453 .or_else(|| self.find_in_statements(finalbody))
3454 .or_else(|| {
3455 for handler in handlers {
3456 let ast::ExceptHandler::ExceptHandler(
3457 ast::ExceptHandlerExceptHandler { body, .. },
3458 ) = handler;
3459 if let Some(found) = self.find_in_statements(body) {
3460 return Some(found);
3461 }
3462 }
3463 None
3464 }),
3465 _ => None,
3466 };
3467
3468 if found.is_some() {
3469 return found;
3470 }
3471 }
3472
3473 None
3474 }
3475
3476 fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3477 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3478 return None;
3479 }
3480 self.nodes_visited += 1;
3481
3482 match expr {
3483 ast::Expr::Call(ast::ExprCall { func, .. })
3484 if is_setup_call(func.as_ref(), self.aliases) =>
3485 {
3486 Some(expr)
3487 }
3488 _ => None,
3489 }
3490 }
3491}
3492
3493fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3494 let Some(dotted) = dotted_name(func, 0) else {
3495 return false;
3496 };
3497
3498 if aliases.setup_names.contains(&dotted) {
3499 return true;
3500 }
3501
3502 let Some(module) = dotted.strip_suffix(".setup") else {
3503 return false;
3504 };
3505
3506 let resolved = resolve_module_alias(module, aliases);
3507 is_setup_module(&resolved)
3508}
3509
3510fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3511 if depth >= MAX_SETUP_PY_AST_DEPTH {
3512 return None;
3513 }
3514
3515 match expr {
3516 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3517 ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3518 let base = dotted_name(value.as_ref(), depth + 1)?;
3519 Some(format!("{}.{}", base, attr.as_str()))
3520 }
3521 _ => None,
3522 }
3523}
3524
3525fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3526 if let Some(mapped) = aliases.module_aliases.get(module) {
3527 return mapped.clone();
3528 }
3529
3530 let Some((base, rest)) = module.split_once('.') else {
3531 return module.to_string();
3532 };
3533
3534 if let Some(mapped) = aliases.module_aliases.get(base) {
3535 return format!("{}.{}", mapped, rest);
3536 }
3537
3538 module.to_string()
3539}
3540
3541fn extract_setup_keywords(
3542 call_expr: &ast::Expr,
3543 evaluator: &mut LiteralEvaluator,
3544) -> HashMap<String, Value> {
3545 let mut values = HashMap::new();
3546 let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3547 return values;
3548 };
3549
3550 for keyword in arguments.keywords.iter() {
3551 if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3552 if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3553 values.insert(arg.to_string(), value);
3554 }
3555 } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3556 for (key, value) in dict {
3557 values.insert(key, value);
3558 }
3559 }
3560 }
3561
3562 values
3563}
3564
3565fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3566 let name = get_value_string(values, "name");
3567 let version = get_value_string(values, "version");
3568 let description =
3569 get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
3570 let homepage_url =
3571 get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
3572 let author = get_value_string(values, "author");
3573 let author_email = get_value_string(values, "author_email");
3574 let maintainer = get_value_string(values, "maintainer");
3575 let maintainer_email = get_value_string(values, "maintainer_email");
3576 let license = get_value_string(values, "license");
3577 let classifiers = values
3578 .get("classifiers")
3579 .and_then(value_to_string_list)
3580 .unwrap_or_default();
3581
3582 let mut parties = Vec::new();
3583 if author.is_some() || author_email.is_some() {
3584 parties.push(Party {
3585 r#type: Some("person".to_string()),
3586 role: Some("author".to_string()),
3587 name: author,
3588 email: author_email,
3589 url: None,
3590 organization: None,
3591 organization_url: None,
3592 timezone: None,
3593 });
3594 }
3595
3596 if maintainer.is_some() || maintainer_email.is_some() {
3597 parties.push(Party {
3598 r#type: Some("person".to_string()),
3599 role: Some("maintainer".to_string()),
3600 name: maintainer,
3601 email: maintainer_email,
3602 url: None,
3603 organization: None,
3604 organization_url: None,
3605 timezone: None,
3606 });
3607 }
3608
3609 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3610 normalize_spdx_declared_license(license.as_deref());
3611 let extracted_license_statement = license.clone();
3612
3613 let dependencies = build_setup_py_dependencies(values);
3614 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3615 let mut homepage_from_project_urls = None;
3616 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3617 let mut extra_data = HashMap::new();
3618
3619 if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3620 apply_project_url_mappings(
3621 &parsed_project_urls,
3622 &mut homepage_from_project_urls,
3623 &mut bug_tracking_url,
3624 &mut code_view_url,
3625 &mut vcs_url,
3626 &mut extra_data,
3627 );
3628 }
3629
3630 let extra_data = if extra_data.is_empty() {
3631 None
3632 } else {
3633 Some(extra_data)
3634 };
3635
3636 PackageData {
3637 package_type: Some(PythonParser::PACKAGE_TYPE),
3638 namespace: None,
3639 name,
3640 version,
3641 qualifiers: None,
3642 subpath: None,
3643 primary_language: Some("Python".to_string()),
3644 description,
3645 release_date: None,
3646 parties,
3647 keywords: Vec::new(),
3648 homepage_url: homepage_url.or(homepage_from_project_urls),
3649 download_url: None,
3650 size: None,
3651 sha1: None,
3652 md5: None,
3653 sha256: None,
3654 sha512: None,
3655 bug_tracking_url,
3656 code_view_url,
3657 vcs_url,
3658 copyright: None,
3659 holder: None,
3660 declared_license_expression,
3661 declared_license_expression_spdx,
3662 license_detections,
3663 other_license_expression: None,
3664 other_license_expression_spdx: None,
3665 other_license_detections: Vec::new(),
3666 extracted_license_statement,
3667 notice_text: None,
3668 source_packages: Vec::new(),
3669 file_references: Vec::new(),
3670 is_private: has_private_classifier(&classifiers),
3671 is_virtual: false,
3672 extra_data,
3673 dependencies,
3674 repository_homepage_url: None,
3675 repository_download_url: None,
3676 api_data_url: None,
3677 datasource_id: Some(DatasourceId::PypiSetupPy),
3678 purl,
3679 }
3680}
3681
3682fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3683 let mut dependencies = Vec::new();
3684
3685 if let Some(reqs) = values
3686 .get("install_requires")
3687 .and_then(value_to_string_list)
3688 {
3689 dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3690 }
3691
3692 if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3693 dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3694 }
3695
3696 if let Some(Value::Dict(extras)) = values.get("extras_require") {
3697 let mut extra_items: Vec<_> = extras.iter().collect();
3698 extra_items.sort_by_key(|(name, _)| *name);
3699 for (extra_name, extra_value) in extra_items {
3700 if let Some(reqs) = value_to_string_list(extra_value) {
3701 dependencies.extend(build_setup_py_dependency_list(
3702 reqs.as_slice(),
3703 extra_name,
3704 true,
3705 ));
3706 }
3707 }
3708 }
3709
3710 dependencies
3711}
3712
3713fn build_setup_py_dependency_list(
3714 reqs: &[String],
3715 scope: &str,
3716 is_optional: bool,
3717) -> Vec<Dependency> {
3718 reqs.iter()
3719 .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3720 .collect()
3721}
3722
3723fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3724 values.get(key).and_then(value_to_string)
3725}
3726
3727fn value_to_string(value: &Value) -> Option<String> {
3728 match value {
3729 Value::String(value) => Some(value.clone()),
3730 Value::Number(value) => Some(value.to_string()),
3731 Value::Bool(value) => Some(value.to_string()),
3732 _ => None,
3733 }
3734}
3735
3736fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3737 match value {
3738 Value::String(value) => Some(vec![value.clone()]),
3739 Value::List(values) | Value::Tuple(values) => {
3740 let mut items = Vec::new();
3741 for item in values {
3742 items.push(value_to_string(item)?);
3743 }
3744 Some(items)
3745 }
3746 _ => None,
3747 }
3748}
3749
3750fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3751 let Value::Dict(dict) = value else {
3752 return None;
3753 };
3754
3755 let mut pairs: Vec<(String, String)> = dict
3756 .iter()
3757 .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3758 .collect::<Option<Vec<_>>>()?;
3759 pairs.sort_by(|left, right| left.0.cmp(&right.0));
3760 Some(pairs)
3761}
3762
3763fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3764 let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3765 extract_requires_dist_dependencies(&requires_dist)
3766}
3767
3768pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3769 requires_dist
3770 .iter()
3771 .filter_map(|entry| build_rfc822_dependency(entry))
3772 .collect()
3773}
3774
3775fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3776 build_python_dependency(entry, "install", false, None)
3777}
3778
3779fn build_python_dependency(
3780 entry: &str,
3781 default_scope: &str,
3782 default_optional: bool,
3783 marker_override: Option<&str>,
3784) -> Option<Dependency> {
3785 let (requirement_part, marker_part) = entry
3786 .split_once(';')
3787 .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3788 .unwrap_or((entry.trim(), None));
3789
3790 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3791 let requirement = normalize_rfc822_requirement(requirement_part);
3792 let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3793 marker_part.or(marker_override),
3794 default_scope,
3795 default_optional,
3796 );
3797 let purl = build_python_dependency_purl(&name, None)?;
3798
3799 let is_pinned = requirement
3800 .as_deref()
3801 .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3802 let purl = if is_pinned {
3803 requirement
3804 .as_deref()
3805 .map(|req| req.trim_start_matches('='))
3806 .and_then(|version| build_python_dependency_purl(&name, Some(version)))
3807 .unwrap_or(purl)
3808 } else {
3809 purl
3810 };
3811
3812 let mut extra_data = HashMap::new();
3813 extra_data.extend(marker_data);
3814 if let Some(marker) = marker {
3815 extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3816 }
3817
3818 Some(Dependency {
3819 purl: Some(purl),
3820 extracted_requirement: requirement,
3821 scope: Some(scope),
3822 is_runtime: Some(true),
3823 is_optional: Some(is_optional),
3824 is_pinned: Some(is_pinned),
3825 is_direct: Some(true),
3826 resolved_package: None,
3827 extra_data: if extra_data.is_empty() {
3828 None
3829 } else {
3830 Some(extra_data)
3831 },
3832 })
3833}
3834
3835fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3836 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3837 let trimmed = requirement_part.trim();
3838 let mut remainder = trimmed[name.len()..].trim();
3839
3840 if let Some(stripped) = remainder.strip_prefix('[')
3841 && let Some(end_idx) = stripped.find(']')
3842 {
3843 remainder = stripped[end_idx + 1..].trim();
3844 }
3845
3846 let remainder = remainder
3847 .strip_prefix('(')
3848 .and_then(|value| value.strip_suffix(')'))
3849 .unwrap_or(remainder)
3850 .trim();
3851
3852 if remainder.is_empty() {
3853 return None;
3854 }
3855
3856 let mut specifiers: Vec<String> = remainder
3857 .split(',')
3858 .map(|specifier| specifier.trim().replace(' ', ""))
3859 .filter(|specifier| !specifier.is_empty())
3860 .collect();
3861 specifiers.sort();
3862 Some(specifiers.join(","))
3863}
3864
/// Percent-encodes every `*` in a version string as `%2A`; all other
/// characters are copied unchanged.
fn encode_python_dependency_purl_version(version: &str) -> String {
    let mut encoded = String::with_capacity(version.len());
    for ch in version.chars() {
        if ch == '*' {
            encoded.push_str("%2A");
        } else {
            encoded.push(ch);
        }
    }
    encoded
}
3868
3869fn build_python_dependency_purl(name: &str, version: Option<&str>) -> Option<String> {
3870 let normalized_name = normalize_python_dependency_name(name);
3871
3872 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &normalized_name)
3873 .ok()
3874 .map(|_| match version {
3875 Some(version) => {
3876 format!(
3877 "pkg:pypi/{normalized_name}@{}",
3878 encode_python_dependency_purl_version(version)
3879 )
3880 }
3881 None => format!("pkg:pypi/{normalized_name}"),
3882 })
3883}
3884
/// Normalizes a package name for purl use: surrounding whitespace is trimmed,
/// ASCII letters are lowercased and every `_` becomes `-`.
fn normalize_python_dependency_name(name: &str) -> String {
    name.trim()
        .chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
3888
3889fn parse_rfc822_marker(
3890 marker_part: Option<&str>,
3891 default_scope: &str,
3892 default_optional: bool,
3893) -> (
3894 String,
3895 bool,
3896 Option<String>,
3897 HashMap<String, serde_json::Value>,
3898) {
3899 let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
3900 return (
3901 default_scope.to_string(),
3902 default_optional,
3903 None,
3904 HashMap::new(),
3905 );
3906 };
3907
3908 let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
3909 .expect("extra marker regex should compile");
3910 let mut extra_data = HashMap::new();
3911
3912 if let Some(python_version) = extract_marker_field(marker, "python_version") {
3913 extra_data.insert(
3914 "python_version".to_string(),
3915 serde_json::Value::String(python_version),
3916 );
3917 }
3918 if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
3919 extra_data.insert(
3920 "sys_platform".to_string(),
3921 serde_json::Value::String(sys_platform),
3922 );
3923 }
3924
3925 if let Some(captures) = extra_re.captures(marker)
3926 && let Some(scope) = captures.get(1)
3927 {
3928 return (
3929 scope.as_str().to_string(),
3930 true,
3931 Some(marker.trim().to_string()),
3932 extra_data,
3933 );
3934 }
3935
3936 (
3937 default_scope.to_string(),
3938 default_optional,
3939 Some(marker.trim().to_string()),
3940 extra_data,
3941 )
3942}
3943
3944fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
3945 let re = Regex::new(&format!(
3946 r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
3947 field
3948 ))
3949 .ok()?;
3950 let captures = re.captures(marker)?;
3951 let operator = captures.get(1)?.as_str();
3952 let value = captures.get(2)?.as_str();
3953 Some(format!("{} {}", operator, value))
3954}
3955
3956fn parse_requires_txt(content: &str) -> Vec<Dependency> {
3957 let mut dependencies = Vec::new();
3958 let mut current_scope = "install".to_string();
3959 let mut current_optional = false;
3960 let mut current_marker: Option<String> = None;
3961
3962 for line in content.lines() {
3963 let trimmed = line.trim();
3964 if trimmed.is_empty() || trimmed.starts_with('#') {
3965 continue;
3966 }
3967
3968 if trimmed.starts_with('[') && trimmed.ends_with(']') {
3969 let inner = &trimmed[1..trimmed.len() - 1];
3970 if let Some(rest) = inner.strip_prefix(':') {
3971 current_scope = "install".to_string();
3972 current_optional = false;
3973 current_marker = Some(rest.trim().to_string());
3974 } else if let Some((scope, marker)) = inner.split_once(':') {
3975 current_scope = scope.trim().to_string();
3976 current_optional = true;
3977 current_marker = Some(marker.trim().to_string());
3978 } else {
3979 current_scope = inner.trim().to_string();
3980 current_optional = true;
3981 current_marker = None;
3982 }
3983 continue;
3984 }
3985
3986 if let Some(dependency) = build_python_dependency(
3987 trimmed,
3988 ¤t_scope,
3989 current_optional,
3990 current_marker.as_deref(),
3991 ) {
3992 dependencies.push(dependency);
3993 }
3994 }
3995
3996 dependencies
3997}
3998
/// Reports whether any classifier equals `Private :: Do Not Upload`, compared
/// without regard to ASCII case.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
4004
4005fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
4006 let name = name?;
4007 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
4008 if let Some(version) = version {
4009 package_url.with_version(version).ok()?;
4010 }
4011 Some(package_url.to_string())
4012}
4013
4014fn extract_from_setup_py_regex(content: &str) -> PackageData {
4015 let name = extract_setup_value(content, "name");
4016 let version = extract_setup_value(content, "version");
4017 let license_expression = extract_setup_value(content, "license");
4018
4019 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4020 normalize_spdx_declared_license(license_expression.as_deref());
4021 let extracted_license_statement = license_expression.clone();
4022
4023 let dependencies = extract_setup_py_dependencies(content);
4024 let homepage_url = extract_setup_value(content, "url");
4025 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
4026
4027 PackageData {
4028 package_type: Some(PythonParser::PACKAGE_TYPE),
4029 namespace: None,
4030 name,
4031 version,
4032 qualifiers: None,
4033 subpath: None,
4034 primary_language: Some("Python".to_string()),
4035 description: None,
4036 release_date: None,
4037 parties: Vec::new(),
4038 keywords: Vec::new(),
4039 homepage_url,
4040 download_url: None,
4041 size: None,
4042 sha1: None,
4043 md5: None,
4044 sha256: None,
4045 sha512: None,
4046 bug_tracking_url: None,
4047 code_view_url: None,
4048 vcs_url: None,
4049 copyright: None,
4050 holder: None,
4051 declared_license_expression,
4052 declared_license_expression_spdx,
4053 license_detections,
4054 other_license_expression: None,
4055 other_license_expression_spdx: None,
4056 other_license_detections: Vec::new(),
4057 extracted_license_statement,
4058 notice_text: None,
4059 source_packages: Vec::new(),
4060 file_references: Vec::new(),
4061 is_private: false,
4062 is_virtual: false,
4063 extra_data: None,
4064 dependencies,
4065 repository_homepage_url: None,
4066 repository_download_url: None,
4067 api_data_url: None,
4068 datasource_id: Some(DatasourceId::PypiSetupPy),
4069 purl,
4070 }
4071}
4072
4073fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
4074 crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
4075}
4076
4077fn extract_from_pypi_json(path: &Path) -> PackageData {
4078 let default = PackageData {
4079 package_type: Some(PythonParser::PACKAGE_TYPE),
4080 datasource_id: Some(DatasourceId::PypiJson),
4081 ..Default::default()
4082 };
4083
4084 let content = match read_file_to_string(path) {
4085 Ok(content) => content,
4086 Err(error) => {
4087 warn!("Failed to read pypi.json at {:?}: {}", path, error);
4088 return default;
4089 }
4090 };
4091
4092 let root: serde_json::Value = match serde_json::from_str(&content) {
4093 Ok(value) => value,
4094 Err(error) => {
4095 warn!("Failed to parse pypi.json at {:?}: {}", path, error);
4096 return default;
4097 }
4098 };
4099
4100 let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
4101 warn!("No info object found in pypi.json at {:?}", path);
4102 return default;
4103 };
4104
4105 let name = info
4106 .get("name")
4107 .and_then(|value| value.as_str())
4108 .map(ToOwned::to_owned);
4109 let version = info
4110 .get("version")
4111 .and_then(|value| value.as_str())
4112 .map(ToOwned::to_owned);
4113 let summary = info
4114 .get("summary")
4115 .and_then(|value| value.as_str())
4116 .map(ToOwned::to_owned);
4117 let description = info
4118 .get("description")
4119 .and_then(|value| value.as_str())
4120 .filter(|value| !value.trim().is_empty())
4121 .map(ToOwned::to_owned)
4122 .or(summary);
4123 let mut homepage_url = info
4124 .get("home_page")
4125 .and_then(|value| value.as_str())
4126 .map(ToOwned::to_owned);
4127 let author = info
4128 .get("author")
4129 .and_then(|value| value.as_str())
4130 .filter(|value| !value.trim().is_empty())
4131 .map(ToOwned::to_owned);
4132 let author_email = info
4133 .get("author_email")
4134 .and_then(|value| value.as_str())
4135 .filter(|value| !value.trim().is_empty())
4136 .map(ToOwned::to_owned);
4137 let license = info
4138 .get("license")
4139 .and_then(|value| value.as_str())
4140 .filter(|value| !value.trim().is_empty())
4141 .map(ToOwned::to_owned);
4142 let keywords = parse_setup_cfg_keywords(
4143 info.get("keywords")
4144 .and_then(|value| value.as_str())
4145 .map(ToOwned::to_owned),
4146 );
4147 let classifiers = info
4148 .get("classifiers")
4149 .and_then(|value| value.as_array())
4150 .map(|values| {
4151 values
4152 .iter()
4153 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
4154 .collect::<Vec<_>>()
4155 })
4156 .unwrap_or_default();
4157
4158 let mut parties = Vec::new();
4159 if author.is_some() || author_email.is_some() {
4160 parties.push(Party {
4161 r#type: Some("person".to_string()),
4162 role: Some("author".to_string()),
4163 name: author,
4164 email: author_email,
4165 url: None,
4166 organization: None,
4167 organization_url: None,
4168 timezone: None,
4169 });
4170 }
4171
4172 let mut bug_tracking_url = None;
4173 let mut code_view_url = None;
4174 let mut vcs_url = None;
4175 let mut extra_data = HashMap::new();
4176
4177 let parsed_project_urls = info
4178 .get("project_urls")
4179 .and_then(|value| value.as_object())
4180 .map(|map| {
4181 let mut pairs: Vec<(String, String)> = map
4182 .iter()
4183 .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
4184 .collect();
4185 pairs.sort_by(|left, right| left.0.cmp(&right.0));
4186 pairs
4187 })
4188 .unwrap_or_default();
4189
4190 apply_project_url_mappings(
4191 &parsed_project_urls,
4192 &mut homepage_url,
4193 &mut bug_tracking_url,
4194 &mut code_view_url,
4195 &mut vcs_url,
4196 &mut extra_data,
4197 );
4198
4199 let (download_url, size, sha256) = root
4200 .get("urls")
4201 .and_then(|value| value.as_array())
4202 .map(|urls| select_pypi_json_artifact(urls))
4203 .unwrap_or((None, None, None));
4204
4205 let sha256 = sha256.and_then(|h| Sha256Digest::from_hex(&h).ok());
4206
4207 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4208 normalize_spdx_declared_license(license.as_deref());
4209 let dependencies = info
4210 .get("requires_dist")
4211 .and_then(|value| value.as_array())
4212 .map(|entries| {
4213 entries
4214 .iter()
4215 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4216 .collect::<Vec<_>>()
4217 })
4218 .map(|entries| extract_requires_dist_dependencies(&entries))
4219 .unwrap_or_default();
4220
4221 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
4222 build_pypi_urls(name.as_deref(), version.as_deref());
4223
4224 PackageData {
4225 package_type: Some(PythonParser::PACKAGE_TYPE),
4226 namespace: None,
4227 name,
4228 version,
4229 qualifiers: None,
4230 subpath: None,
4231 primary_language: None,
4232 description,
4233 release_date: None,
4234 parties,
4235 keywords,
4236 homepage_url: homepage_url.or(repository_homepage_url.clone()),
4237 download_url,
4238 size,
4239 sha1: None,
4240 md5: None,
4241 sha256,
4242 sha512: None,
4243 bug_tracking_url,
4244 code_view_url,
4245 vcs_url,
4246 copyright: None,
4247 holder: None,
4248 declared_license_expression,
4249 declared_license_expression_spdx,
4250 license_detections,
4251 other_license_expression: None,
4252 other_license_expression_spdx: None,
4253 other_license_detections: Vec::new(),
4254 extracted_license_statement: license,
4255 notice_text: None,
4256 source_packages: Vec::new(),
4257 file_references: Vec::new(),
4258 is_private: has_private_classifier(&classifiers),
4259 is_virtual: false,
4260 extra_data: if extra_data.is_empty() {
4261 None
4262 } else {
4263 Some(extra_data)
4264 },
4265 dependencies,
4266 repository_homepage_url,
4267 repository_download_url,
4268 api_data_url,
4269 datasource_id: Some(DatasourceId::PypiJson),
4270 purl,
4271 }
4272}
4273
4274fn select_pypi_json_artifact(
4275 urls: &[serde_json::Value],
4276) -> (Option<String>, Option<u64>, Option<String>) {
4277 let selected = urls
4278 .iter()
4279 .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4280 .or_else(|| urls.first());
4281
4282 let Some(entry) = selected else {
4283 return (None, None, None);
4284 };
4285
4286 let download_url = entry
4287 .get("url")
4288 .and_then(|value| value.as_str())
4289 .map(ToOwned::to_owned);
4290 let size = entry.get("size").and_then(|value| value.as_u64());
4291 let sha256 = entry
4292 .get("digests")
4293 .and_then(|value| value.as_object())
4294 .and_then(|digests| digests.get("sha256"))
4295 .and_then(|value| value.as_str())
4296 .map(ToOwned::to_owned);
4297
4298 (download_url, size, sha256)
4299}
4300
4301fn extract_from_pip_inspect(path: &Path) -> PackageData {
4302 let content = match read_file_to_string(path) {
4303 Ok(content) => content,
4304 Err(e) => {
4305 warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4306 return default_package_data(path);
4307 }
4308 };
4309
4310 let root: serde_json::Value = match serde_json::from_str(&content) {
4311 Ok(value) => value,
4312 Err(e) => {
4313 warn!(
4314 "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4315 path, e
4316 );
4317 return default_package_data(path);
4318 }
4319 };
4320
4321 let installed = match root.get("installed").and_then(|v| v.as_array()) {
4322 Some(arr) => arr,
4323 None => {
4324 warn!(
4325 "No 'installed' array found in pip-inspect.deplock at {:?}",
4326 path
4327 );
4328 return default_package_data(path);
4329 }
4330 };
4331
4332 let pip_version = root
4333 .get("pip_version")
4334 .and_then(|v| v.as_str())
4335 .map(String::from);
4336 let inspect_version = root
4337 .get("version")
4338 .and_then(|v| v.as_str())
4339 .map(String::from);
4340
4341 let mut main_package: Option<PackageData> = None;
4342 let mut dependencies: Vec<Dependency> = Vec::new();
4343
4344 for package_entry in installed {
4345 let metadata = match package_entry.get("metadata") {
4346 Some(m) => m,
4347 None => continue,
4348 };
4349
4350 let is_requested = package_entry
4351 .get("requested")
4352 .and_then(|v| v.as_bool())
4353 .unwrap_or(false);
4354 let has_direct_url = package_entry.get("direct_url").is_some();
4355
4356 let name = metadata
4357 .get("name")
4358 .and_then(|v| v.as_str())
4359 .map(String::from);
4360 let version = metadata
4361 .get("version")
4362 .and_then(|v| v.as_str())
4363 .map(String::from);
4364 let summary = metadata
4365 .get("summary")
4366 .and_then(|v| v.as_str())
4367 .map(String::from);
4368 let home_page = metadata
4369 .get("home_page")
4370 .and_then(|v| v.as_str())
4371 .map(String::from);
4372 let author = metadata
4373 .get("author")
4374 .and_then(|v| v.as_str())
4375 .map(String::from);
4376 let author_email = metadata
4377 .get("author_email")
4378 .and_then(|v| v.as_str())
4379 .map(String::from);
4380 let license = metadata
4381 .get("license")
4382 .and_then(|v| v.as_str())
4383 .map(String::from);
4384 let description = metadata
4385 .get("description")
4386 .and_then(|v| v.as_str())
4387 .map(String::from);
4388 let keywords = metadata
4389 .get("keywords")
4390 .and_then(|v| v.as_array())
4391 .map(|arr| {
4392 arr.iter()
4393 .filter_map(|k| k.as_str().map(String::from))
4394 .collect::<Vec<_>>()
4395 })
4396 .unwrap_or_default();
4397
4398 let mut parties = Vec::new();
4399 if author.is_some() || author_email.is_some() {
4400 parties.push(Party {
4401 r#type: Some("person".to_string()),
4402 role: Some("author".to_string()),
4403 name: author,
4404 email: author_email,
4405 url: None,
4406 organization: None,
4407 organization_url: None,
4408 timezone: None,
4409 });
4410 }
4411
4412 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4413 normalize_spdx_declared_license(license.as_deref());
4414 let extracted_license_statement = license.clone();
4415 let requires_dist = metadata
4416 .get("requires_dist")
4417 .and_then(|v| v.as_array())
4418 .map(|entries| {
4419 entries
4420 .iter()
4421 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4422 .collect::<Vec<_>>()
4423 })
4424 .unwrap_or_default();
4425 let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4426
4427 let purl = name.as_ref().and_then(|n| {
4428 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4429 if let Some(v) = &version {
4430 package_url.with_version(v).ok()?;
4431 }
4432 Some(package_url.to_string())
4433 });
4434
4435 if is_requested && has_direct_url {
4436 let mut extra_data = HashMap::new();
4437 if let Some(pv) = &pip_version {
4438 extra_data.insert(
4439 "pip_version".to_string(),
4440 serde_json::Value::String(pv.clone()),
4441 );
4442 }
4443 if let Some(iv) = &inspect_version {
4444 extra_data.insert(
4445 "inspect_version".to_string(),
4446 serde_json::Value::String(iv.clone()),
4447 );
4448 }
4449
4450 main_package = Some(PackageData {
4451 package_type: Some(PythonParser::PACKAGE_TYPE),
4452 namespace: None,
4453 name,
4454 version,
4455 qualifiers: None,
4456 subpath: None,
4457 primary_language: Some("Python".to_string()),
4458 description: description.or(summary),
4459 release_date: None,
4460 parties,
4461 keywords,
4462 homepage_url: home_page,
4463 download_url: None,
4464 size: None,
4465 sha1: None,
4466 md5: None,
4467 sha256: None,
4468 sha512: None,
4469 bug_tracking_url: None,
4470 code_view_url: None,
4471 vcs_url: None,
4472 copyright: None,
4473 holder: None,
4474 declared_license_expression,
4475 declared_license_expression_spdx,
4476 license_detections,
4477 other_license_expression: None,
4478 other_license_expression_spdx: None,
4479 other_license_detections: Vec::new(),
4480 extracted_license_statement,
4481 notice_text: None,
4482 source_packages: Vec::new(),
4483 file_references: Vec::new(),
4484 is_private: false,
4485 is_virtual: true,
4486 extra_data: if extra_data.is_empty() {
4487 None
4488 } else {
4489 Some(extra_data)
4490 },
4491 dependencies: parsed_dependencies,
4492 repository_homepage_url: None,
4493 repository_download_url: None,
4494 api_data_url: None,
4495 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4496 purl,
4497 });
4498 } else {
4499 let resolved_package = PackageData {
4500 package_type: Some(PythonParser::PACKAGE_TYPE),
4501 namespace: None,
4502 name: name.clone(),
4503 version: version.clone(),
4504 qualifiers: None,
4505 subpath: None,
4506 primary_language: Some("Python".to_string()),
4507 description: description.or(summary),
4508 release_date: None,
4509 parties,
4510 keywords,
4511 homepage_url: home_page,
4512 download_url: None,
4513 size: None,
4514 sha1: None,
4515 md5: None,
4516 sha256: None,
4517 sha512: None,
4518 bug_tracking_url: None,
4519 code_view_url: None,
4520 vcs_url: None,
4521 copyright: None,
4522 holder: None,
4523 declared_license_expression,
4524 declared_license_expression_spdx,
4525 license_detections,
4526 other_license_expression: None,
4527 other_license_expression_spdx: None,
4528 other_license_detections: Vec::new(),
4529 extracted_license_statement,
4530 notice_text: None,
4531 source_packages: Vec::new(),
4532 file_references: Vec::new(),
4533 is_private: false,
4534 is_virtual: true,
4535 extra_data: None,
4536 dependencies: parsed_dependencies,
4537 repository_homepage_url: None,
4538 repository_download_url: None,
4539 api_data_url: None,
4540 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4541 purl: purl.clone(),
4542 };
4543
4544 let resolved = package_data_to_resolved(&resolved_package);
4545 dependencies.push(Dependency {
4546 purl,
4547 extracted_requirement: None,
4548 scope: None,
4549 is_runtime: Some(true),
4550 is_optional: Some(false),
4551 is_pinned: Some(true),
4552 is_direct: Some(is_requested),
4553 resolved_package: Some(Box::new(resolved)),
4554 extra_data: None,
4555 });
4556 }
4557 }
4558
4559 if let Some(mut main_pkg) = main_package {
4560 let direct_requirement_purls: HashSet<String> = main_pkg
4561 .dependencies
4562 .iter()
4563 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4564 .collect();
4565
4566 let resolved_requirement_purls: HashSet<String> = dependencies
4567 .iter()
4568 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4569 .collect();
4570
4571 let unresolved_dependencies = main_pkg
4572 .dependencies
4573 .iter()
4574 .filter(|dep| {
4575 dep.purl.as_ref().is_some_and(|purl| {
4576 !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4577 })
4578 })
4579 .cloned()
4580 .collect::<Vec<_>>();
4581
4582 for dependency in &mut dependencies {
4583 if dependency
4584 .purl
4585 .as_ref()
4586 .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4587 {
4588 dependency.is_direct = Some(true);
4589 }
4590 }
4591
4592 main_pkg.dependencies = dependencies;
4593 main_pkg.dependencies.extend(unresolved_dependencies);
4594 main_pkg
4595 } else {
4596 default_package_data(path)
4597 }
4598}
4599
/// Returns the part of a package URL that precedes its version.
///
/// Everything from the first `@` onwards is discarded; a string without an
/// `@` is returned unchanged.
fn base_dependency_purl(purl: &str) -> String {
    match purl.find('@') {
        Some(at) => purl[..at].to_owned(),
        None => purl.to_owned(),
    }
}
4605
/// Parsed INI content: section name -> option name -> value lines of that option.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4607
4608fn extract_from_setup_cfg(path: &Path) -> PackageData {
4609 let content = match read_file_to_string(path) {
4610 Ok(content) => content,
4611 Err(e) => {
4612 warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4613 return default_package_data(path);
4614 }
4615 };
4616
4617 let sections = parse_setup_cfg(&content);
4618 let name = get_ini_value(§ions, "metadata", "name");
4619 let version = get_ini_value(§ions, "metadata", "version");
4620 let description = get_ini_value(§ions, "metadata", "description");
4621 let author = get_ini_value(§ions, "metadata", "author");
4622 let author_email = get_ini_value(§ions, "metadata", "author_email");
4623 let maintainer = get_ini_value(§ions, "metadata", "maintainer");
4624 let maintainer_email = get_ini_value(§ions, "metadata", "maintainer_email");
4625 let license = get_ini_value(§ions, "metadata", "license");
4626 let mut homepage_url = get_ini_value(§ions, "metadata", "url");
4627 let classifiers = get_ini_values(§ions, "metadata", "classifiers");
4628 let keywords = parse_setup_cfg_keywords(get_ini_value(§ions, "metadata", "keywords"));
4629 let python_requires = get_ini_value(§ions, "options", "python_requires");
4630 let parsed_project_urls =
4631 parse_setup_cfg_project_urls(&get_ini_values(§ions, "metadata", "project_urls"));
4632 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4633 let mut extra_data = HashMap::new();
4634
4635 let mut parties = Vec::new();
4636 if author.is_some() || author_email.is_some() {
4637 parties.push(Party {
4638 r#type: Some("person".to_string()),
4639 role: Some("author".to_string()),
4640 name: author,
4641 email: author_email,
4642 url: None,
4643 organization: None,
4644 organization_url: None,
4645 timezone: None,
4646 });
4647 }
4648
4649 if maintainer.is_some() || maintainer_email.is_some() {
4650 parties.push(Party {
4651 r#type: Some("person".to_string()),
4652 role: Some("maintainer".to_string()),
4653 name: maintainer,
4654 email: maintainer_email,
4655 url: None,
4656 organization: None,
4657 organization_url: None,
4658 timezone: None,
4659 });
4660 }
4661
4662 let declared_license_expression = None;
4663 let declared_license_expression_spdx = None;
4664 let license_detections = Vec::new();
4665 let extracted_license_statement = license.clone();
4666
4667 let dependencies = extract_setup_cfg_dependencies(§ions);
4668
4669 if let Some(value) = python_requires {
4670 extra_data.insert(
4671 "python_requires".to_string(),
4672 serde_json::Value::String(value),
4673 );
4674 }
4675
4676 apply_project_url_mappings(
4677 &parsed_project_urls,
4678 &mut homepage_url,
4679 &mut bug_tracking_url,
4680 &mut code_view_url,
4681 &mut vcs_url,
4682 &mut extra_data,
4683 );
4684
4685 let extra_data = if extra_data.is_empty() {
4686 None
4687 } else {
4688 Some(extra_data)
4689 };
4690
4691 let purl = name.as_ref().and_then(|n| {
4692 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4693 if let Some(v) = &version {
4694 package_url.with_version(v).ok()?;
4695 }
4696 Some(package_url.to_string())
4697 });
4698
4699 PackageData {
4700 package_type: Some(PythonParser::PACKAGE_TYPE),
4701 namespace: None,
4702 name,
4703 version,
4704 qualifiers: None,
4705 subpath: None,
4706 primary_language: Some("Python".to_string()),
4707 description,
4708 release_date: None,
4709 parties,
4710 keywords,
4711 homepage_url,
4712 download_url: None,
4713 size: None,
4714 sha1: None,
4715 md5: None,
4716 sha256: None,
4717 sha512: None,
4718 bug_tracking_url,
4719 code_view_url,
4720 vcs_url,
4721 copyright: None,
4722 holder: None,
4723 declared_license_expression,
4724 declared_license_expression_spdx,
4725 license_detections,
4726 other_license_expression: None,
4727 other_license_expression_spdx: None,
4728 other_license_detections: Vec::new(),
4729 extracted_license_statement,
4730 notice_text: None,
4731 source_packages: Vec::new(),
4732 file_references: Vec::new(),
4733 is_private: has_private_classifier(&classifiers),
4734 is_virtual: false,
4735 extra_data,
4736 dependencies,
4737 repository_homepage_url: None,
4738 repository_download_url: None,
4739 api_data_url: None,
4740 datasource_id: Some(DatasourceId::PypiSetupCfg),
4741 purl,
4742 }
4743}
4744
/// Splits a comma-separated `keywords` value into individual keywords.
///
/// Each piece is trimmed and empty pieces are dropped. `None` yields an
/// empty list.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut keywords = Vec::new();
    if let Some(raw) = value {
        for piece in raw.split(',') {
            let piece = piece.trim();
            if !piece.is_empty() {
                keywords.push(piece.to_string());
            }
        }
    }
    keywords
}
4757
/// Parses `label = url` lines into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries
/// without `=`, or with an empty label or URL, are skipped.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();
    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };
        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_owned(), url.to_owned()));
        }
    }
    pairs
}
4773
4774fn apply_project_url_mappings(
4775 parsed_urls: &[(String, String)],
4776 homepage_url: &mut Option<String>,
4777 bug_tracking_url: &mut Option<String>,
4778 code_view_url: &mut Option<String>,
4779 vcs_url: &mut Option<String>,
4780 extra_data: &mut HashMap<String, serde_json::Value>,
4781) {
4782 for (label, url) in parsed_urls {
4783 let label_lower = label.to_lowercase();
4784
4785 if bug_tracking_url.is_none()
4786 && matches!(
4787 label_lower.as_str(),
4788 "tracker"
4789 | "bug reports"
4790 | "bug tracker"
4791 | "issues"
4792 | "issue tracker"
4793 | "github: issues"
4794 )
4795 {
4796 *bug_tracking_url = Some(url.clone());
4797 } else if code_view_url.is_none()
4798 && matches!(label_lower.as_str(), "source" | "source code" | "code")
4799 {
4800 *code_view_url = Some(url.clone());
4801 } else if vcs_url.is_none()
4802 && matches!(
4803 label_lower.as_str(),
4804 "github" | "gitlab" | "github: repo" | "repository"
4805 )
4806 {
4807 *vcs_url = Some(url.clone());
4808 } else if homepage_url.is_none()
4809 && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4810 {
4811 *homepage_url = Some(url.clone());
4812 } else if label_lower == "changelog" {
4813 extra_data.insert(
4814 "changelog_url".to_string(),
4815 serde_json::Value::String(url.clone()),
4816 );
4817 }
4818 }
4819
4820 let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4821 .iter()
4822 .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4823 .collect();
4824
4825 if !project_urls_json.is_empty() {
4826 extra_data.insert(
4827 "project_urls".to_string(),
4828 serde_json::Value::Object(project_urls_json),
4829 );
4830 }
4831}
4832
4833fn parse_setup_cfg(content: &str) -> IniSections {
4834 let mut sections: IniSections = HashMap::new();
4835 let mut current_section: Option<String> = None;
4836 let mut current_key: Option<String> = None;
4837
4838 for raw_line in content.lines() {
4839 let line = raw_line.trim_end_matches('\r');
4840 let trimmed = line.trim();
4841 if trimmed.is_empty() {
4842 continue;
4843 }
4844
4845 let stripped = line.trim_start();
4846 if stripped.starts_with('#') || stripped.starts_with(';') {
4847 continue;
4848 }
4849
4850 if stripped.starts_with('[') && stripped.ends_with(']') {
4851 let section_name = stripped
4852 .trim_start_matches('[')
4853 .trim_end_matches(']')
4854 .trim()
4855 .to_ascii_lowercase();
4856 current_section = if section_name.is_empty() {
4857 None
4858 } else {
4859 Some(section_name)
4860 };
4861 current_key = None;
4862 continue;
4863 }
4864
4865 if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
4866 if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
4867 let value = stripped.trim();
4868 if !value.is_empty() {
4869 sections
4870 .entry(section.clone())
4871 .or_default()
4872 .entry(key.clone())
4873 .or_default()
4874 .push(value.to_string());
4875 }
4876 }
4877 continue;
4878 }
4879
4880 if let Some((key, value)) = stripped.split_once('=')
4881 && let Some(section) = current_section.as_ref()
4882 {
4883 let key_name = key.trim().to_ascii_lowercase();
4884 let value_trimmed = value.trim();
4885 let entry = sections
4886 .entry(section.clone())
4887 .or_default()
4888 .entry(key_name.clone())
4889 .or_default();
4890 if !value_trimmed.is_empty() {
4891 entry.push(value_trimmed.to_string());
4892 }
4893 current_key = Some(key_name);
4894 }
4895 }
4896
4897 sections
4898}
4899
4900fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
4901 sections
4902 .get(§ion.to_ascii_lowercase())
4903 .and_then(|values| values.get(&key.to_ascii_lowercase()))
4904 .and_then(|entries| entries.first())
4905 .map(|value| value.trim().to_string())
4906}
4907
4908fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
4909 sections
4910 .get(§ion.to_ascii_lowercase())
4911 .and_then(|values| values.get(&key.to_ascii_lowercase()))
4912 .cloned()
4913 .unwrap_or_default()
4914}
4915
4916fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
4917 let mut dependencies = Vec::new();
4918
4919 for (sub_section, scope) in [
4920 ("install_requires", "install"),
4921 ("tests_require", "test"),
4922 ("setup_requires", "setup"),
4923 ] {
4924 let reqs = get_ini_values(sections, "options", sub_section);
4925 dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
4926 }
4927
4928 if let Some(extras) = sections.get("options.extras_require") {
4929 let mut extra_items: Vec<_> = extras.iter().collect();
4930 extra_items.sort_by_key(|(name, _)| *name);
4931 for (extra_name, reqs) in extra_items {
4932 dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
4933 }
4934 }
4935
4936 dependencies
4937}
4938
4939fn parse_setup_cfg_requirements(
4940 reqs: &[String],
4941 scope: &str,
4942 is_optional: bool,
4943) -> Vec<Dependency> {
4944 reqs.iter()
4945 .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
4946 .collect()
4947}
4948
4949fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
4950 let trimmed = req.trim();
4951 if trimmed.is_empty() || trimmed.starts_with('#') {
4952 return None;
4953 }
4954
4955 let name = extract_setup_cfg_dependency_name(trimmed)?;
4956 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4957
4958 Some(Dependency {
4959 purl: Some(purl.to_string()),
4960 extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
4961 scope: Some(scope.to_string()),
4962 is_runtime: Some(true),
4963 is_optional: Some(is_optional),
4964 is_pinned: Some(false),
4965 is_direct: Some(true),
4966 resolved_package: None,
4967 extra_data: None,
4968 })
4969}
4970
/// Extracts the package name from a requirement string.
///
/// The name is the leading run of characters up to the first whitespace or
/// requirement delimiter: a version operator (`<`, `>`, `=`, `!`, `~`), a
/// marker separator (`;`), an extras list (`[`), a parenthesised version
/// specifier (`(`) or a direct-reference marker (`@`).
///
/// Returns `None` when the input is blank or starts with a delimiter.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();

    let end = trimmed
        .find(|c: char| {
            c.is_whitespace()
                || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    // `trimmed` has no leading whitespace and `end` stops at the first
    // whitespace or delimiter, so the slice needs no further trimming.
    let name = &trimmed[..end];
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
4987
/// Removes every whitespace character from a requirement string.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    let mut compact = String::with_capacity(req.len());
    for piece in req.split_whitespace() {
        compact.push_str(piece);
    }
    compact
}
4991
/// Finds a quoted `key = "value"` assignment in `setup.py` source text.
///
/// This is a plain textual scan, not a Python parse. Double-quoted forms are
/// tried before single-quoted ones, and for each quote style the spacings
/// `key=`, `key =`, `key= ` and `key = ` are tried in that order. The first
/// form that yields a value wins.
///
/// A match is ignored when the key is immediately preceded by an identifier
/// character, so `description` does not match inside `long_description=`.
/// The value ends at the next occurrence of the *same* quote character that
/// opened it, so `"it's"` is returned intact.
///
/// Returns `None` when no form matches or the value is never closed.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    const SEPARATORS: [&str; 4] = ["=", " =", "= ", " = "];

    for quote in ['"', '\''] {
        for separator in SEPARATORS {
            let pattern = format!("{}{}{}", key, separator, quote);

            for (start_idx, _) in content.match_indices(pattern.as_str()) {
                // Skip hits that are only the tail of a longer identifier.
                let inside_identifier = content[..start_idx]
                    .chars()
                    .next_back()
                    .is_some_and(|c| c.is_alphanumeric() || c == '_');
                if inside_identifier {
                    continue;
                }

                let remaining = &content[start_idx + pattern.len()..];
                if let Some(end_idx) = remaining.find(quote) {
                    return Some(remaining[..end_idx].to_string());
                }
            }
        }
    }

    None
}
5017
5018fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
5019 let mut dependencies = Vec::new();
5020
5021 if let Some(tests_deps) = extract_tests_require(content) {
5022 dependencies.extend(tests_deps);
5023 }
5024
5025 if let Some(extras_deps) = extract_extras_require(content) {
5026 dependencies.extend(extras_deps);
5027 }
5028
5029 dependencies
5030}
5031
5032fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
5033 let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
5034 let re = Regex::new(pattern).ok()?;
5035 let captures = re.captures(content)?;
5036 let deps_str = captures.get(1)?.as_str();
5037
5038 let deps = parse_setup_py_dep_list(deps_str, "test", true);
5039 if deps.is_empty() { None } else { Some(deps) }
5040}
5041
5042fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
5043 let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
5044 let re = Regex::new(pattern).ok()?;
5045 let captures = re.captures(content)?;
5046 let dict_content = captures.get(1)?.as_str();
5047
5048 let mut all_deps = Vec::new();
5049
5050 let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
5051 let entry_re = Regex::new(entry_pattern).ok()?;
5052
5053 for entry_cap in entry_re.captures_iter(dict_content) {
5054 if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
5055 let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
5056 all_deps.extend(deps);
5057 }
5058 }
5059
5060 if all_deps.is_empty() {
5061 None
5062 } else {
5063 Some(all_deps)
5064 }
5065}
5066
5067fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
5068 let dep_pattern = r#"['"]([^'"]+)['"]"#;
5069 let re = match Regex::new(dep_pattern) {
5070 Ok(r) => r,
5071 Err(_) => return Vec::new(),
5072 };
5073
5074 re.captures_iter(deps_str)
5075 .filter_map(|cap| {
5076 let dep_str = cap.get(1)?.as_str().trim();
5077 if dep_str.is_empty() {
5078 return None;
5079 }
5080
5081 let name = extract_setup_cfg_dependency_name(dep_str)?;
5082 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5083
5084 Some(Dependency {
5085 purl: Some(purl.to_string()),
5086 extracted_requirement: Some(dep_str.to_string()),
5087 scope: Some(scope.to_string()),
5088 is_runtime: Some(true),
5089 is_optional: Some(is_optional),
5090 is_pinned: Some(false),
5091 is_direct: Some(true),
5092 resolved_package: None,
5093 extra_data: None,
5094 })
5095 })
5096 .collect()
5097}
5098
5099pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
5101 let content = read_file_to_string(path).map_err(|e| e.to_string())?;
5102 toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
5103}
5104
5105fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<Sha256Digest>) {
5116 let mut file = match File::open(path) {
5117 Ok(f) => f,
5118 Err(_) => return (None, None),
5119 };
5120
5121 let metadata = match file.metadata() {
5122 Ok(m) => m,
5123 Err(_) => return (None, None),
5124 };
5125 let size = metadata.len();
5126
5127 let mut hasher = Sha256::new();
5128 let mut buffer = vec![0; 8192];
5129
5130 loop {
5131 match file.read(&mut buffer) {
5132 Ok(0) => break,
5133 Ok(n) => hasher.update(&buffer[..n]),
5134 Err(_) => return (Some(size), None),
5135 }
5136 }
5137
5138 let hash = Sha256Digest::from_bytes(hasher.finalize().into());
5139 (Some(size), Some(hash))
5140}
5141
5142fn default_package_data(path: &Path) -> PackageData {
5143 PackageData {
5144 package_type: Some(PythonParser::PACKAGE_TYPE),
5145 primary_language: Some("Python".to_string()),
5146 datasource_id: infer_python_datasource_id(path),
5147 ..Default::default()
5148 }
5149}
5150
5151fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
5152 let file_name = path.file_name().and_then(|name| name.to_str());
5153
5154 match file_name {
5155 Some("pyproject.toml") => {
5156 if read_toml_file(path)
5157 .ok()
5158 .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
5159 .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
5160 .is_some()
5161 {
5162 Some(DatasourceId::PypiPoetryPyprojectToml)
5163 } else {
5164 Some(DatasourceId::PypiPyprojectToml)
5165 }
5166 }
5167 Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
5168 Some(DatasourceId::PypiSetupPy)
5169 }
5170 Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
5171 Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
5172 Some("METADATA") if is_installed_wheel_metadata_path(path) => {
5173 Some(DatasourceId::PypiWheelMetadata)
5174 }
5175 Some("pypi.json") => Some(DatasourceId::PypiJson),
5176 Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
5177 Some("origin.json") if is_pip_cache_origin_json(path) => {
5178 Some(DatasourceId::PypiPipOriginJson)
5179 }
5180 _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
5181 Some(DatasourceId::PypiSdist)
5182 }
5183 _ if path
5184 .extension()
5185 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
5186 {
5187 Some(DatasourceId::PypiWheel)
5188 }
5189 _ if path
5190 .extension()
5191 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
5192 {
5193 Some(DatasourceId::PypiEgg)
5194 }
5195 _ => None,
5196 }
5197}
5198
// Registers this parser's metadata. The arguments appear to be, in order: a
// human-readable description, the path glob patterns, the package type, the
// primary language and a documentation URL.
// NOTE(review): argument meaning is inferred from the values; confirm against
// the `register_parser!` macro definition.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);