1use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
35use crate::parser_warn as warn;
36use crate::parsers::utils::{read_file_to_string, split_name_email};
37use base64::Engine;
38use base64::engine::general_purpose::URL_SAFE_NO_PAD;
39use bzip2::read::BzDecoder;
40use csv::ReaderBuilder;
41use flate2::read::GzDecoder;
42use liblzma::read::XzDecoder;
43use packageurl::PackageUrl;
44use regex::Regex;
45use ruff_python_ast as ast;
46use ruff_python_parser::parse_module;
47use serde_json::{Map as JsonMap, Value as JsonValue};
48use sha2::{Digest, Sha256};
49use std::collections::{HashMap, HashSet};
50use std::fs::File;
51use std::io::Read;
52use std::path::{Component, Path, PathBuf};
53use tar::Archive;
54use toml::Value as TomlValue;
55use toml::map::Map as TomlMap;
56use zip::ZipArchive;
57
58use super::PackageParser;
59use super::license_normalization::{
60 DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
61 normalize_spdx_expression,
62};
63
// String keys for project metadata tables.
// NOTE(review): none of these are referenced in the visible part of the file; they look like
// `pyproject.toml` table/key names — confirm against the pyproject extraction code.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";
// Limits for `setup.py` processing.
// NOTE(review): presumably a byte cap and AST node/depth caps; their use is outside this chunk.
const MAX_SETUP_PY_BYTES: usize = 1_048_576;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;
// Archive safety limits:
// - MAX_ARCHIVE_SIZE (100 MiB): cap on the archive file size and on the summed entry sizes.
// - MAX_FILE_SIZE (50 MiB): cap on the declared size of a single archive entry.
// - MAX_COMPRESSION_RATIO: largest accepted uncompressed:compressed ratio.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
const MAX_COMPRESSION_RATIO: f64 = 100.0;
/// Stateless marker type that carries the `PackageParser` implementation for Python packages.
pub struct PythonParser;
95
/// Container format of a Python source distribution, as detected from the file name suffix.
#[derive(Clone, Copy, Debug)]
enum PythonSdistArchiveFormat {
    /// `.tar.gz`
    TarGz,
    /// `.tgz`
    Tgz,
    /// `.tar.bz2`
    TarBz2,
    /// `.tar.xz`
    TarXz,
    /// `.zip` (only reported when the archive holds a matching `PKG-INFO`)
    Zip,
}
104
/// A zip entry that passed the path, compression-ratio and size checks.
#[derive(Clone, Debug)]
struct ValidatedZipEntry {
    /// Position of the entry inside the zip archive.
    index: usize,
    /// Entry path after normalization by `normalize_archive_entry_path`.
    name: String,
}
110
111impl PackageParser for PythonParser {
112 const PACKAGE_TYPE: PackageType = PackageType::Pypi;
113
114 fn extract_packages(path: &Path) -> Vec<PackageData> {
115 vec![
116 if path.file_name().unwrap_or_default() == "pyproject.toml" {
117 extract_from_pyproject_toml(path)
118 } else if path.file_name().unwrap_or_default() == "setup.cfg" {
119 extract_from_setup_cfg(path)
120 } else if is_setup_py_like_path(path) {
121 return extract_setup_py_packages(path);
122 } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
123 extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
124 } else if is_installed_wheel_metadata_path(path) {
125 extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
126 } else if is_pip_cache_origin_json(path) {
127 extract_from_pip_origin_json(path)
128 } else if path.file_name().unwrap_or_default() == "pypi.json" {
129 extract_from_pypi_json(path)
130 } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
131 extract_from_pip_inspect(path)
132 } else if is_python_sdist_archive_path(path) {
133 extract_from_sdist_archive(path)
134 } else if path
135 .extension()
136 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
137 {
138 extract_from_wheel_archive(path)
139 } else if path
140 .extension()
141 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
142 {
143 extract_from_egg_archive(path)
144 } else {
145 default_package_data(path)
146 },
147 ]
148 }
149
150 fn is_match(path: &Path) -> bool {
151 if let Some(filename) = path.file_name()
152 && (filename == "pyproject.toml"
153 || filename == "setup.cfg"
154 || is_setup_py_like_path(path)
155 || filename == "PKG-INFO"
156 || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
157 || filename == "pypi.json"
158 || filename == "pip-inspect.deplock"
159 || is_pip_cache_origin_json(path))
160 {
161 return true;
162 }
163
164 if let Some(extension) = path.extension() {
165 let ext = extension.to_string_lossy().to_lowercase();
166 if ext == "whl" || ext == "egg" || is_python_sdist_archive_path(path) {
167 return true;
168 }
169 }
170
171 false
172 }
173}
174
/// Returns `true` for a file named `setup.py` or whose name ends with `_setup.py`.
fn is_setup_py_like_path(path: &Path) -> bool {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name == "setup.py" || name.ends_with("_setup.py"),
        // No file name, or a name that is not valid UTF-8.
        None => false,
    }
}
180
/// Returns `true` when `path` is a `METADATA` file directly inside a `*.dist-info` directory.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    let is_metadata_file = path.file_name().and_then(|name| name.to_str()) == Some("METADATA");
    if !is_metadata_file {
        return false;
    }

    let parent_dir_name = path
        .parent()
        .and_then(Path::file_name)
        .and_then(|name| name.to_str());
    matches!(parent_dir_name, Some(dir) if dir.ends_with(".dist-info"))
}
189
/// Values read from the headers of a `WHEEL` file.
#[derive(Debug, Clone)]
struct InstalledWheelMetadata {
    /// Every `Tag` header value, in file order (never empty when parsing succeeds).
    wheel_tags: Vec<String>,
    /// First `Wheel-Version` header value, if any.
    wheel_version: Option<String>,
    /// First `Generator` header value, if any.
    wheel_generator: Option<String>,
    /// `Root-Is-Purelib` parsed as a boolean; `None` when absent or not `true`/`false`.
    root_is_purelib: Option<bool>,
    /// All tags merged into one `python-abi-platform` tag, when they can be merged.
    compressed_tag: Option<String>,
}
198
199fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
200 let Some(parent) = path.parent() else {
201 return;
202 };
203
204 if !parent
205 .file_name()
206 .and_then(|name| name.to_str())
207 .is_some_and(|name| name.ends_with(".dist-info"))
208 {
209 return;
210 }
211
212 let wheel_path = parent.join("WHEEL");
213 if !wheel_path.exists() {
214 return;
215 }
216
217 let Ok(content) = read_file_to_string(&wheel_path) else {
218 warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
219 return;
220 };
221
222 let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
223 return;
224 };
225
226 apply_installed_wheel_metadata(package_data, &wheel_metadata);
227}
228
229fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
230 use super::rfc822::{get_header_all, get_header_first};
231
232 let metadata = super::rfc822::parse_rfc822_content(content);
233 let wheel_tags = get_header_all(&metadata.headers, "tag");
234 if wheel_tags.is_empty() {
235 return None;
236 }
237
238 let wheel_version = get_header_first(&metadata.headers, "wheel-version");
239 let wheel_generator = get_header_first(&metadata.headers, "generator");
240 let root_is_purelib =
241 get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
242 match value.to_ascii_lowercase().as_str() {
243 "true" => Some(true),
244 "false" => Some(false),
245 _ => None,
246 }
247 });
248
249 let compressed_tag = compress_wheel_tags(&wheel_tags);
250
251 Some(InstalledWheelMetadata {
252 wheel_tags,
253 wheel_version,
254 wheel_generator,
255 root_is_purelib,
256 compressed_tag,
257 })
258}
259
/// Merges several `python-abi-platform` tags into one compressed tag.
///
/// * no tags: `None`
/// * one tag: that tag, unchanged
/// * several tags: `py1.py2-abi-platform`, provided every tag has three `-` separated parts
///   and all tags share the same abi and platform; otherwise `None`.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => None,
        [only] => Some(only.clone()),
        _ => {
            let mut interpreters: Vec<&str> = Vec::with_capacity(tags.len());
            let mut shared: Option<(&str, &str)> = None;

            for tag in tags {
                // The platform part keeps any further dashes.
                let mut fields = tag.splitn(3, '-');
                let interpreter = fields.next()?;
                let abi = fields.next()?;
                let platform = fields.next()?;

                match shared {
                    Some(seen) if seen != (abi, platform) => return None,
                    _ => shared = Some((abi, platform)),
                }
                interpreters.push(interpreter);
            }

            let (abi, platform) = shared?;
            Some(format!("{}-{}-{}", interpreters.join("."), abi, platform))
        }
    }
}
297
298fn apply_installed_wheel_metadata(
299 package_data: &mut PackageData,
300 wheel_metadata: &InstalledWheelMetadata,
301) {
302 let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
303 extra_data.insert(
304 "wheel_tags".to_string(),
305 JsonValue::Array(
306 wheel_metadata
307 .wheel_tags
308 .iter()
309 .cloned()
310 .map(JsonValue::String)
311 .collect(),
312 ),
313 );
314
315 if let Some(wheel_version) = &wheel_metadata.wheel_version {
316 extra_data.insert(
317 "wheel_version".to_string(),
318 JsonValue::String(wheel_version.clone()),
319 );
320 }
321
322 if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
323 extra_data.insert(
324 "wheel_generator".to_string(),
325 JsonValue::String(wheel_generator.clone()),
326 );
327 }
328
329 if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
330 extra_data.insert(
331 "root_is_purelib".to_string(),
332 JsonValue::Bool(root_is_purelib),
333 );
334 }
335
336 if let (Some(name), Some(version), Some(extension)) = (
337 package_data.name.as_deref(),
338 package_data.version.as_deref(),
339 wheel_metadata.compressed_tag.as_deref(),
340 ) {
341 package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
342 }
343}
344
/// Returns `true` for an `origin.json` file that has a directory named `wheels`
/// (ASCII case-insensitive) somewhere among its ancestors.
fn is_pip_cache_origin_json(path: &Path) -> bool {
    if path.file_name().and_then(|name| name.to_str()) != Some("origin.json") {
        return false;
    }

    // `skip(1)` leaves out the path itself; only its parent directories are inspected.
    path.ancestors()
        .skip(1)
        .filter_map(|ancestor| ancestor.file_name())
        .filter_map(|name| name.to_str())
        .any(|name| name.eq_ignore_ascii_case("wheels"))
}
354
355fn extract_from_pip_origin_json(path: &Path) -> PackageData {
356 let content = match read_file_to_string(path) {
357 Ok(content) => content,
358 Err(e) => {
359 warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
360 return default_package_data(path);
361 }
362 };
363
364 let root: JsonValue = match serde_json::from_str(&content) {
365 Ok(root) => root,
366 Err(e) => {
367 warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
368 return default_package_data(path);
369 }
370 };
371
372 let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
373 warn!("No url found in pip cache origin.json at {:?}", path);
374 return default_package_data(path);
375 };
376
377 let sibling_wheel = find_sibling_cached_wheel(path);
378 let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
379 sibling_wheel
380 .as_ref()
381 .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
382 });
383
384 let Some((name, version)) = name_version else {
385 warn!(
386 "Failed to infer package name/version from pip cache origin.json at {:?}",
387 path
388 );
389 return default_package_data(path);
390 };
391
392 let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
393 build_pypi_urls(Some(&name), Some(&version));
394 let purl = sibling_wheel
395 .as_ref()
396 .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
397 .or(plain_purl);
398
399 PackageData {
400 package_type: Some(PythonParser::PACKAGE_TYPE),
401 primary_language: Some("Python".to_string()),
402 name: Some(name),
403 version: Some(version),
404 datasource_id: Some(DatasourceId::PypiPipOriginJson),
405 download_url: Some(download_url.to_string()),
406 sha256: extract_sha256_from_origin_json(&root),
407 repository_homepage_url,
408 repository_download_url,
409 api_data_url,
410 purl,
411 ..Default::default()
412 }
413}
414
415fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
416 let parent = path.parent()?;
417 let entries = parent.read_dir().ok()?;
418
419 for entry in entries.flatten() {
420 let sibling_path = entry.path();
421 if sibling_path
422 .extension()
423 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
424 && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
425 {
426 return Some(wheel_info);
427 }
428 }
429
430 None
431}
432
433fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
434 let file_name = url.rsplit('/').next()?;
435
436 if file_name.ends_with(".whl") {
437 return parse_wheel_filename(Path::new(file_name))
438 .map(|wheel_info| (wheel_info.name, wheel_info.version));
439 }
440
441 let stem = strip_python_archive_extension(file_name)?;
442 let (name, version) = stem.rsplit_once('-')?;
443 if name.is_empty() || version.is_empty() {
444 return None;
445 }
446
447 Some((name.replace('_', "-"), version.to_string()))
448}
449
/// Removes a known Python archive extension from `file_name` and returns the remaining stem.
///
/// Recognised extensions are `.tar.gz`, `.tar.bz2`, `.tar.xz`, `.tgz`, `.zip` and `.whl`.
/// Matching ignores ASCII case, so `Foo-1.0.TAR.GZ` yields `Foo-1.0`; the stem keeps its
/// original casing. Returns `None` when no extension matches.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const SUFFIXES: [&str; 6] = [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    SUFFIXES.iter().find_map(|suffix| {
        let split = file_name.len().checked_sub(suffix.len())?;
        // `get` returns `None` when `split` falls inside a multi-byte character.
        let tail = file_name.get(split..)?;
        tail.eq_ignore_ascii_case(suffix)
            .then(|| &file_name[..split])
    })
}
455
456fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
457 root.pointer("/archive_info/hashes/sha256")
458 .and_then(|value| value.as_str())
459 .map(ToOwned::to_owned)
460 .or_else(|| {
461 root.pointer("/archive_info/hash")
462 .and_then(|value| value.as_str())
463 .and_then(normalize_origin_hash)
464 })
465}
466
/// Extracts a SHA-256 digest from a hash string.
///
/// Accepts `sha256=<digest>`, `sha256:<digest>` (the digest is returned as-is) or a bare
/// 64-character hexadecimal string. Anything else yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = hash
        .strip_prefix("sha256=")
        .or_else(|| hash.strip_prefix("sha256:"));

    match prefixed {
        Some(digest) => Some(digest.to_string()),
        None => {
            let is_bare_digest = hash.len() == 64 && hash.bytes().all(|b| b.is_ascii_hexdigit());
            is_bare_digest.then(|| hash.to_string())
        }
    }
}
479
480fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
481 let content = match read_file_to_string(path) {
482 Ok(content) => content,
483 Err(e) => {
484 warn!("Failed to read metadata at {:?}: {}", path, e);
485 return default_package_data(path);
486 }
487 };
488
489 let metadata = super::rfc822::parse_rfc822_content(&content);
490 let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
491 merge_sibling_metadata_dependencies(path, &mut package_data);
492 merge_sibling_metadata_file_references(path, &mut package_data);
493 if datasource_id == DatasourceId::PypiWheelMetadata {
494 merge_sibling_wheel_metadata(path, &mut package_data);
495 }
496 package_data
497}
498
499fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
500 let mut extra_dependencies = Vec::new();
501
502 if let Some(parent) = path.parent() {
503 let direct_requires = parent.join("requires.txt");
504 if direct_requires.exists()
505 && let Ok(content) = read_file_to_string(&direct_requires)
506 {
507 extra_dependencies.extend(parse_requires_txt(&content));
508 }
509
510 let sibling_egg_info_requires = parent
511 .read_dir()
512 .ok()
513 .into_iter()
514 .flatten()
515 .flatten()
516 .find_map(|entry| {
517 let child_path = entry.path();
518 if child_path.is_dir()
519 && child_path
520 .file_name()
521 .and_then(|name| name.to_str())
522 .is_some_and(|name| name.ends_with(".egg-info"))
523 {
524 let requires = child_path.join("requires.txt");
525 requires.exists().then_some(requires)
526 } else {
527 None
528 }
529 });
530
531 if let Some(requires_path) = sibling_egg_info_requires
532 && let Ok(content) = read_file_to_string(&requires_path)
533 {
534 extra_dependencies.extend(parse_requires_txt(&content));
535 }
536 }
537
538 for dependency in extra_dependencies {
539 if !package_data.dependencies.iter().any(|existing| {
540 existing.purl == dependency.purl
541 && existing.scope == dependency.scope
542 && existing.extracted_requirement == dependency.extracted_requirement
543 && existing.extra_data == dependency.extra_data
544 }) {
545 package_data.dependencies.push(dependency);
546 }
547 }
548}
549
550fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
551 let mut extra_refs = Vec::new();
552
553 if let Some(parent) = path.parent() {
554 let record_path = parent.join("RECORD");
555 if record_path.exists()
556 && let Ok(content) = read_file_to_string(&record_path)
557 {
558 extra_refs.extend(parse_record_csv(&content));
559 }
560
561 let installed_files_path = parent.join("installed-files.txt");
562 if installed_files_path.exists()
563 && let Ok(content) = read_file_to_string(&installed_files_path)
564 {
565 extra_refs.extend(parse_installed_files_txt(&content));
566 }
567
568 let sources_path = parent.join("SOURCES.txt");
569 if sources_path.exists()
570 && let Ok(content) = read_file_to_string(&sources_path)
571 {
572 extra_refs.extend(parse_sources_txt(&content));
573 }
574 }
575
576 for file_ref in extra_refs {
577 if !package_data
578 .file_references
579 .iter()
580 .any(|existing| existing.path == file_ref.path)
581 {
582 package_data.file_references.push(file_ref);
583 }
584 }
585}
586
587fn collect_validated_zip_entries<R: Read + std::io::Seek>(
588 archive: &mut ZipArchive<R>,
589 path: &Path,
590 archive_type: &str,
591) -> Result<Vec<ValidatedZipEntry>, String> {
592 let mut total_extracted = 0u64;
593 let mut entries = Vec::new();
594
595 for i in 0..archive.len() {
596 if let Ok(file) = archive.by_index_raw(i) {
597 let compressed_size = file.compressed_size();
598 let uncompressed_size = file.size();
599 let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
600 warn!(
601 "Skipping unsafe path in {} {:?}: {}",
602 archive_type,
603 path,
604 file.name()
605 );
606 continue;
607 };
608
609 if compressed_size > 0 {
610 let ratio = uncompressed_size as f64 / compressed_size as f64;
611 if ratio > MAX_COMPRESSION_RATIO {
612 warn!(
613 "Suspicious compression ratio in {} {:?}: {:.2}:1",
614 archive_type, path, ratio
615 );
616 continue;
617 }
618 }
619
620 if uncompressed_size > MAX_FILE_SIZE {
621 warn!(
622 "File too large in {} {:?}: {} bytes (limit: {} bytes)",
623 archive_type, path, uncompressed_size, MAX_FILE_SIZE
624 );
625 continue;
626 }
627
628 total_extracted += uncompressed_size;
629 if total_extracted > MAX_ARCHIVE_SIZE {
630 let msg = format!(
631 "Total extracted size exceeds limit for {} {:?}",
632 archive_type, path
633 );
634 warn!("{}", msg);
635 return Err(msg);
636 }
637
638 entries.push(ValidatedZipEntry {
639 index: i,
640 name: entry_name,
641 });
642 }
643 }
644
645 Ok(entries)
646}
647
648fn is_python_sdist_archive_path(path: &Path) -> bool {
649 detect_python_sdist_archive_format(path).is_some()
650}
651
652fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
653 let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
654
655 if !is_likely_python_sdist_filename(&file_name) {
656 return None;
657 }
658
659 if file_name.ends_with(".tar.gz") {
660 Some(PythonSdistArchiveFormat::TarGz)
661 } else if file_name.ends_with(".tgz") {
662 Some(PythonSdistArchiveFormat::Tgz)
663 } else if file_name.ends_with(".tar.bz2") {
664 Some(PythonSdistArchiveFormat::TarBz2)
665 } else if file_name.ends_with(".tar.xz") {
666 Some(PythonSdistArchiveFormat::TarXz)
667 } else if file_name.ends_with(".zip") {
668 zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
669 } else {
670 None
671 }
672}
673
674fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
675 if !path.is_file() {
676 return true;
677 }
678
679 let file = match File::open(path) {
680 Ok(file) => file,
681 Err(_) => return false,
682 };
683 let mut archive = match ZipArchive::new(file) {
684 Ok(archive) => archive,
685 Err(_) => return false,
686 };
687
688 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
689 Ok(entries) => entries,
690 Err(_) => return false,
691 };
692 let metadata_entries: Vec<_> = validated_entries
693 .iter()
694 .filter(|entry| entry.name.ends_with("/PKG-INFO"))
695 .filter_map(|entry| {
696 read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
697 .ok()
698 .map(|content| (entry.name.clone(), content))
699 })
700 .collect();
701
702 has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
703}
704
705fn is_likely_python_sdist_filename(file_name: &str) -> bool {
706 let Some(stem) = strip_python_archive_extension(file_name) else {
707 return false;
708 };
709
710 let Some((name, version)) = stem.rsplit_once('-') else {
711 return false;
712 };
713
714 !name.is_empty()
715 && !version.is_empty()
716 && version.chars().any(|ch| ch.is_ascii_digit())
717 && name
718 .chars()
719 .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
720}
721
722fn extract_from_sdist_archive(path: &Path) -> PackageData {
723 let metadata = match std::fs::metadata(path) {
724 Ok(m) => m,
725 Err(e) => {
726 warn!(
727 "Failed to read metadata for sdist archive {:?}: {}",
728 path, e
729 );
730 return default_package_data(path);
731 }
732 };
733
734 if metadata.len() > MAX_ARCHIVE_SIZE {
735 warn!(
736 "sdist archive too large: {} bytes (limit: {} bytes)",
737 metadata.len(),
738 MAX_ARCHIVE_SIZE
739 );
740 return default_package_data(path);
741 }
742
743 let Some(format) = detect_python_sdist_archive_format(path) else {
744 return default_package_data(path);
745 };
746
747 let mut package_data = match format {
748 PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
749 let file = match File::open(path) {
750 Ok(file) => file,
751 Err(e) => {
752 warn!("Failed to open sdist archive {:?}: {}", path, e);
753 return default_package_data(path);
754 }
755 };
756 let decoder = GzDecoder::new(file);
757 extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
758 }
759 PythonSdistArchiveFormat::TarBz2 => {
760 let file = match File::open(path) {
761 Ok(file) => file,
762 Err(e) => {
763 warn!("Failed to open sdist archive {:?}: {}", path, e);
764 return default_package_data(path);
765 }
766 };
767 let decoder = BzDecoder::new(file);
768 extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
769 }
770 PythonSdistArchiveFormat::TarXz => {
771 let file = match File::open(path) {
772 Ok(file) => file,
773 Err(e) => {
774 warn!("Failed to open sdist archive {:?}: {}", path, e);
775 return default_package_data(path);
776 }
777 };
778 let decoder = XzDecoder::new(file);
779 extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
780 }
781 PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
782 };
783
784 if package_data.package_type.is_some() {
785 let (size, sha256) = calculate_file_checksums(path);
786 package_data.size = size;
787 package_data.sha256 = sha256;
788 }
789
790 package_data
791}
792
/// Reads the relevant text entries from a (decompressed) tar sdist stream and builds
/// package data from them.
///
/// * `path` - archive location, used for log messages and passed on to
///   `build_sdist_package_data`.
/// * `reader` - decompressed tar stream.
/// * `archive_type` - label used in log messages (e.g. `"tar.gz"`).
/// * `compressed_size` - size of the archive file on disk, used for the ratio check.
///
/// Returns default package data when the archive cannot be read, when the summed entry sizes
/// exceed `MAX_ARCHIVE_SIZE`, or when the running compression ratio exceeds
/// `MAX_COMPRESSION_RATIO`.
fn extract_from_tar_sdist_archive<R: Read>(
    path: &Path,
    reader: R,
    archive_type: &str,
    compressed_size: u64,
) -> PackageData {
    let mut archive = Archive::new(reader);
    let archive_entries = match archive.entries() {
        Ok(entries) => entries,
        Err(e) => {
            warn!(
                "Failed to read {} sdist archive {:?}: {}",
                archive_type, path, e
            );
            return default_package_data(path);
        }
    };

    // Running total of the declared sizes of all entries accepted so far.
    let mut total_extracted = 0u64;
    // Collected `(normalized entry path, text content)` pairs.
    let mut entries = Vec::new();

    for entry_result in archive_entries {
        let mut entry = match entry_result {
            Ok(entry) => entry,
            Err(e) => {
                warn!(
                    "Failed to read {} sdist entry from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        // Oversized entries are skipped and do not count towards the total.
        let entry_size = entry.size();
        if entry_size > MAX_FILE_SIZE {
            warn!(
                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
                archive_type, path, entry_size, MAX_FILE_SIZE
            );
            continue;
        }

        // Exceeding the overall budget aborts the whole extraction.
        total_extracted += entry_size;
        if total_extracted > MAX_ARCHIVE_SIZE {
            warn!(
                "Total extracted size exceeds limit for {} sdist {:?}",
                archive_type, path
            );
            return default_package_data(path);
        }

        // The ratio compares the running total against the size of the whole archive file.
        if compressed_size > 0 {
            let ratio = total_extracted as f64 / compressed_size as f64;
            if ratio > MAX_COMPRESSION_RATIO {
                warn!(
                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
                    archive_type, path, ratio
                );
                return default_package_data(path);
            }
        }

        // Backslashes are converted so later suffix checks can rely on `/` separators.
        let entry_path = match entry.path() {
            Ok(path) => path.to_string_lossy().replace('\\', "/"),
            Err(e) => {
                warn!(
                    "Failed to get {} sdist entry path from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
            continue;
        };

        // Only PKG-INFO, requires.txt and SOURCES.txt entries are read.
        if !is_relevant_sdist_text_entry(&entry_path) {
            continue;
        }

        // Entries that cannot be read are dropped.
        if let Ok(content) = read_limited_utf8(
            &mut entry,
            MAX_FILE_SIZE,
            &format!("{} entry {}", archive_type, entry_path),
        ) {
            entries.push((entry_path, content));
        }
    }

    build_sdist_package_data(path, entries)
}
886
887fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
888 let file = match File::open(path) {
889 Ok(file) => file,
890 Err(e) => {
891 warn!("Failed to open zip sdist archive {:?}: {}", path, e);
892 return default_package_data(path);
893 }
894 };
895
896 let mut archive = match ZipArchive::new(file) {
897 Ok(archive) => archive,
898 Err(e) => {
899 warn!("Failed to read zip sdist archive {:?}: {}", path, e);
900 return default_package_data(path);
901 }
902 };
903
904 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
905 Ok(entries) => entries,
906 Err(_) => return default_package_data(path),
907 };
908
909 let mut entries = Vec::new();
910 for entry in validated_entries.iter() {
911 if !is_relevant_sdist_text_entry(&entry.name) {
912 continue;
913 }
914
915 if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
916 entries.push((entry.name.clone(), content));
917 }
918 }
919
920 build_sdist_package_data(path, entries)
921}
922
/// Returns `true` for archive entries named `PKG-INFO`, `requires.txt` or `SOURCES.txt`
/// that sit inside at least one directory.
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"]
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
928
929fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
930 let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
931 warn!("No PKG-INFO file found in sdist archive {:?}", path);
932 return default_package_data(path);
933 };
934
935 let mut package_data =
936 python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
937 merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
938 merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
939 apply_sdist_name_version_fallback(path, &mut package_data);
940 package_data.datasource_id = Some(DatasourceId::PypiSdist);
941 package_data
942}
943
944fn select_sdist_pkginfo_entry(
945 archive_path: &Path,
946 entries: &[(String, String)],
947) -> Option<(String, String)> {
948 let expected_name = sdist_archive_expected_name(archive_path);
949
950 entries
951 .iter()
952 .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
953 .min_by_key(|(entry_path, content)| {
954 let components: Vec<_> = entry_path
955 .split('/')
956 .filter(|part| !part.is_empty())
957 .collect();
958 let candidate_name = sdist_pkginfo_candidate_name(content);
959 let name_rank = if candidate_name == expected_name {
960 0
961 } else {
962 1
963 };
964 let kind_rank = sdist_pkginfo_kind_rank(entry_path);
965
966 (name_rank, kind_rank, components.len(), entry_path.clone())
967 })
968 .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
969}
970
971fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
972 let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
973 return false;
974 };
975
976 entries.iter().any(|(entry_path, content)| {
977 sdist_pkginfo_kind_rank(entry_path) < 3
978 && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
979 })
980}
981
982fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
983 archive_path
984 .file_name()
985 .and_then(|name| name.to_str())
986 .and_then(strip_python_archive_extension)
987 .and_then(|stem| {
988 stem.rsplit_once('-')
989 .map(|(name, _)| normalize_python_package_name(name))
990 })
991}
992
993fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
994 let metadata = super::rfc822::parse_rfc822_content(content);
995 super::rfc822::get_header_first(&metadata.headers, "name")
996 .map(|name| normalize_python_package_name(&name))
997}
998
/// Ranks a `PKG-INFO` entry path by location (lower is better):
///
/// * `0` - `<root>/<x>.egg-info/PKG-INFO`
/// * `1` - `<root>/PKG-INFO`
/// * `2` - any other path ending in `.egg-info/PKG-INFO`
/// * `3` - everything else
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    // Empty segments (doubled or leading slashes) are ignored.
    let segments: Vec<&str> = entry_path
        .split('/')
        .filter(|segment| !segment.is_empty())
        .collect();

    match segments.as_slice() {
        [_, egg_info, "PKG-INFO"] if egg_info.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1016
1017fn merge_sdist_archive_dependencies(
1018 entries: &[(String, String)],
1019 metadata_path: &str,
1020 package_data: &mut PackageData,
1021) {
1022 let metadata_dir = metadata_path
1023 .rsplit_once('/')
1024 .map(|(dir, _)| dir)
1025 .unwrap_or("");
1026 let archive_root = metadata_path.split('/').next().unwrap_or("");
1027 let matched_egg_info_dir =
1028 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1029 let mut extra_dependencies = Vec::new();
1030
1031 for (entry_path, content) in entries {
1032 let is_direct_requires =
1033 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1034 let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1035 entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1036 });
1037
1038 if is_direct_requires || is_egg_info_requires {
1039 extra_dependencies.extend(parse_requires_txt(content));
1040 }
1041 }
1042
1043 for dependency in extra_dependencies {
1044 if !package_data.dependencies.iter().any(|existing| {
1045 existing.purl == dependency.purl
1046 && existing.scope == dependency.scope
1047 && existing.extracted_requirement == dependency.extracted_requirement
1048 && existing.extra_data == dependency.extra_data
1049 }) {
1050 package_data.dependencies.push(dependency);
1051 }
1052 }
1053}
1054
1055fn merge_sdist_archive_file_references(
1056 entries: &[(String, String)],
1057 metadata_path: &str,
1058 package_data: &mut PackageData,
1059) {
1060 let metadata_dir = metadata_path
1061 .rsplit_once('/')
1062 .map(|(dir, _)| dir)
1063 .unwrap_or("");
1064 let archive_root = metadata_path.split('/').next().unwrap_or("");
1065 let matched_egg_info_dir =
1066 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1067 let mut extra_refs = Vec::new();
1068
1069 for (entry_path, content) in entries {
1070 let is_direct_sources =
1071 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1072 let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1073 entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1074 });
1075
1076 if is_direct_sources || is_egg_info_sources {
1077 extra_refs.extend(parse_sources_txt(content));
1078 }
1079 }
1080
1081 for file_ref in extra_refs {
1082 if !package_data
1083 .file_references
1084 .iter()
1085 .any(|existing| existing.path == file_ref.path)
1086 {
1087 package_data.file_references.push(file_ref);
1088 }
1089 }
1090}
1091
1092fn select_matching_sdist_egg_info_dir(
1093 entries: &[(String, String)],
1094 archive_root: &str,
1095 package_name: Option<&str>,
1096) -> Option<String> {
1097 let normalized_package_name = package_name.map(normalize_python_package_name);
1098
1099 entries
1100 .iter()
1101 .filter_map(|(entry_path, _)| {
1102 let components: Vec<_> = entry_path
1103 .split('/')
1104 .filter(|part| !part.is_empty())
1105 .collect();
1106 if components.len() == 3
1107 && components[0] == archive_root
1108 && components[1].ends_with(".egg-info")
1109 {
1110 Some(components[1].to_string())
1111 } else {
1112 None
1113 }
1114 })
1115 .min_by_key(|egg_info_dir| {
1116 let normalized_dir_name =
1117 normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1118 let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1119 0
1120 } else {
1121 1
1122 };
1123
1124 (name_rank, egg_info_dir.clone())
1125 })
1126}
1127
/// Lowercases ASCII letters and turns every underscore into a hyphen.
fn normalize_python_package_name(name: &str) -> String {
    name.chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
1131
1132fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1133 let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1134 return;
1135 };
1136
1137 let Some(stem) = strip_python_archive_extension(file_name) else {
1138 return;
1139 };
1140
1141 let Some((name, version)) = stem.rsplit_once('-') else {
1142 return;
1143 };
1144
1145 if package_data.name.is_none() {
1146 package_data.name = Some(name.replace('_', "-"));
1147 }
1148 if package_data.version.is_none() {
1149 package_data.version = Some(version.to_string());
1150 }
1151
1152 if package_data.purl.is_none()
1153 || package_data.repository_homepage_url.is_none()
1154 || package_data.repository_download_url.is_none()
1155 || package_data.api_data_url.is_none()
1156 {
1157 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1158 build_pypi_urls(
1159 package_data.name.as_deref(),
1160 package_data.version.as_deref(),
1161 );
1162
1163 if package_data.repository_homepage_url.is_none() {
1164 package_data.repository_homepage_url = repository_homepage_url;
1165 }
1166 if package_data.repository_download_url.is_none() {
1167 package_data.repository_download_url = repository_download_url;
1168 }
1169 if package_data.api_data_url.is_none() {
1170 package_data.api_data_url = api_data_url;
1171 }
1172 if package_data.purl.is_none() {
1173 package_data.purl = purl;
1174 }
1175 }
1176}
1177
1178fn extract_from_wheel_archive(path: &Path) -> PackageData {
1179 let metadata = match std::fs::metadata(path) {
1180 Ok(m) => m,
1181 Err(e) => {
1182 warn!(
1183 "Failed to read metadata for wheel archive {:?}: {}",
1184 path, e
1185 );
1186 return default_package_data(path);
1187 }
1188 };
1189
1190 if metadata.len() > MAX_ARCHIVE_SIZE {
1191 warn!(
1192 "Wheel archive too large: {} bytes (limit: {} bytes)",
1193 metadata.len(),
1194 MAX_ARCHIVE_SIZE
1195 );
1196 return default_package_data(path);
1197 }
1198
1199 let file = match File::open(path) {
1200 Ok(f) => f,
1201 Err(e) => {
1202 warn!("Failed to open wheel archive {:?}: {}", path, e);
1203 return default_package_data(path);
1204 }
1205 };
1206
1207 let mut archive = match ZipArchive::new(file) {
1208 Ok(a) => a,
1209 Err(e) => {
1210 warn!("Failed to read wheel archive {:?}: {}", path, e);
1211 return default_package_data(path);
1212 }
1213 };
1214
1215 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1216 Ok(entries) => entries,
1217 Err(_) => return default_package_data(path),
1218 };
1219
1220 let metadata_entry =
1221 match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1222 Some(entry) => entry,
1223 None => {
1224 warn!("No METADATA file found in wheel archive {:?}", path);
1225 return default_package_data(path);
1226 }
1227 };
1228
1229 let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1230 Ok(c) => c,
1231 Err(e) => {
1232 warn!("Failed to read METADATA from {:?}: {}", path, e);
1233 return default_package_data(path);
1234 }
1235 };
1236
1237 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1238
1239 let (size, sha256) = calculate_file_checksums(path);
1240 package_data.size = size;
1241 package_data.sha256 = sha256;
1242
1243 if let Some(record_entry) =
1244 find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1245 && let Ok(record_content) =
1246 read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1247 {
1248 package_data.file_references = parse_record_csv(&record_content);
1249 }
1250
1251 if let Some(wheel_info) = parse_wheel_filename(path) {
1252 if package_data.name.is_none() {
1253 package_data.name = Some(wheel_info.name.clone());
1254 }
1255 if package_data.version.is_none() {
1256 package_data.version = Some(wheel_info.version.clone());
1257 }
1258
1259 package_data.qualifiers = Some(std::collections::HashMap::from([(
1260 "extension".to_string(),
1261 format!(
1262 "{}-{}-{}",
1263 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1264 ),
1265 )]));
1266
1267 package_data.purl = build_wheel_purl(
1268 package_data.name.as_deref(),
1269 package_data.version.as_deref(),
1270 &wheel_info,
1271 );
1272
1273 let mut extra_data = package_data.extra_data.unwrap_or_default();
1274 extra_data.insert(
1275 "python_requires".to_string(),
1276 serde_json::Value::String(wheel_info.python_tag.clone()),
1277 );
1278 extra_data.insert(
1279 "abi_tag".to_string(),
1280 serde_json::Value::String(wheel_info.abi_tag.clone()),
1281 );
1282 extra_data.insert(
1283 "platform_tag".to_string(),
1284 serde_json::Value::String(wheel_info.platform_tag.clone()),
1285 );
1286 package_data.extra_data = Some(extra_data);
1287 }
1288
1289 package_data
1290}
1291
1292fn extract_from_egg_archive(path: &Path) -> PackageData {
1293 let metadata = match std::fs::metadata(path) {
1294 Ok(m) => m,
1295 Err(e) => {
1296 warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1297 return default_package_data(path);
1298 }
1299 };
1300
1301 if metadata.len() > MAX_ARCHIVE_SIZE {
1302 warn!(
1303 "Egg archive too large: {} bytes (limit: {} bytes)",
1304 metadata.len(),
1305 MAX_ARCHIVE_SIZE
1306 );
1307 return default_package_data(path);
1308 }
1309
1310 let file = match File::open(path) {
1311 Ok(f) => f,
1312 Err(e) => {
1313 warn!("Failed to open egg archive {:?}: {}", path, e);
1314 return default_package_data(path);
1315 }
1316 };
1317
1318 let mut archive = match ZipArchive::new(file) {
1319 Ok(a) => a,
1320 Err(e) => {
1321 warn!("Failed to read egg archive {:?}: {}", path, e);
1322 return default_package_data(path);
1323 }
1324 };
1325
1326 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1327 Ok(entries) => entries,
1328 Err(_) => return default_package_data(path),
1329 };
1330
1331 let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1332 &validated_entries,
1333 &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1334 ) {
1335 Some(entry) => entry,
1336 None => {
1337 warn!("No PKG-INFO file found in egg archive {:?}", path);
1338 return default_package_data(path);
1339 }
1340 };
1341
1342 let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1343 Ok(c) => c,
1344 Err(e) => {
1345 warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1346 return default_package_data(path);
1347 }
1348 };
1349
1350 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1351
1352 let (size, sha256) = calculate_file_checksums(path);
1353 package_data.size = size;
1354 package_data.sha256 = sha256;
1355
1356 if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1357 &validated_entries,
1358 &[
1359 "EGG-INFO/installed-files.txt",
1360 ".egg-info/installed-files.txt",
1361 ],
1362 ) && let Ok(installed_files_content) =
1363 read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1364 {
1365 package_data.file_references = parse_installed_files_txt(&installed_files_content);
1366 }
1367
1368 if let Some(egg_info) = parse_egg_filename(path) {
1369 if package_data.name.is_none() {
1370 package_data.name = Some(egg_info.name.clone());
1371 }
1372 if package_data.version.is_none() {
1373 package_data.version = Some(egg_info.version.clone());
1374 }
1375
1376 if let Some(python_version) = &egg_info.python_version {
1377 let mut extra_data = package_data.extra_data.unwrap_or_default();
1378 extra_data.insert(
1379 "python_version".to_string(),
1380 serde_json::Value::String(python_version.clone()),
1381 );
1382 package_data.extra_data = Some(extra_data);
1383 }
1384 }
1385
1386 package_data.purl = build_egg_purl(
1387 package_data.name.as_deref(),
1388 package_data.version.as_deref(),
1389 );
1390
1391 package_data
1392}
1393
1394fn find_validated_zip_entry_by_suffix<'a>(
1395 entries: &'a [ValidatedZipEntry],
1396 suffix: &str,
1397) -> Option<&'a ValidatedZipEntry> {
1398 entries.iter().find(|entry| entry.name.ends_with(suffix))
1399}
1400
1401fn find_validated_zip_entry_by_any_suffix<'a>(
1402 entries: &'a [ValidatedZipEntry],
1403 suffixes: &[&str],
1404) -> Option<&'a ValidatedZipEntry> {
1405 entries
1406 .iter()
1407 .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1408}
1409
1410fn read_validated_zip_entry<R: Read + std::io::Seek>(
1411 archive: &mut ZipArchive<R>,
1412 entry: &ValidatedZipEntry,
1413 path: &Path,
1414 archive_type: &str,
1415) -> Result<String, String> {
1416 let mut file = archive
1417 .by_index(entry.index)
1418 .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1419
1420 let compressed_size = file.compressed_size();
1421 let uncompressed_size = file.size();
1422
1423 if compressed_size > 0 {
1424 let ratio = uncompressed_size as f64 / compressed_size as f64;
1425 if ratio > MAX_COMPRESSION_RATIO {
1426 return Err(format!(
1427 "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1428 archive_type, path, ratio
1429 ));
1430 }
1431 }
1432
1433 if uncompressed_size > MAX_FILE_SIZE {
1434 return Err(format!(
1435 "Rejected oversized entry in {} {:?}: {} bytes",
1436 archive_type, path, uncompressed_size
1437 ));
1438 }
1439
1440 read_limited_utf8(
1441 &mut file,
1442 MAX_FILE_SIZE,
1443 &format!("{} entry {}", archive_type, entry.name),
1444 )
1445}
1446
/// Reads at most `max_bytes` bytes from `reader` and decodes them as UTF-8.
///
/// One byte beyond the limit is requested so that an over-long stream can be
/// told apart from one that is exactly `max_bytes` long. `context` is a label
/// used only in error messages.
///
/// # Errors
/// Returns a message when reading fails, when the stream holds more than
/// `max_bytes` bytes, or when the bytes are not valid UTF-8.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    // `saturating_add` keeps `max_bytes == u64::MAX` from overflowing (a panic
    // in debug builds, a wrap to a zero-byte read in release builds).
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if bytes.len() as u64 > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1467
/// Converts an archive entry path into a clean, relative, `/`-separated path.
///
/// Backslashes are treated as separators and `.` components are dropped.
/// Returns `None` for paths that start with a drive prefix such as `C:/`, that
/// are absolute, that contain `..`, or that have no normal components left.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Reject `X:/...` explicitly; on non-Windows hosts `Path` would treat the
    // drive letter as an ordinary component.
    let has_drive_prefix = matches!(
        unified.as_bytes(),
        [drive, b':', b'/', ..] if drive.is_ascii_alphabetic()
    );
    if has_drive_prefix {
        return None;
    }

    let segments = Path::new(&unified)
        .components()
        .filter(|component| !matches!(component, Component::CurDir))
        .map(|component| match component {
            Component::Normal(segment) => Some(segment.to_string_lossy().into_owned()),
            _ => None,
        })
        .collect::<Option<Vec<String>>>()?;

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1489
1490pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1495 let mut reader = ReaderBuilder::new()
1496 .has_headers(false)
1497 .from_reader(content.as_bytes());
1498
1499 let mut file_references = Vec::new();
1500
1501 for result in reader.records() {
1502 match result {
1503 Ok(record) => {
1504 if record.len() < 3 {
1505 continue;
1506 }
1507
1508 let path = record.get(0).unwrap_or("").trim().to_string();
1509 if path.is_empty() {
1510 continue;
1511 }
1512
1513 let hash_field = record.get(1).unwrap_or("").trim();
1514 let size_field = record.get(2).unwrap_or("").trim();
1515
1516 let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1518 let parts: Vec<&str> = hash_field.split('=').collect();
1519 if parts.len() == 2 && parts[0] == "sha256" {
1520 match URL_SAFE_NO_PAD.decode(parts[1]) {
1522 Ok(decoded) => {
1523 let hex = decoded
1524 .iter()
1525 .map(|b| format!("{:02x}", b))
1526 .collect::<String>();
1527 Some(hex)
1528 }
1529 Err(_) => None,
1530 }
1531 } else {
1532 None
1533 }
1534 } else {
1535 None
1536 };
1537
1538 let size = if !size_field.is_empty() && size_field != "-" {
1540 size_field.parse::<u64>().ok()
1541 } else {
1542 None
1543 };
1544
1545 file_references.push(FileReference {
1546 path,
1547 size,
1548 sha1: None,
1549 md5: None,
1550 sha256,
1551 sha512: None,
1552 extra_data: None,
1553 });
1554 }
1555 Err(e) => {
1556 warn!("Failed to parse RECORD CSV row: {}", e);
1557 continue;
1558 }
1559 }
1560 }
1561
1562 file_references
1563}
1564
1565pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1568 content
1569 .lines()
1570 .map(|line| line.trim())
1571 .filter(|line| !line.is_empty())
1572 .map(|path| FileReference {
1573 path: path.to_string(),
1574 size: None,
1575 sha1: None,
1576 md5: None,
1577 sha256: None,
1578 sha512: None,
1579 extra_data: None,
1580 })
1581 .collect()
1582}
1583
1584pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1585 content
1586 .lines()
1587 .map(str::trim)
1588 .filter(|line| !line.is_empty())
1589 .map(|path| FileReference {
1590 path: path.to_string(),
1591 size: None,
1592 sha1: None,
1593 md5: None,
1594 sha256: None,
1595 sha512: None,
1596 extra_data: None,
1597 })
1598 .collect()
1599}
1600
/// Components parsed from a wheel file name.
struct WheelInfo {
    /// Distribution name with underscores replaced by hyphens.
    name: String,
    /// Version component, taken verbatim.
    version: String,
    /// Python tag, e.g. `py3` or `cp311`.
    python_tag: String,
    /// ABI tag, e.g. `none` or `cp311`.
    abi_tag: String,
    /// Platform tag; any extra `-`-separated components are joined back in.
    platform_tag: String,
}

/// Parses `{name}-{version}[-{build}]-{python}-{abi}-{platform}` from a wheel path.
///
/// The file stem is split on `-`. Fewer than five components yields `None`.
/// With six or more components, a third component starting with an ASCII digit
/// is treated as the optional build tag and skipped, so it is no longer
/// mistaken for the Python tag. The build tag itself is not retained.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    // Build tags must start with a digit; Python tags (`py3`, `cp311`, ...) do not.
    let has_build_tag =
        parts.len() >= 6 && parts[2].starts_with(|ch: char| ch.is_ascii_digit());
    let tags = if has_build_tag {
        &parts[3..]
    } else {
        &parts[2..]
    };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: tags[0].to_string(),
        abi_tag: tags[1].to_string(),
        platform_tag: tags[2..].join("-"),
    })
}
1625
/// Components parsed from an egg file name.
struct EggInfo {
    /// Distribution name with underscores replaced by hyphens.
    name: String,
    /// Version component, taken verbatim.
    version: String,
    /// Third `-`-separated component of the file stem, when present.
    python_version: Option<String>,
}

/// Parses `{name}-{version}[-{python_version}...]` from an egg path.
///
/// The file stem is split on `-`; `None` is returned when there is no stem or
/// fewer than two components. Components after the third are ignored.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut fields = stem.split('-');

    let raw_name = fields.next()?;
    let raw_version = fields.next()?;
    let python_version = fields.next().map(String::from);

    Some(EggInfo {
        name: raw_name.replace('_', "-"),
        version: raw_version.to_string(),
        python_version,
    })
}
1646
1647fn build_wheel_purl(
1648 name: Option<&str>,
1649 version: Option<&str>,
1650 wheel_info: &WheelInfo,
1651) -> Option<String> {
1652 let name = name?;
1653 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1654
1655 if let Some(ver) = version {
1656 package_url.with_version(ver).ok()?;
1657 }
1658
1659 let extension = format!(
1660 "{}-{}-{}",
1661 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1662 );
1663 package_url.add_qualifier("extension", extension).ok()?;
1664
1665 Some(package_url.to_string())
1666}
1667
1668fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1669 let name = name?;
1670 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1671
1672 if let Some(ver) = version {
1673 package_url.with_version(ver).ok()?;
1674 }
1675
1676 package_url.add_qualifier("type", "egg").ok()?;
1677
1678 Some(package_url.to_string())
1679}
1680
1681fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1682 let metadata = super::rfc822::parse_rfc822_content(content);
1683 build_package_data_from_rfc822(&metadata, datasource_id)
1684}
1685
/// Builds `PackageData` from already-parsed RFC 822 style Python metadata.
///
/// Reads the single-valued headers (name, version, summary, home-page, author,
/// author-email, license, license-expression, download-url, platform,
/// requires-python) and the multi-valued ones (classifier, license-file,
/// project-url), then derives the description, parties, license fields,
/// well-known URLs, `extra_data`, dependencies and PyPI URLs/purl from them.
/// The result is tagged with `datasource_id`.
fn build_package_data_from_rfc822(
    metadata: &super::rfc822::Rfc822Metadata,
    datasource_id: DatasourceId,
) -> PackageData {
    use super::rfc822::{get_header_all, get_header_first};

    let name = get_header_first(&metadata.headers, "name");
    let version = get_header_first(&metadata.headers, "version");
    let summary = get_header_first(&metadata.headers, "summary");
    // Mutable: may be filled from a Project-URL entry further down.
    let mut homepage_url = get_header_first(&metadata.headers, "home-page");
    let author = get_header_first(&metadata.headers, "author");
    let author_email = get_header_first(&metadata.headers, "author-email");
    let license = get_header_first(&metadata.headers, "license");
    let license_expression = get_header_first(&metadata.headers, "license-expression");
    let download_url = get_header_first(&metadata.headers, "download-url");
    let platform = get_header_first(&metadata.headers, "platform");
    let requires_python = get_header_first(&metadata.headers, "requires-python");
    let classifiers = get_header_all(&metadata.headers, "classifier");
    let license_files = get_header_all(&metadata.headers, "license-file");

    // The message body takes precedence; the `description` header is only
    // consulted when the body is empty.
    let description_body = if metadata.body.is_empty() {
        get_header_first(&metadata.headers, "description").unwrap_or_default()
    } else {
        metadata.body.clone()
    };

    let description = build_description(summary.as_deref(), &description_body);

    // A single author party is emitted when either the name or email is present.
    let mut parties = Vec::new();
    if author.is_some() || author_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: author,
            email: author_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    // Classifiers starting with "License ::" feed the license statement; all
    // others become keywords.
    let (keywords, license_classifiers) = split_classifiers(&classifiers);
    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
    // Prefer a successfully normalized License-Expression (with the license
    // files attached as referenced filenames); otherwise fall back to
    // `normalize_spdx_declared_license` on the raw header value.
    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
        license_expression
            .as_deref()
            .and_then(normalize_spdx_expression)
            .map(|normalized| {
                build_declared_license_data(
                    normalized,
                    DeclaredLicenseMatchMetadata::single_line(
                        license_expression.as_deref().unwrap_or_default(),
                    )
                    .with_referenced_filenames(&referenced_license_files),
                )
            })
            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));

    // The raw License-Expression wins; otherwise the statement is assembled
    // from the License header and the license classifiers.
    let extracted_license_statement = license_expression
        .clone()
        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));

    let mut extra_data = HashMap::new();
    // Empty and "unknown" (any ASCII case) platform values are not recorded.
    if let Some(platform_value) = platform
        && !platform_value.eq_ignore_ascii_case("unknown")
        && !platform_value.is_empty()
    {
        extra_data.insert(
            "platform".to_string(),
            serde_json::Value::String(platform_value),
        );
    }

    if let Some(requires_python_value) = requires_python
        && !requires_python_value.is_empty()
    {
        extra_data.insert(
            "requires_python".to_string(),
            serde_json::Value::String(requires_python_value),
        );
    }

    if !license_files.is_empty() {
        extra_data.insert(
            "license_files".to_string(),
            serde_json::Value::Array(
                license_files
                    .iter()
                    .cloned()
                    .map(serde_json::Value::String)
                    .collect(),
            ),
        );
    }

    // Every License-File entry also becomes a path-only file reference.
    let file_references = license_files
        .iter()
        .map(|path| FileReference {
            path: path.clone(),
            size: None,
            sha1: None,
            md5: None,
            sha256: None,
            sha512: None,
            extra_data: None,
        })
        .collect();

    let project_urls = get_header_all(&metadata.headers, "project-url");
    let dependencies = extract_rfc822_dependencies(&metadata.headers);
    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);

    if !project_urls.is_empty() {
        let parsed_urls = parse_project_urls(&project_urls);

        // Labels are matched case-insensitively. Within each category the
        // first matching entry wins; each entry is consumed by at most one
        // branch of the chain below.
        for (label, url) in &parsed_urls {
            let label_lower = label.to_lowercase();

            if bug_tracking_url.is_none()
                && matches!(
                    label_lower.as_str(),
                    "tracker"
                        | "bug reports"
                        | "bug tracker"
                        | "issues"
                        | "issue tracker"
                        | "github: issues"
                )
            {
                bug_tracking_url = Some(url.clone());
            } else if code_view_url.is_none()
                && matches!(label_lower.as_str(), "source" | "source code" | "code")
            {
                code_view_url = Some(url.clone());
            } else if vcs_url.is_none()
                && matches!(
                    label_lower.as_str(),
                    "github" | "gitlab" | "github: repo" | "repository"
                )
            {
                vcs_url = Some(url.clone());
            } else if homepage_url.is_none()
                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
            {
                // Only used when the Home-page header was absent.
                homepage_url = Some(url.clone());
            } else if label_lower == "changelog" {
                extra_data.insert(
                    "changelog_url".to_string(),
                    serde_json::Value::String(url.clone()),
                );
            }
        }

        // All parsed project URLs are preserved under their original labels.
        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
            .iter()
            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
            .collect();

        if !project_urls_json.is_empty() {
            extra_data.insert(
                "project_urls".to_string(),
                serde_json::Value::Object(project_urls_json),
            );
        }
    }

    // An empty map is reported as "no extra data".
    let extra_data = if extra_data.is_empty() {
        None
    } else {
        Some(extra_data)
    };

    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
        build_pypi_urls(name.as_deref(), version.as_deref());

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: Some("Python".to_string()),
        description,
        release_date: None,
        parties,
        keywords,
        homepage_url,
        download_url,
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url,
        code_view_url,
        vcs_url,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references,
        is_private: false,
        is_virtual: false,
        extra_data,
        dependencies,
        repository_homepage_url,
        repository_download_url,
        api_data_url,
        datasource_id: Some(datasource_id),
        purl,
    }
}
1911
/// Splits `Project-URL` header values into `(label, url)` pairs.
///
/// Each value is split at its first comma and both halves are trimmed, so
/// `"Docs, https://x"` and `"Docs,https://x"` are both accepted. Values with
/// no comma, an empty label or an empty URL are dropped. Input order is kept.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|url_entry| {
            // Split on the comma alone: the space after it is optional.
            let (label, url) = url_entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());
            if label.is_empty() || url.is_empty() {
                None
            } else {
                Some((label.to_string(), url.to_string()))
            }
        })
        .collect()
}
1927
/// Combines a summary and a body into one description.
///
/// Both parts are trimmed and blank parts are dropped. When both remain they
/// are joined with a single newline, summary first. Returns `None` when
/// nothing remains.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let summary_text = summary.map(str::trim).filter(|text| !text.is_empty());
    let body_text = Some(body.trim()).filter(|text| !text.is_empty());

    match (summary_text, body_text) {
        (Some(headline), Some(details)) => Some(format!("{}\n{}", headline, details)),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
1946
/// Separates classifiers into `(keywords, license_classifiers)`.
///
/// A classifier starting with `License ::` goes to the second list; every
/// other classifier goes to the first. Relative order is kept in both lists.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|classifier| classifier.starts_with("License ::"));

    (keywords, license_classifiers)
}
1961
/// Renders the license header and license classifiers as a small text block.
///
/// A non-blank `license` produces a `license: <value>` line. A non-empty
/// classifier list produces a `classifiers:` line followed by one quoted list
/// item per classifier. Every line ends with a newline. Returns `None` when
/// neither part produces output.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let mut statement = String::new();

    if let Some(value) = license.map(str::trim).filter(|text| !text.is_empty()) {
        statement.push_str(&format!("license: {}", value));
        statement.push('\n');
    }

    if !license_classifiers.is_empty() {
        statement.push_str("classifiers:");
        statement.push('\n');
        for classifier in license_classifiers {
            statement.push_str(&format!(" - '{}'", classifier));
            statement.push('\n');
        }
    }

    if statement.is_empty() {
        None
    } else {
        Some(statement)
    }
}
1987
1988pub(crate) fn build_pypi_urls(
1989 name: Option<&str>,
1990 version: Option<&str>,
1991) -> (
1992 Option<String>,
1993 Option<String>,
1994 Option<String>,
1995 Option<String>,
1996) {
1997 let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
1998
1999 let repository_download_url = name.and_then(|value| {
2000 version.map(|ver| {
2001 format!(
2002 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2003 &value[..1.min(value.len())],
2004 value,
2005 value,
2006 ver
2007 )
2008 })
2009 });
2010
2011 let api_data_url = name.map(|value| {
2012 if let Some(ver) = version {
2013 format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2014 } else {
2015 format!("https://pypi.org/pypi/{}/json", value)
2016 }
2017 });
2018
2019 let purl = name.and_then(|value| {
2020 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2021 if let Some(ver) = version {
2022 package_url.with_version(ver).ok()?;
2023 }
2024 Some(package_url.to_string())
2025 });
2026
2027 (
2028 repository_homepage_url,
2029 repository_download_url,
2030 api_data_url,
2031 purl,
2032 )
2033}
2034
2035fn build_pypi_purl_with_extension(
2036 name: &str,
2037 version: Option<&str>,
2038 extension: &str,
2039) -> Option<String> {
2040 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2041 if let Some(ver) = version {
2042 package_url.with_version(ver).ok()?;
2043 }
2044 package_url.add_qualifier("extension", extension).ok()?;
2045 Some(package_url.to_string())
2046}
2047
2048fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2049 let toml_content = match read_toml_file(path) {
2050 Ok(content) => content,
2051 Err(e) => {
2052 warn!(
2053 "Failed to read or parse pyproject.toml at {:?}: {}",
2054 path, e
2055 );
2056 return default_package_data(path);
2057 }
2058 };
2059
2060 let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2061 let is_poetry_pyproject = tool_table
2062 .and_then(|tool| tool.get("poetry"))
2063 .and_then(|value| value.as_table())
2064 .is_some();
2065
2066 let project_table =
2068 if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2069 project.clone()
2071 } else if let Some(tool) = tool_table {
2072 if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2073 poetry.clone()
2075 } else {
2076 return default_package_data(path);
2077 }
2078 } else if toml_content.get(FIELD_NAME).is_some() {
2079 match toml_content.as_table() {
2081 Some(table) => table.clone(),
2082 None => {
2083 warn!("Failed to convert TOML content to table in {:?}", path);
2084 return default_package_data(path);
2085 }
2086 }
2087 } else {
2088 return default_package_data(path);
2089 };
2090
2091 let name = project_table
2092 .get(FIELD_NAME)
2093 .and_then(|v| v.as_str())
2094 .map(String::from);
2095
2096 let version = project_table
2097 .get(FIELD_VERSION)
2098 .and_then(|v| v.as_str())
2099 .map(String::from);
2100 let classifiers = project_table
2101 .get("classifiers")
2102 .and_then(|value| value.as_array())
2103 .map(|values| {
2104 values
2105 .iter()
2106 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2107 .collect::<Vec<_>>()
2108 })
2109 .unwrap_or_default();
2110
2111 let extracted_license_statement = extract_raw_license_string(&project_table);
2112 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2113 normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2114
2115 let (homepage_url, repository_url) = extract_urls(&project_table);
2117
2118 let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2119 let extra_data = extract_pyproject_extra_data(&toml_content);
2120
2121 let purl = name.as_ref().and_then(|n| {
2123 let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2124 Ok(p) => p,
2125 Err(e) => {
2126 warn!(
2127 "Failed to create PackageUrl for Python package '{}': {}",
2128 n, e
2129 );
2130 return None;
2131 }
2132 };
2133
2134 if let Some(v) = &version
2135 && let Err(e) = package_url.with_version(v)
2136 {
2137 warn!(
2138 "Failed to set version '{}' for Python package '{}': {}",
2139 v, n, e
2140 );
2141 return None;
2142 }
2143
2144 Some(package_url.to_string())
2145 });
2146
2147 let api_data_url = name.as_ref().map(|n| {
2148 if let Some(v) = &version {
2149 format!("https://pypi.org/pypi/{}/{}/json", n, v)
2150 } else {
2151 format!("https://pypi.org/pypi/{}/json", n)
2152 }
2153 });
2154
2155 let pypi_homepage_url = name
2156 .as_ref()
2157 .map(|n| format!("https://pypi.org/project/{}", n));
2158
2159 let pypi_download_url = name.as_ref().and_then(|n| {
2160 version.as_ref().map(|v| {
2161 format!(
2162 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2163 &n[..1.min(n.len())],
2164 n,
2165 n,
2166 v
2167 )
2168 })
2169 });
2170
2171 PackageData {
2172 package_type: Some(PythonParser::PACKAGE_TYPE),
2173 namespace: None,
2174 name,
2175 version,
2176 qualifiers: None,
2177 subpath: None,
2178 primary_language: None,
2179 description: None,
2180 release_date: None,
2181 parties: extract_parties(&project_table),
2182 keywords: Vec::new(),
2183 homepage_url: homepage_url.or(pypi_homepage_url),
2184 download_url: repository_url.clone().or(pypi_download_url),
2185 size: None,
2186 sha1: None,
2187 md5: None,
2188 sha256: None,
2189 sha512: None,
2190 bug_tracking_url: None,
2191 code_view_url: None,
2192 vcs_url: repository_url,
2193 copyright: None,
2194 holder: None,
2195 declared_license_expression,
2196 declared_license_expression_spdx,
2197 license_detections,
2198 other_license_expression: None,
2199 other_license_expression_spdx: None,
2200 other_license_detections: Vec::new(),
2201 extracted_license_statement,
2202 notice_text: None,
2203 source_packages: Vec::new(),
2204 file_references: Vec::new(),
2205 is_private: has_private_classifier(&classifiers),
2206 is_virtual: false,
2207 extra_data,
2208 dependencies: [dependencies, optional_dependencies].concat(),
2209 repository_homepage_url: None,
2210 repository_download_url: None,
2211 api_data_url,
2212 datasource_id: Some(if is_poetry_pyproject {
2213 DatasourceId::PypiPoetryPyprojectToml
2214 } else {
2215 DatasourceId::PypiPyprojectToml
2216 }),
2217 purl,
2218 }
2219}
2220
2221fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2222 let path_str = path.to_string_lossy().replace('\\', "/");
2223 if path_str.contains("/EGG-INFO/PKG-INFO") {
2224 DatasourceId::PypiEggPkginfo
2225 } else if path_str.ends_with(".egg-info/PKG-INFO") {
2226 DatasourceId::PypiEditableEggPkginfo
2227 } else {
2228 DatasourceId::PypiSdistPkginfo
2229 }
2230}
2231
2232fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2233 project
2234 .get(FIELD_LICENSE)
2235 .and_then(|license_value| match license_value {
2236 TomlValue::String(license_str) => Some(license_str.clone()),
2237 TomlValue::Table(license_table) => license_table
2238 .get("text")
2239 .and_then(|v| v.as_str())
2240 .map(|s| s.to_string())
2241 .or_else(|| {
2242 license_table
2243 .get("expression")
2244 .and_then(|v| v.as_str())
2245 .map(|expr| expr.to_string())
2246 }),
2247 _ => None,
2248 })
2249}
2250
2251fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2252 match project.get(FIELD_LICENSE) {
2253 Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2254 Some(TomlValue::Table(license_table)) => license_table
2255 .get("expression")
2256 .and_then(|value| value.as_str()),
2257 _ => None,
2258 }
2259}
2260
2261fn extract_urls(project: &TomlMap<String, TomlValue>) -> (Option<String>, Option<String>) {
2262 let mut homepage_url = None;
2263 let mut repository_url = None;
2264
2265 if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2267 homepage_url = urls
2268 .get(FIELD_HOMEPAGE)
2269 .and_then(|v| v.as_str())
2270 .map(String::from);
2271 repository_url = urls
2272 .get(FIELD_REPOSITORY)
2273 .and_then(|v| v.as_str())
2274 .map(String::from);
2275 }
2276
2277 if homepage_url.is_none() {
2279 homepage_url = project
2280 .get(FIELD_HOMEPAGE)
2281 .and_then(|v| v.as_str())
2282 .map(String::from);
2283 }
2284
2285 if repository_url.is_none() {
2286 repository_url = project
2287 .get(FIELD_REPOSITORY)
2288 .and_then(|v| v.as_str())
2289 .map(String::from);
2290 }
2291
2292 (homepage_url, repository_url)
2293}
2294
2295fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2296 let mut parties = Vec::new();
2297
2298 if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2299 for author in authors {
2300 if let Some(author_str) = author.as_str() {
2301 let (name, email) = split_name_email(author_str);
2302 parties.push(Party {
2303 r#type: None,
2304 role: Some("author".to_string()),
2305 name,
2306 email,
2307 url: None,
2308 organization: None,
2309 organization_url: None,
2310 timezone: None,
2311 });
2312 }
2313 }
2314 }
2315
2316 if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2317 for maintainer in maintainers {
2318 if let Some(maintainer_str) = maintainer.as_str() {
2319 let (name, email) = split_name_email(maintainer_str);
2320 parties.push(Party {
2321 r#type: None,
2322 role: Some("maintainer".to_string()),
2323 name,
2324 email,
2325 url: None,
2326 organization: None,
2327 organization_url: None,
2328 timezone: None,
2329 });
2330 }
2331 }
2332 }
2333
2334 parties
2335}
2336
2337fn extract_dependencies(
2338 project: &TomlMap<String, TomlValue>,
2339 toml_content: &TomlValue,
2340) -> (Vec<Dependency>, Vec<Dependency>) {
2341 let mut dependencies = Vec::new();
2342 let mut optional_dependencies = Vec::new();
2343
2344 if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2346 match deps_value {
2347 TomlValue::Array(arr) => {
2348 dependencies = parse_dependency_array(arr, false, None);
2349 }
2350 TomlValue::Table(table) => {
2351 dependencies = parse_dependency_table(table, false, None);
2352 }
2353 _ => {}
2354 }
2355 }
2356
2357 if let Some(opt_deps_table) = project
2359 .get(FIELD_OPTIONAL_DEPENDENCIES)
2360 .and_then(|v| v.as_table())
2361 {
2362 for (extra_name, deps) in opt_deps_table {
2363 match deps {
2364 TomlValue::Array(arr) => {
2365 optional_dependencies.extend(parse_dependency_array(
2366 arr,
2367 true,
2368 Some(extra_name),
2369 ));
2370 }
2371 TomlValue::Table(table) => {
2372 optional_dependencies.extend(parse_dependency_table(
2373 table,
2374 true,
2375 Some(extra_name),
2376 ));
2377 }
2378 _ => {}
2379 }
2380 }
2381 }
2382
2383 if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2385 match dev_deps_value {
2386 TomlValue::Array(arr) => {
2387 optional_dependencies.extend(parse_dependency_array(
2388 arr,
2389 true,
2390 Some(FIELD_DEV_DEPENDENCIES),
2391 ));
2392 }
2393 TomlValue::Table(table) => {
2394 optional_dependencies.extend(parse_dependency_table(
2395 table,
2396 true,
2397 Some(FIELD_DEV_DEPENDENCIES),
2398 ));
2399 }
2400 _ => {}
2401 }
2402 }
2403
2404 if let Some(groups_table) = project.get("group").and_then(|v| v.as_table()) {
2406 for (group_name, group_data) in groups_table {
2407 if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2408 match group_deps {
2409 TomlValue::Array(arr) => {
2410 optional_dependencies.extend(parse_dependency_array(
2411 arr,
2412 true,
2413 Some(group_name),
2414 ));
2415 }
2416 TomlValue::Table(table) => {
2417 optional_dependencies.extend(parse_dependency_table(
2418 table,
2419 true,
2420 Some(group_name),
2421 ));
2422 }
2423 _ => {}
2424 }
2425 }
2426 }
2427 }
2428
2429 if let Some(groups_table) = toml_content
2430 .get(FIELD_DEPENDENCY_GROUPS)
2431 .and_then(|value| value.as_table())
2432 {
2433 for (group_name, deps) in groups_table {
2434 match deps {
2435 TomlValue::Array(arr) => {
2436 optional_dependencies.extend(parse_dependency_array(
2437 arr,
2438 true,
2439 Some(group_name),
2440 ));
2441 }
2442 TomlValue::Table(table) => {
2443 optional_dependencies.extend(parse_dependency_table(
2444 table,
2445 true,
2446 Some(group_name),
2447 ));
2448 }
2449 _ => {}
2450 }
2451 }
2452 }
2453
2454 if let Some(dev_deps_value) = toml_content
2455 .get("tool")
2456 .and_then(|value| value.as_table())
2457 .and_then(|tool| tool.get("uv"))
2458 .and_then(|value| value.as_table())
2459 .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2460 {
2461 match dev_deps_value {
2462 TomlValue::Array(arr) => {
2463 optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2464 }
2465 TomlValue::Table(table) => {
2466 optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2467 }
2468 _ => {}
2469 }
2470 }
2471
2472 (dependencies, optional_dependencies)
2473}
2474
2475fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2476 let mut extra_data = HashMap::new();
2477
2478 if let Some(tool_uv) = toml_content
2479 .get("tool")
2480 .and_then(|value| value.as_table())
2481 .and_then(|tool| tool.get("uv"))
2482 {
2483 extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2484 }
2485
2486 if extra_data.is_empty() {
2487 None
2488 } else {
2489 Some(extra_data)
2490 }
2491}
2492
2493fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2494 match value {
2495 TomlValue::String(value) => JsonValue::String(value.clone()),
2496 TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2497 TomlValue::Float(value) => JsonValue::String(value.to_string()),
2498 TomlValue::Boolean(value) => JsonValue::Bool(*value),
2499 TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2500 TomlValue::Array(values) => {
2501 JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2502 }
2503 TomlValue::Table(values) => JsonValue::Object(
2504 values
2505 .iter()
2506 .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2507 .collect::<JsonMap<String, JsonValue>>(),
2508 ),
2509 }
2510}
2511
2512fn parse_dependency_table(
2513 table: &TomlMap<String, TomlValue>,
2514 is_optional: bool,
2515 scope: Option<&str>,
2516) -> Vec<Dependency> {
2517 table
2518 .iter()
2519 .filter_map(|(name, version)| {
2520 let version_str = version.as_str().map(|s| s.to_string());
2521 let mut package_url =
2522 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2523
2524 if let Some(v) = &version_str {
2525 package_url.with_version(v).ok()?;
2526 }
2527
2528 Some(Dependency {
2529 purl: Some(package_url.to_string()),
2530 extracted_requirement: None,
2531 scope: scope.map(|s| s.to_string()),
2532 is_runtime: Some(!is_optional),
2533 is_optional: Some(is_optional),
2534 is_pinned: None,
2535 is_direct: Some(true),
2536 resolved_package: None,
2537 extra_data: None,
2538 })
2539 })
2540 .collect()
2541}
2542
2543fn parse_dependency_array(
2544 array: &[TomlValue],
2545 is_optional: bool,
2546 scope: Option<&str>,
2547) -> Vec<Dependency> {
2548 array
2549 .iter()
2550 .filter_map(|dep| {
2551 let dep_str = dep.as_str()?;
2552
2553 let mut parts = dep_str.split(['>', '=', '<', '~']);
2554 let name = parts.next()?.trim().to_string();
2555
2556 let version = parts.next().map(|v| v.trim().to_string());
2557
2558 let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name)
2559 {
2560 Ok(purl) => purl,
2561 Err(_) => return None,
2562 };
2563
2564 if let Some(ref v) = version {
2565 package_url.with_version(v).ok()?;
2566 }
2567
2568 Some(Dependency {
2569 purl: Some(package_url.to_string()),
2570 extracted_requirement: None,
2571 scope: scope.map(|s| s.to_string()),
2572 is_runtime: Some(!is_optional),
2573 is_optional: Some(is_optional),
2574 is_pinned: None,
2575 is_direct: Some(true),
2576 resolved_package: None,
2577 extra_data: None,
2578 })
2579 })
2580 .collect()
2581}
2582
/// A Python literal value as recovered from `setup.py` by `LiteralEvaluator`.
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// An integer or float literal; both are stored as `f64`.
    Number(f64),
    /// A `True` / `False` literal.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list display whose elements all evaluated successfully.
    List(Vec<Value>),
    /// A tuple display whose elements all evaluated successfully.
    Tuple(Vec<Value>),
    /// A dictionary; keys are stored in their string form.
    Dict(HashMap<String, Value>),
}
2593
/// Evaluates a restricted subset of Python expressions to `Value`s, bounded
/// by a depth limit and a total node budget.
struct LiteralEvaluator {
    /// Known name -> value bindings used to resolve bare name expressions.
    constants: HashMap<String, Value>,
    /// Evaluation gives up once the current depth reaches this value.
    max_depth: usize,
    /// Evaluation gives up once `nodes_visited` reaches this value.
    max_nodes: usize,
    /// Number of expressions evaluated so far over the evaluator's lifetime.
    nodes_visited: usize,
}
2600
2601impl LiteralEvaluator {
2602 fn new(constants: HashMap<String, Value>) -> Self {
2603 Self {
2604 constants,
2605 max_depth: MAX_SETUP_PY_AST_DEPTH,
2606 max_nodes: MAX_SETUP_PY_AST_NODES,
2607 nodes_visited: 0,
2608 }
2609 }
2610
2611 fn insert_constant(&mut self, name: String, value: Value) {
2612 self.constants.insert(name, value);
2613 }
2614
2615 fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
2616 if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
2617 return None;
2618 }
2619 self.nodes_visited += 1;
2620
2621 match expr {
2622 ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
2623 Some(Value::String(value.to_str().to_string()))
2624 }
2625 ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
2626 Some(Value::Bool(*value))
2627 }
2628 ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
2629 self.evaluate_number(value)
2630 }
2631 ast::Expr::NoneLiteral(_) => Some(Value::None),
2632 ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
2633 ast::Expr::List(ast::ExprList { elts, .. }) => {
2634 let mut values = Vec::new();
2635 for elt in elts {
2636 values.push(self.evaluate_expr(elt, depth + 1)?);
2637 }
2638 Some(Value::List(values))
2639 }
2640 ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
2641 let mut values = Vec::new();
2642 for elt in elts {
2643 values.push(self.evaluate_expr(elt, depth + 1)?);
2644 }
2645 Some(Value::Tuple(values))
2646 }
2647 ast::Expr::Dict(ast::ExprDict { items, .. }) => {
2648 let mut dict = HashMap::new();
2649 for item in items {
2650 let key_expr = item.key.as_ref()?;
2651 let key_value = self.evaluate_expr(key_expr, depth + 1)?;
2652 let key = value_to_string(&key_value)?;
2653 let value = self.evaluate_expr(&item.value, depth + 1)?;
2654 dict.insert(key, value);
2655 }
2656 Some(Value::Dict(dict))
2657 }
2658 ast::Expr::Call(ast::ExprCall {
2659 func, arguments, ..
2660 }) => {
2661 let args = arguments.args.as_ref();
2662 let keywords = arguments.keywords.as_ref();
2663 if keywords.is_empty()
2664 && let Some(name) = dotted_name(func.as_ref(), depth + 1)
2665 && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
2666 {
2667 return self.evaluate_ordered_dict(args, depth + 1);
2668 }
2669
2670 if !args.is_empty() {
2671 return None;
2672 }
2673
2674 if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
2675 && id == "dict"
2676 {
2677 let mut dict = HashMap::new();
2678 for keyword in keywords {
2679 let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
2680 let value = self.evaluate_expr(&keyword.value, depth + 1)?;
2681 dict.insert(key.to_string(), value);
2682 }
2683 return Some(Value::Dict(dict));
2684 }
2685
2686 None
2687 }
2688 _ => None,
2689 }
2690 }
2691
2692 fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
2693 match number {
2694 ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
2695 ast::Number::Float(value) => Some(Value::Number(*value)),
2696 ast::Number::Complex { .. } => None,
2697 }
2698 }
2699
2700 fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
2701 if args.len() != 1 {
2702 return None;
2703 }
2704
2705 let items = match self.evaluate_expr(&args[0], depth)? {
2706 Value::List(items) | Value::Tuple(items) => items,
2707 _ => return None,
2708 };
2709
2710 let mut dict = HashMap::new();
2711 for item in items {
2712 let Value::Tuple(values) = item else {
2713 return None;
2714 };
2715 if values.len() != 2 {
2716 return None;
2717 }
2718 let key = value_to_string(&values[0])?;
2719 dict.insert(key, values[1].clone());
2720 }
2721
2722 Some(Value::Dict(dict))
2723 }
2724}
2725
/// Names through which `setup()` can be reached in a `setup.py` module.
#[derive(Default)]
struct SetupAliases {
    /// Bare names that refer to the `setup` function itself.
    setup_names: HashSet<String>,
    /// Local module alias -> imported module name, for `import x as y` forms.
    module_aliases: HashMap<String, String>,
}
2731
2732fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
2733 extract_from_setup_py(path).into_iter().collect()
2734}
2735
2736fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
2737 let content = match read_file_to_string(path) {
2738 Ok(content) => content,
2739 Err(e) => {
2740 warn!("Failed to read setup.py at {:?}: {}", path, e);
2741 return Some(default_package_data(path));
2742 }
2743 };
2744
2745 if content.len() > MAX_SETUP_PY_BYTES {
2746 warn!("setup.py too large at {:?}: {} bytes", path, content.len());
2747 let package_data = extract_from_setup_py_regex(&content);
2748 return should_emit_setup_py_package(&package_data).then_some(package_data);
2749 }
2750
2751 let mut package_data = match extract_from_setup_py_ast(&content) {
2752 Ok(Some(data)) => data,
2753 Ok(None) => return Some(default_package_data(path)),
2754 Err(e) => {
2755 warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
2756 extract_from_setup_py_regex(&content)
2757 }
2758 };
2759
2760 if package_data.name.is_none() {
2761 package_data.name = extract_setup_value(&content, "name");
2762 }
2763
2764 if package_data.version.is_none() {
2765 package_data.version = extract_setup_value(&content, "version");
2766 }
2767
2768 fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
2769
2770 if package_data.purl.is_none() {
2771 package_data.purl = build_setup_py_purl(
2772 package_data.name.as_deref(),
2773 package_data.version.as_deref(),
2774 );
2775 }
2776
2777 if should_emit_setup_py_package(&package_data) {
2778 Some(package_data)
2779 } else {
2780 Some(default_package_data(path))
2781 }
2782}
2783
2784fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
2785 package_data.name.is_some()
2786 || package_data.version.is_some()
2787 || package_data.purl.is_some()
2788 || !package_data.dependencies.is_empty()
2789 || package_data.extracted_license_statement.is_some()
2790 || !package_data.license_detections.is_empty()
2791 || !package_data.parties.is_empty()
2792 || package_data.description.is_some()
2793 || package_data.homepage_url.is_some()
2794 || package_data.bug_tracking_url.is_some()
2795 || package_data.code_view_url.is_some()
2796 || package_data.vcs_url.is_some()
2797}
2798
2799fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
2800 if package_data.version.is_some()
2801 && package_data.extracted_license_statement.is_some()
2802 && package_data
2803 .parties
2804 .iter()
2805 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
2806 {
2807 return;
2808 }
2809
2810 let Some(root) = path.parent() else {
2811 return;
2812 };
2813
2814 let dunder_metadata = collect_sibling_dunder_metadata(root, content);
2815
2816 if package_data.version.is_none() {
2817 package_data.version = dunder_metadata.version;
2818 }
2819
2820 if package_data.extracted_license_statement.is_none() {
2821 package_data.extracted_license_statement = dunder_metadata.license;
2822 }
2823
2824 let has_author = package_data
2825 .parties
2826 .iter()
2827 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
2828
2829 if !has_author && let Some(author) = dunder_metadata.author {
2830 package_data.parties.push(Party {
2831 r#type: Some("person".to_string()),
2832 role: Some("author".to_string()),
2833 name: Some(author),
2834 email: None,
2835 url: None,
2836 organization: None,
2837 organization_url: None,
2838 timezone: None,
2839 });
2840 }
2841}
2842
/// Values of module-level dunder assignments collected from imported modules.
#[derive(Default)]
struct DunderMetadata {
    /// Value assigned to `__version__`, if found.
    version: Option<String>,
    /// Value assigned to `__author__`, if found.
    author: Option<String>,
    /// Value assigned to `__license__`, if found.
    license: Option<String>,
}
2849
2850fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
2851 let statements = match parse_module(content) {
2852 Ok(parsed) => parsed.into_suite(),
2853 Err(_) => return DunderMetadata::default(),
2854 };
2855
2856 let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2857 let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2858 let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2859 let mut metadata = DunderMetadata::default();
2860
2861 for module in imported_dunder_modules(&statements) {
2862 let Some(path) = resolve_imported_module_path(root, &module) else {
2863 continue;
2864 };
2865 let Ok(module_content) = read_file_to_string(&path) else {
2866 continue;
2867 };
2868
2869 if metadata.version.is_none() {
2870 metadata.version = version_re
2871 .as_ref()
2872 .and_then(|regex| regex.captures(&module_content))
2873 .and_then(|captures| captures.get(1))
2874 .map(|match_| match_.as_str().to_string());
2875 }
2876
2877 if metadata.author.is_none() {
2878 metadata.author = author_re
2879 .as_ref()
2880 .and_then(|regex| regex.captures(&module_content))
2881 .and_then(|captures| captures.get(1))
2882 .map(|match_| match_.as_str().to_string());
2883 }
2884
2885 if metadata.license.is_none() {
2886 metadata.license = license_re
2887 .as_ref()
2888 .and_then(|regex| regex.captures(&module_content))
2889 .and_then(|captures| captures.get(1))
2890 .map(|match_| match_.as_str().to_string());
2891 }
2892
2893 if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
2894 return metadata;
2895 }
2896 }
2897
2898 metadata
2899}
2900
2901fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
2902 let mut modules = Vec::new();
2903
2904 for statement in statements {
2905 let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
2906 continue;
2907 };
2908 let Some(module) = module.as_ref().map(|name| name.as_str()) else {
2909 continue;
2910 };
2911 let imports_dunder = names.iter().any(|alias| {
2912 matches!(
2913 alias.name.as_str(),
2914 "__version__" | "__author__" | "__license__"
2915 )
2916 });
2917 if imports_dunder {
2918 modules.push(module.to_string());
2919 }
2920 }
2921
2922 modules
2923}
2924
/// Maps a dotted module name to the first existing file among, in order:
/// `<root>/<module>.py`, `<root>/<module>/__init__.py`,
/// `<root>/src/<module>.py` and `<root>/src/<module>/__init__.py`,
/// where dots in `module` become path separators.
///
/// Returns `None` when none of the candidates exists.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let relative: PathBuf = module.split('.').collect();
    let module_file = relative.with_extension("py");
    let package_init = relative.join("__init__.py");
    let src_root = root.join("src");

    let candidates = [
        root.join(&module_file),
        root.join(&package_init),
        src_root.join(&module_file),
        src_root.join(&package_init),
    ];

    for candidate in candidates {
        if candidate.exists() {
            return Some(candidate);
        }
    }

    None
}
2936
2937fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
2953 let statements = parse_module(content)
2954 .map(|parsed| parsed.into_suite())
2955 .map_err(|e| e.to_string())?;
2956 let aliases = collect_setup_aliases(&statements);
2957 let mut evaluator = LiteralEvaluator::new(HashMap::new());
2958 build_setup_py_constants(&statements, &mut evaluator);
2959
2960 let setup_call = find_setup_call(&statements, &aliases);
2961 let Some(call_expr) = setup_call else {
2962 return Ok(None);
2963 };
2964
2965 let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
2966 Ok(Some(build_setup_py_package_data(&setup_values)))
2967}
2968
2969fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
2970 for stmt in statements {
2971 if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
2972 if targets.len() != 1 {
2973 continue;
2974 }
2975
2976 let Some(name) = extract_assign_name(&targets[0]) else {
2977 continue;
2978 };
2979
2980 if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
2981 evaluator.insert_constant(name, value);
2982 }
2983 }
2984 }
2985}
2986
2987fn extract_assign_name(target: &ast::Expr) -> Option<String> {
2988 match target {
2989 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
2990 _ => None,
2991 }
2992}
2993
2994fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
2995 let mut aliases = SetupAliases::default();
2996 aliases.setup_names.insert("setup".to_string());
2997
2998 for stmt in statements {
2999 match stmt {
3000 ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3001 for alias in names {
3002 let module_name = alias.name.as_str();
3003 if !is_setup_module(module_name) {
3004 continue;
3005 }
3006 let alias_name = alias
3007 .asname
3008 .as_ref()
3009 .map(|name| name.as_str())
3010 .unwrap_or(module_name);
3011 aliases
3012 .module_aliases
3013 .insert(alias_name.to_string(), module_name.to_string());
3014 }
3015 }
3016 ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3017 let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3018 continue;
3019 };
3020 if !is_setup_module(module_name) {
3021 continue;
3022 }
3023 for alias in names {
3024 if alias.name.as_str() != "setup" {
3025 continue;
3026 }
3027 let alias_name = alias
3028 .asname
3029 .as_ref()
3030 .map(|name| name.as_str())
3031 .unwrap_or("setup");
3032 aliases.setup_names.insert(alias_name.to_string());
3033 }
3034 }
3035 _ => {}
3036 }
3037 }
3038
3039 aliases
3040}
3041
/// Reports whether `module_name` is one of the modules accepted as a source
/// of `setup()`: `setuptools`, `distutils` or `distutils.core`.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3045
3046fn find_setup_call<'a>(
3047 statements: &'a [ast::Stmt],
3048 aliases: &'a SetupAliases,
3049) -> Option<&'a ast::Expr> {
3050 let mut finder = SetupCallFinder {
3051 aliases,
3052 called_function_names: collect_top_level_called_function_names(statements),
3053 nodes_visited: 0,
3054 };
3055 finder.find_in_statements(statements)
3056}
3057
3058fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3059 let mut called = HashSet::new();
3060 collect_called_function_names_in_statements(statements, &mut called);
3061 called
3062}
3063
3064fn collect_called_function_names_in_statements(
3065 statements: &[ast::Stmt],
3066 called: &mut HashSet<String>,
3067) {
3068 for stmt in statements {
3069 match stmt {
3070 ast::Stmt::Expr(ast::StmtExpr { value, .. })
3071 | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3072 collect_called_function_names_in_expr(value.as_ref(), called);
3073 }
3074 ast::Stmt::If(ast::StmtIf {
3075 body,
3076 elif_else_clauses,
3077 ..
3078 }) => {
3079 collect_called_function_names_in_statements(body, called);
3080 for clause in elif_else_clauses {
3081 collect_called_function_names_in_statements(&clause.body, called);
3082 }
3083 }
3084 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3085 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3086 collect_called_function_names_in_statements(body, called);
3087 collect_called_function_names_in_statements(orelse, called);
3088 }
3089 ast::Stmt::With(ast::StmtWith { body, .. }) => {
3090 collect_called_function_names_in_statements(body, called);
3091 }
3092 ast::Stmt::Try(ast::StmtTry {
3093 body,
3094 orelse,
3095 finalbody,
3096 handlers,
3097 ..
3098 }) => {
3099 collect_called_function_names_in_statements(body, called);
3100 collect_called_function_names_in_statements(orelse, called);
3101 collect_called_function_names_in_statements(finalbody, called);
3102 for handler in handlers {
3103 let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3104 body,
3105 ..
3106 }) = handler;
3107 collect_called_function_names_in_statements(body, called);
3108 }
3109 }
3110 _ => {}
3111 }
3112 }
3113}
3114
3115fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3116 if let ast::Expr::Call(ast::ExprCall {
3117 func, arguments, ..
3118 }) = expr
3119 {
3120 if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3121 called.insert(id.as_str().to_string());
3122 }
3123
3124 for arg in arguments.args.iter() {
3125 collect_called_function_names_in_expr(arg, called);
3126 }
3127 for keyword in arguments.keywords.iter() {
3128 collect_called_function_names_in_expr(&keyword.value, called);
3129 }
3130 }
3131}
3132
/// Walks statements looking for a `setup(...)` call, under a node budget.
struct SetupCallFinder<'a> {
    /// Names and module aliases through which `setup` may be called.
    aliases: &'a SetupAliases,
    /// Function definitions are only searched when their name is in this set.
    called_function_names: HashSet<String>,
    /// Statements and expressions visited so far; the search stops once this
    /// reaches `MAX_SETUP_PY_AST_NODES`.
    nodes_visited: usize,
}
3138
3139impl<'a> SetupCallFinder<'a> {
3140 fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3141 for stmt in statements {
3142 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3143 return None;
3144 }
3145 self.nodes_visited += 1;
3146
3147 let found = match stmt {
3148 ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3149 ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3150 ast::Stmt::If(ast::StmtIf {
3151 body,
3152 elif_else_clauses,
3153 ..
3154 }) => self.find_in_statements(body).or_else(|| {
3155 for clause in elif_else_clauses {
3156 if let Some(found) = self.find_in_statements(&clause.body) {
3157 return Some(found);
3158 }
3159 }
3160 None
3161 }),
3162 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3163 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3164 .find_in_statements(body)
3165 .or_else(|| self.find_in_statements(orelse)),
3166 ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3167 .called_function_names
3168 .contains(name.as_str())
3169 .then(|| self.find_in_statements(body))
3170 .flatten(),
3171 ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3172 ast::Stmt::Try(ast::StmtTry {
3173 body,
3174 orelse,
3175 finalbody,
3176 handlers,
3177 ..
3178 }) => self
3179 .find_in_statements(body)
3180 .or_else(|| self.find_in_statements(orelse))
3181 .or_else(|| self.find_in_statements(finalbody))
3182 .or_else(|| {
3183 for handler in handlers {
3184 let ast::ExceptHandler::ExceptHandler(
3185 ast::ExceptHandlerExceptHandler { body, .. },
3186 ) = handler;
3187 if let Some(found) = self.find_in_statements(body) {
3188 return Some(found);
3189 }
3190 }
3191 None
3192 }),
3193 _ => None,
3194 };
3195
3196 if found.is_some() {
3197 return found;
3198 }
3199 }
3200
3201 None
3202 }
3203
3204 fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3205 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3206 return None;
3207 }
3208 self.nodes_visited += 1;
3209
3210 match expr {
3211 ast::Expr::Call(ast::ExprCall { func, .. })
3212 if is_setup_call(func.as_ref(), self.aliases) =>
3213 {
3214 Some(expr)
3215 }
3216 _ => None,
3217 }
3218 }
3219}
3220
3221fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3222 let Some(dotted) = dotted_name(func, 0) else {
3223 return false;
3224 };
3225
3226 if aliases.setup_names.contains(&dotted) {
3227 return true;
3228 }
3229
3230 let Some(module) = dotted.strip_suffix(".setup") else {
3231 return false;
3232 };
3233
3234 let resolved = resolve_module_alias(module, aliases);
3235 is_setup_module(&resolved)
3236}
3237
3238fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3239 if depth >= MAX_SETUP_PY_AST_DEPTH {
3240 return None;
3241 }
3242
3243 match expr {
3244 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3245 ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3246 let base = dotted_name(value.as_ref(), depth + 1)?;
3247 Some(format!("{}.{}", base, attr.as_str()))
3248 }
3249 _ => None,
3250 }
3251}
3252
3253fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3254 if let Some(mapped) = aliases.module_aliases.get(module) {
3255 return mapped.clone();
3256 }
3257
3258 let Some((base, rest)) = module.split_once('.') else {
3259 return module.to_string();
3260 };
3261
3262 if let Some(mapped) = aliases.module_aliases.get(base) {
3263 return format!("{}.{}", mapped, rest);
3264 }
3265
3266 module.to_string()
3267}
3268
3269fn extract_setup_keywords(
3270 call_expr: &ast::Expr,
3271 evaluator: &mut LiteralEvaluator,
3272) -> HashMap<String, Value> {
3273 let mut values = HashMap::new();
3274 let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3275 return values;
3276 };
3277
3278 for keyword in arguments.keywords.iter() {
3279 if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3280 if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3281 values.insert(arg.to_string(), value);
3282 }
3283 } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3284 for (key, value) in dict {
3285 values.insert(key, value);
3286 }
3287 }
3288 }
3289
3290 values
3291}
3292
3293fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3294 let name = get_value_string(values, "name");
3295 let version = get_value_string(values, "version");
3296 let description =
3297 get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
3298 let homepage_url =
3299 get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
3300 let author = get_value_string(values, "author");
3301 let author_email = get_value_string(values, "author_email");
3302 let maintainer = get_value_string(values, "maintainer");
3303 let maintainer_email = get_value_string(values, "maintainer_email");
3304 let license = get_value_string(values, "license");
3305 let classifiers = values
3306 .get("classifiers")
3307 .and_then(value_to_string_list)
3308 .unwrap_or_default();
3309
3310 let mut parties = Vec::new();
3311 if author.is_some() || author_email.is_some() {
3312 parties.push(Party {
3313 r#type: Some("person".to_string()),
3314 role: Some("author".to_string()),
3315 name: author,
3316 email: author_email,
3317 url: None,
3318 organization: None,
3319 organization_url: None,
3320 timezone: None,
3321 });
3322 }
3323
3324 if maintainer.is_some() || maintainer_email.is_some() {
3325 parties.push(Party {
3326 r#type: Some("person".to_string()),
3327 role: Some("maintainer".to_string()),
3328 name: maintainer,
3329 email: maintainer_email,
3330 url: None,
3331 organization: None,
3332 organization_url: None,
3333 timezone: None,
3334 });
3335 }
3336
3337 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3338 normalize_spdx_declared_license(license.as_deref());
3339 let extracted_license_statement = license.clone();
3340
3341 let dependencies = build_setup_py_dependencies(values);
3342 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3343 let mut homepage_from_project_urls = None;
3344 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3345 let mut extra_data = HashMap::new();
3346
3347 if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3348 apply_project_url_mappings(
3349 &parsed_project_urls,
3350 &mut homepage_from_project_urls,
3351 &mut bug_tracking_url,
3352 &mut code_view_url,
3353 &mut vcs_url,
3354 &mut extra_data,
3355 );
3356 }
3357
3358 let extra_data = if extra_data.is_empty() {
3359 None
3360 } else {
3361 Some(extra_data)
3362 };
3363
3364 PackageData {
3365 package_type: Some(PythonParser::PACKAGE_TYPE),
3366 namespace: None,
3367 name,
3368 version,
3369 qualifiers: None,
3370 subpath: None,
3371 primary_language: Some("Python".to_string()),
3372 description,
3373 release_date: None,
3374 parties,
3375 keywords: Vec::new(),
3376 homepage_url: homepage_url.or(homepage_from_project_urls),
3377 download_url: None,
3378 size: None,
3379 sha1: None,
3380 md5: None,
3381 sha256: None,
3382 sha512: None,
3383 bug_tracking_url,
3384 code_view_url,
3385 vcs_url,
3386 copyright: None,
3387 holder: None,
3388 declared_license_expression,
3389 declared_license_expression_spdx,
3390 license_detections,
3391 other_license_expression: None,
3392 other_license_expression_spdx: None,
3393 other_license_detections: Vec::new(),
3394 extracted_license_statement,
3395 notice_text: None,
3396 source_packages: Vec::new(),
3397 file_references: Vec::new(),
3398 is_private: has_private_classifier(&classifiers),
3399 is_virtual: false,
3400 extra_data,
3401 dependencies,
3402 repository_homepage_url: None,
3403 repository_download_url: None,
3404 api_data_url: None,
3405 datasource_id: Some(DatasourceId::PypiSetupPy),
3406 purl,
3407 }
3408}
3409
3410fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3411 let mut dependencies = Vec::new();
3412
3413 if let Some(reqs) = values
3414 .get("install_requires")
3415 .and_then(value_to_string_list)
3416 {
3417 dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3418 }
3419
3420 if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3421 dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3422 }
3423
3424 if let Some(Value::Dict(extras)) = values.get("extras_require") {
3425 let mut extra_items: Vec<_> = extras.iter().collect();
3426 extra_items.sort_by_key(|(name, _)| *name);
3427 for (extra_name, extra_value) in extra_items {
3428 if let Some(reqs) = value_to_string_list(extra_value) {
3429 dependencies.extend(build_setup_py_dependency_list(
3430 reqs.as_slice(),
3431 extra_name,
3432 true,
3433 ));
3434 }
3435 }
3436 }
3437
3438 dependencies
3439}
3440
3441fn build_setup_py_dependency_list(
3442 reqs: &[String],
3443 scope: &str,
3444 is_optional: bool,
3445) -> Vec<Dependency> {
3446 reqs.iter()
3447 .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3448 .collect()
3449}
3450
3451fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3452 values.get(key).and_then(value_to_string)
3453}
3454
3455fn value_to_string(value: &Value) -> Option<String> {
3456 match value {
3457 Value::String(value) => Some(value.clone()),
3458 Value::Number(value) => Some(value.to_string()),
3459 Value::Bool(value) => Some(value.to_string()),
3460 _ => None,
3461 }
3462}
3463
3464fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3465 match value {
3466 Value::String(value) => Some(vec![value.clone()]),
3467 Value::List(values) | Value::Tuple(values) => {
3468 let mut items = Vec::new();
3469 for item in values {
3470 items.push(value_to_string(item)?);
3471 }
3472 Some(items)
3473 }
3474 _ => None,
3475 }
3476}
3477
3478fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3479 let Value::Dict(dict) = value else {
3480 return None;
3481 };
3482
3483 let mut pairs: Vec<(String, String)> = dict
3484 .iter()
3485 .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3486 .collect::<Option<Vec<_>>>()?;
3487 pairs.sort_by(|left, right| left.0.cmp(&right.0));
3488 Some(pairs)
3489}
3490
3491fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3492 let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3493 extract_requires_dist_dependencies(&requires_dist)
3494}
3495
3496pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3497 requires_dist
3498 .iter()
3499 .filter_map(|entry| build_rfc822_dependency(entry))
3500 .collect()
3501}
3502
3503fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3504 build_python_dependency(entry, "install", false, None)
3505}
3506
3507fn build_python_dependency(
3508 entry: &str,
3509 default_scope: &str,
3510 default_optional: bool,
3511 marker_override: Option<&str>,
3512) -> Option<Dependency> {
3513 let (requirement_part, marker_part) = entry
3514 .split_once(';')
3515 .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3516 .unwrap_or((entry.trim(), None));
3517
3518 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3519 let requirement = normalize_rfc822_requirement(requirement_part);
3520 let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3521 marker_part.or(marker_override),
3522 default_scope,
3523 default_optional,
3524 );
3525 let mut purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
3526
3527 let is_pinned = requirement
3528 .as_deref()
3529 .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3530 if is_pinned
3531 && let Some(version) = requirement
3532 .as_deref()
3533 .map(|req| req.trim_start_matches('='))
3534 {
3535 purl.with_version(version).ok()?;
3536 }
3537
3538 let mut extra_data = HashMap::new();
3539 extra_data.extend(marker_data);
3540 if let Some(marker) = marker {
3541 extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3542 }
3543
3544 Some(Dependency {
3545 purl: Some(purl.to_string()),
3546 extracted_requirement: requirement,
3547 scope: Some(scope),
3548 is_runtime: Some(true),
3549 is_optional: Some(is_optional),
3550 is_pinned: Some(is_pinned),
3551 is_direct: Some(true),
3552 resolved_package: None,
3553 extra_data: if extra_data.is_empty() {
3554 None
3555 } else {
3556 Some(extra_data)
3557 },
3558 })
3559}
3560
3561fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3562 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3563 let trimmed = requirement_part.trim();
3564 let mut remainder = trimmed[name.len()..].trim();
3565
3566 if let Some(stripped) = remainder.strip_prefix('[')
3567 && let Some(end_idx) = stripped.find(']')
3568 {
3569 remainder = stripped[end_idx + 1..].trim();
3570 }
3571
3572 let remainder = remainder
3573 .strip_prefix('(')
3574 .and_then(|value| value.strip_suffix(')'))
3575 .unwrap_or(remainder)
3576 .trim();
3577
3578 if remainder.is_empty() {
3579 return None;
3580 }
3581
3582 let mut specifiers: Vec<String> = remainder
3583 .split(',')
3584 .map(|specifier| specifier.trim().replace(' ', ""))
3585 .filter(|specifier| !specifier.is_empty())
3586 .collect();
3587 specifiers.sort();
3588 Some(specifiers.join(","))
3589}
3590
3591fn parse_rfc822_marker(
3592 marker_part: Option<&str>,
3593 default_scope: &str,
3594 default_optional: bool,
3595) -> (
3596 String,
3597 bool,
3598 Option<String>,
3599 HashMap<String, serde_json::Value>,
3600) {
3601 let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
3602 return (
3603 default_scope.to_string(),
3604 default_optional,
3605 None,
3606 HashMap::new(),
3607 );
3608 };
3609
3610 let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
3611 .expect("extra marker regex should compile");
3612 let mut extra_data = HashMap::new();
3613
3614 if let Some(python_version) = extract_marker_field(marker, "python_version") {
3615 extra_data.insert(
3616 "python_version".to_string(),
3617 serde_json::Value::String(python_version),
3618 );
3619 }
3620 if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
3621 extra_data.insert(
3622 "sys_platform".to_string(),
3623 serde_json::Value::String(sys_platform),
3624 );
3625 }
3626
3627 if let Some(captures) = extra_re.captures(marker)
3628 && let Some(scope) = captures.get(1)
3629 {
3630 return (
3631 scope.as_str().to_string(),
3632 true,
3633 Some(marker.trim().to_string()),
3634 extra_data,
3635 );
3636 }
3637
3638 (
3639 default_scope.to_string(),
3640 default_optional,
3641 Some(marker.trim().to_string()),
3642 extra_data,
3643 )
3644}
3645
3646fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
3647 let re = Regex::new(&format!(
3648 r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
3649 field
3650 ))
3651 .ok()?;
3652 let captures = re.captures(marker)?;
3653 let operator = captures.get(1)?.as_str();
3654 let value = captures.get(2)?.as_str();
3655 Some(format!("{} {}", operator, value))
3656}
3657
3658fn parse_requires_txt(content: &str) -> Vec<Dependency> {
3659 let mut dependencies = Vec::new();
3660 let mut current_scope = "install".to_string();
3661 let mut current_optional = false;
3662 let mut current_marker: Option<String> = None;
3663
3664 for line in content.lines() {
3665 let trimmed = line.trim();
3666 if trimmed.is_empty() || trimmed.starts_with('#') {
3667 continue;
3668 }
3669
3670 if trimmed.starts_with('[') && trimmed.ends_with(']') {
3671 let inner = &trimmed[1..trimmed.len() - 1];
3672 if let Some(rest) = inner.strip_prefix(':') {
3673 current_scope = "install".to_string();
3674 current_optional = false;
3675 current_marker = Some(rest.trim().to_string());
3676 } else if let Some((scope, marker)) = inner.split_once(':') {
3677 current_scope = scope.trim().to_string();
3678 current_optional = true;
3679 current_marker = Some(marker.trim().to_string());
3680 } else {
3681 current_scope = inner.trim().to_string();
3682 current_optional = true;
3683 current_marker = None;
3684 }
3685 continue;
3686 }
3687
3688 if let Some(dependency) = build_python_dependency(
3689 trimmed,
3690 ¤t_scope,
3691 current_optional,
3692 current_marker.as_deref(),
3693 ) {
3694 dependencies.push(dependency);
3695 }
3696 }
3697
3698 dependencies
3699}
3700
/// Reports whether the classifiers contain `Private :: Do Not Upload`.
///
/// The comparison ignores ASCII case but is otherwise exact: surrounding
/// whitespace is not trimmed.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
3706
3707fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
3708 let name = name?;
3709 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
3710 if let Some(version) = version {
3711 package_url.with_version(version).ok()?;
3712 }
3713 Some(package_url.to_string())
3714}
3715
3716fn extract_from_setup_py_regex(content: &str) -> PackageData {
3717 let name = extract_setup_value(content, "name");
3718 let version = extract_setup_value(content, "version");
3719 let license_expression = extract_setup_value(content, "license");
3720
3721 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3722 normalize_spdx_declared_license(license_expression.as_deref());
3723 let extracted_license_statement = license_expression.clone();
3724
3725 let dependencies = extract_setup_py_dependencies(content);
3726 let homepage_url = extract_setup_value(content, "url");
3727 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3728
3729 PackageData {
3730 package_type: Some(PythonParser::PACKAGE_TYPE),
3731 namespace: None,
3732 name,
3733 version,
3734 qualifiers: None,
3735 subpath: None,
3736 primary_language: Some("Python".to_string()),
3737 description: None,
3738 release_date: None,
3739 parties: Vec::new(),
3740 keywords: Vec::new(),
3741 homepage_url,
3742 download_url: None,
3743 size: None,
3744 sha1: None,
3745 md5: None,
3746 sha256: None,
3747 sha512: None,
3748 bug_tracking_url: None,
3749 code_view_url: None,
3750 vcs_url: None,
3751 copyright: None,
3752 holder: None,
3753 declared_license_expression,
3754 declared_license_expression_spdx,
3755 license_detections,
3756 other_license_expression: None,
3757 other_license_expression_spdx: None,
3758 other_license_detections: Vec::new(),
3759 extracted_license_statement,
3760 notice_text: None,
3761 source_packages: Vec::new(),
3762 file_references: Vec::new(),
3763 is_private: false,
3764 is_virtual: false,
3765 extra_data: None,
3766 dependencies,
3767 repository_homepage_url: None,
3768 repository_download_url: None,
3769 api_data_url: None,
3770 datasource_id: Some(DatasourceId::PypiSetupPy),
3771 purl,
3772 }
3773}
3774
3775fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
3776 crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
3777}
3778
3779fn extract_from_pypi_json(path: &Path) -> PackageData {
3780 let default = PackageData {
3781 package_type: Some(PythonParser::PACKAGE_TYPE),
3782 datasource_id: Some(DatasourceId::PypiJson),
3783 ..Default::default()
3784 };
3785
3786 let content = match read_file_to_string(path) {
3787 Ok(content) => content,
3788 Err(error) => {
3789 warn!("Failed to read pypi.json at {:?}: {}", path, error);
3790 return default;
3791 }
3792 };
3793
3794 let root: serde_json::Value = match serde_json::from_str(&content) {
3795 Ok(value) => value,
3796 Err(error) => {
3797 warn!("Failed to parse pypi.json at {:?}: {}", path, error);
3798 return default;
3799 }
3800 };
3801
3802 let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
3803 warn!("No info object found in pypi.json at {:?}", path);
3804 return default;
3805 };
3806
3807 let name = info
3808 .get("name")
3809 .and_then(|value| value.as_str())
3810 .map(ToOwned::to_owned);
3811 let version = info
3812 .get("version")
3813 .and_then(|value| value.as_str())
3814 .map(ToOwned::to_owned);
3815 let summary = info
3816 .get("summary")
3817 .and_then(|value| value.as_str())
3818 .map(ToOwned::to_owned);
3819 let description = info
3820 .get("description")
3821 .and_then(|value| value.as_str())
3822 .filter(|value| !value.trim().is_empty())
3823 .map(ToOwned::to_owned)
3824 .or(summary);
3825 let mut homepage_url = info
3826 .get("home_page")
3827 .and_then(|value| value.as_str())
3828 .map(ToOwned::to_owned);
3829 let author = info
3830 .get("author")
3831 .and_then(|value| value.as_str())
3832 .filter(|value| !value.trim().is_empty())
3833 .map(ToOwned::to_owned);
3834 let author_email = info
3835 .get("author_email")
3836 .and_then(|value| value.as_str())
3837 .filter(|value| !value.trim().is_empty())
3838 .map(ToOwned::to_owned);
3839 let license = info
3840 .get("license")
3841 .and_then(|value| value.as_str())
3842 .filter(|value| !value.trim().is_empty())
3843 .map(ToOwned::to_owned);
3844 let keywords = parse_setup_cfg_keywords(
3845 info.get("keywords")
3846 .and_then(|value| value.as_str())
3847 .map(ToOwned::to_owned),
3848 );
3849 let classifiers = info
3850 .get("classifiers")
3851 .and_then(|value| value.as_array())
3852 .map(|values| {
3853 values
3854 .iter()
3855 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
3856 .collect::<Vec<_>>()
3857 })
3858 .unwrap_or_default();
3859
3860 let mut parties = Vec::new();
3861 if author.is_some() || author_email.is_some() {
3862 parties.push(Party {
3863 r#type: Some("person".to_string()),
3864 role: Some("author".to_string()),
3865 name: author,
3866 email: author_email,
3867 url: None,
3868 organization: None,
3869 organization_url: None,
3870 timezone: None,
3871 });
3872 }
3873
3874 let mut bug_tracking_url = None;
3875 let mut code_view_url = None;
3876 let mut vcs_url = None;
3877 let mut extra_data = HashMap::new();
3878
3879 let parsed_project_urls = info
3880 .get("project_urls")
3881 .and_then(|value| value.as_object())
3882 .map(|map| {
3883 let mut pairs: Vec<(String, String)> = map
3884 .iter()
3885 .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
3886 .collect();
3887 pairs.sort_by(|left, right| left.0.cmp(&right.0));
3888 pairs
3889 })
3890 .unwrap_or_default();
3891
3892 apply_project_url_mappings(
3893 &parsed_project_urls,
3894 &mut homepage_url,
3895 &mut bug_tracking_url,
3896 &mut code_view_url,
3897 &mut vcs_url,
3898 &mut extra_data,
3899 );
3900
3901 let (download_url, size, sha256) = root
3902 .get("urls")
3903 .and_then(|value| value.as_array())
3904 .map(|urls| select_pypi_json_artifact(urls))
3905 .unwrap_or((None, None, None));
3906
3907 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3908 normalize_spdx_declared_license(license.as_deref());
3909 let dependencies = info
3910 .get("requires_dist")
3911 .and_then(|value| value.as_array())
3912 .map(|entries| {
3913 entries
3914 .iter()
3915 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
3916 .collect::<Vec<_>>()
3917 })
3918 .map(|entries| extract_requires_dist_dependencies(&entries))
3919 .unwrap_or_default();
3920
3921 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
3922 build_pypi_urls(name.as_deref(), version.as_deref());
3923
3924 PackageData {
3925 package_type: Some(PythonParser::PACKAGE_TYPE),
3926 namespace: None,
3927 name,
3928 version,
3929 qualifiers: None,
3930 subpath: None,
3931 primary_language: None,
3932 description,
3933 release_date: None,
3934 parties,
3935 keywords,
3936 homepage_url: homepage_url.or(repository_homepage_url.clone()),
3937 download_url,
3938 size,
3939 sha1: None,
3940 md5: None,
3941 sha256,
3942 sha512: None,
3943 bug_tracking_url,
3944 code_view_url,
3945 vcs_url,
3946 copyright: None,
3947 holder: None,
3948 declared_license_expression,
3949 declared_license_expression_spdx,
3950 license_detections,
3951 other_license_expression: None,
3952 other_license_expression_spdx: None,
3953 other_license_detections: Vec::new(),
3954 extracted_license_statement: license,
3955 notice_text: None,
3956 source_packages: Vec::new(),
3957 file_references: Vec::new(),
3958 is_private: has_private_classifier(&classifiers),
3959 is_virtual: false,
3960 extra_data: if extra_data.is_empty() {
3961 None
3962 } else {
3963 Some(extra_data)
3964 },
3965 dependencies,
3966 repository_homepage_url,
3967 repository_download_url,
3968 api_data_url,
3969 datasource_id: Some(DatasourceId::PypiJson),
3970 purl,
3971 }
3972}
3973
3974fn select_pypi_json_artifact(
3975 urls: &[serde_json::Value],
3976) -> (Option<String>, Option<u64>, Option<String>) {
3977 let selected = urls
3978 .iter()
3979 .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
3980 .or_else(|| urls.first());
3981
3982 let Some(entry) = selected else {
3983 return (None, None, None);
3984 };
3985
3986 let download_url = entry
3987 .get("url")
3988 .and_then(|value| value.as_str())
3989 .map(ToOwned::to_owned);
3990 let size = entry.get("size").and_then(|value| value.as_u64());
3991 let sha256 = entry
3992 .get("digests")
3993 .and_then(|value| value.as_object())
3994 .and_then(|digests| digests.get("sha256"))
3995 .and_then(|value| value.as_str())
3996 .map(ToOwned::to_owned);
3997
3998 (download_url, size, sha256)
3999}
4000
4001fn extract_from_pip_inspect(path: &Path) -> PackageData {
4002 let content = match read_file_to_string(path) {
4003 Ok(content) => content,
4004 Err(e) => {
4005 warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4006 return default_package_data(path);
4007 }
4008 };
4009
4010 let root: serde_json::Value = match serde_json::from_str(&content) {
4011 Ok(value) => value,
4012 Err(e) => {
4013 warn!(
4014 "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4015 path, e
4016 );
4017 return default_package_data(path);
4018 }
4019 };
4020
4021 let installed = match root.get("installed").and_then(|v| v.as_array()) {
4022 Some(arr) => arr,
4023 None => {
4024 warn!(
4025 "No 'installed' array found in pip-inspect.deplock at {:?}",
4026 path
4027 );
4028 return default_package_data(path);
4029 }
4030 };
4031
4032 let pip_version = root
4033 .get("pip_version")
4034 .and_then(|v| v.as_str())
4035 .map(String::from);
4036 let inspect_version = root
4037 .get("version")
4038 .and_then(|v| v.as_str())
4039 .map(String::from);
4040
4041 let mut main_package: Option<PackageData> = None;
4042 let mut dependencies: Vec<Dependency> = Vec::new();
4043
4044 for package_entry in installed {
4045 let metadata = match package_entry.get("metadata") {
4046 Some(m) => m,
4047 None => continue,
4048 };
4049
4050 let is_requested = package_entry
4051 .get("requested")
4052 .and_then(|v| v.as_bool())
4053 .unwrap_or(false);
4054 let has_direct_url = package_entry.get("direct_url").is_some();
4055
4056 let name = metadata
4057 .get("name")
4058 .and_then(|v| v.as_str())
4059 .map(String::from);
4060 let version = metadata
4061 .get("version")
4062 .and_then(|v| v.as_str())
4063 .map(String::from);
4064 let summary = metadata
4065 .get("summary")
4066 .and_then(|v| v.as_str())
4067 .map(String::from);
4068 let home_page = metadata
4069 .get("home_page")
4070 .and_then(|v| v.as_str())
4071 .map(String::from);
4072 let author = metadata
4073 .get("author")
4074 .and_then(|v| v.as_str())
4075 .map(String::from);
4076 let author_email = metadata
4077 .get("author_email")
4078 .and_then(|v| v.as_str())
4079 .map(String::from);
4080 let license = metadata
4081 .get("license")
4082 .and_then(|v| v.as_str())
4083 .map(String::from);
4084 let description = metadata
4085 .get("description")
4086 .and_then(|v| v.as_str())
4087 .map(String::from);
4088 let keywords = metadata
4089 .get("keywords")
4090 .and_then(|v| v.as_array())
4091 .map(|arr| {
4092 arr.iter()
4093 .filter_map(|k| k.as_str().map(String::from))
4094 .collect::<Vec<_>>()
4095 })
4096 .unwrap_or_default();
4097
4098 let mut parties = Vec::new();
4099 if author.is_some() || author_email.is_some() {
4100 parties.push(Party {
4101 r#type: Some("person".to_string()),
4102 role: Some("author".to_string()),
4103 name: author,
4104 email: author_email,
4105 url: None,
4106 organization: None,
4107 organization_url: None,
4108 timezone: None,
4109 });
4110 }
4111
4112 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4113 normalize_spdx_declared_license(license.as_deref());
4114 let extracted_license_statement = license.clone();
4115 let requires_dist = metadata
4116 .get("requires_dist")
4117 .and_then(|v| v.as_array())
4118 .map(|entries| {
4119 entries
4120 .iter()
4121 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4122 .collect::<Vec<_>>()
4123 })
4124 .unwrap_or_default();
4125 let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4126
4127 let purl = name.as_ref().and_then(|n| {
4128 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4129 if let Some(v) = &version {
4130 package_url.with_version(v).ok()?;
4131 }
4132 Some(package_url.to_string())
4133 });
4134
4135 if is_requested && has_direct_url {
4136 let mut extra_data = HashMap::new();
4137 if let Some(pv) = &pip_version {
4138 extra_data.insert(
4139 "pip_version".to_string(),
4140 serde_json::Value::String(pv.clone()),
4141 );
4142 }
4143 if let Some(iv) = &inspect_version {
4144 extra_data.insert(
4145 "inspect_version".to_string(),
4146 serde_json::Value::String(iv.clone()),
4147 );
4148 }
4149
4150 main_package = Some(PackageData {
4151 package_type: Some(PythonParser::PACKAGE_TYPE),
4152 namespace: None,
4153 name,
4154 version,
4155 qualifiers: None,
4156 subpath: None,
4157 primary_language: Some("Python".to_string()),
4158 description: description.or(summary),
4159 release_date: None,
4160 parties,
4161 keywords,
4162 homepage_url: home_page,
4163 download_url: None,
4164 size: None,
4165 sha1: None,
4166 md5: None,
4167 sha256: None,
4168 sha512: None,
4169 bug_tracking_url: None,
4170 code_view_url: None,
4171 vcs_url: None,
4172 copyright: None,
4173 holder: None,
4174 declared_license_expression,
4175 declared_license_expression_spdx,
4176 license_detections,
4177 other_license_expression: None,
4178 other_license_expression_spdx: None,
4179 other_license_detections: Vec::new(),
4180 extracted_license_statement,
4181 notice_text: None,
4182 source_packages: Vec::new(),
4183 file_references: Vec::new(),
4184 is_private: false,
4185 is_virtual: true,
4186 extra_data: if extra_data.is_empty() {
4187 None
4188 } else {
4189 Some(extra_data)
4190 },
4191 dependencies: parsed_dependencies,
4192 repository_homepage_url: None,
4193 repository_download_url: None,
4194 api_data_url: None,
4195 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4196 purl,
4197 });
4198 } else {
4199 let resolved_package = PackageData {
4200 package_type: Some(PythonParser::PACKAGE_TYPE),
4201 namespace: None,
4202 name: name.clone(),
4203 version: version.clone(),
4204 qualifiers: None,
4205 subpath: None,
4206 primary_language: Some("Python".to_string()),
4207 description: description.or(summary),
4208 release_date: None,
4209 parties,
4210 keywords,
4211 homepage_url: home_page,
4212 download_url: None,
4213 size: None,
4214 sha1: None,
4215 md5: None,
4216 sha256: None,
4217 sha512: None,
4218 bug_tracking_url: None,
4219 code_view_url: None,
4220 vcs_url: None,
4221 copyright: None,
4222 holder: None,
4223 declared_license_expression,
4224 declared_license_expression_spdx,
4225 license_detections,
4226 other_license_expression: None,
4227 other_license_expression_spdx: None,
4228 other_license_detections: Vec::new(),
4229 extracted_license_statement,
4230 notice_text: None,
4231 source_packages: Vec::new(),
4232 file_references: Vec::new(),
4233 is_private: false,
4234 is_virtual: true,
4235 extra_data: None,
4236 dependencies: parsed_dependencies,
4237 repository_homepage_url: None,
4238 repository_download_url: None,
4239 api_data_url: None,
4240 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4241 purl: purl.clone(),
4242 };
4243
4244 let resolved = package_data_to_resolved(&resolved_package);
4245 dependencies.push(Dependency {
4246 purl,
4247 extracted_requirement: None,
4248 scope: None,
4249 is_runtime: Some(true),
4250 is_optional: Some(false),
4251 is_pinned: Some(true),
4252 is_direct: Some(is_requested),
4253 resolved_package: Some(Box::new(resolved)),
4254 extra_data: None,
4255 });
4256 }
4257 }
4258
4259 if let Some(mut main_pkg) = main_package {
4260 let direct_requirement_purls: HashSet<String> = main_pkg
4261 .dependencies
4262 .iter()
4263 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4264 .collect();
4265
4266 let resolved_requirement_purls: HashSet<String> = dependencies
4267 .iter()
4268 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4269 .collect();
4270
4271 let unresolved_dependencies = main_pkg
4272 .dependencies
4273 .iter()
4274 .filter(|dep| {
4275 dep.purl.as_ref().is_some_and(|purl| {
4276 !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4277 })
4278 })
4279 .cloned()
4280 .collect::<Vec<_>>();
4281
4282 for dependency in &mut dependencies {
4283 if dependency
4284 .purl
4285 .as_ref()
4286 .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4287 {
4288 dependency.is_direct = Some(true);
4289 }
4290 }
4291
4292 main_pkg.dependencies = dependencies;
4293 main_pkg.dependencies.extend(unresolved_dependencies);
4294 main_pkg
4295 } else {
4296 default_package_data(path)
4297 }
4298}
4299
/// Returns the part of a purl before its first `@`, i.e. the purl without
/// its version. A purl without `@` is returned unchanged.
fn base_dependency_purl(purl: &str) -> String {
    // `split` always yields at least one piece, even for an empty input.
    let base = purl.split('@').next().unwrap_or(purl);
    base.to_owned()
}
4305
/// Parsed INI-style contents: section name → option key → list of values.
// NOTE(review): presumably a multi-line option contributes one list entry
// per line — confirm against `parse_setup_cfg`.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4307
4308fn extract_from_setup_cfg(path: &Path) -> PackageData {
4309 let content = match read_file_to_string(path) {
4310 Ok(content) => content,
4311 Err(e) => {
4312 warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4313 return default_package_data(path);
4314 }
4315 };
4316
4317 let sections = parse_setup_cfg(&content);
4318 let name = get_ini_value(§ions, "metadata", "name");
4319 let version = get_ini_value(§ions, "metadata", "version");
4320 let description = get_ini_value(§ions, "metadata", "description");
4321 let author = get_ini_value(§ions, "metadata", "author");
4322 let author_email = get_ini_value(§ions, "metadata", "author_email");
4323 let maintainer = get_ini_value(§ions, "metadata", "maintainer");
4324 let maintainer_email = get_ini_value(§ions, "metadata", "maintainer_email");
4325 let license = get_ini_value(§ions, "metadata", "license");
4326 let mut homepage_url = get_ini_value(§ions, "metadata", "url");
4327 let classifiers = get_ini_values(§ions, "metadata", "classifiers");
4328 let keywords = parse_setup_cfg_keywords(get_ini_value(§ions, "metadata", "keywords"));
4329 let python_requires = get_ini_value(§ions, "options", "python_requires");
4330 let parsed_project_urls =
4331 parse_setup_cfg_project_urls(&get_ini_values(§ions, "metadata", "project_urls"));
4332 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4333 let mut extra_data = HashMap::new();
4334
4335 let mut parties = Vec::new();
4336 if author.is_some() || author_email.is_some() {
4337 parties.push(Party {
4338 r#type: Some("person".to_string()),
4339 role: Some("author".to_string()),
4340 name: author,
4341 email: author_email,
4342 url: None,
4343 organization: None,
4344 organization_url: None,
4345 timezone: None,
4346 });
4347 }
4348
4349 if maintainer.is_some() || maintainer_email.is_some() {
4350 parties.push(Party {
4351 r#type: Some("person".to_string()),
4352 role: Some("maintainer".to_string()),
4353 name: maintainer,
4354 email: maintainer_email,
4355 url: None,
4356 organization: None,
4357 organization_url: None,
4358 timezone: None,
4359 });
4360 }
4361
4362 let declared_license_expression = None;
4363 let declared_license_expression_spdx = None;
4364 let license_detections = Vec::new();
4365 let extracted_license_statement = license.clone();
4366
4367 let dependencies = extract_setup_cfg_dependencies(§ions);
4368
4369 if let Some(value) = python_requires {
4370 extra_data.insert(
4371 "python_requires".to_string(),
4372 serde_json::Value::String(value),
4373 );
4374 }
4375
4376 apply_project_url_mappings(
4377 &parsed_project_urls,
4378 &mut homepage_url,
4379 &mut bug_tracking_url,
4380 &mut code_view_url,
4381 &mut vcs_url,
4382 &mut extra_data,
4383 );
4384
4385 let extra_data = if extra_data.is_empty() {
4386 None
4387 } else {
4388 Some(extra_data)
4389 };
4390
4391 let purl = name.as_ref().and_then(|n| {
4392 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4393 if let Some(v) = &version {
4394 package_url.with_version(v).ok()?;
4395 }
4396 Some(package_url.to_string())
4397 });
4398
4399 PackageData {
4400 package_type: Some(PythonParser::PACKAGE_TYPE),
4401 namespace: None,
4402 name,
4403 version,
4404 qualifiers: None,
4405 subpath: None,
4406 primary_language: Some("Python".to_string()),
4407 description,
4408 release_date: None,
4409 parties,
4410 keywords,
4411 homepage_url,
4412 download_url: None,
4413 size: None,
4414 sha1: None,
4415 md5: None,
4416 sha256: None,
4417 sha512: None,
4418 bug_tracking_url,
4419 code_view_url,
4420 vcs_url,
4421 copyright: None,
4422 holder: None,
4423 declared_license_expression,
4424 declared_license_expression_spdx,
4425 license_detections,
4426 other_license_expression: None,
4427 other_license_expression_spdx: None,
4428 other_license_detections: Vec::new(),
4429 extracted_license_statement,
4430 notice_text: None,
4431 source_packages: Vec::new(),
4432 file_references: Vec::new(),
4433 is_private: has_private_classifier(&classifiers),
4434 is_virtual: false,
4435 extra_data,
4436 dependencies,
4437 repository_homepage_url: None,
4438 repository_download_url: None,
4439 api_data_url: None,
4440 datasource_id: Some(DatasourceId::PypiSetupCfg),
4441 purl,
4442 }
4443}
4444
/// Splits a comma-separated keyword string into trimmed, non-empty keywords.
///
/// `None` yields an empty list. Only commas separate keywords; whitespace
/// inside a keyword is preserved.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut keywords = Vec::new();

    if let Some(raw) = value {
        for piece in raw.split(',') {
            let keyword = piece.trim();
            if !keyword.is_empty() {
                keywords.push(keyword.to_owned());
            }
        }
    }

    keywords
}
4457
/// Parses `label = url` entries into `(label, url)` pairs, in input order.
///
/// Each entry is split at its first `=`; both sides are trimmed. Entries
/// without `=`, or with an empty label or URL, are skipped.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };

        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }

    pairs
}
4473
4474fn apply_project_url_mappings(
4475 parsed_urls: &[(String, String)],
4476 homepage_url: &mut Option<String>,
4477 bug_tracking_url: &mut Option<String>,
4478 code_view_url: &mut Option<String>,
4479 vcs_url: &mut Option<String>,
4480 extra_data: &mut HashMap<String, serde_json::Value>,
4481) {
4482 for (label, url) in parsed_urls {
4483 let label_lower = label.to_lowercase();
4484
4485 if bug_tracking_url.is_none()
4486 && matches!(
4487 label_lower.as_str(),
4488 "tracker"
4489 | "bug reports"
4490 | "bug tracker"
4491 | "issues"
4492 | "issue tracker"
4493 | "github: issues"
4494 )
4495 {
4496 *bug_tracking_url = Some(url.clone());
4497 } else if code_view_url.is_none()
4498 && matches!(label_lower.as_str(), "source" | "source code" | "code")
4499 {
4500 *code_view_url = Some(url.clone());
4501 } else if vcs_url.is_none()
4502 && matches!(
4503 label_lower.as_str(),
4504 "github" | "gitlab" | "github: repo" | "repository"
4505 )
4506 {
4507 *vcs_url = Some(url.clone());
4508 } else if homepage_url.is_none()
4509 && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4510 {
4511 *homepage_url = Some(url.clone());
4512 } else if label_lower == "changelog" {
4513 extra_data.insert(
4514 "changelog_url".to_string(),
4515 serde_json::Value::String(url.clone()),
4516 );
4517 }
4518 }
4519
4520 let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4521 .iter()
4522 .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4523 .collect();
4524
4525 if !project_urls_json.is_empty() {
4526 extra_data.insert(
4527 "project_urls".to_string(),
4528 serde_json::Value::Object(project_urls_json),
4529 );
4530 }
4531}
4532
4533fn parse_setup_cfg(content: &str) -> IniSections {
4534 let mut sections: IniSections = HashMap::new();
4535 let mut current_section: Option<String> = None;
4536 let mut current_key: Option<String> = None;
4537
4538 for raw_line in content.lines() {
4539 let line = raw_line.trim_end_matches('\r');
4540 let trimmed = line.trim();
4541 if trimmed.is_empty() {
4542 continue;
4543 }
4544
4545 let stripped = line.trim_start();
4546 if stripped.starts_with('#') || stripped.starts_with(';') {
4547 continue;
4548 }
4549
4550 if stripped.starts_with('[') && stripped.ends_with(']') {
4551 let section_name = stripped
4552 .trim_start_matches('[')
4553 .trim_end_matches(']')
4554 .trim()
4555 .to_ascii_lowercase();
4556 current_section = if section_name.is_empty() {
4557 None
4558 } else {
4559 Some(section_name)
4560 };
4561 current_key = None;
4562 continue;
4563 }
4564
4565 if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
4566 if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
4567 let value = stripped.trim();
4568 if !value.is_empty() {
4569 sections
4570 .entry(section.clone())
4571 .or_default()
4572 .entry(key.clone())
4573 .or_default()
4574 .push(value.to_string());
4575 }
4576 }
4577 continue;
4578 }
4579
4580 if let Some((key, value)) = stripped.split_once('=')
4581 && let Some(section) = current_section.as_ref()
4582 {
4583 let key_name = key.trim().to_ascii_lowercase();
4584 let value_trimmed = value.trim();
4585 let entry = sections
4586 .entry(section.clone())
4587 .or_default()
4588 .entry(key_name.clone())
4589 .or_default();
4590 if !value_trimmed.is_empty() {
4591 entry.push(value_trimmed.to_string());
4592 }
4593 current_key = Some(key_name);
4594 }
4595 }
4596
4597 sections
4598}
4599
4600fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
4601 sections
4602 .get(§ion.to_ascii_lowercase())
4603 .and_then(|values| values.get(&key.to_ascii_lowercase()))
4604 .and_then(|entries| entries.first())
4605 .map(|value| value.trim().to_string())
4606}
4607
4608fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
4609 sections
4610 .get(§ion.to_ascii_lowercase())
4611 .and_then(|values| values.get(&key.to_ascii_lowercase()))
4612 .cloned()
4613 .unwrap_or_default()
4614}
4615
4616fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
4617 let mut dependencies = Vec::new();
4618
4619 for (sub_section, scope) in [
4620 ("install_requires", "install"),
4621 ("tests_require", "test"),
4622 ("setup_requires", "setup"),
4623 ] {
4624 let reqs = get_ini_values(sections, "options", sub_section);
4625 dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
4626 }
4627
4628 if let Some(extras) = sections.get("options.extras_require") {
4629 let mut extra_items: Vec<_> = extras.iter().collect();
4630 extra_items.sort_by_key(|(name, _)| *name);
4631 for (extra_name, reqs) in extra_items {
4632 dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
4633 }
4634 }
4635
4636 dependencies
4637}
4638
4639fn parse_setup_cfg_requirements(
4640 reqs: &[String],
4641 scope: &str,
4642 is_optional: bool,
4643) -> Vec<Dependency> {
4644 reqs.iter()
4645 .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
4646 .collect()
4647}
4648
4649fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
4650 let trimmed = req.trim();
4651 if trimmed.is_empty() || trimmed.starts_with('#') {
4652 return None;
4653 }
4654
4655 let name = extract_setup_cfg_dependency_name(trimmed)?;
4656 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4657
4658 Some(Dependency {
4659 purl: Some(purl.to_string()),
4660 extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
4661 scope: Some(scope.to_string()),
4662 is_runtime: Some(true),
4663 is_optional: Some(is_optional),
4664 is_pinned: Some(false),
4665 is_direct: Some(true),
4666 resolved_package: None,
4667 extra_data: None,
4668 })
4669}
4670
/// Extracts the package name from a requirement string.
///
/// The name is the leading part of the trimmed requirement, up to the first
/// whitespace character or the first of `<`, `>`, `=`, `!`, `~`, `;`, `[`,
/// `(` or `@`. Returns `None` when that leading part is empty.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();

    // `(` and `@` end the name as well: PEP 508 allows a parenthesised version
    // (`pkg(>=1.0)`) and a direct reference (`pkg@https://...`) to follow the
    // name without whitespace, and neither character can occur in a name.
    let end = trimmed
        .find(|c: char| {
            c.is_whitespace()
                || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    let name = &trimmed[..end];
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
4687
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    // Joining the whitespace-separated pieces drops exactly the whitespace characters.
    req.split_whitespace().collect::<String>()
}
4691
/// Finds a quoted string assigned to `key` in `setup.py` source text.
///
/// The text is searched for `key="..."` and `key='...'`, each with an optional
/// single space on either side of `=`. Double-quoted forms are tried before
/// single-quoted ones, and for each form only its first occurrence is
/// examined. The value runs up to the next occurrence of the quote character
/// that opened it; escaped quotes are not interpreted. Returns `None` when no
/// form yields a terminated value.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    // Spacing variants around `=`, in the order they are tried.
    const SPACINGS: [(&str, &str); 4] = [("", ""), (" ", ""), ("", " "), (" ", " ")];

    for quote in ['"', '\''] {
        for (before, after) in SPACINGS {
            let pattern = format!("{}{}={}{}", key, before, after, quote);
            let Some(start) = content.find(&pattern) else {
                continue;
            };

            let rest = &content[start + pattern.len()..];
            // Close on the same quote that opened the value, so that the other
            // quote character may appear inside it (e.g. "it's").
            if let Some(end) = rest.find(quote) {
                return Some(rest[..end].to_string());
            }
        }
    }

    None
}
4717
4718fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
4719 let mut dependencies = Vec::new();
4720
4721 if let Some(tests_deps) = extract_tests_require(content) {
4722 dependencies.extend(tests_deps);
4723 }
4724
4725 if let Some(extras_deps) = extract_extras_require(content) {
4726 dependencies.extend(extras_deps);
4727 }
4728
4729 dependencies
4730}
4731
4732fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
4733 let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
4734 let re = Regex::new(pattern).ok()?;
4735 let captures = re.captures(content)?;
4736 let deps_str = captures.get(1)?.as_str();
4737
4738 let deps = parse_setup_py_dep_list(deps_str, "test", true);
4739 if deps.is_empty() { None } else { Some(deps) }
4740}
4741
4742fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
4743 let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
4744 let re = Regex::new(pattern).ok()?;
4745 let captures = re.captures(content)?;
4746 let dict_content = captures.get(1)?.as_str();
4747
4748 let mut all_deps = Vec::new();
4749
4750 let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
4751 let entry_re = Regex::new(entry_pattern).ok()?;
4752
4753 for entry_cap in entry_re.captures_iter(dict_content) {
4754 if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
4755 let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
4756 all_deps.extend(deps);
4757 }
4758 }
4759
4760 if all_deps.is_empty() {
4761 None
4762 } else {
4763 Some(all_deps)
4764 }
4765}
4766
4767fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
4768 let dep_pattern = r#"['"]([^'"]+)['"]"#;
4769 let re = match Regex::new(dep_pattern) {
4770 Ok(r) => r,
4771 Err(_) => return Vec::new(),
4772 };
4773
4774 re.captures_iter(deps_str)
4775 .filter_map(|cap| {
4776 let dep_str = cap.get(1)?.as_str().trim();
4777 if dep_str.is_empty() {
4778 return None;
4779 }
4780
4781 let name = extract_setup_cfg_dependency_name(dep_str)?;
4782 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4783
4784 Some(Dependency {
4785 purl: Some(purl.to_string()),
4786 extracted_requirement: Some(dep_str.to_string()),
4787 scope: Some(scope.to_string()),
4788 is_runtime: Some(true),
4789 is_optional: Some(is_optional),
4790 is_pinned: Some(false),
4791 is_direct: Some(true),
4792 resolved_package: None,
4793 extra_data: None,
4794 })
4795 })
4796 .collect()
4797}
4798
4799pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
4801 let content = read_file_to_string(path).map_err(|e| e.to_string())?;
4802 toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
4803}
4804
4805fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<String>) {
4816 let mut file = match File::open(path) {
4817 Ok(f) => f,
4818 Err(_) => return (None, None),
4819 };
4820
4821 let metadata = match file.metadata() {
4822 Ok(m) => m,
4823 Err(_) => return (None, None),
4824 };
4825 let size = metadata.len();
4826
4827 let mut hasher = Sha256::new();
4828 let mut buffer = vec![0; 8192];
4829
4830 loop {
4831 match file.read(&mut buffer) {
4832 Ok(0) => break,
4833 Ok(n) => hasher.update(&buffer[..n]),
4834 Err(_) => return (Some(size), None),
4835 }
4836 }
4837
4838 let hash = hex::encode(hasher.finalize());
4839 (Some(size), Some(hash))
4840}
4841
4842fn default_package_data(path: &Path) -> PackageData {
4843 PackageData {
4844 package_type: Some(PythonParser::PACKAGE_TYPE),
4845 primary_language: Some("Python".to_string()),
4846 datasource_id: infer_python_datasource_id(path),
4847 ..Default::default()
4848 }
4849}
4850
4851fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
4852 let file_name = path.file_name().and_then(|name| name.to_str());
4853
4854 match file_name {
4855 Some("pyproject.toml") => {
4856 if read_toml_file(path)
4857 .ok()
4858 .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
4859 .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
4860 .is_some()
4861 {
4862 Some(DatasourceId::PypiPoetryPyprojectToml)
4863 } else {
4864 Some(DatasourceId::PypiPyprojectToml)
4865 }
4866 }
4867 Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
4868 Some(DatasourceId::PypiSetupPy)
4869 }
4870 Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
4871 Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
4872 Some("METADATA") if is_installed_wheel_metadata_path(path) => {
4873 Some(DatasourceId::PypiWheelMetadata)
4874 }
4875 Some("pypi.json") => Some(DatasourceId::PypiJson),
4876 Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
4877 Some("origin.json") if is_pip_cache_origin_json(path) => {
4878 Some(DatasourceId::PypiPipOriginJson)
4879 }
4880 _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
4881 Some(DatasourceId::PypiSdist)
4882 }
4883 _ if path
4884 .extension()
4885 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
4886 {
4887 Some(DatasourceId::PypiWheel)
4888 }
4889 _ if path
4890 .extension()
4891 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
4892 {
4893 Some(DatasourceId::PypiEgg)
4894 }
4895 _ => None,
4896 }
4897}
4898
// Registers the Python parser through the crate-level `register_parser!` macro.
// The arguments, in order: a human-readable description of the supported inputs,
// the glob patterns for those inputs, the strings "pypi" and "Python", and an
// optional documentation URL.
// NOTE(review): "pypi" and "Python" are presumably the package type and the primary
// language — confirm against the macro definition.
// NOTE(review): the archive globs (`*.tar.gz`, `*.tgz`, `*.tar.bz2`, `*.tar.xz`, `*.zip`)
// and `origin.json` are broad; presumably the parser narrows them to real Python
// inputs elsewhere — verify.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);