1use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
35use crate::parser_warn as warn;
36use crate::parsers::utils::{read_file_to_string, split_name_email};
37use base64::Engine;
38use base64::engine::general_purpose::URL_SAFE_NO_PAD;
39use bzip2::read::BzDecoder;
40use csv::ReaderBuilder;
41use flate2::read::GzDecoder;
42use liblzma::read::XzDecoder;
43use packageurl::PackageUrl;
44use regex::Regex;
45use ruff_python_ast as ast;
46use ruff_python_parser::parse_module;
47use serde_json::{Map as JsonMap, Value as JsonValue};
48use sha2::{Digest, Sha256};
49use std::collections::{HashMap, HashSet};
50use std::fs::File;
51use std::io::Read;
52use std::path::{Component, Path, PathBuf};
53use tar::Archive;
54use toml::Value as TomlValue;
55use toml::map::Map as TomlMap;
56use zip::ZipArchive;
57
58use super::PackageParser;
59use super::license_normalization::{
60 DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
61 normalize_spdx_expression,
62};
63
// Metadata key names.
// NOTE(review): none of the FIELD_* constants are referenced in the visible
// part of this file; presumably they are table/key names looked up while
// parsing `pyproject.toml` — confirm against the code that uses them.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

// NOTE(review): the three `MAX_SETUP_PY_*` limits are not used in the visible
// part of this file; judging by their names they bound `setup.py` input size
// and AST traversal — confirm against the setup.py extractor.
const MAX_SETUP_PY_BYTES: usize = 1_048_576;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

/// Upper bound (in bytes) on the on-disk size of an archive that will be
/// opened, and on the summed declared size of the entries inside one archive.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;
/// Upper bound (in bytes) on the declared size of a single archive entry.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
/// Largest uncompressed:compressed size ratio accepted before an archive (or
/// entry) is treated as suspicious.
const MAX_COMPRESSION_RATIO: f64 = 100.0;

/// Unit type carrying the `PackageParser` implementation for Python package
/// manifests, installed metadata and distribution archives.
pub struct PythonParser;
95
/// Container format of a source-distribution archive, as inferred from the
/// (lower-cased) file name suffix by `detect_python_sdist_archive_format`.
#[derive(Clone, Copy, Debug)]
enum PythonSdistArchiveFormat {
    /// File name ends with `.tar.gz`.
    TarGz,
    /// File name ends with `.tgz`; decoded the same way as `TarGz`.
    Tgz,
    /// File name ends with `.tar.bz2`.
    TarBz2,
    /// File name ends with `.tar.xz`.
    TarXz,
    /// File name ends with `.zip`.
    Zip,
}
104
/// A zip entry that passed the path, compression-ratio and size checks in
/// `collect_validated_zip_entries`.
#[derive(Clone, Debug)]
struct ValidatedZipEntry {
    /// Position of the entry inside the zip archive.
    index: usize,
    /// Entry path as returned by `normalize_archive_entry_path`.
    name: String,
}
110
111impl PackageParser for PythonParser {
112 const PACKAGE_TYPE: PackageType = PackageType::Pypi;
113
114 fn extract_packages(path: &Path) -> Vec<PackageData> {
115 vec![
116 if path.file_name().unwrap_or_default() == "pyproject.toml" {
117 extract_from_pyproject_toml(path)
118 } else if path.file_name().unwrap_or_default() == "setup.cfg" {
119 extract_from_setup_cfg(path)
120 } else if path.file_name().unwrap_or_default() == "setup.py" {
121 extract_from_setup_py(path)
122 } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
123 extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
124 } else if path.file_name().unwrap_or_default() == "METADATA" {
125 extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
126 } else if is_pip_cache_origin_json(path) {
127 extract_from_pip_origin_json(path)
128 } else if path.file_name().unwrap_or_default() == "pypi.json" {
129 extract_from_pypi_json(path)
130 } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
131 extract_from_pip_inspect(path)
132 } else if is_python_sdist_archive_path(path) {
133 extract_from_sdist_archive(path)
134 } else if path
135 .extension()
136 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
137 {
138 extract_from_wheel_archive(path)
139 } else if path
140 .extension()
141 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
142 {
143 extract_from_egg_archive(path)
144 } else {
145 default_package_data(path)
146 },
147 ]
148 }
149
150 fn is_match(path: &Path) -> bool {
151 if let Some(filename) = path.file_name()
152 && (filename == "pyproject.toml"
153 || filename == "setup.cfg"
154 || filename == "setup.py"
155 || filename == "PKG-INFO"
156 || filename == "METADATA"
157 || filename == "pypi.json"
158 || filename == "pip-inspect.deplock"
159 || is_pip_cache_origin_json(path))
160 {
161 return true;
162 }
163
164 if let Some(extension) = path.extension() {
165 let ext = extension.to_string_lossy().to_lowercase();
166 if ext == "whl" || ext == "egg" || is_python_sdist_archive_path(path) {
167 return true;
168 }
169 }
170
171 false
172 }
173}
174
/// Values read from a `WHEEL` file by `parse_installed_wheel_metadata`.
#[derive(Debug, Clone)]
struct InstalledWheelMetadata {
    /// Every `tag` header value; never empty when produced by the parser.
    wheel_tags: Vec<String>,
    /// First `wheel-version` header value, if present.
    wheel_version: Option<String>,
    /// First `generator` header value, if present.
    wheel_generator: Option<String>,
    /// `root-is-purelib` header parsed as a boolean; `None` when the header is
    /// missing or is neither `true` nor `false` (ASCII case-insensitive).
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`.
    compressed_tag: Option<String>,
}
183
184fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
185 let Some(parent) = path.parent() else {
186 return;
187 };
188
189 if !parent
190 .file_name()
191 .and_then(|name| name.to_str())
192 .is_some_and(|name| name.ends_with(".dist-info"))
193 {
194 return;
195 }
196
197 let wheel_path = parent.join("WHEEL");
198 if !wheel_path.exists() {
199 return;
200 }
201
202 let Ok(content) = read_file_to_string(&wheel_path) else {
203 warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
204 return;
205 };
206
207 let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
208 return;
209 };
210
211 apply_installed_wheel_metadata(package_data, &wheel_metadata);
212}
213
214fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
215 use super::rfc822::{get_header_all, get_header_first};
216
217 let metadata = super::rfc822::parse_rfc822_content(content);
218 let wheel_tags = get_header_all(&metadata.headers, "tag");
219 if wheel_tags.is_empty() {
220 return None;
221 }
222
223 let wheel_version = get_header_first(&metadata.headers, "wheel-version");
224 let wheel_generator = get_header_first(&metadata.headers, "generator");
225 let root_is_purelib =
226 get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
227 match value.to_ascii_lowercase().as_str() {
228 "true" => Some(true),
229 "false" => Some(false),
230 _ => None,
231 }
232 });
233
234 let compressed_tag = compress_wheel_tags(&wheel_tags);
235
236 Some(InstalledWheelMetadata {
237 wheel_tags,
238 wheel_version,
239 wheel_generator,
240 root_is_purelib,
241 compressed_tag,
242 })
243}
244
/// Folds several `python-abi-platform` tags into one compressed tag.
///
/// * no tags -> `None`
/// * exactly one tag -> that tag, returned unchanged (it is not validated)
/// * several tags -> the python parts joined with `.` followed by the shared
///   abi and platform, e.g. `py2.py3-none-any`
///
/// `None` is also returned when any tag has fewer than three `-` separated
/// parts, or when the tags disagree on abi or platform.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    /// Splits a tag into (python, abi, platform); the platform keeps any
    /// further `-` characters.
    fn split_tag(tag: &str) -> Option<(&str, &str, &str)> {
        let mut pieces = tag.splitn(3, '-');
        Some((pieces.next()?, pieces.next()?, pieces.next()?))
    }

    match tags {
        [] => None,
        [only] => Some(only.clone()),
        [first, rest @ ..] => {
            let (first_python, abi, platform) = split_tag(first)?;
            let mut pythons = vec![first_python];

            for tag in rest {
                let (python, tag_abi, tag_platform) = split_tag(tag)?;
                if tag_abi != abi || tag_platform != platform {
                    return None;
                }
                pythons.push(python);
            }

            Some(format!("{}-{}-{}", pythons.join("."), abi, platform))
        }
    }
}
282
283fn apply_installed_wheel_metadata(
284 package_data: &mut PackageData,
285 wheel_metadata: &InstalledWheelMetadata,
286) {
287 let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
288 extra_data.insert(
289 "wheel_tags".to_string(),
290 JsonValue::Array(
291 wheel_metadata
292 .wheel_tags
293 .iter()
294 .cloned()
295 .map(JsonValue::String)
296 .collect(),
297 ),
298 );
299
300 if let Some(wheel_version) = &wheel_metadata.wheel_version {
301 extra_data.insert(
302 "wheel_version".to_string(),
303 JsonValue::String(wheel_version.clone()),
304 );
305 }
306
307 if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
308 extra_data.insert(
309 "wheel_generator".to_string(),
310 JsonValue::String(wheel_generator.clone()),
311 );
312 }
313
314 if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
315 extra_data.insert(
316 "root_is_purelib".to_string(),
317 JsonValue::Bool(root_is_purelib),
318 );
319 }
320
321 if let (Some(name), Some(version), Some(extension)) = (
322 package_data.name.as_deref(),
323 package_data.version.as_deref(),
324 wheel_metadata.compressed_tag.as_deref(),
325 ) {
326 package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
327 }
328}
329
/// Returns `true` for a file named exactly `origin.json` that has some
/// ancestor directory called `wheels` (ASCII case-insensitive).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    let named_origin_json = matches!(
        path.file_name().and_then(|name| name.to_str()),
        Some("origin.json")
    );
    if !named_origin_json {
        return false;
    }

    // `skip(1)` drops the path itself so only its ancestors are inspected.
    path.ancestors()
        .skip(1)
        .filter_map(|dir| dir.file_name())
        .filter_map(|dir_name| dir_name.to_str())
        .any(|dir_name| dir_name.eq_ignore_ascii_case("wheels"))
}
339
340fn extract_from_pip_origin_json(path: &Path) -> PackageData {
341 let content = match read_file_to_string(path) {
342 Ok(content) => content,
343 Err(e) => {
344 warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
345 return default_package_data(path);
346 }
347 };
348
349 let root: JsonValue = match serde_json::from_str(&content) {
350 Ok(root) => root,
351 Err(e) => {
352 warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
353 return default_package_data(path);
354 }
355 };
356
357 let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
358 warn!("No url found in pip cache origin.json at {:?}", path);
359 return default_package_data(path);
360 };
361
362 let sibling_wheel = find_sibling_cached_wheel(path);
363 let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
364 sibling_wheel
365 .as_ref()
366 .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
367 });
368
369 let Some((name, version)) = name_version else {
370 warn!(
371 "Failed to infer package name/version from pip cache origin.json at {:?}",
372 path
373 );
374 return default_package_data(path);
375 };
376
377 let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
378 build_pypi_urls(Some(&name), Some(&version));
379 let purl = sibling_wheel
380 .as_ref()
381 .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
382 .or(plain_purl);
383
384 PackageData {
385 package_type: Some(PythonParser::PACKAGE_TYPE),
386 primary_language: Some("Python".to_string()),
387 name: Some(name),
388 version: Some(version),
389 datasource_id: Some(DatasourceId::PypiPipOriginJson),
390 download_url: Some(download_url.to_string()),
391 sha256: extract_sha256_from_origin_json(&root),
392 repository_homepage_url,
393 repository_download_url,
394 api_data_url,
395 purl,
396 ..Default::default()
397 }
398}
399
400fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
401 let parent = path.parent()?;
402 let entries = parent.read_dir().ok()?;
403
404 for entry in entries.flatten() {
405 let sibling_path = entry.path();
406 if sibling_path
407 .extension()
408 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
409 && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
410 {
411 return Some(wheel_info);
412 }
413 }
414
415 None
416}
417
418fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
419 let file_name = url.rsplit('/').next()?;
420
421 if file_name.ends_with(".whl") {
422 return parse_wheel_filename(Path::new(file_name))
423 .map(|wheel_info| (wheel_info.name, wheel_info.version));
424 }
425
426 let stem = strip_python_archive_extension(file_name)?;
427 let (name, version) = stem.rsplit_once('-')?;
428 if name.is_empty() || version.is_empty() {
429 return None;
430 }
431
432 Some((name.replace('_', "-"), version.to_string()))
433}
434
/// Removes a recognised Python archive extension from `file_name`.
///
/// Recognised suffixes are `.tar.gz`, `.tar.bz2`, `.tar.xz`, `.tgz`, `.zip`
/// and `.whl`. Matching is ASCII case-insensitive, so `Foo-1.0.TAR.GZ`
/// yields `Foo-1.0`; the returned stem keeps its original casing. Archive
/// detection lower-cases the file name before matching, while other callers
/// pass the name as found on disk, so a case-sensitive match here would
/// make them disagree about the same file.
///
/// Returns `None` when no recognised suffix is present.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const SUFFIXES: [&str; 6] = [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    SUFFIXES.iter().find_map(|suffix| {
        let split_at = file_name.len().checked_sub(suffix.len())?;
        // `get` returns `None` instead of panicking when `split_at` falls
        // inside a multi-byte character.
        let tail = file_name.get(split_at..)?;
        if tail.eq_ignore_ascii_case(suffix) {
            file_name.get(..split_at)
        } else {
            None
        }
    })
}
440
441fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
442 root.pointer("/archive_info/hashes/sha256")
443 .and_then(|value| value.as_str())
444 .map(ToOwned::to_owned)
445 .or_else(|| {
446 root.pointer("/archive_info/hash")
447 .and_then(|value| value.as_str())
448 .and_then(normalize_origin_hash)
449 })
450}
451
/// Extracts a SHA-256 digest from a hash string.
///
/// Accepts `sha256=<digest>` and `sha256:<digest>` (the digest part is not
/// validated) as well as a bare string of exactly 64 hexadecimal characters.
/// Anything else yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = ["sha256=", "sha256:"]
        .iter()
        .find_map(|prefix| hash.strip_prefix(*prefix));

    match prefixed {
        Some(digest) => Some(digest.to_string()),
        None if hash.len() == 64 && hash.bytes().all(|byte| byte.is_ascii_hexdigit()) => {
            Some(hash.to_string())
        }
        None => None,
    }
}
464
465fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
466 let content = match read_file_to_string(path) {
467 Ok(content) => content,
468 Err(e) => {
469 warn!("Failed to read metadata at {:?}: {}", path, e);
470 return default_package_data(path);
471 }
472 };
473
474 let metadata = super::rfc822::parse_rfc822_content(&content);
475 let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
476 merge_sibling_metadata_dependencies(path, &mut package_data);
477 merge_sibling_metadata_file_references(path, &mut package_data);
478 if datasource_id == DatasourceId::PypiWheelMetadata {
479 merge_sibling_wheel_metadata(path, &mut package_data);
480 }
481 package_data
482}
483
484fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
485 let mut extra_dependencies = Vec::new();
486
487 if let Some(parent) = path.parent() {
488 let direct_requires = parent.join("requires.txt");
489 if direct_requires.exists()
490 && let Ok(content) = read_file_to_string(&direct_requires)
491 {
492 extra_dependencies.extend(parse_requires_txt(&content));
493 }
494
495 let sibling_egg_info_requires = parent
496 .read_dir()
497 .ok()
498 .into_iter()
499 .flatten()
500 .flatten()
501 .find_map(|entry| {
502 let child_path = entry.path();
503 if child_path.is_dir()
504 && child_path
505 .file_name()
506 .and_then(|name| name.to_str())
507 .is_some_and(|name| name.ends_with(".egg-info"))
508 {
509 let requires = child_path.join("requires.txt");
510 requires.exists().then_some(requires)
511 } else {
512 None
513 }
514 });
515
516 if let Some(requires_path) = sibling_egg_info_requires
517 && let Ok(content) = read_file_to_string(&requires_path)
518 {
519 extra_dependencies.extend(parse_requires_txt(&content));
520 }
521 }
522
523 for dependency in extra_dependencies {
524 if !package_data.dependencies.iter().any(|existing| {
525 existing.purl == dependency.purl
526 && existing.scope == dependency.scope
527 && existing.extracted_requirement == dependency.extracted_requirement
528 && existing.extra_data == dependency.extra_data
529 }) {
530 package_data.dependencies.push(dependency);
531 }
532 }
533}
534
535fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
536 let mut extra_refs = Vec::new();
537
538 if let Some(parent) = path.parent() {
539 let record_path = parent.join("RECORD");
540 if record_path.exists()
541 && let Ok(content) = read_file_to_string(&record_path)
542 {
543 extra_refs.extend(parse_record_csv(&content));
544 }
545
546 let installed_files_path = parent.join("installed-files.txt");
547 if installed_files_path.exists()
548 && let Ok(content) = read_file_to_string(&installed_files_path)
549 {
550 extra_refs.extend(parse_installed_files_txt(&content));
551 }
552
553 let sources_path = parent.join("SOURCES.txt");
554 if sources_path.exists()
555 && let Ok(content) = read_file_to_string(&sources_path)
556 {
557 extra_refs.extend(parse_sources_txt(&content));
558 }
559 }
560
561 for file_ref in extra_refs {
562 if !package_data
563 .file_references
564 .iter()
565 .any(|existing| existing.path == file_ref.path)
566 {
567 package_data.file_references.push(file_ref);
568 }
569 }
570}
571
/// Walks every entry of `archive` and returns those that pass the safety
/// checks, without reading any entry content.
///
/// `path` and `archive_type` are only used in log messages.
///
/// Per entry, in this order:
/// 1. entries that `by_index_raw` cannot open are skipped silently;
/// 2. entries whose name `normalize_archive_entry_path` rejects are skipped
///    with a warning;
/// 3. entries whose declared uncompressed:compressed ratio exceeds
///    `MAX_COMPRESSION_RATIO` are skipped with a warning (not checked when
///    the compressed size is 0);
/// 4. entries whose declared uncompressed size exceeds `MAX_FILE_SIZE` are
///    skipped with a warning.
///
/// # Errors
///
/// Returns `Err` with a message as soon as the running total of declared
/// uncompressed sizes of accepted entries exceeds `MAX_ARCHIVE_SIZE`.
fn collect_validated_zip_entries<R: Read + std::io::Seek>(
    archive: &mut ZipArchive<R>,
    path: &Path,
    archive_type: &str,
) -> Result<Vec<ValidatedZipEntry>, String> {
    // Sum of declared uncompressed sizes of the entries accepted so far.
    let mut total_extracted = 0u64;
    let mut entries = Vec::new();

    for i in 0..archive.len() {
        if let Ok(file) = archive.by_index_raw(i) {
            let compressed_size = file.compressed_size();
            let uncompressed_size = file.size();
            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
                warn!(
                    "Skipping unsafe path in {} {:?}: {}",
                    archive_type,
                    path,
                    file.name()
                );
                continue;
            };

            // Guarding on a non-zero compressed size avoids dividing by zero.
            if compressed_size > 0 {
                let ratio = uncompressed_size as f64 / compressed_size as f64;
                if ratio > MAX_COMPRESSION_RATIO {
                    warn!(
                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
                        archive_type, path, ratio
                    );
                    continue;
                }
            }

            if uncompressed_size > MAX_FILE_SIZE {
                warn!(
                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
                );
                continue;
            }

            // Only entries that passed the per-entry checks count towards
            // the archive-wide budget; exceeding it rejects the whole archive.
            total_extracted += uncompressed_size;
            if total_extracted > MAX_ARCHIVE_SIZE {
                let msg = format!(
                    "Total extracted size exceeds limit for {} {:?}",
                    archive_type, path
                );
                warn!("{}", msg);
                return Err(msg);
            }

            entries.push(ValidatedZipEntry {
                index: i,
                name: entry_name,
            });
        }
    }

    Ok(entries)
}
632
633fn is_python_sdist_archive_path(path: &Path) -> bool {
634 detect_python_sdist_archive_format(path).is_some()
635}
636
637fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
638 let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
639
640 if !is_likely_python_sdist_filename(&file_name) {
641 return None;
642 }
643
644 if file_name.ends_with(".tar.gz") {
645 Some(PythonSdistArchiveFormat::TarGz)
646 } else if file_name.ends_with(".tgz") {
647 Some(PythonSdistArchiveFormat::Tgz)
648 } else if file_name.ends_with(".tar.bz2") {
649 Some(PythonSdistArchiveFormat::TarBz2)
650 } else if file_name.ends_with(".tar.xz") {
651 Some(PythonSdistArchiveFormat::TarXz)
652 } else if file_name.ends_with(".zip") {
653 Some(PythonSdistArchiveFormat::Zip)
654 } else {
655 None
656 }
657}
658
659fn is_likely_python_sdist_filename(file_name: &str) -> bool {
660 let Some(stem) = strip_python_archive_extension(file_name) else {
661 return false;
662 };
663
664 let Some((name, version)) = stem.rsplit_once('-') else {
665 return false;
666 };
667
668 !name.is_empty()
669 && !version.is_empty()
670 && version.chars().any(|ch| ch.is_ascii_digit())
671 && name
672 .chars()
673 .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
674}
675
676fn extract_from_sdist_archive(path: &Path) -> PackageData {
677 let metadata = match std::fs::metadata(path) {
678 Ok(m) => m,
679 Err(e) => {
680 warn!(
681 "Failed to read metadata for sdist archive {:?}: {}",
682 path, e
683 );
684 return default_package_data(path);
685 }
686 };
687
688 if metadata.len() > MAX_ARCHIVE_SIZE {
689 warn!(
690 "sdist archive too large: {} bytes (limit: {} bytes)",
691 metadata.len(),
692 MAX_ARCHIVE_SIZE
693 );
694 return default_package_data(path);
695 }
696
697 let Some(format) = detect_python_sdist_archive_format(path) else {
698 return default_package_data(path);
699 };
700
701 let mut package_data = match format {
702 PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
703 let file = match File::open(path) {
704 Ok(file) => file,
705 Err(e) => {
706 warn!("Failed to open sdist archive {:?}: {}", path, e);
707 return default_package_data(path);
708 }
709 };
710 let decoder = GzDecoder::new(file);
711 extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
712 }
713 PythonSdistArchiveFormat::TarBz2 => {
714 let file = match File::open(path) {
715 Ok(file) => file,
716 Err(e) => {
717 warn!("Failed to open sdist archive {:?}: {}", path, e);
718 return default_package_data(path);
719 }
720 };
721 let decoder = BzDecoder::new(file);
722 extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
723 }
724 PythonSdistArchiveFormat::TarXz => {
725 let file = match File::open(path) {
726 Ok(file) => file,
727 Err(e) => {
728 warn!("Failed to open sdist archive {:?}: {}", path, e);
729 return default_package_data(path);
730 }
731 };
732 let decoder = XzDecoder::new(file);
733 extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
734 }
735 PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
736 };
737
738 if package_data.package_type.is_some() {
739 let (size, sha256) = calculate_file_checksums(path);
740 package_data.size = size;
741 package_data.sha256 = sha256;
742 }
743
744 package_data
745}
746
/// Streams a (already decompressing) tar `reader` and builds package data
/// from the relevant text entries it contains.
///
/// `path` is the archive on disk, `archive_type` is a label used in log
/// messages, and `compressed_size` is the size the cumulative entry size is
/// compared against for the compression-ratio check.
///
/// Entries that cannot be read, are larger than `MAX_FILE_SIZE`, have an
/// unreadable or unsafe path, or are not relevant sdist text files are
/// skipped. The whole archive is rejected (returning
/// `default_package_data(path)`) when the entry list cannot be obtained, the
/// cumulative declared entry size exceeds `MAX_ARCHIVE_SIZE`, or the
/// cumulative size divided by `compressed_size` exceeds
/// `MAX_COMPRESSION_RATIO`.
fn extract_from_tar_sdist_archive<R: Read>(
    path: &Path,
    reader: R,
    archive_type: &str,
    compressed_size: u64,
) -> PackageData {
    let mut archive = Archive::new(reader);
    let archive_entries = match archive.entries() {
        Ok(entries) => entries,
        Err(e) => {
            warn!(
                "Failed to read {} sdist archive {:?}: {}",
                archive_type, path, e
            );
            return default_package_data(path);
        }
    };

    // Sum of declared sizes of all entries that passed the per-entry size
    // check, whether or not their content ends up being read.
    let mut total_extracted = 0u64;
    // (normalized entry path, UTF-8 content) pairs for the relevant entries.
    let mut entries = Vec::new();

    for entry_result in archive_entries {
        let mut entry = match entry_result {
            Ok(entry) => entry,
            Err(e) => {
                warn!(
                    "Failed to read {} sdist entry from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        let entry_size = entry.size();
        if entry_size > MAX_FILE_SIZE {
            warn!(
                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
                archive_type, path, entry_size, MAX_FILE_SIZE
            );
            continue;
        }

        total_extracted += entry_size;
        if total_extracted > MAX_ARCHIVE_SIZE {
            warn!(
                "Total extracted size exceeds limit for {} sdist {:?}",
                archive_type, path
            );
            return default_package_data(path);
        }

        // The ratio is computed over the running total, not per entry;
        // a zero `compressed_size` disables the check.
        if compressed_size > 0 {
            let ratio = total_extracted as f64 / compressed_size as f64;
            if ratio > MAX_COMPRESSION_RATIO {
                warn!(
                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
                    archive_type, path, ratio
                );
                return default_package_data(path);
            }
        }

        // Backslashes are turned into forward slashes before normalization.
        let entry_path = match entry.path() {
            Ok(path) => path.to_string_lossy().replace('\\', "/"),
            Err(e) => {
                warn!(
                    "Failed to get {} sdist entry path from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
            continue;
        };

        if !is_relevant_sdist_text_entry(&entry_path) {
            continue;
        }

        // Entries whose content cannot be read via `read_limited_utf8` are
        // dropped without failing the archive.
        if let Ok(content) = read_limited_utf8(
            &mut entry,
            MAX_FILE_SIZE,
            &format!("{} entry {}", archive_type, entry_path),
        ) {
            entries.push((entry_path, content));
        }
    }

    build_sdist_package_data(path, entries)
}
840
841fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
842 let file = match File::open(path) {
843 Ok(file) => file,
844 Err(e) => {
845 warn!("Failed to open zip sdist archive {:?}: {}", path, e);
846 return default_package_data(path);
847 }
848 };
849
850 let mut archive = match ZipArchive::new(file) {
851 Ok(archive) => archive,
852 Err(e) => {
853 warn!("Failed to read zip sdist archive {:?}: {}", path, e);
854 return default_package_data(path);
855 }
856 };
857
858 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
859 Ok(entries) => entries,
860 Err(_) => return default_package_data(path),
861 };
862
863 let mut entries = Vec::new();
864 for entry in validated_entries.iter() {
865 if !is_relevant_sdist_text_entry(&entry.name) {
866 continue;
867 }
868
869 if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
870 entries.push((entry.name.clone(), content));
871 }
872 }
873
874 build_sdist_package_data(path, entries)
875}
876
/// Returns `true` when `entry_path` is a `PKG-INFO`, `requires.txt` or
/// `SOURCES.txt` file located inside some directory (the path must end with
/// `/<file name>`, case-sensitively).
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
882
883fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
884 let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
885 warn!("No PKG-INFO file found in sdist archive {:?}", path);
886 return default_package_data(path);
887 };
888
889 let mut package_data =
890 python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
891 merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
892 merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
893 apply_sdist_name_version_fallback(path, &mut package_data);
894 package_data.datasource_id = Some(DatasourceId::PypiSdist);
895 package_data
896}
897
898fn select_sdist_pkginfo_entry(
899 archive_path: &Path,
900 entries: &[(String, String)],
901) -> Option<(String, String)> {
902 let expected_name = archive_path
903 .file_name()
904 .and_then(|name| name.to_str())
905 .and_then(strip_python_archive_extension)
906 .and_then(|stem| {
907 stem.rsplit_once('-')
908 .map(|(name, _)| normalize_python_package_name(name))
909 });
910
911 entries
912 .iter()
913 .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
914 .min_by_key(|(entry_path, content)| {
915 let components: Vec<_> = entry_path
916 .split('/')
917 .filter(|part| !part.is_empty())
918 .collect();
919 let metadata = super::rfc822::parse_rfc822_content(content);
920 let candidate_name = super::rfc822::get_header_first(&metadata.headers, "name")
921 .map(|name| normalize_python_package_name(&name));
922 let name_rank = if candidate_name == expected_name {
923 0
924 } else {
925 1
926 };
927 let kind_rank = if components.len() == 3
928 && components[1].ends_with(".egg-info")
929 && components[2] == "PKG-INFO"
930 {
931 0
932 } else if components.len() == 2 && components[1] == "PKG-INFO" {
933 1
934 } else if entry_path.ends_with(".egg-info/PKG-INFO") {
935 2
936 } else {
937 3
938 };
939
940 (name_rank, kind_rank, components.len(), entry_path.clone())
941 })
942 .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
943}
944
945fn merge_sdist_archive_dependencies(
946 entries: &[(String, String)],
947 metadata_path: &str,
948 package_data: &mut PackageData,
949) {
950 let metadata_dir = metadata_path
951 .rsplit_once('/')
952 .map(|(dir, _)| dir)
953 .unwrap_or("");
954 let archive_root = metadata_path.split('/').next().unwrap_or("");
955 let matched_egg_info_dir =
956 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
957 let mut extra_dependencies = Vec::new();
958
959 for (entry_path, content) in entries {
960 let is_direct_requires =
961 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
962 let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
963 entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
964 });
965
966 if is_direct_requires || is_egg_info_requires {
967 extra_dependencies.extend(parse_requires_txt(content));
968 }
969 }
970
971 for dependency in extra_dependencies {
972 if !package_data.dependencies.iter().any(|existing| {
973 existing.purl == dependency.purl
974 && existing.scope == dependency.scope
975 && existing.extracted_requirement == dependency.extracted_requirement
976 && existing.extra_data == dependency.extra_data
977 }) {
978 package_data.dependencies.push(dependency);
979 }
980 }
981}
982
983fn merge_sdist_archive_file_references(
984 entries: &[(String, String)],
985 metadata_path: &str,
986 package_data: &mut PackageData,
987) {
988 let metadata_dir = metadata_path
989 .rsplit_once('/')
990 .map(|(dir, _)| dir)
991 .unwrap_or("");
992 let archive_root = metadata_path.split('/').next().unwrap_or("");
993 let matched_egg_info_dir =
994 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
995 let mut extra_refs = Vec::new();
996
997 for (entry_path, content) in entries {
998 let is_direct_sources =
999 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1000 let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1001 entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1002 });
1003
1004 if is_direct_sources || is_egg_info_sources {
1005 extra_refs.extend(parse_sources_txt(content));
1006 }
1007 }
1008
1009 for file_ref in extra_refs {
1010 if !package_data
1011 .file_references
1012 .iter()
1013 .any(|existing| existing.path == file_ref.path)
1014 {
1015 package_data.file_references.push(file_ref);
1016 }
1017 }
1018}
1019
1020fn select_matching_sdist_egg_info_dir(
1021 entries: &[(String, String)],
1022 archive_root: &str,
1023 package_name: Option<&str>,
1024) -> Option<String> {
1025 let normalized_package_name = package_name.map(normalize_python_package_name);
1026
1027 entries
1028 .iter()
1029 .filter_map(|(entry_path, _)| {
1030 let components: Vec<_> = entry_path
1031 .split('/')
1032 .filter(|part| !part.is_empty())
1033 .collect();
1034 if components.len() == 3
1035 && components[0] == archive_root
1036 && components[1].ends_with(".egg-info")
1037 {
1038 Some(components[1].to_string())
1039 } else {
1040 None
1041 }
1042 })
1043 .min_by_key(|egg_info_dir| {
1044 let normalized_dir_name =
1045 normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1046 let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1047 0
1048 } else {
1049 1
1050 };
1051
1052 (name_rank, egg_info_dir.clone())
1053 })
1054}
1055
/// Normalizes a package name for comparison: ASCII letters are lower-cased
/// and every `_` becomes `-`. All other characters are left untouched.
fn normalize_python_package_name(name: &str) -> String {
    name.chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
1059
1060fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1061 let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1062 return;
1063 };
1064
1065 let Some(stem) = strip_python_archive_extension(file_name) else {
1066 return;
1067 };
1068
1069 let Some((name, version)) = stem.rsplit_once('-') else {
1070 return;
1071 };
1072
1073 if package_data.name.is_none() {
1074 package_data.name = Some(name.replace('_', "-"));
1075 }
1076 if package_data.version.is_none() {
1077 package_data.version = Some(version.to_string());
1078 }
1079
1080 if package_data.purl.is_none()
1081 || package_data.repository_homepage_url.is_none()
1082 || package_data.repository_download_url.is_none()
1083 || package_data.api_data_url.is_none()
1084 {
1085 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1086 build_pypi_urls(
1087 package_data.name.as_deref(),
1088 package_data.version.as_deref(),
1089 );
1090
1091 if package_data.repository_homepage_url.is_none() {
1092 package_data.repository_homepage_url = repository_homepage_url;
1093 }
1094 if package_data.repository_download_url.is_none() {
1095 package_data.repository_download_url = repository_download_url;
1096 }
1097 if package_data.api_data_url.is_none() {
1098 package_data.api_data_url = api_data_url;
1099 }
1100 if package_data.purl.is_none() {
1101 package_data.purl = purl;
1102 }
1103 }
1104}
1105
1106fn extract_from_wheel_archive(path: &Path) -> PackageData {
1107 let metadata = match std::fs::metadata(path) {
1108 Ok(m) => m,
1109 Err(e) => {
1110 warn!(
1111 "Failed to read metadata for wheel archive {:?}: {}",
1112 path, e
1113 );
1114 return default_package_data(path);
1115 }
1116 };
1117
1118 if metadata.len() > MAX_ARCHIVE_SIZE {
1119 warn!(
1120 "Wheel archive too large: {} bytes (limit: {} bytes)",
1121 metadata.len(),
1122 MAX_ARCHIVE_SIZE
1123 );
1124 return default_package_data(path);
1125 }
1126
1127 let file = match File::open(path) {
1128 Ok(f) => f,
1129 Err(e) => {
1130 warn!("Failed to open wheel archive {:?}: {}", path, e);
1131 return default_package_data(path);
1132 }
1133 };
1134
1135 let mut archive = match ZipArchive::new(file) {
1136 Ok(a) => a,
1137 Err(e) => {
1138 warn!("Failed to read wheel archive {:?}: {}", path, e);
1139 return default_package_data(path);
1140 }
1141 };
1142
1143 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1144 Ok(entries) => entries,
1145 Err(_) => return default_package_data(path),
1146 };
1147
1148 let metadata_entry =
1149 match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1150 Some(entry) => entry,
1151 None => {
1152 warn!("No METADATA file found in wheel archive {:?}", path);
1153 return default_package_data(path);
1154 }
1155 };
1156
1157 let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1158 Ok(c) => c,
1159 Err(e) => {
1160 warn!("Failed to read METADATA from {:?}: {}", path, e);
1161 return default_package_data(path);
1162 }
1163 };
1164
1165 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1166
1167 let (size, sha256) = calculate_file_checksums(path);
1168 package_data.size = size;
1169 package_data.sha256 = sha256;
1170
1171 if let Some(record_entry) =
1172 find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1173 && let Ok(record_content) =
1174 read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1175 {
1176 package_data.file_references = parse_record_csv(&record_content);
1177 }
1178
1179 if let Some(wheel_info) = parse_wheel_filename(path) {
1180 if package_data.name.is_none() {
1181 package_data.name = Some(wheel_info.name.clone());
1182 }
1183 if package_data.version.is_none() {
1184 package_data.version = Some(wheel_info.version.clone());
1185 }
1186
1187 package_data.qualifiers = Some(std::collections::HashMap::from([(
1188 "extension".to_string(),
1189 format!(
1190 "{}-{}-{}",
1191 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1192 ),
1193 )]));
1194
1195 package_data.purl = build_wheel_purl(
1196 package_data.name.as_deref(),
1197 package_data.version.as_deref(),
1198 &wheel_info,
1199 );
1200
1201 let mut extra_data = package_data.extra_data.unwrap_or_default();
1202 extra_data.insert(
1203 "python_requires".to_string(),
1204 serde_json::Value::String(wheel_info.python_tag.clone()),
1205 );
1206 extra_data.insert(
1207 "abi_tag".to_string(),
1208 serde_json::Value::String(wheel_info.abi_tag.clone()),
1209 );
1210 extra_data.insert(
1211 "platform_tag".to_string(),
1212 serde_json::Value::String(wheel_info.platform_tag.clone()),
1213 );
1214 package_data.extra_data = Some(extra_data);
1215 }
1216
1217 package_data
1218}
1219
1220fn extract_from_egg_archive(path: &Path) -> PackageData {
1221 let metadata = match std::fs::metadata(path) {
1222 Ok(m) => m,
1223 Err(e) => {
1224 warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1225 return default_package_data(path);
1226 }
1227 };
1228
1229 if metadata.len() > MAX_ARCHIVE_SIZE {
1230 warn!(
1231 "Egg archive too large: {} bytes (limit: {} bytes)",
1232 metadata.len(),
1233 MAX_ARCHIVE_SIZE
1234 );
1235 return default_package_data(path);
1236 }
1237
1238 let file = match File::open(path) {
1239 Ok(f) => f,
1240 Err(e) => {
1241 warn!("Failed to open egg archive {:?}: {}", path, e);
1242 return default_package_data(path);
1243 }
1244 };
1245
1246 let mut archive = match ZipArchive::new(file) {
1247 Ok(a) => a,
1248 Err(e) => {
1249 warn!("Failed to read egg archive {:?}: {}", path, e);
1250 return default_package_data(path);
1251 }
1252 };
1253
1254 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1255 Ok(entries) => entries,
1256 Err(_) => return default_package_data(path),
1257 };
1258
1259 let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1260 &validated_entries,
1261 &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1262 ) {
1263 Some(entry) => entry,
1264 None => {
1265 warn!("No PKG-INFO file found in egg archive {:?}", path);
1266 return default_package_data(path);
1267 }
1268 };
1269
1270 let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1271 Ok(c) => c,
1272 Err(e) => {
1273 warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1274 return default_package_data(path);
1275 }
1276 };
1277
1278 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1279
1280 let (size, sha256) = calculate_file_checksums(path);
1281 package_data.size = size;
1282 package_data.sha256 = sha256;
1283
1284 if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1285 &validated_entries,
1286 &[
1287 "EGG-INFO/installed-files.txt",
1288 ".egg-info/installed-files.txt",
1289 ],
1290 ) && let Ok(installed_files_content) =
1291 read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1292 {
1293 package_data.file_references = parse_installed_files_txt(&installed_files_content);
1294 }
1295
1296 if let Some(egg_info) = parse_egg_filename(path) {
1297 if package_data.name.is_none() {
1298 package_data.name = Some(egg_info.name.clone());
1299 }
1300 if package_data.version.is_none() {
1301 package_data.version = Some(egg_info.version.clone());
1302 }
1303
1304 if let Some(python_version) = &egg_info.python_version {
1305 let mut extra_data = package_data.extra_data.unwrap_or_default();
1306 extra_data.insert(
1307 "python_version".to_string(),
1308 serde_json::Value::String(python_version.clone()),
1309 );
1310 package_data.extra_data = Some(extra_data);
1311 }
1312 }
1313
1314 package_data.purl = build_egg_purl(
1315 package_data.name.as_deref(),
1316 package_data.version.as_deref(),
1317 );
1318
1319 package_data
1320}
1321
1322fn find_validated_zip_entry_by_suffix<'a>(
1323 entries: &'a [ValidatedZipEntry],
1324 suffix: &str,
1325) -> Option<&'a ValidatedZipEntry> {
1326 entries.iter().find(|entry| entry.name.ends_with(suffix))
1327}
1328
1329fn find_validated_zip_entry_by_any_suffix<'a>(
1330 entries: &'a [ValidatedZipEntry],
1331 suffixes: &[&str],
1332) -> Option<&'a ValidatedZipEntry> {
1333 entries
1334 .iter()
1335 .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1336}
1337
1338fn read_validated_zip_entry<R: Read + std::io::Seek>(
1339 archive: &mut ZipArchive<R>,
1340 entry: &ValidatedZipEntry,
1341 path: &Path,
1342 archive_type: &str,
1343) -> Result<String, String> {
1344 let mut file = archive
1345 .by_index(entry.index)
1346 .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1347
1348 let compressed_size = file.compressed_size();
1349 let uncompressed_size = file.size();
1350
1351 if compressed_size > 0 {
1352 let ratio = uncompressed_size as f64 / compressed_size as f64;
1353 if ratio > MAX_COMPRESSION_RATIO {
1354 return Err(format!(
1355 "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1356 archive_type, path, ratio
1357 ));
1358 }
1359 }
1360
1361 if uncompressed_size > MAX_FILE_SIZE {
1362 return Err(format!(
1363 "Rejected oversized entry in {} {:?}: {} bytes",
1364 archive_type, path, uncompressed_size
1365 ));
1366 }
1367
1368 read_limited_utf8(
1369 &mut file,
1370 MAX_FILE_SIZE,
1371 &format!("{} entry {}", archive_type, entry.name),
1372 )
1373}
1374
/// Reads at most `max_bytes` bytes from `reader` and decodes them as UTF-8.
///
/// `context` names the source being read and is embedded in every error message.
///
/// # Errors
/// Returns a message when the underlying read fails, when the stream holds more
/// than `max_bytes` bytes, or when the bytes are not valid UTF-8.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    // Allow one byte past the limit so a stream that is too long can be told
    // apart from one that ends exactly at the limit. `saturating_add` keeps
    // `max_bytes == u64::MAX` from overflowing.
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if bytes.len() as u64 > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1395
/// Normalizes an archive member path to a relative, `/`-separated form.
///
/// Backslashes are treated as separators and `.` segments are dropped.
/// Returns `None` for paths that start with a drive letter followed by `:/`,
/// that are rooted, that contain a `..` segment, or that have no remaining
/// segments after normalization.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Windows-style absolute paths (`C:/...`) are rejected on every platform.
    if matches!(unified.as_bytes(), [drive, b':', b'/', ..] if drive.is_ascii_alphabetic()) {
        return None;
    }

    // `None` is skipped, `Some(None)` aborts the whole collection.
    let segments = Path::new(&unified)
        .components()
        .filter_map(|part| match part {
            Component::CurDir => None,
            Component::Normal(segment) => Some(Some(segment.to_string_lossy().into_owned())),
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => Some(None),
        })
        .collect::<Option<Vec<String>>>()?;

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1417
1418pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1423 let mut reader = ReaderBuilder::new()
1424 .has_headers(false)
1425 .from_reader(content.as_bytes());
1426
1427 let mut file_references = Vec::new();
1428
1429 for result in reader.records() {
1430 match result {
1431 Ok(record) => {
1432 if record.len() < 3 {
1433 continue;
1434 }
1435
1436 let path = record.get(0).unwrap_or("").trim().to_string();
1437 if path.is_empty() {
1438 continue;
1439 }
1440
1441 let hash_field = record.get(1).unwrap_or("").trim();
1442 let size_field = record.get(2).unwrap_or("").trim();
1443
1444 let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1446 let parts: Vec<&str> = hash_field.split('=').collect();
1447 if parts.len() == 2 && parts[0] == "sha256" {
1448 match URL_SAFE_NO_PAD.decode(parts[1]) {
1450 Ok(decoded) => {
1451 let hex = decoded
1452 .iter()
1453 .map(|b| format!("{:02x}", b))
1454 .collect::<String>();
1455 Some(hex)
1456 }
1457 Err(_) => None,
1458 }
1459 } else {
1460 None
1461 }
1462 } else {
1463 None
1464 };
1465
1466 let size = if !size_field.is_empty() && size_field != "-" {
1468 size_field.parse::<u64>().ok()
1469 } else {
1470 None
1471 };
1472
1473 file_references.push(FileReference {
1474 path,
1475 size,
1476 sha1: None,
1477 md5: None,
1478 sha256,
1479 sha512: None,
1480 extra_data: None,
1481 });
1482 }
1483 Err(e) => {
1484 warn!("Failed to parse RECORD CSV row: {}", e);
1485 continue;
1486 }
1487 }
1488 }
1489
1490 file_references
1491}
1492
1493pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1496 content
1497 .lines()
1498 .map(|line| line.trim())
1499 .filter(|line| !line.is_empty())
1500 .map(|path| FileReference {
1501 path: path.to_string(),
1502 size: None,
1503 sha1: None,
1504 md5: None,
1505 sha256: None,
1506 sha512: None,
1507 extra_data: None,
1508 })
1509 .collect()
1510}
1511
1512pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1513 content
1514 .lines()
1515 .map(str::trim)
1516 .filter(|line| !line.is_empty())
1517 .map(|path| FileReference {
1518 path: path.to_string(),
1519 size: None,
1520 sha1: None,
1521 md5: None,
1522 sha256: None,
1523 sha512: None,
1524 extra_data: None,
1525 })
1526 .collect()
1527}
1528
/// Components decoded from a wheel file name.
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version segment, copied verbatim.
    version: String,
    /// Python tag segment (for example `py3`).
    python_tag: String,
    /// ABI tag segment (for example `none`).
    abi_tag: String,
    /// Platform tag segment; any further `-`-separated segments are joined back with `-`.
    platform_tag: String,
}

/// Splits the file stem of `path` into wheel name components.
///
/// Wheel file names follow
/// `{name}-{version}(-{build})?-{python}-{abi}-{platform}.whl`. The optional
/// build tag must start with a digit, which is how it is told apart from the
/// python tag here; when present it is skipped so it no longer shifts the
/// python/abi/platform tags.
///
/// Returns `None` when the path has no file stem or the stem has fewer than
/// five `-`-separated segments.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    // A build tag needs a sixth segment and starts with a digit.
    let has_build_tag =
        parts.len() >= 6 && parts[2].starts_with(|c: char| c.is_ascii_digit());
    // Either way at least three segments remain for the tags.
    let tags = if has_build_tag {
        &parts[3..]
    } else {
        &parts[2..]
    };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: tags[0].to_string(),
        abi_tag: tags[1].to_string(),
        platform_tag: tags[2..].join("-"),
    })
}
1553
/// Components decoded from an egg file name.
struct EggInfo {
    /// First `-`-separated segment with `_` replaced by `-`.
    name: String,
    /// Second `-`-separated segment, copied verbatim.
    version: String,
    /// Third `-`-separated segment, when the file name has one.
    python_version: Option<String>,
}

/// Splits the file stem of `path` on `-` into name, version and an optional
/// third segment stored as the Python version.
///
/// Returns `None` when the path has no file stem or the stem contains no `-`.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut segments = stem.split('-');

    let raw_name = segments.next()?;
    let raw_version = segments.next()?;

    Some(EggInfo {
        name: raw_name.replace('_', "-"),
        version: raw_version.to_string(),
        python_version: segments.next().map(String::from),
    })
}
1574
1575fn build_wheel_purl(
1576 name: Option<&str>,
1577 version: Option<&str>,
1578 wheel_info: &WheelInfo,
1579) -> Option<String> {
1580 let name = name?;
1581 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1582
1583 if let Some(ver) = version {
1584 package_url.with_version(ver).ok()?;
1585 }
1586
1587 let extension = format!(
1588 "{}-{}-{}",
1589 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1590 );
1591 package_url.add_qualifier("extension", extension).ok()?;
1592
1593 Some(package_url.to_string())
1594}
1595
1596fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1597 let name = name?;
1598 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1599
1600 if let Some(ver) = version {
1601 package_url.with_version(ver).ok()?;
1602 }
1603
1604 package_url.add_qualifier("type", "egg").ok()?;
1605
1606 Some(package_url.to_string())
1607}
1608
1609fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1610 let metadata = super::rfc822::parse_rfc822_content(content);
1611 build_package_data_from_rfc822(&metadata, datasource_id)
1612}
1613
1614fn build_package_data_from_rfc822(
1619 metadata: &super::rfc822::Rfc822Metadata,
1620 datasource_id: DatasourceId,
1621) -> PackageData {
1622 use super::rfc822::{get_header_all, get_header_first};
1623
1624 let name = get_header_first(&metadata.headers, "name");
1625 let version = get_header_first(&metadata.headers, "version");
1626 let summary = get_header_first(&metadata.headers, "summary");
1627 let mut homepage_url = get_header_first(&metadata.headers, "home-page");
1628 let author = get_header_first(&metadata.headers, "author");
1629 let author_email = get_header_first(&metadata.headers, "author-email");
1630 let license = get_header_first(&metadata.headers, "license");
1631 let license_expression = get_header_first(&metadata.headers, "license-expression");
1632 let download_url = get_header_first(&metadata.headers, "download-url");
1633 let platform = get_header_first(&metadata.headers, "platform");
1634 let requires_python = get_header_first(&metadata.headers, "requires-python");
1635 let classifiers = get_header_all(&metadata.headers, "classifier");
1636 let license_files = get_header_all(&metadata.headers, "license-file");
1637
1638 let description_body = if metadata.body.is_empty() {
1639 get_header_first(&metadata.headers, "description").unwrap_or_default()
1640 } else {
1641 metadata.body.clone()
1642 };
1643
1644 let description = build_description(summary.as_deref(), &description_body);
1645
1646 let mut parties = Vec::new();
1647 if author.is_some() || author_email.is_some() {
1648 parties.push(Party {
1649 r#type: Some("person".to_string()),
1650 role: Some("author".to_string()),
1651 name: author,
1652 email: author_email,
1653 url: None,
1654 organization: None,
1655 organization_url: None,
1656 timezone: None,
1657 });
1658 }
1659
1660 let (keywords, license_classifiers) = split_classifiers(&classifiers);
1661 let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1662 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1663 license_expression
1664 .as_deref()
1665 .and_then(normalize_spdx_expression)
1666 .map(|normalized| {
1667 build_declared_license_data(
1668 normalized,
1669 DeclaredLicenseMatchMetadata::single_line(
1670 license_expression.as_deref().unwrap_or_default(),
1671 )
1672 .with_referenced_filenames(&referenced_license_files),
1673 )
1674 })
1675 .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1676
1677 let extracted_license_statement = license_expression
1678 .clone()
1679 .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1680
1681 let mut extra_data = HashMap::new();
1682 if let Some(platform_value) = platform
1683 && !platform_value.eq_ignore_ascii_case("unknown")
1684 && !platform_value.is_empty()
1685 {
1686 extra_data.insert(
1687 "platform".to_string(),
1688 serde_json::Value::String(platform_value),
1689 );
1690 }
1691
1692 if let Some(requires_python_value) = requires_python
1693 && !requires_python_value.is_empty()
1694 {
1695 extra_data.insert(
1696 "requires_python".to_string(),
1697 serde_json::Value::String(requires_python_value),
1698 );
1699 }
1700
1701 if !license_files.is_empty() {
1702 extra_data.insert(
1703 "license_files".to_string(),
1704 serde_json::Value::Array(
1705 license_files
1706 .iter()
1707 .cloned()
1708 .map(serde_json::Value::String)
1709 .collect(),
1710 ),
1711 );
1712 }
1713
1714 let file_references = license_files
1715 .iter()
1716 .map(|path| FileReference {
1717 path: path.clone(),
1718 size: None,
1719 sha1: None,
1720 md5: None,
1721 sha256: None,
1722 sha512: None,
1723 extra_data: None,
1724 })
1725 .collect();
1726
1727 let project_urls = get_header_all(&metadata.headers, "project-url");
1728 let dependencies = extract_rfc822_dependencies(&metadata.headers);
1729 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1730
1731 if !project_urls.is_empty() {
1732 let parsed_urls = parse_project_urls(&project_urls);
1733
1734 for (label, url) in &parsed_urls {
1735 let label_lower = label.to_lowercase();
1736
1737 if bug_tracking_url.is_none()
1738 && matches!(
1739 label_lower.as_str(),
1740 "tracker"
1741 | "bug reports"
1742 | "bug tracker"
1743 | "issues"
1744 | "issue tracker"
1745 | "github: issues"
1746 )
1747 {
1748 bug_tracking_url = Some(url.clone());
1749 } else if code_view_url.is_none()
1750 && matches!(label_lower.as_str(), "source" | "source code" | "code")
1751 {
1752 code_view_url = Some(url.clone());
1753 } else if vcs_url.is_none()
1754 && matches!(
1755 label_lower.as_str(),
1756 "github" | "gitlab" | "github: repo" | "repository"
1757 )
1758 {
1759 vcs_url = Some(url.clone());
1760 } else if homepage_url.is_none()
1761 && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1762 {
1763 homepage_url = Some(url.clone());
1764 } else if label_lower == "changelog" {
1765 extra_data.insert(
1766 "changelog_url".to_string(),
1767 serde_json::Value::String(url.clone()),
1768 );
1769 }
1770 }
1771
1772 let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
1773 .iter()
1774 .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
1775 .collect();
1776
1777 if !project_urls_json.is_empty() {
1778 extra_data.insert(
1779 "project_urls".to_string(),
1780 serde_json::Value::Object(project_urls_json),
1781 );
1782 }
1783 }
1784
1785 let extra_data = if extra_data.is_empty() {
1786 None
1787 } else {
1788 Some(extra_data)
1789 };
1790
1791 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1792 build_pypi_urls(name.as_deref(), version.as_deref());
1793
1794 PackageData {
1795 package_type: Some(PythonParser::PACKAGE_TYPE),
1796 namespace: None,
1797 name,
1798 version,
1799 qualifiers: None,
1800 subpath: None,
1801 primary_language: Some("Python".to_string()),
1802 description,
1803 release_date: None,
1804 parties,
1805 keywords,
1806 homepage_url,
1807 download_url,
1808 size: None,
1809 sha1: None,
1810 md5: None,
1811 sha256: None,
1812 sha512: None,
1813 bug_tracking_url,
1814 code_view_url,
1815 vcs_url,
1816 copyright: None,
1817 holder: None,
1818 declared_license_expression,
1819 declared_license_expression_spdx,
1820 license_detections,
1821 other_license_expression: None,
1822 other_license_expression_spdx: None,
1823 other_license_detections: Vec::new(),
1824 extracted_license_statement,
1825 notice_text: None,
1826 source_packages: Vec::new(),
1827 file_references,
1828 is_private: false,
1829 is_virtual: false,
1830 extra_data,
1831 dependencies,
1832 repository_homepage_url,
1833 repository_download_url,
1834 api_data_url,
1835 datasource_id: Some(datasource_id),
1836 purl,
1837 }
1838}
1839
/// Splits `Project-URL` header values into `(label, url)` pairs.
///
/// Each value is split at its first comma and both halves are trimmed, so
/// `"Label, URL"` and `"Label,URL"` are both accepted. Values without a comma,
/// or with an empty label or URL after trimming, are dropped. Input order is
/// preserved.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|url_entry| {
            // Split on the bare comma: requiring ", " would discard entries
            // written without a space after the separator.
            let (label, url) = url_entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());

            (!label.is_empty() && !url.is_empty()).then(|| (label.to_string(), url.to_string()))
        })
        .collect()
}
1855
/// Combines the optional `summary` and the `body` into one description.
///
/// Both parts are trimmed and blank parts are ignored. When both are present
/// they are joined with a single newline, summary first. Returns `None` when
/// neither part has content.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let headline = summary.map(str::trim).filter(|text| !text.is_empty());
    let details = Some(body.trim()).filter(|text| !text.is_empty());

    match (headline, details) {
        (Some(headline), Some(details)) => Some(format!("{}\n{}", headline, details)),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
1874
/// Separates trove classifiers into `(keywords, license_classifiers)`.
///
/// Classifiers starting with `License ::` go to the second list and everything
/// else to the first; relative order is preserved within each list.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|classifier| classifier.starts_with("License ::"));

    (keywords, license_classifiers)
}
1889
/// Renders the raw license information as a small multi-line statement.
///
/// A non-blank `license` value produces a `license: <value>` line (trimmed).
/// A non-empty `license_classifiers` list produces a `classifiers:` line
/// followed by one quoted list item per classifier. Every line ends with a
/// newline. Returns `None` when neither input contributes a line.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let license_line = license
        .map(str::trim)
        .filter(|value| !value.is_empty())
        .map(|value| format!("license: {}", value));

    let classifier_header =
        (!license_classifiers.is_empty()).then(|| "classifiers:".to_string());
    let classifier_items = license_classifiers
        .iter()
        .map(|classifier| format!(" - '{}'", classifier));

    let lines: Vec<String> = license_line
        .into_iter()
        .chain(classifier_header)
        .chain(classifier_items)
        .collect();

    (!lines.is_empty()).then(|| format!("{}\n", lines.join("\n")))
}
1915
1916pub(crate) fn build_pypi_urls(
1917 name: Option<&str>,
1918 version: Option<&str>,
1919) -> (
1920 Option<String>,
1921 Option<String>,
1922 Option<String>,
1923 Option<String>,
1924) {
1925 let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
1926
1927 let repository_download_url = name.and_then(|value| {
1928 version.map(|ver| {
1929 format!(
1930 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
1931 &value[..1.min(value.len())],
1932 value,
1933 value,
1934 ver
1935 )
1936 })
1937 });
1938
1939 let api_data_url = name.map(|value| {
1940 if let Some(ver) = version {
1941 format!("https://pypi.org/pypi/{}/{}/json", value, ver)
1942 } else {
1943 format!("https://pypi.org/pypi/{}/json", value)
1944 }
1945 });
1946
1947 let purl = name.and_then(|value| {
1948 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
1949 if let Some(ver) = version {
1950 package_url.with_version(ver).ok()?;
1951 }
1952 Some(package_url.to_string())
1953 });
1954
1955 (
1956 repository_homepage_url,
1957 repository_download_url,
1958 api_data_url,
1959 purl,
1960 )
1961}
1962
1963fn build_pypi_purl_with_extension(
1964 name: &str,
1965 version: Option<&str>,
1966 extension: &str,
1967) -> Option<String> {
1968 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1969 if let Some(ver) = version {
1970 package_url.with_version(ver).ok()?;
1971 }
1972 package_url.add_qualifier("extension", extension).ok()?;
1973 Some(package_url.to_string())
1974}
1975
1976fn extract_from_pyproject_toml(path: &Path) -> PackageData {
1977 let toml_content = match read_toml_file(path) {
1978 Ok(content) => content,
1979 Err(e) => {
1980 warn!(
1981 "Failed to read or parse pyproject.toml at {:?}: {}",
1982 path, e
1983 );
1984 return default_package_data(path);
1985 }
1986 };
1987
1988 let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
1989 let is_poetry_pyproject = tool_table
1990 .and_then(|tool| tool.get("poetry"))
1991 .and_then(|value| value.as_table())
1992 .is_some();
1993
1994 let project_table =
1996 if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
1997 project.clone()
1999 } else if let Some(tool) = tool_table {
2000 if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2001 poetry.clone()
2003 } else {
2004 warn!(
2005 "No project or tool.poetry data found in pyproject.toml at {:?}",
2006 path
2007 );
2008 return default_package_data(path);
2009 }
2010 } else if toml_content.get(FIELD_NAME).is_some() {
2011 match toml_content.as_table() {
2013 Some(table) => table.clone(),
2014 None => {
2015 warn!("Failed to convert TOML content to table in {:?}", path);
2016 return default_package_data(path);
2017 }
2018 }
2019 } else {
2020 warn!("No project data found in pyproject.toml at {:?}", path);
2021 return default_package_data(path);
2022 };
2023
2024 let name = project_table
2025 .get(FIELD_NAME)
2026 .and_then(|v| v.as_str())
2027 .map(String::from);
2028
2029 let version = project_table
2030 .get(FIELD_VERSION)
2031 .and_then(|v| v.as_str())
2032 .map(String::from);
2033 let classifiers = project_table
2034 .get("classifiers")
2035 .and_then(|value| value.as_array())
2036 .map(|values| {
2037 values
2038 .iter()
2039 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2040 .collect::<Vec<_>>()
2041 })
2042 .unwrap_or_default();
2043
2044 let extracted_license_statement = extract_raw_license_string(&project_table);
2045 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2046 normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2047
2048 let (homepage_url, repository_url) = extract_urls(&project_table);
2050
2051 let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2052 let extra_data = extract_pyproject_extra_data(&toml_content);
2053
2054 let purl = name.as_ref().and_then(|n| {
2056 let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2057 Ok(p) => p,
2058 Err(e) => {
2059 warn!(
2060 "Failed to create PackageUrl for Python package '{}': {}",
2061 n, e
2062 );
2063 return None;
2064 }
2065 };
2066
2067 if let Some(v) = &version
2068 && let Err(e) = package_url.with_version(v)
2069 {
2070 warn!(
2071 "Failed to set version '{}' for Python package '{}': {}",
2072 v, n, e
2073 );
2074 return None;
2075 }
2076
2077 Some(package_url.to_string())
2078 });
2079
2080 let api_data_url = name.as_ref().map(|n| {
2081 if let Some(v) = &version {
2082 format!("https://pypi.org/pypi/{}/{}/json", n, v)
2083 } else {
2084 format!("https://pypi.org/pypi/{}/json", n)
2085 }
2086 });
2087
2088 let pypi_homepage_url = name
2089 .as_ref()
2090 .map(|n| format!("https://pypi.org/project/{}", n));
2091
2092 let pypi_download_url = name.as_ref().and_then(|n| {
2093 version.as_ref().map(|v| {
2094 format!(
2095 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2096 &n[..1.min(n.len())],
2097 n,
2098 n,
2099 v
2100 )
2101 })
2102 });
2103
2104 PackageData {
2105 package_type: Some(PythonParser::PACKAGE_TYPE),
2106 namespace: None,
2107 name,
2108 version,
2109 qualifiers: None,
2110 subpath: None,
2111 primary_language: None,
2112 description: None,
2113 release_date: None,
2114 parties: extract_parties(&project_table),
2115 keywords: Vec::new(),
2116 homepage_url: homepage_url.or(pypi_homepage_url),
2117 download_url: repository_url.clone().or(pypi_download_url),
2118 size: None,
2119 sha1: None,
2120 md5: None,
2121 sha256: None,
2122 sha512: None,
2123 bug_tracking_url: None,
2124 code_view_url: None,
2125 vcs_url: repository_url,
2126 copyright: None,
2127 holder: None,
2128 declared_license_expression,
2129 declared_license_expression_spdx,
2130 license_detections,
2131 other_license_expression: None,
2132 other_license_expression_spdx: None,
2133 other_license_detections: Vec::new(),
2134 extracted_license_statement,
2135 notice_text: None,
2136 source_packages: Vec::new(),
2137 file_references: Vec::new(),
2138 is_private: has_private_classifier(&classifiers),
2139 is_virtual: false,
2140 extra_data,
2141 dependencies: [dependencies, optional_dependencies].concat(),
2142 repository_homepage_url: None,
2143 repository_download_url: None,
2144 api_data_url,
2145 datasource_id: Some(if is_poetry_pyproject {
2146 DatasourceId::PypiPoetryPyprojectToml
2147 } else {
2148 DatasourceId::PypiPyprojectToml
2149 }),
2150 purl,
2151 }
2152}
2153
2154fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2155 let path_str = path.to_string_lossy().replace('\\', "/");
2156 if path_str.contains("/EGG-INFO/PKG-INFO") {
2157 DatasourceId::PypiEggPkginfo
2158 } else if path_str.ends_with(".egg-info/PKG-INFO") {
2159 DatasourceId::PypiEditableEggPkginfo
2160 } else {
2161 DatasourceId::PypiSdistPkginfo
2162 }
2163}
2164
2165fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2166 project
2167 .get(FIELD_LICENSE)
2168 .and_then(|license_value| match license_value {
2169 TomlValue::String(license_str) => Some(license_str.clone()),
2170 TomlValue::Table(license_table) => license_table
2171 .get("text")
2172 .and_then(|v| v.as_str())
2173 .map(|s| s.to_string())
2174 .or_else(|| {
2175 license_table
2176 .get("expression")
2177 .and_then(|v| v.as_str())
2178 .map(|expr| expr.to_string())
2179 }),
2180 _ => None,
2181 })
2182}
2183
2184fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2185 match project.get(FIELD_LICENSE) {
2186 Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2187 Some(TomlValue::Table(license_table)) => license_table
2188 .get("expression")
2189 .and_then(|value| value.as_str()),
2190 _ => None,
2191 }
2192}
2193
2194fn extract_urls(project: &TomlMap<String, TomlValue>) -> (Option<String>, Option<String>) {
2195 let mut homepage_url = None;
2196 let mut repository_url = None;
2197
2198 if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2200 homepage_url = urls
2201 .get(FIELD_HOMEPAGE)
2202 .and_then(|v| v.as_str())
2203 .map(String::from);
2204 repository_url = urls
2205 .get(FIELD_REPOSITORY)
2206 .and_then(|v| v.as_str())
2207 .map(String::from);
2208 }
2209
2210 if homepage_url.is_none() {
2212 homepage_url = project
2213 .get(FIELD_HOMEPAGE)
2214 .and_then(|v| v.as_str())
2215 .map(String::from);
2216 }
2217
2218 if repository_url.is_none() {
2219 repository_url = project
2220 .get(FIELD_REPOSITORY)
2221 .and_then(|v| v.as_str())
2222 .map(String::from);
2223 }
2224
2225 (homepage_url, repository_url)
2226}
2227
2228fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2229 let mut parties = Vec::new();
2230
2231 if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2232 for author in authors {
2233 if let Some(author_str) = author.as_str() {
2234 let (name, email) = split_name_email(author_str);
2235 parties.push(Party {
2236 r#type: None,
2237 role: Some("author".to_string()),
2238 name,
2239 email,
2240 url: None,
2241 organization: None,
2242 organization_url: None,
2243 timezone: None,
2244 });
2245 }
2246 }
2247 }
2248
2249 if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2250 for maintainer in maintainers {
2251 if let Some(maintainer_str) = maintainer.as_str() {
2252 let (name, email) = split_name_email(maintainer_str);
2253 parties.push(Party {
2254 r#type: None,
2255 role: Some("maintainer".to_string()),
2256 name,
2257 email,
2258 url: None,
2259 organization: None,
2260 organization_url: None,
2261 timezone: None,
2262 });
2263 }
2264 }
2265 }
2266
2267 parties
2268}
2269
2270fn extract_dependencies(
2271 project: &TomlMap<String, TomlValue>,
2272 toml_content: &TomlValue,
2273) -> (Vec<Dependency>, Vec<Dependency>) {
2274 let mut dependencies = Vec::new();
2275 let mut optional_dependencies = Vec::new();
2276
2277 if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2279 match deps_value {
2280 TomlValue::Array(arr) => {
2281 dependencies = parse_dependency_array(arr, false, None);
2282 }
2283 TomlValue::Table(table) => {
2284 dependencies = parse_dependency_table(table, false, None);
2285 }
2286 _ => {}
2287 }
2288 }
2289
2290 if let Some(opt_deps_table) = project
2292 .get(FIELD_OPTIONAL_DEPENDENCIES)
2293 .and_then(|v| v.as_table())
2294 {
2295 for (extra_name, deps) in opt_deps_table {
2296 match deps {
2297 TomlValue::Array(arr) => {
2298 optional_dependencies.extend(parse_dependency_array(
2299 arr,
2300 true,
2301 Some(extra_name),
2302 ));
2303 }
2304 TomlValue::Table(table) => {
2305 optional_dependencies.extend(parse_dependency_table(
2306 table,
2307 true,
2308 Some(extra_name),
2309 ));
2310 }
2311 _ => {}
2312 }
2313 }
2314 }
2315
2316 if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2318 match dev_deps_value {
2319 TomlValue::Array(arr) => {
2320 optional_dependencies.extend(parse_dependency_array(
2321 arr,
2322 true,
2323 Some(FIELD_DEV_DEPENDENCIES),
2324 ));
2325 }
2326 TomlValue::Table(table) => {
2327 optional_dependencies.extend(parse_dependency_table(
2328 table,
2329 true,
2330 Some(FIELD_DEV_DEPENDENCIES),
2331 ));
2332 }
2333 _ => {}
2334 }
2335 }
2336
2337 if let Some(groups_table) = project.get("group").and_then(|v| v.as_table()) {
2339 for (group_name, group_data) in groups_table {
2340 if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2341 match group_deps {
2342 TomlValue::Array(arr) => {
2343 optional_dependencies.extend(parse_dependency_array(
2344 arr,
2345 true,
2346 Some(group_name),
2347 ));
2348 }
2349 TomlValue::Table(table) => {
2350 optional_dependencies.extend(parse_dependency_table(
2351 table,
2352 true,
2353 Some(group_name),
2354 ));
2355 }
2356 _ => {}
2357 }
2358 }
2359 }
2360 }
2361
2362 if let Some(groups_table) = toml_content
2363 .get(FIELD_DEPENDENCY_GROUPS)
2364 .and_then(|value| value.as_table())
2365 {
2366 for (group_name, deps) in groups_table {
2367 match deps {
2368 TomlValue::Array(arr) => {
2369 optional_dependencies.extend(parse_dependency_array(
2370 arr,
2371 true,
2372 Some(group_name),
2373 ));
2374 }
2375 TomlValue::Table(table) => {
2376 optional_dependencies.extend(parse_dependency_table(
2377 table,
2378 true,
2379 Some(group_name),
2380 ));
2381 }
2382 _ => {}
2383 }
2384 }
2385 }
2386
2387 if let Some(dev_deps_value) = toml_content
2388 .get("tool")
2389 .and_then(|value| value.as_table())
2390 .and_then(|tool| tool.get("uv"))
2391 .and_then(|value| value.as_table())
2392 .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2393 {
2394 match dev_deps_value {
2395 TomlValue::Array(arr) => {
2396 optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2397 }
2398 TomlValue::Table(table) => {
2399 optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2400 }
2401 _ => {}
2402 }
2403 }
2404
2405 (dependencies, optional_dependencies)
2406}
2407
2408fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2409 let mut extra_data = HashMap::new();
2410
2411 if let Some(tool_uv) = toml_content
2412 .get("tool")
2413 .and_then(|value| value.as_table())
2414 .and_then(|tool| tool.get("uv"))
2415 {
2416 extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2417 }
2418
2419 if extra_data.is_empty() {
2420 None
2421 } else {
2422 Some(extra_data)
2423 }
2424}
2425
2426fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2427 match value {
2428 TomlValue::String(value) => JsonValue::String(value.clone()),
2429 TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2430 TomlValue::Float(value) => JsonValue::String(value.to_string()),
2431 TomlValue::Boolean(value) => JsonValue::Bool(*value),
2432 TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2433 TomlValue::Array(values) => {
2434 JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2435 }
2436 TomlValue::Table(values) => JsonValue::Object(
2437 values
2438 .iter()
2439 .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2440 .collect::<JsonMap<String, JsonValue>>(),
2441 ),
2442 }
2443}
2444
2445fn parse_dependency_table(
2446 table: &TomlMap<String, TomlValue>,
2447 is_optional: bool,
2448 scope: Option<&str>,
2449) -> Vec<Dependency> {
2450 table
2451 .iter()
2452 .filter_map(|(name, version)| {
2453 let version_str = version.as_str().map(|s| s.to_string());
2454 let mut package_url =
2455 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2456
2457 if let Some(v) = &version_str {
2458 package_url.with_version(v).ok()?;
2459 }
2460
2461 Some(Dependency {
2462 purl: Some(package_url.to_string()),
2463 extracted_requirement: None,
2464 scope: scope.map(|s| s.to_string()),
2465 is_runtime: Some(!is_optional),
2466 is_optional: Some(is_optional),
2467 is_pinned: None,
2468 is_direct: Some(true),
2469 resolved_package: None,
2470 extra_data: None,
2471 })
2472 })
2473 .collect()
2474}
2475
2476fn parse_dependency_array(
2477 array: &[TomlValue],
2478 is_optional: bool,
2479 scope: Option<&str>,
2480) -> Vec<Dependency> {
2481 array
2482 .iter()
2483 .filter_map(|dep| {
2484 let dep_str = dep.as_str()?;
2485
2486 let mut parts = dep_str.split(['>', '=', '<', '~']);
2487 let name = parts.next()?.trim().to_string();
2488
2489 let version = parts.next().map(|v| v.trim().to_string());
2490
2491 let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name)
2492 {
2493 Ok(purl) => purl,
2494 Err(_) => return None,
2495 };
2496
2497 if let Some(ref v) = version {
2498 package_url.with_version(v).ok()?;
2499 }
2500
2501 Some(Dependency {
2502 purl: Some(package_url.to_string()),
2503 extracted_requirement: None,
2504 scope: scope.map(|s| s.to_string()),
2505 is_runtime: Some(!is_optional),
2506 is_optional: Some(is_optional),
2507 is_pinned: None,
2508 is_direct: Some(true),
2509 resolved_package: None,
2510 extra_data: None,
2511 })
2512 })
2513 .collect()
2514}
2515
/// Result of statically evaluating a Python literal expression from `setup.py`.
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// A numeric literal; integers and floats are both stored as `f64`.
    Number(f64),
    /// `True` / `False`.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list display whose elements all evaluated successfully.
    List(Vec<Value>),
    /// A tuple display whose elements all evaluated successfully.
    Tuple(Vec<Value>),
    /// A dict with string-convertible keys; insertion order is not preserved.
    Dict(HashMap<String, Value>),
}
2526
/// Bounded evaluator for literal-only Python expressions.
struct LiteralEvaluator {
    /// Previously evaluated names, looked up when a bare name is encountered.
    constants: HashMap<String, Value>,
    /// Evaluation gives up once the nesting depth reaches this value.
    max_depth: usize,
    /// Evaluation gives up once this many nodes have been visited in total.
    max_nodes: usize,
    /// Running count of visited nodes across all `evaluate_expr` calls.
    nodes_visited: usize,
}
2533
impl LiteralEvaluator {
    /// Creates an evaluator seeded with `constants`, using the file-level
    /// `MAX_SETUP_PY_AST_DEPTH` / `MAX_SETUP_PY_AST_NODES` limits.
    fn new(constants: HashMap<String, Value>) -> Self {
        Self {
            constants,
            max_depth: MAX_SETUP_PY_AST_DEPTH,
            max_nodes: MAX_SETUP_PY_AST_NODES,
            nodes_visited: 0,
        }
    }

    /// Registers (or overwrites) a named constant for later name lookups.
    fn insert_constant(&mut self, name: String, value: Value) {
        self.constants.insert(name, value);
    }

    /// Evaluates `expr` to a [`Value`] if it consists only of supported
    /// literal forms.
    ///
    /// Supported: string / boolean / number / `None` literals, names present
    /// in `constants`, list / tuple / dict displays, `OrderedDict(...)` calls
    /// without keyword arguments, and `dict(key=value, ...)` calls without
    /// positional arguments. Everything else yields `None`.
    ///
    /// Returns `None` as soon as the depth or node budget is exhausted, or
    /// when any nested element fails to evaluate.
    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
        // Budget check happens before counting, so a rejected node is not counted.
        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
            return None;
        }
        self.nodes_visited += 1;

        match expr {
            ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
                Some(Value::String(value.to_str().to_string()))
            }
            ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
                Some(Value::Bool(*value))
            }
            ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
                self.evaluate_number(value)
            }
            ast::Expr::NoneLiteral(_) => Some(Value::None),
            // Unknown names are not an error; they simply do not evaluate.
            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
            ast::Expr::List(ast::ExprList { elts, .. }) => {
                let mut values = Vec::new();
                for elt in elts {
                    // One non-literal element makes the whole list non-literal.
                    values.push(self.evaluate_expr(elt, depth + 1)?);
                }
                Some(Value::List(values))
            }
            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
                let mut values = Vec::new();
                for elt in elts {
                    values.push(self.evaluate_expr(elt, depth + 1)?);
                }
                Some(Value::Tuple(values))
            }
            ast::Expr::Dict(ast::ExprDict { items, .. }) => {
                let mut dict = HashMap::new();
                for item in items {
                    // An item without a key expression aborts the evaluation
                    // (presumably a `**mapping` spread — confirm against the AST docs).
                    let key_expr = item.key.as_ref()?;
                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
                    // Keys must be string / number / bool so they can be stringified.
                    let key = value_to_string(&key_value)?;
                    let value = self.evaluate_expr(&item.value, depth + 1)?;
                    dict.insert(key, value);
                }
                Some(Value::Dict(dict))
            }
            ast::Expr::Call(ast::ExprCall {
                func, arguments, ..
            }) => {
                let args = arguments.args.as_ref();
                let keywords = arguments.keywords.as_ref();
                // `OrderedDict(...)` / `collections.OrderedDict(...)` with positional args only.
                if keywords.is_empty()
                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
                {
                    return self.evaluate_ordered_dict(args, depth + 1);
                }

                // The only other supported call, `dict(...)`, takes keywords only.
                if !args.is_empty() {
                    return None;
                }

                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
                    && id == "dict"
                {
                    let mut dict = HashMap::new();
                    for keyword in keywords {
                        // A keyword without a name makes the call non-literal.
                        let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
                        dict.insert(key.to_string(), value);
                    }
                    return Some(Value::Dict(dict));
                }

                None
            }
            _ => None,
        }
    }

    /// Converts a numeric literal to `Value::Number`.
    ///
    /// Integers go through their decimal string form and are parsed as `f64`;
    /// complex literals are not supported.
    fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
        match number {
            ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
            ast::Number::Float(value) => Some(Value::Number(*value)),
            ast::Number::Complex { .. } => None,
        }
    }

    /// Evaluates the argument list of an `OrderedDict(...)` call.
    ///
    /// Requires exactly one positional argument that evaluates to a list or
    /// tuple whose items are all 2-element *tuples* with a stringifiable first
    /// element. Any other shape yields `None`.
    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
        if args.len() != 1 {
            return None;
        }

        let items = match self.evaluate_expr(&args[0], depth)? {
            Value::List(items) | Value::Tuple(items) => items,
            _ => return None,
        };

        let mut dict = HashMap::new();
        for item in items {
            // Pairs written as lists (`["k", "v"]`) are rejected here.
            let Value::Tuple(values) = item else {
                return None;
            };
            if values.len() != 2 {
                return None;
            }
            let key = value_to_string(&values[0])?;
            dict.insert(key, values[1].clone());
        }

        Some(Value::Dict(dict))
    }
}
2658
/// Names under which `setup` and its providing modules are visible in a
/// `setup.py` file.
#[derive(Default)]
struct SetupAliases {
    /// Bare callable names that refer to `setup` (e.g. via `from ... import setup as x`).
    setup_names: HashSet<String>,
    /// Maps a local module alias to the real module name it was imported from.
    module_aliases: HashMap<String, String>,
}
2664
2665fn extract_from_setup_py(path: &Path) -> PackageData {
2666 let content = match read_file_to_string(path) {
2667 Ok(content) => content,
2668 Err(e) => {
2669 warn!("Failed to read setup.py at {:?}: {}", path, e);
2670 return default_package_data(path);
2671 }
2672 };
2673
2674 if content.len() > MAX_SETUP_PY_BYTES {
2675 warn!("setup.py too large at {:?}: {} bytes", path, content.len());
2676 return extract_from_setup_py_regex(&content);
2677 }
2678
2679 let mut package_data = match extract_from_setup_py_ast(&content) {
2680 Ok(Some(data)) => data,
2681 Ok(None) => extract_from_setup_py_regex(&content),
2682 Err(e) => {
2683 warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
2684 extract_from_setup_py_regex(&content)
2685 }
2686 };
2687
2688 if package_data.name.is_none() {
2689 package_data.name = extract_setup_value(&content, "name");
2690 }
2691
2692 if package_data.version.is_none() {
2693 package_data.version = extract_setup_value(&content, "version");
2694 }
2695
2696 fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
2697
2698 if package_data.purl.is_none() {
2699 package_data.purl = build_setup_py_purl(
2700 package_data.name.as_deref(),
2701 package_data.version.as_deref(),
2702 );
2703 }
2704
2705 package_data
2706}
2707
2708fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
2709 if package_data.version.is_some()
2710 && package_data.extracted_license_statement.is_some()
2711 && package_data
2712 .parties
2713 .iter()
2714 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
2715 {
2716 return;
2717 }
2718
2719 let Some(root) = path.parent() else {
2720 return;
2721 };
2722
2723 let dunder_metadata = collect_sibling_dunder_metadata(root, content);
2724
2725 if package_data.version.is_none() {
2726 package_data.version = dunder_metadata.version;
2727 }
2728
2729 if package_data.extracted_license_statement.is_none() {
2730 package_data.extracted_license_statement = dunder_metadata.license;
2731 }
2732
2733 let has_author = package_data
2734 .parties
2735 .iter()
2736 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
2737
2738 if !has_author && let Some(author) = dunder_metadata.author {
2739 package_data.parties.push(Party {
2740 r#type: Some("person".to_string()),
2741 role: Some("author".to_string()),
2742 name: Some(author),
2743 email: None,
2744 url: None,
2745 organization: None,
2746 organization_url: None,
2747 timezone: None,
2748 });
2749 }
2750}
2751
/// Values of module-level dunder assignments found in imported sibling modules.
#[derive(Default)]
struct DunderMetadata {
    /// Captured value of a quoted `__version__ = "..."` assignment.
    version: Option<String>,
    /// Captured value of a quoted `__author__ = "..."` assignment.
    author: Option<String>,
    /// Captured value of a quoted `__license__ = "..."` assignment.
    license: Option<String>,
}
2758
2759fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
2760 let statements = match parse_module(content) {
2761 Ok(parsed) => parsed.into_suite(),
2762 Err(_) => return DunderMetadata::default(),
2763 };
2764
2765 let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2766 let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2767 let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2768 let mut metadata = DunderMetadata::default();
2769
2770 for module in imported_dunder_modules(&statements) {
2771 let Some(path) = resolve_imported_module_path(root, &module) else {
2772 continue;
2773 };
2774 let Ok(module_content) = read_file_to_string(&path) else {
2775 continue;
2776 };
2777
2778 if metadata.version.is_none() {
2779 metadata.version = version_re
2780 .as_ref()
2781 .and_then(|regex| regex.captures(&module_content))
2782 .and_then(|captures| captures.get(1))
2783 .map(|match_| match_.as_str().to_string());
2784 }
2785
2786 if metadata.author.is_none() {
2787 metadata.author = author_re
2788 .as_ref()
2789 .and_then(|regex| regex.captures(&module_content))
2790 .and_then(|captures| captures.get(1))
2791 .map(|match_| match_.as_str().to_string());
2792 }
2793
2794 if metadata.license.is_none() {
2795 metadata.license = license_re
2796 .as_ref()
2797 .and_then(|regex| regex.captures(&module_content))
2798 .and_then(|captures| captures.get(1))
2799 .map(|match_| match_.as_str().to_string());
2800 }
2801
2802 if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
2803 return metadata;
2804 }
2805 }
2806
2807 metadata
2808}
2809
2810fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
2811 let mut modules = Vec::new();
2812
2813 for statement in statements {
2814 let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
2815 continue;
2816 };
2817 let Some(module) = module.as_ref().map(|name| name.as_str()) else {
2818 continue;
2819 };
2820 let imports_dunder = names.iter().any(|alias| {
2821 matches!(
2822 alias.name.as_str(),
2823 "__version__" | "__author__" | "__license__"
2824 )
2825 });
2826 if imports_dunder {
2827 modules.push(module.to_string());
2828 }
2829 }
2830
2831 modules
2832}
2833
/// Maps a dotted module name to an existing file below `root`.
///
/// Candidates are probed in this order and the first that exists is returned:
/// `<root>/<a/b>.py`, `<root>/<a/b>/__init__.py`, `<root>/src/<a/b>.py`,
/// `<root>/src/<a/b>/__init__.py`. Returns `None` when none exists.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let relative: PathBuf = module.split('.').collect();
    let module_file = relative.with_extension("py");
    let package_init = relative.join("__init__.py");
    let src_root = root.join("src");

    for base in [root, src_root.as_path()].iter() {
        for layout in [&module_file, &package_init].iter() {
            let candidate = base.join(layout);
            if candidate.exists() {
                return Some(candidate);
            }
        }
    }

    None
}
2845
2846fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
2862 let statements = parse_module(content)
2863 .map(|parsed| parsed.into_suite())
2864 .map_err(|e| e.to_string())?;
2865 let aliases = collect_setup_aliases(&statements);
2866 let mut evaluator = LiteralEvaluator::new(HashMap::new());
2867 build_setup_py_constants(&statements, &mut evaluator);
2868
2869 let setup_call = find_setup_call(&statements, &aliases);
2870 let Some(call_expr) = setup_call else {
2871 return Ok(None);
2872 };
2873
2874 let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
2875 Ok(Some(build_setup_py_package_data(&setup_values)))
2876}
2877
2878fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
2879 for stmt in statements {
2880 if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
2881 if targets.len() != 1 {
2882 continue;
2883 }
2884
2885 let Some(name) = extract_assign_name(&targets[0]) else {
2886 continue;
2887 };
2888
2889 if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
2890 evaluator.insert_constant(name, value);
2891 }
2892 }
2893 }
2894}
2895
2896fn extract_assign_name(target: &ast::Expr) -> Option<String> {
2897 match target {
2898 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
2899 _ => None,
2900 }
2901}
2902
2903fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
2904 let mut aliases = SetupAliases::default();
2905 aliases.setup_names.insert("setup".to_string());
2906
2907 for stmt in statements {
2908 match stmt {
2909 ast::Stmt::Import(ast::StmtImport { names, .. }) => {
2910 for alias in names {
2911 let module_name = alias.name.as_str();
2912 if !is_setup_module(module_name) {
2913 continue;
2914 }
2915 let alias_name = alias
2916 .asname
2917 .as_ref()
2918 .map(|name| name.as_str())
2919 .unwrap_or(module_name);
2920 aliases
2921 .module_aliases
2922 .insert(alias_name.to_string(), module_name.to_string());
2923 }
2924 }
2925 ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
2926 let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
2927 continue;
2928 };
2929 if !is_setup_module(module_name) {
2930 continue;
2931 }
2932 for alias in names {
2933 if alias.name.as_str() != "setup" {
2934 continue;
2935 }
2936 let alias_name = alias
2937 .asname
2938 .as_ref()
2939 .map(|name| name.as_str())
2940 .unwrap_or("setup");
2941 aliases.setup_names.insert(alias_name.to_string());
2942 }
2943 }
2944 _ => {}
2945 }
2946 }
2947
2948 aliases
2949}
2950
/// Returns `true` for the module names accepted as providers of `setup`:
/// `setuptools`, `distutils` and `distutils.core` (exact, case-sensitive match).
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
2954
2955fn find_setup_call<'a>(
2956 statements: &'a [ast::Stmt],
2957 aliases: &'a SetupAliases,
2958) -> Option<&'a ast::Expr> {
2959 let mut finder = SetupCallFinder {
2960 aliases,
2961 nodes_visited: 0,
2962 };
2963 finder.find_in_statements(statements)
2964}
2965
/// Budgeted search state for locating a `setup(...)` call in a statement tree.
struct SetupCallFinder<'a> {
    /// Names and module aliases that identify a setup call.
    aliases: &'a SetupAliases,
    /// Statements and expressions inspected so far; the search stops at
    /// `MAX_SETUP_PY_AST_NODES`.
    nodes_visited: usize,
}
2970
impl<'a> SetupCallFinder<'a> {
    /// Searches `statements` in order and returns the first setup call found.
    ///
    /// Expression statements and assignment right-hand sides are checked
    /// directly. The bodies of `if` (including `elif` / `else` clauses),
    /// `for`, `while` (with their `else` blocks), `with` and `try` (body,
    /// `else`, `finally`, then handlers) are searched recursively. All other
    /// statement kinds are skipped without descending into them.
    ///
    /// Returns `None` when nothing matches or the node budget runs out.
    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
        for stmt in statements {
            // The budget is shared with `visit_expr` and across recursion levels.
            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
                return None;
            }
            self.nodes_visited += 1;

            let found = match stmt {
                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
                ast::Stmt::If(ast::StmtIf {
                    body,
                    elif_else_clauses,
                    ..
                }) => self.find_in_statements(body).or_else(|| {
                    for clause in elif_else_clauses {
                        if let Some(found) = self.find_in_statements(&clause.body) {
                            return Some(found);
                        }
                    }
                    None
                }),
                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse)),
                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
                ast::Stmt::Try(ast::StmtTry {
                    body,
                    orelse,
                    finalbody,
                    handlers,
                    ..
                }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse))
                    .or_else(|| self.find_in_statements(finalbody))
                    .or_else(|| {
                        // Exception handlers are searched last.
                        for handler in handlers {
                            let ast::ExceptHandler::ExceptHandler(
                                ast::ExceptHandlerExceptHandler { body, .. },
                            ) = handler;
                            if let Some(found) = self.find_in_statements(body) {
                                return Some(found);
                            }
                        }
                        None
                    }),
                _ => None,
            };

            if found.is_some() {
                return found;
            }
        }

        None
    }

    /// Returns `expr` if it is itself a call whose callee is recognised as
    /// `setup`; sub-expressions are not inspected.
    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
            return None;
        }
        self.nodes_visited += 1;

        match expr {
            ast::Expr::Call(ast::ExprCall { func, .. })
                if is_setup_call(func.as_ref(), self.aliases) =>
            {
                Some(expr)
            }
            _ => None,
        }
    }
}
3047
3048fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3049 let Some(dotted) = dotted_name(func, 0) else {
3050 return false;
3051 };
3052
3053 if aliases.setup_names.contains(&dotted) {
3054 return true;
3055 }
3056
3057 let Some(module) = dotted.strip_suffix(".setup") else {
3058 return false;
3059 };
3060
3061 let resolved = resolve_module_alias(module, aliases);
3062 is_setup_module(&resolved)
3063}
3064
3065fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3066 if depth >= MAX_SETUP_PY_AST_DEPTH {
3067 return None;
3068 }
3069
3070 match expr {
3071 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3072 ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3073 let base = dotted_name(value.as_ref(), depth + 1)?;
3074 Some(format!("{}.{}", base, attr.as_str()))
3075 }
3076 _ => None,
3077 }
3078}
3079
3080fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3081 if let Some(mapped) = aliases.module_aliases.get(module) {
3082 return mapped.clone();
3083 }
3084
3085 let Some((base, rest)) = module.split_once('.') else {
3086 return module.to_string();
3087 };
3088
3089 if let Some(mapped) = aliases.module_aliases.get(base) {
3090 return format!("{}.{}", mapped, rest);
3091 }
3092
3093 module.to_string()
3094}
3095
3096fn extract_setup_keywords(
3097 call_expr: &ast::Expr,
3098 evaluator: &mut LiteralEvaluator,
3099) -> HashMap<String, Value> {
3100 let mut values = HashMap::new();
3101 let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3102 return values;
3103 };
3104
3105 for keyword in arguments.keywords.iter() {
3106 if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3107 if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3108 values.insert(arg.to_string(), value);
3109 }
3110 } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3111 for (key, value) in dict {
3112 values.insert(key, value);
3113 }
3114 }
3115 }
3116
3117 values
3118}
3119
3120fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3121 let name = get_value_string(values, "name");
3122 let version = get_value_string(values, "version");
3123 let description =
3124 get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
3125 let homepage_url =
3126 get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
3127 let author = get_value_string(values, "author");
3128 let author_email = get_value_string(values, "author_email");
3129 let maintainer = get_value_string(values, "maintainer");
3130 let maintainer_email = get_value_string(values, "maintainer_email");
3131 let license = get_value_string(values, "license");
3132 let classifiers = values
3133 .get("classifiers")
3134 .and_then(value_to_string_list)
3135 .unwrap_or_default();
3136
3137 let mut parties = Vec::new();
3138 if author.is_some() || author_email.is_some() {
3139 parties.push(Party {
3140 r#type: Some("person".to_string()),
3141 role: Some("author".to_string()),
3142 name: author,
3143 email: author_email,
3144 url: None,
3145 organization: None,
3146 organization_url: None,
3147 timezone: None,
3148 });
3149 }
3150
3151 if maintainer.is_some() || maintainer_email.is_some() {
3152 parties.push(Party {
3153 r#type: Some("person".to_string()),
3154 role: Some("maintainer".to_string()),
3155 name: maintainer,
3156 email: maintainer_email,
3157 url: None,
3158 organization: None,
3159 organization_url: None,
3160 timezone: None,
3161 });
3162 }
3163
3164 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3165 normalize_spdx_declared_license(license.as_deref());
3166 let extracted_license_statement = license.clone();
3167
3168 let dependencies = build_setup_py_dependencies(values);
3169 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3170 let mut homepage_from_project_urls = None;
3171 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3172 let mut extra_data = HashMap::new();
3173
3174 if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3175 apply_project_url_mappings(
3176 &parsed_project_urls,
3177 &mut homepage_from_project_urls,
3178 &mut bug_tracking_url,
3179 &mut code_view_url,
3180 &mut vcs_url,
3181 &mut extra_data,
3182 );
3183 }
3184
3185 let extra_data = if extra_data.is_empty() {
3186 None
3187 } else {
3188 Some(extra_data)
3189 };
3190
3191 PackageData {
3192 package_type: Some(PythonParser::PACKAGE_TYPE),
3193 namespace: None,
3194 name,
3195 version,
3196 qualifiers: None,
3197 subpath: None,
3198 primary_language: Some("Python".to_string()),
3199 description,
3200 release_date: None,
3201 parties,
3202 keywords: Vec::new(),
3203 homepage_url: homepage_url.or(homepage_from_project_urls),
3204 download_url: None,
3205 size: None,
3206 sha1: None,
3207 md5: None,
3208 sha256: None,
3209 sha512: None,
3210 bug_tracking_url,
3211 code_view_url,
3212 vcs_url,
3213 copyright: None,
3214 holder: None,
3215 declared_license_expression,
3216 declared_license_expression_spdx,
3217 license_detections,
3218 other_license_expression: None,
3219 other_license_expression_spdx: None,
3220 other_license_detections: Vec::new(),
3221 extracted_license_statement,
3222 notice_text: None,
3223 source_packages: Vec::new(),
3224 file_references: Vec::new(),
3225 is_private: has_private_classifier(&classifiers),
3226 is_virtual: false,
3227 extra_data,
3228 dependencies,
3229 repository_homepage_url: None,
3230 repository_download_url: None,
3231 api_data_url: None,
3232 datasource_id: Some(DatasourceId::PypiSetupPy),
3233 purl,
3234 }
3235}
3236
3237fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3238 let mut dependencies = Vec::new();
3239
3240 if let Some(reqs) = values
3241 .get("install_requires")
3242 .and_then(value_to_string_list)
3243 {
3244 dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3245 }
3246
3247 if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3248 dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3249 }
3250
3251 if let Some(Value::Dict(extras)) = values.get("extras_require") {
3252 let mut extra_items: Vec<_> = extras.iter().collect();
3253 extra_items.sort_by_key(|(name, _)| *name);
3254 for (extra_name, extra_value) in extra_items {
3255 if let Some(reqs) = value_to_string_list(extra_value) {
3256 dependencies.extend(build_setup_py_dependency_list(
3257 reqs.as_slice(),
3258 extra_name,
3259 true,
3260 ));
3261 }
3262 }
3263 }
3264
3265 dependencies
3266}
3267
3268fn build_setup_py_dependency_list(
3269 reqs: &[String],
3270 scope: &str,
3271 is_optional: bool,
3272) -> Vec<Dependency> {
3273 reqs.iter()
3274 .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3275 .collect()
3276}
3277
3278fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3279 values.get(key).and_then(value_to_string)
3280}
3281
3282fn value_to_string(value: &Value) -> Option<String> {
3283 match value {
3284 Value::String(value) => Some(value.clone()),
3285 Value::Number(value) => Some(value.to_string()),
3286 Value::Bool(value) => Some(value.to_string()),
3287 _ => None,
3288 }
3289}
3290
3291fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3292 match value {
3293 Value::String(value) => Some(vec![value.clone()]),
3294 Value::List(values) | Value::Tuple(values) => {
3295 let mut items = Vec::new();
3296 for item in values {
3297 items.push(value_to_string(item)?);
3298 }
3299 Some(items)
3300 }
3301 _ => None,
3302 }
3303}
3304
3305fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3306 let Value::Dict(dict) = value else {
3307 return None;
3308 };
3309
3310 let mut pairs: Vec<(String, String)> = dict
3311 .iter()
3312 .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3313 .collect::<Option<Vec<_>>>()?;
3314 pairs.sort_by(|left, right| left.0.cmp(&right.0));
3315 Some(pairs)
3316}
3317
3318fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3319 let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3320 extract_requires_dist_dependencies(&requires_dist)
3321}
3322
3323pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3324 requires_dist
3325 .iter()
3326 .filter_map(|entry| build_rfc822_dependency(entry))
3327 .collect()
3328}
3329
3330fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3331 build_python_dependency(entry, "install", false, None)
3332}
3333
3334fn build_python_dependency(
3335 entry: &str,
3336 default_scope: &str,
3337 default_optional: bool,
3338 marker_override: Option<&str>,
3339) -> Option<Dependency> {
3340 let (requirement_part, marker_part) = entry
3341 .split_once(';')
3342 .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3343 .unwrap_or((entry.trim(), None));
3344
3345 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3346 let requirement = normalize_rfc822_requirement(requirement_part);
3347 let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3348 marker_part.or(marker_override),
3349 default_scope,
3350 default_optional,
3351 );
3352 let mut purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
3353
3354 let is_pinned = requirement
3355 .as_deref()
3356 .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3357 if is_pinned
3358 && let Some(version) = requirement
3359 .as_deref()
3360 .map(|req| req.trim_start_matches('='))
3361 {
3362 purl.with_version(version).ok()?;
3363 }
3364
3365 let mut extra_data = HashMap::new();
3366 extra_data.extend(marker_data);
3367 if let Some(marker) = marker {
3368 extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3369 }
3370
3371 Some(Dependency {
3372 purl: Some(purl.to_string()),
3373 extracted_requirement: requirement,
3374 scope: Some(scope),
3375 is_runtime: Some(true),
3376 is_optional: Some(is_optional),
3377 is_pinned: Some(is_pinned),
3378 is_direct: Some(true),
3379 resolved_package: None,
3380 extra_data: if extra_data.is_empty() {
3381 None
3382 } else {
3383 Some(extra_data)
3384 },
3385 })
3386}
3387
3388fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3389 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3390 let trimmed = requirement_part.trim();
3391 let mut remainder = trimmed[name.len()..].trim();
3392
3393 if let Some(stripped) = remainder.strip_prefix('[')
3394 && let Some(end_idx) = stripped.find(']')
3395 {
3396 remainder = stripped[end_idx + 1..].trim();
3397 }
3398
3399 let remainder = remainder
3400 .strip_prefix('(')
3401 .and_then(|value| value.strip_suffix(')'))
3402 .unwrap_or(remainder)
3403 .trim();
3404
3405 if remainder.is_empty() {
3406 return None;
3407 }
3408
3409 let mut specifiers: Vec<String> = remainder
3410 .split(',')
3411 .map(|specifier| specifier.trim().replace(' ', ""))
3412 .filter(|specifier| !specifier.is_empty())
3413 .collect();
3414 specifiers.sort();
3415 Some(specifiers.join(","))
3416}
3417
3418fn parse_rfc822_marker(
3419 marker_part: Option<&str>,
3420 default_scope: &str,
3421 default_optional: bool,
3422) -> (
3423 String,
3424 bool,
3425 Option<String>,
3426 HashMap<String, serde_json::Value>,
3427) {
3428 let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
3429 return (
3430 default_scope.to_string(),
3431 default_optional,
3432 None,
3433 HashMap::new(),
3434 );
3435 };
3436
3437 let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
3438 .expect("extra marker regex should compile");
3439 let mut extra_data = HashMap::new();
3440
3441 if let Some(python_version) = extract_marker_field(marker, "python_version") {
3442 extra_data.insert(
3443 "python_version".to_string(),
3444 serde_json::Value::String(python_version),
3445 );
3446 }
3447 if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
3448 extra_data.insert(
3449 "sys_platform".to_string(),
3450 serde_json::Value::String(sys_platform),
3451 );
3452 }
3453
3454 if let Some(captures) = extra_re.captures(marker)
3455 && let Some(scope) = captures.get(1)
3456 {
3457 return (
3458 scope.as_str().to_string(),
3459 true,
3460 Some(marker.trim().to_string()),
3461 extra_data,
3462 );
3463 }
3464
3465 (
3466 default_scope.to_string(),
3467 default_optional,
3468 Some(marker.trim().to_string()),
3469 extra_data,
3470 )
3471}
3472
3473fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
3474 let re = Regex::new(&format!(
3475 r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
3476 field
3477 ))
3478 .ok()?;
3479 let captures = re.captures(marker)?;
3480 let operator = captures.get(1)?.as_str();
3481 let value = captures.get(2)?.as_str();
3482 Some(format!("{} {}", operator, value))
3483}
3484
3485fn parse_requires_txt(content: &str) -> Vec<Dependency> {
3486 let mut dependencies = Vec::new();
3487 let mut current_scope = "install".to_string();
3488 let mut current_optional = false;
3489 let mut current_marker: Option<String> = None;
3490
3491 for line in content.lines() {
3492 let trimmed = line.trim();
3493 if trimmed.is_empty() || trimmed.starts_with('#') {
3494 continue;
3495 }
3496
3497 if trimmed.starts_with('[') && trimmed.ends_with(']') {
3498 let inner = &trimmed[1..trimmed.len() - 1];
3499 if let Some(rest) = inner.strip_prefix(':') {
3500 current_scope = "install".to_string();
3501 current_optional = false;
3502 current_marker = Some(rest.trim().to_string());
3503 } else if let Some((scope, marker)) = inner.split_once(':') {
3504 current_scope = scope.trim().to_string();
3505 current_optional = true;
3506 current_marker = Some(marker.trim().to_string());
3507 } else {
3508 current_scope = inner.trim().to_string();
3509 current_optional = true;
3510 current_marker = None;
3511 }
3512 continue;
3513 }
3514
3515 if let Some(dependency) = build_python_dependency(
3516 trimmed,
3517 ¤t_scope,
3518 current_optional,
3519 current_marker.as_deref(),
3520 ) {
3521 dependencies.push(dependency);
3522 }
3523 }
3524
3525 dependencies
3526}
3527
/// Reports whether any classifier equals `Private :: Do Not Upload`, compared
/// ASCII-case-insensitively and without trimming.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
3533
3534fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
3535 let name = name?;
3536 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
3537 if let Some(version) = version {
3538 package_url.with_version(version).ok()?;
3539 }
3540 Some(package_url.to_string())
3541}
3542
3543fn extract_from_setup_py_regex(content: &str) -> PackageData {
3544 let name = extract_setup_value(content, "name");
3545 let version = extract_setup_value(content, "version");
3546 let license_expression = extract_setup_value(content, "license");
3547
3548 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3549 normalize_spdx_declared_license(license_expression.as_deref());
3550 let extracted_license_statement = license_expression.clone();
3551
3552 let dependencies = extract_setup_py_dependencies(content);
3553 let homepage_url = extract_setup_value(content, "url");
3554 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3555
3556 PackageData {
3557 package_type: Some(PythonParser::PACKAGE_TYPE),
3558 namespace: None,
3559 name,
3560 version,
3561 qualifiers: None,
3562 subpath: None,
3563 primary_language: Some("Python".to_string()),
3564 description: None,
3565 release_date: None,
3566 parties: Vec::new(),
3567 keywords: Vec::new(),
3568 homepage_url,
3569 download_url: None,
3570 size: None,
3571 sha1: None,
3572 md5: None,
3573 sha256: None,
3574 sha512: None,
3575 bug_tracking_url: None,
3576 code_view_url: None,
3577 vcs_url: None,
3578 copyright: None,
3579 holder: None,
3580 declared_license_expression,
3581 declared_license_expression_spdx,
3582 license_detections,
3583 other_license_expression: None,
3584 other_license_expression_spdx: None,
3585 other_license_detections: Vec::new(),
3586 extracted_license_statement,
3587 notice_text: None,
3588 source_packages: Vec::new(),
3589 file_references: Vec::new(),
3590 is_private: false,
3591 is_virtual: false,
3592 extra_data: None,
3593 dependencies,
3594 repository_homepage_url: None,
3595 repository_download_url: None,
3596 api_data_url: None,
3597 datasource_id: Some(DatasourceId::PypiSetupPy),
3598 purl,
3599 }
3600}
3601
3602fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
3603 crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
3604}
3605
3606fn extract_from_pypi_json(path: &Path) -> PackageData {
3607 let default = PackageData {
3608 package_type: Some(PythonParser::PACKAGE_TYPE),
3609 datasource_id: Some(DatasourceId::PypiJson),
3610 ..Default::default()
3611 };
3612
3613 let content = match read_file_to_string(path) {
3614 Ok(content) => content,
3615 Err(error) => {
3616 warn!("Failed to read pypi.json at {:?}: {}", path, error);
3617 return default;
3618 }
3619 };
3620
3621 let root: serde_json::Value = match serde_json::from_str(&content) {
3622 Ok(value) => value,
3623 Err(error) => {
3624 warn!("Failed to parse pypi.json at {:?}: {}", path, error);
3625 return default;
3626 }
3627 };
3628
3629 let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
3630 warn!("No info object found in pypi.json at {:?}", path);
3631 return default;
3632 };
3633
3634 let name = info
3635 .get("name")
3636 .and_then(|value| value.as_str())
3637 .map(ToOwned::to_owned);
3638 let version = info
3639 .get("version")
3640 .and_then(|value| value.as_str())
3641 .map(ToOwned::to_owned);
3642 let summary = info
3643 .get("summary")
3644 .and_then(|value| value.as_str())
3645 .map(ToOwned::to_owned);
3646 let description = info
3647 .get("description")
3648 .and_then(|value| value.as_str())
3649 .filter(|value| !value.trim().is_empty())
3650 .map(ToOwned::to_owned)
3651 .or(summary);
3652 let mut homepage_url = info
3653 .get("home_page")
3654 .and_then(|value| value.as_str())
3655 .map(ToOwned::to_owned);
3656 let author = info
3657 .get("author")
3658 .and_then(|value| value.as_str())
3659 .filter(|value| !value.trim().is_empty())
3660 .map(ToOwned::to_owned);
3661 let author_email = info
3662 .get("author_email")
3663 .and_then(|value| value.as_str())
3664 .filter(|value| !value.trim().is_empty())
3665 .map(ToOwned::to_owned);
3666 let license = info
3667 .get("license")
3668 .and_then(|value| value.as_str())
3669 .filter(|value| !value.trim().is_empty())
3670 .map(ToOwned::to_owned);
3671 let keywords = parse_setup_cfg_keywords(
3672 info.get("keywords")
3673 .and_then(|value| value.as_str())
3674 .map(ToOwned::to_owned),
3675 );
3676 let classifiers = info
3677 .get("classifiers")
3678 .and_then(|value| value.as_array())
3679 .map(|values| {
3680 values
3681 .iter()
3682 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
3683 .collect::<Vec<_>>()
3684 })
3685 .unwrap_or_default();
3686
3687 let mut parties = Vec::new();
3688 if author.is_some() || author_email.is_some() {
3689 parties.push(Party {
3690 r#type: Some("person".to_string()),
3691 role: Some("author".to_string()),
3692 name: author,
3693 email: author_email,
3694 url: None,
3695 organization: None,
3696 organization_url: None,
3697 timezone: None,
3698 });
3699 }
3700
3701 let mut bug_tracking_url = None;
3702 let mut code_view_url = None;
3703 let mut vcs_url = None;
3704 let mut extra_data = HashMap::new();
3705
3706 let parsed_project_urls = info
3707 .get("project_urls")
3708 .and_then(|value| value.as_object())
3709 .map(|map| {
3710 let mut pairs: Vec<(String, String)> = map
3711 .iter()
3712 .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
3713 .collect();
3714 pairs.sort_by(|left, right| left.0.cmp(&right.0));
3715 pairs
3716 })
3717 .unwrap_or_default();
3718
3719 apply_project_url_mappings(
3720 &parsed_project_urls,
3721 &mut homepage_url,
3722 &mut bug_tracking_url,
3723 &mut code_view_url,
3724 &mut vcs_url,
3725 &mut extra_data,
3726 );
3727
3728 let (download_url, size, sha256) = root
3729 .get("urls")
3730 .and_then(|value| value.as_array())
3731 .map(|urls| select_pypi_json_artifact(urls))
3732 .unwrap_or((None, None, None));
3733
3734 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3735 normalize_spdx_declared_license(license.as_deref());
3736 let dependencies = info
3737 .get("requires_dist")
3738 .and_then(|value| value.as_array())
3739 .map(|entries| {
3740 entries
3741 .iter()
3742 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
3743 .collect::<Vec<_>>()
3744 })
3745 .map(|entries| extract_requires_dist_dependencies(&entries))
3746 .unwrap_or_default();
3747
3748 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
3749 build_pypi_urls(name.as_deref(), version.as_deref());
3750
3751 PackageData {
3752 package_type: Some(PythonParser::PACKAGE_TYPE),
3753 namespace: None,
3754 name,
3755 version,
3756 qualifiers: None,
3757 subpath: None,
3758 primary_language: None,
3759 description,
3760 release_date: None,
3761 parties,
3762 keywords,
3763 homepage_url: homepage_url.or(repository_homepage_url.clone()),
3764 download_url,
3765 size,
3766 sha1: None,
3767 md5: None,
3768 sha256,
3769 sha512: None,
3770 bug_tracking_url,
3771 code_view_url,
3772 vcs_url,
3773 copyright: None,
3774 holder: None,
3775 declared_license_expression,
3776 declared_license_expression_spdx,
3777 license_detections,
3778 other_license_expression: None,
3779 other_license_expression_spdx: None,
3780 other_license_detections: Vec::new(),
3781 extracted_license_statement: license,
3782 notice_text: None,
3783 source_packages: Vec::new(),
3784 file_references: Vec::new(),
3785 is_private: has_private_classifier(&classifiers),
3786 is_virtual: false,
3787 extra_data: if extra_data.is_empty() {
3788 None
3789 } else {
3790 Some(extra_data)
3791 },
3792 dependencies,
3793 repository_homepage_url,
3794 repository_download_url,
3795 api_data_url,
3796 datasource_id: Some(DatasourceId::PypiJson),
3797 purl,
3798 }
3799}
3800
3801fn select_pypi_json_artifact(
3802 urls: &[serde_json::Value],
3803) -> (Option<String>, Option<u64>, Option<String>) {
3804 let selected = urls
3805 .iter()
3806 .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
3807 .or_else(|| urls.first());
3808
3809 let Some(entry) = selected else {
3810 return (None, None, None);
3811 };
3812
3813 let download_url = entry
3814 .get("url")
3815 .and_then(|value| value.as_str())
3816 .map(ToOwned::to_owned);
3817 let size = entry.get("size").and_then(|value| value.as_u64());
3818 let sha256 = entry
3819 .get("digests")
3820 .and_then(|value| value.as_object())
3821 .and_then(|digests| digests.get("sha256"))
3822 .and_then(|value| value.as_str())
3823 .map(ToOwned::to_owned);
3824
3825 (download_url, size, sha256)
3826}
3827
3828fn extract_from_pip_inspect(path: &Path) -> PackageData {
3829 let content = match read_file_to_string(path) {
3830 Ok(content) => content,
3831 Err(e) => {
3832 warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
3833 return default_package_data(path);
3834 }
3835 };
3836
3837 let root: serde_json::Value = match serde_json::from_str(&content) {
3838 Ok(value) => value,
3839 Err(e) => {
3840 warn!(
3841 "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
3842 path, e
3843 );
3844 return default_package_data(path);
3845 }
3846 };
3847
3848 let installed = match root.get("installed").and_then(|v| v.as_array()) {
3849 Some(arr) => arr,
3850 None => {
3851 warn!(
3852 "No 'installed' array found in pip-inspect.deplock at {:?}",
3853 path
3854 );
3855 return default_package_data(path);
3856 }
3857 };
3858
3859 let pip_version = root
3860 .get("pip_version")
3861 .and_then(|v| v.as_str())
3862 .map(String::from);
3863 let inspect_version = root
3864 .get("version")
3865 .and_then(|v| v.as_str())
3866 .map(String::from);
3867
3868 let mut main_package: Option<PackageData> = None;
3869 let mut dependencies: Vec<Dependency> = Vec::new();
3870
3871 for package_entry in installed {
3872 let metadata = match package_entry.get("metadata") {
3873 Some(m) => m,
3874 None => continue,
3875 };
3876
3877 let is_requested = package_entry
3878 .get("requested")
3879 .and_then(|v| v.as_bool())
3880 .unwrap_or(false);
3881 let has_direct_url = package_entry.get("direct_url").is_some();
3882
3883 let name = metadata
3884 .get("name")
3885 .and_then(|v| v.as_str())
3886 .map(String::from);
3887 let version = metadata
3888 .get("version")
3889 .and_then(|v| v.as_str())
3890 .map(String::from);
3891 let summary = metadata
3892 .get("summary")
3893 .and_then(|v| v.as_str())
3894 .map(String::from);
3895 let home_page = metadata
3896 .get("home_page")
3897 .and_then(|v| v.as_str())
3898 .map(String::from);
3899 let author = metadata
3900 .get("author")
3901 .and_then(|v| v.as_str())
3902 .map(String::from);
3903 let author_email = metadata
3904 .get("author_email")
3905 .and_then(|v| v.as_str())
3906 .map(String::from);
3907 let license = metadata
3908 .get("license")
3909 .and_then(|v| v.as_str())
3910 .map(String::from);
3911 let description = metadata
3912 .get("description")
3913 .and_then(|v| v.as_str())
3914 .map(String::from);
3915 let keywords = metadata
3916 .get("keywords")
3917 .and_then(|v| v.as_array())
3918 .map(|arr| {
3919 arr.iter()
3920 .filter_map(|k| k.as_str().map(String::from))
3921 .collect::<Vec<_>>()
3922 })
3923 .unwrap_or_default();
3924
3925 let mut parties = Vec::new();
3926 if author.is_some() || author_email.is_some() {
3927 parties.push(Party {
3928 r#type: Some("person".to_string()),
3929 role: Some("author".to_string()),
3930 name: author,
3931 email: author_email,
3932 url: None,
3933 organization: None,
3934 organization_url: None,
3935 timezone: None,
3936 });
3937 }
3938
3939 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3940 normalize_spdx_declared_license(license.as_deref());
3941 let extracted_license_statement = license.clone();
3942 let requires_dist = metadata
3943 .get("requires_dist")
3944 .and_then(|v| v.as_array())
3945 .map(|entries| {
3946 entries
3947 .iter()
3948 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
3949 .collect::<Vec<_>>()
3950 })
3951 .unwrap_or_default();
3952 let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
3953
3954 let purl = name.as_ref().and_then(|n| {
3955 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
3956 if let Some(v) = &version {
3957 package_url.with_version(v).ok()?;
3958 }
3959 Some(package_url.to_string())
3960 });
3961
3962 if is_requested && has_direct_url {
3963 let mut extra_data = HashMap::new();
3964 if let Some(pv) = &pip_version {
3965 extra_data.insert(
3966 "pip_version".to_string(),
3967 serde_json::Value::String(pv.clone()),
3968 );
3969 }
3970 if let Some(iv) = &inspect_version {
3971 extra_data.insert(
3972 "inspect_version".to_string(),
3973 serde_json::Value::String(iv.clone()),
3974 );
3975 }
3976
3977 main_package = Some(PackageData {
3978 package_type: Some(PythonParser::PACKAGE_TYPE),
3979 namespace: None,
3980 name,
3981 version,
3982 qualifiers: None,
3983 subpath: None,
3984 primary_language: Some("Python".to_string()),
3985 description: description.or(summary),
3986 release_date: None,
3987 parties,
3988 keywords,
3989 homepage_url: home_page,
3990 download_url: None,
3991 size: None,
3992 sha1: None,
3993 md5: None,
3994 sha256: None,
3995 sha512: None,
3996 bug_tracking_url: None,
3997 code_view_url: None,
3998 vcs_url: None,
3999 copyright: None,
4000 holder: None,
4001 declared_license_expression,
4002 declared_license_expression_spdx,
4003 license_detections,
4004 other_license_expression: None,
4005 other_license_expression_spdx: None,
4006 other_license_detections: Vec::new(),
4007 extracted_license_statement,
4008 notice_text: None,
4009 source_packages: Vec::new(),
4010 file_references: Vec::new(),
4011 is_private: false,
4012 is_virtual: true,
4013 extra_data: if extra_data.is_empty() {
4014 None
4015 } else {
4016 Some(extra_data)
4017 },
4018 dependencies: parsed_dependencies,
4019 repository_homepage_url: None,
4020 repository_download_url: None,
4021 api_data_url: None,
4022 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4023 purl,
4024 });
4025 } else {
4026 let resolved_package = PackageData {
4027 package_type: Some(PythonParser::PACKAGE_TYPE),
4028 namespace: None,
4029 name: name.clone(),
4030 version: version.clone(),
4031 qualifiers: None,
4032 subpath: None,
4033 primary_language: Some("Python".to_string()),
4034 description: description.or(summary),
4035 release_date: None,
4036 parties,
4037 keywords,
4038 homepage_url: home_page,
4039 download_url: None,
4040 size: None,
4041 sha1: None,
4042 md5: None,
4043 sha256: None,
4044 sha512: None,
4045 bug_tracking_url: None,
4046 code_view_url: None,
4047 vcs_url: None,
4048 copyright: None,
4049 holder: None,
4050 declared_license_expression,
4051 declared_license_expression_spdx,
4052 license_detections,
4053 other_license_expression: None,
4054 other_license_expression_spdx: None,
4055 other_license_detections: Vec::new(),
4056 extracted_license_statement,
4057 notice_text: None,
4058 source_packages: Vec::new(),
4059 file_references: Vec::new(),
4060 is_private: false,
4061 is_virtual: true,
4062 extra_data: None,
4063 dependencies: parsed_dependencies,
4064 repository_homepage_url: None,
4065 repository_download_url: None,
4066 api_data_url: None,
4067 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4068 purl: purl.clone(),
4069 };
4070
4071 let resolved = package_data_to_resolved(&resolved_package);
4072 dependencies.push(Dependency {
4073 purl,
4074 extracted_requirement: None,
4075 scope: None,
4076 is_runtime: Some(true),
4077 is_optional: Some(false),
4078 is_pinned: Some(true),
4079 is_direct: Some(is_requested),
4080 resolved_package: Some(Box::new(resolved)),
4081 extra_data: None,
4082 });
4083 }
4084 }
4085
4086 if let Some(mut main_pkg) = main_package {
4087 let direct_requirement_purls: HashSet<String> = main_pkg
4088 .dependencies
4089 .iter()
4090 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4091 .collect();
4092
4093 let resolved_requirement_purls: HashSet<String> = dependencies
4094 .iter()
4095 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4096 .collect();
4097
4098 let unresolved_dependencies = main_pkg
4099 .dependencies
4100 .iter()
4101 .filter(|dep| {
4102 dep.purl.as_ref().is_some_and(|purl| {
4103 !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4104 })
4105 })
4106 .cloned()
4107 .collect::<Vec<_>>();
4108
4109 for dependency in &mut dependencies {
4110 if dependency
4111 .purl
4112 .as_ref()
4113 .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4114 {
4115 dependency.is_direct = Some(true);
4116 }
4117 }
4118
4119 main_pkg.dependencies = dependencies;
4120 main_pkg.dependencies.extend(unresolved_dependencies);
4121 main_pkg
4122 } else {
4123 default_package_data(path)
4124 }
4125}
4126
/// Returns the part of `purl` before its first `@`, or the whole string when
/// it contains no `@`.
fn base_dependency_purl(purl: &str) -> String {
    let base = match purl.find('@') {
        Some(at) => &purl[..at],
        None => purl,
    };
    base.to_string()
}
4132
/// Parsed INI-style content: section name -> key -> list of values.
///
/// As produced by `parse_setup_cfg`, section names and keys are ASCII
/// lower-cased and a key's list holds its inline value (if any) followed by
/// its continuation lines.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4134
4135fn extract_from_setup_cfg(path: &Path) -> PackageData {
4136 let content = match read_file_to_string(path) {
4137 Ok(content) => content,
4138 Err(e) => {
4139 warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4140 return default_package_data(path);
4141 }
4142 };
4143
4144 let sections = parse_setup_cfg(&content);
4145 let name = get_ini_value(§ions, "metadata", "name");
4146 let version = get_ini_value(§ions, "metadata", "version");
4147 let description = get_ini_value(§ions, "metadata", "description");
4148 let author = get_ini_value(§ions, "metadata", "author");
4149 let author_email = get_ini_value(§ions, "metadata", "author_email");
4150 let maintainer = get_ini_value(§ions, "metadata", "maintainer");
4151 let maintainer_email = get_ini_value(§ions, "metadata", "maintainer_email");
4152 let license = get_ini_value(§ions, "metadata", "license");
4153 let mut homepage_url = get_ini_value(§ions, "metadata", "url");
4154 let classifiers = get_ini_values(§ions, "metadata", "classifiers");
4155 let keywords = parse_setup_cfg_keywords(get_ini_value(§ions, "metadata", "keywords"));
4156 let python_requires = get_ini_value(§ions, "options", "python_requires");
4157 let parsed_project_urls =
4158 parse_setup_cfg_project_urls(&get_ini_values(§ions, "metadata", "project_urls"));
4159 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4160 let mut extra_data = HashMap::new();
4161
4162 let mut parties = Vec::new();
4163 if author.is_some() || author_email.is_some() {
4164 parties.push(Party {
4165 r#type: Some("person".to_string()),
4166 role: Some("author".to_string()),
4167 name: author,
4168 email: author_email,
4169 url: None,
4170 organization: None,
4171 organization_url: None,
4172 timezone: None,
4173 });
4174 }
4175
4176 if maintainer.is_some() || maintainer_email.is_some() {
4177 parties.push(Party {
4178 r#type: Some("person".to_string()),
4179 role: Some("maintainer".to_string()),
4180 name: maintainer,
4181 email: maintainer_email,
4182 url: None,
4183 organization: None,
4184 organization_url: None,
4185 timezone: None,
4186 });
4187 }
4188
4189 let declared_license_expression = None;
4190 let declared_license_expression_spdx = None;
4191 let license_detections = Vec::new();
4192 let extracted_license_statement = license.clone();
4193
4194 let dependencies = extract_setup_cfg_dependencies(§ions);
4195
4196 if let Some(value) = python_requires {
4197 extra_data.insert(
4198 "python_requires".to_string(),
4199 serde_json::Value::String(value),
4200 );
4201 }
4202
4203 apply_project_url_mappings(
4204 &parsed_project_urls,
4205 &mut homepage_url,
4206 &mut bug_tracking_url,
4207 &mut code_view_url,
4208 &mut vcs_url,
4209 &mut extra_data,
4210 );
4211
4212 let extra_data = if extra_data.is_empty() {
4213 None
4214 } else {
4215 Some(extra_data)
4216 };
4217
4218 let purl = name.as_ref().and_then(|n| {
4219 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4220 if let Some(v) = &version {
4221 package_url.with_version(v).ok()?;
4222 }
4223 Some(package_url.to_string())
4224 });
4225
4226 PackageData {
4227 package_type: Some(PythonParser::PACKAGE_TYPE),
4228 namespace: None,
4229 name,
4230 version,
4231 qualifiers: None,
4232 subpath: None,
4233 primary_language: Some("Python".to_string()),
4234 description,
4235 release_date: None,
4236 parties,
4237 keywords,
4238 homepage_url,
4239 download_url: None,
4240 size: None,
4241 sha1: None,
4242 md5: None,
4243 sha256: None,
4244 sha512: None,
4245 bug_tracking_url,
4246 code_view_url,
4247 vcs_url,
4248 copyright: None,
4249 holder: None,
4250 declared_license_expression,
4251 declared_license_expression_spdx,
4252 license_detections,
4253 other_license_expression: None,
4254 other_license_expression_spdx: None,
4255 other_license_detections: Vec::new(),
4256 extracted_license_statement,
4257 notice_text: None,
4258 source_packages: Vec::new(),
4259 file_references: Vec::new(),
4260 is_private: has_private_classifier(&classifiers),
4261 is_virtual: false,
4262 extra_data,
4263 dependencies,
4264 repository_homepage_url: None,
4265 repository_download_url: None,
4266 api_data_url: None,
4267 datasource_id: Some(DatasourceId::PypiSetupCfg),
4268 purl,
4269 }
4270}
4271
/// Splits a comma-separated keyword string into trimmed, non-empty keywords.
///
/// `None` yields an empty list.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut keywords = Vec::new();

    if let Some(raw) = value {
        for piece in raw.split(',') {
            let piece = piece.trim();
            if !piece.is_empty() {
                keywords.push(piece.to_owned());
            }
        }
    }

    keywords
}
4284
/// Parses `label = url` entries into `(label, url)` pairs, preserving order.
///
/// Each entry is split at its first `=`; both sides are trimmed. Entries
/// without `=`, or with an empty label or URL, are skipped.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };
        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }

    pairs
}
4300
4301fn apply_project_url_mappings(
4302 parsed_urls: &[(String, String)],
4303 homepage_url: &mut Option<String>,
4304 bug_tracking_url: &mut Option<String>,
4305 code_view_url: &mut Option<String>,
4306 vcs_url: &mut Option<String>,
4307 extra_data: &mut HashMap<String, serde_json::Value>,
4308) {
4309 for (label, url) in parsed_urls {
4310 let label_lower = label.to_lowercase();
4311
4312 if bug_tracking_url.is_none()
4313 && matches!(
4314 label_lower.as_str(),
4315 "tracker"
4316 | "bug reports"
4317 | "bug tracker"
4318 | "issues"
4319 | "issue tracker"
4320 | "github: issues"
4321 )
4322 {
4323 *bug_tracking_url = Some(url.clone());
4324 } else if code_view_url.is_none()
4325 && matches!(label_lower.as_str(), "source" | "source code" | "code")
4326 {
4327 *code_view_url = Some(url.clone());
4328 } else if vcs_url.is_none()
4329 && matches!(
4330 label_lower.as_str(),
4331 "github" | "gitlab" | "github: repo" | "repository"
4332 )
4333 {
4334 *vcs_url = Some(url.clone());
4335 } else if homepage_url.is_none()
4336 && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4337 {
4338 *homepage_url = Some(url.clone());
4339 } else if label_lower == "changelog" {
4340 extra_data.insert(
4341 "changelog_url".to_string(),
4342 serde_json::Value::String(url.clone()),
4343 );
4344 }
4345 }
4346
4347 let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4348 .iter()
4349 .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4350 .collect();
4351
4352 if !project_urls_json.is_empty() {
4353 extra_data.insert(
4354 "project_urls".to_string(),
4355 serde_json::Value::Object(project_urls_json),
4356 );
4357 }
4358}
4359
4360fn parse_setup_cfg(content: &str) -> IniSections {
4361 let mut sections: IniSections = HashMap::new();
4362 let mut current_section: Option<String> = None;
4363 let mut current_key: Option<String> = None;
4364
4365 for raw_line in content.lines() {
4366 let line = raw_line.trim_end_matches('\r');
4367 let trimmed = line.trim();
4368 if trimmed.is_empty() {
4369 continue;
4370 }
4371
4372 let stripped = line.trim_start();
4373 if stripped.starts_with('#') || stripped.starts_with(';') {
4374 continue;
4375 }
4376
4377 if stripped.starts_with('[') && stripped.ends_with(']') {
4378 let section_name = stripped
4379 .trim_start_matches('[')
4380 .trim_end_matches(']')
4381 .trim()
4382 .to_ascii_lowercase();
4383 current_section = if section_name.is_empty() {
4384 None
4385 } else {
4386 Some(section_name)
4387 };
4388 current_key = None;
4389 continue;
4390 }
4391
4392 if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
4393 if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
4394 let value = stripped.trim();
4395 if !value.is_empty() {
4396 sections
4397 .entry(section.clone())
4398 .or_default()
4399 .entry(key.clone())
4400 .or_default()
4401 .push(value.to_string());
4402 }
4403 }
4404 continue;
4405 }
4406
4407 if let Some((key, value)) = stripped.split_once('=')
4408 && let Some(section) = current_section.as_ref()
4409 {
4410 let key_name = key.trim().to_ascii_lowercase();
4411 let value_trimmed = value.trim();
4412 let entry = sections
4413 .entry(section.clone())
4414 .or_default()
4415 .entry(key_name.clone())
4416 .or_default();
4417 if !value_trimmed.is_empty() {
4418 entry.push(value_trimmed.to_string());
4419 }
4420 current_key = Some(key_name);
4421 }
4422 }
4423
4424 sections
4425}
4426
4427fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
4428 sections
4429 .get(§ion.to_ascii_lowercase())
4430 .and_then(|values| values.get(&key.to_ascii_lowercase()))
4431 .and_then(|entries| entries.first())
4432 .map(|value| value.trim().to_string())
4433}
4434
4435fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
4436 sections
4437 .get(§ion.to_ascii_lowercase())
4438 .and_then(|values| values.get(&key.to_ascii_lowercase()))
4439 .cloned()
4440 .unwrap_or_default()
4441}
4442
4443fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
4444 let mut dependencies = Vec::new();
4445
4446 for (sub_section, scope) in [
4447 ("install_requires", "install"),
4448 ("tests_require", "test"),
4449 ("setup_requires", "setup"),
4450 ] {
4451 let reqs = get_ini_values(sections, "options", sub_section);
4452 dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
4453 }
4454
4455 if let Some(extras) = sections.get("options.extras_require") {
4456 let mut extra_items: Vec<_> = extras.iter().collect();
4457 extra_items.sort_by_key(|(name, _)| *name);
4458 for (extra_name, reqs) in extra_items {
4459 dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
4460 }
4461 }
4462
4463 dependencies
4464}
4465
4466fn parse_setup_cfg_requirements(
4467 reqs: &[String],
4468 scope: &str,
4469 is_optional: bool,
4470) -> Vec<Dependency> {
4471 reqs.iter()
4472 .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
4473 .collect()
4474}
4475
4476fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
4477 let trimmed = req.trim();
4478 if trimmed.is_empty() || trimmed.starts_with('#') {
4479 return None;
4480 }
4481
4482 let name = extract_setup_cfg_dependency_name(trimmed)?;
4483 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4484
4485 Some(Dependency {
4486 purl: Some(purl.to_string()),
4487 extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
4488 scope: Some(scope.to_string()),
4489 is_runtime: Some(true),
4490 is_optional: Some(is_optional),
4491 is_pinned: Some(false),
4492 is_direct: Some(true),
4493 resolved_package: None,
4494 extra_data: None,
4495 })
4496}
4497
/// Extracts the package name at the start of a requirement string.
///
/// The name is everything in the trimmed input before the first whitespace
/// character or the first of `<`, `>`, `=`, `!`, `~`, `;`, `[`, `(`, `@`.
/// `(` and `@` are included so that `name(>=1.0)` and `name@https://...`
/// yield `name` rather than a name polluted with the delimiter and what
/// follows it.
///
/// Returns `None` when the input is blank or starts with a delimiter.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();

    let end = trimmed
        .find(|c: char| {
            c.is_whitespace()
                || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    // `trimmed` has no leading whitespace and whitespace is itself a
    // delimiter, so the prefix needs no further trimming.
    let name = &trimmed[..end];
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
4514
/// Returns `req` with every whitespace character removed.
///
/// Whitespace inside the requirement is removed as well as leading and
/// trailing whitespace, so `"foo >= 1.0"` becomes `"foo>=1.0"`.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    // Splitting on whitespace and re-joining the pieces with nothing in
    // between drops exactly the whitespace characters.
    req.split_whitespace().collect::<String>()
}
4518
/// Extracts the quoted string assigned to `key` in `setup.py`-like text.
///
/// This is a plain text search, not a Python parse. It looks for `key`
/// followed by `=` with at most one space on either side and an opening
/// quote, then returns everything up to the next occurrence of the *same*
/// quote character. Matching the closing quote to the opening one keeps
/// values such as `"it's"` intact.
///
/// Double-quoted forms are tried before single-quoted ones, and within each
/// quote style the spacings are tried in the order `k=`, `k =`, `k= `,
/// `k = `. A form that has an opening quote but no closing one is skipped in
/// favour of the next form.
///
/// Returns `None` when no form yields a terminated value.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    // (text before '=', text after '=') for each accepted spacing.
    const SPACINGS: [(&str, &str); 4] = [("", ""), (" ", ""), ("", " "), (" ", " ")];

    for quote in ['"', '\''] {
        for (before, after) in SPACINGS {
            let pattern = format!("{}{}={}{}", key, before, after, quote);
            let Some(start_idx) = content.find(&pattern) else {
                continue;
            };

            // The pattern ends with an ASCII quote, so this offset is a
            // valid char boundary.
            let remaining = &content[start_idx + pattern.len()..];
            if let Some(end_idx) = remaining.find(quote) {
                return Some(remaining[..end_idx].to_string());
            }
        }
    }

    None
}
4544
4545fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
4546 let mut dependencies = Vec::new();
4547
4548 if let Some(tests_deps) = extract_tests_require(content) {
4549 dependencies.extend(tests_deps);
4550 }
4551
4552 if let Some(extras_deps) = extract_extras_require(content) {
4553 dependencies.extend(extras_deps);
4554 }
4555
4556 dependencies
4557}
4558
4559fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
4560 let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
4561 let re = Regex::new(pattern).ok()?;
4562 let captures = re.captures(content)?;
4563 let deps_str = captures.get(1)?.as_str();
4564
4565 let deps = parse_setup_py_dep_list(deps_str, "test", true);
4566 if deps.is_empty() { None } else { Some(deps) }
4567}
4568
4569fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
4570 let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
4571 let re = Regex::new(pattern).ok()?;
4572 let captures = re.captures(content)?;
4573 let dict_content = captures.get(1)?.as_str();
4574
4575 let mut all_deps = Vec::new();
4576
4577 let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
4578 let entry_re = Regex::new(entry_pattern).ok()?;
4579
4580 for entry_cap in entry_re.captures_iter(dict_content) {
4581 if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
4582 let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
4583 all_deps.extend(deps);
4584 }
4585 }
4586
4587 if all_deps.is_empty() {
4588 None
4589 } else {
4590 Some(all_deps)
4591 }
4592}
4593
4594fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
4595 let dep_pattern = r#"['"]([^'"]+)['"]"#;
4596 let re = match Regex::new(dep_pattern) {
4597 Ok(r) => r,
4598 Err(_) => return Vec::new(),
4599 };
4600
4601 re.captures_iter(deps_str)
4602 .filter_map(|cap| {
4603 let dep_str = cap.get(1)?.as_str().trim();
4604 if dep_str.is_empty() {
4605 return None;
4606 }
4607
4608 let name = extract_setup_cfg_dependency_name(dep_str)?;
4609 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4610
4611 Some(Dependency {
4612 purl: Some(purl.to_string()),
4613 extracted_requirement: Some(dep_str.to_string()),
4614 scope: Some(scope.to_string()),
4615 is_runtime: Some(true),
4616 is_optional: Some(is_optional),
4617 is_pinned: Some(false),
4618 is_direct: Some(true),
4619 resolved_package: None,
4620 extra_data: None,
4621 })
4622 })
4623 .collect()
4624}
4625
4626pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
4628 let content = read_file_to_string(path).map_err(|e| e.to_string())?;
4629 toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
4630}
4631
4632fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<String>) {
4643 let mut file = match File::open(path) {
4644 Ok(f) => f,
4645 Err(_) => return (None, None),
4646 };
4647
4648 let metadata = match file.metadata() {
4649 Ok(m) => m,
4650 Err(_) => return (None, None),
4651 };
4652 let size = metadata.len();
4653
4654 let mut hasher = Sha256::new();
4655 let mut buffer = vec![0; 8192];
4656
4657 loop {
4658 match file.read(&mut buffer) {
4659 Ok(0) => break,
4660 Ok(n) => hasher.update(&buffer[..n]),
4661 Err(_) => return (Some(size), None),
4662 }
4663 }
4664
4665 let hash = hex::encode(hasher.finalize());
4666 (Some(size), Some(hash))
4667}
4668
4669fn default_package_data(path: &Path) -> PackageData {
4670 PackageData {
4671 package_type: Some(PythonParser::PACKAGE_TYPE),
4672 primary_language: Some("Python".to_string()),
4673 datasource_id: infer_python_datasource_id(path),
4674 ..Default::default()
4675 }
4676}
4677
4678fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
4679 let file_name = path.file_name().and_then(|name| name.to_str());
4680
4681 match file_name {
4682 Some("pyproject.toml") => {
4683 if read_toml_file(path)
4684 .ok()
4685 .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
4686 .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
4687 .is_some()
4688 {
4689 Some(DatasourceId::PypiPoetryPyprojectToml)
4690 } else {
4691 Some(DatasourceId::PypiPyprojectToml)
4692 }
4693 }
4694 Some("setup.py") => Some(DatasourceId::PypiSetupPy),
4695 Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
4696 Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
4697 Some("METADATA") => Some(DatasourceId::PypiWheelMetadata),
4698 Some("pypi.json") => Some(DatasourceId::PypiJson),
4699 Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
4700 Some("origin.json") if is_pip_cache_origin_json(path) => {
4701 Some(DatasourceId::PypiPipOriginJson)
4702 }
4703 _ if is_python_sdist_archive_path(path) => Some(DatasourceId::PypiSdist),
4704 _ if path
4705 .extension()
4706 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
4707 {
4708 Some(DatasourceId::PypiWheel)
4709 }
4710 _ if path
4711 .extension()
4712 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
4713 {
4714 Some(DatasourceId::PypiEgg)
4715 }
4716 _ => None,
4717 }
4718}
4719
// Registration entry for the Python parser. The arguments are, in order: a
// human-readable description, the list of path globs, the string "pypi", the
// string "Python", and an optional URL.
// NOTE(review): presumably these are the description, match patterns, package
// type, primary language and documentation URL — confirm against the
// `register_parser!` macro definition.
// NOTE(review): `infer_python_datasource_id` also recognizes
// `pip-inspect.deplock`, which has no glob here — presumably it is registered
// elsewhere; verify.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, setup.cfg, pypi.json, PKG-INFO, METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);