1use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
35use crate::parser_warn as warn;
36use crate::parsers::utils::{read_file_to_string, split_name_email};
37use base64::Engine;
38use base64::engine::general_purpose::URL_SAFE_NO_PAD;
39use bzip2::read::BzDecoder;
40use csv::ReaderBuilder;
41use flate2::read::GzDecoder;
42use liblzma::read::XzDecoder;
43use packageurl::PackageUrl;
44use regex::Regex;
45use ruff_python_ast as ast;
46use ruff_python_parser::parse_module;
47use serde_json::{Map as JsonMap, Value as JsonValue};
48use sha2::{Digest, Sha256};
49use std::collections::{HashMap, HashSet};
50use std::fs::File;
51use std::io::Read;
52use std::path::{Component, Path, PathBuf};
53use tar::Archive;
54use toml::Value as TomlValue;
55use toml::map::Map as TomlMap;
56use zip::ZipArchive;
57
58use super::PackageParser;
59use super::license_normalization::{
60 DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
61 normalize_spdx_expression,
62};
63
// Table and key names looked up in Python packaging manifests. The lookups themselves live
// outside this block; presumably these are `pyproject.toml` keys — confirm against the callers.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

// Limits for `setup.py` handling (byte size, AST node count and AST depth, judging by the
// names). The code that enforces them is not part of this block.
const MAX_SETUP_PY_BYTES: usize = 1024 * 1024;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

/// Upper bound in bytes (100 MiB) on an archive's on-disk size and on the summed size of the
/// entries accepted from it.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;

/// Upper bound in bytes (50 MiB) on a single archive entry.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;

/// Largest uncompressed-to-compressed size ratio tolerated before archive content is treated
/// as suspicious.
const MAX_COMPRESSION_RATIO: f64 = 100.0;

/// Marker type carrying the `PackageParser` implementation for Python package metadata.
pub struct PythonParser;
95
/// Container formats recognised for Python source-distribution archives, keyed by the file
/// name suffix that selects them.
#[derive(Debug, Clone, Copy)]
enum PythonSdistArchiveFormat {
    /// File name ends with `.tar.gz`.
    TarGz,
    /// File name ends with `.tgz`.
    Tgz,
    /// File name ends with `.tar.bz2`.
    TarBz2,
    /// File name ends with `.tar.xz`.
    TarXz,
    /// File name ends with `.zip`.
    Zip,
}
104
/// A zip archive member that passed the path and size checks in
/// `collect_validated_zip_entries`.
#[derive(Debug, Clone)]
struct ValidatedZipEntry {
    /// Position of the member inside the archive.
    index: usize,
    /// Member path as returned by `normalize_archive_entry_path`.
    name: String,
}
110
111impl PackageParser for PythonParser {
112 const PACKAGE_TYPE: PackageType = PackageType::Pypi;
113
114 fn extract_packages(path: &Path) -> Vec<PackageData> {
115 vec![
116 if path.file_name().unwrap_or_default() == "pyproject.toml" {
117 extract_from_pyproject_toml(path)
118 } else if path.file_name().unwrap_or_default() == "setup.cfg" {
119 extract_from_setup_cfg(path)
120 } else if is_setup_py_like_path(path) {
121 return extract_setup_py_packages(path);
122 } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
123 extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
124 } else if is_installed_wheel_metadata_path(path) {
125 extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
126 } else if is_pip_cache_origin_json(path) {
127 extract_from_pip_origin_json(path)
128 } else if path.file_name().unwrap_or_default() == "pypi.json" {
129 extract_from_pypi_json(path)
130 } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
131 extract_from_pip_inspect(path)
132 } else if is_python_sdist_archive_path(path) {
133 extract_from_sdist_archive(path)
134 } else if path
135 .extension()
136 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
137 {
138 extract_from_wheel_archive(path)
139 } else if path
140 .extension()
141 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
142 {
143 extract_from_egg_archive(path)
144 } else {
145 default_package_data(path)
146 },
147 ]
148 }
149
150 fn is_match(path: &Path) -> bool {
151 if let Some(filename) = path.file_name()
152 && (filename == "pyproject.toml"
153 || filename == "setup.cfg"
154 || is_setup_py_like_path(path)
155 || filename == "PKG-INFO"
156 || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
157 || filename == "pypi.json"
158 || filename == "pip-inspect.deplock"
159 || is_pip_cache_origin_json(path))
160 {
161 return true;
162 }
163
164 if let Some(extension) = path.extension() {
165 let ext = extension.to_string_lossy().to_lowercase();
166 if ext == "whl" || ext == "egg" || is_python_sdist_archive_path(path) {
167 return true;
168 }
169 }
170
171 false
172 }
173}
174
/// Returns `true` when the file name is exactly `setup.py` or ends with `_setup.py`.
///
/// Paths without a file name, or whose file name is not valid UTF-8, never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name == "setup.py" || name.ends_with("_setup.py"),
        None => false,
    }
}
180
/// Returns `true` for a file named `METADATA` whose immediate parent directory name ends
/// with `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    if path.file_name().and_then(|name| name.to_str()) != Some("METADATA") {
        return false;
    }

    let parent_dir_name = path
        .parent()
        .and_then(|parent| parent.file_name())
        .and_then(|name| name.to_str());

    matches!(parent_dir_name, Some(dir_name) if dir_name.ends_with(".dist-info"))
}
189
/// Values read from the `WHEEL` file that sits next to an installed `METADATA` file.
#[derive(Clone, Debug)]
struct InstalledWheelMetadata {
    /// Every `tag` header value, in file order.
    wheel_tags: Vec<String>,
    /// First `wheel-version` header value, if present.
    wheel_version: Option<String>,
    /// First `generator` header value, if present.
    wheel_generator: Option<String>,
    /// `root-is-purelib` header parsed as a boolean; `None` when absent or not true/false.
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`.
    compressed_tag: Option<String>,
}
198
199fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
200 let Some(parent) = path.parent() else {
201 return;
202 };
203
204 if !parent
205 .file_name()
206 .and_then(|name| name.to_str())
207 .is_some_and(|name| name.ends_with(".dist-info"))
208 {
209 return;
210 }
211
212 let wheel_path = parent.join("WHEEL");
213 if !wheel_path.exists() {
214 return;
215 }
216
217 let Ok(content) = read_file_to_string(&wheel_path) else {
218 warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
219 return;
220 };
221
222 let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
223 return;
224 };
225
226 apply_installed_wheel_metadata(package_data, &wheel_metadata);
227}
228
229fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
230 use super::rfc822::{get_header_all, get_header_first};
231
232 let metadata = super::rfc822::parse_rfc822_content(content);
233 let wheel_tags = get_header_all(&metadata.headers, "tag");
234 if wheel_tags.is_empty() {
235 return None;
236 }
237
238 let wheel_version = get_header_first(&metadata.headers, "wheel-version");
239 let wheel_generator = get_header_first(&metadata.headers, "generator");
240 let root_is_purelib =
241 get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
242 match value.to_ascii_lowercase().as_str() {
243 "true" => Some(true),
244 "false" => Some(false),
245 _ => None,
246 }
247 });
248
249 let compressed_tag = compress_wheel_tags(&wheel_tags);
250
251 Some(InstalledWheelMetadata {
252 wheel_tags,
253 wheel_version,
254 wheel_generator,
255 root_is_purelib,
256 compressed_tag,
257 })
258}
259
/// Collapses a list of expanded wheel tags into one compressed tag.
///
/// * An empty list gives `None`.
/// * A single tag is returned unchanged, whatever its shape.
/// * Several tags are merged only when each splits into `python-abi-platform` and all share
///   the same ABI and platform; the python parts are then joined with `.` in input order.
///   Any malformed tag, or any ABI/platform disagreement, gives `None`.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => return None,
        [only] => return Some(only.clone()),
        _ => {}
    }

    let mut interpreters: Vec<&str> = Vec::with_capacity(tags.len());
    let mut shared: Option<(&str, &str)> = None;

    for tag in tags {
        // At most three fields: the platform keeps any further dashes.
        let mut fields = tag.splitn(3, '-');
        let python = fields.next()?;
        let abi = fields.next()?;
        let platform = fields.next()?;

        match shared {
            Some(seen) if seen != (abi, platform) => return None,
            _ => shared = Some((abi, platform)),
        }
        interpreters.push(python);
    }

    let (abi, platform) = shared?;
    Some(format!("{}-{}-{}", interpreters.join("."), abi, platform))
}
297
298fn apply_installed_wheel_metadata(
299 package_data: &mut PackageData,
300 wheel_metadata: &InstalledWheelMetadata,
301) {
302 let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
303 extra_data.insert(
304 "wheel_tags".to_string(),
305 JsonValue::Array(
306 wheel_metadata
307 .wheel_tags
308 .iter()
309 .cloned()
310 .map(JsonValue::String)
311 .collect(),
312 ),
313 );
314
315 if let Some(wheel_version) = &wheel_metadata.wheel_version {
316 extra_data.insert(
317 "wheel_version".to_string(),
318 JsonValue::String(wheel_version.clone()),
319 );
320 }
321
322 if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
323 extra_data.insert(
324 "wheel_generator".to_string(),
325 JsonValue::String(wheel_generator.clone()),
326 );
327 }
328
329 if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
330 extra_data.insert(
331 "root_is_purelib".to_string(),
332 JsonValue::Bool(root_is_purelib),
333 );
334 }
335
336 if let (Some(name), Some(version), Some(extension)) = (
337 package_data.name.as_deref(),
338 package_data.version.as_deref(),
339 wheel_metadata.compressed_tag.as_deref(),
340 ) {
341 package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
342 }
343}
344
/// Returns `true` for a file named `origin.json` that has an ancestor directory named
/// `wheels` (compared ASCII case-insensitively).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    if path.file_name().and_then(|name| name.to_str()) != Some("origin.json") {
        return false;
    }

    // `skip(1)` drops the path itself so only enclosing directories are inspected.
    for ancestor in path.ancestors().skip(1) {
        let dir_name = ancestor.file_name().and_then(|name| name.to_str());
        if dir_name.is_some_and(|name| name.eq_ignore_ascii_case("wheels")) {
            return true;
        }
    }

    false
}
354
355fn extract_from_pip_origin_json(path: &Path) -> PackageData {
356 let content = match read_file_to_string(path) {
357 Ok(content) => content,
358 Err(e) => {
359 warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
360 return default_package_data(path);
361 }
362 };
363
364 let root: JsonValue = match serde_json::from_str(&content) {
365 Ok(root) => root,
366 Err(e) => {
367 warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
368 return default_package_data(path);
369 }
370 };
371
372 let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
373 warn!("No url found in pip cache origin.json at {:?}", path);
374 return default_package_data(path);
375 };
376
377 let sibling_wheel = find_sibling_cached_wheel(path);
378 let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
379 sibling_wheel
380 .as_ref()
381 .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
382 });
383
384 let Some((name, version)) = name_version else {
385 warn!(
386 "Failed to infer package name/version from pip cache origin.json at {:?}",
387 path
388 );
389 return default_package_data(path);
390 };
391
392 let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
393 build_pypi_urls(Some(&name), Some(&version));
394 let purl = sibling_wheel
395 .as_ref()
396 .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
397 .or(plain_purl);
398
399 PackageData {
400 package_type: Some(PythonParser::PACKAGE_TYPE),
401 primary_language: Some("Python".to_string()),
402 name: Some(name),
403 version: Some(version),
404 datasource_id: Some(DatasourceId::PypiPipOriginJson),
405 download_url: Some(download_url.to_string()),
406 sha256: extract_sha256_from_origin_json(&root),
407 repository_homepage_url,
408 repository_download_url,
409 api_data_url,
410 purl,
411 ..Default::default()
412 }
413}
414
415fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
416 let parent = path.parent()?;
417 let entries = parent.read_dir().ok()?;
418
419 for entry in entries.flatten() {
420 let sibling_path = entry.path();
421 if sibling_path
422 .extension()
423 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
424 && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
425 {
426 return Some(wheel_info);
427 }
428 }
429
430 None
431}
432
433fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
434 let file_name = url.rsplit('/').next()?;
435
436 if file_name.ends_with(".whl") {
437 return parse_wheel_filename(Path::new(file_name))
438 .map(|wheel_info| (wheel_info.name, wheel_info.version));
439 }
440
441 let stem = strip_python_archive_extension(file_name)?;
442 let (name, version) = stem.rsplit_once('-')?;
443 if name.is_empty() || version.is_empty() {
444 return None;
445 }
446
447 Some((name.replace('_', "-"), version.to_string()))
448}
449
/// Removes the first matching archive suffix from `file_name`.
///
/// Suffixes are tried in the order `.tar.gz`, `.tar.bz2`, `.tar.xz`, `.tgz`, `.zip`, `.whl`
/// and compared case-sensitively. Returns `None` when none of them matches.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] = [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES {
        if let Some(stem) = file_name.strip_suffix(suffix) {
            return Some(stem);
        }
    }

    None
}
455
456fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
457 root.pointer("/archive_info/hashes/sha256")
458 .and_then(|value| value.as_str())
459 .map(ToOwned::to_owned)
460 .or_else(|| {
461 root.pointer("/archive_info/hash")
462 .and_then(|value| value.as_str())
463 .and_then(normalize_origin_hash)
464 })
465}
466
/// Extracts a SHA-256 digest from a single hash string.
///
/// Accepts `sha256=<value>` and `sha256:<value>` (the value is returned without validation)
/// as well as a bare string of exactly 64 ASCII hex digits. Anything else gives `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = hash
        .strip_prefix("sha256=")
        .or_else(|| hash.strip_prefix("sha256:"));
    if let Some(digest) = prefixed {
        return Some(digest.to_string());
    }

    let is_bare_digest = hash.len() == 64 && hash.chars().all(|ch| ch.is_ascii_hexdigit());
    is_bare_digest.then(|| hash.to_string())
}
479
480fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
481 let content = match read_file_to_string(path) {
482 Ok(content) => content,
483 Err(e) => {
484 warn!("Failed to read metadata at {:?}: {}", path, e);
485 return default_package_data(path);
486 }
487 };
488
489 let metadata = super::rfc822::parse_rfc822_content(&content);
490 let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
491 merge_sibling_metadata_dependencies(path, &mut package_data);
492 merge_sibling_metadata_file_references(path, &mut package_data);
493 if datasource_id == DatasourceId::PypiWheelMetadata {
494 merge_sibling_wheel_metadata(path, &mut package_data);
495 }
496 package_data
497}
498
499fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
500 let mut extra_dependencies = Vec::new();
501
502 if let Some(parent) = path.parent() {
503 let direct_requires = parent.join("requires.txt");
504 if direct_requires.exists()
505 && let Ok(content) = read_file_to_string(&direct_requires)
506 {
507 extra_dependencies.extend(parse_requires_txt(&content));
508 }
509
510 let sibling_egg_info_requires = parent
511 .read_dir()
512 .ok()
513 .into_iter()
514 .flatten()
515 .flatten()
516 .find_map(|entry| {
517 let child_path = entry.path();
518 if child_path.is_dir()
519 && child_path
520 .file_name()
521 .and_then(|name| name.to_str())
522 .is_some_and(|name| name.ends_with(".egg-info"))
523 {
524 let requires = child_path.join("requires.txt");
525 requires.exists().then_some(requires)
526 } else {
527 None
528 }
529 });
530
531 if let Some(requires_path) = sibling_egg_info_requires
532 && let Ok(content) = read_file_to_string(&requires_path)
533 {
534 extra_dependencies.extend(parse_requires_txt(&content));
535 }
536 }
537
538 for dependency in extra_dependencies {
539 if !package_data.dependencies.iter().any(|existing| {
540 existing.purl == dependency.purl
541 && existing.scope == dependency.scope
542 && existing.extracted_requirement == dependency.extracted_requirement
543 && existing.extra_data == dependency.extra_data
544 }) {
545 package_data.dependencies.push(dependency);
546 }
547 }
548}
549
550fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
551 let mut extra_refs = Vec::new();
552
553 if let Some(parent) = path.parent() {
554 let record_path = parent.join("RECORD");
555 if record_path.exists()
556 && let Ok(content) = read_file_to_string(&record_path)
557 {
558 extra_refs.extend(parse_record_csv(&content));
559 }
560
561 let installed_files_path = parent.join("installed-files.txt");
562 if installed_files_path.exists()
563 && let Ok(content) = read_file_to_string(&installed_files_path)
564 {
565 extra_refs.extend(parse_installed_files_txt(&content));
566 }
567
568 let sources_path = parent.join("SOURCES.txt");
569 if sources_path.exists()
570 && let Ok(content) = read_file_to_string(&sources_path)
571 {
572 extra_refs.extend(parse_sources_txt(&content));
573 }
574 }
575
576 for file_ref in extra_refs {
577 if !package_data
578 .file_references
579 .iter()
580 .any(|existing| existing.path == file_ref.path)
581 {
582 package_data.file_references.push(file_ref);
583 }
584 }
585}
586
587fn collect_validated_zip_entries<R: Read + std::io::Seek>(
588 archive: &mut ZipArchive<R>,
589 path: &Path,
590 archive_type: &str,
591) -> Result<Vec<ValidatedZipEntry>, String> {
592 let mut total_extracted = 0u64;
593 let mut entries = Vec::new();
594
595 for i in 0..archive.len() {
596 if let Ok(file) = archive.by_index_raw(i) {
597 let compressed_size = file.compressed_size();
598 let uncompressed_size = file.size();
599 let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
600 warn!(
601 "Skipping unsafe path in {} {:?}: {}",
602 archive_type,
603 path,
604 file.name()
605 );
606 continue;
607 };
608
609 if compressed_size > 0 {
610 let ratio = uncompressed_size as f64 / compressed_size as f64;
611 if ratio > MAX_COMPRESSION_RATIO {
612 warn!(
613 "Suspicious compression ratio in {} {:?}: {:.2}:1",
614 archive_type, path, ratio
615 );
616 continue;
617 }
618 }
619
620 if uncompressed_size > MAX_FILE_SIZE {
621 warn!(
622 "File too large in {} {:?}: {} bytes (limit: {} bytes)",
623 archive_type, path, uncompressed_size, MAX_FILE_SIZE
624 );
625 continue;
626 }
627
628 total_extracted += uncompressed_size;
629 if total_extracted > MAX_ARCHIVE_SIZE {
630 let msg = format!(
631 "Total extracted size exceeds limit for {} {:?}",
632 archive_type, path
633 );
634 warn!("{}", msg);
635 return Err(msg);
636 }
637
638 entries.push(ValidatedZipEntry {
639 index: i,
640 name: entry_name,
641 });
642 }
643 }
644
645 Ok(entries)
646}
647
648fn is_python_sdist_archive_path(path: &Path) -> bool {
649 detect_python_sdist_archive_format(path).is_some()
650}
651
652fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
653 let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
654
655 if !is_likely_python_sdist_filename(&file_name) {
656 return None;
657 }
658
659 if file_name.ends_with(".tar.gz") {
660 Some(PythonSdistArchiveFormat::TarGz)
661 } else if file_name.ends_with(".tgz") {
662 tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
663 } else if file_name.ends_with(".tar.bz2") {
664 Some(PythonSdistArchiveFormat::TarBz2)
665 } else if file_name.ends_with(".tar.xz") {
666 Some(PythonSdistArchiveFormat::TarXz)
667 } else if file_name.ends_with(".zip") {
668 zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
669 } else {
670 None
671 }
672}
673
674fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
675 if !path.is_file() {
676 return true;
677 }
678
679 let compressed_size = match std::fs::metadata(path) {
680 Ok(metadata) => metadata.len(),
681 Err(_) => return false,
682 };
683 let file = match File::open(path) {
684 Ok(file) => file,
685 Err(_) => return false,
686 };
687 let decoder = GzDecoder::new(file);
688 let Some(entries) = collect_tar_sdist_entries(path, decoder, "tgz", compressed_size) else {
689 return false;
690 };
691
692 select_sdist_pkginfo_entry(path, &entries).is_some()
693}
694
695fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
696 if !path.is_file() {
697 return true;
698 }
699
700 let file = match File::open(path) {
701 Ok(file) => file,
702 Err(_) => return false,
703 };
704 let mut archive = match ZipArchive::new(file) {
705 Ok(archive) => archive,
706 Err(_) => return false,
707 };
708
709 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
710 Ok(entries) => entries,
711 Err(_) => return false,
712 };
713 let metadata_entries: Vec<_> = validated_entries
714 .iter()
715 .filter(|entry| entry.name.ends_with("/PKG-INFO"))
716 .filter_map(|entry| {
717 read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
718 .ok()
719 .map(|content| (entry.name.clone(), content))
720 })
721 .collect();
722
723 has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
724}
725
726fn is_likely_python_sdist_filename(file_name: &str) -> bool {
727 let Some(stem) = strip_python_archive_extension(file_name) else {
728 return false;
729 };
730
731 let Some((name, version)) = stem.rsplit_once('-') else {
732 return false;
733 };
734
735 !name.is_empty()
736 && !version.is_empty()
737 && version.chars().any(|ch| ch.is_ascii_digit())
738 && name
739 .chars()
740 .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
741}
742
743fn extract_from_sdist_archive(path: &Path) -> PackageData {
744 let metadata = match std::fs::metadata(path) {
745 Ok(m) => m,
746 Err(e) => {
747 warn!(
748 "Failed to read metadata for sdist archive {:?}: {}",
749 path, e
750 );
751 return default_package_data(path);
752 }
753 };
754
755 if metadata.len() > MAX_ARCHIVE_SIZE {
756 warn!(
757 "sdist archive too large: {} bytes (limit: {} bytes)",
758 metadata.len(),
759 MAX_ARCHIVE_SIZE
760 );
761 return default_package_data(path);
762 }
763
764 let Some(format) = detect_python_sdist_archive_format(path) else {
765 return default_package_data(path);
766 };
767
768 let mut package_data = match format {
769 PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
770 let file = match File::open(path) {
771 Ok(file) => file,
772 Err(e) => {
773 warn!("Failed to open sdist archive {:?}: {}", path, e);
774 return default_package_data(path);
775 }
776 };
777 let decoder = GzDecoder::new(file);
778 extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
779 }
780 PythonSdistArchiveFormat::TarBz2 => {
781 let file = match File::open(path) {
782 Ok(file) => file,
783 Err(e) => {
784 warn!("Failed to open sdist archive {:?}: {}", path, e);
785 return default_package_data(path);
786 }
787 };
788 let decoder = BzDecoder::new(file);
789 extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
790 }
791 PythonSdistArchiveFormat::TarXz => {
792 let file = match File::open(path) {
793 Ok(file) => file,
794 Err(e) => {
795 warn!("Failed to open sdist archive {:?}: {}", path, e);
796 return default_package_data(path);
797 }
798 };
799 let decoder = XzDecoder::new(file);
800 extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
801 }
802 PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
803 };
804
805 if package_data.package_type.is_some() {
806 let (size, sha256) = calculate_file_checksums(path);
807 package_data.size = size;
808 package_data.sha256 = sha256;
809 }
810
811 package_data
812}
813
814fn extract_from_tar_sdist_archive<R: Read>(
815 path: &Path,
816 reader: R,
817 archive_type: &str,
818 compressed_size: u64,
819) -> PackageData {
820 let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
821 else {
822 return default_package_data(path);
823 };
824
825 build_sdist_package_data(path, entries)
826}
827
828fn collect_tar_sdist_entries<R: Read>(
829 path: &Path,
830 reader: R,
831 archive_type: &str,
832 compressed_size: u64,
833) -> Option<Vec<(String, String)>> {
834 let mut archive = Archive::new(reader);
835 let archive_entries = match archive.entries() {
836 Ok(entries) => entries,
837 Err(e) => {
838 warn!(
839 "Failed to read {} sdist archive {:?}: {}",
840 archive_type, path, e
841 );
842 return None;
843 }
844 };
845
846 let mut total_extracted = 0u64;
847 let mut entries = Vec::new();
848
849 for entry_result in archive_entries {
850 let mut entry = match entry_result {
851 Ok(entry) => entry,
852 Err(e) => {
853 warn!(
854 "Failed to read {} sdist entry from {:?}: {}",
855 archive_type, path, e
856 );
857 continue;
858 }
859 };
860
861 let entry_size = entry.size();
862 if entry_size > MAX_FILE_SIZE {
863 warn!(
864 "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
865 archive_type, path, entry_size, MAX_FILE_SIZE
866 );
867 continue;
868 }
869
870 total_extracted += entry_size;
871 if total_extracted > MAX_ARCHIVE_SIZE {
872 warn!(
873 "Total extracted size exceeds limit for {} sdist {:?}",
874 archive_type, path
875 );
876 return None;
877 }
878
879 if compressed_size > 0 {
880 let ratio = total_extracted as f64 / compressed_size as f64;
881 if ratio > MAX_COMPRESSION_RATIO {
882 warn!(
883 "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
884 archive_type, path, ratio
885 );
886 return None;
887 }
888 }
889
890 let entry_path = match entry.path() {
891 Ok(path) => path.to_string_lossy().replace('\\', "/"),
892 Err(e) => {
893 warn!(
894 "Failed to get {} sdist entry path from {:?}: {}",
895 archive_type, path, e
896 );
897 continue;
898 }
899 };
900
901 let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
902 warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
903 continue;
904 };
905
906 if !is_relevant_sdist_text_entry(&entry_path) {
907 continue;
908 }
909
910 if let Ok(content) = read_limited_utf8(
911 &mut entry,
912 MAX_FILE_SIZE,
913 &format!("{} entry {}", archive_type, entry_path),
914 ) {
915 entries.push((entry_path, content));
916 }
917 }
918
919 Some(entries)
920}
921
922fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
923 let file = match File::open(path) {
924 Ok(file) => file,
925 Err(e) => {
926 warn!("Failed to open zip sdist archive {:?}: {}", path, e);
927 return default_package_data(path);
928 }
929 };
930
931 let mut archive = match ZipArchive::new(file) {
932 Ok(archive) => archive,
933 Err(e) => {
934 warn!("Failed to read zip sdist archive {:?}: {}", path, e);
935 return default_package_data(path);
936 }
937 };
938
939 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
940 Ok(entries) => entries,
941 Err(_) => return default_package_data(path),
942 };
943
944 let mut entries = Vec::new();
945 for entry in validated_entries.iter() {
946 if !is_relevant_sdist_text_entry(&entry.name) {
947 continue;
948 }
949
950 if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
951 entries.push((entry.name.clone(), content));
952 }
953 }
954
955 build_sdist_package_data(path, entries)
956}
957
/// Returns `true` for archive entry paths ending in `/PKG-INFO`, `/requires.txt` or
/// `/SOURCES.txt`. The leading slash means a bare file name without a directory never
/// matches.
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
963
964fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
965 let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
966 warn!("No PKG-INFO file found in sdist archive {:?}", path);
967 return default_package_data(path);
968 };
969
970 let mut package_data =
971 python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
972 merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
973 merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
974 apply_sdist_name_version_fallback(path, &mut package_data);
975 package_data.datasource_id = Some(DatasourceId::PypiSdist);
976 package_data
977}
978
979fn select_sdist_pkginfo_entry(
980 archive_path: &Path,
981 entries: &[(String, String)],
982) -> Option<(String, String)> {
983 let expected_name = sdist_archive_expected_name(archive_path);
984
985 entries
986 .iter()
987 .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
988 .min_by_key(|(entry_path, content)| {
989 let components: Vec<_> = entry_path
990 .split('/')
991 .filter(|part| !part.is_empty())
992 .collect();
993 let candidate_name = sdist_pkginfo_candidate_name(content);
994 let name_rank = if candidate_name == expected_name {
995 0
996 } else {
997 1
998 };
999 let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1000
1001 (name_rank, kind_rank, components.len(), entry_path.clone())
1002 })
1003 .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1004}
1005
1006fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1007 let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1008 return false;
1009 };
1010
1011 entries.iter().any(|(entry_path, content)| {
1012 sdist_pkginfo_kind_rank(entry_path) < 3
1013 && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1014 })
1015}
1016
1017fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1018 archive_path
1019 .file_name()
1020 .and_then(|name| name.to_str())
1021 .and_then(strip_python_archive_extension)
1022 .and_then(|stem| {
1023 stem.rsplit_once('-')
1024 .map(|(name, _)| normalize_python_package_name(name))
1025 })
1026}
1027
1028fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1029 let metadata = super::rfc822::parse_rfc822_content(content);
1030 super::rfc822::get_header_first(&metadata.headers, "name")
1031 .map(|name| normalize_python_package_name(&name))
1032}
1033
/// Ranks a `PKG-INFO` entry path by location; lower is preferred.
///
/// * `0` — `<root>/<something>.egg-info/PKG-INFO` (exactly three components)
/// * `1` — `<root>/PKG-INFO` (exactly two components)
/// * `2` — any other path ending in `.egg-info/PKG-INFO`
/// * `3` — everything else
///
/// Empty path components (from doubled or leading slashes) are not counted.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let segments: Vec<&str> = entry_path
        .split('/')
        .filter(|part| !part.is_empty())
        .collect();

    match segments.as_slice() {
        [_, egg_info_dir, "PKG-INFO"] if egg_info_dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1051
1052fn merge_sdist_archive_dependencies(
1053 entries: &[(String, String)],
1054 metadata_path: &str,
1055 package_data: &mut PackageData,
1056) {
1057 let metadata_dir = metadata_path
1058 .rsplit_once('/')
1059 .map(|(dir, _)| dir)
1060 .unwrap_or("");
1061 let archive_root = metadata_path.split('/').next().unwrap_or("");
1062 let matched_egg_info_dir =
1063 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1064 let mut extra_dependencies = Vec::new();
1065
1066 for (entry_path, content) in entries {
1067 let is_direct_requires =
1068 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1069 let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1070 entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1071 });
1072
1073 if is_direct_requires || is_egg_info_requires {
1074 extra_dependencies.extend(parse_requires_txt(content));
1075 }
1076 }
1077
1078 for dependency in extra_dependencies {
1079 if !package_data.dependencies.iter().any(|existing| {
1080 existing.purl == dependency.purl
1081 && existing.scope == dependency.scope
1082 && existing.extracted_requirement == dependency.extracted_requirement
1083 && existing.extra_data == dependency.extra_data
1084 }) {
1085 package_data.dependencies.push(dependency);
1086 }
1087 }
1088}
1089
1090fn merge_sdist_archive_file_references(
1091 entries: &[(String, String)],
1092 metadata_path: &str,
1093 package_data: &mut PackageData,
1094) {
1095 let metadata_dir = metadata_path
1096 .rsplit_once('/')
1097 .map(|(dir, _)| dir)
1098 .unwrap_or("");
1099 let archive_root = metadata_path.split('/').next().unwrap_or("");
1100 let matched_egg_info_dir =
1101 select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1102 let mut extra_refs = Vec::new();
1103
1104 for (entry_path, content) in entries {
1105 let is_direct_sources =
1106 !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1107 let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1108 entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1109 });
1110
1111 if is_direct_sources || is_egg_info_sources {
1112 extra_refs.extend(parse_sources_txt(content));
1113 }
1114 }
1115
1116 for file_ref in extra_refs {
1117 if !package_data
1118 .file_references
1119 .iter()
1120 .any(|existing| existing.path == file_ref.path)
1121 {
1122 package_data.file_references.push(file_ref);
1123 }
1124 }
1125}
1126
1127fn select_matching_sdist_egg_info_dir(
1128 entries: &[(String, String)],
1129 archive_root: &str,
1130 package_name: Option<&str>,
1131) -> Option<String> {
1132 let normalized_package_name = package_name.map(normalize_python_package_name);
1133
1134 entries
1135 .iter()
1136 .filter_map(|(entry_path, _)| {
1137 let components: Vec<_> = entry_path
1138 .split('/')
1139 .filter(|part| !part.is_empty())
1140 .collect();
1141 if components.len() == 3
1142 && components[0] == archive_root
1143 && components[1].ends_with(".egg-info")
1144 {
1145 Some(components[1].to_string())
1146 } else {
1147 None
1148 }
1149 })
1150 .min_by_key(|egg_info_dir| {
1151 let normalized_dir_name =
1152 normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1153 let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1154 0
1155 } else {
1156 1
1157 };
1158
1159 (name_rank, egg_info_dir.clone())
1160 })
1161}
1162
/// Normalizes a Python package name for comparison: ASCII letters are
/// lowercased and every underscore becomes a hyphen.
fn normalize_python_package_name(name: &str) -> String {
    name.chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
1166
1167fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1168 let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1169 return;
1170 };
1171
1172 let Some(stem) = strip_python_archive_extension(file_name) else {
1173 return;
1174 };
1175
1176 let Some((name, version)) = stem.rsplit_once('-') else {
1177 return;
1178 };
1179
1180 if package_data.name.is_none() {
1181 package_data.name = Some(name.replace('_', "-"));
1182 }
1183 if package_data.version.is_none() {
1184 package_data.version = Some(version.to_string());
1185 }
1186
1187 if package_data.purl.is_none()
1188 || package_data.repository_homepage_url.is_none()
1189 || package_data.repository_download_url.is_none()
1190 || package_data.api_data_url.is_none()
1191 {
1192 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1193 build_pypi_urls(
1194 package_data.name.as_deref(),
1195 package_data.version.as_deref(),
1196 );
1197
1198 if package_data.repository_homepage_url.is_none() {
1199 package_data.repository_homepage_url = repository_homepage_url;
1200 }
1201 if package_data.repository_download_url.is_none() {
1202 package_data.repository_download_url = repository_download_url;
1203 }
1204 if package_data.api_data_url.is_none() {
1205 package_data.api_data_url = api_data_url;
1206 }
1207 if package_data.purl.is_none() {
1208 package_data.purl = purl;
1209 }
1210 }
1211}
1212
1213fn extract_from_wheel_archive(path: &Path) -> PackageData {
1214 let metadata = match std::fs::metadata(path) {
1215 Ok(m) => m,
1216 Err(e) => {
1217 warn!(
1218 "Failed to read metadata for wheel archive {:?}: {}",
1219 path, e
1220 );
1221 return default_package_data(path);
1222 }
1223 };
1224
1225 if metadata.len() > MAX_ARCHIVE_SIZE {
1226 warn!(
1227 "Wheel archive too large: {} bytes (limit: {} bytes)",
1228 metadata.len(),
1229 MAX_ARCHIVE_SIZE
1230 );
1231 return default_package_data(path);
1232 }
1233
1234 let file = match File::open(path) {
1235 Ok(f) => f,
1236 Err(e) => {
1237 warn!("Failed to open wheel archive {:?}: {}", path, e);
1238 return default_package_data(path);
1239 }
1240 };
1241
1242 let mut archive = match ZipArchive::new(file) {
1243 Ok(a) => a,
1244 Err(e) => {
1245 warn!("Failed to read wheel archive {:?}: {}", path, e);
1246 return default_package_data(path);
1247 }
1248 };
1249
1250 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1251 Ok(entries) => entries,
1252 Err(_) => return default_package_data(path),
1253 };
1254
1255 let metadata_entry =
1256 match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1257 Some(entry) => entry,
1258 None => {
1259 warn!("No METADATA file found in wheel archive {:?}", path);
1260 return default_package_data(path);
1261 }
1262 };
1263
1264 let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1265 Ok(c) => c,
1266 Err(e) => {
1267 warn!("Failed to read METADATA from {:?}: {}", path, e);
1268 return default_package_data(path);
1269 }
1270 };
1271
1272 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1273
1274 let (size, sha256) = calculate_file_checksums(path);
1275 package_data.size = size;
1276 package_data.sha256 = sha256;
1277
1278 if let Some(record_entry) =
1279 find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1280 && let Ok(record_content) =
1281 read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1282 {
1283 package_data.file_references = parse_record_csv(&record_content);
1284 }
1285
1286 if let Some(wheel_info) = parse_wheel_filename(path) {
1287 if package_data.name.is_none() {
1288 package_data.name = Some(wheel_info.name.clone());
1289 }
1290 if package_data.version.is_none() {
1291 package_data.version = Some(wheel_info.version.clone());
1292 }
1293
1294 package_data.qualifiers = Some(std::collections::HashMap::from([(
1295 "extension".to_string(),
1296 format!(
1297 "{}-{}-{}",
1298 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1299 ),
1300 )]));
1301
1302 package_data.purl = build_wheel_purl(
1303 package_data.name.as_deref(),
1304 package_data.version.as_deref(),
1305 &wheel_info,
1306 );
1307
1308 let mut extra_data = package_data.extra_data.unwrap_or_default();
1309 extra_data.insert(
1310 "python_requires".to_string(),
1311 serde_json::Value::String(wheel_info.python_tag.clone()),
1312 );
1313 extra_data.insert(
1314 "abi_tag".to_string(),
1315 serde_json::Value::String(wheel_info.abi_tag.clone()),
1316 );
1317 extra_data.insert(
1318 "platform_tag".to_string(),
1319 serde_json::Value::String(wheel_info.platform_tag.clone()),
1320 );
1321 package_data.extra_data = Some(extra_data);
1322 }
1323
1324 package_data
1325}
1326
1327fn extract_from_egg_archive(path: &Path) -> PackageData {
1328 let metadata = match std::fs::metadata(path) {
1329 Ok(m) => m,
1330 Err(e) => {
1331 warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1332 return default_package_data(path);
1333 }
1334 };
1335
1336 if metadata.len() > MAX_ARCHIVE_SIZE {
1337 warn!(
1338 "Egg archive too large: {} bytes (limit: {} bytes)",
1339 metadata.len(),
1340 MAX_ARCHIVE_SIZE
1341 );
1342 return default_package_data(path);
1343 }
1344
1345 let file = match File::open(path) {
1346 Ok(f) => f,
1347 Err(e) => {
1348 warn!("Failed to open egg archive {:?}: {}", path, e);
1349 return default_package_data(path);
1350 }
1351 };
1352
1353 let mut archive = match ZipArchive::new(file) {
1354 Ok(a) => a,
1355 Err(e) => {
1356 warn!("Failed to read egg archive {:?}: {}", path, e);
1357 return default_package_data(path);
1358 }
1359 };
1360
1361 let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1362 Ok(entries) => entries,
1363 Err(_) => return default_package_data(path),
1364 };
1365
1366 let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1367 &validated_entries,
1368 &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1369 ) {
1370 Some(entry) => entry,
1371 None => {
1372 warn!("No PKG-INFO file found in egg archive {:?}", path);
1373 return default_package_data(path);
1374 }
1375 };
1376
1377 let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1378 Ok(c) => c,
1379 Err(e) => {
1380 warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1381 return default_package_data(path);
1382 }
1383 };
1384
1385 let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1386
1387 let (size, sha256) = calculate_file_checksums(path);
1388 package_data.size = size;
1389 package_data.sha256 = sha256;
1390
1391 if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1392 &validated_entries,
1393 &[
1394 "EGG-INFO/installed-files.txt",
1395 ".egg-info/installed-files.txt",
1396 ],
1397 ) && let Ok(installed_files_content) =
1398 read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1399 {
1400 package_data.file_references = parse_installed_files_txt(&installed_files_content);
1401 }
1402
1403 if let Some(egg_info) = parse_egg_filename(path) {
1404 if package_data.name.is_none() {
1405 package_data.name = Some(egg_info.name.clone());
1406 }
1407 if package_data.version.is_none() {
1408 package_data.version = Some(egg_info.version.clone());
1409 }
1410
1411 if let Some(python_version) = &egg_info.python_version {
1412 let mut extra_data = package_data.extra_data.unwrap_or_default();
1413 extra_data.insert(
1414 "python_version".to_string(),
1415 serde_json::Value::String(python_version.clone()),
1416 );
1417 package_data.extra_data = Some(extra_data);
1418 }
1419 }
1420
1421 package_data.purl = build_egg_purl(
1422 package_data.name.as_deref(),
1423 package_data.version.as_deref(),
1424 );
1425
1426 package_data
1427}
1428
1429fn find_validated_zip_entry_by_suffix<'a>(
1430 entries: &'a [ValidatedZipEntry],
1431 suffix: &str,
1432) -> Option<&'a ValidatedZipEntry> {
1433 entries.iter().find(|entry| entry.name.ends_with(suffix))
1434}
1435
1436fn find_validated_zip_entry_by_any_suffix<'a>(
1437 entries: &'a [ValidatedZipEntry],
1438 suffixes: &[&str],
1439) -> Option<&'a ValidatedZipEntry> {
1440 entries
1441 .iter()
1442 .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1443}
1444
1445fn read_validated_zip_entry<R: Read + std::io::Seek>(
1446 archive: &mut ZipArchive<R>,
1447 entry: &ValidatedZipEntry,
1448 path: &Path,
1449 archive_type: &str,
1450) -> Result<String, String> {
1451 let mut file = archive
1452 .by_index(entry.index)
1453 .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1454
1455 let compressed_size = file.compressed_size();
1456 let uncompressed_size = file.size();
1457
1458 if compressed_size > 0 {
1459 let ratio = uncompressed_size as f64 / compressed_size as f64;
1460 if ratio > MAX_COMPRESSION_RATIO {
1461 return Err(format!(
1462 "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1463 archive_type, path, ratio
1464 ));
1465 }
1466 }
1467
1468 if uncompressed_size > MAX_FILE_SIZE {
1469 return Err(format!(
1470 "Rejected oversized entry in {} {:?}: {} bytes",
1471 archive_type, path, uncompressed_size
1472 ));
1473 }
1474
1475 read_limited_utf8(
1476 &mut file,
1477 MAX_FILE_SIZE,
1478 &format!("{} entry {}", archive_type, entry.name),
1479 )
1480}
1481
/// Reads at most `max_bytes` bytes from `reader` and decodes them as UTF-8.
///
/// `context` is a human-readable label that prefixes every error message.
///
/// # Errors
/// Returns a message when the underlying read fails, when the stream holds
/// more than `max_bytes` bytes, or when the bytes are not valid UTF-8.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    // Read one byte past the limit so an oversized stream can be detected.
    // Saturating keeps `max_bytes == u64::MAX` from overflowing.
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    let bytes_read = u64::try_from(bytes.len()).unwrap_or(u64::MAX);
    if bytes_read > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1502
/// Normalizes an archive entry path to a relative, `/`-separated form.
///
/// Backslashes are treated as separators and `.` segments are dropped.
/// Returns `None` for anything that could escape the extraction root: a
/// drive-letter path such as `C:/...`, an absolute path, a path with a `..`
/// segment, or a path that has no normal segments at all.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Reject `<letter>:/` prefixes explicitly; on non-Windows hosts `Path`
    // would otherwise see `C:` as an ordinary segment.
    if let [drive, b':', b'/', ..] = unified.as_bytes() {
        if drive.is_ascii_alphabetic() {
            return None;
        }
    }

    let mut segments: Vec<String> = Vec::new();
    for component in Path::new(&unified).components() {
        match component {
            Component::Normal(segment) => segments.push(segment.to_string_lossy().into_owned()),
            Component::CurDir => {}
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1524
1525pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1530 let mut reader = ReaderBuilder::new()
1531 .has_headers(false)
1532 .from_reader(content.as_bytes());
1533
1534 let mut file_references = Vec::new();
1535
1536 for result in reader.records() {
1537 match result {
1538 Ok(record) => {
1539 if record.len() < 3 {
1540 continue;
1541 }
1542
1543 let path = record.get(0).unwrap_or("").trim().to_string();
1544 if path.is_empty() {
1545 continue;
1546 }
1547
1548 let hash_field = record.get(1).unwrap_or("").trim();
1549 let size_field = record.get(2).unwrap_or("").trim();
1550
1551 let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1553 let parts: Vec<&str> = hash_field.split('=').collect();
1554 if parts.len() == 2 && parts[0] == "sha256" {
1555 match URL_SAFE_NO_PAD.decode(parts[1]) {
1557 Ok(decoded) => {
1558 let hex = decoded
1559 .iter()
1560 .map(|b| format!("{:02x}", b))
1561 .collect::<String>();
1562 Some(hex)
1563 }
1564 Err(_) => None,
1565 }
1566 } else {
1567 None
1568 }
1569 } else {
1570 None
1571 };
1572
1573 let size = if !size_field.is_empty() && size_field != "-" {
1575 size_field.parse::<u64>().ok()
1576 } else {
1577 None
1578 };
1579
1580 file_references.push(FileReference {
1581 path,
1582 size,
1583 sha1: None,
1584 md5: None,
1585 sha256,
1586 sha512: None,
1587 extra_data: None,
1588 });
1589 }
1590 Err(e) => {
1591 warn!("Failed to parse RECORD CSV row: {}", e);
1592 continue;
1593 }
1594 }
1595 }
1596
1597 file_references
1598}
1599
1600pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1603 content
1604 .lines()
1605 .map(|line| line.trim())
1606 .filter(|line| !line.is_empty())
1607 .map(|path| FileReference {
1608 path: path.to_string(),
1609 size: None,
1610 sha1: None,
1611 md5: None,
1612 sha256: None,
1613 sha512: None,
1614 extra_data: None,
1615 })
1616 .collect()
1617}
1618
1619pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1620 content
1621 .lines()
1622 .map(str::trim)
1623 .filter(|line| !line.is_empty())
1624 .map(|path| FileReference {
1625 path: path.to_string(),
1626 size: None,
1627 sha1: None,
1628 md5: None,
1629 sha256: None,
1630 sha512: None,
1631 extra_data: None,
1632 })
1633 .collect()
1634}
1635
/// Components parsed from a wheel file name.
struct WheelInfo {
    /// Distribution name with underscores replaced by hyphens.
    name: String,
    /// Version component, taken verbatim from the file name.
    version: String,
    /// Python tag, e.g. `py3` or `cp311`.
    python_tag: String,
    /// ABI tag, e.g. `none` or `cp311`.
    abi_tag: String,
    /// Platform tag, e.g. `any`; extra trailing components are joined with `-`.
    platform_tag: String,
}

/// Parses a wheel file name of the form
/// `{name}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl`.
///
/// Returns `None` when the path has no file stem or the stem has fewer than
/// five `-`-separated components. An optional build tag is recognised (and
/// skipped) when at least six components are present and the third one starts
/// with an ASCII digit, as the wheel naming convention requires of build tags.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    // Python tags never start with a digit, so a leading digit in the third
    // component identifies a build tag rather than the python tag.
    let has_build_tag =
        parts.len() >= 6 && parts[2].starts_with(|c: char| c.is_ascii_digit());
    let tags = if has_build_tag {
        &parts[3..]
    } else {
        &parts[2..]
    };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: tags[0].to_string(),
        abi_tag: tags[1].to_string(),
        platform_tag: tags[2..].join("-"),
    })
}
1660
/// Components parsed from an egg file name.
struct EggInfo {
    /// Distribution name with underscores replaced by hyphens.
    name: String,
    /// Version component, taken verbatim from the file name.
    version: String,
    /// Third `-`-separated component of the stem, when present.
    python_version: Option<String>,
}

/// Parses an egg file name whose stem is `-`-separated as
/// `name-version[-python_version[-...]]`.
///
/// Returns `None` when the path has no file stem or the stem has no `-`.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut fields = stem.split('-');

    let raw_name = fields.next()?;
    let raw_version = fields.next()?;
    let python_version = fields.next().map(str::to_string);

    Some(EggInfo {
        name: raw_name.replace('_', "-"),
        version: raw_version.to_string(),
        python_version,
    })
}
1681
1682fn build_wheel_purl(
1683 name: Option<&str>,
1684 version: Option<&str>,
1685 wheel_info: &WheelInfo,
1686) -> Option<String> {
1687 let name = name?;
1688 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1689
1690 if let Some(ver) = version {
1691 package_url.with_version(ver).ok()?;
1692 }
1693
1694 let extension = format!(
1695 "{}-{}-{}",
1696 wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1697 );
1698 package_url.add_qualifier("extension", extension).ok()?;
1699
1700 Some(package_url.to_string())
1701}
1702
1703fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1704 let name = name?;
1705 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1706
1707 if let Some(ver) = version {
1708 package_url.with_version(ver).ok()?;
1709 }
1710
1711 package_url.add_qualifier("type", "egg").ok()?;
1712
1713 Some(package_url.to_string())
1714}
1715
1716fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1717 let metadata = super::rfc822::parse_rfc822_content(content);
1718 build_package_data_from_rfc822(&metadata, datasource_id)
1719}
1720
/// Builds a `PackageData` record from already-parsed RFC 822-style metadata.
///
/// Reads the headers of `metadata` (name, version, summary, author, license,
/// classifiers, project URLs, ...) plus its body, and tags the result with
/// `datasource_id`. Dependencies come from `extract_rfc822_dependencies`; the
/// PyPI URLs and purl come from `build_pypi_urls`.
fn build_package_data_from_rfc822(
    metadata: &super::rfc822::Rfc822Metadata,
    datasource_id: DatasourceId,
) -> PackageData {
    use super::rfc822::{get_header_all, get_header_first};

    let name = get_header_first(&metadata.headers, "name");
    let version = get_header_first(&metadata.headers, "version");
    let summary = get_header_first(&metadata.headers, "summary");
    // Mutable: may be filled from a Project-URL entry further down.
    let mut homepage_url = get_header_first(&metadata.headers, "home-page");
    let author = get_header_first(&metadata.headers, "author");
    let author_email = get_header_first(&metadata.headers, "author-email");
    let license = get_header_first(&metadata.headers, "license");
    let license_expression = get_header_first(&metadata.headers, "license-expression");
    let download_url = get_header_first(&metadata.headers, "download-url");
    let platform = get_header_first(&metadata.headers, "platform");
    let requires_python = get_header_first(&metadata.headers, "requires-python");
    let classifiers = get_header_all(&metadata.headers, "classifier");
    let license_files = get_header_all(&metadata.headers, "license-file");

    // Prefer the message body; fall back to a `description` header when the body is empty.
    let description_body = if metadata.body.is_empty() {
        get_header_first(&metadata.headers, "description").unwrap_or_default()
    } else {
        metadata.body.clone()
    };

    let description = build_description(summary.as_deref(), &description_body);

    // A single author party is recorded when either the name or the email is present.
    let mut parties = Vec::new();
    if author.is_some() || author_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: author,
            email: author_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    let (keywords, license_classifiers) = split_classifiers(&classifiers);
    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
    // First try `normalize_spdx_expression` on the `license-expression` header;
    // when that yields nothing, use the result of `normalize_spdx_declared_license`.
    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
        license_expression
            .as_deref()
            .and_then(normalize_spdx_expression)
            .map(|normalized| {
                build_declared_license_data(
                    normalized,
                    DeclaredLicenseMatchMetadata::single_line(
                        license_expression.as_deref().unwrap_or_default(),
                    )
                    .with_referenced_filenames(&referenced_license_files),
                )
            })
            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));

    // The raw `license-expression` header wins; otherwise the statement is
    // assembled from the `license` header and the license classifiers.
    let extracted_license_statement = license_expression
        .clone()
        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));

    let mut extra_data = HashMap::new();
    // Skip empty platforms and the placeholder value "unknown" (any letter case).
    if let Some(platform_value) = platform
        && !platform_value.eq_ignore_ascii_case("unknown")
        && !platform_value.is_empty()
    {
        extra_data.insert(
            "platform".to_string(),
            serde_json::Value::String(platform_value),
        );
    }

    if let Some(requires_python_value) = requires_python
        && !requires_python_value.is_empty()
    {
        extra_data.insert(
            "requires_python".to_string(),
            serde_json::Value::String(requires_python_value),
        );
    }

    if !license_files.is_empty() {
        extra_data.insert(
            "license_files".to_string(),
            serde_json::Value::Array(
                license_files
                    .iter()
                    .cloned()
                    .map(serde_json::Value::String)
                    .collect(),
            ),
        );
    }

    // Each declared license file is also exposed as a path-only file reference.
    let file_references = license_files
        .iter()
        .map(|path| FileReference {
            path: path.clone(),
            size: None,
            sha1: None,
            md5: None,
            sha256: None,
            sha512: None,
            extra_data: None,
        })
        .collect();

    let project_urls = get_header_all(&metadata.headers, "project-url");
    let dependencies = extract_rfc822_dependencies(&metadata.headers);
    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);

    if !project_urls.is_empty() {
        let parsed_urls = parse_project_urls(&project_urls);

        // Labels are matched case-insensitively. For each URL slot the first
        // matching label wins; a later entry whose slot is already taken falls
        // through to the remaining branches.
        for (label, url) in &parsed_urls {
            let label_lower = label.to_lowercase();

            if bug_tracking_url.is_none()
                && matches!(
                    label_lower.as_str(),
                    "tracker"
                        | "bug reports"
                        | "bug tracker"
                        | "issues"
                        | "issue tracker"
                        | "github: issues"
                )
            {
                bug_tracking_url = Some(url.clone());
            } else if code_view_url.is_none()
                && matches!(label_lower.as_str(), "source" | "source code" | "code")
            {
                code_view_url = Some(url.clone());
            } else if vcs_url.is_none()
                && matches!(
                    label_lower.as_str(),
                    "github" | "gitlab" | "github: repo" | "repository"
                )
            {
                vcs_url = Some(url.clone());
            } else if homepage_url.is_none()
                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
            {
                // Only used when no `home-page` header supplied a homepage.
                homepage_url = Some(url.clone());
            } else if label_lower == "changelog" {
                extra_data.insert(
                    "changelog_url".to_string(),
                    serde_json::Value::String(url.clone()),
                );
            }
        }

        // Keep every parsed label/URL pair, with its original label, in extra data.
        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
            .iter()
            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
            .collect();

        if !project_urls_json.is_empty() {
            extra_data.insert(
                "project_urls".to_string(),
                serde_json::Value::Object(project_urls_json),
            );
        }
    }

    // An empty map is reported as `None` rather than `Some` of an empty map.
    let extra_data = if extra_data.is_empty() {
        None
    } else {
        Some(extra_data)
    };

    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
        build_pypi_urls(name.as_deref(), version.as_deref());

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: Some("Python".to_string()),
        description,
        release_date: None,
        parties,
        keywords,
        homepage_url,
        download_url,
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url,
        code_view_url,
        vcs_url,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references,
        is_private: false,
        is_virtual: false,
        extra_data,
        dependencies,
        repository_homepage_url,
        repository_download_url,
        api_data_url,
        datasource_id: Some(datasource_id),
        purl,
    }
}
1946
/// Parses `Project-URL` header values of the form `label, url` into
/// `(label, url)` pairs.
///
/// The label and URL are trimmed. An entry is dropped when it has no comma or
/// when either side is empty after trimming. The split happens at the first
/// `", "`; entries that lack that exact sequence are split at their first
/// comma instead, so `label,url` (no space) is accepted too.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|url_entry| {
            // Prefer the conventional ", " separator so entries that parsed
            // before keep parsing identically; fall back to a bare comma.
            let (label, url) = url_entry
                .split_once(", ")
                .or_else(|| url_entry.split_once(','))?;
            let (label, url) = (label.trim(), url.trim());

            (!label.is_empty() && !url.is_empty()).then(|| (label.to_string(), url.to_string()))
        })
        .collect()
}
1962
/// Combines an optional summary and a body into one description.
///
/// Both parts are trimmed and blank parts are ignored. When both remain they
/// are joined with a newline, summary first. Returns `None` when neither part
/// has content.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let summary_text = summary.map(str::trim).filter(|text| !text.is_empty());
    let body_text = Some(body.trim()).filter(|text| !text.is_empty());

    match (summary_text, body_text) {
        (Some(head), Some(rest)) => Some(format!("{}\n{}", head, rest)),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
1981
/// Splits classifiers into `(keywords, license classifiers)`.
///
/// A classifier starting with `License ::` is a license classifier; every
/// other one is a keyword. Relative order is preserved within each group.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|classifier| classifier.starts_with("License ::"));

    (keywords, license_classifiers)
}
1996
/// Builds the extracted license statement from the `license` value and the
/// license classifiers.
///
/// The result is newline-terminated text: an optional `license: <value>` line
/// (value trimmed, omitted when blank) followed by an optional `classifiers:`
/// header with one quoted list item per classifier. Returns `None` when there
/// is nothing to report.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let mut statement = String::new();

    if let Some(text) = license.map(str::trim).filter(|text| !text.is_empty()) {
        statement.push_str(&format!("license: {}", text));
        statement.push('\n');
    }

    if !license_classifiers.is_empty() {
        statement.push_str("classifiers:");
        statement.push('\n');
        for classifier in license_classifiers {
            statement.push_str(&format!(" - '{}'", classifier));
            statement.push('\n');
        }
    }

    if statement.is_empty() {
        None
    } else {
        Some(statement)
    }
}
2022
2023pub(crate) fn build_pypi_urls(
2024 name: Option<&str>,
2025 version: Option<&str>,
2026) -> (
2027 Option<String>,
2028 Option<String>,
2029 Option<String>,
2030 Option<String>,
2031) {
2032 let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2033
2034 let repository_download_url = name.and_then(|value| {
2035 version.map(|ver| {
2036 format!(
2037 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2038 &value[..1.min(value.len())],
2039 value,
2040 value,
2041 ver
2042 )
2043 })
2044 });
2045
2046 let api_data_url = name.map(|value| {
2047 if let Some(ver) = version {
2048 format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2049 } else {
2050 format!("https://pypi.org/pypi/{}/json", value)
2051 }
2052 });
2053
2054 let purl = name.and_then(|value| {
2055 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2056 if let Some(ver) = version {
2057 package_url.with_version(ver).ok()?;
2058 }
2059 Some(package_url.to_string())
2060 });
2061
2062 (
2063 repository_homepage_url,
2064 repository_download_url,
2065 api_data_url,
2066 purl,
2067 )
2068}
2069
2070fn build_pypi_purl_with_extension(
2071 name: &str,
2072 version: Option<&str>,
2073 extension: &str,
2074) -> Option<String> {
2075 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2076 if let Some(ver) = version {
2077 package_url.with_version(ver).ok()?;
2078 }
2079 package_url.add_qualifier("extension", extension).ok()?;
2080 Some(package_url.to_string())
2081}
2082
2083fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2084 let toml_content = match read_toml_file(path) {
2085 Ok(content) => content,
2086 Err(e) => {
2087 warn!(
2088 "Failed to read or parse pyproject.toml at {:?}: {}",
2089 path, e
2090 );
2091 return default_package_data(path);
2092 }
2093 };
2094
2095 let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2096 let is_poetry_pyproject = tool_table
2097 .and_then(|tool| tool.get("poetry"))
2098 .and_then(|value| value.as_table())
2099 .is_some();
2100
2101 let project_table =
2103 if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2104 project.clone()
2106 } else if let Some(tool) = tool_table {
2107 if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2108 poetry.clone()
2110 } else {
2111 return default_package_data(path);
2112 }
2113 } else if toml_content.get(FIELD_NAME).is_some() {
2114 match toml_content.as_table() {
2116 Some(table) => table.clone(),
2117 None => {
2118 warn!("Failed to convert TOML content to table in {:?}", path);
2119 return default_package_data(path);
2120 }
2121 }
2122 } else {
2123 return default_package_data(path);
2124 };
2125
2126 let name = project_table
2127 .get(FIELD_NAME)
2128 .and_then(|v| v.as_str())
2129 .map(String::from);
2130
2131 let version = project_table
2132 .get(FIELD_VERSION)
2133 .and_then(|v| v.as_str())
2134 .map(String::from);
2135 let classifiers = project_table
2136 .get("classifiers")
2137 .and_then(|value| value.as_array())
2138 .map(|values| {
2139 values
2140 .iter()
2141 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2142 .collect::<Vec<_>>()
2143 })
2144 .unwrap_or_default();
2145
2146 let extracted_license_statement = extract_raw_license_string(&project_table);
2147 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2148 normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2149
2150 let (homepage_url, repository_url) = extract_urls(&project_table);
2152
2153 let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2154 let extra_data = extract_pyproject_extra_data(&toml_content);
2155
2156 let purl = name.as_ref().and_then(|n| {
2158 let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2159 Ok(p) => p,
2160 Err(e) => {
2161 warn!(
2162 "Failed to create PackageUrl for Python package '{}': {}",
2163 n, e
2164 );
2165 return None;
2166 }
2167 };
2168
2169 if let Some(v) = &version
2170 && let Err(e) = package_url.with_version(v)
2171 {
2172 warn!(
2173 "Failed to set version '{}' for Python package '{}': {}",
2174 v, n, e
2175 );
2176 return None;
2177 }
2178
2179 Some(package_url.to_string())
2180 });
2181
2182 let api_data_url = name.as_ref().map(|n| {
2183 if let Some(v) = &version {
2184 format!("https://pypi.org/pypi/{}/{}/json", n, v)
2185 } else {
2186 format!("https://pypi.org/pypi/{}/json", n)
2187 }
2188 });
2189
2190 let pypi_homepage_url = name
2191 .as_ref()
2192 .map(|n| format!("https://pypi.org/project/{}", n));
2193
2194 let pypi_download_url = name.as_ref().and_then(|n| {
2195 version.as_ref().map(|v| {
2196 format!(
2197 "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2198 &n[..1.min(n.len())],
2199 n,
2200 n,
2201 v
2202 )
2203 })
2204 });
2205
2206 PackageData {
2207 package_type: Some(PythonParser::PACKAGE_TYPE),
2208 namespace: None,
2209 name,
2210 version,
2211 qualifiers: None,
2212 subpath: None,
2213 primary_language: None,
2214 description: None,
2215 release_date: None,
2216 parties: extract_parties(&project_table),
2217 keywords: Vec::new(),
2218 homepage_url: homepage_url.or(pypi_homepage_url),
2219 download_url: repository_url.clone().or(pypi_download_url),
2220 size: None,
2221 sha1: None,
2222 md5: None,
2223 sha256: None,
2224 sha512: None,
2225 bug_tracking_url: None,
2226 code_view_url: None,
2227 vcs_url: repository_url,
2228 copyright: None,
2229 holder: None,
2230 declared_license_expression,
2231 declared_license_expression_spdx,
2232 license_detections,
2233 other_license_expression: None,
2234 other_license_expression_spdx: None,
2235 other_license_detections: Vec::new(),
2236 extracted_license_statement,
2237 notice_text: None,
2238 source_packages: Vec::new(),
2239 file_references: Vec::new(),
2240 is_private: has_private_classifier(&classifiers),
2241 is_virtual: false,
2242 extra_data,
2243 dependencies: [dependencies, optional_dependencies].concat(),
2244 repository_homepage_url: None,
2245 repository_download_url: None,
2246 api_data_url,
2247 datasource_id: Some(if is_poetry_pyproject {
2248 DatasourceId::PypiPoetryPyprojectToml
2249 } else {
2250 DatasourceId::PypiPyprojectToml
2251 }),
2252 purl,
2253 }
2254}
2255
2256fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2257 let path_str = path.to_string_lossy().replace('\\', "/");
2258 if path_str.contains("/EGG-INFO/PKG-INFO") {
2259 DatasourceId::PypiEggPkginfo
2260 } else if path_str.ends_with(".egg-info/PKG-INFO") {
2261 DatasourceId::PypiEditableEggPkginfo
2262 } else {
2263 DatasourceId::PypiSdistPkginfo
2264 }
2265}
2266
2267fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2268 project
2269 .get(FIELD_LICENSE)
2270 .and_then(|license_value| match license_value {
2271 TomlValue::String(license_str) => Some(license_str.clone()),
2272 TomlValue::Table(license_table) => license_table
2273 .get("text")
2274 .and_then(|v| v.as_str())
2275 .map(|s| s.to_string())
2276 .or_else(|| {
2277 license_table
2278 .get("expression")
2279 .and_then(|v| v.as_str())
2280 .map(|expr| expr.to_string())
2281 }),
2282 _ => None,
2283 })
2284}
2285
2286fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2287 match project.get(FIELD_LICENSE) {
2288 Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2289 Some(TomlValue::Table(license_table)) => license_table
2290 .get("expression")
2291 .and_then(|value| value.as_str()),
2292 _ => None,
2293 }
2294}
2295
2296fn extract_urls(project: &TomlMap<String, TomlValue>) -> (Option<String>, Option<String>) {
2297 let mut homepage_url = None;
2298 let mut repository_url = None;
2299
2300 if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2302 homepage_url = urls
2303 .get(FIELD_HOMEPAGE)
2304 .and_then(|v| v.as_str())
2305 .map(String::from);
2306 repository_url = urls
2307 .get(FIELD_REPOSITORY)
2308 .and_then(|v| v.as_str())
2309 .map(String::from);
2310 }
2311
2312 if homepage_url.is_none() {
2314 homepage_url = project
2315 .get(FIELD_HOMEPAGE)
2316 .and_then(|v| v.as_str())
2317 .map(String::from);
2318 }
2319
2320 if repository_url.is_none() {
2321 repository_url = project
2322 .get(FIELD_REPOSITORY)
2323 .and_then(|v| v.as_str())
2324 .map(String::from);
2325 }
2326
2327 (homepage_url, repository_url)
2328}
2329
2330fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2331 let mut parties = Vec::new();
2332
2333 if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2334 for author in authors {
2335 if let Some(author_str) = author.as_str() {
2336 let (name, email) = split_name_email(author_str);
2337 parties.push(Party {
2338 r#type: None,
2339 role: Some("author".to_string()),
2340 name,
2341 email,
2342 url: None,
2343 organization: None,
2344 organization_url: None,
2345 timezone: None,
2346 });
2347 }
2348 }
2349 }
2350
2351 if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2352 for maintainer in maintainers {
2353 if let Some(maintainer_str) = maintainer.as_str() {
2354 let (name, email) = split_name_email(maintainer_str);
2355 parties.push(Party {
2356 r#type: None,
2357 role: Some("maintainer".to_string()),
2358 name,
2359 email,
2360 url: None,
2361 organization: None,
2362 organization_url: None,
2363 timezone: None,
2364 });
2365 }
2366 }
2367 }
2368
2369 parties
2370}
2371
2372fn extract_dependencies(
2373 project: &TomlMap<String, TomlValue>,
2374 toml_content: &TomlValue,
2375) -> (Vec<Dependency>, Vec<Dependency>) {
2376 let mut dependencies = Vec::new();
2377 let mut optional_dependencies = Vec::new();
2378
2379 if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2381 match deps_value {
2382 TomlValue::Array(arr) => {
2383 dependencies = parse_dependency_array(arr, false, None);
2384 }
2385 TomlValue::Table(table) => {
2386 dependencies = parse_dependency_table(table, false, None);
2387 }
2388 _ => {}
2389 }
2390 }
2391
2392 if let Some(opt_deps_table) = project
2394 .get(FIELD_OPTIONAL_DEPENDENCIES)
2395 .and_then(|v| v.as_table())
2396 {
2397 for (extra_name, deps) in opt_deps_table {
2398 match deps {
2399 TomlValue::Array(arr) => {
2400 optional_dependencies.extend(parse_dependency_array(
2401 arr,
2402 true,
2403 Some(extra_name),
2404 ));
2405 }
2406 TomlValue::Table(table) => {
2407 optional_dependencies.extend(parse_dependency_table(
2408 table,
2409 true,
2410 Some(extra_name),
2411 ));
2412 }
2413 _ => {}
2414 }
2415 }
2416 }
2417
2418 if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2420 match dev_deps_value {
2421 TomlValue::Array(arr) => {
2422 optional_dependencies.extend(parse_dependency_array(
2423 arr,
2424 true,
2425 Some(FIELD_DEV_DEPENDENCIES),
2426 ));
2427 }
2428 TomlValue::Table(table) => {
2429 optional_dependencies.extend(parse_dependency_table(
2430 table,
2431 true,
2432 Some(FIELD_DEV_DEPENDENCIES),
2433 ));
2434 }
2435 _ => {}
2436 }
2437 }
2438
2439 if let Some(groups_table) = project.get("group").and_then(|v| v.as_table()) {
2441 for (group_name, group_data) in groups_table {
2442 if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2443 match group_deps {
2444 TomlValue::Array(arr) => {
2445 optional_dependencies.extend(parse_dependency_array(
2446 arr,
2447 true,
2448 Some(group_name),
2449 ));
2450 }
2451 TomlValue::Table(table) => {
2452 optional_dependencies.extend(parse_dependency_table(
2453 table,
2454 true,
2455 Some(group_name),
2456 ));
2457 }
2458 _ => {}
2459 }
2460 }
2461 }
2462 }
2463
2464 if let Some(groups_table) = toml_content
2465 .get(FIELD_DEPENDENCY_GROUPS)
2466 .and_then(|value| value.as_table())
2467 {
2468 for (group_name, deps) in groups_table {
2469 match deps {
2470 TomlValue::Array(arr) => {
2471 optional_dependencies.extend(parse_dependency_array(
2472 arr,
2473 true,
2474 Some(group_name),
2475 ));
2476 }
2477 TomlValue::Table(table) => {
2478 optional_dependencies.extend(parse_dependency_table(
2479 table,
2480 true,
2481 Some(group_name),
2482 ));
2483 }
2484 _ => {}
2485 }
2486 }
2487 }
2488
2489 if let Some(dev_deps_value) = toml_content
2490 .get("tool")
2491 .and_then(|value| value.as_table())
2492 .and_then(|tool| tool.get("uv"))
2493 .and_then(|value| value.as_table())
2494 .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2495 {
2496 match dev_deps_value {
2497 TomlValue::Array(arr) => {
2498 optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2499 }
2500 TomlValue::Table(table) => {
2501 optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2502 }
2503 _ => {}
2504 }
2505 }
2506
2507 (dependencies, optional_dependencies)
2508}
2509
2510fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2511 let mut extra_data = HashMap::new();
2512
2513 if let Some(tool_uv) = toml_content
2514 .get("tool")
2515 .and_then(|value| value.as_table())
2516 .and_then(|tool| tool.get("uv"))
2517 {
2518 extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2519 }
2520
2521 if extra_data.is_empty() {
2522 None
2523 } else {
2524 Some(extra_data)
2525 }
2526}
2527
2528fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2529 match value {
2530 TomlValue::String(value) => JsonValue::String(value.clone()),
2531 TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2532 TomlValue::Float(value) => JsonValue::String(value.to_string()),
2533 TomlValue::Boolean(value) => JsonValue::Bool(*value),
2534 TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2535 TomlValue::Array(values) => {
2536 JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2537 }
2538 TomlValue::Table(values) => JsonValue::Object(
2539 values
2540 .iter()
2541 .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2542 .collect::<JsonMap<String, JsonValue>>(),
2543 ),
2544 }
2545}
2546
2547fn parse_dependency_table(
2548 table: &TomlMap<String, TomlValue>,
2549 is_optional: bool,
2550 scope: Option<&str>,
2551) -> Vec<Dependency> {
2552 table
2553 .iter()
2554 .filter_map(|(name, version)| {
2555 let version_str = version.as_str().map(|s| s.to_string());
2556 let mut package_url =
2557 PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2558
2559 if let Some(v) = &version_str {
2560 package_url.with_version(v).ok()?;
2561 }
2562
2563 Some(Dependency {
2564 purl: Some(package_url.to_string()),
2565 extracted_requirement: None,
2566 scope: scope.map(|s| s.to_string()),
2567 is_runtime: Some(!is_optional),
2568 is_optional: Some(is_optional),
2569 is_pinned: None,
2570 is_direct: Some(true),
2571 resolved_package: None,
2572 extra_data: None,
2573 })
2574 })
2575 .collect()
2576}
2577
2578fn parse_dependency_array(
2579 array: &[TomlValue],
2580 is_optional: bool,
2581 scope: Option<&str>,
2582) -> Vec<Dependency> {
2583 array
2584 .iter()
2585 .filter_map(|dep| {
2586 let dep_str = dep.as_str()?;
2587
2588 let mut parts = dep_str.split(['>', '=', '<', '~']);
2589 let name = parts.next()?.trim().to_string();
2590
2591 let version = parts.next().map(|v| v.trim().to_string());
2592
2593 let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name)
2594 {
2595 Ok(purl) => purl,
2596 Err(_) => return None,
2597 };
2598
2599 if let Some(ref v) = version {
2600 package_url.with_version(v).ok()?;
2601 }
2602
2603 Some(Dependency {
2604 purl: Some(package_url.to_string()),
2605 extracted_requirement: None,
2606 scope: scope.map(|s| s.to_string()),
2607 is_runtime: Some(!is_optional),
2608 is_optional: Some(is_optional),
2609 is_pinned: None,
2610 is_direct: Some(true),
2611 resolved_package: None,
2612 extra_data: None,
2613 })
2614 })
2615 .collect()
2616}
2617
/// A Python literal value recovered from `setup.py` source without running it.
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// A numeric literal; integer and float literals are both stored as `f64`.
    Number(f64),
    /// `True` or `False`.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list display, e.g. `[a, b]`.
    List(Vec<Value>),
    /// A tuple display, e.g. `(a, b)`.
    Tuple(Vec<Value>),
    /// A dictionary whose keys have been converted to strings.
    Dict(HashMap<String, Value>),
}
2628
2629struct LiteralEvaluator {
2630 constants: HashMap<String, Value>,
2631 max_depth: usize,
2632 max_nodes: usize,
2633 nodes_visited: usize,
2634}
2635
2636impl LiteralEvaluator {
2637 fn new(constants: HashMap<String, Value>) -> Self {
2638 Self {
2639 constants,
2640 max_depth: MAX_SETUP_PY_AST_DEPTH,
2641 max_nodes: MAX_SETUP_PY_AST_NODES,
2642 nodes_visited: 0,
2643 }
2644 }
2645
2646 fn insert_constant(&mut self, name: String, value: Value) {
2647 self.constants.insert(name, value);
2648 }
2649
2650 fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
2651 if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
2652 return None;
2653 }
2654 self.nodes_visited += 1;
2655
2656 match expr {
2657 ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
2658 Some(Value::String(value.to_str().to_string()))
2659 }
2660 ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
2661 Some(Value::Bool(*value))
2662 }
2663 ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
2664 self.evaluate_number(value)
2665 }
2666 ast::Expr::NoneLiteral(_) => Some(Value::None),
2667 ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
2668 ast::Expr::List(ast::ExprList { elts, .. }) => {
2669 let mut values = Vec::new();
2670 for elt in elts {
2671 values.push(self.evaluate_expr(elt, depth + 1)?);
2672 }
2673 Some(Value::List(values))
2674 }
2675 ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
2676 let mut values = Vec::new();
2677 for elt in elts {
2678 values.push(self.evaluate_expr(elt, depth + 1)?);
2679 }
2680 Some(Value::Tuple(values))
2681 }
2682 ast::Expr::Dict(ast::ExprDict { items, .. }) => {
2683 let mut dict = HashMap::new();
2684 for item in items {
2685 let key_expr = item.key.as_ref()?;
2686 let key_value = self.evaluate_expr(key_expr, depth + 1)?;
2687 let key = value_to_string(&key_value)?;
2688 let value = self.evaluate_expr(&item.value, depth + 1)?;
2689 dict.insert(key, value);
2690 }
2691 Some(Value::Dict(dict))
2692 }
2693 ast::Expr::Call(ast::ExprCall {
2694 func, arguments, ..
2695 }) => {
2696 let args = arguments.args.as_ref();
2697 let keywords = arguments.keywords.as_ref();
2698 if keywords.is_empty()
2699 && let Some(name) = dotted_name(func.as_ref(), depth + 1)
2700 && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
2701 {
2702 return self.evaluate_ordered_dict(args, depth + 1);
2703 }
2704
2705 if !args.is_empty() {
2706 return None;
2707 }
2708
2709 if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
2710 && id == "dict"
2711 {
2712 let mut dict = HashMap::new();
2713 for keyword in keywords {
2714 let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
2715 let value = self.evaluate_expr(&keyword.value, depth + 1)?;
2716 dict.insert(key.to_string(), value);
2717 }
2718 return Some(Value::Dict(dict));
2719 }
2720
2721 None
2722 }
2723 _ => None,
2724 }
2725 }
2726
2727 fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
2728 match number {
2729 ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
2730 ast::Number::Float(value) => Some(Value::Number(*value)),
2731 ast::Number::Complex { .. } => None,
2732 }
2733 }
2734
2735 fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
2736 if args.len() != 1 {
2737 return None;
2738 }
2739
2740 let items = match self.evaluate_expr(&args[0], depth)? {
2741 Value::List(items) | Value::Tuple(items) => items,
2742 _ => return None,
2743 };
2744
2745 let mut dict = HashMap::new();
2746 for item in items {
2747 let Value::Tuple(values) = item else {
2748 return None;
2749 };
2750 if values.len() != 2 {
2751 return None;
2752 }
2753 let key = value_to_string(&values[0])?;
2754 dict.insert(key, values[1].clone());
2755 }
2756
2757 Some(Value::Dict(dict))
2758 }
2759}
2760
/// Names through which a `setup.py` script can reach the `setup` function.
#[derive(Default)]
struct SetupAliases {
    /// Bare names that refer to `setup` directly (always includes `setup`,
    /// plus any `from <setup module> import setup as <alias>` aliases).
    setup_names: HashSet<String>,
    /// Local name -> imported module name, for `import <setup module> [as <alias>]`.
    module_aliases: HashMap<String, String>,
}
2766
2767fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
2768 extract_from_setup_py(path).into_iter().collect()
2769}
2770
2771fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
2772 let content = match read_file_to_string(path) {
2773 Ok(content) => content,
2774 Err(e) => {
2775 warn!("Failed to read setup.py at {:?}: {}", path, e);
2776 return Some(default_package_data(path));
2777 }
2778 };
2779
2780 if content.len() > MAX_SETUP_PY_BYTES {
2781 warn!("setup.py too large at {:?}: {} bytes", path, content.len());
2782 let package_data = extract_from_setup_py_regex(&content);
2783 return should_emit_setup_py_package(&package_data).then_some(package_data);
2784 }
2785
2786 let mut package_data = match extract_from_setup_py_ast(&content) {
2787 Ok(Some(data)) => data,
2788 Ok(None) => return Some(default_package_data(path)),
2789 Err(e) => {
2790 warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
2791 extract_from_setup_py_regex(&content)
2792 }
2793 };
2794
2795 if package_data.name.is_none() {
2796 package_data.name = extract_setup_value(&content, "name");
2797 }
2798
2799 if package_data.version.is_none() {
2800 package_data.version = extract_setup_value(&content, "version");
2801 }
2802
2803 fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
2804
2805 if package_data.purl.is_none() {
2806 package_data.purl = build_setup_py_purl(
2807 package_data.name.as_deref(),
2808 package_data.version.as_deref(),
2809 );
2810 }
2811
2812 if should_emit_setup_py_package(&package_data) {
2813 Some(package_data)
2814 } else {
2815 Some(default_package_data(path))
2816 }
2817}
2818
2819fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
2820 package_data.name.is_some()
2821 || package_data.version.is_some()
2822 || package_data.purl.is_some()
2823 || !package_data.dependencies.is_empty()
2824 || package_data.extracted_license_statement.is_some()
2825 || !package_data.license_detections.is_empty()
2826 || !package_data.parties.is_empty()
2827 || package_data.description.is_some()
2828 || package_data.homepage_url.is_some()
2829 || package_data.bug_tracking_url.is_some()
2830 || package_data.code_view_url.is_some()
2831 || package_data.vcs_url.is_some()
2832}
2833
2834fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
2835 if package_data.version.is_some()
2836 && package_data.extracted_license_statement.is_some()
2837 && package_data
2838 .parties
2839 .iter()
2840 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
2841 {
2842 return;
2843 }
2844
2845 let Some(root) = path.parent() else {
2846 return;
2847 };
2848
2849 let dunder_metadata = collect_sibling_dunder_metadata(root, content);
2850
2851 if package_data.version.is_none() {
2852 package_data.version = dunder_metadata.version;
2853 }
2854
2855 if package_data.extracted_license_statement.is_none() {
2856 package_data.extracted_license_statement = dunder_metadata.license;
2857 }
2858
2859 let has_author = package_data
2860 .parties
2861 .iter()
2862 .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
2863
2864 if !has_author && let Some(author) = dunder_metadata.author {
2865 package_data.parties.push(Party {
2866 r#type: Some("person".to_string()),
2867 role: Some("author".to_string()),
2868 name: Some(author),
2869 email: None,
2870 url: None,
2871 organization: None,
2872 organization_url: None,
2873 timezone: None,
2874 });
2875 }
2876}
2877
/// Values captured from module-level dunder assignments in sibling modules.
#[derive(Default)]
struct DunderMetadata {
    /// Quoted value assigned to `__version__`, if one was found.
    version: Option<String>,
    /// Quoted value assigned to `__author__`, if one was found.
    author: Option<String>,
    /// Quoted value assigned to `__license__`, if one was found.
    license: Option<String>,
}
2884
2885fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
2886 let statements = match parse_module(content) {
2887 Ok(parsed) => parsed.into_suite(),
2888 Err(_) => return DunderMetadata::default(),
2889 };
2890
2891 let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2892 let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2893 let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2894 let mut metadata = DunderMetadata::default();
2895
2896 for module in imported_dunder_modules(&statements) {
2897 let Some(path) = resolve_imported_module_path(root, &module) else {
2898 continue;
2899 };
2900 let Ok(module_content) = read_file_to_string(&path) else {
2901 continue;
2902 };
2903
2904 if metadata.version.is_none() {
2905 metadata.version = version_re
2906 .as_ref()
2907 .and_then(|regex| regex.captures(&module_content))
2908 .and_then(|captures| captures.get(1))
2909 .map(|match_| match_.as_str().to_string());
2910 }
2911
2912 if metadata.author.is_none() {
2913 metadata.author = author_re
2914 .as_ref()
2915 .and_then(|regex| regex.captures(&module_content))
2916 .and_then(|captures| captures.get(1))
2917 .map(|match_| match_.as_str().to_string());
2918 }
2919
2920 if metadata.license.is_none() {
2921 metadata.license = license_re
2922 .as_ref()
2923 .and_then(|regex| regex.captures(&module_content))
2924 .and_then(|captures| captures.get(1))
2925 .map(|match_| match_.as_str().to_string());
2926 }
2927
2928 if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
2929 return metadata;
2930 }
2931 }
2932
2933 metadata
2934}
2935
2936fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
2937 let mut modules = Vec::new();
2938
2939 for statement in statements {
2940 let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
2941 continue;
2942 };
2943 let Some(module) = module.as_ref().map(|name| name.as_str()) else {
2944 continue;
2945 };
2946 let imports_dunder = names.iter().any(|alias| {
2947 matches!(
2948 alias.name.as_str(),
2949 "__version__" | "__author__" | "__license__"
2950 )
2951 });
2952 if imports_dunder {
2953 modules.push(module.to_string());
2954 }
2955 }
2956
2957 modules
2958}
2959
/// Maps a dotted module name to an existing file below `root`.
///
/// Candidates are probed in this order and the first one that exists wins:
///
/// 1. `<root>/<module path>.py`
/// 2. `<root>/<module path>/__init__.py`
/// 3. `<root>/src/<module path>.py`
/// 4. `<root>/src/<module path>/__init__.py`
///
/// Returns `None` when none of them exists.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let module_path: PathBuf = module.split('.').collect();
    let module_file = module_path.with_extension("py");
    let package_init = module_path.join("__init__.py");
    let src_root = root.join("src");

    [
        root.join(&module_file),
        root.join(&package_init),
        src_root.join(&module_file),
        src_root.join(&package_init),
    ]
    .into_iter()
    .find(|candidate| candidate.exists())
}
2971
2972fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
2988 let statements = parse_module(content)
2989 .map(|parsed| parsed.into_suite())
2990 .map_err(|e| e.to_string())?;
2991 let aliases = collect_setup_aliases(&statements);
2992 let mut evaluator = LiteralEvaluator::new(HashMap::new());
2993 build_setup_py_constants(&statements, &mut evaluator);
2994
2995 let setup_call = find_setup_call(&statements, &aliases);
2996 let Some(call_expr) = setup_call else {
2997 return Ok(None);
2998 };
2999
3000 let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3001 Ok(Some(build_setup_py_package_data(&setup_values)))
3002}
3003
3004fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3005 for stmt in statements {
3006 if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3007 if targets.len() != 1 {
3008 continue;
3009 }
3010
3011 let Some(name) = extract_assign_name(&targets[0]) else {
3012 continue;
3013 };
3014
3015 if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3016 evaluator.insert_constant(name, value);
3017 }
3018 }
3019 }
3020}
3021
3022fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3023 match target {
3024 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3025 _ => None,
3026 }
3027}
3028
3029fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3030 let mut aliases = SetupAliases::default();
3031 aliases.setup_names.insert("setup".to_string());
3032
3033 for stmt in statements {
3034 match stmt {
3035 ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3036 for alias in names {
3037 let module_name = alias.name.as_str();
3038 if !is_setup_module(module_name) {
3039 continue;
3040 }
3041 let alias_name = alias
3042 .asname
3043 .as_ref()
3044 .map(|name| name.as_str())
3045 .unwrap_or(module_name);
3046 aliases
3047 .module_aliases
3048 .insert(alias_name.to_string(), module_name.to_string());
3049 }
3050 }
3051 ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3052 let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3053 continue;
3054 };
3055 if !is_setup_module(module_name) {
3056 continue;
3057 }
3058 for alias in names {
3059 if alias.name.as_str() != "setup" {
3060 continue;
3061 }
3062 let alias_name = alias
3063 .asname
3064 .as_ref()
3065 .map(|name| name.as_str())
3066 .unwrap_or("setup");
3067 aliases.setup_names.insert(alias_name.to_string());
3068 }
3069 }
3070 _ => {}
3071 }
3072 }
3073
3074 aliases
3075}
3076
/// Reports whether `module_name` is one of the modules recognised as
/// providing `setup`: `setuptools`, `distutils` or `distutils.core`.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3080
3081fn find_setup_call<'a>(
3082 statements: &'a [ast::Stmt],
3083 aliases: &'a SetupAliases,
3084) -> Option<&'a ast::Expr> {
3085 let mut finder = SetupCallFinder {
3086 aliases,
3087 called_function_names: collect_top_level_called_function_names(statements),
3088 nodes_visited: 0,
3089 };
3090 finder.find_in_statements(statements)
3091}
3092
3093fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3094 let mut called = HashSet::new();
3095 collect_called_function_names_in_statements(statements, &mut called);
3096 called
3097}
3098
3099fn collect_called_function_names_in_statements(
3100 statements: &[ast::Stmt],
3101 called: &mut HashSet<String>,
3102) {
3103 for stmt in statements {
3104 match stmt {
3105 ast::Stmt::Expr(ast::StmtExpr { value, .. })
3106 | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3107 collect_called_function_names_in_expr(value.as_ref(), called);
3108 }
3109 ast::Stmt::If(ast::StmtIf {
3110 body,
3111 elif_else_clauses,
3112 ..
3113 }) => {
3114 collect_called_function_names_in_statements(body, called);
3115 for clause in elif_else_clauses {
3116 collect_called_function_names_in_statements(&clause.body, called);
3117 }
3118 }
3119 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3120 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3121 collect_called_function_names_in_statements(body, called);
3122 collect_called_function_names_in_statements(orelse, called);
3123 }
3124 ast::Stmt::With(ast::StmtWith { body, .. }) => {
3125 collect_called_function_names_in_statements(body, called);
3126 }
3127 ast::Stmt::Try(ast::StmtTry {
3128 body,
3129 orelse,
3130 finalbody,
3131 handlers,
3132 ..
3133 }) => {
3134 collect_called_function_names_in_statements(body, called);
3135 collect_called_function_names_in_statements(orelse, called);
3136 collect_called_function_names_in_statements(finalbody, called);
3137 for handler in handlers {
3138 let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3139 body,
3140 ..
3141 }) = handler;
3142 collect_called_function_names_in_statements(body, called);
3143 }
3144 }
3145 _ => {}
3146 }
3147 }
3148}
3149
3150fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3151 if let ast::Expr::Call(ast::ExprCall {
3152 func, arguments, ..
3153 }) = expr
3154 {
3155 if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3156 called.insert(id.as_str().to_string());
3157 }
3158
3159 for arg in arguments.args.iter() {
3160 collect_called_function_names_in_expr(arg, called);
3161 }
3162 for keyword in arguments.keywords.iter() {
3163 collect_called_function_names_in_expr(&keyword.value, called);
3164 }
3165 }
3166}
3167
3168struct SetupCallFinder<'a> {
3169 aliases: &'a SetupAliases,
3170 called_function_names: HashSet<String>,
3171 nodes_visited: usize,
3172}
3173
3174impl<'a> SetupCallFinder<'a> {
3175 fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3176 for stmt in statements {
3177 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3178 return None;
3179 }
3180 self.nodes_visited += 1;
3181
3182 let found = match stmt {
3183 ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3184 ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3185 ast::Stmt::If(ast::StmtIf {
3186 body,
3187 elif_else_clauses,
3188 ..
3189 }) => self.find_in_statements(body).or_else(|| {
3190 for clause in elif_else_clauses {
3191 if let Some(found) = self.find_in_statements(&clause.body) {
3192 return Some(found);
3193 }
3194 }
3195 None
3196 }),
3197 ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3198 | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3199 .find_in_statements(body)
3200 .or_else(|| self.find_in_statements(orelse)),
3201 ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3202 .called_function_names
3203 .contains(name.as_str())
3204 .then(|| self.find_in_statements(body))
3205 .flatten(),
3206 ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3207 ast::Stmt::Try(ast::StmtTry {
3208 body,
3209 orelse,
3210 finalbody,
3211 handlers,
3212 ..
3213 }) => self
3214 .find_in_statements(body)
3215 .or_else(|| self.find_in_statements(orelse))
3216 .or_else(|| self.find_in_statements(finalbody))
3217 .or_else(|| {
3218 for handler in handlers {
3219 let ast::ExceptHandler::ExceptHandler(
3220 ast::ExceptHandlerExceptHandler { body, .. },
3221 ) = handler;
3222 if let Some(found) = self.find_in_statements(body) {
3223 return Some(found);
3224 }
3225 }
3226 None
3227 }),
3228 _ => None,
3229 };
3230
3231 if found.is_some() {
3232 return found;
3233 }
3234 }
3235
3236 None
3237 }
3238
3239 fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3240 if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3241 return None;
3242 }
3243 self.nodes_visited += 1;
3244
3245 match expr {
3246 ast::Expr::Call(ast::ExprCall { func, .. })
3247 if is_setup_call(func.as_ref(), self.aliases) =>
3248 {
3249 Some(expr)
3250 }
3251 _ => None,
3252 }
3253 }
3254}
3255
3256fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3257 let Some(dotted) = dotted_name(func, 0) else {
3258 return false;
3259 };
3260
3261 if aliases.setup_names.contains(&dotted) {
3262 return true;
3263 }
3264
3265 let Some(module) = dotted.strip_suffix(".setup") else {
3266 return false;
3267 };
3268
3269 let resolved = resolve_module_alias(module, aliases);
3270 is_setup_module(&resolved)
3271}
3272
3273fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3274 if depth >= MAX_SETUP_PY_AST_DEPTH {
3275 return None;
3276 }
3277
3278 match expr {
3279 ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3280 ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3281 let base = dotted_name(value.as_ref(), depth + 1)?;
3282 Some(format!("{}.{}", base, attr.as_str()))
3283 }
3284 _ => None,
3285 }
3286}
3287
3288fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3289 if let Some(mapped) = aliases.module_aliases.get(module) {
3290 return mapped.clone();
3291 }
3292
3293 let Some((base, rest)) = module.split_once('.') else {
3294 return module.to_string();
3295 };
3296
3297 if let Some(mapped) = aliases.module_aliases.get(base) {
3298 return format!("{}.{}", mapped, rest);
3299 }
3300
3301 module.to_string()
3302}
3303
3304fn extract_setup_keywords(
3305 call_expr: &ast::Expr,
3306 evaluator: &mut LiteralEvaluator,
3307) -> HashMap<String, Value> {
3308 let mut values = HashMap::new();
3309 let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3310 return values;
3311 };
3312
3313 for keyword in arguments.keywords.iter() {
3314 if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3315 if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3316 values.insert(arg.to_string(), value);
3317 }
3318 } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3319 for (key, value) in dict {
3320 values.insert(key, value);
3321 }
3322 }
3323 }
3324
3325 values
3326}
3327
3328fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3329 let name = get_value_string(values, "name");
3330 let version = get_value_string(values, "version");
3331 let description =
3332 get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
3333 let homepage_url =
3334 get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
3335 let author = get_value_string(values, "author");
3336 let author_email = get_value_string(values, "author_email");
3337 let maintainer = get_value_string(values, "maintainer");
3338 let maintainer_email = get_value_string(values, "maintainer_email");
3339 let license = get_value_string(values, "license");
3340 let classifiers = values
3341 .get("classifiers")
3342 .and_then(value_to_string_list)
3343 .unwrap_or_default();
3344
3345 let mut parties = Vec::new();
3346 if author.is_some() || author_email.is_some() {
3347 parties.push(Party {
3348 r#type: Some("person".to_string()),
3349 role: Some("author".to_string()),
3350 name: author,
3351 email: author_email,
3352 url: None,
3353 organization: None,
3354 organization_url: None,
3355 timezone: None,
3356 });
3357 }
3358
3359 if maintainer.is_some() || maintainer_email.is_some() {
3360 parties.push(Party {
3361 r#type: Some("person".to_string()),
3362 role: Some("maintainer".to_string()),
3363 name: maintainer,
3364 email: maintainer_email,
3365 url: None,
3366 organization: None,
3367 organization_url: None,
3368 timezone: None,
3369 });
3370 }
3371
3372 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3373 normalize_spdx_declared_license(license.as_deref());
3374 let extracted_license_statement = license.clone();
3375
3376 let dependencies = build_setup_py_dependencies(values);
3377 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3378 let mut homepage_from_project_urls = None;
3379 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3380 let mut extra_data = HashMap::new();
3381
3382 if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3383 apply_project_url_mappings(
3384 &parsed_project_urls,
3385 &mut homepage_from_project_urls,
3386 &mut bug_tracking_url,
3387 &mut code_view_url,
3388 &mut vcs_url,
3389 &mut extra_data,
3390 );
3391 }
3392
3393 let extra_data = if extra_data.is_empty() {
3394 None
3395 } else {
3396 Some(extra_data)
3397 };
3398
3399 PackageData {
3400 package_type: Some(PythonParser::PACKAGE_TYPE),
3401 namespace: None,
3402 name,
3403 version,
3404 qualifiers: None,
3405 subpath: None,
3406 primary_language: Some("Python".to_string()),
3407 description,
3408 release_date: None,
3409 parties,
3410 keywords: Vec::new(),
3411 homepage_url: homepage_url.or(homepage_from_project_urls),
3412 download_url: None,
3413 size: None,
3414 sha1: None,
3415 md5: None,
3416 sha256: None,
3417 sha512: None,
3418 bug_tracking_url,
3419 code_view_url,
3420 vcs_url,
3421 copyright: None,
3422 holder: None,
3423 declared_license_expression,
3424 declared_license_expression_spdx,
3425 license_detections,
3426 other_license_expression: None,
3427 other_license_expression_spdx: None,
3428 other_license_detections: Vec::new(),
3429 extracted_license_statement,
3430 notice_text: None,
3431 source_packages: Vec::new(),
3432 file_references: Vec::new(),
3433 is_private: has_private_classifier(&classifiers),
3434 is_virtual: false,
3435 extra_data,
3436 dependencies,
3437 repository_homepage_url: None,
3438 repository_download_url: None,
3439 api_data_url: None,
3440 datasource_id: Some(DatasourceId::PypiSetupPy),
3441 purl,
3442 }
3443}
3444
3445fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3446 let mut dependencies = Vec::new();
3447
3448 if let Some(reqs) = values
3449 .get("install_requires")
3450 .and_then(value_to_string_list)
3451 {
3452 dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3453 }
3454
3455 if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3456 dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3457 }
3458
3459 if let Some(Value::Dict(extras)) = values.get("extras_require") {
3460 let mut extra_items: Vec<_> = extras.iter().collect();
3461 extra_items.sort_by_key(|(name, _)| *name);
3462 for (extra_name, extra_value) in extra_items {
3463 if let Some(reqs) = value_to_string_list(extra_value) {
3464 dependencies.extend(build_setup_py_dependency_list(
3465 reqs.as_slice(),
3466 extra_name,
3467 true,
3468 ));
3469 }
3470 }
3471 }
3472
3473 dependencies
3474}
3475
3476fn build_setup_py_dependency_list(
3477 reqs: &[String],
3478 scope: &str,
3479 is_optional: bool,
3480) -> Vec<Dependency> {
3481 reqs.iter()
3482 .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3483 .collect()
3484}
3485
3486fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3487 values.get(key).and_then(value_to_string)
3488}
3489
3490fn value_to_string(value: &Value) -> Option<String> {
3491 match value {
3492 Value::String(value) => Some(value.clone()),
3493 Value::Number(value) => Some(value.to_string()),
3494 Value::Bool(value) => Some(value.to_string()),
3495 _ => None,
3496 }
3497}
3498
3499fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3500 match value {
3501 Value::String(value) => Some(vec![value.clone()]),
3502 Value::List(values) | Value::Tuple(values) => {
3503 let mut items = Vec::new();
3504 for item in values {
3505 items.push(value_to_string(item)?);
3506 }
3507 Some(items)
3508 }
3509 _ => None,
3510 }
3511}
3512
3513fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3514 let Value::Dict(dict) = value else {
3515 return None;
3516 };
3517
3518 let mut pairs: Vec<(String, String)> = dict
3519 .iter()
3520 .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3521 .collect::<Option<Vec<_>>>()?;
3522 pairs.sort_by(|left, right| left.0.cmp(&right.0));
3523 Some(pairs)
3524}
3525
3526fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3527 let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3528 extract_requires_dist_dependencies(&requires_dist)
3529}
3530
3531pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3532 requires_dist
3533 .iter()
3534 .filter_map(|entry| build_rfc822_dependency(entry))
3535 .collect()
3536}
3537
3538fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3539 build_python_dependency(entry, "install", false, None)
3540}
3541
3542fn build_python_dependency(
3543 entry: &str,
3544 default_scope: &str,
3545 default_optional: bool,
3546 marker_override: Option<&str>,
3547) -> Option<Dependency> {
3548 let (requirement_part, marker_part) = entry
3549 .split_once(';')
3550 .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3551 .unwrap_or((entry.trim(), None));
3552
3553 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3554 let requirement = normalize_rfc822_requirement(requirement_part);
3555 let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3556 marker_part.or(marker_override),
3557 default_scope,
3558 default_optional,
3559 );
3560 let mut purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
3561
3562 let is_pinned = requirement
3563 .as_deref()
3564 .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3565 if is_pinned
3566 && let Some(version) = requirement
3567 .as_deref()
3568 .map(|req| req.trim_start_matches('='))
3569 {
3570 purl.with_version(version).ok()?;
3571 }
3572
3573 let mut extra_data = HashMap::new();
3574 extra_data.extend(marker_data);
3575 if let Some(marker) = marker {
3576 extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3577 }
3578
3579 Some(Dependency {
3580 purl: Some(purl.to_string()),
3581 extracted_requirement: requirement,
3582 scope: Some(scope),
3583 is_runtime: Some(true),
3584 is_optional: Some(is_optional),
3585 is_pinned: Some(is_pinned),
3586 is_direct: Some(true),
3587 resolved_package: None,
3588 extra_data: if extra_data.is_empty() {
3589 None
3590 } else {
3591 Some(extra_data)
3592 },
3593 })
3594}
3595
3596fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3597 let name = extract_setup_cfg_dependency_name(requirement_part)?;
3598 let trimmed = requirement_part.trim();
3599 let mut remainder = trimmed[name.len()..].trim();
3600
3601 if let Some(stripped) = remainder.strip_prefix('[')
3602 && let Some(end_idx) = stripped.find(']')
3603 {
3604 remainder = stripped[end_idx + 1..].trim();
3605 }
3606
3607 let remainder = remainder
3608 .strip_prefix('(')
3609 .and_then(|value| value.strip_suffix(')'))
3610 .unwrap_or(remainder)
3611 .trim();
3612
3613 if remainder.is_empty() {
3614 return None;
3615 }
3616
3617 let mut specifiers: Vec<String> = remainder
3618 .split(',')
3619 .map(|specifier| specifier.trim().replace(' ', ""))
3620 .filter(|specifier| !specifier.is_empty())
3621 .collect();
3622 specifiers.sort();
3623 Some(specifiers.join(","))
3624}
3625
3626fn parse_rfc822_marker(
3627 marker_part: Option<&str>,
3628 default_scope: &str,
3629 default_optional: bool,
3630) -> (
3631 String,
3632 bool,
3633 Option<String>,
3634 HashMap<String, serde_json::Value>,
3635) {
3636 let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
3637 return (
3638 default_scope.to_string(),
3639 default_optional,
3640 None,
3641 HashMap::new(),
3642 );
3643 };
3644
3645 let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
3646 .expect("extra marker regex should compile");
3647 let mut extra_data = HashMap::new();
3648
3649 if let Some(python_version) = extract_marker_field(marker, "python_version") {
3650 extra_data.insert(
3651 "python_version".to_string(),
3652 serde_json::Value::String(python_version),
3653 );
3654 }
3655 if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
3656 extra_data.insert(
3657 "sys_platform".to_string(),
3658 serde_json::Value::String(sys_platform),
3659 );
3660 }
3661
3662 if let Some(captures) = extra_re.captures(marker)
3663 && let Some(scope) = captures.get(1)
3664 {
3665 return (
3666 scope.as_str().to_string(),
3667 true,
3668 Some(marker.trim().to_string()),
3669 extra_data,
3670 );
3671 }
3672
3673 (
3674 default_scope.to_string(),
3675 default_optional,
3676 Some(marker.trim().to_string()),
3677 extra_data,
3678 )
3679}
3680
3681fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
3682 let re = Regex::new(&format!(
3683 r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
3684 field
3685 ))
3686 .ok()?;
3687 let captures = re.captures(marker)?;
3688 let operator = captures.get(1)?.as_str();
3689 let value = captures.get(2)?.as_str();
3690 Some(format!("{} {}", operator, value))
3691}
3692
3693fn parse_requires_txt(content: &str) -> Vec<Dependency> {
3694 let mut dependencies = Vec::new();
3695 let mut current_scope = "install".to_string();
3696 let mut current_optional = false;
3697 let mut current_marker: Option<String> = None;
3698
3699 for line in content.lines() {
3700 let trimmed = line.trim();
3701 if trimmed.is_empty() || trimmed.starts_with('#') {
3702 continue;
3703 }
3704
3705 if trimmed.starts_with('[') && trimmed.ends_with(']') {
3706 let inner = &trimmed[1..trimmed.len() - 1];
3707 if let Some(rest) = inner.strip_prefix(':') {
3708 current_scope = "install".to_string();
3709 current_optional = false;
3710 current_marker = Some(rest.trim().to_string());
3711 } else if let Some((scope, marker)) = inner.split_once(':') {
3712 current_scope = scope.trim().to_string();
3713 current_optional = true;
3714 current_marker = Some(marker.trim().to_string());
3715 } else {
3716 current_scope = inner.trim().to_string();
3717 current_optional = true;
3718 current_marker = None;
3719 }
3720 continue;
3721 }
3722
3723 if let Some(dependency) = build_python_dependency(
3724 trimmed,
3725 ¤t_scope,
3726 current_optional,
3727 current_marker.as_deref(),
3728 ) {
3729 dependencies.push(dependency);
3730 }
3731 }
3732
3733 dependencies
3734}
3735
/// Reports whether any classifier equals `Private :: Do Not Upload`.
///
/// The comparison ignores ASCII case and is otherwise exact.
fn has_private_classifier(classifiers: &[String]) -> bool {
    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case("Private :: Do Not Upload") {
            return true;
        }
    }
    false
}
3741
3742fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
3743 let name = name?;
3744 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
3745 if let Some(version) = version {
3746 package_url.with_version(version).ok()?;
3747 }
3748 Some(package_url.to_string())
3749}
3750
3751fn extract_from_setup_py_regex(content: &str) -> PackageData {
3752 let name = extract_setup_value(content, "name");
3753 let version = extract_setup_value(content, "version");
3754 let license_expression = extract_setup_value(content, "license");
3755
3756 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3757 normalize_spdx_declared_license(license_expression.as_deref());
3758 let extracted_license_statement = license_expression.clone();
3759
3760 let dependencies = extract_setup_py_dependencies(content);
3761 let homepage_url = extract_setup_value(content, "url");
3762 let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3763
3764 PackageData {
3765 package_type: Some(PythonParser::PACKAGE_TYPE),
3766 namespace: None,
3767 name,
3768 version,
3769 qualifiers: None,
3770 subpath: None,
3771 primary_language: Some("Python".to_string()),
3772 description: None,
3773 release_date: None,
3774 parties: Vec::new(),
3775 keywords: Vec::new(),
3776 homepage_url,
3777 download_url: None,
3778 size: None,
3779 sha1: None,
3780 md5: None,
3781 sha256: None,
3782 sha512: None,
3783 bug_tracking_url: None,
3784 code_view_url: None,
3785 vcs_url: None,
3786 copyright: None,
3787 holder: None,
3788 declared_license_expression,
3789 declared_license_expression_spdx,
3790 license_detections,
3791 other_license_expression: None,
3792 other_license_expression_spdx: None,
3793 other_license_detections: Vec::new(),
3794 extracted_license_statement,
3795 notice_text: None,
3796 source_packages: Vec::new(),
3797 file_references: Vec::new(),
3798 is_private: false,
3799 is_virtual: false,
3800 extra_data: None,
3801 dependencies,
3802 repository_homepage_url: None,
3803 repository_download_url: None,
3804 api_data_url: None,
3805 datasource_id: Some(DatasourceId::PypiSetupPy),
3806 purl,
3807 }
3808}
3809
3810fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
3811 crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
3812}
3813
3814fn extract_from_pypi_json(path: &Path) -> PackageData {
3815 let default = PackageData {
3816 package_type: Some(PythonParser::PACKAGE_TYPE),
3817 datasource_id: Some(DatasourceId::PypiJson),
3818 ..Default::default()
3819 };
3820
3821 let content = match read_file_to_string(path) {
3822 Ok(content) => content,
3823 Err(error) => {
3824 warn!("Failed to read pypi.json at {:?}: {}", path, error);
3825 return default;
3826 }
3827 };
3828
3829 let root: serde_json::Value = match serde_json::from_str(&content) {
3830 Ok(value) => value,
3831 Err(error) => {
3832 warn!("Failed to parse pypi.json at {:?}: {}", path, error);
3833 return default;
3834 }
3835 };
3836
3837 let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
3838 warn!("No info object found in pypi.json at {:?}", path);
3839 return default;
3840 };
3841
3842 let name = info
3843 .get("name")
3844 .and_then(|value| value.as_str())
3845 .map(ToOwned::to_owned);
3846 let version = info
3847 .get("version")
3848 .and_then(|value| value.as_str())
3849 .map(ToOwned::to_owned);
3850 let summary = info
3851 .get("summary")
3852 .and_then(|value| value.as_str())
3853 .map(ToOwned::to_owned);
3854 let description = info
3855 .get("description")
3856 .and_then(|value| value.as_str())
3857 .filter(|value| !value.trim().is_empty())
3858 .map(ToOwned::to_owned)
3859 .or(summary);
3860 let mut homepage_url = info
3861 .get("home_page")
3862 .and_then(|value| value.as_str())
3863 .map(ToOwned::to_owned);
3864 let author = info
3865 .get("author")
3866 .and_then(|value| value.as_str())
3867 .filter(|value| !value.trim().is_empty())
3868 .map(ToOwned::to_owned);
3869 let author_email = info
3870 .get("author_email")
3871 .and_then(|value| value.as_str())
3872 .filter(|value| !value.trim().is_empty())
3873 .map(ToOwned::to_owned);
3874 let license = info
3875 .get("license")
3876 .and_then(|value| value.as_str())
3877 .filter(|value| !value.trim().is_empty())
3878 .map(ToOwned::to_owned);
3879 let keywords = parse_setup_cfg_keywords(
3880 info.get("keywords")
3881 .and_then(|value| value.as_str())
3882 .map(ToOwned::to_owned),
3883 );
3884 let classifiers = info
3885 .get("classifiers")
3886 .and_then(|value| value.as_array())
3887 .map(|values| {
3888 values
3889 .iter()
3890 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
3891 .collect::<Vec<_>>()
3892 })
3893 .unwrap_or_default();
3894
3895 let mut parties = Vec::new();
3896 if author.is_some() || author_email.is_some() {
3897 parties.push(Party {
3898 r#type: Some("person".to_string()),
3899 role: Some("author".to_string()),
3900 name: author,
3901 email: author_email,
3902 url: None,
3903 organization: None,
3904 organization_url: None,
3905 timezone: None,
3906 });
3907 }
3908
3909 let mut bug_tracking_url = None;
3910 let mut code_view_url = None;
3911 let mut vcs_url = None;
3912 let mut extra_data = HashMap::new();
3913
3914 let parsed_project_urls = info
3915 .get("project_urls")
3916 .and_then(|value| value.as_object())
3917 .map(|map| {
3918 let mut pairs: Vec<(String, String)> = map
3919 .iter()
3920 .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
3921 .collect();
3922 pairs.sort_by(|left, right| left.0.cmp(&right.0));
3923 pairs
3924 })
3925 .unwrap_or_default();
3926
3927 apply_project_url_mappings(
3928 &parsed_project_urls,
3929 &mut homepage_url,
3930 &mut bug_tracking_url,
3931 &mut code_view_url,
3932 &mut vcs_url,
3933 &mut extra_data,
3934 );
3935
3936 let (download_url, size, sha256) = root
3937 .get("urls")
3938 .and_then(|value| value.as_array())
3939 .map(|urls| select_pypi_json_artifact(urls))
3940 .unwrap_or((None, None, None));
3941
3942 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3943 normalize_spdx_declared_license(license.as_deref());
3944 let dependencies = info
3945 .get("requires_dist")
3946 .and_then(|value| value.as_array())
3947 .map(|entries| {
3948 entries
3949 .iter()
3950 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
3951 .collect::<Vec<_>>()
3952 })
3953 .map(|entries| extract_requires_dist_dependencies(&entries))
3954 .unwrap_or_default();
3955
3956 let (repository_homepage_url, repository_download_url, api_data_url, purl) =
3957 build_pypi_urls(name.as_deref(), version.as_deref());
3958
3959 PackageData {
3960 package_type: Some(PythonParser::PACKAGE_TYPE),
3961 namespace: None,
3962 name,
3963 version,
3964 qualifiers: None,
3965 subpath: None,
3966 primary_language: None,
3967 description,
3968 release_date: None,
3969 parties,
3970 keywords,
3971 homepage_url: homepage_url.or(repository_homepage_url.clone()),
3972 download_url,
3973 size,
3974 sha1: None,
3975 md5: None,
3976 sha256,
3977 sha512: None,
3978 bug_tracking_url,
3979 code_view_url,
3980 vcs_url,
3981 copyright: None,
3982 holder: None,
3983 declared_license_expression,
3984 declared_license_expression_spdx,
3985 license_detections,
3986 other_license_expression: None,
3987 other_license_expression_spdx: None,
3988 other_license_detections: Vec::new(),
3989 extracted_license_statement: license,
3990 notice_text: None,
3991 source_packages: Vec::new(),
3992 file_references: Vec::new(),
3993 is_private: has_private_classifier(&classifiers),
3994 is_virtual: false,
3995 extra_data: if extra_data.is_empty() {
3996 None
3997 } else {
3998 Some(extra_data)
3999 },
4000 dependencies,
4001 repository_homepage_url,
4002 repository_download_url,
4003 api_data_url,
4004 datasource_id: Some(DatasourceId::PypiJson),
4005 purl,
4006 }
4007}
4008
4009fn select_pypi_json_artifact(
4010 urls: &[serde_json::Value],
4011) -> (Option<String>, Option<u64>, Option<String>) {
4012 let selected = urls
4013 .iter()
4014 .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4015 .or_else(|| urls.first());
4016
4017 let Some(entry) = selected else {
4018 return (None, None, None);
4019 };
4020
4021 let download_url = entry
4022 .get("url")
4023 .and_then(|value| value.as_str())
4024 .map(ToOwned::to_owned);
4025 let size = entry.get("size").and_then(|value| value.as_u64());
4026 let sha256 = entry
4027 .get("digests")
4028 .and_then(|value| value.as_object())
4029 .and_then(|digests| digests.get("sha256"))
4030 .and_then(|value| value.as_str())
4031 .map(ToOwned::to_owned);
4032
4033 (download_url, size, sha256)
4034}
4035
4036fn extract_from_pip_inspect(path: &Path) -> PackageData {
4037 let content = match read_file_to_string(path) {
4038 Ok(content) => content,
4039 Err(e) => {
4040 warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4041 return default_package_data(path);
4042 }
4043 };
4044
4045 let root: serde_json::Value = match serde_json::from_str(&content) {
4046 Ok(value) => value,
4047 Err(e) => {
4048 warn!(
4049 "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4050 path, e
4051 );
4052 return default_package_data(path);
4053 }
4054 };
4055
4056 let installed = match root.get("installed").and_then(|v| v.as_array()) {
4057 Some(arr) => arr,
4058 None => {
4059 warn!(
4060 "No 'installed' array found in pip-inspect.deplock at {:?}",
4061 path
4062 );
4063 return default_package_data(path);
4064 }
4065 };
4066
4067 let pip_version = root
4068 .get("pip_version")
4069 .and_then(|v| v.as_str())
4070 .map(String::from);
4071 let inspect_version = root
4072 .get("version")
4073 .and_then(|v| v.as_str())
4074 .map(String::from);
4075
4076 let mut main_package: Option<PackageData> = None;
4077 let mut dependencies: Vec<Dependency> = Vec::new();
4078
4079 for package_entry in installed {
4080 let metadata = match package_entry.get("metadata") {
4081 Some(m) => m,
4082 None => continue,
4083 };
4084
4085 let is_requested = package_entry
4086 .get("requested")
4087 .and_then(|v| v.as_bool())
4088 .unwrap_or(false);
4089 let has_direct_url = package_entry.get("direct_url").is_some();
4090
4091 let name = metadata
4092 .get("name")
4093 .and_then(|v| v.as_str())
4094 .map(String::from);
4095 let version = metadata
4096 .get("version")
4097 .and_then(|v| v.as_str())
4098 .map(String::from);
4099 let summary = metadata
4100 .get("summary")
4101 .and_then(|v| v.as_str())
4102 .map(String::from);
4103 let home_page = metadata
4104 .get("home_page")
4105 .and_then(|v| v.as_str())
4106 .map(String::from);
4107 let author = metadata
4108 .get("author")
4109 .and_then(|v| v.as_str())
4110 .map(String::from);
4111 let author_email = metadata
4112 .get("author_email")
4113 .and_then(|v| v.as_str())
4114 .map(String::from);
4115 let license = metadata
4116 .get("license")
4117 .and_then(|v| v.as_str())
4118 .map(String::from);
4119 let description = metadata
4120 .get("description")
4121 .and_then(|v| v.as_str())
4122 .map(String::from);
4123 let keywords = metadata
4124 .get("keywords")
4125 .and_then(|v| v.as_array())
4126 .map(|arr| {
4127 arr.iter()
4128 .filter_map(|k| k.as_str().map(String::from))
4129 .collect::<Vec<_>>()
4130 })
4131 .unwrap_or_default();
4132
4133 let mut parties = Vec::new();
4134 if author.is_some() || author_email.is_some() {
4135 parties.push(Party {
4136 r#type: Some("person".to_string()),
4137 role: Some("author".to_string()),
4138 name: author,
4139 email: author_email,
4140 url: None,
4141 organization: None,
4142 organization_url: None,
4143 timezone: None,
4144 });
4145 }
4146
4147 let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4148 normalize_spdx_declared_license(license.as_deref());
4149 let extracted_license_statement = license.clone();
4150 let requires_dist = metadata
4151 .get("requires_dist")
4152 .and_then(|v| v.as_array())
4153 .map(|entries| {
4154 entries
4155 .iter()
4156 .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4157 .collect::<Vec<_>>()
4158 })
4159 .unwrap_or_default();
4160 let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4161
4162 let purl = name.as_ref().and_then(|n| {
4163 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4164 if let Some(v) = &version {
4165 package_url.with_version(v).ok()?;
4166 }
4167 Some(package_url.to_string())
4168 });
4169
4170 if is_requested && has_direct_url {
4171 let mut extra_data = HashMap::new();
4172 if let Some(pv) = &pip_version {
4173 extra_data.insert(
4174 "pip_version".to_string(),
4175 serde_json::Value::String(pv.clone()),
4176 );
4177 }
4178 if let Some(iv) = &inspect_version {
4179 extra_data.insert(
4180 "inspect_version".to_string(),
4181 serde_json::Value::String(iv.clone()),
4182 );
4183 }
4184
4185 main_package = Some(PackageData {
4186 package_type: Some(PythonParser::PACKAGE_TYPE),
4187 namespace: None,
4188 name,
4189 version,
4190 qualifiers: None,
4191 subpath: None,
4192 primary_language: Some("Python".to_string()),
4193 description: description.or(summary),
4194 release_date: None,
4195 parties,
4196 keywords,
4197 homepage_url: home_page,
4198 download_url: None,
4199 size: None,
4200 sha1: None,
4201 md5: None,
4202 sha256: None,
4203 sha512: None,
4204 bug_tracking_url: None,
4205 code_view_url: None,
4206 vcs_url: None,
4207 copyright: None,
4208 holder: None,
4209 declared_license_expression,
4210 declared_license_expression_spdx,
4211 license_detections,
4212 other_license_expression: None,
4213 other_license_expression_spdx: None,
4214 other_license_detections: Vec::new(),
4215 extracted_license_statement,
4216 notice_text: None,
4217 source_packages: Vec::new(),
4218 file_references: Vec::new(),
4219 is_private: false,
4220 is_virtual: true,
4221 extra_data: if extra_data.is_empty() {
4222 None
4223 } else {
4224 Some(extra_data)
4225 },
4226 dependencies: parsed_dependencies,
4227 repository_homepage_url: None,
4228 repository_download_url: None,
4229 api_data_url: None,
4230 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4231 purl,
4232 });
4233 } else {
4234 let resolved_package = PackageData {
4235 package_type: Some(PythonParser::PACKAGE_TYPE),
4236 namespace: None,
4237 name: name.clone(),
4238 version: version.clone(),
4239 qualifiers: None,
4240 subpath: None,
4241 primary_language: Some("Python".to_string()),
4242 description: description.or(summary),
4243 release_date: None,
4244 parties,
4245 keywords,
4246 homepage_url: home_page,
4247 download_url: None,
4248 size: None,
4249 sha1: None,
4250 md5: None,
4251 sha256: None,
4252 sha512: None,
4253 bug_tracking_url: None,
4254 code_view_url: None,
4255 vcs_url: None,
4256 copyright: None,
4257 holder: None,
4258 declared_license_expression,
4259 declared_license_expression_spdx,
4260 license_detections,
4261 other_license_expression: None,
4262 other_license_expression_spdx: None,
4263 other_license_detections: Vec::new(),
4264 extracted_license_statement,
4265 notice_text: None,
4266 source_packages: Vec::new(),
4267 file_references: Vec::new(),
4268 is_private: false,
4269 is_virtual: true,
4270 extra_data: None,
4271 dependencies: parsed_dependencies,
4272 repository_homepage_url: None,
4273 repository_download_url: None,
4274 api_data_url: None,
4275 datasource_id: Some(DatasourceId::PypiInspectDeplock),
4276 purl: purl.clone(),
4277 };
4278
4279 let resolved = package_data_to_resolved(&resolved_package);
4280 dependencies.push(Dependency {
4281 purl,
4282 extracted_requirement: None,
4283 scope: None,
4284 is_runtime: Some(true),
4285 is_optional: Some(false),
4286 is_pinned: Some(true),
4287 is_direct: Some(is_requested),
4288 resolved_package: Some(Box::new(resolved)),
4289 extra_data: None,
4290 });
4291 }
4292 }
4293
4294 if let Some(mut main_pkg) = main_package {
4295 let direct_requirement_purls: HashSet<String> = main_pkg
4296 .dependencies
4297 .iter()
4298 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4299 .collect();
4300
4301 let resolved_requirement_purls: HashSet<String> = dependencies
4302 .iter()
4303 .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4304 .collect();
4305
4306 let unresolved_dependencies = main_pkg
4307 .dependencies
4308 .iter()
4309 .filter(|dep| {
4310 dep.purl.as_ref().is_some_and(|purl| {
4311 !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4312 })
4313 })
4314 .cloned()
4315 .collect::<Vec<_>>();
4316
4317 for dependency in &mut dependencies {
4318 if dependency
4319 .purl
4320 .as_ref()
4321 .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4322 {
4323 dependency.is_direct = Some(true);
4324 }
4325 }
4326
4327 main_pkg.dependencies = dependencies;
4328 main_pkg.dependencies.extend(unresolved_dependencies);
4329 main_pkg
4330 } else {
4331 default_package_data(path)
4332 }
4333}
4334
/// Returns the part of `purl` before its first `@`, or the whole string when there is
/// no `@`.
fn base_dependency_purl(purl: &str) -> String {
    let base = match purl.find('@') {
        Some(at) => &purl[..at],
        None => purl,
    };
    base.to_string()
}
4340
/// Two-level string map whose innermost values are lists of strings.
// NOTE(review): presumably section name -> option name -> values, as parsed from
// `setup.cfg` — confirm against `parse_setup_cfg` and `get_ini_value`.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4342
4343fn extract_from_setup_cfg(path: &Path) -> PackageData {
4344 let content = match read_file_to_string(path) {
4345 Ok(content) => content,
4346 Err(e) => {
4347 warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4348 return default_package_data(path);
4349 }
4350 };
4351
4352 let sections = parse_setup_cfg(&content);
4353 let name = get_ini_value(§ions, "metadata", "name");
4354 let version = get_ini_value(§ions, "metadata", "version");
4355 let description = get_ini_value(§ions, "metadata", "description");
4356 let author = get_ini_value(§ions, "metadata", "author");
4357 let author_email = get_ini_value(§ions, "metadata", "author_email");
4358 let maintainer = get_ini_value(§ions, "metadata", "maintainer");
4359 let maintainer_email = get_ini_value(§ions, "metadata", "maintainer_email");
4360 let license = get_ini_value(§ions, "metadata", "license");
4361 let mut homepage_url = get_ini_value(§ions, "metadata", "url");
4362 let classifiers = get_ini_values(§ions, "metadata", "classifiers");
4363 let keywords = parse_setup_cfg_keywords(get_ini_value(§ions, "metadata", "keywords"));
4364 let python_requires = get_ini_value(§ions, "options", "python_requires");
4365 let parsed_project_urls =
4366 parse_setup_cfg_project_urls(&get_ini_values(§ions, "metadata", "project_urls"));
4367 let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4368 let mut extra_data = HashMap::new();
4369
4370 let mut parties = Vec::new();
4371 if author.is_some() || author_email.is_some() {
4372 parties.push(Party {
4373 r#type: Some("person".to_string()),
4374 role: Some("author".to_string()),
4375 name: author,
4376 email: author_email,
4377 url: None,
4378 organization: None,
4379 organization_url: None,
4380 timezone: None,
4381 });
4382 }
4383
4384 if maintainer.is_some() || maintainer_email.is_some() {
4385 parties.push(Party {
4386 r#type: Some("person".to_string()),
4387 role: Some("maintainer".to_string()),
4388 name: maintainer,
4389 email: maintainer_email,
4390 url: None,
4391 organization: None,
4392 organization_url: None,
4393 timezone: None,
4394 });
4395 }
4396
4397 let declared_license_expression = None;
4398 let declared_license_expression_spdx = None;
4399 let license_detections = Vec::new();
4400 let extracted_license_statement = license.clone();
4401
4402 let dependencies = extract_setup_cfg_dependencies(§ions);
4403
4404 if let Some(value) = python_requires {
4405 extra_data.insert(
4406 "python_requires".to_string(),
4407 serde_json::Value::String(value),
4408 );
4409 }
4410
4411 apply_project_url_mappings(
4412 &parsed_project_urls,
4413 &mut homepage_url,
4414 &mut bug_tracking_url,
4415 &mut code_view_url,
4416 &mut vcs_url,
4417 &mut extra_data,
4418 );
4419
4420 let extra_data = if extra_data.is_empty() {
4421 None
4422 } else {
4423 Some(extra_data)
4424 };
4425
4426 let purl = name.as_ref().and_then(|n| {
4427 let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4428 if let Some(v) = &version {
4429 package_url.with_version(v).ok()?;
4430 }
4431 Some(package_url.to_string())
4432 });
4433
4434 PackageData {
4435 package_type: Some(PythonParser::PACKAGE_TYPE),
4436 namespace: None,
4437 name,
4438 version,
4439 qualifiers: None,
4440 subpath: None,
4441 primary_language: Some("Python".to_string()),
4442 description,
4443 release_date: None,
4444 parties,
4445 keywords,
4446 homepage_url,
4447 download_url: None,
4448 size: None,
4449 sha1: None,
4450 md5: None,
4451 sha256: None,
4452 sha512: None,
4453 bug_tracking_url,
4454 code_view_url,
4455 vcs_url,
4456 copyright: None,
4457 holder: None,
4458 declared_license_expression,
4459 declared_license_expression_spdx,
4460 license_detections,
4461 other_license_expression: None,
4462 other_license_expression_spdx: None,
4463 other_license_detections: Vec::new(),
4464 extracted_license_statement,
4465 notice_text: None,
4466 source_packages: Vec::new(),
4467 file_references: Vec::new(),
4468 is_private: has_private_classifier(&classifiers),
4469 is_virtual: false,
4470 extra_data,
4471 dependencies,
4472 repository_homepage_url: None,
4473 repository_download_url: None,
4474 api_data_url: None,
4475 datasource_id: Some(DatasourceId::PypiSetupCfg),
4476 purl,
4477 }
4478}
4479
/// Splits a comma-separated keyword string into trimmed, non-empty keywords.
///
/// `None` yields an empty list.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut parsed = Vec::new();
    if let Some(raw) = value {
        for piece in raw.split(',') {
            let keyword = piece.trim();
            if !keyword.is_empty() {
                parsed.push(keyword.to_owned());
            }
        }
    }
    parsed
}
4492
/// Parses `label = url` entries into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries
/// without an `=`, or with an empty label or URL, are skipped. Input order is
/// preserved.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((label, url)) = entry.split_once('=') else {
            continue;
        };
        let (label, url) = (label.trim(), url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }

    pairs
}
4508
4509fn apply_project_url_mappings(
4510 parsed_urls: &[(String, String)],
4511 homepage_url: &mut Option<String>,
4512 bug_tracking_url: &mut Option<String>,
4513 code_view_url: &mut Option<String>,
4514 vcs_url: &mut Option<String>,
4515 extra_data: &mut HashMap<String, serde_json::Value>,
4516) {
4517 for (label, url) in parsed_urls {
4518 let label_lower = label.to_lowercase();
4519
4520 if bug_tracking_url.is_none()
4521 && matches!(
4522 label_lower.as_str(),
4523 "tracker"
4524 | "bug reports"
4525 | "bug tracker"
4526 | "issues"
4527 | "issue tracker"
4528 | "github: issues"
4529 )
4530 {
4531 *bug_tracking_url = Some(url.clone());
4532 } else if code_view_url.is_none()
4533 && matches!(label_lower.as_str(), "source" | "source code" | "code")
4534 {
4535 *code_view_url = Some(url.clone());
4536 } else if vcs_url.is_none()
4537 && matches!(
4538 label_lower.as_str(),
4539 "github" | "gitlab" | "github: repo" | "repository"
4540 )
4541 {
4542 *vcs_url = Some(url.clone());
4543 } else if homepage_url.is_none()
4544 && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4545 {
4546 *homepage_url = Some(url.clone());
4547 } else if label_lower == "changelog" {
4548 extra_data.insert(
4549 "changelog_url".to_string(),
4550 serde_json::Value::String(url.clone()),
4551 );
4552 }
4553 }
4554
4555 let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4556 .iter()
4557 .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4558 .collect();
4559
4560 if !project_urls_json.is_empty() {
4561 extra_data.insert(
4562 "project_urls".to_string(),
4563 serde_json::Value::Object(project_urls_json),
4564 );
4565 }
4566}
4567
4568fn parse_setup_cfg(content: &str) -> IniSections {
4569 let mut sections: IniSections = HashMap::new();
4570 let mut current_section: Option<String> = None;
4571 let mut current_key: Option<String> = None;
4572
4573 for raw_line in content.lines() {
4574 let line = raw_line.trim_end_matches('\r');
4575 let trimmed = line.trim();
4576 if trimmed.is_empty() {
4577 continue;
4578 }
4579
4580 let stripped = line.trim_start();
4581 if stripped.starts_with('#') || stripped.starts_with(';') {
4582 continue;
4583 }
4584
4585 if stripped.starts_with('[') && stripped.ends_with(']') {
4586 let section_name = stripped
4587 .trim_start_matches('[')
4588 .trim_end_matches(']')
4589 .trim()
4590 .to_ascii_lowercase();
4591 current_section = if section_name.is_empty() {
4592 None
4593 } else {
4594 Some(section_name)
4595 };
4596 current_key = None;
4597 continue;
4598 }
4599
4600 if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
4601 if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
4602 let value = stripped.trim();
4603 if !value.is_empty() {
4604 sections
4605 .entry(section.clone())
4606 .or_default()
4607 .entry(key.clone())
4608 .or_default()
4609 .push(value.to_string());
4610 }
4611 }
4612 continue;
4613 }
4614
4615 if let Some((key, value)) = stripped.split_once('=')
4616 && let Some(section) = current_section.as_ref()
4617 {
4618 let key_name = key.trim().to_ascii_lowercase();
4619 let value_trimmed = value.trim();
4620 let entry = sections
4621 .entry(section.clone())
4622 .or_default()
4623 .entry(key_name.clone())
4624 .or_default();
4625 if !value_trimmed.is_empty() {
4626 entry.push(value_trimmed.to_string());
4627 }
4628 current_key = Some(key_name);
4629 }
4630 }
4631
4632 sections
4633}
4634
4635fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
4636 sections
4637 .get(§ion.to_ascii_lowercase())
4638 .and_then(|values| values.get(&key.to_ascii_lowercase()))
4639 .and_then(|entries| entries.first())
4640 .map(|value| value.trim().to_string())
4641}
4642
4643fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
4644 sections
4645 .get(§ion.to_ascii_lowercase())
4646 .and_then(|values| values.get(&key.to_ascii_lowercase()))
4647 .cloned()
4648 .unwrap_or_default()
4649}
4650
4651fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
4652 let mut dependencies = Vec::new();
4653
4654 for (sub_section, scope) in [
4655 ("install_requires", "install"),
4656 ("tests_require", "test"),
4657 ("setup_requires", "setup"),
4658 ] {
4659 let reqs = get_ini_values(sections, "options", sub_section);
4660 dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
4661 }
4662
4663 if let Some(extras) = sections.get("options.extras_require") {
4664 let mut extra_items: Vec<_> = extras.iter().collect();
4665 extra_items.sort_by_key(|(name, _)| *name);
4666 for (extra_name, reqs) in extra_items {
4667 dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
4668 }
4669 }
4670
4671 dependencies
4672}
4673
4674fn parse_setup_cfg_requirements(
4675 reqs: &[String],
4676 scope: &str,
4677 is_optional: bool,
4678) -> Vec<Dependency> {
4679 reqs.iter()
4680 .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
4681 .collect()
4682}
4683
4684fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
4685 let trimmed = req.trim();
4686 if trimmed.is_empty() || trimmed.starts_with('#') {
4687 return None;
4688 }
4689
4690 let name = extract_setup_cfg_dependency_name(trimmed)?;
4691 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4692
4693 Some(Dependency {
4694 purl: Some(purl.to_string()),
4695 extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
4696 scope: Some(scope.to_string()),
4697 is_runtime: Some(true),
4698 is_optional: Some(is_optional),
4699 is_pinned: Some(false),
4700 is_direct: Some(true),
4701 resolved_package: None,
4702 extra_data: None,
4703 })
4704}
4705
/// Extracts the package name from the start of a requirement string.
///
/// The name is everything before the first whitespace character or the first
/// of `<`, `>`, `=`, `!`, `~`, `;`, `[`, `(`, `@`. The last two cover the
/// PEP 508 forms `name(>=1.0)` and `name@https://…`, where no space separates
/// the name from what follows. Returns `None` when the input is blank or
/// starts with a delimiter.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let is_delimiter = |c: char| {
        c.is_whitespace() || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
    };

    // `split` always yields at least one (possibly empty) piece.
    let name = req.trim().split(is_delimiter).next()?;
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
4722
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    // Splitting on whitespace runs and concatenating the pieces removes
    // exactly the whitespace characters.
    req.split_whitespace().collect()
}
4726
/// Finds a quoted `key=value` assignment by plain text search and returns the
/// value.
///
/// Tries `key="`, `key ="`, `key= "`, `key = "` and then the same four
/// spellings with single quotes, in that order. The first spelling that occurs
/// in `content` and is followed by a matching closing quote wins. The value
/// ends at the next occurrence of the *same* quote character that opened it,
/// so an apostrophe inside a double-quoted value (or vice versa) does not cut
/// the value short. Escaped quotes are not interpreted.
///
/// Note: this is a substring search, so a key such as `name` also matches
/// inside a longer identifier like `package_name=`.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    const QUOTES: [char; 2] = ['"', '\''];
    const SEPARATORS: [&str; 4] = ["=", " =", "= ", " = "];

    for quote in QUOTES {
        for separator in SEPARATORS {
            let pattern = format!("{key}{separator}{quote}");
            let Some(start) = content.find(&pattern) else {
                continue;
            };

            let rest = &content[start + pattern.len()..];
            if let Some(end) = rest.find(quote) {
                return Some(rest[..end].to_string());
            }
        }
    }

    None
}
4752
4753fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
4754 let mut dependencies = Vec::new();
4755
4756 if let Some(tests_deps) = extract_tests_require(content) {
4757 dependencies.extend(tests_deps);
4758 }
4759
4760 if let Some(extras_deps) = extract_extras_require(content) {
4761 dependencies.extend(extras_deps);
4762 }
4763
4764 dependencies
4765}
4766
4767fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
4768 let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
4769 let re = Regex::new(pattern).ok()?;
4770 let captures = re.captures(content)?;
4771 let deps_str = captures.get(1)?.as_str();
4772
4773 let deps = parse_setup_py_dep_list(deps_str, "test", true);
4774 if deps.is_empty() { None } else { Some(deps) }
4775}
4776
4777fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
4778 let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
4779 let re = Regex::new(pattern).ok()?;
4780 let captures = re.captures(content)?;
4781 let dict_content = captures.get(1)?.as_str();
4782
4783 let mut all_deps = Vec::new();
4784
4785 let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
4786 let entry_re = Regex::new(entry_pattern).ok()?;
4787
4788 for entry_cap in entry_re.captures_iter(dict_content) {
4789 if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
4790 let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
4791 all_deps.extend(deps);
4792 }
4793 }
4794
4795 if all_deps.is_empty() {
4796 None
4797 } else {
4798 Some(all_deps)
4799 }
4800}
4801
4802fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
4803 let dep_pattern = r#"['"]([^'"]+)['"]"#;
4804 let re = match Regex::new(dep_pattern) {
4805 Ok(r) => r,
4806 Err(_) => return Vec::new(),
4807 };
4808
4809 re.captures_iter(deps_str)
4810 .filter_map(|cap| {
4811 let dep_str = cap.get(1)?.as_str().trim();
4812 if dep_str.is_empty() {
4813 return None;
4814 }
4815
4816 let name = extract_setup_cfg_dependency_name(dep_str)?;
4817 let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4818
4819 Some(Dependency {
4820 purl: Some(purl.to_string()),
4821 extracted_requirement: Some(dep_str.to_string()),
4822 scope: Some(scope.to_string()),
4823 is_runtime: Some(true),
4824 is_optional: Some(is_optional),
4825 is_pinned: Some(false),
4826 is_direct: Some(true),
4827 resolved_package: None,
4828 extra_data: None,
4829 })
4830 })
4831 .collect()
4832}
4833
4834pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
4836 let content = read_file_to_string(path).map_err(|e| e.to_string())?;
4837 toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
4838}
4839
4840fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<String>) {
4851 let mut file = match File::open(path) {
4852 Ok(f) => f,
4853 Err(_) => return (None, None),
4854 };
4855
4856 let metadata = match file.metadata() {
4857 Ok(m) => m,
4858 Err(_) => return (None, None),
4859 };
4860 let size = metadata.len();
4861
4862 let mut hasher = Sha256::new();
4863 let mut buffer = vec![0; 8192];
4864
4865 loop {
4866 match file.read(&mut buffer) {
4867 Ok(0) => break,
4868 Ok(n) => hasher.update(&buffer[..n]),
4869 Err(_) => return (Some(size), None),
4870 }
4871 }
4872
4873 let hash = hex::encode(hasher.finalize());
4874 (Some(size), Some(hash))
4875}
4876
4877fn default_package_data(path: &Path) -> PackageData {
4878 PackageData {
4879 package_type: Some(PythonParser::PACKAGE_TYPE),
4880 primary_language: Some("Python".to_string()),
4881 datasource_id: infer_python_datasource_id(path),
4882 ..Default::default()
4883 }
4884}
4885
4886fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
4887 let file_name = path.file_name().and_then(|name| name.to_str());
4888
4889 match file_name {
4890 Some("pyproject.toml") => {
4891 if read_toml_file(path)
4892 .ok()
4893 .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
4894 .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
4895 .is_some()
4896 {
4897 Some(DatasourceId::PypiPoetryPyprojectToml)
4898 } else {
4899 Some(DatasourceId::PypiPyprojectToml)
4900 }
4901 }
4902 Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
4903 Some(DatasourceId::PypiSetupPy)
4904 }
4905 Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
4906 Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
4907 Some("METADATA") if is_installed_wheel_metadata_path(path) => {
4908 Some(DatasourceId::PypiWheelMetadata)
4909 }
4910 Some("pypi.json") => Some(DatasourceId::PypiJson),
4911 Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
4912 Some("origin.json") if is_pip_cache_origin_json(path) => {
4913 Some(DatasourceId::PypiPipOriginJson)
4914 }
4915 _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
4916 Some(DatasourceId::PypiSdist)
4917 }
4918 _ if path
4919 .extension()
4920 .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
4921 {
4922 Some(DatasourceId::PypiWheel)
4923 }
4924 _ if path
4925 .extension()
4926 .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
4927 {
4928 Some(DatasourceId::PypiEgg)
4929 }
4930 _ => None,
4931 }
4932}
4933
// Parser registration for the Python ecosystem.
// NOTE(review): the argument roles below are inferred from their values
// (description, path glob patterns, package type, primary language,
// documentation URL) — confirm against the `register_parser!` definition.
// The glob list includes generic archive patterns (`*.tar.gz`, `*.zip`, …) and
// `origin.json`, which are broader than Python-only files.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);