Skip to main content

provenant/parsers/
python.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34use crate::models::{
35    DatasourceId, Dependency, FileReference, PackageData, PackageType, Party, Sha256Digest,
36};
37use crate::parser_warn as warn;
38use crate::parsers::utils::{read_file_to_string, split_name_email};
39use base64::Engine;
40use base64::engine::general_purpose::URL_SAFE_NO_PAD;
41use bzip2::read::BzDecoder;
42use csv::ReaderBuilder;
43use flate2::read::GzDecoder;
44use liblzma::read::XzDecoder;
45use packageurl::PackageUrl;
46use regex::Regex;
47use ruff_python_ast as ast;
48use ruff_python_parser::parse_module;
49use serde_json::{Map as JsonMap, Value as JsonValue};
50use sha2::{Digest, Sha256};
51use std::collections::{HashMap, HashSet};
52use std::fs::File;
53use std::io::Read;
54use std::path::{Component, Path, PathBuf};
55use tar::Archive;
56use toml::Value as TomlValue;
57use toml::map::Map as TomlMap;
58use zip::ZipArchive;
59
60use super::PackageParser;
61use super::license_normalization::{
62    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
63    normalize_spdx_expression,
64};
65use super::pep508::parse_pep508_requirement;
66
// Table and key names looked up in pyproject.toml.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_EXTRAS: &str = "extras";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

/// Five optional URL strings extracted from project metadata.
// NOTE(review): the meaning of each tuple position is defined by the code that
// builds this tuple, which is outside this section — confirm before relying on order.
type ProjectUrls = (
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
);

// Limits applied while analysing setup.py sources.
/// Largest setup.py accepted, in bytes (1 MiB).
const MAX_SETUP_PY_BYTES: usize = 1024 * 1024;
/// Upper bound on the number of AST nodes visited in a setup.py.
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
/// Upper bound on AST nesting depth in a setup.py.
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

// Limits applied while reading archives.
/// 100 MiB cap, checked against both the on-disk archive size and the running
/// total of entry sizes.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;
/// 50 MiB cap on any single archive entry.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
/// Highest tolerated uncompressed:compressed ratio (100:1).
const MAX_COMPRESSION_RATIO: f64 = 100.0;
98
/// Python package parser supporting 11 manifest formats.
///
/// Dispatches on the file name or extension to extract metadata from
/// pyproject.toml, setup.cfg, setup.py-style scripts, PKG-INFO, installed
/// `.dist-info/METADATA`, pip cache `origin.json`, `pypi.json`,
/// `pip-inspect.deplock`, sdist archives, and `.whl` / `.egg` archives.
///
/// # Security
///
/// setup.py files are parsed using AST analysis rather than code execution to prevent
/// arbitrary code execution during scanning. See `extract_from_setup_py_ast` for details.
#[derive(Debug, Clone, Copy, Default)]
pub struct PythonParser;
109
/// Container format of a Python source distribution, as selected from the
/// archive's file-name suffix.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum PythonSdistArchiveFormat {
    /// `.tar.gz`
    TarGz,
    /// `.tgz`
    Tgz,
    /// `.tar.bz2`
    TarBz2,
    /// `.tar.xz`
    TarXz,
    /// `.zip`
    Zip,
}
118
/// A ZIP member accepted by `collect_validated_zip_entries`.
#[derive(Clone, Debug, PartialEq, Eq)]
struct ValidatedZipEntry {
    /// Index of the member inside the archive.
    index: usize,
    /// Member path as returned by `normalize_archive_entry_path`.
    name: String,
}
124
125impl PackageParser for PythonParser {
126    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
127
128    fn extract_packages(path: &Path) -> Vec<PackageData> {
129        vec![
130            if path.file_name().unwrap_or_default() == "pyproject.toml" {
131                extract_from_pyproject_toml(path)
132            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
133                extract_from_setup_cfg(path)
134            } else if is_setup_py_like_path(path) {
135                return extract_setup_py_packages(path);
136            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
137                extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
138            } else if is_installed_wheel_metadata_path(path) {
139                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
140            } else if is_pip_cache_origin_json(path) {
141                extract_from_pip_origin_json(path)
142            } else if path.file_name().unwrap_or_default() == "pypi.json" {
143                extract_from_pypi_json(path)
144            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
145                extract_from_pip_inspect(path)
146            } else if is_python_sdist_archive_path(path) {
147                extract_from_sdist_archive(path)
148            } else if path
149                .extension()
150                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
151            {
152                extract_from_wheel_archive(path)
153            } else if path
154                .extension()
155                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
156            {
157                extract_from_egg_archive(path)
158            } else {
159                default_package_data(path)
160            },
161        ]
162    }
163
164    fn is_match(path: &Path) -> bool {
165        if let Some(filename) = path.file_name()
166            && (filename == "pyproject.toml"
167                || filename == "setup.cfg"
168                || is_setup_py_like_path(path)
169                || filename == "PKG-INFO"
170                || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
171                || filename == "pypi.json"
172                || filename == "pip-inspect.deplock"
173                || is_pip_cache_origin_json(path))
174        {
175            return true;
176        }
177
178        if let Some(extension) = path.extension() {
179            let ext = extension.to_string_lossy().to_lowercase();
180            if (ext == "whl" && is_valid_wheel_archive_path(path))
181                || ext == "egg"
182                || is_python_sdist_archive_path(path)
183            {
184                return true;
185            }
186        }
187
188        false
189    }
190}
191
/// Returns `true` when the final path component is `setup.py` or ends with
/// `_setup.py`. Names that are not valid UTF-8 never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    let Some(name) = path.file_name().and_then(|raw| raw.to_str()) else {
        return false;
    };

    name == "setup.py" || name.ends_with("_setup.py")
}
197
/// Returns `true` for a file named `METADATA` whose parent directory name
/// ends with `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    if path.file_name().and_then(|raw| raw.to_str()) != Some("METADATA") {
        return false;
    }

    let parent_dir_name = path
        .parent()
        .and_then(Path::file_name)
        .and_then(|raw| raw.to_str());

    matches!(parent_dir_name, Some(dir) if dir.ends_with(".dist-info"))
}
206
/// Values read from the `WHEEL` file that sits beside an installed
/// `.dist-info/METADATA`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct InstalledWheelMetadata {
    /// Every `Tag` header value, in file order.
    wheel_tags: Vec<String>,
    /// First `Wheel-Version` header value, if present.
    wheel_version: Option<String>,
    /// First `Generator` header value, if present.
    wheel_generator: Option<String>,
    /// `Root-Is-Purelib` parsed as a boolean; `None` when absent or not
    /// `true`/`false`.
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`.
    compressed_tag: Option<String>,
}
215
216fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
217    let Some(parent) = path.parent() else {
218        return;
219    };
220
221    if !parent
222        .file_name()
223        .and_then(|name| name.to_str())
224        .is_some_and(|name| name.ends_with(".dist-info"))
225    {
226        return;
227    }
228
229    let wheel_path = parent.join("WHEEL");
230    if !wheel_path.exists() {
231        return;
232    }
233
234    let Ok(content) = read_file_to_string(&wheel_path) else {
235        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
236        return;
237    };
238
239    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
240        return;
241    };
242
243    apply_installed_wheel_metadata(package_data, &wheel_metadata);
244}
245
246fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
247    use super::rfc822::{get_header_all, get_header_first};
248
249    let metadata = super::rfc822::parse_rfc822_content(content);
250    let wheel_tags = get_header_all(&metadata.headers, "tag");
251    if wheel_tags.is_empty() {
252        return None;
253    }
254
255    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
256    let wheel_generator = get_header_first(&metadata.headers, "generator");
257    let root_is_purelib =
258        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
259            match value.to_ascii_lowercase().as_str() {
260                "true" => Some(true),
261                "false" => Some(false),
262                _ => None,
263            }
264        });
265
266    let compressed_tag = compress_wheel_tags(&wheel_tags);
267
268    Some(InstalledWheelMetadata {
269        wheel_tags,
270        wheel_version,
271        wheel_generator,
272        root_is_purelib,
273        compressed_tag,
274    })
275}
276
/// Collapses several `python-abi-platform` tags into one compressed tag such
/// as `py2.py3-none-any`.
///
/// * No tags: `None`.
/// * Exactly one tag: that tag, returned unchanged and unvalidated.
/// * Several tags: every tag must split into three `-`-separated parts (the
///   third part keeps any further `-`), and all tags must share the same ABI
///   and platform parts; otherwise `None`. The python parts are joined with
///   `.` in input order.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => return None,
        [only] => return Some(only.clone()),
        _ => {}
    }

    let mut python_parts: Vec<&str> = Vec::with_capacity(tags.len());
    let mut shared_suffix: Option<(&str, &str)> = None;

    for tag in tags {
        let (python, remainder) = tag.split_once('-')?;
        let (abi, platform) = remainder.split_once('-')?;

        if let Some(expected) = shared_suffix {
            if expected != (abi, platform) {
                return None;
            }
        }

        shared_suffix = Some((abi, platform));
        python_parts.push(python);
    }

    let (abi, platform) = shared_suffix?;
    Some(format!("{}-{}-{}", python_parts.join("."), abi, platform))
}
314
315fn apply_installed_wheel_metadata(
316    package_data: &mut PackageData,
317    wheel_metadata: &InstalledWheelMetadata,
318) {
319    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
320    extra_data.insert(
321        "wheel_tags".to_string(),
322        JsonValue::Array(
323            wheel_metadata
324                .wheel_tags
325                .iter()
326                .cloned()
327                .map(JsonValue::String)
328                .collect(),
329        ),
330    );
331
332    if let Some(wheel_version) = &wheel_metadata.wheel_version {
333        extra_data.insert(
334            "wheel_version".to_string(),
335            JsonValue::String(wheel_version.clone()),
336        );
337    }
338
339    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
340        extra_data.insert(
341            "wheel_generator".to_string(),
342            JsonValue::String(wheel_generator.clone()),
343        );
344    }
345
346    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
347        extra_data.insert(
348            "root_is_purelib".to_string(),
349            JsonValue::Bool(root_is_purelib),
350        );
351    }
352
353    if let (Some(name), Some(version), Some(extension)) = (
354        package_data.name.as_deref(),
355        package_data.version.as_deref(),
356        wheel_metadata.compressed_tag.as_deref(),
357    ) {
358        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
359    }
360}
361
/// Returns `true` for a file named `origin.json` that has an ancestor
/// directory named `wheels` (compared ASCII case-insensitively).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    if path.file_name().and_then(|raw| raw.to_str()) != Some("origin.json") {
        return false;
    }

    // `skip(1)` drops the path itself so only enclosing directories are tested.
    path.ancestors()
        .skip(1)
        .filter_map(|dir| dir.file_name())
        .filter_map(|raw| raw.to_str())
        .any(|dir_name| dir_name.eq_ignore_ascii_case("wheels"))
}
371
372fn extract_from_pip_origin_json(path: &Path) -> PackageData {
373    let content = match read_file_to_string(path) {
374        Ok(content) => content,
375        Err(e) => {
376            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
377            return default_package_data(path);
378        }
379    };
380
381    let root: JsonValue = match serde_json::from_str(&content) {
382        Ok(root) => root,
383        Err(e) => {
384            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
385            return default_package_data(path);
386        }
387    };
388
389    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
390        warn!("No url found in pip cache origin.json at {:?}", path);
391        return default_package_data(path);
392    };
393
394    let sibling_wheel = find_sibling_cached_wheel(path);
395    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
396        sibling_wheel
397            .as_ref()
398            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
399    });
400
401    let Some((name, version)) = name_version else {
402        warn!(
403            "Failed to infer package name/version from pip cache origin.json at {:?}",
404            path
405        );
406        return default_package_data(path);
407    };
408
409    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
410        build_pypi_urls(Some(&name), Some(&version));
411    let purl = sibling_wheel
412        .as_ref()
413        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
414        .or(plain_purl);
415
416    PackageData {
417        package_type: Some(PythonParser::PACKAGE_TYPE),
418        primary_language: Some("Python".to_string()),
419        name: Some(name),
420        version: Some(version),
421        datasource_id: Some(DatasourceId::PypiPipOriginJson),
422        download_url: Some(download_url.to_string()),
423        sha256: extract_sha256_from_origin_json(&root)
424            .and_then(|h| Sha256Digest::from_hex(&h).ok()),
425        repository_homepage_url,
426        repository_download_url,
427        api_data_url,
428        purl,
429        ..Default::default()
430    }
431}
432
433fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
434    let parent = path.parent()?;
435    let entries = parent.read_dir().ok()?;
436
437    for entry in entries.flatten() {
438        let sibling_path = entry.path();
439        if sibling_path
440            .extension()
441            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
442            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
443        {
444            return Some(wheel_info);
445        }
446    }
447
448    None
449}
450
451fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
452    let file_name = url.rsplit('/').next()?;
453
454    if file_name.ends_with(".whl") {
455        return parse_wheel_filename(Path::new(file_name))
456            .map(|wheel_info| (wheel_info.name, wheel_info.version));
457    }
458
459    let stem = strip_python_archive_extension(file_name)?;
460    let (name, version) = stem.rsplit_once('-')?;
461    if name.is_empty() || version.is_empty() {
462        return None;
463    }
464
465    Some((name.replace('_', "-"), version.to_string()))
466}
467
/// Removes a recognised Python archive suffix from `file_name`.
///
/// Returns the text before the suffix, or `None` when the name ends in none
/// of `.tar.gz`, `.tar.bz2`, `.tar.xz`, `.tgz`, `.zip`, `.whl`. Matching is
/// case-sensitive.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] = [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES.iter() {
        if let Some(stem) = file_name.strip_suffix(*suffix) {
            return Some(stem);
        }
    }

    None
}
473
474fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
475    root.pointer("/archive_info/hashes/sha256")
476        .and_then(|value| value.as_str())
477        .map(ToOwned::to_owned)
478        .or_else(|| {
479            root.pointer("/archive_info/hash")
480                .and_then(|value| value.as_str())
481                .and_then(normalize_origin_hash)
482        })
483}
484
/// Reduces a hash string to its bare digest.
///
/// A `sha256=` or `sha256:` prefix is stripped and the remainder returned
/// as-is (it is not validated here). Without such a prefix the input is
/// accepted only when it is exactly 64 ASCII hex digits.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let after_prefix = ["sha256=", "sha256:"]
        .iter()
        .find_map(|prefix| hash.strip_prefix(*prefix));

    match after_prefix {
        Some(digest) => Some(digest.to_string()),
        None => {
            let is_bare_sha256 = hash.len() == 64 && hash.bytes().all(|b| b.is_ascii_hexdigit());
            if is_bare_sha256 {
                Some(hash.to_string())
            } else {
                None
            }
        }
    }
}
497
498fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
499    let content = match read_file_to_string(path) {
500        Ok(content) => content,
501        Err(e) => {
502            warn!("Failed to read metadata at {:?}: {}", path, e);
503            return default_package_data(path);
504        }
505    };
506
507    let metadata = super::rfc822::parse_rfc822_content(&content);
508    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
509    merge_sibling_metadata_dependencies(path, &mut package_data);
510    merge_sibling_metadata_file_references(path, &mut package_data);
511    if datasource_id == DatasourceId::PypiWheelMetadata {
512        merge_sibling_wheel_metadata(path, &mut package_data);
513    }
514    package_data
515}
516
517fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
518    let mut extra_dependencies = Vec::new();
519
520    if let Some(parent) = path.parent() {
521        let direct_requires = parent.join("requires.txt");
522        if direct_requires.exists()
523            && let Ok(content) = read_file_to_string(&direct_requires)
524        {
525            extra_dependencies.extend(parse_requires_txt(&content));
526        }
527
528        let sibling_egg_info_requires = parent
529            .read_dir()
530            .ok()
531            .into_iter()
532            .flatten()
533            .flatten()
534            .find_map(|entry| {
535                let child_path = entry.path();
536                if child_path.is_dir()
537                    && child_path
538                        .file_name()
539                        .and_then(|name| name.to_str())
540                        .is_some_and(|name| name.ends_with(".egg-info"))
541                {
542                    let requires = child_path.join("requires.txt");
543                    requires.exists().then_some(requires)
544                } else {
545                    None
546                }
547            });
548
549        if let Some(requires_path) = sibling_egg_info_requires
550            && let Ok(content) = read_file_to_string(&requires_path)
551        {
552            extra_dependencies.extend(parse_requires_txt(&content));
553        }
554    }
555
556    for dependency in extra_dependencies {
557        if !package_data.dependencies.iter().any(|existing| {
558            existing.purl == dependency.purl
559                && existing.scope == dependency.scope
560                && existing.extracted_requirement == dependency.extracted_requirement
561                && existing.extra_data == dependency.extra_data
562        }) {
563            package_data.dependencies.push(dependency);
564        }
565    }
566}
567
568fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
569    let mut extra_refs = Vec::new();
570
571    if let Some(parent) = path.parent() {
572        let record_path = parent.join("RECORD");
573        if record_path.exists()
574            && let Ok(content) = read_file_to_string(&record_path)
575        {
576            extra_refs.extend(parse_record_csv(&content));
577        }
578
579        let installed_files_path = parent.join("installed-files.txt");
580        if installed_files_path.exists()
581            && let Ok(content) = read_file_to_string(&installed_files_path)
582        {
583            extra_refs.extend(parse_installed_files_txt(&content));
584        }
585
586        let sources_path = parent.join("SOURCES.txt");
587        if sources_path.exists()
588            && let Ok(content) = read_file_to_string(&sources_path)
589        {
590            extra_refs.extend(parse_sources_txt(&content));
591        }
592    }
593
594    for file_ref in extra_refs {
595        if !package_data
596            .file_references
597            .iter()
598            .any(|existing| existing.path == file_ref.path)
599        {
600            package_data.file_references.push(file_ref);
601        }
602    }
603}
604
605fn collect_validated_zip_entries<R: Read + std::io::Seek>(
606    archive: &mut ZipArchive<R>,
607    path: &Path,
608    archive_type: &str,
609) -> Result<Vec<ValidatedZipEntry>, String> {
610    let mut total_extracted = 0u64;
611    let mut entries = Vec::new();
612
613    for i in 0..archive.len() {
614        if let Ok(file) = archive.by_index_raw(i) {
615            let compressed_size = file.compressed_size();
616            let uncompressed_size = file.size();
617            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
618                warn!(
619                    "Skipping unsafe path in {} {:?}: {}",
620                    archive_type,
621                    path,
622                    file.name()
623                );
624                continue;
625            };
626
627            if compressed_size > 0 {
628                let ratio = uncompressed_size as f64 / compressed_size as f64;
629                if ratio > MAX_COMPRESSION_RATIO {
630                    warn!(
631                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
632                        archive_type, path, ratio
633                    );
634                    continue;
635                }
636            }
637
638            if uncompressed_size > MAX_FILE_SIZE {
639                warn!(
640                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
641                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
642                );
643                continue;
644            }
645
646            total_extracted += uncompressed_size;
647            if total_extracted > MAX_ARCHIVE_SIZE {
648                let msg = format!(
649                    "Total extracted size exceeds limit for {} {:?}",
650                    archive_type, path
651                );
652                warn!("{}", msg);
653                return Err(msg);
654            }
655
656            entries.push(ValidatedZipEntry {
657                index: i,
658                name: entry_name,
659            });
660        }
661    }
662
663    Ok(entries)
664}
665
666fn is_python_sdist_archive_path(path: &Path) -> bool {
667    detect_python_sdist_archive_format(path).is_some()
668}
669
670fn is_valid_wheel_archive_path(path: &Path) -> bool {
671    if !path.is_file() {
672        return true;
673    }
674
675    let file = match File::open(path) {
676        Ok(file) => file,
677        Err(_) => return false,
678    };
679    let mut archive = match ZipArchive::new(file) {
680        Ok(archive) => archive,
681        Err(_) => return false,
682    };
683
684    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
685        Ok(entries) => entries,
686        Err(_) => return false,
687    };
688
689    find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA").is_some()
690}
691
692fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
693    let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
694
695    if !is_likely_python_sdist_filename(&file_name) {
696        return None;
697    }
698
699    if file_name.ends_with(".tar.gz") {
700        tar_gz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarGz)
701    } else if file_name.ends_with(".tgz") {
702        tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
703    } else if file_name.ends_with(".tar.bz2") {
704        tar_bz2_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarBz2)
705    } else if file_name.ends_with(".tar.xz") {
706        tar_xz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarXz)
707    } else if file_name.ends_with(".zip") {
708        zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
709    } else {
710        None
711    }
712}
713
714fn tar_gz_sdist_contains_pkg_info(path: &Path) -> bool {
715    let Some(compressed_size) = compressed_archive_size(path) else {
716        return false;
717    };
718    let file = match File::open(path) {
719        Ok(file) => file,
720        Err(_) => return false,
721    };
722    let decoder = GzDecoder::new(file);
723    tar_sdist_contains_pkg_info(path, decoder, "tar.gz", compressed_size)
724}
725
726fn tar_bz2_sdist_contains_pkg_info(path: &Path) -> bool {
727    let Some(compressed_size) = compressed_archive_size(path) else {
728        return false;
729    };
730    let file = match File::open(path) {
731        Ok(file) => file,
732        Err(_) => return false,
733    };
734    let decoder = BzDecoder::new(file);
735    tar_sdist_contains_pkg_info(path, decoder, "tar.bz2", compressed_size)
736}
737
738fn tar_xz_sdist_contains_pkg_info(path: &Path) -> bool {
739    let Some(compressed_size) = compressed_archive_size(path) else {
740        return false;
741    };
742    let file = match File::open(path) {
743        Ok(file) => file,
744        Err(_) => return false,
745    };
746    let decoder = XzDecoder::new(file);
747    tar_sdist_contains_pkg_info(path, decoder, "tar.xz", compressed_size)
748}
749
/// Returns the on-disk size of `path` in bytes, or `None` when its metadata
/// cannot be read.
fn compressed_archive_size(path: &Path) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(info) => Some(info.len()),
        Err(_) => None,
    }
}
753
754fn tar_sdist_contains_pkg_info<R: Read>(
755    path: &Path,
756    reader: R,
757    archive_type: &str,
758    compressed_size: u64,
759) -> bool {
760    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
761    else {
762        return false;
763    };
764
765    select_sdist_pkginfo_entry(path, &entries).is_some()
766}
767
768fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
769    if !path.is_file() {
770        return true;
771    }
772
773    let Some(compressed_size) = compressed_archive_size(path) else {
774        return false;
775    };
776    let file = match File::open(path) {
777        Ok(file) => file,
778        Err(_) => return false,
779    };
780    let decoder = GzDecoder::new(file);
781    tar_sdist_contains_pkg_info(path, decoder, "tgz", compressed_size)
782}
783
784fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
785    if !path.is_file() {
786        return true;
787    }
788
789    let file = match File::open(path) {
790        Ok(file) => file,
791        Err(_) => return false,
792    };
793    let mut archive = match ZipArchive::new(file) {
794        Ok(archive) => archive,
795        Err(_) => return false,
796    };
797
798    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
799        Ok(entries) => entries,
800        Err(_) => return false,
801    };
802    let metadata_entries: Vec<_> = validated_entries
803        .iter()
804        .filter(|entry| entry.name.ends_with("/PKG-INFO"))
805        .filter_map(|entry| {
806            read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
807                .ok()
808                .map(|content| (entry.name.clone(), content))
809        })
810        .collect();
811
812    has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
813}
814
815fn is_likely_python_sdist_filename(file_name: &str) -> bool {
816    let Some(stem) = strip_python_archive_extension(file_name) else {
817        return false;
818    };
819
820    let Some((name, version)) = stem.rsplit_once('-') else {
821        return false;
822    };
823
824    !name.is_empty()
825        && !version.is_empty()
826        && version.chars().any(|ch| ch.is_ascii_digit())
827        && name
828            .chars()
829            .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
830}
831
832fn extract_from_sdist_archive(path: &Path) -> PackageData {
833    let metadata = match std::fs::metadata(path) {
834        Ok(m) => m,
835        Err(e) => {
836            warn!(
837                "Failed to read metadata for sdist archive {:?}: {}",
838                path, e
839            );
840            return default_package_data(path);
841        }
842    };
843
844    if metadata.len() > MAX_ARCHIVE_SIZE {
845        warn!(
846            "sdist archive too large: {} bytes (limit: {} bytes)",
847            metadata.len(),
848            MAX_ARCHIVE_SIZE
849        );
850        return default_package_data(path);
851    }
852
853    let Some(format) = detect_python_sdist_archive_format(path) else {
854        return default_package_data(path);
855    };
856
857    let mut package_data = match format {
858        PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
859            let file = match File::open(path) {
860                Ok(file) => file,
861                Err(e) => {
862                    warn!("Failed to open sdist archive {:?}: {}", path, e);
863                    return default_package_data(path);
864                }
865            };
866            let decoder = GzDecoder::new(file);
867            extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
868        }
869        PythonSdistArchiveFormat::TarBz2 => {
870            let file = match File::open(path) {
871                Ok(file) => file,
872                Err(e) => {
873                    warn!("Failed to open sdist archive {:?}: {}", path, e);
874                    return default_package_data(path);
875                }
876            };
877            let decoder = BzDecoder::new(file);
878            extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
879        }
880        PythonSdistArchiveFormat::TarXz => {
881            let file = match File::open(path) {
882                Ok(file) => file,
883                Err(e) => {
884                    warn!("Failed to open sdist archive {:?}: {}", path, e);
885                    return default_package_data(path);
886                }
887            };
888            let decoder = XzDecoder::new(file);
889            extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
890        }
891        PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
892    };
893
894    if package_data.package_type.is_some() {
895        let (size, sha256) = calculate_file_checksums(path);
896        package_data.size = size;
897        package_data.sha256 = sha256;
898    }
899
900    package_data
901}
902
903fn extract_from_tar_sdist_archive<R: Read>(
904    path: &Path,
905    reader: R,
906    archive_type: &str,
907    compressed_size: u64,
908) -> PackageData {
909    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
910    else {
911        return default_package_data(path);
912    };
913
914    build_sdist_package_data(path, entries)
915}
916
917fn collect_tar_sdist_entries<R: Read>(
918    path: &Path,
919    reader: R,
920    archive_type: &str,
921    compressed_size: u64,
922) -> Option<Vec<(String, String)>> {
923    let mut archive = Archive::new(reader);
924    let archive_entries = match archive.entries() {
925        Ok(entries) => entries,
926        Err(e) => {
927            warn!(
928                "Failed to read {} sdist archive {:?}: {}",
929                archive_type, path, e
930            );
931            return None;
932        }
933    };
934
935    let mut total_extracted = 0u64;
936    let mut entries = Vec::new();
937
938    for entry_result in archive_entries {
939        let mut entry = match entry_result {
940            Ok(entry) => entry,
941            Err(e) => {
942                warn!(
943                    "Failed to read {} sdist entry from {:?}: {}",
944                    archive_type, path, e
945                );
946                continue;
947            }
948        };
949
950        let entry_size = entry.size();
951        if entry_size > MAX_FILE_SIZE {
952            warn!(
953                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
954                archive_type, path, entry_size, MAX_FILE_SIZE
955            );
956            continue;
957        }
958
959        total_extracted += entry_size;
960        if total_extracted > MAX_ARCHIVE_SIZE {
961            warn!(
962                "Total extracted size exceeds limit for {} sdist {:?}",
963                archive_type, path
964            );
965            return None;
966        }
967
968        if compressed_size > 0 {
969            let ratio = total_extracted as f64 / compressed_size as f64;
970            if ratio > MAX_COMPRESSION_RATIO {
971                warn!(
972                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
973                    archive_type, path, ratio
974                );
975                return None;
976            }
977        }
978
979        let entry_path = match entry.path() {
980            Ok(path) => path.to_string_lossy().replace('\\', "/"),
981            Err(e) => {
982                warn!(
983                    "Failed to get {} sdist entry path from {:?}: {}",
984                    archive_type, path, e
985                );
986                continue;
987            }
988        };
989
990        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
991            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
992            continue;
993        };
994
995        if !is_relevant_sdist_text_entry(&entry_path) {
996            continue;
997        }
998
999        if let Ok(content) = read_limited_utf8(
1000            &mut entry,
1001            MAX_FILE_SIZE,
1002            &format!("{} entry {}", archive_type, entry_path),
1003        ) {
1004            entries.push((entry_path, content));
1005        }
1006    }
1007
1008    Some(entries)
1009}
1010
1011fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
1012    let file = match File::open(path) {
1013        Ok(file) => file,
1014        Err(e) => {
1015            warn!("Failed to open zip sdist archive {:?}: {}", path, e);
1016            return default_package_data(path);
1017        }
1018    };
1019
1020    let mut archive = match ZipArchive::new(file) {
1021        Ok(archive) => archive,
1022        Err(e) => {
1023            warn!("Failed to read zip sdist archive {:?}: {}", path, e);
1024            return default_package_data(path);
1025        }
1026    };
1027
1028    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
1029        Ok(entries) => entries,
1030        Err(_) => return default_package_data(path),
1031    };
1032
1033    let mut entries = Vec::new();
1034    for entry in validated_entries.iter() {
1035        if !is_relevant_sdist_text_entry(&entry.name) {
1036            continue;
1037        }
1038
1039        if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
1040            entries.push((entry.name.clone(), content));
1041        }
1042    }
1043
1044    build_sdist_package_data(path, entries)
1045}
1046
/// Returns `true` when `entry_path` ends with one of the sdist metadata file names
/// (`PKG-INFO`, `requires.txt`, `SOURCES.txt`) preceded by a `/`.
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
1052
1053fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
1054    let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
1055        warn!("No PKG-INFO file found in sdist archive {:?}", path);
1056        return default_package_data(path);
1057    };
1058
1059    let mut package_data =
1060        python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
1061    merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
1062    merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
1063    apply_sdist_name_version_fallback(path, &mut package_data);
1064    package_data.datasource_id = Some(DatasourceId::PypiSdist);
1065    package_data
1066}
1067
1068fn select_sdist_pkginfo_entry(
1069    archive_path: &Path,
1070    entries: &[(String, String)],
1071) -> Option<(String, String)> {
1072    let expected_name = sdist_archive_expected_name(archive_path);
1073
1074    entries
1075        .iter()
1076        .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
1077        .min_by_key(|(entry_path, content)| {
1078            let components: Vec<_> = entry_path
1079                .split('/')
1080                .filter(|part| !part.is_empty())
1081                .collect();
1082            let candidate_name = sdist_pkginfo_candidate_name(content);
1083            let name_rank = if candidate_name == expected_name {
1084                0
1085            } else {
1086                1
1087            };
1088            let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1089
1090            (name_rank, kind_rank, components.len(), entry_path.clone())
1091        })
1092        .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1093}
1094
1095fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1096    let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1097        return false;
1098    };
1099
1100    entries.iter().any(|(entry_path, content)| {
1101        sdist_pkginfo_kind_rank(entry_path) < 3
1102            && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1103    })
1104}
1105
1106fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1107    archive_path
1108        .file_name()
1109        .and_then(|name| name.to_str())
1110        .and_then(strip_python_archive_extension)
1111        .and_then(|stem| {
1112            stem.rsplit_once('-')
1113                .map(|(name, _)| normalize_python_package_name(name))
1114        })
1115}
1116
1117fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1118    let metadata = super::rfc822::parse_rfc822_content(content);
1119    super::rfc822::get_header_first(&metadata.headers, "name")
1120        .map(|name| normalize_python_package_name(&name))
1121}
1122
/// Ranks a PKG-INFO entry path by where it sits in the archive (lower is preferred).
///
/// - `0`: `<root>/<name>.egg-info/PKG-INFO`
/// - `1`: `<root>/PKG-INFO`
/// - `2`: any other path ending in `.egg-info/PKG-INFO`
/// - `3`: everything else
///
/// Empty path components (for example from `//`) are ignored when counting depth.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let parts: Vec<&str> = entry_path
        .split('/')
        .filter(|part| !part.is_empty())
        .collect();

    match parts.as_slice() {
        [_, dir, "PKG-INFO"] if dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1140
1141fn merge_sdist_archive_dependencies(
1142    entries: &[(String, String)],
1143    metadata_path: &str,
1144    package_data: &mut PackageData,
1145) {
1146    let metadata_dir = metadata_path
1147        .rsplit_once('/')
1148        .map(|(dir, _)| dir)
1149        .unwrap_or("");
1150    let archive_root = metadata_path.split('/').next().unwrap_or("");
1151    let matched_egg_info_dir =
1152        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1153    let mut extra_dependencies = Vec::new();
1154
1155    for (entry_path, content) in entries {
1156        let is_direct_requires =
1157            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1158        let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1159            entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1160        });
1161
1162        if is_direct_requires || is_egg_info_requires {
1163            extra_dependencies.extend(parse_requires_txt(content));
1164        }
1165    }
1166
1167    for dependency in extra_dependencies {
1168        if !package_data.dependencies.iter().any(|existing| {
1169            existing.purl == dependency.purl
1170                && existing.scope == dependency.scope
1171                && existing.extracted_requirement == dependency.extracted_requirement
1172                && existing.extra_data == dependency.extra_data
1173        }) {
1174            package_data.dependencies.push(dependency);
1175        }
1176    }
1177}
1178
1179fn merge_sdist_archive_file_references(
1180    entries: &[(String, String)],
1181    metadata_path: &str,
1182    package_data: &mut PackageData,
1183) {
1184    let metadata_dir = metadata_path
1185        .rsplit_once('/')
1186        .map(|(dir, _)| dir)
1187        .unwrap_or("");
1188    let archive_root = metadata_path.split('/').next().unwrap_or("");
1189    let matched_egg_info_dir =
1190        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1191    let mut extra_refs = Vec::new();
1192
1193    for (entry_path, content) in entries {
1194        let is_direct_sources =
1195            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1196        let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1197            entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1198        });
1199
1200        if is_direct_sources || is_egg_info_sources {
1201            extra_refs.extend(parse_sources_txt(content));
1202        }
1203    }
1204
1205    for file_ref in extra_refs {
1206        if !package_data
1207            .file_references
1208            .iter()
1209            .any(|existing| existing.path == file_ref.path)
1210        {
1211            package_data.file_references.push(file_ref);
1212        }
1213    }
1214}
1215
1216fn select_matching_sdist_egg_info_dir(
1217    entries: &[(String, String)],
1218    archive_root: &str,
1219    package_name: Option<&str>,
1220) -> Option<String> {
1221    let normalized_package_name = package_name.map(normalize_python_package_name);
1222
1223    entries
1224        .iter()
1225        .filter_map(|(entry_path, _)| {
1226            let components: Vec<_> = entry_path
1227                .split('/')
1228                .filter(|part| !part.is_empty())
1229                .collect();
1230            if components.len() == 3
1231                && components[0] == archive_root
1232                && components[1].ends_with(".egg-info")
1233            {
1234                Some(components[1].to_string())
1235            } else {
1236                None
1237            }
1238        })
1239        .min_by_key(|egg_info_dir| {
1240            let normalized_dir_name =
1241                normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1242            let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1243                0
1244            } else {
1245                1
1246            };
1247
1248            (name_rank, egg_info_dir.clone())
1249        })
1250}
1251
/// Lowercases ASCII letters in `name` and turns every `_` into `-`.
///
/// Other characters, including `.` and non-ASCII letters, are left untouched.
fn normalize_python_package_name(name: &str) -> String {
    name.chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
1255
1256fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1257    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1258        return;
1259    };
1260
1261    let Some(stem) = strip_python_archive_extension(file_name) else {
1262        return;
1263    };
1264
1265    let Some((name, version)) = stem.rsplit_once('-') else {
1266        return;
1267    };
1268
1269    if package_data.name.is_none() {
1270        package_data.name = Some(name.replace('_', "-"));
1271    }
1272    if package_data.version.is_none() {
1273        package_data.version = Some(version.to_string());
1274    }
1275
1276    if package_data.purl.is_none()
1277        || package_data.repository_homepage_url.is_none()
1278        || package_data.repository_download_url.is_none()
1279        || package_data.api_data_url.is_none()
1280    {
1281        let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1282            build_pypi_urls(
1283                package_data.name.as_deref(),
1284                package_data.version.as_deref(),
1285            );
1286
1287        if package_data.repository_homepage_url.is_none() {
1288            package_data.repository_homepage_url = repository_homepage_url;
1289        }
1290        if package_data.repository_download_url.is_none() {
1291            package_data.repository_download_url = repository_download_url;
1292        }
1293        if package_data.api_data_url.is_none() {
1294            package_data.api_data_url = api_data_url;
1295        }
1296        if package_data.purl.is_none() {
1297            package_data.purl = purl;
1298        }
1299    }
1300}
1301
1302fn extract_from_wheel_archive(path: &Path) -> PackageData {
1303    let metadata = match std::fs::metadata(path) {
1304        Ok(m) => m,
1305        Err(e) => {
1306            warn!(
1307                "Failed to read metadata for wheel archive {:?}: {}",
1308                path, e
1309            );
1310            return default_package_data(path);
1311        }
1312    };
1313
1314    if metadata.len() > MAX_ARCHIVE_SIZE {
1315        warn!(
1316            "Wheel archive too large: {} bytes (limit: {} bytes)",
1317            metadata.len(),
1318            MAX_ARCHIVE_SIZE
1319        );
1320        return default_package_data(path);
1321    }
1322
1323    let file = match File::open(path) {
1324        Ok(f) => f,
1325        Err(e) => {
1326            warn!("Failed to open wheel archive {:?}: {}", path, e);
1327            return default_package_data(path);
1328        }
1329    };
1330
1331    let mut archive = match ZipArchive::new(file) {
1332        Ok(a) => a,
1333        Err(e) => {
1334            warn!("Failed to read wheel archive {:?}: {}", path, e);
1335            return default_package_data(path);
1336        }
1337    };
1338
1339    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1340        Ok(entries) => entries,
1341        Err(_) => return default_package_data(path),
1342    };
1343
1344    let metadata_entry =
1345        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1346            Some(entry) => entry,
1347            None => {
1348                warn!("No METADATA file found in wheel archive {:?}", path);
1349                return default_package_data(path);
1350            }
1351        };
1352
1353    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1354        Ok(c) => c,
1355        Err(e) => {
1356            warn!("Failed to read METADATA from {:?}: {}", path, e);
1357            return default_package_data(path);
1358        }
1359    };
1360
1361    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1362
1363    let (size, sha256) = calculate_file_checksums(path);
1364    package_data.size = size;
1365    package_data.sha256 = sha256;
1366
1367    if let Some(record_entry) =
1368        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1369        && let Ok(record_content) =
1370            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1371    {
1372        package_data.file_references = parse_record_csv(&record_content);
1373    }
1374
1375    if let Some(wheel_info) = parse_wheel_filename(path) {
1376        if package_data.name.is_none() {
1377            package_data.name = Some(wheel_info.name.clone());
1378        }
1379        if package_data.version.is_none() {
1380            package_data.version = Some(wheel_info.version.clone());
1381        }
1382
1383        package_data.qualifiers = Some(std::collections::HashMap::from([(
1384            "extension".to_string(),
1385            format!(
1386                "{}-{}-{}",
1387                wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1388            ),
1389        )]));
1390
1391        package_data.purl = build_wheel_purl(
1392            package_data.name.as_deref(),
1393            package_data.version.as_deref(),
1394            &wheel_info,
1395        );
1396
1397        let mut extra_data = package_data.extra_data.unwrap_or_default();
1398        extra_data.insert(
1399            "python_requires".to_string(),
1400            serde_json::Value::String(wheel_info.python_tag.clone()),
1401        );
1402        extra_data.insert(
1403            "abi_tag".to_string(),
1404            serde_json::Value::String(wheel_info.abi_tag.clone()),
1405        );
1406        extra_data.insert(
1407            "platform_tag".to_string(),
1408            serde_json::Value::String(wheel_info.platform_tag.clone()),
1409        );
1410        package_data.extra_data = Some(extra_data);
1411    }
1412
1413    package_data
1414}
1415
1416fn extract_from_egg_archive(path: &Path) -> PackageData {
1417    let metadata = match std::fs::metadata(path) {
1418        Ok(m) => m,
1419        Err(e) => {
1420            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1421            return default_package_data(path);
1422        }
1423    };
1424
1425    if metadata.len() > MAX_ARCHIVE_SIZE {
1426        warn!(
1427            "Egg archive too large: {} bytes (limit: {} bytes)",
1428            metadata.len(),
1429            MAX_ARCHIVE_SIZE
1430        );
1431        return default_package_data(path);
1432    }
1433
1434    let file = match File::open(path) {
1435        Ok(f) => f,
1436        Err(e) => {
1437            warn!("Failed to open egg archive {:?}: {}", path, e);
1438            return default_package_data(path);
1439        }
1440    };
1441
1442    let mut archive = match ZipArchive::new(file) {
1443        Ok(a) => a,
1444        Err(e) => {
1445            warn!("Failed to read egg archive {:?}: {}", path, e);
1446            return default_package_data(path);
1447        }
1448    };
1449
1450    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1451        Ok(entries) => entries,
1452        Err(_) => return default_package_data(path),
1453    };
1454
1455    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1456        &validated_entries,
1457        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1458    ) {
1459        Some(entry) => entry,
1460        None => {
1461            warn!("No PKG-INFO file found in egg archive {:?}", path);
1462            return default_package_data(path);
1463        }
1464    };
1465
1466    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1467        Ok(c) => c,
1468        Err(e) => {
1469            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1470            return default_package_data(path);
1471        }
1472    };
1473
1474    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1475
1476    let (size, sha256) = calculate_file_checksums(path);
1477    package_data.size = size;
1478    package_data.sha256 = sha256;
1479
1480    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1481        &validated_entries,
1482        &[
1483            "EGG-INFO/installed-files.txt",
1484            ".egg-info/installed-files.txt",
1485        ],
1486    ) && let Ok(installed_files_content) =
1487        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1488    {
1489        package_data.file_references = parse_installed_files_txt(&installed_files_content);
1490    }
1491
1492    if let Some(egg_info) = parse_egg_filename(path) {
1493        if package_data.name.is_none() {
1494            package_data.name = Some(egg_info.name.clone());
1495        }
1496        if package_data.version.is_none() {
1497            package_data.version = Some(egg_info.version.clone());
1498        }
1499
1500        if let Some(python_version) = &egg_info.python_version {
1501            let mut extra_data = package_data.extra_data.unwrap_or_default();
1502            extra_data.insert(
1503                "python_version".to_string(),
1504                serde_json::Value::String(python_version.clone()),
1505            );
1506            package_data.extra_data = Some(extra_data);
1507        }
1508    }
1509
1510    package_data.purl = build_egg_purl(
1511        package_data.name.as_deref(),
1512        package_data.version.as_deref(),
1513    );
1514
1515    package_data
1516}
1517
1518fn find_validated_zip_entry_by_suffix<'a>(
1519    entries: &'a [ValidatedZipEntry],
1520    suffix: &str,
1521) -> Option<&'a ValidatedZipEntry> {
1522    entries.iter().find(|entry| entry.name.ends_with(suffix))
1523}
1524
1525fn find_validated_zip_entry_by_any_suffix<'a>(
1526    entries: &'a [ValidatedZipEntry],
1527    suffixes: &[&str],
1528) -> Option<&'a ValidatedZipEntry> {
1529    entries
1530        .iter()
1531        .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1532}
1533
1534fn read_validated_zip_entry<R: Read + std::io::Seek>(
1535    archive: &mut ZipArchive<R>,
1536    entry: &ValidatedZipEntry,
1537    path: &Path,
1538    archive_type: &str,
1539) -> Result<String, String> {
1540    let mut file = archive
1541        .by_index(entry.index)
1542        .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1543
1544    let compressed_size = file.compressed_size();
1545    let uncompressed_size = file.size();
1546
1547    if compressed_size > 0 {
1548        let ratio = uncompressed_size as f64 / compressed_size as f64;
1549        if ratio > MAX_COMPRESSION_RATIO {
1550            return Err(format!(
1551                "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1552                archive_type, path, ratio
1553            ));
1554        }
1555    }
1556
1557    if uncompressed_size > MAX_FILE_SIZE {
1558        return Err(format!(
1559            "Rejected oversized entry in {} {:?}: {} bytes",
1560            archive_type, path, uncompressed_size
1561        ));
1562    }
1563
1564    read_limited_utf8(
1565        &mut file,
1566        MAX_FILE_SIZE,
1567        &format!("{} entry {}", archive_type, entry.name),
1568    )
1569}
1570
/// Reads at most `max_bytes` bytes from `reader` and decodes them as UTF-8.
///
/// `context` names the source being read and is used as the prefix of every error message.
///
/// # Errors
/// Returns a message when the underlying read fails, when the stream holds more than
/// `max_bytes` bytes, or when the bytes are not valid UTF-8.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    // Read one byte past the limit so an oversized stream can be detected. The addition
    // saturates so that `max_bytes == u64::MAX` does not overflow (which would panic in
    // debug builds and wrap to a zero-byte read in release builds).
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if bytes.len() as u64 > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1591
/// Normalizes an archive entry path to `/`-separated form and rejects unsafe paths.
///
/// Backslashes are treated as separators and `.` components are dropped. Returns `None` for
/// paths that start with a drive letter followed by `:/`, that are absolute, that contain a
/// `..` component, or that have no normal components at all.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Explicit drive-letter check ("C:/..."), independent of how `Path` classifies it.
    if let [drive, b':', b'/', ..] = unified.as_bytes() {
        if drive.is_ascii_alphabetic() {
            return None;
        }
    }

    let mut segments: Vec<String> = Vec::new();
    for component in Path::new(&unified).components() {
        match component {
            Component::Normal(segment) => segments.push(segment.to_string_lossy().into_owned()),
            Component::CurDir => {}
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1613
1614/// Parses RECORD CSV format from wheel archives (PEP 427).
1615/// Format: path,hash,size (3 columns, no header)
1616/// Hash format: sha256=urlsafe_base64_hash or empty
1617/// Size: bytes as u64 or empty
1618pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1619    let mut reader = ReaderBuilder::new()
1620        .has_headers(false)
1621        .from_reader(content.as_bytes());
1622
1623    let mut file_references = Vec::new();
1624
1625    for result in reader.records() {
1626        match result {
1627            Ok(record) => {
1628                if record.len() < 3 {
1629                    continue;
1630                }
1631
1632                let path = record.get(0).unwrap_or("").trim().to_string();
1633                if path.is_empty() {
1634                    continue;
1635                }
1636
1637                let hash_field = record.get(1).unwrap_or("").trim();
1638                let size_field = record.get(2).unwrap_or("").trim();
1639
1640                // Parse hash: format is "algorithm=value"
1641                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1642                    let parts: Vec<&str> = hash_field.split('=').collect();
1643                    if parts.len() == 2 && parts[0] == "sha256" {
1644                        match URL_SAFE_NO_PAD.decode(parts[1]) {
1645                            Ok(decoded) => {
1646                                let hex = decoded
1647                                    .iter()
1648                                    .map(|b| format!("{:02x}", b))
1649                                    .collect::<String>();
1650                                Sha256Digest::from_hex(&hex).ok()
1651                            }
1652                            Err(_) => None,
1653                        }
1654                    } else {
1655                        None
1656                    }
1657                } else {
1658                    None
1659                };
1660
1661                // Parse size
1662                let size = if !size_field.is_empty() && size_field != "-" {
1663                    size_field.parse::<u64>().ok()
1664                } else {
1665                    None
1666                };
1667
1668                file_references.push(FileReference {
1669                    path,
1670                    size,
1671                    sha1: None,
1672                    md5: None,
1673                    sha256,
1674                    sha512: None,
1675                    extra_data: None,
1676                });
1677            }
1678            Err(e) => {
1679                warn!("Failed to parse RECORD CSV row: {}", e);
1680                continue;
1681            }
1682        }
1683    }
1684
1685    file_references
1686}
1687
1688/// Parses installed-files.txt format from egg archives (PEP 376).
1689/// Format: one file path per line, no headers, no hash, no size
1690pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1691    content
1692        .lines()
1693        .map(|line| line.trim())
1694        .filter(|line| !line.is_empty())
1695        .map(|path| FileReference {
1696            path: path.to_string(),
1697            size: None,
1698            sha1: None,
1699            md5: None,
1700            sha256: None,
1701            sha512: None,
1702            extra_data: None,
1703        })
1704        .collect()
1705}
1706
1707pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1708    content
1709        .lines()
1710        .map(str::trim)
1711        .filter(|line| !line.is_empty())
1712        .map(|path| FileReference {
1713            path: path.to_string(),
1714            size: None,
1715            sha1: None,
1716            md5: None,
1717            sha256: None,
1718            sha512: None,
1719            extra_data: None,
1720        })
1721        .collect()
1722}
1723
/// Components parsed from a wheel file name.
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version exactly as it appears in the file name.
    version: String,
    /// Python tag, e.g. `py3` or `cp39`.
    python_tag: String,
    /// ABI tag, e.g. `none` or `cp39`.
    abi_tag: String,
    /// Platform tag, e.g. `any` or `manylinux_2_17_x86_64`.
    platform_tag: String,
}

/// Parses a wheel file name of the form
/// `{name}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl`.
///
/// The optional build tag is recognized by its leading digit and skipped, so it is not
/// mistaken for the python tag. Any further `-`-separated parts after the ABI tag are joined
/// into the platform tag. Returns `None` when the name has fewer than five parts.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    // A build tag must start with a digit, which python tags (`py3`, `cp39`, ...) never do.
    let has_build_tag =
        parts.len() >= 6 && parts[2].starts_with(|c: char| c.is_ascii_digit());
    let tags = if has_build_tag {
        &parts[3..]
    } else {
        &parts[2..]
    };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: tags[0].to_string(),
        abi_tag: tags[1].to_string(),
        platform_tag: tags[2..].join("-"),
    })
}
1748
/// Components parsed from an egg file name.
struct EggInfo {
    name: String,
    version: String,
    python_version: Option<String>,
}

/// Splits an egg file stem on `-` into name, version and an optional third part that is kept
/// as the Python version.
///
/// `_` in the name is replaced by `-`. Returns `None` when the path has no file stem or the
/// stem has fewer than two `-`-separated parts.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut segments = stem.split('-');

    let raw_name = segments.next()?;
    let raw_version = segments.next()?;
    let python_version = segments.next().map(str::to_owned);

    Some(EggInfo {
        name: raw_name.replace('_', "-"),
        version: raw_version.to_owned(),
        python_version,
    })
}
1769
1770fn build_wheel_purl(
1771    name: Option<&str>,
1772    version: Option<&str>,
1773    wheel_info: &WheelInfo,
1774) -> Option<String> {
1775    let name = name?;
1776    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1777
1778    if let Some(ver) = version {
1779        package_url.with_version(ver).ok()?;
1780    }
1781
1782    let extension = format!(
1783        "{}-{}-{}",
1784        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1785    );
1786    package_url.add_qualifier("extension", extension).ok()?;
1787
1788    Some(package_url.to_string())
1789}
1790
1791fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1792    let name = name?;
1793    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1794
1795    if let Some(ver) = version {
1796        package_url.with_version(ver).ok()?;
1797    }
1798
1799    package_url.add_qualifier("type", "egg").ok()?;
1800
1801    Some(package_url.to_string())
1802}
1803
1804fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1805    let metadata = super::rfc822::parse_rfc822_content(content);
1806    build_package_data_from_rfc822(&metadata, datasource_id)
1807}
1808
1809/// Builds PackageData from parsed RFC822 metadata.
1810///
1811/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
1812/// and `python_parse_rfc822_content` (content-based) functions.
1813fn build_package_data_from_rfc822(
1814    metadata: &super::rfc822::Rfc822Metadata,
1815    datasource_id: DatasourceId,
1816) -> PackageData {
1817    use super::rfc822::{get_header_all, get_header_first};
1818
1819    let name = get_header_first(&metadata.headers, "name");
1820    let version = get_header_first(&metadata.headers, "version");
1821    let summary = get_header_first(&metadata.headers, "summary");
1822    let mut homepage_url = get_header_first(&metadata.headers, "home-page");
1823    let author = get_header_first(&metadata.headers, "author");
1824    let author_email = get_header_first(&metadata.headers, "author-email");
1825    let license = get_header_first(&metadata.headers, "license");
1826    let license_expression = get_header_first(&metadata.headers, "license-expression");
1827    let download_url = get_header_first(&metadata.headers, "download-url");
1828    let platform = get_header_first(&metadata.headers, "platform");
1829    let requires_python = get_header_first(&metadata.headers, "requires-python");
1830    let classifiers = get_header_all(&metadata.headers, "classifier");
1831    let license_files = get_header_all(&metadata.headers, "license-file");
1832
1833    let description_body = if metadata.body.is_empty() {
1834        get_header_first(&metadata.headers, "description").unwrap_or_default()
1835    } else {
1836        metadata.body.clone()
1837    };
1838
1839    let description = build_description(summary.as_deref(), &description_body);
1840
1841    let mut parties = Vec::new();
1842    if author.is_some() || author_email.is_some() {
1843        parties.push(Party {
1844            r#type: Some("person".to_string()),
1845            role: Some("author".to_string()),
1846            name: author,
1847            email: author_email,
1848            url: None,
1849            organization: None,
1850            organization_url: None,
1851            timezone: None,
1852        });
1853    }
1854
1855    let (keywords, license_classifiers) = split_classifiers(&classifiers);
1856    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1857    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1858        license_expression
1859            .as_deref()
1860            .and_then(normalize_spdx_expression)
1861            .map(|normalized| {
1862                build_declared_license_data(
1863                    normalized,
1864                    DeclaredLicenseMatchMetadata::single_line(
1865                        license_expression.as_deref().unwrap_or_default(),
1866                    )
1867                    .with_referenced_filenames(&referenced_license_files),
1868                )
1869            })
1870            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1871
1872    let extracted_license_statement = license_expression
1873        .clone()
1874        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1875
1876    let mut extra_data = HashMap::new();
1877    if let Some(platform_value) = platform
1878        && !platform_value.eq_ignore_ascii_case("unknown")
1879        && !platform_value.is_empty()
1880    {
1881        extra_data.insert(
1882            "platform".to_string(),
1883            serde_json::Value::String(platform_value),
1884        );
1885    }
1886
1887    if let Some(requires_python_value) = requires_python
1888        && !requires_python_value.is_empty()
1889    {
1890        extra_data.insert(
1891            "requires_python".to_string(),
1892            serde_json::Value::String(requires_python_value),
1893        );
1894    }
1895
1896    if !license_files.is_empty() {
1897        extra_data.insert(
1898            "license_files".to_string(),
1899            serde_json::Value::Array(
1900                license_files
1901                    .iter()
1902                    .cloned()
1903                    .map(serde_json::Value::String)
1904                    .collect(),
1905            ),
1906        );
1907    }
1908
1909    let file_references = license_files
1910        .iter()
1911        .map(|path| FileReference {
1912            path: path.clone(),
1913            size: None,
1914            sha1: None,
1915            md5: None,
1916            sha256: None,
1917            sha512: None,
1918            extra_data: None,
1919        })
1920        .collect();
1921
1922    let project_urls = get_header_all(&metadata.headers, "project-url");
1923    let dependencies = extract_rfc822_dependencies(&metadata.headers);
1924    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1925
1926    if !project_urls.is_empty() {
1927        let parsed_urls = parse_project_urls(&project_urls);
1928
1929        for (label, url) in &parsed_urls {
1930            let label_lower = label.to_lowercase();
1931
1932            if bug_tracking_url.is_none()
1933                && matches!(
1934                    label_lower.as_str(),
1935                    "tracker"
1936                        | "bug reports"
1937                        | "bug tracker"
1938                        | "issues"
1939                        | "issue tracker"
1940                        | "github: issues"
1941                )
1942            {
1943                bug_tracking_url = Some(url.clone());
1944            } else if code_view_url.is_none()
1945                && matches!(label_lower.as_str(), "source" | "source code" | "code")
1946            {
1947                code_view_url = Some(url.clone());
1948            } else if vcs_url.is_none()
1949                && matches!(
1950                    label_lower.as_str(),
1951                    "github" | "gitlab" | "github: repo" | "repository"
1952                )
1953            {
1954                vcs_url = Some(url.clone());
1955            } else if homepage_url.is_none()
1956                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1957            {
1958                homepage_url = Some(url.clone());
1959            } else if label_lower == "changelog" {
1960                extra_data.insert(
1961                    "changelog_url".to_string(),
1962                    serde_json::Value::String(url.clone()),
1963                );
1964            }
1965        }
1966
1967        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
1968            .iter()
1969            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
1970            .collect();
1971
1972        if !project_urls_json.is_empty() {
1973            extra_data.insert(
1974                "project_urls".to_string(),
1975                serde_json::Value::Object(project_urls_json),
1976            );
1977        }
1978    }
1979
1980    let extra_data = if extra_data.is_empty() {
1981        None
1982    } else {
1983        Some(extra_data)
1984    };
1985
1986    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1987        build_pypi_urls(name.as_deref(), version.as_deref());
1988
1989    PackageData {
1990        package_type: Some(PythonParser::PACKAGE_TYPE),
1991        namespace: None,
1992        name,
1993        version,
1994        qualifiers: None,
1995        subpath: None,
1996        primary_language: Some("Python".to_string()),
1997        description,
1998        release_date: None,
1999        parties,
2000        keywords,
2001        homepage_url,
2002        download_url,
2003        size: None,
2004        sha1: None,
2005        md5: None,
2006        sha256: None,
2007        sha512: None,
2008        bug_tracking_url,
2009        code_view_url,
2010        vcs_url,
2011        copyright: None,
2012        holder: None,
2013        declared_license_expression,
2014        declared_license_expression_spdx,
2015        license_detections,
2016        other_license_expression: None,
2017        other_license_expression_spdx: None,
2018        other_license_detections: Vec::new(),
2019        extracted_license_statement,
2020        notice_text: None,
2021        source_packages: Vec::new(),
2022        file_references,
2023        is_private: false,
2024        is_virtual: false,
2025        extra_data,
2026        dependencies,
2027        repository_homepage_url,
2028        repository_download_url,
2029        api_data_url,
2030        datasource_id: Some(datasource_id),
2031        purl,
2032    }
2033}
2034
/// Parses `Project-URL` header values of the form `Label, URL` into
/// `(label, url)` pairs.
///
/// Each entry is split at its first comma and both halves are trimmed, so
/// `Label,URL` (no space after the comma) is accepted as well as `Label, URL`.
/// Commas that appear later in the URL are preserved. Entries without a
/// comma, or with an empty label or URL after trimming, are dropped.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|entry| {
            let (label, url) = entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());
            (!label.is_empty() && !url.is_empty()).then(|| (label.to_string(), url.to_string()))
        })
        .collect()
}
2050
/// Combines an optional summary and a body into one description.
///
/// Both parts are trimmed; parts that are empty after trimming are skipped.
/// The remaining parts are joined with a single newline (summary first).
/// Returns `None` when nothing remains.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let sections: Vec<&str> = summary
        .into_iter()
        .chain(std::iter::once(body))
        .map(str::trim)
        .filter(|section| !section.is_empty())
        .collect();

    if sections.is_empty() {
        return None;
    }
    Some(sections.join("\n"))
}
2069
/// Separates classifiers into `(keywords, license_classifiers)`.
///
/// A classifier that starts with `License ::` goes to the second list; every
/// other classifier is treated as a keyword. Relative order is preserved in
/// both lists.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|entry| entry.starts_with("License ::"));

    (keywords, license_classifiers)
}
2084
/// Renders the declared license text and license classifiers as a small
/// YAML-like statement.
///
/// Output shape (each line newline-terminated):
/// ```text
/// license: <trimmed license>
/// classifiers:
///   - '<classifier>'
/// ```
/// The `license:` line is omitted when `license` is `None` or blank; the
/// `classifiers:` section is omitted when there are no license classifiers.
/// Returns `None` when both parts are omitted.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let license_line = license
        .map(str::trim)
        .filter(|text| !text.is_empty())
        .map(|text| format!("license: {}", text));

    let classifiers_header = (!license_classifiers.is_empty()).then(|| "classifiers:".to_string());
    let classifier_lines = license_classifiers
        .iter()
        .map(|classifier| format!("  - '{}'", classifier));

    let lines: Vec<String> = license_line
        .into_iter()
        .chain(classifiers_header)
        .chain(classifier_lines)
        .collect();

    if lines.is_empty() {
        return None;
    }
    Some(format!("{}\n", lines.join("\n")))
}
2110
2111pub(crate) fn build_pypi_urls(
2112    name: Option<&str>,
2113    version: Option<&str>,
2114) -> (
2115    Option<String>,
2116    Option<String>,
2117    Option<String>,
2118    Option<String>,
2119) {
2120    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2121
2122    let repository_download_url = name.and_then(|value| {
2123        version.map(|ver| {
2124            format!(
2125                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2126                &value[..1.min(value.len())],
2127                value,
2128                value,
2129                ver
2130            )
2131        })
2132    });
2133
2134    let api_data_url = name.map(|value| {
2135        if let Some(ver) = version {
2136            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2137        } else {
2138            format!("https://pypi.org/pypi/{}/json", value)
2139        }
2140    });
2141
2142    let purl = name.and_then(|value| {
2143        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2144        if let Some(ver) = version {
2145            package_url.with_version(ver).ok()?;
2146        }
2147        Some(package_url.to_string())
2148    });
2149
2150    (
2151        repository_homepage_url,
2152        repository_download_url,
2153        api_data_url,
2154        purl,
2155    )
2156}
2157
2158fn build_pypi_purl_with_extension(
2159    name: &str,
2160    version: Option<&str>,
2161    extension: &str,
2162) -> Option<String> {
2163    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2164    if let Some(ver) = version {
2165        package_url.with_version(ver).ok()?;
2166    }
2167    package_url.add_qualifier("extension", extension).ok()?;
2168    Some(package_url.to_string())
2169}
2170
2171fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2172    let toml_content = match read_toml_file(path) {
2173        Ok(content) => content,
2174        Err(e) => {
2175            warn!(
2176                "Failed to read or parse pyproject.toml at {:?}: {}",
2177                path, e
2178            );
2179            return default_package_data(path);
2180        }
2181    };
2182
2183    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2184    let is_poetry_pyproject = tool_table
2185        .and_then(|tool| tool.get("poetry"))
2186        .and_then(|value| value.as_table())
2187        .is_some();
2188
2189    // Handle both PEP 621 (project table) and poetry formats
2190    let project_table =
2191        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2192            // Standard PEP 621 format with [project] table
2193            project.clone()
2194        } else if let Some(tool) = tool_table {
2195            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2196                // Poetry format with [tool.poetry] table
2197                poetry.clone()
2198            } else {
2199                return default_package_data(path);
2200            }
2201        } else if toml_content.get(FIELD_NAME).is_some() {
2202            // Other format with top-level fields
2203            match toml_content.as_table() {
2204                Some(table) => table.clone(),
2205                None => {
2206                    warn!("Failed to convert TOML content to table in {:?}", path);
2207                    return default_package_data(path);
2208                }
2209            }
2210        } else {
2211            return default_package_data(path);
2212        };
2213
2214    let name = project_table
2215        .get(FIELD_NAME)
2216        .and_then(|v| v.as_str())
2217        .map(String::from);
2218
2219    let version = project_table
2220        .get(FIELD_VERSION)
2221        .and_then(|v| v.as_str())
2222        .map(String::from);
2223    let classifiers = project_table
2224        .get("classifiers")
2225        .and_then(|value| value.as_array())
2226        .map(|values| {
2227            values
2228                .iter()
2229                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2230                .collect::<Vec<_>>()
2231        })
2232        .unwrap_or_default();
2233    let (classifier_keywords, license_classifiers) = split_classifiers(&classifiers);
2234
2235    let extracted_license_statement = extract_raw_license_string(&project_table);
2236    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2237        normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2238
2239    let description = project_table
2240        .get(FIELD_DESCRIPTION)
2241        .and_then(|value| value.as_str())
2242        .map(|value| value.to_string());
2243    let mut keywords = project_table
2244        .get(FIELD_KEYWORDS)
2245        .and_then(|value| value.as_array())
2246        .map(|values| {
2247            values
2248                .iter()
2249                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2250                .collect::<Vec<_>>()
2251        })
2252        .unwrap_or_default();
2253    for classifier in classifier_keywords {
2254        if !keywords.contains(&classifier) {
2255            keywords.push(classifier);
2256        }
2257    }
2258
2259    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
2260    let mut extra_data = extract_pyproject_extra_data(&toml_content).unwrap_or_default();
2261    let (homepage_url, download_url, bug_tracking_url, code_view_url, repository_url) =
2262        extract_urls(&project_table, &mut extra_data);
2263
2264    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2265
2266    // Create package URL
2267    let purl = name.as_ref().and_then(|n| {
2268        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2269            Ok(p) => p,
2270            Err(e) => {
2271                warn!(
2272                    "Failed to create PackageUrl for Python package '{}': {}",
2273                    n, e
2274                );
2275                return None;
2276            }
2277        };
2278
2279        if let Some(v) = &version
2280            && let Err(e) = package_url.with_version(v)
2281        {
2282            warn!(
2283                "Failed to set version '{}' for Python package '{}': {}",
2284                v, n, e
2285            );
2286            return None;
2287        }
2288
2289        Some(package_url.to_string())
2290    });
2291
2292    let api_data_url = name.as_ref().map(|n| {
2293        if let Some(v) = &version {
2294            format!("https://pypi.org/pypi/{}/{}/json", n, v)
2295        } else {
2296            format!("https://pypi.org/pypi/{}/json", n)
2297        }
2298    });
2299
2300    let pypi_homepage_url = name
2301        .as_ref()
2302        .map(|n| format!("https://pypi.org/project/{}", n));
2303
2304    let pypi_download_url = name.as_ref().and_then(|n| {
2305        version.as_ref().map(|v| {
2306            format!(
2307                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2308                &n[..1.min(n.len())],
2309                n,
2310                n,
2311                v
2312            )
2313        })
2314    });
2315
2316    PackageData {
2317        package_type: Some(PythonParser::PACKAGE_TYPE),
2318        namespace: None,
2319        name,
2320        version,
2321        qualifiers: None,
2322        subpath: None,
2323        primary_language: None,
2324        description,
2325        release_date: None,
2326        parties: extract_parties(&project_table),
2327        keywords,
2328        homepage_url: homepage_url.or(pypi_homepage_url),
2329        download_url: download_url
2330            .or_else(|| repository_url.clone())
2331            .or(pypi_download_url),
2332        size: None,
2333        sha1: None,
2334        md5: None,
2335        sha256: None,
2336        sha512: None,
2337        bug_tracking_url,
2338        code_view_url,
2339        vcs_url: repository_url,
2340        copyright: None,
2341        holder: None,
2342        declared_license_expression,
2343        declared_license_expression_spdx,
2344        license_detections,
2345        other_license_expression: None,
2346        other_license_expression_spdx: None,
2347        other_license_detections: Vec::new(),
2348        extracted_license_statement: extracted_license_statement
2349            .or_else(|| build_extracted_license_statement(None, &license_classifiers)),
2350        notice_text: None,
2351        source_packages: Vec::new(),
2352        file_references: Vec::new(),
2353        is_private: has_private_classifier(&classifiers),
2354        is_virtual: false,
2355        extra_data: if extra_data.is_empty() {
2356            None
2357        } else {
2358            Some(extra_data)
2359        },
2360        dependencies: [dependencies, optional_dependencies].concat(),
2361        repository_homepage_url: None,
2362        repository_download_url: None,
2363        api_data_url,
2364        datasource_id: Some(if is_poetry_pyproject {
2365            DatasourceId::PypiPoetryPyprojectToml
2366        } else {
2367            DatasourceId::PypiPyprojectToml
2368        }),
2369        purl,
2370    }
2371}
2372
2373fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2374    let path_str = path.to_string_lossy().replace('\\', "/");
2375    if path_str.contains("/EGG-INFO/PKG-INFO") {
2376        DatasourceId::PypiEggPkginfo
2377    } else if path_str.ends_with(".egg-info/PKG-INFO") {
2378        DatasourceId::PypiEditableEggPkginfo
2379    } else {
2380        DatasourceId::PypiSdistPkginfo
2381    }
2382}
2383
2384fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2385    project
2386        .get(FIELD_LICENSE)
2387        .and_then(|license_value| match license_value {
2388            TomlValue::String(license_str) => Some(license_str.clone()),
2389            TomlValue::Table(license_table) => license_table
2390                .get("text")
2391                .and_then(|v| v.as_str())
2392                .map(|s| s.to_string())
2393                .or_else(|| {
2394                    license_table
2395                        .get("expression")
2396                        .and_then(|v| v.as_str())
2397                        .map(|expr| expr.to_string())
2398                }),
2399            _ => None,
2400        })
2401}
2402
2403fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2404    match project.get(FIELD_LICENSE) {
2405        Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2406        Some(TomlValue::Table(license_table)) => license_table
2407            .get("expression")
2408            .and_then(|value| value.as_str()),
2409        _ => None,
2410    }
2411}
2412
2413fn extract_urls(
2414    project: &TomlMap<String, TomlValue>,
2415    extra_data: &mut HashMap<String, serde_json::Value>,
2416) -> ProjectUrls {
2417    let mut homepage_url = None;
2418    let mut download_url = None;
2419    let mut bug_tracking_url = None;
2420    let mut code_view_url = None;
2421    let mut repository_url = None;
2422
2423    // Check for URLs table
2424    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2425        let parsed_urls: Vec<(String, String)> = urls
2426            .iter()
2427            .filter_map(|(label, value)| {
2428                value
2429                    .as_str()
2430                    .map(|url| (label.to_string(), url.to_string()))
2431            })
2432            .collect();
2433        apply_project_url_mappings(
2434            &parsed_urls,
2435            &mut homepage_url,
2436            &mut bug_tracking_url,
2437            &mut code_view_url,
2438            &mut repository_url,
2439            extra_data,
2440        );
2441
2442        download_url = urls
2443            .get("Downloads")
2444            .or_else(|| urls.get("downloads"))
2445            .and_then(|v| v.as_str())
2446            .map(String::from);
2447
2448        if homepage_url.is_none() {
2449            homepage_url = urls
2450                .get(FIELD_HOMEPAGE)
2451                .and_then(|v| v.as_str())
2452                .map(String::from);
2453        }
2454        if repository_url.is_none() {
2455            repository_url = urls
2456                .get(FIELD_REPOSITORY)
2457                .and_then(|v| v.as_str())
2458                .map(String::from);
2459        }
2460    }
2461
2462    // If not found in URLs table, check for top-level keys
2463    if homepage_url.is_none() {
2464        homepage_url = project
2465            .get(FIELD_HOMEPAGE)
2466            .and_then(|v| v.as_str())
2467            .map(String::from);
2468    }
2469
2470    if repository_url.is_none() {
2471        repository_url = project
2472            .get(FIELD_REPOSITORY)
2473            .and_then(|v| v.as_str())
2474            .map(String::from);
2475    }
2476
2477    (
2478        homepage_url,
2479        download_url,
2480        bug_tracking_url,
2481        code_view_url,
2482        repository_url,
2483    )
2484}
2485
2486fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2487    let mut parties = Vec::new();
2488
2489    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2490        for author in authors {
2491            if let Some(author_str) = author.as_str() {
2492                let (name, email) = split_name_email(author_str);
2493                parties.push(Party {
2494                    r#type: None,
2495                    role: Some("author".to_string()),
2496                    name,
2497                    email,
2498                    url: None,
2499                    organization: None,
2500                    organization_url: None,
2501                    timezone: None,
2502                });
2503            } else if let Some(author_table) = author.as_table() {
2504                let name = author_table
2505                    .get("name")
2506                    .and_then(|value| value.as_str())
2507                    .map(|value| value.to_string());
2508                let email = author_table
2509                    .get("email")
2510                    .and_then(|value| value.as_str())
2511                    .map(|value| value.to_string());
2512                if name.is_some() || email.is_some() {
2513                    parties.push(Party {
2514                        r#type: None,
2515                        role: Some("author".to_string()),
2516                        name,
2517                        email,
2518                        url: None,
2519                        organization: None,
2520                        organization_url: None,
2521                        timezone: None,
2522                    });
2523                }
2524            }
2525        }
2526    }
2527
2528    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2529        for maintainer in maintainers {
2530            if let Some(maintainer_str) = maintainer.as_str() {
2531                let (name, email) = split_name_email(maintainer_str);
2532                parties.push(Party {
2533                    r#type: None,
2534                    role: Some("maintainer".to_string()),
2535                    name,
2536                    email,
2537                    url: None,
2538                    organization: None,
2539                    organization_url: None,
2540                    timezone: None,
2541                });
2542            } else if let Some(maintainer_table) = maintainer.as_table() {
2543                let name = maintainer_table
2544                    .get("name")
2545                    .and_then(|value| value.as_str())
2546                    .map(|value| value.to_string());
2547                let email = maintainer_table
2548                    .get("email")
2549                    .and_then(|value| value.as_str())
2550                    .map(|value| value.to_string());
2551                if name.is_some() || email.is_some() {
2552                    parties.push(Party {
2553                        r#type: None,
2554                        role: Some("maintainer".to_string()),
2555                        name,
2556                        email,
2557                        url: None,
2558                        organization: None,
2559                        organization_url: None,
2560                        timezone: None,
2561                    });
2562                }
2563            }
2564        }
2565    }
2566
2567    parties
2568}
2569
2570fn extract_dependencies(
2571    project: &TomlMap<String, TomlValue>,
2572    toml_content: &TomlValue,
2573) -> (Vec<Dependency>, Vec<Dependency>) {
2574    let mut dependencies = Vec::new();
2575    let mut optional_dependencies = Vec::new();
2576
2577    // Handle dependencies - can be array or table format
2578    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2579        match deps_value {
2580            TomlValue::Array(arr) => {
2581                dependencies = parse_dependency_array(arr, false, None);
2582            }
2583            TomlValue::Table(table) => {
2584                dependencies = parse_dependency_table(table, false, None);
2585            }
2586            _ => {}
2587        }
2588    }
2589
2590    // Handle PEP 621 optional-dependencies with scope
2591    if let Some(opt_deps_table) = project
2592        .get(FIELD_OPTIONAL_DEPENDENCIES)
2593        .and_then(|v| v.as_table())
2594    {
2595        for (extra_name, deps) in opt_deps_table {
2596            match deps {
2597                TomlValue::Array(arr) => {
2598                    optional_dependencies.extend(parse_dependency_array(
2599                        arr,
2600                        true,
2601                        Some(extra_name),
2602                    ));
2603                }
2604                TomlValue::Table(table) => {
2605                    optional_dependencies.extend(parse_dependency_table(
2606                        table,
2607                        true,
2608                        Some(extra_name),
2609                    ));
2610                }
2611                _ => {}
2612            }
2613        }
2614    }
2615
2616    // Handle Poetry dev-dependencies
2617    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2618        match dev_deps_value {
2619            TomlValue::Array(arr) => {
2620                optional_dependencies.extend(parse_dependency_array(
2621                    arr,
2622                    true,
2623                    Some(FIELD_DEV_DEPENDENCIES),
2624                ));
2625            }
2626            TomlValue::Table(table) => {
2627                optional_dependencies.extend(parse_dependency_table(
2628                    table,
2629                    true,
2630                    Some(FIELD_DEV_DEPENDENCIES),
2631                ));
2632            }
2633            _ => {}
2634        }
2635    }
2636
2637    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
2638    if let Some(groups_table) = toml_content
2639        .get("tool")
2640        .and_then(|value| value.as_table())
2641        .and_then(|tool| tool.get("poetry"))
2642        .and_then(|value| value.as_table())
2643        .and_then(|poetry| poetry.get("group"))
2644        .and_then(|value| value.as_table())
2645    {
2646        for (group_name, group_data) in groups_table {
2647            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2648                match group_deps {
2649                    TomlValue::Array(arr) => {
2650                        optional_dependencies.extend(parse_dependency_array(
2651                            arr,
2652                            true,
2653                            Some(group_name),
2654                        ));
2655                    }
2656                    TomlValue::Table(table) => {
2657                        optional_dependencies.extend(parse_poetry_group_dependency_table(
2658                            table,
2659                            true,
2660                            Some(group_name),
2661                        ));
2662                    }
2663                    _ => {}
2664                }
2665            }
2666        }
2667    }
2668
2669    if let Some(groups_table) = toml_content
2670        .get(FIELD_DEPENDENCY_GROUPS)
2671        .and_then(|value| value.as_table())
2672    {
2673        for (group_name, deps) in groups_table {
2674            match deps {
2675                TomlValue::Array(arr) => {
2676                    optional_dependencies.extend(parse_dependency_array(
2677                        arr,
2678                        true,
2679                        Some(group_name),
2680                    ));
2681                }
2682                TomlValue::Table(table) => {
2683                    optional_dependencies.extend(parse_dependency_table(
2684                        table,
2685                        true,
2686                        Some(group_name),
2687                    ));
2688                }
2689                _ => {}
2690            }
2691        }
2692    }
2693
2694    if let Some(dev_deps_value) = toml_content
2695        .get("tool")
2696        .and_then(|value| value.as_table())
2697        .and_then(|tool| tool.get("uv"))
2698        .and_then(|value| value.as_table())
2699        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2700    {
2701        match dev_deps_value {
2702            TomlValue::Array(arr) => {
2703                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2704            }
2705            TomlValue::Table(table) => {
2706                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2707            }
2708            _ => {}
2709        }
2710    }
2711
2712    (dependencies, optional_dependencies)
2713}
2714
2715fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2716    let mut extra_data = HashMap::new();
2717
2718    if let Some(tool_uv) = toml_content
2719        .get("tool")
2720        .and_then(|value| value.as_table())
2721        .and_then(|tool| tool.get("uv"))
2722    {
2723        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2724    }
2725
2726    if extra_data.is_empty() {
2727        None
2728    } else {
2729        Some(extra_data)
2730    }
2731}
2732
2733fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2734    match value {
2735        TomlValue::String(value) => JsonValue::String(value.clone()),
2736        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2737        TomlValue::Float(value) => JsonValue::String(value.to_string()),
2738        TomlValue::Boolean(value) => JsonValue::Bool(*value),
2739        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2740        TomlValue::Array(values) => {
2741            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2742        }
2743        TomlValue::Table(values) => JsonValue::Object(
2744            values
2745                .iter()
2746                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2747                .collect::<JsonMap<String, JsonValue>>(),
2748        ),
2749    }
2750}
2751
2752fn parse_dependency_table(
2753    table: &TomlMap<String, TomlValue>,
2754    is_optional: bool,
2755    scope: Option<&str>,
2756) -> Vec<Dependency> {
2757    table
2758        .iter()
2759        .filter_map(|(name, version)| {
2760            let version_str = version.as_str().map(|s| s.to_string());
2761            let mut package_url =
2762                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2763
2764            if let Some(v) = &version_str {
2765                package_url.with_version(v).ok()?;
2766            }
2767
2768            Some(Dependency {
2769                purl: Some(package_url.to_string()),
2770                extracted_requirement: None,
2771                scope: scope.map(|s| s.to_string()),
2772                is_runtime: Some(!is_optional),
2773                is_optional: Some(is_optional),
2774                is_pinned: None,
2775                is_direct: Some(true),
2776                resolved_package: None,
2777                extra_data: None,
2778            })
2779        })
2780        .collect()
2781}
2782
2783fn parse_poetry_group_dependency_table(
2784    table: &TomlMap<String, TomlValue>,
2785    is_optional: bool,
2786    scope: Option<&str>,
2787) -> Vec<Dependency> {
2788    table
2789        .iter()
2790        .filter_map(|(name, value)| build_poetry_group_dependency(name, value, is_optional, scope))
2791        .collect()
2792}
2793
2794fn build_poetry_group_dependency(
2795    name: &str,
2796    value: &TomlValue,
2797    is_optional: bool,
2798    scope: Option<&str>,
2799) -> Option<Dependency> {
2800    let normalized_name = normalize_python_dependency_name(name);
2801    let (version_spec, extras, marker) = match value {
2802        TomlValue::String(spec) => (Some(spec.trim().to_string()), Vec::new(), None),
2803        TomlValue::Table(table) => {
2804            let version_spec = table
2805                .get(FIELD_VERSION)
2806                .and_then(|value| value.as_str())
2807                .map(str::trim)
2808                .filter(|value| !value.is_empty())
2809                .map(ToOwned::to_owned);
2810            let extras = table
2811                .get(FIELD_EXTRAS)
2812                .and_then(|value| value.as_array())
2813                .map(|values| {
2814                    values
2815                        .iter()
2816                        .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2817                        .collect::<Vec<_>>()
2818                })
2819                .unwrap_or_default();
2820            let marker = table
2821                .get("markers")
2822                .and_then(|value| value.as_str())
2823                .map(str::trim)
2824                .filter(|value| !value.is_empty())
2825                .map(ToOwned::to_owned);
2826
2827            (version_spec, extras, marker)
2828        }
2829        _ => return None,
2830    };
2831
2832    let pinned_version = version_spec
2833        .as_deref()
2834        .and_then(extract_exact_pinned_version);
2835    let purl = build_python_dependency_purl(&normalized_name, pinned_version.as_deref())?;
2836
2837    let mut extra_data = HashMap::new();
2838    if let Some(marker) = marker {
2839        extra_data.insert("marker".to_string(), JsonValue::String(marker));
2840    }
2841    if !extras.is_empty() {
2842        extra_data.insert(
2843            "extras".to_string(),
2844            JsonValue::Array(extras.into_iter().map(JsonValue::String).collect()),
2845        );
2846    }
2847
2848    Some(Dependency {
2849        purl: Some(purl),
2850        extracted_requirement: version_spec,
2851        scope: scope.map(|value| value.to_string()),
2852        is_runtime: Some(!is_optional),
2853        is_optional: Some(is_optional),
2854        is_pinned: Some(pinned_version.is_some()),
2855        is_direct: Some(true),
2856        resolved_package: None,
2857        extra_data: if extra_data.is_empty() {
2858            None
2859        } else {
2860            Some(extra_data)
2861        },
2862    })
2863}
2864
2865fn parse_dependency_array(
2866    array: &[TomlValue],
2867    is_optional: bool,
2868    scope: Option<&str>,
2869) -> Vec<Dependency> {
2870    array
2871        .iter()
2872        .filter_map(|dep| {
2873            let dep_str = dep.as_str()?;
2874            build_pyproject_array_dependency(dep_str, is_optional, scope)
2875        })
2876        .collect()
2877}
2878
2879fn build_pyproject_array_dependency(
2880    dep_str: &str,
2881    is_optional: bool,
2882    scope: Option<&str>,
2883) -> Option<Dependency> {
2884    let parsed = parse_pep508_requirement(dep_str)?;
2885    let name = normalize_python_package_name(&parsed.name);
2886    let pinned_version = parsed
2887        .specifiers
2888        .as_deref()
2889        .and_then(extract_exact_pinned_version);
2890
2891    let purl = build_python_dependency_purl(&name, pinned_version.as_deref())?;
2892
2893    let mut extra_data = HashMap::new();
2894    if let Some(marker) = parsed.marker {
2895        extra_data.insert("marker".to_string(), JsonValue::String(marker));
2896    }
2897    if !parsed.extras.is_empty() {
2898        extra_data.insert(
2899            "extras".to_string(),
2900            JsonValue::Array(parsed.extras.into_iter().map(JsonValue::String).collect()),
2901        );
2902    }
2903
2904    let extracted_requirement = parsed.specifiers.or(parsed.url);
2905
2906    Some(Dependency {
2907        purl: Some(purl),
2908        extracted_requirement: extracted_requirement.clone(),
2909        scope: scope.map(|s| s.to_string()),
2910        is_runtime: Some(!is_optional),
2911        is_optional: Some(is_optional),
2912        is_pinned: Some(pinned_version.is_some()),
2913        is_direct: Some(true),
2914        resolved_package: None,
2915        extra_data: if extra_data.is_empty() {
2916            None
2917        } else {
2918            Some(extra_data)
2919        },
2920    })
2921}
2922
/// Returns the version when `specifiers` pins exactly one concrete version.
///
/// Accepted forms are a single `==<version>` or `===<version>` clause;
/// surrounding whitespace is ignored. `None` is returned for:
///
/// - specifier sets with more than one clause (anything containing `,`),
/// - any operator other than `==` / `===`,
/// - an empty version,
/// - `==` clauses containing a wildcard (e.g. `==1.2.*`), which under PEP 440
///   is a prefix match over many versions rather than a pin.
///
/// `===` (arbitrary equality) is a literal string comparison, so its operand is
/// returned verbatim.
fn extract_exact_pinned_version(specifiers: &str) -> Option<String> {
    let trimmed = specifiers.trim();
    if trimmed.contains(',') {
        return None;
    }

    // `===` must be tested first because `==` is a prefix of it.
    let (version, arbitrary_equality) = if let Some(rest) = trimmed.strip_prefix("===") {
        (rest.trim(), true)
    } else if let Some(rest) = trimmed.strip_prefix("==") {
        (rest.trim(), false)
    } else {
        return None;
    };

    if version.is_empty() {
        return None;
    }

    // `==1.2.*` matches a whole release series; it must not be reported as a pin
    // nor leak `1.2.*` into a purl version.
    if !arbitrary_equality && version.contains('*') {
        return None;
    }

    Some(version.to_string())
}
2944
/// A Python literal value produced by `LiteralEvaluator` without executing code.
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// A numeric literal; integers and floats are both stored as `f64`.
    Number(f64),
    /// `True` / `False`.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list display, e.g. `[a, b]`.
    List(Vec<Value>),
    /// A tuple display, e.g. `(a, b)`.
    Tuple(Vec<Value>),
    /// A dict display or `dict(...)` / `OrderedDict(...)` call; keys are stringified.
    Dict(HashMap<String, Value>),
}
2955
/// Evaluates literal Python expressions from an AST under depth and node budgets.
struct LiteralEvaluator {
    /// Known name -> value bindings used to resolve bare names.
    constants: HashMap<String, Value>,
    /// Evaluation stops (returns `None`) once the nesting depth reaches this value.
    max_depth: usize,
    /// Evaluation stops (returns `None`) once this many nodes have been visited.
    max_nodes: usize,
    /// Running count of nodes visited across all `evaluate_expr` calls.
    nodes_visited: usize,
}
2962
2963impl LiteralEvaluator {
2964    fn new(constants: HashMap<String, Value>) -> Self {
2965        Self {
2966            constants,
2967            max_depth: MAX_SETUP_PY_AST_DEPTH,
2968            max_nodes: MAX_SETUP_PY_AST_NODES,
2969            nodes_visited: 0,
2970        }
2971    }
2972
2973    fn insert_constant(&mut self, name: String, value: Value) {
2974        self.constants.insert(name, value);
2975    }
2976
2977    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
2978        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
2979            return None;
2980        }
2981        self.nodes_visited += 1;
2982
2983        match expr {
2984            ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
2985                Some(Value::String(value.to_str().to_string()))
2986            }
2987            ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
2988                Some(Value::Bool(*value))
2989            }
2990            ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
2991                self.evaluate_number(value)
2992            }
2993            ast::Expr::NoneLiteral(_) => Some(Value::None),
2994            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
2995            ast::Expr::List(ast::ExprList { elts, .. }) => {
2996                let mut values = Vec::new();
2997                for elt in elts {
2998                    values.push(self.evaluate_expr(elt, depth + 1)?);
2999                }
3000                Some(Value::List(values))
3001            }
3002            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
3003                let mut values = Vec::new();
3004                for elt in elts {
3005                    values.push(self.evaluate_expr(elt, depth + 1)?);
3006                }
3007                Some(Value::Tuple(values))
3008            }
3009            ast::Expr::Dict(ast::ExprDict { items, .. }) => {
3010                let mut dict = HashMap::new();
3011                for item in items {
3012                    let key_expr = item.key.as_ref()?;
3013                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
3014                    let key = value_to_string(&key_value)?;
3015                    let value = self.evaluate_expr(&item.value, depth + 1)?;
3016                    dict.insert(key, value);
3017                }
3018                Some(Value::Dict(dict))
3019            }
3020            ast::Expr::Call(ast::ExprCall {
3021                func, arguments, ..
3022            }) => {
3023                let args = arguments.args.as_ref();
3024                let keywords = arguments.keywords.as_ref();
3025                if keywords.is_empty()
3026                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
3027                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
3028                {
3029                    return self.evaluate_ordered_dict(args, depth + 1);
3030                }
3031
3032                if !args.is_empty() {
3033                    return None;
3034                }
3035
3036                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
3037                    && id == "dict"
3038                {
3039                    let mut dict = HashMap::new();
3040                    for keyword in keywords {
3041                        let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
3042                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
3043                        dict.insert(key.to_string(), value);
3044                    }
3045                    return Some(Value::Dict(dict));
3046                }
3047
3048                None
3049            }
3050            _ => None,
3051        }
3052    }
3053
3054    fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
3055        match number {
3056            ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
3057            ast::Number::Float(value) => Some(Value::Number(*value)),
3058            ast::Number::Complex { .. } => None,
3059        }
3060    }
3061
3062    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
3063        if args.len() != 1 {
3064            return None;
3065        }
3066
3067        let items = match self.evaluate_expr(&args[0], depth)? {
3068            Value::List(items) | Value::Tuple(items) => items,
3069            _ => return None,
3070        };
3071
3072        let mut dict = HashMap::new();
3073        for item in items {
3074            let Value::Tuple(values) = item else {
3075                return None;
3076            };
3077            if values.len() != 2 {
3078                return None;
3079            }
3080            let key = value_to_string(&values[0])?;
3081            dict.insert(key, values[1].clone());
3082        }
3083
3084        Some(Value::Dict(dict))
3085    }
3086}
3087
/// Names under which `setup()` may be reachable in a setup.py module.
#[derive(Default)]
struct SetupAliases {
    /// Bare names bound to the `setup` function (always includes `"setup"`,
    /// plus any `from <setup module> import setup as <alias>` aliases).
    setup_names: HashSet<String>,
    /// Local name -> imported module name for `import <setup module> [as <alias>]`.
    module_aliases: HashMap<String, String>,
}
3093
3094fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
3095    extract_from_setup_py(path).into_iter().collect()
3096}
3097
3098fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
3099    let content = match read_file_to_string(path) {
3100        Ok(content) => content,
3101        Err(e) => {
3102            warn!("Failed to read setup.py at {:?}: {}", path, e);
3103            return Some(default_package_data(path));
3104        }
3105    };
3106
3107    if content.len() > MAX_SETUP_PY_BYTES {
3108        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
3109        let package_data = extract_from_setup_py_regex(&content);
3110        return should_emit_setup_py_package(&package_data).then_some(package_data);
3111    }
3112
3113    let mut package_data = match extract_from_setup_py_ast(&content) {
3114        Ok(Some(data)) => data,
3115        Ok(None) => return Some(default_package_data(path)),
3116        Err(e) => {
3117            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
3118            extract_from_setup_py_regex(&content)
3119        }
3120    };
3121
3122    if package_data.name.is_none() {
3123        package_data.name = extract_setup_value(&content, "name");
3124    }
3125
3126    if package_data.version.is_none() {
3127        package_data.version = extract_setup_value(&content, "version");
3128    }
3129
3130    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
3131
3132    if package_data.purl.is_none() {
3133        package_data.purl = build_setup_py_purl(
3134            package_data.name.as_deref(),
3135            package_data.version.as_deref(),
3136        );
3137    }
3138
3139    if should_emit_setup_py_package(&package_data) {
3140        Some(package_data)
3141    } else {
3142        Some(default_package_data(path))
3143    }
3144}
3145
3146fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
3147    package_data.name.is_some()
3148        || package_data.version.is_some()
3149        || package_data.purl.is_some()
3150        || !package_data.dependencies.is_empty()
3151        || package_data.extracted_license_statement.is_some()
3152        || !package_data.license_detections.is_empty()
3153        || !package_data.parties.is_empty()
3154        || package_data.description.is_some()
3155        || package_data.homepage_url.is_some()
3156        || package_data.bug_tracking_url.is_some()
3157        || package_data.code_view_url.is_some()
3158        || package_data.vcs_url.is_some()
3159}
3160
3161fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
3162    if package_data.version.is_some()
3163        && package_data.extracted_license_statement.is_some()
3164        && package_data
3165            .parties
3166            .iter()
3167            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
3168    {
3169        return;
3170    }
3171
3172    let Some(root) = path.parent() else {
3173        return;
3174    };
3175
3176    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
3177
3178    if package_data.version.is_none() {
3179        package_data.version = dunder_metadata.version;
3180    }
3181
3182    if package_data.extracted_license_statement.is_none() {
3183        package_data.extracted_license_statement = dunder_metadata.license;
3184    }
3185
3186    let has_author = package_data
3187        .parties
3188        .iter()
3189        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
3190
3191    if !has_author && let Some(author) = dunder_metadata.author {
3192        package_data.parties.push(Party {
3193            r#type: Some("person".to_string()),
3194            role: Some("author".to_string()),
3195            name: Some(author),
3196            email: None,
3197            url: None,
3198            organization: None,
3199            organization_url: None,
3200            timezone: None,
3201        });
3202    }
3203}
3204
/// Dunder metadata found in modules that setup.py imports from.
#[derive(Default)]
struct DunderMetadata {
    /// First `__version__ = "..."` string found.
    version: Option<String>,
    /// First `__author__ = "..."` string found.
    author: Option<String>,
    /// First `__license__ = "..."` string found.
    license: Option<String>,
}
3211
3212fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
3213    let statements = match parse_module(content) {
3214        Ok(parsed) => parsed.into_suite(),
3215        Err(_) => return DunderMetadata::default(),
3216    };
3217
3218    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3219    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3220    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3221    let mut metadata = DunderMetadata::default();
3222
3223    for module in imported_dunder_modules(&statements) {
3224        let Some(path) = resolve_imported_module_path(root, &module) else {
3225            continue;
3226        };
3227        let Ok(module_content) = read_file_to_string(&path) else {
3228            continue;
3229        };
3230
3231        if metadata.version.is_none() {
3232            metadata.version = version_re
3233                .as_ref()
3234                .and_then(|regex| regex.captures(&module_content))
3235                .and_then(|captures| captures.get(1))
3236                .map(|match_| match_.as_str().to_string());
3237        }
3238
3239        if metadata.author.is_none() {
3240            metadata.author = author_re
3241                .as_ref()
3242                .and_then(|regex| regex.captures(&module_content))
3243                .and_then(|captures| captures.get(1))
3244                .map(|match_| match_.as_str().to_string());
3245        }
3246
3247        if metadata.license.is_none() {
3248            metadata.license = license_re
3249                .as_ref()
3250                .and_then(|regex| regex.captures(&module_content))
3251                .and_then(|captures| captures.get(1))
3252                .map(|match_| match_.as_str().to_string());
3253        }
3254
3255        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
3256            return metadata;
3257        }
3258    }
3259
3260    metadata
3261}
3262
3263fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
3264    let mut modules = Vec::new();
3265
3266    for statement in statements {
3267        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
3268            continue;
3269        };
3270        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
3271            continue;
3272        };
3273        let imports_dunder = names.iter().any(|alias| {
3274            matches!(
3275                alias.name.as_str(),
3276                "__version__" | "__author__" | "__license__"
3277            )
3278        });
3279        if imports_dunder {
3280            modules.push(module.to_string());
3281        }
3282    }
3283
3284    modules
3285}
3286
/// Resolves a dotted Python module name to a source file under `root`.
///
/// Candidates are tried in this order, and the first that is a regular file wins:
///
/// 1. `<root>/<pkg>/<mod>.py`
/// 2. `<root>/<pkg>/<mod>/__init__.py`
/// 3. `<root>/src/<pkg>/<mod>.py`
/// 4. `<root>/src/<pkg>/<mod>/__init__.py`
///
/// Directories are deliberately not accepted: a directory that happens to be
/// named like a candidate (e.g. `meta.py/`) cannot be read as a module and
/// must not shadow a later, readable candidate.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let relative: PathBuf = module.split('.').collect();
    let module_file = relative.with_extension("py");
    let package_init = relative.join("__init__.py");
    let src_root = root.join("src");

    [
        root.join(&module_file),
        root.join(&package_init),
        src_root.join(&module_file),
        src_root.join(&package_init),
    ]
    .into_iter()
    .find(|candidate| candidate.is_file())
}
3298
3299/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
3300///
3301/// # Security Model
3302///
3303/// This function parses setup.py as a Python AST and evaluates only literal values
3304/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
3305/// arbitrary code execution during scanning.
3306///
3307/// # DoS Prevention
3308///
3309/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
3310/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
3311/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
3312///
3313/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
3314fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
3315    let statements = parse_module(content)
3316        .map(|parsed| parsed.into_suite())
3317        .map_err(|e| e.to_string())?;
3318    let aliases = collect_setup_aliases(&statements);
3319    let mut evaluator = LiteralEvaluator::new(HashMap::new());
3320    build_setup_py_constants(&statements, &mut evaluator);
3321
3322    let setup_call = find_setup_call(&statements, &aliases);
3323    let Some(call_expr) = setup_call else {
3324        return Ok(None);
3325    };
3326
3327    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3328    Ok(Some(build_setup_py_package_data(&setup_values)))
3329}
3330
3331fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3332    for stmt in statements {
3333        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3334            if targets.len() != 1 {
3335                continue;
3336            }
3337
3338            let Some(name) = extract_assign_name(&targets[0]) else {
3339                continue;
3340            };
3341
3342            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3343                evaluator.insert_constant(name, value);
3344            }
3345        }
3346    }
3347}
3348
3349fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3350    match target {
3351        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3352        _ => None,
3353    }
3354}
3355
3356fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3357    let mut aliases = SetupAliases::default();
3358    aliases.setup_names.insert("setup".to_string());
3359
3360    for stmt in statements {
3361        match stmt {
3362            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3363                for alias in names {
3364                    let module_name = alias.name.as_str();
3365                    if !is_setup_module(module_name) {
3366                        continue;
3367                    }
3368                    let alias_name = alias
3369                        .asname
3370                        .as_ref()
3371                        .map(|name| name.as_str())
3372                        .unwrap_or(module_name);
3373                    aliases
3374                        .module_aliases
3375                        .insert(alias_name.to_string(), module_name.to_string());
3376                }
3377            }
3378            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3379                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3380                    continue;
3381                };
3382                if !is_setup_module(module_name) {
3383                    continue;
3384                }
3385                for alias in names {
3386                    if alias.name.as_str() != "setup" {
3387                        continue;
3388                    }
3389                    let alias_name = alias
3390                        .asname
3391                        .as_ref()
3392                        .map(|name| name.as_str())
3393                        .unwrap_or("setup");
3394                    aliases.setup_names.insert(alias_name.to_string());
3395                }
3396            }
3397            _ => {}
3398        }
3399    }
3400
3401    aliases
3402}
3403
/// Reports whether `module_name` is one of the modules recognized as providing
/// `setup()`: `setuptools`, `distutils` or `distutils.core`.
fn is_setup_module(module_name: &str) -> bool {
    ["setuptools", "distutils", "distutils.core"].contains(&module_name)
}
3407
3408fn find_setup_call<'a>(
3409    statements: &'a [ast::Stmt],
3410    aliases: &'a SetupAliases,
3411) -> Option<&'a ast::Expr> {
3412    let mut finder = SetupCallFinder {
3413        aliases,
3414        called_function_names: collect_top_level_called_function_names(statements),
3415        nodes_visited: 0,
3416    };
3417    finder.find_in_statements(statements)
3418}
3419
3420fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3421    let mut called = HashSet::new();
3422    collect_called_function_names_in_statements(statements, &mut called);
3423    called
3424}
3425
3426fn collect_called_function_names_in_statements(
3427    statements: &[ast::Stmt],
3428    called: &mut HashSet<String>,
3429) {
3430    for stmt in statements {
3431        match stmt {
3432            ast::Stmt::Expr(ast::StmtExpr { value, .. })
3433            | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3434                collect_called_function_names_in_expr(value.as_ref(), called);
3435            }
3436            ast::Stmt::If(ast::StmtIf {
3437                body,
3438                elif_else_clauses,
3439                ..
3440            }) => {
3441                collect_called_function_names_in_statements(body, called);
3442                for clause in elif_else_clauses {
3443                    collect_called_function_names_in_statements(&clause.body, called);
3444                }
3445            }
3446            ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3447            | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3448                collect_called_function_names_in_statements(body, called);
3449                collect_called_function_names_in_statements(orelse, called);
3450            }
3451            ast::Stmt::With(ast::StmtWith { body, .. }) => {
3452                collect_called_function_names_in_statements(body, called);
3453            }
3454            ast::Stmt::Try(ast::StmtTry {
3455                body,
3456                orelse,
3457                finalbody,
3458                handlers,
3459                ..
3460            }) => {
3461                collect_called_function_names_in_statements(body, called);
3462                collect_called_function_names_in_statements(orelse, called);
3463                collect_called_function_names_in_statements(finalbody, called);
3464                for handler in handlers {
3465                    let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3466                        body,
3467                        ..
3468                    }) = handler;
3469                    collect_called_function_names_in_statements(body, called);
3470                }
3471            }
3472            _ => {}
3473        }
3474    }
3475}
3476
3477fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3478    if let ast::Expr::Call(ast::ExprCall {
3479        func, arguments, ..
3480    }) = expr
3481    {
3482        if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3483            called.insert(id.as_str().to_string());
3484        }
3485
3486        for arg in arguments.args.iter() {
3487            collect_called_function_names_in_expr(arg, called);
3488        }
3489        for keyword in arguments.keywords.iter() {
3490            collect_called_function_names_in_expr(&keyword.value, called);
3491        }
3492    }
3493}
3494
/// Walks setup.py statements looking for the `setup(...)` call expression.
struct SetupCallFinder<'a> {
    /// Names and module aliases through which `setup` may be referenced.
    aliases: &'a SetupAliases,
    /// Bare function names that are called somewhere in the module's top-level
    /// flow; only function definitions with these names are searched.
    called_function_names: HashSet<String>,
    /// Number of statements/expressions visited so far, checked against
    /// `MAX_SETUP_PY_AST_NODES`.
    nodes_visited: usize,
}
3500
impl<'a> SetupCallFinder<'a> {
    /// Scans `statements` in order and returns the first expression accepted by
    /// [`Self::visit_expr`], descending into nested statement bodies.
    ///
    /// Returns `None` when nothing matches or when the node budget
    /// (`MAX_SETUP_PY_AST_NODES`) has been used up.
    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
        for stmt in statements {
            // Every statement counts against the shared node budget.
            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
                return None;
            }
            self.nodes_visited += 1;

            let found = match stmt {
                // Bare expression statement, e.g. `setup(...)`.
                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
                // Assignment whose right-hand side is the call, e.g. `x = setup(...)`.
                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
                // `if` body first, then each `elif`/`else` clause in order.
                ast::Stmt::If(ast::StmtIf {
                    body,
                    elif_else_clauses,
                    ..
                }) => self.find_in_statements(body).or_else(|| {
                    for clause in elif_else_clauses {
                        if let Some(found) = self.find_in_statements(&clause.body) {
                            return Some(found);
                        }
                    }
                    None
                }),
                // Loops: the body first, then the loop's `else` block.
                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse)),
                // Function bodies are searched only when the function's name is
                // in `called_function_names`.
                ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
                    .called_function_names
                    .contains(name.as_str())
                    .then(|| self.find_in_statements(body))
                    .flatten(),
                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
                // `try`: body, then `else`, then `finally`, then each handler.
                ast::Stmt::Try(ast::StmtTry {
                    body,
                    orelse,
                    finalbody,
                    handlers,
                    ..
                }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse))
                    .or_else(|| self.find_in_statements(finalbody))
                    .or_else(|| {
                        for handler in handlers {
                            let ast::ExceptHandler::ExceptHandler(
                                ast::ExceptHandlerExceptHandler { body, .. },
                            ) = handler;
                            if let Some(found) = self.find_in_statements(body) {
                                return Some(found);
                            }
                        }
                        None
                    }),
                // All other statement kinds are not searched.
                _ => None,
            };

            if found.is_some() {
                return found;
            }
        }

        None
    }

    /// Returns `expr` itself when it is a call whose target passes
    /// `is_setup_call`; otherwise `None`.
    ///
    /// Only the expression handed in is inspected — sub-expressions are not
    /// visited. The visit counts against the node budget.
    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
            return None;
        }
        self.nodes_visited += 1;

        match expr {
            ast::Expr::Call(ast::ExprCall { func, .. })
                if is_setup_call(func.as_ref(), self.aliases) =>
            {
                Some(expr)
            }
            _ => None,
        }
    }
}
3582
3583fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3584    let Some(dotted) = dotted_name(func, 0) else {
3585        return false;
3586    };
3587
3588    if aliases.setup_names.contains(&dotted) {
3589        return true;
3590    }
3591
3592    let Some(module) = dotted.strip_suffix(".setup") else {
3593        return false;
3594    };
3595
3596    let resolved = resolve_module_alias(module, aliases);
3597    is_setup_module(&resolved)
3598}
3599
3600fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3601    if depth >= MAX_SETUP_PY_AST_DEPTH {
3602        return None;
3603    }
3604
3605    match expr {
3606        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3607        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3608            let base = dotted_name(value.as_ref(), depth + 1)?;
3609            Some(format!("{}.{}", base, attr.as_str()))
3610        }
3611        _ => None,
3612    }
3613}
3614
3615fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3616    if let Some(mapped) = aliases.module_aliases.get(module) {
3617        return mapped.clone();
3618    }
3619
3620    let Some((base, rest)) = module.split_once('.') else {
3621        return module.to_string();
3622    };
3623
3624    if let Some(mapped) = aliases.module_aliases.get(base) {
3625        return format!("{}.{}", mapped, rest);
3626    }
3627
3628    module.to_string()
3629}
3630
3631fn extract_setup_keywords(
3632    call_expr: &ast::Expr,
3633    evaluator: &mut LiteralEvaluator,
3634) -> HashMap<String, Value> {
3635    let mut values = HashMap::new();
3636    let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3637        return values;
3638    };
3639
3640    for keyword in arguments.keywords.iter() {
3641        if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3642            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3643                values.insert(arg.to_string(), value);
3644            }
3645        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3646            for (key, value) in dict {
3647                values.insert(key, value);
3648            }
3649        }
3650    }
3651
3652    values
3653}
3654
3655fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3656    let name = get_value_string(values, "name");
3657    let version = get_value_string(values, "version");
3658    let description =
3659        get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
3660    let homepage_url =
3661        get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
3662    let author = get_value_string(values, "author");
3663    let author_email = get_value_string(values, "author_email");
3664    let maintainer = get_value_string(values, "maintainer");
3665    let maintainer_email = get_value_string(values, "maintainer_email");
3666    let license = get_value_string(values, "license");
3667    let classifiers = values
3668        .get("classifiers")
3669        .and_then(value_to_string_list)
3670        .unwrap_or_default();
3671
3672    let mut parties = Vec::new();
3673    if author.is_some() || author_email.is_some() {
3674        parties.push(Party {
3675            r#type: Some("person".to_string()),
3676            role: Some("author".to_string()),
3677            name: author,
3678            email: author_email,
3679            url: None,
3680            organization: None,
3681            organization_url: None,
3682            timezone: None,
3683        });
3684    }
3685
3686    if maintainer.is_some() || maintainer_email.is_some() {
3687        parties.push(Party {
3688            r#type: Some("person".to_string()),
3689            role: Some("maintainer".to_string()),
3690            name: maintainer,
3691            email: maintainer_email,
3692            url: None,
3693            organization: None,
3694            organization_url: None,
3695            timezone: None,
3696        });
3697    }
3698
3699    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3700        normalize_spdx_declared_license(license.as_deref());
3701    let extracted_license_statement = license.clone();
3702
3703    let dependencies = build_setup_py_dependencies(values);
3704    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3705    let mut homepage_from_project_urls = None;
3706    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3707    let mut extra_data = HashMap::new();
3708
3709    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3710        apply_project_url_mappings(
3711            &parsed_project_urls,
3712            &mut homepage_from_project_urls,
3713            &mut bug_tracking_url,
3714            &mut code_view_url,
3715            &mut vcs_url,
3716            &mut extra_data,
3717        );
3718    }
3719
3720    let extra_data = if extra_data.is_empty() {
3721        None
3722    } else {
3723        Some(extra_data)
3724    };
3725
3726    PackageData {
3727        package_type: Some(PythonParser::PACKAGE_TYPE),
3728        namespace: None,
3729        name,
3730        version,
3731        qualifiers: None,
3732        subpath: None,
3733        primary_language: Some("Python".to_string()),
3734        description,
3735        release_date: None,
3736        parties,
3737        keywords: Vec::new(),
3738        homepage_url: homepage_url.or(homepage_from_project_urls),
3739        download_url: None,
3740        size: None,
3741        sha1: None,
3742        md5: None,
3743        sha256: None,
3744        sha512: None,
3745        bug_tracking_url,
3746        code_view_url,
3747        vcs_url,
3748        copyright: None,
3749        holder: None,
3750        declared_license_expression,
3751        declared_license_expression_spdx,
3752        license_detections,
3753        other_license_expression: None,
3754        other_license_expression_spdx: None,
3755        other_license_detections: Vec::new(),
3756        extracted_license_statement,
3757        notice_text: None,
3758        source_packages: Vec::new(),
3759        file_references: Vec::new(),
3760        is_private: has_private_classifier(&classifiers),
3761        is_virtual: false,
3762        extra_data,
3763        dependencies,
3764        repository_homepage_url: None,
3765        repository_download_url: None,
3766        api_data_url: None,
3767        datasource_id: Some(DatasourceId::PypiSetupPy),
3768        purl,
3769    }
3770}
3771
3772fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3773    let mut dependencies = Vec::new();
3774
3775    if let Some(reqs) = values
3776        .get("install_requires")
3777        .and_then(value_to_string_list)
3778    {
3779        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3780    }
3781
3782    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3783        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3784    }
3785
3786    if let Some(Value::Dict(extras)) = values.get("extras_require") {
3787        let mut extra_items: Vec<_> = extras.iter().collect();
3788        extra_items.sort_by_key(|(name, _)| *name);
3789        for (extra_name, extra_value) in extra_items {
3790            if let Some(reqs) = value_to_string_list(extra_value) {
3791                dependencies.extend(build_setup_py_dependency_list(
3792                    reqs.as_slice(),
3793                    extra_name,
3794                    true,
3795                ));
3796            }
3797        }
3798    }
3799
3800    dependencies
3801}
3802
3803fn build_setup_py_dependency_list(
3804    reqs: &[String],
3805    scope: &str,
3806    is_optional: bool,
3807) -> Vec<Dependency> {
3808    reqs.iter()
3809        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3810        .collect()
3811}
3812
3813fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3814    values.get(key).and_then(value_to_string)
3815}
3816
3817fn value_to_string(value: &Value) -> Option<String> {
3818    match value {
3819        Value::String(value) => Some(value.clone()),
3820        Value::Number(value) => Some(value.to_string()),
3821        Value::Bool(value) => Some(value.to_string()),
3822        _ => None,
3823    }
3824}
3825
3826fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3827    match value {
3828        Value::String(value) => Some(vec![value.clone()]),
3829        Value::List(values) | Value::Tuple(values) => {
3830            let mut items = Vec::new();
3831            for item in values {
3832                items.push(value_to_string(item)?);
3833            }
3834            Some(items)
3835        }
3836        _ => None,
3837    }
3838}
3839
3840fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3841    let Value::Dict(dict) = value else {
3842        return None;
3843    };
3844
3845    let mut pairs: Vec<(String, String)> = dict
3846        .iter()
3847        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3848        .collect::<Option<Vec<_>>>()?;
3849    pairs.sort_by(|left, right| left.0.cmp(&right.0));
3850    Some(pairs)
3851}
3852
3853fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3854    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3855    extract_requires_dist_dependencies(&requires_dist)
3856}
3857
3858pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3859    requires_dist
3860        .iter()
3861        .filter_map(|entry| build_rfc822_dependency(entry))
3862        .collect()
3863}
3864
3865fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3866    build_python_dependency(entry, "install", false, None)
3867}
3868
3869fn build_python_dependency(
3870    entry: &str,
3871    default_scope: &str,
3872    default_optional: bool,
3873    marker_override: Option<&str>,
3874) -> Option<Dependency> {
3875    let (requirement_part, marker_part) = entry
3876        .split_once(';')
3877        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3878        .unwrap_or((entry.trim(), None));
3879
3880    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3881    let requirement = normalize_rfc822_requirement(requirement_part);
3882    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3883        marker_part.or(marker_override),
3884        default_scope,
3885        default_optional,
3886    );
3887    let purl = build_python_dependency_purl(&name, None)?;
3888
3889    let is_pinned = requirement
3890        .as_deref()
3891        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3892    let purl = if is_pinned {
3893        requirement
3894            .as_deref()
3895            .map(|req| req.trim_start_matches('='))
3896            .and_then(|version| build_python_dependency_purl(&name, Some(version)))
3897            .unwrap_or(purl)
3898    } else {
3899        purl
3900    };
3901
3902    let mut extra_data = HashMap::new();
3903    extra_data.extend(marker_data);
3904    if let Some(marker) = marker {
3905        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3906    }
3907
3908    Some(Dependency {
3909        purl: Some(purl),
3910        extracted_requirement: requirement,
3911        scope: Some(scope),
3912        is_runtime: Some(true),
3913        is_optional: Some(is_optional),
3914        is_pinned: Some(is_pinned),
3915        is_direct: Some(true),
3916        resolved_package: None,
3917        extra_data: if extra_data.is_empty() {
3918            None
3919        } else {
3920            Some(extra_data)
3921        },
3922    })
3923}
3924
3925fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3926    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3927    let trimmed = requirement_part.trim();
3928    let mut remainder = trimmed[name.len()..].trim();
3929
3930    if let Some(stripped) = remainder.strip_prefix('[')
3931        && let Some(end_idx) = stripped.find(']')
3932    {
3933        remainder = stripped[end_idx + 1..].trim();
3934    }
3935
3936    let remainder = remainder
3937        .strip_prefix('(')
3938        .and_then(|value| value.strip_suffix(')'))
3939        .unwrap_or(remainder)
3940        .trim();
3941
3942    if remainder.is_empty() {
3943        return None;
3944    }
3945
3946    let mut specifiers: Vec<String> = remainder
3947        .split(',')
3948        .map(|specifier| specifier.trim().replace(' ', ""))
3949        .filter(|specifier| !specifier.is_empty())
3950        .collect();
3951    specifiers.sort();
3952    Some(specifiers.join(","))
3953}
3954
/// Percent-encodes every `*` in a version string as `%2A` so wildcard
/// versions can be embedded in a purl.
fn encode_python_dependency_purl_version(version: &str) -> String {
    let pieces: Vec<&str> = version.split('*').collect();
    pieces.join("%2A")
}
3958
3959fn build_python_dependency_purl(name: &str, version: Option<&str>) -> Option<String> {
3960    let normalized_name = normalize_python_dependency_name(name);
3961
3962    PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &normalized_name)
3963        .ok()
3964        .map(|_| match version {
3965            Some(version) => {
3966                format!(
3967                    "pkg:pypi/{normalized_name}@{}",
3968                    encode_python_dependency_purl_version(version)
3969                )
3970            }
3971            None => format!("pkg:pypi/{normalized_name}"),
3972        })
3973}
3974
/// Normalizes a package name for use in a purl: surrounding whitespace is
/// removed, ASCII letters are lowercased and `_` becomes `-`.
fn normalize_python_dependency_name(name: &str) -> String {
    name.trim()
        .chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
3978
3979fn parse_rfc822_marker(
3980    marker_part: Option<&str>,
3981    default_scope: &str,
3982    default_optional: bool,
3983) -> (
3984    String,
3985    bool,
3986    Option<String>,
3987    HashMap<String, serde_json::Value>,
3988) {
3989    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
3990        return (
3991            default_scope.to_string(),
3992            default_optional,
3993            None,
3994            HashMap::new(),
3995        );
3996    };
3997
3998    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
3999        .expect("extra marker regex should compile");
4000    let mut extra_data = HashMap::new();
4001
4002    if let Some(python_version) = extract_marker_field(marker, "python_version") {
4003        extra_data.insert(
4004            "python_version".to_string(),
4005            serde_json::Value::String(python_version),
4006        );
4007    }
4008    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
4009        extra_data.insert(
4010            "sys_platform".to_string(),
4011            serde_json::Value::String(sys_platform),
4012        );
4013    }
4014
4015    if let Some(captures) = extra_re.captures(marker)
4016        && let Some(scope) = captures.get(1)
4017    {
4018        return (
4019            scope.as_str().to_string(),
4020            true,
4021            Some(marker.trim().to_string()),
4022            extra_data,
4023        );
4024    }
4025
4026    (
4027        default_scope.to_string(),
4028        default_optional,
4029        Some(marker.trim().to_string()),
4030        extra_data,
4031    )
4032}
4033
4034fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
4035    let re = Regex::new(&format!(
4036        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
4037        field
4038    ))
4039    .ok()?;
4040    let captures = re.captures(marker)?;
4041    let operator = captures.get(1)?.as_str();
4042    let value = captures.get(2)?.as_str();
4043    Some(format!("{} {}", operator, value))
4044}
4045
4046fn parse_requires_txt(content: &str) -> Vec<Dependency> {
4047    let mut dependencies = Vec::new();
4048    let mut current_scope = "install".to_string();
4049    let mut current_optional = false;
4050    let mut current_marker: Option<String> = None;
4051
4052    for line in content.lines() {
4053        let trimmed = line.trim();
4054        if trimmed.is_empty() || trimmed.starts_with('#') {
4055            continue;
4056        }
4057
4058        if trimmed.starts_with('[') && trimmed.ends_with(']') {
4059            let inner = &trimmed[1..trimmed.len() - 1];
4060            if let Some(rest) = inner.strip_prefix(':') {
4061                current_scope = "install".to_string();
4062                current_optional = false;
4063                current_marker = Some(rest.trim().to_string());
4064            } else if let Some((scope, marker)) = inner.split_once(':') {
4065                current_scope = scope.trim().to_string();
4066                current_optional = true;
4067                current_marker = Some(marker.trim().to_string());
4068            } else {
4069                current_scope = inner.trim().to_string();
4070                current_optional = true;
4071                current_marker = None;
4072            }
4073            continue;
4074        }
4075
4076        if let Some(dependency) = build_python_dependency(
4077            trimmed,
4078            &current_scope,
4079            current_optional,
4080            current_marker.as_deref(),
4081        ) {
4082            dependencies.push(dependency);
4083        }
4084    }
4085
4086    dependencies
4087}
4088
/// Reports whether any classifier equals `Private :: Do Not Upload`,
/// ignoring ASCII case.
fn has_private_classifier(classifiers: &[String]) -> bool {
    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case("Private :: Do Not Upload") {
            return true;
        }
    }
    false
}
4094
4095fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
4096    let name = name?;
4097    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
4098    if let Some(version) = version {
4099        package_url.with_version(version).ok()?;
4100    }
4101    Some(package_url.to_string())
4102}
4103
4104fn extract_from_setup_py_regex(content: &str) -> PackageData {
4105    let name = extract_setup_value(content, "name");
4106    let version = extract_setup_value(content, "version");
4107    let license_expression = extract_setup_value(content, "license");
4108
4109    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4110        normalize_spdx_declared_license(license_expression.as_deref());
4111    let extracted_license_statement = license_expression.clone();
4112
4113    let dependencies = extract_setup_py_dependencies(content);
4114    let homepage_url = extract_setup_value(content, "url");
4115    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
4116
4117    PackageData {
4118        package_type: Some(PythonParser::PACKAGE_TYPE),
4119        namespace: None,
4120        name,
4121        version,
4122        qualifiers: None,
4123        subpath: None,
4124        primary_language: Some("Python".to_string()),
4125        description: None,
4126        release_date: None,
4127        parties: Vec::new(),
4128        keywords: Vec::new(),
4129        homepage_url,
4130        download_url: None,
4131        size: None,
4132        sha1: None,
4133        md5: None,
4134        sha256: None,
4135        sha512: None,
4136        bug_tracking_url: None,
4137        code_view_url: None,
4138        vcs_url: None,
4139        copyright: None,
4140        holder: None,
4141        declared_license_expression,
4142        declared_license_expression_spdx,
4143        license_detections,
4144        other_license_expression: None,
4145        other_license_expression_spdx: None,
4146        other_license_detections: Vec::new(),
4147        extracted_license_statement,
4148        notice_text: None,
4149        source_packages: Vec::new(),
4150        file_references: Vec::new(),
4151        is_private: false,
4152        is_virtual: false,
4153        extra_data: None,
4154        dependencies,
4155        repository_homepage_url: None,
4156        repository_download_url: None,
4157        api_data_url: None,
4158        datasource_id: Some(DatasourceId::PypiSetupPy),
4159        purl,
4160    }
4161}
4162
4163fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
4164    crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
4165}
4166
4167fn extract_from_pypi_json(path: &Path) -> PackageData {
4168    let default = PackageData {
4169        package_type: Some(PythonParser::PACKAGE_TYPE),
4170        datasource_id: Some(DatasourceId::PypiJson),
4171        ..Default::default()
4172    };
4173
4174    let content = match read_file_to_string(path) {
4175        Ok(content) => content,
4176        Err(error) => {
4177            warn!("Failed to read pypi.json at {:?}: {}", path, error);
4178            return default;
4179        }
4180    };
4181
4182    let root: serde_json::Value = match serde_json::from_str(&content) {
4183        Ok(value) => value,
4184        Err(error) => {
4185            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
4186            return default;
4187        }
4188    };
4189
4190    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
4191        warn!("No info object found in pypi.json at {:?}", path);
4192        return default;
4193    };
4194
4195    let name = info
4196        .get("name")
4197        .and_then(|value| value.as_str())
4198        .map(ToOwned::to_owned);
4199    let version = info
4200        .get("version")
4201        .and_then(|value| value.as_str())
4202        .map(ToOwned::to_owned);
4203    let summary = info
4204        .get("summary")
4205        .and_then(|value| value.as_str())
4206        .map(ToOwned::to_owned);
4207    let description = info
4208        .get("description")
4209        .and_then(|value| value.as_str())
4210        .filter(|value| !value.trim().is_empty())
4211        .map(ToOwned::to_owned)
4212        .or(summary);
4213    let mut homepage_url = info
4214        .get("home_page")
4215        .and_then(|value| value.as_str())
4216        .map(ToOwned::to_owned);
4217    let author = info
4218        .get("author")
4219        .and_then(|value| value.as_str())
4220        .filter(|value| !value.trim().is_empty())
4221        .map(ToOwned::to_owned);
4222    let author_email = info
4223        .get("author_email")
4224        .and_then(|value| value.as_str())
4225        .filter(|value| !value.trim().is_empty())
4226        .map(ToOwned::to_owned);
4227    let license = info
4228        .get("license")
4229        .and_then(|value| value.as_str())
4230        .filter(|value| !value.trim().is_empty())
4231        .map(ToOwned::to_owned);
4232    let keywords = parse_setup_cfg_keywords(
4233        info.get("keywords")
4234            .and_then(|value| value.as_str())
4235            .map(ToOwned::to_owned),
4236    );
4237    let classifiers = info
4238        .get("classifiers")
4239        .and_then(|value| value.as_array())
4240        .map(|values| {
4241            values
4242                .iter()
4243                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
4244                .collect::<Vec<_>>()
4245        })
4246        .unwrap_or_default();
4247
4248    let mut parties = Vec::new();
4249    if author.is_some() || author_email.is_some() {
4250        parties.push(Party {
4251            r#type: Some("person".to_string()),
4252            role: Some("author".to_string()),
4253            name: author,
4254            email: author_email,
4255            url: None,
4256            organization: None,
4257            organization_url: None,
4258            timezone: None,
4259        });
4260    }
4261
4262    let mut bug_tracking_url = None;
4263    let mut code_view_url = None;
4264    let mut vcs_url = None;
4265    let mut extra_data = HashMap::new();
4266
4267    let parsed_project_urls = info
4268        .get("project_urls")
4269        .and_then(|value| value.as_object())
4270        .map(|map| {
4271            let mut pairs: Vec<(String, String)> = map
4272                .iter()
4273                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
4274                .collect();
4275            pairs.sort_by(|left, right| left.0.cmp(&right.0));
4276            pairs
4277        })
4278        .unwrap_or_default();
4279
4280    apply_project_url_mappings(
4281        &parsed_project_urls,
4282        &mut homepage_url,
4283        &mut bug_tracking_url,
4284        &mut code_view_url,
4285        &mut vcs_url,
4286        &mut extra_data,
4287    );
4288
4289    let (download_url, size, sha256) = root
4290        .get("urls")
4291        .and_then(|value| value.as_array())
4292        .map(|urls| select_pypi_json_artifact(urls))
4293        .unwrap_or((None, None, None));
4294
4295    let sha256 = sha256.and_then(|h| Sha256Digest::from_hex(&h).ok());
4296
4297    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4298        normalize_spdx_declared_license(license.as_deref());
4299    let dependencies = info
4300        .get("requires_dist")
4301        .and_then(|value| value.as_array())
4302        .map(|entries| {
4303            entries
4304                .iter()
4305                .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4306                .collect::<Vec<_>>()
4307        })
4308        .map(|entries| extract_requires_dist_dependencies(&entries))
4309        .unwrap_or_default();
4310
4311    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
4312        build_pypi_urls(name.as_deref(), version.as_deref());
4313
4314    PackageData {
4315        package_type: Some(PythonParser::PACKAGE_TYPE),
4316        namespace: None,
4317        name,
4318        version,
4319        qualifiers: None,
4320        subpath: None,
4321        primary_language: None,
4322        description,
4323        release_date: None,
4324        parties,
4325        keywords,
4326        homepage_url: homepage_url.or(repository_homepage_url.clone()),
4327        download_url,
4328        size,
4329        sha1: None,
4330        md5: None,
4331        sha256,
4332        sha512: None,
4333        bug_tracking_url,
4334        code_view_url,
4335        vcs_url,
4336        copyright: None,
4337        holder: None,
4338        declared_license_expression,
4339        declared_license_expression_spdx,
4340        license_detections,
4341        other_license_expression: None,
4342        other_license_expression_spdx: None,
4343        other_license_detections: Vec::new(),
4344        extracted_license_statement: license,
4345        notice_text: None,
4346        source_packages: Vec::new(),
4347        file_references: Vec::new(),
4348        is_private: has_private_classifier(&classifiers),
4349        is_virtual: false,
4350        extra_data: if extra_data.is_empty() {
4351            None
4352        } else {
4353            Some(extra_data)
4354        },
4355        dependencies,
4356        repository_homepage_url,
4357        repository_download_url,
4358        api_data_url,
4359        datasource_id: Some(DatasourceId::PypiJson),
4360        purl,
4361    }
4362}
4363
4364fn select_pypi_json_artifact(
4365    urls: &[serde_json::Value],
4366) -> (Option<String>, Option<u64>, Option<String>) {
4367    let selected = urls
4368        .iter()
4369        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4370        .or_else(|| urls.first());
4371
4372    let Some(entry) = selected else {
4373        return (None, None, None);
4374    };
4375
4376    let download_url = entry
4377        .get("url")
4378        .and_then(|value| value.as_str())
4379        .map(ToOwned::to_owned);
4380    let size = entry.get("size").and_then(|value| value.as_u64());
4381    let sha256 = entry
4382        .get("digests")
4383        .and_then(|value| value.as_object())
4384        .and_then(|digests| digests.get("sha256"))
4385        .and_then(|value| value.as_str())
4386        .map(ToOwned::to_owned);
4387
4388    (download_url, size, sha256)
4389}
4390
4391fn extract_from_pip_inspect(path: &Path) -> PackageData {
4392    let content = match read_file_to_string(path) {
4393        Ok(content) => content,
4394        Err(e) => {
4395            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4396            return default_package_data(path);
4397        }
4398    };
4399
4400    let root: serde_json::Value = match serde_json::from_str(&content) {
4401        Ok(value) => value,
4402        Err(e) => {
4403            warn!(
4404                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4405                path, e
4406            );
4407            return default_package_data(path);
4408        }
4409    };
4410
4411    let installed = match root.get("installed").and_then(|v| v.as_array()) {
4412        Some(arr) => arr,
4413        None => {
4414            warn!(
4415                "No 'installed' array found in pip-inspect.deplock at {:?}",
4416                path
4417            );
4418            return default_package_data(path);
4419        }
4420    };
4421
4422    let pip_version = root
4423        .get("pip_version")
4424        .and_then(|v| v.as_str())
4425        .map(String::from);
4426    let inspect_version = root
4427        .get("version")
4428        .and_then(|v| v.as_str())
4429        .map(String::from);
4430
4431    let mut main_package: Option<PackageData> = None;
4432    let mut dependencies: Vec<Dependency> = Vec::new();
4433
4434    for package_entry in installed {
4435        let metadata = match package_entry.get("metadata") {
4436            Some(m) => m,
4437            None => continue,
4438        };
4439
4440        let is_requested = package_entry
4441            .get("requested")
4442            .and_then(|v| v.as_bool())
4443            .unwrap_or(false);
4444        let has_direct_url = package_entry.get("direct_url").is_some();
4445
4446        let name = metadata
4447            .get("name")
4448            .and_then(|v| v.as_str())
4449            .map(String::from);
4450        let version = metadata
4451            .get("version")
4452            .and_then(|v| v.as_str())
4453            .map(String::from);
4454        let summary = metadata
4455            .get("summary")
4456            .and_then(|v| v.as_str())
4457            .map(String::from);
4458        let home_page = metadata
4459            .get("home_page")
4460            .and_then(|v| v.as_str())
4461            .map(String::from);
4462        let author = metadata
4463            .get("author")
4464            .and_then(|v| v.as_str())
4465            .map(String::from);
4466        let author_email = metadata
4467            .get("author_email")
4468            .and_then(|v| v.as_str())
4469            .map(String::from);
4470        let license = metadata
4471            .get("license")
4472            .and_then(|v| v.as_str())
4473            .map(String::from);
4474        let description = metadata
4475            .get("description")
4476            .and_then(|v| v.as_str())
4477            .map(String::from);
4478        let keywords = metadata
4479            .get("keywords")
4480            .and_then(|v| v.as_array())
4481            .map(|arr| {
4482                arr.iter()
4483                    .filter_map(|k| k.as_str().map(String::from))
4484                    .collect::<Vec<_>>()
4485            })
4486            .unwrap_or_default();
4487
4488        let mut parties = Vec::new();
4489        if author.is_some() || author_email.is_some() {
4490            parties.push(Party {
4491                r#type: Some("person".to_string()),
4492                role: Some("author".to_string()),
4493                name: author,
4494                email: author_email,
4495                url: None,
4496                organization: None,
4497                organization_url: None,
4498                timezone: None,
4499            });
4500        }
4501
4502        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4503            normalize_spdx_declared_license(license.as_deref());
4504        let extracted_license_statement = license.clone();
4505        let requires_dist = metadata
4506            .get("requires_dist")
4507            .and_then(|v| v.as_array())
4508            .map(|entries| {
4509                entries
4510                    .iter()
4511                    .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4512                    .collect::<Vec<_>>()
4513            })
4514            .unwrap_or_default();
4515        let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4516
4517        let purl = name.as_ref().and_then(|n| {
4518            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4519            if let Some(v) = &version {
4520                package_url.with_version(v).ok()?;
4521            }
4522            Some(package_url.to_string())
4523        });
4524
4525        if is_requested && has_direct_url {
4526            let mut extra_data = HashMap::new();
4527            if let Some(pv) = &pip_version {
4528                extra_data.insert(
4529                    "pip_version".to_string(),
4530                    serde_json::Value::String(pv.clone()),
4531                );
4532            }
4533            if let Some(iv) = &inspect_version {
4534                extra_data.insert(
4535                    "inspect_version".to_string(),
4536                    serde_json::Value::String(iv.clone()),
4537                );
4538            }
4539
4540            main_package = Some(PackageData {
4541                package_type: Some(PythonParser::PACKAGE_TYPE),
4542                namespace: None,
4543                name,
4544                version,
4545                qualifiers: None,
4546                subpath: None,
4547                primary_language: Some("Python".to_string()),
4548                description: description.or(summary),
4549                release_date: None,
4550                parties,
4551                keywords,
4552                homepage_url: home_page,
4553                download_url: None,
4554                size: None,
4555                sha1: None,
4556                md5: None,
4557                sha256: None,
4558                sha512: None,
4559                bug_tracking_url: None,
4560                code_view_url: None,
4561                vcs_url: None,
4562                copyright: None,
4563                holder: None,
4564                declared_license_expression,
4565                declared_license_expression_spdx,
4566                license_detections,
4567                other_license_expression: None,
4568                other_license_expression_spdx: None,
4569                other_license_detections: Vec::new(),
4570                extracted_license_statement,
4571                notice_text: None,
4572                source_packages: Vec::new(),
4573                file_references: Vec::new(),
4574                is_private: false,
4575                is_virtual: true,
4576                extra_data: if extra_data.is_empty() {
4577                    None
4578                } else {
4579                    Some(extra_data)
4580                },
4581                dependencies: parsed_dependencies,
4582                repository_homepage_url: None,
4583                repository_download_url: None,
4584                api_data_url: None,
4585                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4586                purl,
4587            });
4588        } else {
4589            let resolved_package = PackageData {
4590                package_type: Some(PythonParser::PACKAGE_TYPE),
4591                namespace: None,
4592                name: name.clone(),
4593                version: version.clone(),
4594                qualifiers: None,
4595                subpath: None,
4596                primary_language: Some("Python".to_string()),
4597                description: description.or(summary),
4598                release_date: None,
4599                parties,
4600                keywords,
4601                homepage_url: home_page,
4602                download_url: None,
4603                size: None,
4604                sha1: None,
4605                md5: None,
4606                sha256: None,
4607                sha512: None,
4608                bug_tracking_url: None,
4609                code_view_url: None,
4610                vcs_url: None,
4611                copyright: None,
4612                holder: None,
4613                declared_license_expression,
4614                declared_license_expression_spdx,
4615                license_detections,
4616                other_license_expression: None,
4617                other_license_expression_spdx: None,
4618                other_license_detections: Vec::new(),
4619                extracted_license_statement,
4620                notice_text: None,
4621                source_packages: Vec::new(),
4622                file_references: Vec::new(),
4623                is_private: false,
4624                is_virtual: true,
4625                extra_data: None,
4626                dependencies: parsed_dependencies,
4627                repository_homepage_url: None,
4628                repository_download_url: None,
4629                api_data_url: None,
4630                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4631                purl: purl.clone(),
4632            };
4633
4634            let resolved = package_data_to_resolved(&resolved_package);
4635            dependencies.push(Dependency {
4636                purl,
4637                extracted_requirement: None,
4638                scope: None,
4639                is_runtime: Some(true),
4640                is_optional: Some(false),
4641                is_pinned: Some(true),
4642                is_direct: Some(is_requested),
4643                resolved_package: Some(Box::new(resolved)),
4644                extra_data: None,
4645            });
4646        }
4647    }
4648
4649    if let Some(mut main_pkg) = main_package {
4650        let direct_requirement_purls: HashSet<String> = main_pkg
4651            .dependencies
4652            .iter()
4653            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4654            .collect();
4655
4656        let resolved_requirement_purls: HashSet<String> = dependencies
4657            .iter()
4658            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4659            .collect();
4660
4661        let unresolved_dependencies = main_pkg
4662            .dependencies
4663            .iter()
4664            .filter(|dep| {
4665                dep.purl.as_ref().is_some_and(|purl| {
4666                    !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4667                })
4668            })
4669            .cloned()
4670            .collect::<Vec<_>>();
4671
4672        for dependency in &mut dependencies {
4673            if dependency
4674                .purl
4675                .as_ref()
4676                .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4677            {
4678                dependency.is_direct = Some(true);
4679            }
4680        }
4681
4682        main_pkg.dependencies = dependencies;
4683        main_pkg.dependencies.extend(unresolved_dependencies);
4684        main_pkg
4685    } else {
4686        default_package_data(path)
4687    }
4688}
4689
/// Returns the part of `purl` before its first `@`, or the whole string when
/// it contains no `@`.
fn base_dependency_purl(purl: &str) -> String {
    match purl.find('@') {
        Some(at) => purl[..at].to_string(),
        None => purl.to_string(),
    }
}
4695
/// Parsed INI data: section name -> option name -> list of value lines.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4697
4698fn extract_from_setup_cfg(path: &Path) -> PackageData {
4699    let content = match read_file_to_string(path) {
4700        Ok(content) => content,
4701        Err(e) => {
4702            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4703            return default_package_data(path);
4704        }
4705    };
4706
4707    let sections = parse_setup_cfg(&content);
4708    let name = get_ini_value(&sections, "metadata", "name");
4709    let version = get_ini_value(&sections, "metadata", "version");
4710    let description = get_ini_value(&sections, "metadata", "description");
4711    let author = get_ini_value(&sections, "metadata", "author");
4712    let author_email = get_ini_value(&sections, "metadata", "author_email");
4713    let maintainer = get_ini_value(&sections, "metadata", "maintainer");
4714    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
4715    let license = get_ini_value(&sections, "metadata", "license");
4716    let mut homepage_url = get_ini_value(&sections, "metadata", "url");
4717    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
4718    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
4719    let python_requires = get_ini_value(&sections, "options", "python_requires");
4720    let parsed_project_urls =
4721        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
4722    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4723    let mut extra_data = HashMap::new();
4724
4725    let mut parties = Vec::new();
4726    if author.is_some() || author_email.is_some() {
4727        parties.push(Party {
4728            r#type: Some("person".to_string()),
4729            role: Some("author".to_string()),
4730            name: author,
4731            email: author_email,
4732            url: None,
4733            organization: None,
4734            organization_url: None,
4735            timezone: None,
4736        });
4737    }
4738
4739    if maintainer.is_some() || maintainer_email.is_some() {
4740        parties.push(Party {
4741            r#type: Some("person".to_string()),
4742            role: Some("maintainer".to_string()),
4743            name: maintainer,
4744            email: maintainer_email,
4745            url: None,
4746            organization: None,
4747            organization_url: None,
4748            timezone: None,
4749        });
4750    }
4751
4752    let declared_license_expression = None;
4753    let declared_license_expression_spdx = None;
4754    let license_detections = Vec::new();
4755    let extracted_license_statement = license.clone();
4756
4757    let dependencies = extract_setup_cfg_dependencies(&sections);
4758
4759    if let Some(value) = python_requires {
4760        extra_data.insert(
4761            "python_requires".to_string(),
4762            serde_json::Value::String(value),
4763        );
4764    }
4765
4766    apply_project_url_mappings(
4767        &parsed_project_urls,
4768        &mut homepage_url,
4769        &mut bug_tracking_url,
4770        &mut code_view_url,
4771        &mut vcs_url,
4772        &mut extra_data,
4773    );
4774
4775    let extra_data = if extra_data.is_empty() {
4776        None
4777    } else {
4778        Some(extra_data)
4779    };
4780
4781    let purl = name.as_ref().and_then(|n| {
4782        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4783        if let Some(v) = &version {
4784            package_url.with_version(v).ok()?;
4785        }
4786        Some(package_url.to_string())
4787    });
4788
4789    PackageData {
4790        package_type: Some(PythonParser::PACKAGE_TYPE),
4791        namespace: None,
4792        name,
4793        version,
4794        qualifiers: None,
4795        subpath: None,
4796        primary_language: Some("Python".to_string()),
4797        description,
4798        release_date: None,
4799        parties,
4800        keywords,
4801        homepage_url,
4802        download_url: None,
4803        size: None,
4804        sha1: None,
4805        md5: None,
4806        sha256: None,
4807        sha512: None,
4808        bug_tracking_url,
4809        code_view_url,
4810        vcs_url,
4811        copyright: None,
4812        holder: None,
4813        declared_license_expression,
4814        declared_license_expression_spdx,
4815        license_detections,
4816        other_license_expression: None,
4817        other_license_expression_spdx: None,
4818        other_license_detections: Vec::new(),
4819        extracted_license_statement,
4820        notice_text: None,
4821        source_packages: Vec::new(),
4822        file_references: Vec::new(),
4823        is_private: has_private_classifier(&classifiers),
4824        is_virtual: false,
4825        extra_data,
4826        dependencies,
4827        repository_homepage_url: None,
4828        repository_download_url: None,
4829        api_data_url: None,
4830        datasource_id: Some(DatasourceId::PypiSetupCfg),
4831        purl,
4832    }
4833}
4834
/// Splits a comma-separated keyword string into trimmed, non-empty keywords.
///
/// Returns an empty vector when `value` is `None`.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut keywords = Vec::new();
    if let Some(raw) = value {
        for piece in raw.split(',') {
            let keyword = piece.trim();
            if !keyword.is_empty() {
                keywords.push(keyword.to_owned());
            }
        }
    }
    keywords
}
4847
/// Parses `label = url` entries into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries
/// without `=`, or with an empty label or URL, are skipped.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();
    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };
        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }
    pairs
}
4863
4864fn apply_project_url_mappings(
4865    parsed_urls: &[(String, String)],
4866    homepage_url: &mut Option<String>,
4867    bug_tracking_url: &mut Option<String>,
4868    code_view_url: &mut Option<String>,
4869    vcs_url: &mut Option<String>,
4870    extra_data: &mut HashMap<String, serde_json::Value>,
4871) {
4872    for (label, url) in parsed_urls {
4873        let label_lower = label.to_lowercase();
4874
4875        if bug_tracking_url.is_none()
4876            && matches!(
4877                label_lower.as_str(),
4878                "tracker"
4879                    | "bug reports"
4880                    | "bug tracker"
4881                    | "issues"
4882                    | "issue tracker"
4883                    | "github: issues"
4884            )
4885        {
4886            *bug_tracking_url = Some(url.clone());
4887        } else if code_view_url.is_none()
4888            && matches!(label_lower.as_str(), "source" | "source code" | "code")
4889        {
4890            *code_view_url = Some(url.clone());
4891        } else if vcs_url.is_none()
4892            && matches!(
4893                label_lower.as_str(),
4894                "github" | "gitlab" | "github: repo" | "repository"
4895            )
4896        {
4897            *vcs_url = Some(url.clone());
4898        } else if homepage_url.is_none()
4899            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4900        {
4901            *homepage_url = Some(url.clone());
4902        } else if label_lower == "changelog" {
4903            extra_data.insert(
4904                "changelog_url".to_string(),
4905                serde_json::Value::String(url.clone()),
4906            );
4907        }
4908    }
4909
4910    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4911        .iter()
4912        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4913        .collect();
4914
4915    if !project_urls_json.is_empty() {
4916        extra_data.insert(
4917            "project_urls".to_string(),
4918            serde_json::Value::Object(project_urls_json),
4919        );
4920    }
4921}
4922
4923fn parse_setup_cfg(content: &str) -> IniSections {
4924    let mut sections: IniSections = HashMap::new();
4925    let mut current_section: Option<String> = None;
4926    let mut current_key: Option<String> = None;
4927
4928    for raw_line in content.lines() {
4929        let line = raw_line.trim_end_matches('\r');
4930        let trimmed = line.trim();
4931        if trimmed.is_empty() {
4932            continue;
4933        }
4934
4935        let stripped = line.trim_start();
4936        if stripped.starts_with('#') || stripped.starts_with(';') {
4937            continue;
4938        }
4939
4940        if stripped.starts_with('[') && stripped.ends_with(']') {
4941            let section_name = stripped
4942                .trim_start_matches('[')
4943                .trim_end_matches(']')
4944                .trim()
4945                .to_ascii_lowercase();
4946            current_section = if section_name.is_empty() {
4947                None
4948            } else {
4949                Some(section_name)
4950            };
4951            current_key = None;
4952            continue;
4953        }
4954
4955        if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
4956            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
4957                let value = stripped.trim();
4958                if !value.is_empty() {
4959                    sections
4960                        .entry(section.clone())
4961                        .or_default()
4962                        .entry(key.clone())
4963                        .or_default()
4964                        .push(value.to_string());
4965                }
4966            }
4967            continue;
4968        }
4969
4970        if let Some((key, value)) = stripped.split_once('=')
4971            && let Some(section) = current_section.as_ref()
4972        {
4973            let key_name = key.trim().to_ascii_lowercase();
4974            let value_trimmed = value.trim();
4975            let entry = sections
4976                .entry(section.clone())
4977                .or_default()
4978                .entry(key_name.clone())
4979                .or_default();
4980            if !value_trimmed.is_empty() {
4981                entry.push(value_trimmed.to_string());
4982            }
4983            current_key = Some(key_name);
4984        }
4985    }
4986
4987    sections
4988}
4989
4990fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
4991    sections
4992        .get(&section.to_ascii_lowercase())
4993        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4994        .and_then(|entries| entries.first())
4995        .map(|value| value.trim().to_string())
4996}
4997
4998fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
4999    sections
5000        .get(&section.to_ascii_lowercase())
5001        .and_then(|values| values.get(&key.to_ascii_lowercase()))
5002        .cloned()
5003        .unwrap_or_default()
5004}
5005
5006fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
5007    let mut dependencies = Vec::new();
5008
5009    for (sub_section, scope) in [
5010        ("install_requires", "install"),
5011        ("tests_require", "test"),
5012        ("setup_requires", "setup"),
5013    ] {
5014        let reqs = get_ini_values(sections, "options", sub_section);
5015        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
5016    }
5017
5018    if let Some(extras) = sections.get("options.extras_require") {
5019        let mut extra_items: Vec<_> = extras.iter().collect();
5020        extra_items.sort_by_key(|(name, _)| *name);
5021        for (extra_name, reqs) in extra_items {
5022            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
5023        }
5024    }
5025
5026    dependencies
5027}
5028
5029fn parse_setup_cfg_requirements(
5030    reqs: &[String],
5031    scope: &str,
5032    is_optional: bool,
5033) -> Vec<Dependency> {
5034    reqs.iter()
5035        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
5036        .collect()
5037}
5038
5039fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
5040    let trimmed = req.trim();
5041    if trimmed.is_empty() || trimmed.starts_with('#') {
5042        return None;
5043    }
5044
5045    let name = extract_setup_cfg_dependency_name(trimmed)?;
5046    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5047
5048    Some(Dependency {
5049        purl: Some(purl.to_string()),
5050        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
5051        scope: Some(scope.to_string()),
5052        is_runtime: Some(true),
5053        is_optional: Some(is_optional),
5054        is_pinned: Some(false),
5055        is_direct: Some(true),
5056        resolved_package: None,
5057        extra_data: None,
5058    })
5059}
5060
/// Extracts the package name from a requirement string.
///
/// The name is the leading run of characters up to the first whitespace or
/// the first of `<`, `>`, `=`, `!`, `~`, `;`, `[`, `(`, `@`. The last two
/// cover the PEP 508 forms `name(>=1.0)` and `name@https://...` written
/// without a separating space.
///
/// Returns `None` when the input is blank or starts with a delimiter.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();

    let end = trimmed
        .find(|c: char| {
            c.is_whitespace() || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    // `trimmed` has no leading whitespace and `end` stops at the first
    // whitespace or delimiter, so the slice needs no further trimming.
    let name = &trimmed[..end];
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
5077
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    let mut compact = String::with_capacity(req.len());
    for ch in req.chars() {
        if !ch.is_whitespace() {
            compact.push(ch);
        }
    }
    compact
}
5081
/// Extracts the quoted string assigned to `key` from `setup.py` source text.
///
/// Looks for `key="..."` / `key='...'` with an optional single space on either
/// side of `=`. Double-quoted forms are tried before single-quoted ones, and
/// within each quote style the spacing variants are tried in the order `=`,
/// ` =`, `= `, ` = `.
///
/// A match is only accepted when `key` is not the tail of a longer identifier
/// (so looking up `url` does not pick up `download_url=`), and the value ends
/// at the next occurrence of the *same* quote character that opened it (so
/// `"it's"` yields `it's`). Escaped quotes are not interpreted.
///
/// Returns `None` when no such assignment is found.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    const SEPARATORS: [&str; 4] = ["=", " =", "= ", " = "];

    for quote in ['"', '\''] {
        for separator in SEPARATORS {
            let pattern = format!("{}{}{}", key, separator, quote);

            for (start, _) in content.match_indices(pattern.as_str()) {
                // Reject matches where `key` is only the suffix of another name.
                let inside_identifier = content[..start]
                    .chars()
                    .next_back()
                    .is_some_and(|c| c.is_alphanumeric() || c == '_');
                if inside_identifier {
                    continue;
                }

                let remaining = &content[start + pattern.len()..];
                if let Some(end) = remaining.find(quote) {
                    return Some(remaining[..end].to_string());
                }
            }
        }
    }

    None
}
5107
5108fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
5109    let mut dependencies = Vec::new();
5110
5111    if let Some(tests_deps) = extract_tests_require(content) {
5112        dependencies.extend(tests_deps);
5113    }
5114
5115    if let Some(extras_deps) = extract_extras_require(content) {
5116        dependencies.extend(extras_deps);
5117    }
5118
5119    dependencies
5120}
5121
5122fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
5123    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
5124    let re = Regex::new(pattern).ok()?;
5125    let captures = re.captures(content)?;
5126    let deps_str = captures.get(1)?.as_str();
5127
5128    let deps = parse_setup_py_dep_list(deps_str, "test", true);
5129    if deps.is_empty() { None } else { Some(deps) }
5130}
5131
5132fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
5133    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
5134    let re = Regex::new(pattern).ok()?;
5135    let captures = re.captures(content)?;
5136    let dict_content = captures.get(1)?.as_str();
5137
5138    let mut all_deps = Vec::new();
5139
5140    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
5141    let entry_re = Regex::new(entry_pattern).ok()?;
5142
5143    for entry_cap in entry_re.captures_iter(dict_content) {
5144        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
5145            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
5146            all_deps.extend(deps);
5147        }
5148    }
5149
5150    if all_deps.is_empty() {
5151        None
5152    } else {
5153        Some(all_deps)
5154    }
5155}
5156
5157fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
5158    let dep_pattern = r#"['"]([^'"]+)['"]"#;
5159    let re = match Regex::new(dep_pattern) {
5160        Ok(r) => r,
5161        Err(_) => return Vec::new(),
5162    };
5163
5164    re.captures_iter(deps_str)
5165        .filter_map(|cap| {
5166            let dep_str = cap.get(1)?.as_str().trim();
5167            if dep_str.is_empty() {
5168                return None;
5169            }
5170
5171            let name = extract_setup_cfg_dependency_name(dep_str)?;
5172            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5173
5174            Some(Dependency {
5175                purl: Some(purl.to_string()),
5176                extracted_requirement: Some(dep_str.to_string()),
5177                scope: Some(scope.to_string()),
5178                is_runtime: Some(true),
5179                is_optional: Some(is_optional),
5180                is_pinned: Some(false),
5181                is_direct: Some(true),
5182                resolved_package: None,
5183                extra_data: None,
5184            })
5185        })
5186        .collect()
5187}
5188
5189/// Reads and parses a TOML file
5190pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
5191    let content = read_file_to_string(path).map_err(|e| e.to_string())?;
5192    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
5193}
5194
5195/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
5196///
5197/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
5198/// Essential for SBOM compliance and package integrity verification.
5199///
5200/// # Returns
5201///
5202/// - `(Some(size), Some(hash))` on success
5203/// - `(None, None)` if file cannot be opened
5204/// - `(Some(size), None)` if hash calculation fails during read
5205fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<Sha256Digest>) {
5206    let mut file = match File::open(path) {
5207        Ok(f) => f,
5208        Err(_) => return (None, None),
5209    };
5210
5211    let metadata = match file.metadata() {
5212        Ok(m) => m,
5213        Err(_) => return (None, None),
5214    };
5215    let size = metadata.len();
5216
5217    let mut hasher = Sha256::new();
5218    let mut buffer = vec![0; 8192];
5219
5220    loop {
5221        match file.read(&mut buffer) {
5222            Ok(0) => break,
5223            Ok(n) => hasher.update(&buffer[..n]),
5224            Err(_) => return (Some(size), None),
5225        }
5226    }
5227
5228    let hash = Sha256Digest::from_bytes(hasher.finalize().into());
5229    (Some(size), Some(hash))
5230}
5231
5232fn default_package_data(path: &Path) -> PackageData {
5233    PackageData {
5234        package_type: Some(PythonParser::PACKAGE_TYPE),
5235        primary_language: Some("Python".to_string()),
5236        datasource_id: infer_python_datasource_id(path),
5237        ..Default::default()
5238    }
5239}
5240
5241fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
5242    let file_name = path.file_name().and_then(|name| name.to_str());
5243
5244    match file_name {
5245        Some("pyproject.toml") => {
5246            if read_toml_file(path)
5247                .ok()
5248                .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
5249                .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
5250                .is_some()
5251            {
5252                Some(DatasourceId::PypiPoetryPyprojectToml)
5253            } else {
5254                Some(DatasourceId::PypiPyprojectToml)
5255            }
5256        }
5257        Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
5258            Some(DatasourceId::PypiSetupPy)
5259        }
5260        Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
5261        Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
5262        Some("METADATA") if is_installed_wheel_metadata_path(path) => {
5263            Some(DatasourceId::PypiWheelMetadata)
5264        }
5265        Some("pypi.json") => Some(DatasourceId::PypiJson),
5266        Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
5267        Some("origin.json") if is_pip_cache_origin_json(path) => {
5268            Some(DatasourceId::PypiPipOriginJson)
5269        }
5270        _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
5271            Some(DatasourceId::PypiSdist)
5272        }
5273        _ if path
5274            .extension()
5275            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
5276        {
5277            Some(DatasourceId::PypiWheel)
5278        }
5279        _ if path
5280            .extension()
5281            .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
5282        {
5283            Some(DatasourceId::PypiEgg)
5284        }
5285        _ => None,
5286    }
5287}
5288
// Invokes `crate::register_parser!` for the Python parser with a
// human-readable description, the file glob patterns, the string `pypi`,
// the string `Python`, and a documentation URL.
// NOTE(review): the meaning of each argument is inferred from its value;
// confirm against the macro definition.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);