Skip to main content

provenant/parsers/
python.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34use crate::models::{
35    DatasourceId, Dependency, FileReference, PackageData, PackageType, Party, Sha256Digest,
36};
37use crate::parser_warn as warn;
38use crate::parsers::utils::{read_file_to_string, split_name_email};
39use base64::Engine;
40use base64::engine::general_purpose::URL_SAFE_NO_PAD;
41use bzip2::read::BzDecoder;
42use csv::ReaderBuilder;
43use flate2::read::GzDecoder;
44use liblzma::read::XzDecoder;
45use packageurl::PackageUrl;
46use regex::Regex;
47use ruff_python_ast as ast;
48use ruff_python_parser::parse_module;
49use serde_json::{Map as JsonMap, Value as JsonValue};
50use sha2::{Digest, Sha256};
51use std::collections::{HashMap, HashSet};
52use std::fs::File;
53use std::io::Read;
54use std::path::{Component, Path, PathBuf};
55use tar::Archive;
56use toml::Value as TomlValue;
57use toml::map::Map as TomlMap;
58use zip::ZipArchive;
59
60use super::PackageParser;
61use super::license_normalization::{
62    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
63    normalize_spdx_expression,
64};
65use super::pep508::parse_pep508_requirement;
66
// Table and key names looked up in pyproject.toml.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_EXTRAS: &str = "extras";
// NOTE(review): presumably also manifest keys; their consumers are not in this
// part of the file.
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

// Limits for setup.py analysis (input size in bytes, AST node count, AST depth).
// NOTE(review): enforced by code outside this part of the file.
const MAX_SETUP_PY_BYTES: usize = 1024 * 1024;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

// Archive safety limits.
/// Cap on the summed declared entry sizes of one archive (100 MiB); also used as
/// the cap on the on-disk size of an sdist archive.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;
/// Cap on the declared size of a single archive entry (50 MiB).
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
/// Highest accepted uncompressed:compressed size ratio (100:1).
const MAX_COMPRESSION_RATIO: f64 = 100.0;

/// Five optional URL strings bundled as one value.
// NOTE(review): the meaning of each slot is defined where the tuple is built,
// which is not visible here.
type ProjectUrls = (
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
);
98
/// Python package parser supporting 11 manifest formats.
///
/// Extracts metadata from Python package files including pyproject.toml, setup.py,
/// setup.cfg, PKG-INFO, METADATA, pip-inspect lockfiles, and .whl/.egg archives.
///
/// The inputs dispatched by `extract_packages` are: `pyproject.toml`, `setup.cfg`,
/// `setup.py` / `*_setup.py`, `PKG-INFO`, `*.dist-info/METADATA`, pip cache
/// `origin.json`, `pypi.json`, `pip-inspect.deplock`, sdist archives, `.whl`
/// archives and `.egg` archives.
///
/// # Security
///
/// setup.py files are parsed using AST analysis rather than code execution to prevent
/// arbitrary code execution during scanning. See `extract_from_setup_py_ast` for details.
pub struct PythonParser;
109
/// Container format of a Python source-distribution archive.
///
/// `detect_python_sdist_archive_format` picks the variant from the lower-cased
/// file-name suffix, and only after the archive passed its PKG-INFO probe.
#[derive(Clone, Copy, Debug)]
enum PythonSdistArchiveFormat {
    /// File name ends with `.tar.gz`.
    TarGz,
    /// File name ends with `.tgz`.
    Tgz,
    /// File name ends with `.tar.bz2`.
    TarBz2,
    /// File name ends with `.tar.xz`.
    TarXz,
    /// File name ends with `.zip`.
    Zip,
}
118
/// A ZIP entry that passed the checks in `collect_validated_zip_entries`.
#[derive(Clone, Debug)]
struct ValidatedZipEntry {
    /// Position of the entry inside the archive (the index given to `by_index_raw`).
    index: usize,
    /// Entry path as returned by `normalize_archive_entry_path`.
    name: String,
}
124
125impl PackageParser for PythonParser {
126    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
127
128    fn extract_packages(path: &Path) -> Vec<PackageData> {
129        vec![
130            if path.file_name().unwrap_or_default() == "pyproject.toml" {
131                extract_from_pyproject_toml(path)
132            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
133                extract_from_setup_cfg(path)
134            } else if is_setup_py_like_path(path) {
135                return extract_setup_py_packages(path);
136            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
137                extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
138            } else if is_installed_wheel_metadata_path(path) {
139                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
140            } else if is_pip_cache_origin_json(path) {
141                extract_from_pip_origin_json(path)
142            } else if path.file_name().unwrap_or_default() == "pypi.json" {
143                extract_from_pypi_json(path)
144            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
145                extract_from_pip_inspect(path)
146            } else if is_python_sdist_archive_path(path) {
147                extract_from_sdist_archive(path)
148            } else if path
149                .extension()
150                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
151            {
152                extract_from_wheel_archive(path)
153            } else if path
154                .extension()
155                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
156            {
157                extract_from_egg_archive(path)
158            } else {
159                default_package_data(path)
160            },
161        ]
162    }
163
164    fn is_match(path: &Path) -> bool {
165        if let Some(filename) = path.file_name()
166            && (filename == "pyproject.toml"
167                || filename == "setup.cfg"
168                || is_setup_py_like_path(path)
169                || filename == "PKG-INFO"
170                || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
171                || filename == "pypi.json"
172                || filename == "pip-inspect.deplock"
173                || is_pip_cache_origin_json(path))
174        {
175            return true;
176        }
177
178        if let Some(extension) = path.extension() {
179            let ext = extension.to_string_lossy().to_lowercase();
180            if (ext == "whl" && is_valid_wheel_archive_path(path))
181                || ext == "egg"
182                || is_python_sdist_archive_path(path)
183            {
184                return true;
185            }
186        }
187
188        false
189    }
190}
191
/// Returns `true` when the final path component is `setup.py` or ends with
/// `_setup.py`. Names that are not valid UTF-8 never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name == "setup.py" || name.ends_with("_setup.py"),
        None => false,
    }
}
197
/// Returns `true` for a file named exactly `METADATA` whose parent directory
/// name ends with `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    if path.file_name().and_then(|name| name.to_str()) != Some("METADATA") {
        return false;
    }

    let parent_dir_name = path
        .parent()
        .and_then(|parent| parent.file_name())
        .and_then(|name| name.to_str());

    match parent_dir_name {
        Some(dir_name) => dir_name.ends_with(".dist-info"),
        None => false,
    }
}
206
/// Values read from the `WHEEL` file that sits next to an installed `METADATA`.
///
/// Built by `parse_installed_wheel_metadata`.
#[derive(Debug, Clone)]
struct InstalledWheelMetadata {
    /// Every `tag` header value; never empty in a constructed value.
    wheel_tags: Vec<String>,
    /// First `wheel-version` header, if present.
    wheel_version: Option<String>,
    /// First `generator` header, if present.
    wheel_generator: Option<String>,
    /// `root-is-purelib` header parsed as a boolean; `None` when the header is
    /// missing or is neither `true` nor `false` (ASCII case-insensitive).
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`.
    compressed_tag: Option<String>,
}
215
216fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
217    let Some(parent) = path.parent() else {
218        return;
219    };
220
221    if !parent
222        .file_name()
223        .and_then(|name| name.to_str())
224        .is_some_and(|name| name.ends_with(".dist-info"))
225    {
226        return;
227    }
228
229    let wheel_path = parent.join("WHEEL");
230    if !wheel_path.exists() {
231        return;
232    }
233
234    let Ok(content) = read_file_to_string(&wheel_path) else {
235        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
236        return;
237    };
238
239    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
240        return;
241    };
242
243    apply_installed_wheel_metadata(package_data, &wheel_metadata);
244}
245
246fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
247    use super::rfc822::{get_header_all, get_header_first};
248
249    let metadata = super::rfc822::parse_rfc822_content(content);
250    let wheel_tags = get_header_all(&metadata.headers, "tag");
251    if wheel_tags.is_empty() {
252        return None;
253    }
254
255    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
256    let wheel_generator = get_header_first(&metadata.headers, "generator");
257    let root_is_purelib =
258        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
259            match value.to_ascii_lowercase().as_str() {
260                "true" => Some(true),
261                "false" => Some(false),
262                _ => None,
263            }
264        });
265
266    let compressed_tag = compress_wheel_tags(&wheel_tags);
267
268    Some(InstalledWheelMetadata {
269        wheel_tags,
270        wheel_version,
271        wheel_generator,
272        root_is_purelib,
273        compressed_tag,
274    })
275}
276
/// Folds the expanded `Tag` values of a wheel into one compressed tag set
/// (`python-abi-platform`, with alternatives inside a component joined by `.`).
///
/// * An empty slice yields `None`.
/// * A single tag is returned verbatim, without any validation.
/// * With several tags, each must split into three `-`-separated parts, and the
///   distinct tags must be exactly the cartesian product of the distinct
///   python, ABI and platform components. Otherwise no compressed form exists
///   and `None` is returned.
///
/// Components keep the order in which they first appear. Repeated tags are
/// ignored, so they never produce a repeated component.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    // Appends `value` unless it is already present; reports whether it was new.
    fn push_unique<'a>(seen: &mut Vec<&'a str>, value: &'a str) -> bool {
        let is_new = !seen.contains(&value);
        if is_new {
            seen.push(value);
        }
        is_new
    }

    match tags {
        [] => return None,
        [only] => return Some(only.clone()),
        _ => {}
    }

    let mut distinct_tags: Vec<&str> = Vec::with_capacity(tags.len());
    let mut python_tags: Vec<&str> = Vec::new();
    let mut abi_tags: Vec<&str> = Vec::new();
    let mut platform_tags: Vec<&str> = Vec::new();

    for tag in tags {
        if !push_unique(&mut distinct_tags, tag.as_str()) {
            continue;
        }

        // The platform part may itself contain '-', so split at most twice.
        let mut parts = tag.splitn(3, '-');
        let python = parts.next()?;
        let abi = parts.next()?;
        let platform = parts.next()?;

        push_unique(&mut python_tags, python);
        push_unique(&mut abi_tags, abi);
        push_unique(&mut platform_tags, platform);
    }

    // Every tag is built from the collected components, so equal counts mean
    // the tag set is the complete product and can be written in compressed form.
    if python_tags.len() * abi_tags.len() * platform_tags.len() != distinct_tags.len() {
        return None;
    }

    Some(format!(
        "{}-{}-{}",
        python_tags.join("."),
        abi_tags.join("."),
        platform_tags.join(".")
    ))
}
314
315fn apply_installed_wheel_metadata(
316    package_data: &mut PackageData,
317    wheel_metadata: &InstalledWheelMetadata,
318) {
319    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
320    extra_data.insert(
321        "wheel_tags".to_string(),
322        JsonValue::Array(
323            wheel_metadata
324                .wheel_tags
325                .iter()
326                .cloned()
327                .map(JsonValue::String)
328                .collect(),
329        ),
330    );
331
332    if let Some(wheel_version) = &wheel_metadata.wheel_version {
333        extra_data.insert(
334            "wheel_version".to_string(),
335            JsonValue::String(wheel_version.clone()),
336        );
337    }
338
339    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
340        extra_data.insert(
341            "wheel_generator".to_string(),
342            JsonValue::String(wheel_generator.clone()),
343        );
344    }
345
346    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
347        extra_data.insert(
348            "root_is_purelib".to_string(),
349            JsonValue::Bool(root_is_purelib),
350        );
351    }
352
353    if let (Some(name), Some(version), Some(extension)) = (
354        package_data.name.as_deref(),
355        package_data.version.as_deref(),
356        wheel_metadata.compressed_tag.as_deref(),
357    ) {
358        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
359    }
360}
361
/// Returns `true` for a file named `origin.json` that has some ancestor
/// directory called `wheels` (ASCII case-insensitive).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    // Final component as UTF-8, or `None` if absent or not valid UTF-8.
    fn utf8_file_name(candidate: &Path) -> Option<&str> {
        candidate.file_name()?.to_str()
    }

    if utf8_file_name(path) != Some("origin.json") {
        return false;
    }

    // `skip(1)` drops the path itself so only real ancestors are inspected.
    path.ancestors()
        .skip(1)
        .filter_map(utf8_file_name)
        .any(|dir_name| dir_name.eq_ignore_ascii_case("wheels"))
}
371
372fn extract_from_pip_origin_json(path: &Path) -> PackageData {
373    let content = match read_file_to_string(path) {
374        Ok(content) => content,
375        Err(e) => {
376            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
377            return default_package_data(path);
378        }
379    };
380
381    let root: JsonValue = match serde_json::from_str(&content) {
382        Ok(root) => root,
383        Err(e) => {
384            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
385            return default_package_data(path);
386        }
387    };
388
389    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
390        warn!("No url found in pip cache origin.json at {:?}", path);
391        return default_package_data(path);
392    };
393
394    let sibling_wheel = find_sibling_cached_wheel(path);
395    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
396        sibling_wheel
397            .as_ref()
398            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
399    });
400
401    let Some((name, version)) = name_version else {
402        warn!(
403            "Failed to infer package name/version from pip cache origin.json at {:?}",
404            path
405        );
406        return default_package_data(path);
407    };
408
409    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
410        build_pypi_urls(Some(&name), Some(&version));
411    let purl = sibling_wheel
412        .as_ref()
413        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
414        .or(plain_purl);
415
416    PackageData {
417        package_type: Some(PythonParser::PACKAGE_TYPE),
418        primary_language: Some("Python".to_string()),
419        name: Some(name),
420        version: Some(version),
421        datasource_id: Some(DatasourceId::PypiPipOriginJson),
422        download_url: Some(download_url.to_string()),
423        sha256: extract_sha256_from_origin_json(&root)
424            .and_then(|h| Sha256Digest::from_hex(&h).ok()),
425        repository_homepage_url,
426        repository_download_url,
427        api_data_url,
428        purl,
429        ..Default::default()
430    }
431}
432
433fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
434    let parent = path.parent()?;
435    let entries = parent.read_dir().ok()?;
436
437    for entry in entries.flatten() {
438        let sibling_path = entry.path();
439        if sibling_path
440            .extension()
441            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
442            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
443        {
444            return Some(wheel_info);
445        }
446    }
447
448    None
449}
450
451fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
452    let file_name = url.rsplit('/').next()?;
453
454    if file_name.ends_with(".whl") {
455        return parse_wheel_filename(Path::new(file_name))
456            .map(|wheel_info| (wheel_info.name, wheel_info.version));
457    }
458
459    let stem = strip_python_archive_extension(file_name)?;
460    let (name, version) = stem.rsplit_once('-')?;
461    if name.is_empty() || version.is_empty() {
462        return None;
463    }
464
465    Some((name.replace('_', "-"), version.to_string()))
466}
467
/// Removes a known Python archive suffix from `file_name`.
///
/// Returns the remaining stem, or `None` when no suffix matches. Matching is
/// case-sensitive.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] =
        [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES.iter() {
        if let Some(stem) = file_name.strip_suffix(*suffix) {
            return Some(stem);
        }
    }
    None
}
473
474fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
475    root.pointer("/archive_info/hashes/sha256")
476        .and_then(|value| value.as_str())
477        .map(ToOwned::to_owned)
478        .or_else(|| {
479            root.pointer("/archive_info/hash")
480                .and_then(|value| value.as_str())
481                .and_then(normalize_origin_hash)
482        })
483}
484
/// Extracts the digest from a hash string of the form `sha256=<value>`,
/// `sha256:<value>`, or a bare 64-character hexadecimal string.
///
/// The value after a prefix is returned without validation. Anything else
/// yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = hash
        .strip_prefix("sha256=")
        .or_else(|| hash.strip_prefix("sha256:"));

    match prefixed {
        Some(digest) => Some(digest.to_string()),
        None if hash.len() == 64 && hash.bytes().all(|byte| byte.is_ascii_hexdigit()) => {
            Some(hash.to_string())
        }
        None => None,
    }
}
497
498fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
499    let content = match read_file_to_string(path) {
500        Ok(content) => content,
501        Err(e) => {
502            warn!("Failed to read metadata at {:?}: {}", path, e);
503            return default_package_data(path);
504        }
505    };
506
507    let metadata = super::rfc822::parse_rfc822_content(&content);
508    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
509    merge_sibling_metadata_dependencies(path, &mut package_data);
510    merge_sibling_metadata_file_references(path, &mut package_data);
511    if datasource_id == DatasourceId::PypiWheelMetadata {
512        merge_sibling_wheel_metadata(path, &mut package_data);
513    }
514    package_data
515}
516
517fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
518    let mut extra_dependencies = Vec::new();
519
520    if let Some(parent) = path.parent() {
521        let direct_requires = parent.join("requires.txt");
522        if direct_requires.exists()
523            && let Ok(content) = read_file_to_string(&direct_requires)
524        {
525            extra_dependencies.extend(parse_requires_txt(&content));
526        }
527
528        let sibling_egg_info_requires = parent
529            .read_dir()
530            .ok()
531            .into_iter()
532            .flatten()
533            .flatten()
534            .find_map(|entry| {
535                let child_path = entry.path();
536                if child_path.is_dir()
537                    && child_path
538                        .file_name()
539                        .and_then(|name| name.to_str())
540                        .is_some_and(|name| name.ends_with(".egg-info"))
541                {
542                    let requires = child_path.join("requires.txt");
543                    requires.exists().then_some(requires)
544                } else {
545                    None
546                }
547            });
548
549        if let Some(requires_path) = sibling_egg_info_requires
550            && let Ok(content) = read_file_to_string(&requires_path)
551        {
552            extra_dependencies.extend(parse_requires_txt(&content));
553        }
554    }
555
556    for dependency in extra_dependencies {
557        if !package_data.dependencies.iter().any(|existing| {
558            existing.purl == dependency.purl
559                && existing.scope == dependency.scope
560                && existing.extracted_requirement == dependency.extracted_requirement
561                && existing.extra_data == dependency.extra_data
562        }) {
563            package_data.dependencies.push(dependency);
564        }
565    }
566}
567
568fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
569    let mut extra_refs = Vec::new();
570
571    if let Some(parent) = path.parent() {
572        let record_path = parent.join("RECORD");
573        if record_path.exists()
574            && let Ok(content) = read_file_to_string(&record_path)
575        {
576            extra_refs.extend(parse_record_csv(&content));
577        }
578
579        let installed_files_path = parent.join("installed-files.txt");
580        if installed_files_path.exists()
581            && let Ok(content) = read_file_to_string(&installed_files_path)
582        {
583            extra_refs.extend(parse_installed_files_txt(&content));
584        }
585
586        let sources_path = parent.join("SOURCES.txt");
587        if sources_path.exists()
588            && let Ok(content) = read_file_to_string(&sources_path)
589        {
590            extra_refs.extend(parse_sources_txt(&content));
591        }
592    }
593
594    for file_ref in extra_refs {
595        if !package_data
596            .file_references
597            .iter()
598            .any(|existing| existing.path == file_ref.path)
599        {
600            package_data.file_references.push(file_ref);
601        }
602    }
603}
604
605fn collect_validated_zip_entries<R: Read + std::io::Seek>(
606    archive: &mut ZipArchive<R>,
607    path: &Path,
608    archive_type: &str,
609) -> Result<Vec<ValidatedZipEntry>, String> {
610    let mut total_extracted = 0u64;
611    let mut entries = Vec::new();
612
613    for i in 0..archive.len() {
614        if let Ok(file) = archive.by_index_raw(i) {
615            let compressed_size = file.compressed_size();
616            let uncompressed_size = file.size();
617            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
618                warn!(
619                    "Skipping unsafe path in {} {:?}: {}",
620                    archive_type,
621                    path,
622                    file.name()
623                );
624                continue;
625            };
626
627            if compressed_size > 0 {
628                let ratio = uncompressed_size as f64 / compressed_size as f64;
629                if ratio > MAX_COMPRESSION_RATIO {
630                    warn!(
631                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
632                        archive_type, path, ratio
633                    );
634                    continue;
635                }
636            }
637
638            if uncompressed_size > MAX_FILE_SIZE {
639                warn!(
640                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
641                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
642                );
643                continue;
644            }
645
646            total_extracted += uncompressed_size;
647            if total_extracted > MAX_ARCHIVE_SIZE {
648                let msg = format!(
649                    "Total extracted size exceeds limit for {} {:?}",
650                    archive_type, path
651                );
652                warn!("{}", msg);
653                return Err(msg);
654            }
655
656            entries.push(ValidatedZipEntry {
657                index: i,
658                name: entry_name,
659            });
660        }
661    }
662
663    Ok(entries)
664}
665
666fn is_python_sdist_archive_path(path: &Path) -> bool {
667    detect_python_sdist_archive_format(path).is_some()
668}
669
670fn is_valid_wheel_archive_path(path: &Path) -> bool {
671    if !path.is_file() {
672        return true;
673    }
674
675    let file = match File::open(path) {
676        Ok(file) => file,
677        Err(_) => return false,
678    };
679    let mut archive = match ZipArchive::new(file) {
680        Ok(archive) => archive,
681        Err(_) => return false,
682    };
683
684    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
685        Ok(entries) => entries,
686        Err(_) => return false,
687    };
688
689    find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA").is_some()
690}
691
692fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
693    let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
694
695    if !is_likely_python_sdist_filename(&file_name) {
696        return None;
697    }
698
699    if file_name.ends_with(".tar.gz") {
700        tar_gz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarGz)
701    } else if file_name.ends_with(".tgz") {
702        tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
703    } else if file_name.ends_with(".tar.bz2") {
704        tar_bz2_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarBz2)
705    } else if file_name.ends_with(".tar.xz") {
706        tar_xz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarXz)
707    } else if file_name.ends_with(".zip") {
708        zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
709    } else {
710        None
711    }
712}
713
714fn tar_gz_sdist_contains_pkg_info(path: &Path) -> bool {
715    let Some(compressed_size) = compressed_archive_size(path) else {
716        return false;
717    };
718    let file = match File::open(path) {
719        Ok(file) => file,
720        Err(_) => return false,
721    };
722    let decoder = GzDecoder::new(file);
723    tar_sdist_contains_pkg_info(path, decoder, "tar.gz", compressed_size)
724}
725
726fn tar_bz2_sdist_contains_pkg_info(path: &Path) -> bool {
727    let Some(compressed_size) = compressed_archive_size(path) else {
728        return false;
729    };
730    let file = match File::open(path) {
731        Ok(file) => file,
732        Err(_) => return false,
733    };
734    let decoder = BzDecoder::new(file);
735    tar_sdist_contains_pkg_info(path, decoder, "tar.bz2", compressed_size)
736}
737
738fn tar_xz_sdist_contains_pkg_info(path: &Path) -> bool {
739    let Some(compressed_size) = compressed_archive_size(path) else {
740        return false;
741    };
742    let file = match File::open(path) {
743        Ok(file) => file,
744        Err(_) => return false,
745    };
746    let decoder = XzDecoder::new(file);
747    tar_sdist_contains_pkg_info(path, decoder, "tar.xz", compressed_size)
748}
749
/// Returns the size in bytes reported by the filesystem for `path`, or `None`
/// when its metadata cannot be read.
fn compressed_archive_size(path: &Path) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(metadata) => Some(metadata.len()),
        Err(_) => None,
    }
}
753
754fn tar_sdist_contains_pkg_info<R: Read>(
755    path: &Path,
756    reader: R,
757    archive_type: &str,
758    compressed_size: u64,
759) -> bool {
760    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
761    else {
762        return false;
763    };
764
765    select_sdist_pkginfo_entry(path, &entries).is_some()
766}
767
768fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
769    if !path.is_file() {
770        return true;
771    }
772
773    let Some(compressed_size) = compressed_archive_size(path) else {
774        return false;
775    };
776    let file = match File::open(path) {
777        Ok(file) => file,
778        Err(_) => return false,
779    };
780    let decoder = GzDecoder::new(file);
781    tar_sdist_contains_pkg_info(path, decoder, "tgz", compressed_size)
782}
783
784fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
785    if !path.is_file() {
786        return true;
787    }
788
789    let file = match File::open(path) {
790        Ok(file) => file,
791        Err(_) => return false,
792    };
793    let mut archive = match ZipArchive::new(file) {
794        Ok(archive) => archive,
795        Err(_) => return false,
796    };
797
798    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
799        Ok(entries) => entries,
800        Err(_) => return false,
801    };
802    let metadata_entries: Vec<_> = validated_entries
803        .iter()
804        .filter(|entry| entry.name.ends_with("/PKG-INFO"))
805        .filter_map(|entry| {
806            read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
807                .ok()
808                .map(|content| (entry.name.clone(), content))
809        })
810        .collect();
811
812    has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
813}
814
815fn is_likely_python_sdist_filename(file_name: &str) -> bool {
816    let Some(stem) = strip_python_archive_extension(file_name) else {
817        return false;
818    };
819
820    let Some((name, version)) = stem.rsplit_once('-') else {
821        return false;
822    };
823
824    !name.is_empty()
825        && !version.is_empty()
826        && version.chars().any(|ch| ch.is_ascii_digit())
827        && name
828            .chars()
829            .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
830}
831
832fn extract_from_sdist_archive(path: &Path) -> PackageData {
833    let metadata = match std::fs::metadata(path) {
834        Ok(m) => m,
835        Err(e) => {
836            warn!(
837                "Failed to read metadata for sdist archive {:?}: {}",
838                path, e
839            );
840            return default_package_data(path);
841        }
842    };
843
844    if metadata.len() > MAX_ARCHIVE_SIZE {
845        warn!(
846            "sdist archive too large: {} bytes (limit: {} bytes)",
847            metadata.len(),
848            MAX_ARCHIVE_SIZE
849        );
850        return default_package_data(path);
851    }
852
853    let Some(format) = detect_python_sdist_archive_format(path) else {
854        return default_package_data(path);
855    };
856
857    let mut package_data = match format {
858        PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
859            let file = match File::open(path) {
860                Ok(file) => file,
861                Err(e) => {
862                    warn!("Failed to open sdist archive {:?}: {}", path, e);
863                    return default_package_data(path);
864                }
865            };
866            let decoder = GzDecoder::new(file);
867            extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
868        }
869        PythonSdistArchiveFormat::TarBz2 => {
870            let file = match File::open(path) {
871                Ok(file) => file,
872                Err(e) => {
873                    warn!("Failed to open sdist archive {:?}: {}", path, e);
874                    return default_package_data(path);
875                }
876            };
877            let decoder = BzDecoder::new(file);
878            extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
879        }
880        PythonSdistArchiveFormat::TarXz => {
881            let file = match File::open(path) {
882                Ok(file) => file,
883                Err(e) => {
884                    warn!("Failed to open sdist archive {:?}: {}", path, e);
885                    return default_package_data(path);
886                }
887            };
888            let decoder = XzDecoder::new(file);
889            extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
890        }
891        PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
892    };
893
894    if package_data.package_type.is_some() {
895        let (size, sha256) = calculate_file_checksums(path);
896        package_data.size = size;
897        package_data.sha256 = sha256;
898    }
899
900    package_data
901}
902
903fn extract_from_tar_sdist_archive<R: Read>(
904    path: &Path,
905    reader: R,
906    archive_type: &str,
907    compressed_size: u64,
908) -> PackageData {
909    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
910    else {
911        return default_package_data(path);
912    };
913
914    build_sdist_package_data(path, entries)
915}
916
917fn collect_tar_sdist_entries<R: Read>(
918    path: &Path,
919    reader: R,
920    archive_type: &str,
921    compressed_size: u64,
922) -> Option<Vec<(String, String)>> {
923    let mut archive = Archive::new(reader);
924    let archive_entries = match archive.entries() {
925        Ok(entries) => entries,
926        Err(e) => {
927            warn!(
928                "Failed to read {} sdist archive {:?}: {}",
929                archive_type, path, e
930            );
931            return None;
932        }
933    };
934
935    let mut total_extracted = 0u64;
936    let mut entries = Vec::new();
937
938    for entry_result in archive_entries {
939        let mut entry = match entry_result {
940            Ok(entry) => entry,
941            Err(e) => {
942                warn!(
943                    "Failed to read {} sdist entry from {:?}: {}",
944                    archive_type, path, e
945                );
946                continue;
947            }
948        };
949
950        let entry_size = entry.size();
951        if entry_size > MAX_FILE_SIZE {
952            warn!(
953                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
954                archive_type, path, entry_size, MAX_FILE_SIZE
955            );
956            continue;
957        }
958
959        total_extracted += entry_size;
960        if total_extracted > MAX_ARCHIVE_SIZE {
961            warn!(
962                "Total extracted size exceeds limit for {} sdist {:?}",
963                archive_type, path
964            );
965            return None;
966        }
967
968        if compressed_size > 0 {
969            let ratio = total_extracted as f64 / compressed_size as f64;
970            if ratio > MAX_COMPRESSION_RATIO {
971                warn!(
972                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
973                    archive_type, path, ratio
974                );
975                return None;
976            }
977        }
978
979        let entry_path = match entry.path() {
980            Ok(path) => path.to_string_lossy().replace('\\', "/"),
981            Err(e) => {
982                warn!(
983                    "Failed to get {} sdist entry path from {:?}: {}",
984                    archive_type, path, e
985                );
986                continue;
987            }
988        };
989
990        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
991            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
992            continue;
993        };
994
995        if !is_relevant_sdist_text_entry(&entry_path) {
996            continue;
997        }
998
999        if let Ok(content) = read_limited_utf8(
1000            &mut entry,
1001            MAX_FILE_SIZE,
1002            &format!("{} entry {}", archive_type, entry_path),
1003        ) {
1004            entries.push((entry_path, content));
1005        }
1006    }
1007
1008    Some(entries)
1009}
1010
1011fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
1012    let file = match File::open(path) {
1013        Ok(file) => file,
1014        Err(e) => {
1015            warn!("Failed to open zip sdist archive {:?}: {}", path, e);
1016            return default_package_data(path);
1017        }
1018    };
1019
1020    let mut archive = match ZipArchive::new(file) {
1021        Ok(archive) => archive,
1022        Err(e) => {
1023            warn!("Failed to read zip sdist archive {:?}: {}", path, e);
1024            return default_package_data(path);
1025        }
1026    };
1027
1028    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
1029        Ok(entries) => entries,
1030        Err(_) => return default_package_data(path),
1031    };
1032
1033    let mut entries = Vec::new();
1034    for entry in validated_entries.iter() {
1035        if !is_relevant_sdist_text_entry(&entry.name) {
1036            continue;
1037        }
1038
1039        if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
1040            entries.push((entry.name.clone(), content));
1041        }
1042    }
1043
1044    build_sdist_package_data(path, entries)
1045}
1046
/// Returns `true` when `entry_path` names one of the sdist text files this
/// parser reads: a `PKG-INFO`, `requires.txt`, or `SOURCES.txt` that sits
/// inside some directory (the leading `/` is part of each suffix).
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(suffix))
}
1052
1053fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
1054    let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
1055        warn!("No PKG-INFO file found in sdist archive {:?}", path);
1056        return default_package_data(path);
1057    };
1058
1059    let mut package_data =
1060        python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
1061    merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
1062    merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
1063    apply_sdist_name_version_fallback(path, &mut package_data);
1064    package_data.datasource_id = Some(DatasourceId::PypiSdist);
1065    package_data
1066}
1067
1068fn select_sdist_pkginfo_entry(
1069    archive_path: &Path,
1070    entries: &[(String, String)],
1071) -> Option<(String, String)> {
1072    let expected_name = sdist_archive_expected_name(archive_path);
1073
1074    entries
1075        .iter()
1076        .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
1077        .min_by_key(|(entry_path, content)| {
1078            let components: Vec<_> = entry_path
1079                .split('/')
1080                .filter(|part| !part.is_empty())
1081                .collect();
1082            let candidate_name = sdist_pkginfo_candidate_name(content);
1083            let name_rank = if candidate_name == expected_name {
1084                0
1085            } else {
1086                1
1087            };
1088            let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1089
1090            (name_rank, kind_rank, components.len(), entry_path.clone())
1091        })
1092        .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1093}
1094
1095fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1096    let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1097        return false;
1098    };
1099
1100    entries.iter().any(|(entry_path, content)| {
1101        sdist_pkginfo_kind_rank(entry_path) < 3
1102            && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1103    })
1104}
1105
1106fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1107    archive_path
1108        .file_name()
1109        .and_then(|name| name.to_str())
1110        .and_then(strip_python_archive_extension)
1111        .and_then(|stem| {
1112            stem.rsplit_once('-')
1113                .map(|(name, _)| normalize_python_package_name(name))
1114        })
1115}
1116
1117fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1118    let metadata = super::rfc822::parse_rfc822_content(content);
1119    super::rfc822::get_header_first(&metadata.headers, "name")
1120        .map(|name| normalize_python_package_name(&name))
1121}
1122
/// Ranks a PKG-INFO path by where it sits in the archive (lower is better).
///
/// * `0` - `<root>/<something>.egg-info/PKG-INFO`
/// * `1` - `<root>/PKG-INFO`
/// * `2` - any other path ending in `.egg-info/PKG-INFO`
/// * `3` - everything else
///
/// Empty path components are ignored when counting depth.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let segments: Vec<&str> = entry_path
        .split('/')
        .filter(|segment| !segment.is_empty())
        .collect();

    match segments.as_slice() {
        [_, dir, "PKG-INFO"] if dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1140
1141fn merge_sdist_archive_dependencies(
1142    entries: &[(String, String)],
1143    metadata_path: &str,
1144    package_data: &mut PackageData,
1145) {
1146    let metadata_dir = metadata_path
1147        .rsplit_once('/')
1148        .map(|(dir, _)| dir)
1149        .unwrap_or("");
1150    let archive_root = metadata_path.split('/').next().unwrap_or("");
1151    let matched_egg_info_dir =
1152        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1153    let mut extra_dependencies = Vec::new();
1154
1155    for (entry_path, content) in entries {
1156        let is_direct_requires =
1157            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1158        let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1159            entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1160        });
1161
1162        if is_direct_requires || is_egg_info_requires {
1163            extra_dependencies.extend(parse_requires_txt(content));
1164        }
1165    }
1166
1167    for dependency in extra_dependencies {
1168        if !package_data.dependencies.iter().any(|existing| {
1169            existing.purl == dependency.purl
1170                && existing.scope == dependency.scope
1171                && existing.extracted_requirement == dependency.extracted_requirement
1172                && existing.extra_data == dependency.extra_data
1173        }) {
1174            package_data.dependencies.push(dependency);
1175        }
1176    }
1177}
1178
1179fn merge_sdist_archive_file_references(
1180    entries: &[(String, String)],
1181    metadata_path: &str,
1182    package_data: &mut PackageData,
1183) {
1184    let metadata_dir = metadata_path
1185        .rsplit_once('/')
1186        .map(|(dir, _)| dir)
1187        .unwrap_or("");
1188    let archive_root = metadata_path.split('/').next().unwrap_or("");
1189    let matched_egg_info_dir =
1190        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1191    let mut extra_refs = Vec::new();
1192
1193    for (entry_path, content) in entries {
1194        let is_direct_sources =
1195            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1196        let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1197            entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1198        });
1199
1200        if is_direct_sources || is_egg_info_sources {
1201            extra_refs.extend(parse_sources_txt(content));
1202        }
1203    }
1204
1205    for file_ref in extra_refs {
1206        if !package_data
1207            .file_references
1208            .iter()
1209            .any(|existing| existing.path == file_ref.path)
1210        {
1211            package_data.file_references.push(file_ref);
1212        }
1213    }
1214}
1215
1216fn select_matching_sdist_egg_info_dir(
1217    entries: &[(String, String)],
1218    archive_root: &str,
1219    package_name: Option<&str>,
1220) -> Option<String> {
1221    let normalized_package_name = package_name.map(normalize_python_package_name);
1222
1223    entries
1224        .iter()
1225        .filter_map(|(entry_path, _)| {
1226            let components: Vec<_> = entry_path
1227                .split('/')
1228                .filter(|part| !part.is_empty())
1229                .collect();
1230            if components.len() == 3
1231                && components[0] == archive_root
1232                && components[1].ends_with(".egg-info")
1233            {
1234                Some(components[1].to_string())
1235            } else {
1236                None
1237            }
1238        })
1239        .min_by_key(|egg_info_dir| {
1240            let normalized_dir_name =
1241                normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1242            let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1243                0
1244            } else {
1245                1
1246            };
1247
1248            (name_rank, egg_info_dir.clone())
1249        })
1250}
1251
/// Normalizes a Python distribution name for comparison, following the
/// PEP 503 rule: every run of `-`, `_`, and `.` becomes a single `-`, and the
/// result is ASCII-lowercased.
///
/// Treating `.` as a separator lets an archive named `zope_interface-…`
/// match metadata that declares `Name: zope.interface`.
fn normalize_python_package_name(name: &str) -> String {
    let mut normalized = String::with_capacity(name.len());
    let mut in_separator_run = false;

    for ch in name.chars() {
        if matches!(ch, '-' | '_' | '.') {
            // Emit one `-` per run, however long the run is.
            if !in_separator_run {
                normalized.push('-');
            }
            in_separator_run = true;
        } else {
            normalized.push(ch.to_ascii_lowercase());
            in_separator_run = false;
        }
    }

    normalized
}
1255
1256fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1257    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1258        return;
1259    };
1260
1261    let Some(stem) = strip_python_archive_extension(file_name) else {
1262        return;
1263    };
1264
1265    let Some((name, version)) = stem.rsplit_once('-') else {
1266        return;
1267    };
1268
1269    if package_data.name.is_none() {
1270        package_data.name = Some(name.replace('_', "-"));
1271    }
1272    if package_data.version.is_none() {
1273        package_data.version = Some(version.to_string());
1274    }
1275
1276    if package_data.purl.is_none()
1277        || package_data.repository_homepage_url.is_none()
1278        || package_data.repository_download_url.is_none()
1279        || package_data.api_data_url.is_none()
1280    {
1281        let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1282            build_pypi_urls(
1283                package_data.name.as_deref(),
1284                package_data.version.as_deref(),
1285            );
1286
1287        if package_data.repository_homepage_url.is_none() {
1288            package_data.repository_homepage_url = repository_homepage_url;
1289        }
1290        if package_data.repository_download_url.is_none() {
1291            package_data.repository_download_url = repository_download_url;
1292        }
1293        if package_data.api_data_url.is_none() {
1294            package_data.api_data_url = api_data_url;
1295        }
1296        if package_data.purl.is_none() {
1297            package_data.purl = purl;
1298        }
1299    }
1300}
1301
1302fn extract_from_wheel_archive(path: &Path) -> PackageData {
1303    let metadata = match std::fs::metadata(path) {
1304        Ok(m) => m,
1305        Err(e) => {
1306            warn!(
1307                "Failed to read metadata for wheel archive {:?}: {}",
1308                path, e
1309            );
1310            return default_package_data(path);
1311        }
1312    };
1313
1314    if metadata.len() > MAX_ARCHIVE_SIZE {
1315        warn!(
1316            "Wheel archive too large: {} bytes (limit: {} bytes)",
1317            metadata.len(),
1318            MAX_ARCHIVE_SIZE
1319        );
1320        return default_package_data(path);
1321    }
1322
1323    let file = match File::open(path) {
1324        Ok(f) => f,
1325        Err(e) => {
1326            warn!("Failed to open wheel archive {:?}: {}", path, e);
1327            return default_package_data(path);
1328        }
1329    };
1330
1331    let mut archive = match ZipArchive::new(file) {
1332        Ok(a) => a,
1333        Err(e) => {
1334            warn!("Failed to read wheel archive {:?}: {}", path, e);
1335            return default_package_data(path);
1336        }
1337    };
1338
1339    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1340        Ok(entries) => entries,
1341        Err(_) => return default_package_data(path),
1342    };
1343
1344    let metadata_entry =
1345        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1346            Some(entry) => entry,
1347            None => {
1348                warn!("No METADATA file found in wheel archive {:?}", path);
1349                return default_package_data(path);
1350            }
1351        };
1352
1353    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1354        Ok(c) => c,
1355        Err(e) => {
1356            warn!("Failed to read METADATA from {:?}: {}", path, e);
1357            return default_package_data(path);
1358        }
1359    };
1360
1361    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1362
1363    let (size, sha256) = calculate_file_checksums(path);
1364    package_data.size = size;
1365    package_data.sha256 = sha256;
1366
1367    if let Some(record_entry) =
1368        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1369        && let Ok(record_content) =
1370            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1371    {
1372        package_data.file_references = parse_record_csv(&record_content);
1373    }
1374
1375    if let Some(wheel_info) = parse_wheel_filename(path) {
1376        if package_data.name.is_none() {
1377            package_data.name = Some(wheel_info.name.clone());
1378        }
1379        if package_data.version.is_none() {
1380            package_data.version = Some(wheel_info.version.clone());
1381        }
1382
1383        package_data.qualifiers = Some(std::collections::HashMap::from([(
1384            "extension".to_string(),
1385            format!(
1386                "{}-{}-{}",
1387                wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1388            ),
1389        )]));
1390
1391        package_data.purl = build_wheel_purl(
1392            package_data.name.as_deref(),
1393            package_data.version.as_deref(),
1394            &wheel_info,
1395        );
1396
1397        let mut extra_data = package_data.extra_data.unwrap_or_default();
1398        extra_data.insert(
1399            "python_requires".to_string(),
1400            serde_json::Value::String(wheel_info.python_tag.clone()),
1401        );
1402        extra_data.insert(
1403            "abi_tag".to_string(),
1404            serde_json::Value::String(wheel_info.abi_tag.clone()),
1405        );
1406        extra_data.insert(
1407            "platform_tag".to_string(),
1408            serde_json::Value::String(wheel_info.platform_tag.clone()),
1409        );
1410        package_data.extra_data = Some(extra_data);
1411    }
1412
1413    package_data
1414}
1415
1416fn extract_from_egg_archive(path: &Path) -> PackageData {
1417    let metadata = match std::fs::metadata(path) {
1418        Ok(m) => m,
1419        Err(e) => {
1420            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1421            return default_package_data(path);
1422        }
1423    };
1424
1425    if metadata.len() > MAX_ARCHIVE_SIZE {
1426        warn!(
1427            "Egg archive too large: {} bytes (limit: {} bytes)",
1428            metadata.len(),
1429            MAX_ARCHIVE_SIZE
1430        );
1431        return default_package_data(path);
1432    }
1433
1434    let file = match File::open(path) {
1435        Ok(f) => f,
1436        Err(e) => {
1437            warn!("Failed to open egg archive {:?}: {}", path, e);
1438            return default_package_data(path);
1439        }
1440    };
1441
1442    let mut archive = match ZipArchive::new(file) {
1443        Ok(a) => a,
1444        Err(e) => {
1445            warn!("Failed to read egg archive {:?}: {}", path, e);
1446            return default_package_data(path);
1447        }
1448    };
1449
1450    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1451        Ok(entries) => entries,
1452        Err(_) => return default_package_data(path),
1453    };
1454
1455    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1456        &validated_entries,
1457        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1458    ) {
1459        Some(entry) => entry,
1460        None => {
1461            warn!("No PKG-INFO file found in egg archive {:?}", path);
1462            return default_package_data(path);
1463        }
1464    };
1465
1466    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1467        Ok(c) => c,
1468        Err(e) => {
1469            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1470            return default_package_data(path);
1471        }
1472    };
1473
1474    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1475
1476    let (size, sha256) = calculate_file_checksums(path);
1477    package_data.size = size;
1478    package_data.sha256 = sha256;
1479
1480    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1481        &validated_entries,
1482        &[
1483            "EGG-INFO/installed-files.txt",
1484            ".egg-info/installed-files.txt",
1485        ],
1486    ) && let Ok(installed_files_content) =
1487        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1488    {
1489        package_data.file_references = parse_installed_files_txt(&installed_files_content);
1490    }
1491
1492    if let Some(egg_info) = parse_egg_filename(path) {
1493        if package_data.name.is_none() {
1494            package_data.name = Some(egg_info.name.clone());
1495        }
1496        if package_data.version.is_none() {
1497            package_data.version = Some(egg_info.version.clone());
1498        }
1499
1500        if let Some(python_version) = &egg_info.python_version {
1501            let mut extra_data = package_data.extra_data.unwrap_or_default();
1502            extra_data.insert(
1503                "python_version".to_string(),
1504                serde_json::Value::String(python_version.clone()),
1505            );
1506            package_data.extra_data = Some(extra_data);
1507        }
1508    }
1509
1510    package_data.purl = build_egg_purl(
1511        package_data.name.as_deref(),
1512        package_data.version.as_deref(),
1513    );
1514
1515    package_data
1516}
1517
1518fn find_validated_zip_entry_by_suffix<'a>(
1519    entries: &'a [ValidatedZipEntry],
1520    suffix: &str,
1521) -> Option<&'a ValidatedZipEntry> {
1522    entries.iter().find(|entry| entry.name.ends_with(suffix))
1523}
1524
1525fn find_validated_zip_entry_by_any_suffix<'a>(
1526    entries: &'a [ValidatedZipEntry],
1527    suffixes: &[&str],
1528) -> Option<&'a ValidatedZipEntry> {
1529    entries
1530        .iter()
1531        .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1532}
1533
1534fn read_validated_zip_entry<R: Read + std::io::Seek>(
1535    archive: &mut ZipArchive<R>,
1536    entry: &ValidatedZipEntry,
1537    path: &Path,
1538    archive_type: &str,
1539) -> Result<String, String> {
1540    let mut file = archive
1541        .by_index(entry.index)
1542        .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1543
1544    let compressed_size = file.compressed_size();
1545    let uncompressed_size = file.size();
1546
1547    if compressed_size > 0 {
1548        let ratio = uncompressed_size as f64 / compressed_size as f64;
1549        if ratio > MAX_COMPRESSION_RATIO {
1550            return Err(format!(
1551                "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1552                archive_type, path, ratio
1553            ));
1554        }
1555    }
1556
1557    if uncompressed_size > MAX_FILE_SIZE {
1558        return Err(format!(
1559            "Rejected oversized entry in {} {:?}: {} bytes",
1560            archive_type, path, uncompressed_size
1561        ));
1562    }
1563
1564    read_limited_utf8(
1565        &mut file,
1566        MAX_FILE_SIZE,
1567        &format!("{} entry {}", archive_type, entry.name),
1568    )
1569}
1570
/// Reads at most `max_bytes` bytes from `reader` and decodes them as UTF-8.
///
/// `context` is used only to label error messages.
///
/// # Errors
/// Returns a descriptive message when the underlying read fails, when the
/// reader yields more than `max_bytes` bytes, or when the bytes are not valid
/// UTF-8.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    // Read one byte past the limit so that an oversized stream can be told
    // apart from one that is exactly `max_bytes` long. Saturate so that a
    // limit of `u64::MAX` cannot overflow.
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if bytes.len() as u64 > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1591
/// Normalizes an archive entry path to a safe, `/`-separated relative path.
///
/// Backslashes are treated as separators and `.` components are dropped.
/// Returns `None` when the path starts with a drive prefix such as `C:/`,
/// contains a root, parent (`..`), or prefix component, or has no normal
/// component at all.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Reject `X:/...` explicitly: on non-Windows hosts `Path` would treat the
    // drive letter as an ordinary first component.
    if let [drive, b':', b'/', ..] = unified.as_bytes() {
        if drive.is_ascii_alphabetic() {
            return None;
        }
    }

    let mut segments: Vec<String> = Vec::new();
    for part in Path::new(&unified).components() {
        match part {
            Component::CurDir => continue,
            Component::Normal(segment) => segments.push(segment.to_string_lossy().into_owned()),
            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1613
1614/// Parses RECORD CSV format from wheel archives (PEP 427).
1615/// Format: path,hash,size (3 columns, no header)
1616/// Hash format: sha256=urlsafe_base64_hash or empty
1617/// Size: bytes as u64 or empty
1618pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1619    let mut reader = ReaderBuilder::new()
1620        .has_headers(false)
1621        .from_reader(content.as_bytes());
1622
1623    let mut file_references = Vec::new();
1624
1625    for result in reader.records() {
1626        match result {
1627            Ok(record) => {
1628                if record.len() < 3 {
1629                    continue;
1630                }
1631
1632                let path = record.get(0).unwrap_or("").trim().to_string();
1633                if path.is_empty() {
1634                    continue;
1635                }
1636
1637                let hash_field = record.get(1).unwrap_or("").trim();
1638                let size_field = record.get(2).unwrap_or("").trim();
1639
1640                // Parse hash: format is "algorithm=value"
1641                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1642                    let parts: Vec<&str> = hash_field.split('=').collect();
1643                    if parts.len() == 2 && parts[0] == "sha256" {
1644                        match URL_SAFE_NO_PAD.decode(parts[1]) {
1645                            Ok(decoded) => {
1646                                let hex = decoded
1647                                    .iter()
1648                                    .map(|b| format!("{:02x}", b))
1649                                    .collect::<String>();
1650                                Sha256Digest::from_hex(&hex).ok()
1651                            }
1652                            Err(_) => None,
1653                        }
1654                    } else {
1655                        None
1656                    }
1657                } else {
1658                    None
1659                };
1660
1661                // Parse size
1662                let size = if !size_field.is_empty() && size_field != "-" {
1663                    size_field.parse::<u64>().ok()
1664                } else {
1665                    None
1666                };
1667
1668                file_references.push(FileReference {
1669                    path,
1670                    size,
1671                    sha1: None,
1672                    md5: None,
1673                    sha256,
1674                    sha512: None,
1675                    extra_data: None,
1676                });
1677            }
1678            Err(e) => {
1679                warn!("Failed to parse RECORD CSV row: {}", e);
1680                continue;
1681            }
1682        }
1683    }
1684
1685    file_references
1686}
1687
1688/// Parses installed-files.txt format from egg archives (PEP 376).
1689/// Format: one file path per line, no headers, no hash, no size
1690pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1691    content
1692        .lines()
1693        .map(|line| line.trim())
1694        .filter(|line| !line.is_empty())
1695        .map(|path| FileReference {
1696            path: path.to_string(),
1697            size: None,
1698            sha1: None,
1699            md5: None,
1700            sha256: None,
1701            sha512: None,
1702            extra_data: None,
1703        })
1704        .collect()
1705}
1706
1707pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1708    content
1709        .lines()
1710        .map(str::trim)
1711        .filter(|line| !line.is_empty())
1712        .map(|path| FileReference {
1713            path: path.to_string(),
1714            size: None,
1715            sha1: None,
1716            md5: None,
1717            sha256: None,
1718            sha512: None,
1719            extra_data: None,
1720        })
1721        .collect()
1722}
1723
/// Components of a wheel file name, as laid out by PEP 427:
/// `{distribution}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl`.
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component, verbatim.
    version: String,
    /// Python tag, e.g. `py3`.
    python_tag: String,
    /// ABI tag, e.g. `none`.
    abi_tag: String,
    /// Platform tag, e.g. `any`.
    platform_tag: String,
}

/// Splits a wheel file name into its PEP 427 components.
///
/// The name and version are the first two `-`-separated fields of the file
/// stem and the three tags are the last three, so an optional build tag
/// between them is skipped rather than mistaken for the python tag.
///
/// Returns `None` when the path has no file stem or the stem has fewer than
/// five `-`-separated fields.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    // Anchor the tags at the end: components never contain `-` themselves,
    // so any extra field in the middle is the optional build tag.
    let [name, version, .., python_tag, abi_tag, platform_tag] = parts.as_slice() else {
        return None;
    };

    Some(WheelInfo {
        name: name.replace('_', "-"),
        version: version.to_string(),
        python_tag: python_tag.to_string(),
        abi_tag: abi_tag.to_string(),
        platform_tag: platform_tag.to_string(),
    })
}
1748
/// Components read from an egg file name.
struct EggInfo {
    /// First `-`-separated field, with `_` replaced by `-`.
    name: String,
    /// Second `-`-separated field, verbatim.
    version: String,
    /// Third `-`-separated field, when present.
    python_version: Option<String>,
}

/// Splits the file stem of an egg archive on `-` into name, version, and an
/// optional third field; any further fields are ignored.
///
/// Returns `None` when the path has no file stem or the stem has fewer than
/// two fields.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut fields = stem.split('-');

    let raw_name = fields.next()?;
    let raw_version = fields.next()?;
    let python_version = fields.next().map(str::to_owned);

    Some(EggInfo {
        name: raw_name.replace('_', "-"),
        version: raw_version.to_owned(),
        python_version,
    })
}
1769
1770fn build_wheel_purl(
1771    name: Option<&str>,
1772    version: Option<&str>,
1773    wheel_info: &WheelInfo,
1774) -> Option<String> {
1775    let name = name?;
1776    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1777
1778    if let Some(ver) = version {
1779        package_url.with_version(ver).ok()?;
1780    }
1781
1782    let extension = format!(
1783        "{}-{}-{}",
1784        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1785    );
1786    package_url.add_qualifier("extension", extension).ok()?;
1787
1788    Some(package_url.to_string())
1789}
1790
1791fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1792    let name = name?;
1793    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1794
1795    if let Some(ver) = version {
1796        package_url.with_version(ver).ok()?;
1797    }
1798
1799    package_url.add_qualifier("type", "egg").ok()?;
1800
1801    Some(package_url.to_string())
1802}
1803
1804fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1805    let metadata = super::rfc822::parse_rfc822_content(content);
1806    build_package_data_from_rfc822(&metadata, datasource_id)
1807}
1808
1809/// Builds PackageData from parsed RFC822 metadata.
1810///
1811/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
1812/// and `python_parse_rfc822_content` (content-based) functions.
1813fn build_package_data_from_rfc822(
1814    metadata: &super::rfc822::Rfc822Metadata,
1815    datasource_id: DatasourceId,
1816) -> PackageData {
1817    use super::rfc822::{get_header_all, get_header_first};
1818
1819    let name = get_header_first(&metadata.headers, "name");
1820    let version = get_header_first(&metadata.headers, "version");
1821    let summary = get_header_first(&metadata.headers, "summary");
1822    let mut homepage_url = get_header_first(&metadata.headers, "home-page");
1823    let author = get_header_first(&metadata.headers, "author");
1824    let author_email = get_header_first(&metadata.headers, "author-email");
1825    let license = get_header_first(&metadata.headers, "license");
1826    let license_expression = get_header_first(&metadata.headers, "license-expression");
1827    let download_url = get_header_first(&metadata.headers, "download-url");
1828    let platform = get_header_first(&metadata.headers, "platform");
1829    let requires_python = get_header_first(&metadata.headers, "requires-python");
1830    let classifiers = get_header_all(&metadata.headers, "classifier");
1831    let license_files = get_header_all(&metadata.headers, "license-file");
1832
1833    let description_body = if metadata.body.is_empty() {
1834        get_header_first(&metadata.headers, "description").unwrap_or_default()
1835    } else {
1836        metadata.body.clone()
1837    };
1838
1839    let description = build_description(summary.as_deref(), &description_body);
1840
1841    let mut parties = Vec::new();
1842    if author.is_some() || author_email.is_some() {
1843        parties.push(Party {
1844            r#type: Some("person".to_string()),
1845            role: Some("author".to_string()),
1846            name: author,
1847            email: author_email,
1848            url: None,
1849            organization: None,
1850            organization_url: None,
1851            timezone: None,
1852        });
1853    }
1854
1855    let (keywords, license_classifiers) = split_classifiers(&classifiers);
1856    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1857    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1858        license_expression
1859            .as_deref()
1860            .and_then(normalize_spdx_expression)
1861            .map(|normalized| {
1862                build_declared_license_data(
1863                    normalized,
1864                    DeclaredLicenseMatchMetadata::single_line(
1865                        license_expression.as_deref().unwrap_or_default(),
1866                    )
1867                    .with_referenced_filenames(&referenced_license_files),
1868                )
1869            })
1870            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1871
1872    let extracted_license_statement = license_expression
1873        .clone()
1874        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1875
1876    let mut extra_data = HashMap::new();
1877    if let Some(platform_value) = platform
1878        && !platform_value.eq_ignore_ascii_case("unknown")
1879        && !platform_value.is_empty()
1880    {
1881        extra_data.insert(
1882            "platform".to_string(),
1883            serde_json::Value::String(platform_value),
1884        );
1885    }
1886
1887    if let Some(requires_python_value) = requires_python
1888        && !requires_python_value.is_empty()
1889    {
1890        extra_data.insert(
1891            "requires_python".to_string(),
1892            serde_json::Value::String(requires_python_value),
1893        );
1894    }
1895
1896    if !license_files.is_empty() {
1897        extra_data.insert(
1898            "license_files".to_string(),
1899            serde_json::Value::Array(
1900                license_files
1901                    .iter()
1902                    .cloned()
1903                    .map(serde_json::Value::String)
1904                    .collect(),
1905            ),
1906        );
1907    }
1908
1909    let file_references = license_files
1910        .iter()
1911        .map(|path| FileReference {
1912            path: path.clone(),
1913            size: None,
1914            sha1: None,
1915            md5: None,
1916            sha256: None,
1917            sha512: None,
1918            extra_data: None,
1919        })
1920        .collect();
1921
1922    let project_urls = get_header_all(&metadata.headers, "project-url");
1923    let dependencies = extract_rfc822_dependencies(&metadata.headers);
1924    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1925
1926    if !project_urls.is_empty() {
1927        let parsed_urls = parse_project_urls(&project_urls);
1928
1929        for (label, url) in &parsed_urls {
1930            let label_lower = label.to_lowercase();
1931
1932            if bug_tracking_url.is_none()
1933                && matches!(
1934                    label_lower.as_str(),
1935                    "tracker"
1936                        | "bug reports"
1937                        | "bug tracker"
1938                        | "issues"
1939                        | "issue tracker"
1940                        | "github: issues"
1941                )
1942            {
1943                bug_tracking_url = Some(url.clone());
1944            } else if code_view_url.is_none()
1945                && matches!(label_lower.as_str(), "source" | "source code" | "code")
1946            {
1947                code_view_url = Some(url.clone());
1948            } else if vcs_url.is_none()
1949                && matches!(
1950                    label_lower.as_str(),
1951                    "github" | "gitlab" | "github: repo" | "repository"
1952                )
1953            {
1954                vcs_url = Some(url.clone());
1955            } else if homepage_url.is_none()
1956                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1957            {
1958                homepage_url = Some(url.clone());
1959            } else if label_lower == "changelog" {
1960                extra_data.insert(
1961                    "changelog_url".to_string(),
1962                    serde_json::Value::String(url.clone()),
1963                );
1964            }
1965        }
1966
1967        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
1968            .iter()
1969            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
1970            .collect();
1971
1972        if !project_urls_json.is_empty() {
1973            extra_data.insert(
1974                "project_urls".to_string(),
1975                serde_json::Value::Object(project_urls_json),
1976            );
1977        }
1978    }
1979
1980    let extra_data = if extra_data.is_empty() {
1981        None
1982    } else {
1983        Some(extra_data)
1984    };
1985
1986    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1987        build_pypi_urls(name.as_deref(), version.as_deref());
1988
1989    PackageData {
1990        package_type: Some(PythonParser::PACKAGE_TYPE),
1991        namespace: None,
1992        name,
1993        version,
1994        qualifiers: None,
1995        subpath: None,
1996        primary_language: Some("Python".to_string()),
1997        description,
1998        release_date: None,
1999        parties,
2000        keywords,
2001        homepage_url,
2002        download_url,
2003        size: None,
2004        sha1: None,
2005        md5: None,
2006        sha256: None,
2007        sha512: None,
2008        bug_tracking_url,
2009        code_view_url,
2010        vcs_url,
2011        copyright: None,
2012        holder: None,
2013        declared_license_expression,
2014        declared_license_expression_spdx,
2015        license_detections,
2016        other_license_expression: None,
2017        other_license_expression_spdx: None,
2018        other_license_detections: Vec::new(),
2019        extracted_license_statement,
2020        notice_text: None,
2021        source_packages: Vec::new(),
2022        file_references,
2023        is_private: false,
2024        is_virtual: false,
2025        extra_data,
2026        dependencies,
2027        repository_homepage_url,
2028        repository_download_url,
2029        api_data_url,
2030        datasource_id: Some(datasource_id),
2031        purl,
2032    }
2033}
2034
/// Parses `Project-URL` header values of the form `Label, URL` into `(label, url)` pairs.
///
/// Each entry is split on its first comma, so `Label,URL` (no space after the comma)
/// is accepted as well and commas inside the URL are preserved. Both halves are
/// trimmed; entries without a comma, or with an empty label or URL, are skipped.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|entry| {
            let (label, url) = entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());
            (!label.is_empty() && !url.is_empty()).then(|| (label.to_string(), url.to_string()))
        })
        .collect()
}
2050
/// Combines the summary and the long description into one text.
///
/// Both parts are trimmed and blank parts are dropped. When both remain they are
/// joined with a single newline, summary first. Returns `None` when nothing is left.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let summary = summary.map(str::trim).filter(|text| !text.is_empty());
    let body = Some(body.trim()).filter(|text| !text.is_empty());

    match (summary, body) {
        (Some(head), Some(rest)) => Some(format!("{}\n{}", head, rest)),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
2069
/// Separates trove classifiers into `(keywords, license_classifiers)`.
///
/// Classifiers starting with `License ::` go to the second list, everything else to
/// the first. Input order is preserved within each list.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|entry| entry.starts_with("License ::"));

    (keywords, license_classifiers)
}
2084
/// Renders the declared license and license classifiers as a small text block.
///
/// The output has an optional `license: <value>` line (value trimmed, skipped when
/// blank) followed by an optional `classifiers:` list with one quoted entry per
/// classifier. Every line is newline-terminated. Returns `None` when there is
/// nothing to render.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let license_line = license
        .map(str::trim)
        .filter(|text| !text.is_empty())
        .map(|text| format!("license: {}", text));

    let classifier_header = (!license_classifiers.is_empty()).then(|| "classifiers:".to_string());
    let classifier_lines = license_classifiers
        .iter()
        .map(|classifier| format!("  - '{}'", classifier));

    let lines: Vec<String> = license_line
        .into_iter()
        .chain(classifier_header)
        .chain(classifier_lines)
        .collect();

    if lines.is_empty() {
        return None;
    }

    Some(format!("{}\n", lines.join("\n")))
}
2110
2111pub(crate) fn build_pypi_urls(
2112    name: Option<&str>,
2113    version: Option<&str>,
2114) -> (
2115    Option<String>,
2116    Option<String>,
2117    Option<String>,
2118    Option<String>,
2119) {
2120    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2121
2122    let repository_download_url = name.and_then(|value| {
2123        version.map(|ver| {
2124            format!(
2125                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2126                &value[..1.min(value.len())],
2127                value,
2128                value,
2129                ver
2130            )
2131        })
2132    });
2133
2134    let api_data_url = name.map(|value| {
2135        if let Some(ver) = version {
2136            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2137        } else {
2138            format!("https://pypi.org/pypi/{}/json", value)
2139        }
2140    });
2141
2142    let purl = name.and_then(|value| {
2143        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2144        if let Some(ver) = version {
2145            package_url.with_version(ver).ok()?;
2146        }
2147        Some(package_url.to_string())
2148    });
2149
2150    (
2151        repository_homepage_url,
2152        repository_download_url,
2153        api_data_url,
2154        purl,
2155    )
2156}
2157
2158fn build_pypi_purl_with_extension(
2159    name: &str,
2160    version: Option<&str>,
2161    extension: &str,
2162) -> Option<String> {
2163    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2164    if let Some(ver) = version {
2165        package_url.with_version(ver).ok()?;
2166    }
2167    package_url.add_qualifier("extension", extension).ok()?;
2168    Some(package_url.to_string())
2169}
2170
2171fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2172    let toml_content = match read_toml_file(path) {
2173        Ok(content) => content,
2174        Err(e) => {
2175            warn!(
2176                "Failed to read or parse pyproject.toml at {:?}: {}",
2177                path, e
2178            );
2179            return default_package_data(path);
2180        }
2181    };
2182
2183    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2184    let is_poetry_pyproject = tool_table
2185        .and_then(|tool| tool.get("poetry"))
2186        .and_then(|value| value.as_table())
2187        .is_some();
2188
2189    // Handle both PEP 621 (project table) and poetry formats
2190    let project_table =
2191        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2192            // Standard PEP 621 format with [project] table
2193            project.clone()
2194        } else if let Some(tool) = tool_table {
2195            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2196                // Poetry format with [tool.poetry] table
2197                poetry.clone()
2198            } else {
2199                return default_package_data(path);
2200            }
2201        } else if toml_content.get(FIELD_NAME).is_some() {
2202            // Other format with top-level fields
2203            match toml_content.as_table() {
2204                Some(table) => table.clone(),
2205                None => {
2206                    warn!("Failed to convert TOML content to table in {:?}", path);
2207                    return default_package_data(path);
2208                }
2209            }
2210        } else {
2211            return default_package_data(path);
2212        };
2213
2214    let name = project_table
2215        .get(FIELD_NAME)
2216        .and_then(|v| v.as_str())
2217        .map(String::from);
2218
2219    let version = project_table
2220        .get(FIELD_VERSION)
2221        .and_then(|v| v.as_str())
2222        .map(String::from);
2223    let classifiers = project_table
2224        .get("classifiers")
2225        .and_then(|value| value.as_array())
2226        .map(|values| {
2227            values
2228                .iter()
2229                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2230                .collect::<Vec<_>>()
2231        })
2232        .unwrap_or_default();
2233    let (classifier_keywords, license_classifiers) = split_classifiers(&classifiers);
2234
2235    let extracted_license_statement = extract_raw_license_string(&project_table);
2236    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2237        normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2238
2239    let description = project_table
2240        .get(FIELD_DESCRIPTION)
2241        .and_then(|value| value.as_str())
2242        .map(|value| value.to_string());
2243    let mut keywords = project_table
2244        .get(FIELD_KEYWORDS)
2245        .and_then(|value| value.as_array())
2246        .map(|values| {
2247            values
2248                .iter()
2249                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2250                .collect::<Vec<_>>()
2251        })
2252        .unwrap_or_default();
2253    for classifier in classifier_keywords {
2254        if !keywords.contains(&classifier) {
2255            keywords.push(classifier);
2256        }
2257    }
2258
2259    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
2260    let mut extra_data = extract_pyproject_extra_data(&toml_content).unwrap_or_default();
2261    let (homepage_url, download_url, bug_tracking_url, code_view_url, repository_url) =
2262        extract_urls(&project_table, &mut extra_data);
2263
2264    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2265
2266    // Create package URL
2267    let purl = name.as_ref().and_then(|n| {
2268        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2269            Ok(p) => p,
2270            Err(e) => {
2271                warn!(
2272                    "Failed to create PackageUrl for Python package '{}': {}",
2273                    n, e
2274                );
2275                return None;
2276            }
2277        };
2278
2279        if let Some(v) = &version
2280            && let Err(e) = package_url.with_version(v)
2281        {
2282            warn!(
2283                "Failed to set version '{}' for Python package '{}': {}",
2284                v, n, e
2285            );
2286            return None;
2287        }
2288
2289        Some(package_url.to_string())
2290    });
2291
2292    let api_data_url = name.as_ref().map(|n| {
2293        if let Some(v) = &version {
2294            format!("https://pypi.org/pypi/{}/{}/json", n, v)
2295        } else {
2296            format!("https://pypi.org/pypi/{}/json", n)
2297        }
2298    });
2299
2300    let pypi_homepage_url = name
2301        .as_ref()
2302        .map(|n| format!("https://pypi.org/project/{}", n));
2303
2304    let pypi_download_url = name.as_ref().and_then(|n| {
2305        version.as_ref().map(|v| {
2306            format!(
2307                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2308                &n[..1.min(n.len())],
2309                n,
2310                n,
2311                v
2312            )
2313        })
2314    });
2315
2316    PackageData {
2317        package_type: Some(PythonParser::PACKAGE_TYPE),
2318        namespace: None,
2319        name,
2320        version,
2321        qualifiers: None,
2322        subpath: None,
2323        primary_language: None,
2324        description,
2325        release_date: None,
2326        parties: extract_parties(&project_table),
2327        keywords,
2328        homepage_url: homepage_url.or(pypi_homepage_url),
2329        download_url: download_url
2330            .or_else(|| repository_url.clone())
2331            .or(pypi_download_url),
2332        size: None,
2333        sha1: None,
2334        md5: None,
2335        sha256: None,
2336        sha512: None,
2337        bug_tracking_url,
2338        code_view_url,
2339        vcs_url: repository_url,
2340        copyright: None,
2341        holder: None,
2342        declared_license_expression,
2343        declared_license_expression_spdx,
2344        license_detections,
2345        other_license_expression: None,
2346        other_license_expression_spdx: None,
2347        other_license_detections: Vec::new(),
2348        extracted_license_statement: extracted_license_statement
2349            .or_else(|| build_extracted_license_statement(None, &license_classifiers)),
2350        notice_text: None,
2351        source_packages: Vec::new(),
2352        file_references: Vec::new(),
2353        is_private: has_private_classifier(&classifiers),
2354        is_virtual: false,
2355        extra_data: if extra_data.is_empty() {
2356            None
2357        } else {
2358            Some(extra_data)
2359        },
2360        dependencies: [dependencies, optional_dependencies].concat(),
2361        repository_homepage_url: None,
2362        repository_download_url: None,
2363        api_data_url,
2364        datasource_id: Some(if is_poetry_pyproject {
2365            DatasourceId::PypiPoetryPyprojectToml
2366        } else {
2367            DatasourceId::PypiPyprojectToml
2368        }),
2369        purl,
2370    }
2371}
2372
2373fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2374    let path_str = path.to_string_lossy().replace('\\', "/");
2375    if path_str.contains("/EGG-INFO/PKG-INFO") {
2376        DatasourceId::PypiEggPkginfo
2377    } else if path_str.ends_with(".egg-info/PKG-INFO") {
2378        DatasourceId::PypiEditableEggPkginfo
2379    } else {
2380        DatasourceId::PypiSdistPkginfo
2381    }
2382}
2383
2384fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2385    project
2386        .get(FIELD_LICENSE)
2387        .and_then(|license_value| match license_value {
2388            TomlValue::String(license_str) => Some(license_str.clone()),
2389            TomlValue::Table(license_table) => license_table
2390                .get("text")
2391                .and_then(|v| v.as_str())
2392                .map(|s| s.to_string())
2393                .or_else(|| {
2394                    license_table
2395                        .get("expression")
2396                        .and_then(|v| v.as_str())
2397                        .map(|expr| expr.to_string())
2398                }),
2399            _ => None,
2400        })
2401}
2402
2403fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2404    match project.get(FIELD_LICENSE) {
2405        Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2406        Some(TomlValue::Table(license_table)) => license_table
2407            .get("expression")
2408            .and_then(|value| value.as_str()),
2409        _ => None,
2410    }
2411}
2412
2413fn extract_urls(
2414    project: &TomlMap<String, TomlValue>,
2415    extra_data: &mut HashMap<String, serde_json::Value>,
2416) -> ProjectUrls {
2417    let mut homepage_url = None;
2418    let mut download_url = None;
2419    let mut bug_tracking_url = None;
2420    let mut code_view_url = None;
2421    let mut repository_url = None;
2422
2423    // Check for URLs table
2424    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2425        let parsed_urls: Vec<(String, String)> = urls
2426            .iter()
2427            .filter_map(|(label, value)| {
2428                value
2429                    .as_str()
2430                    .map(|url| (label.to_string(), url.to_string()))
2431            })
2432            .collect();
2433        apply_project_url_mappings(
2434            &parsed_urls,
2435            &mut homepage_url,
2436            &mut bug_tracking_url,
2437            &mut code_view_url,
2438            &mut repository_url,
2439            extra_data,
2440        );
2441
2442        download_url = urls
2443            .get("Downloads")
2444            .or_else(|| urls.get("downloads"))
2445            .and_then(|v| v.as_str())
2446            .map(String::from);
2447
2448        if homepage_url.is_none() {
2449            homepage_url = urls
2450                .get(FIELD_HOMEPAGE)
2451                .and_then(|v| v.as_str())
2452                .map(String::from);
2453        }
2454        if repository_url.is_none() {
2455            repository_url = urls
2456                .get(FIELD_REPOSITORY)
2457                .and_then(|v| v.as_str())
2458                .map(String::from);
2459        }
2460    }
2461
2462    // If not found in URLs table, check for top-level keys
2463    if homepage_url.is_none() {
2464        homepage_url = project
2465            .get(FIELD_HOMEPAGE)
2466            .and_then(|v| v.as_str())
2467            .map(String::from);
2468    }
2469
2470    if repository_url.is_none() {
2471        repository_url = project
2472            .get(FIELD_REPOSITORY)
2473            .and_then(|v| v.as_str())
2474            .map(String::from);
2475    }
2476
2477    (
2478        homepage_url,
2479        download_url,
2480        bug_tracking_url,
2481        code_view_url,
2482        repository_url,
2483    )
2484}
2485
2486fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2487    let mut parties = Vec::new();
2488
2489    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2490        for author in authors {
2491            if let Some(author_str) = author.as_str() {
2492                let (name, email) = split_name_email(author_str);
2493                parties.push(Party {
2494                    r#type: None,
2495                    role: Some("author".to_string()),
2496                    name,
2497                    email,
2498                    url: None,
2499                    organization: None,
2500                    organization_url: None,
2501                    timezone: None,
2502                });
2503            } else if let Some(author_table) = author.as_table() {
2504                let name = author_table
2505                    .get("name")
2506                    .and_then(|value| value.as_str())
2507                    .map(|value| value.to_string());
2508                let email = author_table
2509                    .get("email")
2510                    .and_then(|value| value.as_str())
2511                    .map(|value| value.to_string());
2512                if name.is_some() || email.is_some() {
2513                    parties.push(Party {
2514                        r#type: None,
2515                        role: Some("author".to_string()),
2516                        name,
2517                        email,
2518                        url: None,
2519                        organization: None,
2520                        organization_url: None,
2521                        timezone: None,
2522                    });
2523                }
2524            }
2525        }
2526    }
2527
2528    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2529        for maintainer in maintainers {
2530            if let Some(maintainer_str) = maintainer.as_str() {
2531                let (name, email) = split_name_email(maintainer_str);
2532                parties.push(Party {
2533                    r#type: None,
2534                    role: Some("maintainer".to_string()),
2535                    name,
2536                    email,
2537                    url: None,
2538                    organization: None,
2539                    organization_url: None,
2540                    timezone: None,
2541                });
2542            } else if let Some(maintainer_table) = maintainer.as_table() {
2543                let name = maintainer_table
2544                    .get("name")
2545                    .and_then(|value| value.as_str())
2546                    .map(|value| value.to_string());
2547                let email = maintainer_table
2548                    .get("email")
2549                    .and_then(|value| value.as_str())
2550                    .map(|value| value.to_string());
2551                if name.is_some() || email.is_some() {
2552                    parties.push(Party {
2553                        r#type: None,
2554                        role: Some("maintainer".to_string()),
2555                        name,
2556                        email,
2557                        url: None,
2558                        organization: None,
2559                        organization_url: None,
2560                        timezone: None,
2561                    });
2562                }
2563            }
2564        }
2565    }
2566
2567    parties
2568}
2569
2570fn extract_dependencies(
2571    project: &TomlMap<String, TomlValue>,
2572    toml_content: &TomlValue,
2573) -> (Vec<Dependency>, Vec<Dependency>) {
2574    let mut dependencies = Vec::new();
2575    let mut optional_dependencies = Vec::new();
2576
2577    // Handle dependencies - can be array or table format
2578    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2579        match deps_value {
2580            TomlValue::Array(arr) => {
2581                dependencies = parse_dependency_array(arr, false, None);
2582            }
2583            TomlValue::Table(table) => {
2584                dependencies = parse_dependency_table(table, false, None);
2585            }
2586            _ => {}
2587        }
2588    }
2589
2590    // Handle PEP 621 optional-dependencies with scope
2591    if let Some(opt_deps_table) = project
2592        .get(FIELD_OPTIONAL_DEPENDENCIES)
2593        .and_then(|v| v.as_table())
2594    {
2595        for (extra_name, deps) in opt_deps_table {
2596            match deps {
2597                TomlValue::Array(arr) => {
2598                    optional_dependencies.extend(parse_dependency_array(
2599                        arr,
2600                        true,
2601                        Some(extra_name),
2602                    ));
2603                }
2604                TomlValue::Table(table) => {
2605                    optional_dependencies.extend(parse_dependency_table(
2606                        table,
2607                        true,
2608                        Some(extra_name),
2609                    ));
2610                }
2611                _ => {}
2612            }
2613        }
2614    }
2615
2616    // Handle Poetry dev-dependencies
2617    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2618        match dev_deps_value {
2619            TomlValue::Array(arr) => {
2620                optional_dependencies.extend(parse_dependency_array(
2621                    arr,
2622                    true,
2623                    Some(FIELD_DEV_DEPENDENCIES),
2624                ));
2625            }
2626            TomlValue::Table(table) => {
2627                optional_dependencies.extend(parse_dependency_table(
2628                    table,
2629                    true,
2630                    Some(FIELD_DEV_DEPENDENCIES),
2631                ));
2632            }
2633            _ => {}
2634        }
2635    }
2636
2637    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
2638    if let Some(groups_table) = toml_content
2639        .get("tool")
2640        .and_then(|value| value.as_table())
2641        .and_then(|tool| tool.get("poetry"))
2642        .and_then(|value| value.as_table())
2643        .and_then(|poetry| poetry.get("group"))
2644        .and_then(|value| value.as_table())
2645    {
2646        for (group_name, group_data) in groups_table {
2647            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2648                match group_deps {
2649                    TomlValue::Array(arr) => {
2650                        optional_dependencies.extend(parse_dependency_array(
2651                            arr,
2652                            true,
2653                            Some(group_name),
2654                        ));
2655                    }
2656                    TomlValue::Table(table) => {
2657                        optional_dependencies.extend(parse_poetry_group_dependency_table(
2658                            table,
2659                            true,
2660                            Some(group_name),
2661                        ));
2662                    }
2663                    _ => {}
2664                }
2665            }
2666        }
2667    }
2668
2669    if let Some(groups_table) = toml_content
2670        .get(FIELD_DEPENDENCY_GROUPS)
2671        .and_then(|value| value.as_table())
2672    {
2673        for (group_name, deps) in groups_table {
2674            match deps {
2675                TomlValue::Array(arr) => {
2676                    optional_dependencies.extend(parse_dependency_array(
2677                        arr,
2678                        true,
2679                        Some(group_name),
2680                    ));
2681                }
2682                TomlValue::Table(table) => {
2683                    optional_dependencies.extend(parse_dependency_table(
2684                        table,
2685                        true,
2686                        Some(group_name),
2687                    ));
2688                }
2689                _ => {}
2690            }
2691        }
2692    }
2693
2694    if let Some(dev_deps_value) = toml_content
2695        .get("tool")
2696        .and_then(|value| value.as_table())
2697        .and_then(|tool| tool.get("uv"))
2698        .and_then(|value| value.as_table())
2699        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2700    {
2701        match dev_deps_value {
2702            TomlValue::Array(arr) => {
2703                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2704            }
2705            TomlValue::Table(table) => {
2706                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2707            }
2708            _ => {}
2709        }
2710    }
2711
2712    (dependencies, optional_dependencies)
2713}
2714
2715fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2716    let mut extra_data = HashMap::new();
2717
2718    if let Some(tool_uv) = toml_content
2719        .get("tool")
2720        .and_then(|value| value.as_table())
2721        .and_then(|tool| tool.get("uv"))
2722    {
2723        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2724    }
2725
2726    if extra_data.is_empty() {
2727        None
2728    } else {
2729        Some(extra_data)
2730    }
2731}
2732
2733fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2734    match value {
2735        TomlValue::String(value) => JsonValue::String(value.clone()),
2736        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2737        TomlValue::Float(value) => JsonValue::String(value.to_string()),
2738        TomlValue::Boolean(value) => JsonValue::Bool(*value),
2739        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2740        TomlValue::Array(values) => {
2741            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2742        }
2743        TomlValue::Table(values) => JsonValue::Object(
2744            values
2745                .iter()
2746                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2747                .collect::<JsonMap<String, JsonValue>>(),
2748        ),
2749    }
2750}
2751
2752fn parse_dependency_table(
2753    table: &TomlMap<String, TomlValue>,
2754    is_optional: bool,
2755    scope: Option<&str>,
2756) -> Vec<Dependency> {
2757    table
2758        .iter()
2759        .filter_map(|(name, version)| {
2760            let version_str = version.as_str().map(|s| s.to_string());
2761            let mut package_url =
2762                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2763
2764            if let Some(v) = &version_str {
2765                package_url.with_version(v).ok()?;
2766            }
2767
2768            Some(Dependency {
2769                purl: Some(package_url.to_string()),
2770                extracted_requirement: None,
2771                scope: scope.map(|s| s.to_string()),
2772                is_runtime: Some(!is_optional),
2773                is_optional: Some(is_optional),
2774                is_pinned: None,
2775                is_direct: Some(true),
2776                resolved_package: None,
2777                extra_data: None,
2778            })
2779        })
2780        .collect()
2781}
2782
2783fn parse_poetry_group_dependency_table(
2784    table: &TomlMap<String, TomlValue>,
2785    is_optional: bool,
2786    scope: Option<&str>,
2787) -> Vec<Dependency> {
2788    table
2789        .iter()
2790        .filter_map(|(name, value)| build_poetry_group_dependency(name, value, is_optional, scope))
2791        .collect()
2792}
2793
2794fn build_poetry_group_dependency(
2795    name: &str,
2796    value: &TomlValue,
2797    is_optional: bool,
2798    scope: Option<&str>,
2799) -> Option<Dependency> {
2800    let normalized_name = normalize_python_dependency_name(name);
2801    let (version_spec, extras, marker) = match value {
2802        TomlValue::String(spec) => (Some(spec.trim().to_string()), Vec::new(), None),
2803        TomlValue::Table(table) => {
2804            let version_spec = table
2805                .get(FIELD_VERSION)
2806                .and_then(|value| value.as_str())
2807                .map(str::trim)
2808                .filter(|value| !value.is_empty())
2809                .map(ToOwned::to_owned);
2810            let extras = table
2811                .get(FIELD_EXTRAS)
2812                .and_then(|value| value.as_array())
2813                .map(|values| {
2814                    values
2815                        .iter()
2816                        .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2817                        .collect::<Vec<_>>()
2818                })
2819                .unwrap_or_default();
2820            let marker = table
2821                .get("markers")
2822                .and_then(|value| value.as_str())
2823                .map(str::trim)
2824                .filter(|value| !value.is_empty())
2825                .map(ToOwned::to_owned);
2826
2827            (version_spec, extras, marker)
2828        }
2829        _ => return None,
2830    };
2831
2832    let pinned_version = version_spec
2833        .as_deref()
2834        .and_then(extract_exact_pinned_version);
2835    let purl = build_python_dependency_purl(&normalized_name, pinned_version.as_deref())?;
2836
2837    let mut extra_data = HashMap::new();
2838    if let Some(marker) = marker {
2839        extra_data.insert("marker".to_string(), JsonValue::String(marker));
2840    }
2841    if !extras.is_empty() {
2842        extra_data.insert(
2843            "extras".to_string(),
2844            JsonValue::Array(extras.into_iter().map(JsonValue::String).collect()),
2845        );
2846    }
2847
2848    Some(Dependency {
2849        purl: Some(purl),
2850        extracted_requirement: version_spec,
2851        scope: scope.map(|value| value.to_string()),
2852        is_runtime: Some(!is_optional),
2853        is_optional: Some(is_optional),
2854        is_pinned: Some(pinned_version.is_some()),
2855        is_direct: Some(true),
2856        resolved_package: None,
2857        extra_data: if extra_data.is_empty() {
2858            None
2859        } else {
2860            Some(extra_data)
2861        },
2862    })
2863}
2864
2865fn parse_dependency_array(
2866    array: &[TomlValue],
2867    is_optional: bool,
2868    scope: Option<&str>,
2869) -> Vec<Dependency> {
2870    array
2871        .iter()
2872        .filter_map(|dep| {
2873            let dep_str = dep.as_str()?;
2874            build_pyproject_array_dependency(dep_str, is_optional, scope)
2875        })
2876        .collect()
2877}
2878
2879fn build_pyproject_array_dependency(
2880    dep_str: &str,
2881    is_optional: bool,
2882    scope: Option<&str>,
2883) -> Option<Dependency> {
2884    let parsed = parse_pep508_requirement(dep_str)?;
2885    let name = normalize_python_package_name(&parsed.name);
2886    let pinned_version = parsed
2887        .specifiers
2888        .as_deref()
2889        .and_then(extract_exact_pinned_version);
2890
2891    let purl = build_python_dependency_purl(&name, pinned_version.as_deref())?;
2892
2893    let mut extra_data = HashMap::new();
2894    if let Some(marker) = parsed.marker {
2895        extra_data.insert("marker".to_string(), JsonValue::String(marker));
2896    }
2897    if !parsed.extras.is_empty() {
2898        extra_data.insert(
2899            "extras".to_string(),
2900            JsonValue::Array(parsed.extras.into_iter().map(JsonValue::String).collect()),
2901        );
2902    }
2903
2904    let extracted_requirement = parsed.specifiers.or(parsed.url);
2905
2906    Some(Dependency {
2907        purl: Some(purl),
2908        extracted_requirement: extracted_requirement.clone(),
2909        scope: scope.map(|s| s.to_string()),
2910        is_runtime: Some(!is_optional),
2911        is_optional: Some(is_optional),
2912        is_pinned: Some(pinned_version.is_some()),
2913        is_direct: Some(true),
2914        resolved_package: None,
2915        extra_data: if extra_data.is_empty() {
2916            None
2917        } else {
2918            Some(extra_data)
2919        },
2920    })
2921}
2922
/// Returns the version named by a specifier string when it is a single exact
/// pin, otherwise `None`.
///
/// Accepted forms are `==<version>` and `===<version>` (surrounding whitespace
/// is ignored). The following are *not* pins and yield `None`:
///
/// - specifier sets containing a comma (more than one clause),
/// - any other operator (`>=`, `~=`, `^`, ...) or a bare version,
/// - an operator with no version after it,
/// - `==` with a trailing `.*`, which PEP 440 defines as prefix matching
///   (e.g. `==1.2.*` matches every `1.2.x` release) rather than one version.
///
/// `===` is arbitrary string equality, so a trailing `.*` there has no special
/// meaning and the literal is returned unchanged.
fn extract_exact_pinned_version(specifiers: &str) -> Option<String> {
    let trimmed = specifiers.trim();
    if trimmed.contains(',') {
        return None;
    }

    // `===` must be tested first: it also starts with `==`.
    let (version, is_arbitrary_equality) = if let Some(rest) = trimmed.strip_prefix("===") {
        (rest.trim(), true)
    } else if let Some(rest) = trimmed.strip_prefix("==") {
        (rest.trim(), false)
    } else {
        return None;
    };

    if version.is_empty() {
        return None;
    }

    // A wildcard suffix on `==` selects a release series, not a single version.
    if !is_arbitrary_equality && version.ends_with(".*") {
        return None;
    }

    Some(version.to_string())
}
2944
/// Literal Python value produced by [`LiteralEvaluator`] when statically
/// evaluating setup.py expressions.
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// An integer or float literal; integers are converted to `f64`.
    Number(f64),
    /// `True` / `False`.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list literal whose elements were all evaluable.
    List(Vec<Value>),
    /// A tuple literal whose elements were all evaluable.
    Tuple(Vec<Value>),
    /// A dict built from a dict literal, `dict(...)` keywords, or an
    /// `OrderedDict(...)` call; keys are stringified.
    Dict(HashMap<String, Value>),
}
2955
/// Evaluates literal-only Python expressions without executing any code,
/// bounded by depth and node-count limits.
struct LiteralEvaluator {
    /// Known name -> value bindings used to resolve `Name` expressions.
    constants: HashMap<String, Value>,
    /// Evaluation stops once the expression depth reaches this limit.
    max_depth: usize,
    /// Evaluation stops once this many nodes have been visited.
    max_nodes: usize,
    /// Running count of nodes visited across all `evaluate_expr` calls.
    nodes_visited: usize,
}
2962
2963impl LiteralEvaluator {
2964    fn new(constants: HashMap<String, Value>) -> Self {
2965        Self {
2966            constants,
2967            max_depth: MAX_SETUP_PY_AST_DEPTH,
2968            max_nodes: MAX_SETUP_PY_AST_NODES,
2969            nodes_visited: 0,
2970        }
2971    }
2972
2973    fn insert_constant(&mut self, name: String, value: Value) {
2974        self.constants.insert(name, value);
2975    }
2976
2977    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
2978        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
2979            return None;
2980        }
2981        self.nodes_visited += 1;
2982
2983        match expr {
2984            ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
2985                Some(Value::String(value.to_str().to_string()))
2986            }
2987            ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
2988                Some(Value::Bool(*value))
2989            }
2990            ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
2991                self.evaluate_number(value)
2992            }
2993            ast::Expr::NoneLiteral(_) => Some(Value::None),
2994            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
2995            ast::Expr::List(ast::ExprList { elts, .. }) => {
2996                let mut values = Vec::new();
2997                for elt in elts {
2998                    values.push(self.evaluate_expr(elt, depth + 1)?);
2999                }
3000                Some(Value::List(values))
3001            }
3002            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
3003                let mut values = Vec::new();
3004                for elt in elts {
3005                    values.push(self.evaluate_expr(elt, depth + 1)?);
3006                }
3007                Some(Value::Tuple(values))
3008            }
3009            ast::Expr::Dict(ast::ExprDict { items, .. }) => {
3010                let mut dict = HashMap::new();
3011                for item in items {
3012                    let key_expr = item.key.as_ref()?;
3013                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
3014                    let key = value_to_string(&key_value)?;
3015                    let value = self.evaluate_expr(&item.value, depth + 1)?;
3016                    dict.insert(key, value);
3017                }
3018                Some(Value::Dict(dict))
3019            }
3020            ast::Expr::Call(ast::ExprCall {
3021                func, arguments, ..
3022            }) => {
3023                let args = arguments.args.as_ref();
3024                let keywords = arguments.keywords.as_ref();
3025                if keywords.is_empty()
3026                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
3027                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
3028                {
3029                    return self.evaluate_ordered_dict(args, depth + 1);
3030                }
3031
3032                if !args.is_empty() {
3033                    return None;
3034                }
3035
3036                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
3037                    && id == "dict"
3038                {
3039                    let mut dict = HashMap::new();
3040                    for keyword in keywords {
3041                        let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
3042                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
3043                        dict.insert(key.to_string(), value);
3044                    }
3045                    return Some(Value::Dict(dict));
3046                }
3047
3048                None
3049            }
3050            _ => None,
3051        }
3052    }
3053
3054    fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
3055        match number {
3056            ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
3057            ast::Number::Float(value) => Some(Value::Number(*value)),
3058            ast::Number::Complex { .. } => None,
3059        }
3060    }
3061
3062    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
3063        if args.len() != 1 {
3064            return None;
3065        }
3066
3067        let items = match self.evaluate_expr(&args[0], depth)? {
3068            Value::List(items) | Value::Tuple(items) => items,
3069            _ => return None,
3070        };
3071
3072        let mut dict = HashMap::new();
3073        for item in items {
3074            let Value::Tuple(values) = item else {
3075                return None;
3076            };
3077            if values.len() != 2 {
3078                return None;
3079            }
3080            let key = value_to_string(&values[0])?;
3081            dict.insert(key, values[1].clone());
3082        }
3083
3084        Some(Value::Dict(dict))
3085    }
3086}
3087
/// Names under which `setup` and its providing modules are reachable in a
/// setup.py file, as gathered by `collect_setup_aliases`.
#[derive(Default)]
struct SetupAliases {
    /// Local names that refer to the `setup` function (always includes `setup`).
    setup_names: HashSet<String>,
    /// Local module alias -> imported setup module name.
    module_aliases: HashMap<String, String>,
}
3093
3094fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
3095    extract_from_setup_py(path).into_iter().collect()
3096}
3097
3098fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
3099    let content = match read_file_to_string(path) {
3100        Ok(content) => content,
3101        Err(e) => {
3102            warn!("Failed to read setup.py at {:?}: {}", path, e);
3103            return Some(default_package_data(path));
3104        }
3105    };
3106
3107    if content.len() > MAX_SETUP_PY_BYTES {
3108        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
3109        let package_data = extract_from_setup_py_regex(&content);
3110        return should_emit_setup_py_package(&package_data).then_some(package_data);
3111    }
3112
3113    let mut package_data = match extract_from_setup_py_ast(&content) {
3114        Ok(Some(data)) => data,
3115        Ok(None) => return Some(default_package_data(path)),
3116        Err(e) => {
3117            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
3118            extract_from_setup_py_regex(&content)
3119        }
3120    };
3121
3122    if package_data.name.is_none() {
3123        package_data.name = extract_setup_value(&content, "name");
3124    }
3125
3126    if package_data.version.is_none() {
3127        package_data.version = extract_setup_value(&content, "version");
3128    }
3129
3130    if package_data
3131        .version
3132        .as_deref()
3133        .is_some_and(|version| version.trim().is_empty())
3134    {
3135        package_data.version = None;
3136    }
3137
3138    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
3139    package_data.purl = build_setup_py_purl(
3140        package_data.name.as_deref(),
3141        package_data.version.as_deref(),
3142    );
3143
3144    if should_emit_setup_py_package(&package_data) {
3145        Some(package_data)
3146    } else {
3147        Some(default_package_data(path))
3148    }
3149}
3150
3151fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
3152    package_data.name.is_some()
3153        || package_data.version.is_some()
3154        || package_data.purl.is_some()
3155        || !package_data.dependencies.is_empty()
3156        || package_data.extracted_license_statement.is_some()
3157        || !package_data.license_detections.is_empty()
3158        || !package_data.parties.is_empty()
3159        || package_data.description.is_some()
3160        || package_data.homepage_url.is_some()
3161        || package_data.bug_tracking_url.is_some()
3162        || package_data.code_view_url.is_some()
3163        || package_data.vcs_url.is_some()
3164}
3165
3166fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
3167    if package_data.version.is_some()
3168        && package_data.extracted_license_statement.is_some()
3169        && package_data
3170            .parties
3171            .iter()
3172            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
3173    {
3174        return;
3175    }
3176
3177    let Some(root) = path.parent() else {
3178        return;
3179    };
3180
3181    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
3182
3183    if package_data.version.is_none() {
3184        package_data.version = dunder_metadata.version;
3185    }
3186
3187    if package_data.extracted_license_statement.is_none() {
3188        package_data.extracted_license_statement = dunder_metadata.license;
3189    }
3190
3191    let has_author = package_data
3192        .parties
3193        .iter()
3194        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
3195
3196    if !has_author && let Some(author) = dunder_metadata.author {
3197        package_data.parties.push(Party {
3198            r#type: Some("person".to_string()),
3199            role: Some("author".to_string()),
3200            name: Some(author),
3201            email: None,
3202            url: None,
3203            organization: None,
3204            organization_url: None,
3205            timezone: None,
3206        });
3207    }
3208}
3209
/// Metadata recovered from module-level dunder assignments in sibling modules.
#[derive(Default)]
struct DunderMetadata {
    /// Value of the first `__version__ = "..."` assignment found.
    version: Option<String>,
    /// Value of the first `__author__ = "..."` assignment found.
    author: Option<String>,
    /// Value of the first `__license__ = "..."` assignment found.
    license: Option<String>,
}
3216
3217fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
3218    let statements = match parse_module(content) {
3219        Ok(parsed) => parsed.into_suite(),
3220        Err(_) => return DunderMetadata::default(),
3221    };
3222
3223    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3224    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3225    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3226    let mut metadata = DunderMetadata::default();
3227    let mut candidate_paths = Vec::new();
3228
3229    for module in imported_dunder_modules(&statements) {
3230        let Some(path) = resolve_imported_module_path(root, &module) else {
3231            continue;
3232        };
3233
3234        candidate_paths.push(path);
3235    }
3236
3237    candidate_paths.extend(referenced_dunder_init_paths(root, content));
3238
3239    let mut seen_paths = HashSet::new();
3240    for path in candidate_paths {
3241        if !seen_paths.insert(path.clone()) {
3242            continue;
3243        }
3244
3245        let Ok(module_content) = read_file_to_string(&path) else {
3246            continue;
3247        };
3248
3249        if metadata.version.is_none() {
3250            metadata.version = version_re
3251                .as_ref()
3252                .and_then(|regex| regex.captures(&module_content))
3253                .and_then(|captures| captures.get(1))
3254                .map(|match_| match_.as_str().to_string());
3255        }
3256
3257        if metadata.author.is_none() {
3258            metadata.author = author_re
3259                .as_ref()
3260                .and_then(|regex| regex.captures(&module_content))
3261                .and_then(|captures| captures.get(1))
3262                .map(|match_| match_.as_str().to_string());
3263        }
3264
3265        if metadata.license.is_none() {
3266            metadata.license = license_re
3267                .as_ref()
3268                .and_then(|regex| regex.captures(&module_content))
3269                .and_then(|captures| captures.get(1))
3270                .map(|match_| match_.as_str().to_string());
3271        }
3272
3273        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
3274            return metadata;
3275        }
3276    }
3277
3278    metadata
3279}
3280
3281fn referenced_dunder_init_paths(root: &Path, content: &str) -> Vec<PathBuf> {
3282    let open_re = match Regex::new(r#"open\(\s*['\"]([^'\"]+__init__\.py)['\"]"#) {
3283        Ok(regex) => regex,
3284        Err(_) => return Vec::new(),
3285    };
3286
3287    open_re
3288        .captures_iter(content)
3289        .filter_map(|captures| captures.get(1).map(|m| m.as_str()))
3290        .filter_map(|relative| {
3291            let relative_path = PathBuf::from(relative);
3292            if relative_path.is_absolute()
3293                || relative_path.components().any(|component| {
3294                    matches!(
3295                        component,
3296                        Component::ParentDir | Component::RootDir | Component::Prefix(_)
3297                    )
3298                })
3299            {
3300                return None;
3301            }
3302
3303            let candidate = root.join(relative_path);
3304            candidate.exists().then_some(candidate)
3305        })
3306        .collect()
3307}
3308
3309fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
3310    let mut modules = Vec::new();
3311
3312    for statement in statements {
3313        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
3314            continue;
3315        };
3316        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
3317            continue;
3318        };
3319        let imports_dunder = names.iter().any(|alias| {
3320            matches!(
3321                alias.name.as_str(),
3322                "__version__" | "__author__" | "__license__"
3323            )
3324        });
3325        if imports_dunder {
3326            modules.push(module.to_string());
3327        }
3328    }
3329
3330    modules
3331}
3332
/// Maps a dotted module name to an existing file under `root`.
///
/// Candidates are tried in this order, and the first that exists wins:
/// `<root>/<module>.py`, `<root>/<module>/__init__.py`,
/// `<root>/src/<module>.py`, `<root>/src/<module>/__init__.py`
/// (dots in `module` become path separators). Returns `None` when none exist.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let module_dir: PathBuf = module.split('.').collect();
    let module_file = module_dir.with_extension("py");
    let package_init = module_dir.join("__init__.py");

    for base in [root.to_path_buf(), root.join("src")] {
        let as_module = base.join(&module_file);
        if as_module.exists() {
            return Some(as_module);
        }

        let as_package = base.join(&package_init);
        if as_package.exists() {
            return Some(as_package);
        }
    }

    None
}
3344
3345/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
3346///
3347/// # Security Model
3348///
3349/// This function parses setup.py as a Python AST and evaluates only literal values
3350/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
3351/// arbitrary code execution during scanning.
3352///
3353/// # DoS Prevention
3354///
3355/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
3356/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
3357/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
3358///
3359/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
3360fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
3361    let statements = parse_module(content)
3362        .map(|parsed| parsed.into_suite())
3363        .map_err(|e| e.to_string())?;
3364    let aliases = collect_setup_aliases(&statements);
3365    let mut evaluator = LiteralEvaluator::new(HashMap::new());
3366    build_setup_py_constants(&statements, &mut evaluator);
3367
3368    let setup_call = find_setup_call(&statements, &aliases);
3369    let Some(call_expr) = setup_call else {
3370        return Ok(None);
3371    };
3372
3373    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3374    Ok(Some(build_setup_py_package_data(&setup_values)))
3375}
3376
3377fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3378    for stmt in statements {
3379        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3380            if targets.len() != 1 {
3381                continue;
3382            }
3383
3384            let Some(name) = extract_assign_name(&targets[0]) else {
3385                continue;
3386            };
3387
3388            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3389                evaluator.insert_constant(name, value);
3390            }
3391        }
3392    }
3393}
3394
3395fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3396    match target {
3397        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3398        _ => None,
3399    }
3400}
3401
3402fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3403    let mut aliases = SetupAliases::default();
3404    aliases.setup_names.insert("setup".to_string());
3405
3406    for stmt in statements {
3407        match stmt {
3408            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3409                for alias in names {
3410                    let module_name = alias.name.as_str();
3411                    if !is_setup_module(module_name) {
3412                        continue;
3413                    }
3414                    let alias_name = alias
3415                        .asname
3416                        .as_ref()
3417                        .map(|name| name.as_str())
3418                        .unwrap_or(module_name);
3419                    aliases
3420                        .module_aliases
3421                        .insert(alias_name.to_string(), module_name.to_string());
3422                }
3423            }
3424            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3425                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3426                    continue;
3427                };
3428                if !is_setup_module(module_name) {
3429                    continue;
3430                }
3431                for alias in names {
3432                    if alias.name.as_str() != "setup" {
3433                        continue;
3434                    }
3435                    let alias_name = alias
3436                        .asname
3437                        .as_ref()
3438                        .map(|name| name.as_str())
3439                        .unwrap_or("setup");
3440                    aliases.setup_names.insert(alias_name.to_string());
3441                }
3442            }
3443            _ => {}
3444        }
3445    }
3446
3447    aliases
3448}
3449
/// Reports whether `module_name` is one of the modules treated as a provider
/// of `setup`: `setuptools`, `distutils` or `distutils.core` (exact match).
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3453
3454fn find_setup_call<'a>(
3455    statements: &'a [ast::Stmt],
3456    aliases: &'a SetupAliases,
3457) -> Option<&'a ast::Expr> {
3458    let mut finder = SetupCallFinder {
3459        aliases,
3460        called_function_names: collect_top_level_called_function_names(statements),
3461        nodes_visited: 0,
3462    };
3463    finder.find_in_statements(statements)
3464}
3465
3466fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3467    let mut called = HashSet::new();
3468    collect_called_function_names_in_statements(statements, &mut called);
3469    called
3470}
3471
3472fn collect_called_function_names_in_statements(
3473    statements: &[ast::Stmt],
3474    called: &mut HashSet<String>,
3475) {
3476    for stmt in statements {
3477        match stmt {
3478            ast::Stmt::Expr(ast::StmtExpr { value, .. })
3479            | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3480                collect_called_function_names_in_expr(value.as_ref(), called);
3481            }
3482            ast::Stmt::If(ast::StmtIf {
3483                body,
3484                elif_else_clauses,
3485                ..
3486            }) => {
3487                collect_called_function_names_in_statements(body, called);
3488                for clause in elif_else_clauses {
3489                    collect_called_function_names_in_statements(&clause.body, called);
3490                }
3491            }
3492            ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3493            | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3494                collect_called_function_names_in_statements(body, called);
3495                collect_called_function_names_in_statements(orelse, called);
3496            }
3497            ast::Stmt::With(ast::StmtWith { body, .. }) => {
3498                collect_called_function_names_in_statements(body, called);
3499            }
3500            ast::Stmt::Try(ast::StmtTry {
3501                body,
3502                orelse,
3503                finalbody,
3504                handlers,
3505                ..
3506            }) => {
3507                collect_called_function_names_in_statements(body, called);
3508                collect_called_function_names_in_statements(orelse, called);
3509                collect_called_function_names_in_statements(finalbody, called);
3510                for handler in handlers {
3511                    let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3512                        body,
3513                        ..
3514                    }) = handler;
3515                    collect_called_function_names_in_statements(body, called);
3516                }
3517            }
3518            _ => {}
3519        }
3520    }
3521}
3522
3523fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3524    if let ast::Expr::Call(ast::ExprCall {
3525        func, arguments, ..
3526    }) = expr
3527    {
3528        if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3529            called.insert(id.as_str().to_string());
3530        }
3531
3532        for arg in arguments.args.iter() {
3533            collect_called_function_names_in_expr(arg, called);
3534        }
3535        for keyword in arguments.keywords.iter() {
3536            collect_called_function_names_in_expr(&keyword.value, called);
3537        }
3538    }
3539}
3540
3541struct SetupCallFinder<'a> {
3542    aliases: &'a SetupAliases,
3543    called_function_names: HashSet<String>,
3544    nodes_visited: usize,
3545}
3546
3547impl<'a> SetupCallFinder<'a> {
3548    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3549        for stmt in statements {
3550            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3551                return None;
3552            }
3553            self.nodes_visited += 1;
3554
3555            let found = match stmt {
3556                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3557                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3558                ast::Stmt::If(ast::StmtIf {
3559                    body,
3560                    elif_else_clauses,
3561                    ..
3562                }) => self.find_in_statements(body).or_else(|| {
3563                    for clause in elif_else_clauses {
3564                        if let Some(found) = self.find_in_statements(&clause.body) {
3565                            return Some(found);
3566                        }
3567                    }
3568                    None
3569                }),
3570                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3571                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3572                    .find_in_statements(body)
3573                    .or_else(|| self.find_in_statements(orelse)),
3574                ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3575                    .called_function_names
3576                    .contains(name.as_str())
3577                    .then(|| self.find_in_statements(body))
3578                    .flatten(),
3579                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3580                ast::Stmt::Try(ast::StmtTry {
3581                    body,
3582                    orelse,
3583                    finalbody,
3584                    handlers,
3585                    ..
3586                }) => self
3587                    .find_in_statements(body)
3588                    .or_else(|| self.find_in_statements(orelse))
3589                    .or_else(|| self.find_in_statements(finalbody))
3590                    .or_else(|| {
3591                        for handler in handlers {
3592                            let ast::ExceptHandler::ExceptHandler(
3593                                ast::ExceptHandlerExceptHandler { body, .. },
3594                            ) = handler;
3595                            if let Some(found) = self.find_in_statements(body) {
3596                                return Some(found);
3597                            }
3598                        }
3599                        None
3600                    }),
3601                _ => None,
3602            };
3603
3604            if found.is_some() {
3605                return found;
3606            }
3607        }
3608
3609        None
3610    }
3611
3612    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3613        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3614            return None;
3615        }
3616        self.nodes_visited += 1;
3617
3618        match expr {
3619            ast::Expr::Call(ast::ExprCall { func, .. })
3620                if is_setup_call(func.as_ref(), self.aliases) =>
3621            {
3622                Some(expr)
3623            }
3624            _ => None,
3625        }
3626    }
3627}
3628
3629fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3630    let Some(dotted) = dotted_name(func, 0) else {
3631        return false;
3632    };
3633
3634    if aliases.setup_names.contains(&dotted) {
3635        return true;
3636    }
3637
3638    let Some(module) = dotted.strip_suffix(".setup") else {
3639        return false;
3640    };
3641
3642    let resolved = resolve_module_alias(module, aliases);
3643    is_setup_module(&resolved)
3644}
3645
3646fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3647    if depth >= MAX_SETUP_PY_AST_DEPTH {
3648        return None;
3649    }
3650
3651    match expr {
3652        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3653        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3654            let base = dotted_name(value.as_ref(), depth + 1)?;
3655            Some(format!("{}.{}", base, attr.as_str()))
3656        }
3657        _ => None,
3658    }
3659}
3660
3661fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3662    if let Some(mapped) = aliases.module_aliases.get(module) {
3663        return mapped.clone();
3664    }
3665
3666    let Some((base, rest)) = module.split_once('.') else {
3667        return module.to_string();
3668    };
3669
3670    if let Some(mapped) = aliases.module_aliases.get(base) {
3671        return format!("{}.{}", mapped, rest);
3672    }
3673
3674    module.to_string()
3675}
3676
3677fn extract_setup_keywords(
3678    call_expr: &ast::Expr,
3679    evaluator: &mut LiteralEvaluator,
3680) -> HashMap<String, Value> {
3681    let mut values = HashMap::new();
3682    let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3683        return values;
3684    };
3685
3686    for keyword in arguments.keywords.iter() {
3687        if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3688            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3689                values.insert(arg.to_string(), value);
3690            }
3691        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3692            for (key, value) in dict {
3693                values.insert(key, value);
3694            }
3695        }
3696    }
3697
3698    values
3699}
3700
3701fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3702    let name = get_value_string(values, "name");
3703    let version = get_value_string(values, "version");
3704    let description =
3705        get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
3706    let homepage_url =
3707        get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
3708    let author = get_value_string(values, "author");
3709    let author_email = get_value_string(values, "author_email");
3710    let maintainer = get_value_string(values, "maintainer");
3711    let maintainer_email = get_value_string(values, "maintainer_email");
3712    let license = get_value_string(values, "license");
3713    let classifiers = values
3714        .get("classifiers")
3715        .and_then(value_to_string_list)
3716        .unwrap_or_default();
3717
3718    let mut parties = Vec::new();
3719    if author.is_some() || author_email.is_some() {
3720        parties.push(Party {
3721            r#type: Some("person".to_string()),
3722            role: Some("author".to_string()),
3723            name: author,
3724            email: author_email,
3725            url: None,
3726            organization: None,
3727            organization_url: None,
3728            timezone: None,
3729        });
3730    }
3731
3732    if maintainer.is_some() || maintainer_email.is_some() {
3733        parties.push(Party {
3734            r#type: Some("person".to_string()),
3735            role: Some("maintainer".to_string()),
3736            name: maintainer,
3737            email: maintainer_email,
3738            url: None,
3739            organization: None,
3740            organization_url: None,
3741            timezone: None,
3742        });
3743    }
3744
3745    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3746        normalize_spdx_declared_license(license.as_deref());
3747    let extracted_license_statement = license.clone();
3748
3749    let dependencies = build_setup_py_dependencies(values);
3750    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3751    let mut homepage_from_project_urls = None;
3752    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3753    let mut extra_data = HashMap::new();
3754
3755    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3756        apply_project_url_mappings(
3757            &parsed_project_urls,
3758            &mut homepage_from_project_urls,
3759            &mut bug_tracking_url,
3760            &mut code_view_url,
3761            &mut vcs_url,
3762            &mut extra_data,
3763        );
3764    }
3765
3766    let extra_data = if extra_data.is_empty() {
3767        None
3768    } else {
3769        Some(extra_data)
3770    };
3771
3772    PackageData {
3773        package_type: Some(PythonParser::PACKAGE_TYPE),
3774        namespace: None,
3775        name,
3776        version,
3777        qualifiers: None,
3778        subpath: None,
3779        primary_language: Some("Python".to_string()),
3780        description,
3781        release_date: None,
3782        parties,
3783        keywords: Vec::new(),
3784        homepage_url: homepage_url.or(homepage_from_project_urls),
3785        download_url: None,
3786        size: None,
3787        sha1: None,
3788        md5: None,
3789        sha256: None,
3790        sha512: None,
3791        bug_tracking_url,
3792        code_view_url,
3793        vcs_url,
3794        copyright: None,
3795        holder: None,
3796        declared_license_expression,
3797        declared_license_expression_spdx,
3798        license_detections,
3799        other_license_expression: None,
3800        other_license_expression_spdx: None,
3801        other_license_detections: Vec::new(),
3802        extracted_license_statement,
3803        notice_text: None,
3804        source_packages: Vec::new(),
3805        file_references: Vec::new(),
3806        is_private: has_private_classifier(&classifiers),
3807        is_virtual: false,
3808        extra_data,
3809        dependencies,
3810        repository_homepage_url: None,
3811        repository_download_url: None,
3812        api_data_url: None,
3813        datasource_id: Some(DatasourceId::PypiSetupPy),
3814        purl,
3815    }
3816}
3817
3818fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3819    let mut dependencies = Vec::new();
3820
3821    if let Some(reqs) = values
3822        .get("install_requires")
3823        .and_then(value_to_string_list)
3824    {
3825        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3826    }
3827
3828    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3829        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3830    }
3831
3832    if let Some(Value::Dict(extras)) = values.get("extras_require") {
3833        let mut extra_items: Vec<_> = extras.iter().collect();
3834        extra_items.sort_by_key(|(name, _)| *name);
3835        for (extra_name, extra_value) in extra_items {
3836            if let Some(reqs) = value_to_string_list(extra_value) {
3837                dependencies.extend(build_setup_py_dependency_list(
3838                    reqs.as_slice(),
3839                    extra_name,
3840                    true,
3841                ));
3842            }
3843        }
3844    }
3845
3846    dependencies
3847}
3848
3849fn build_setup_py_dependency_list(
3850    reqs: &[String],
3851    scope: &str,
3852    is_optional: bool,
3853) -> Vec<Dependency> {
3854    reqs.iter()
3855        .filter_map(|req| build_python_dependency(req, scope, is_optional, None))
3856        .collect()
3857}
3858
3859fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3860    values.get(key).and_then(value_to_string)
3861}
3862
3863fn value_to_string(value: &Value) -> Option<String> {
3864    match value {
3865        Value::String(value) => Some(value.clone()),
3866        Value::Number(value) => Some(value.to_string()),
3867        Value::Bool(value) => Some(value.to_string()),
3868        _ => None,
3869    }
3870}
3871
3872fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3873    match value {
3874        Value::String(value) => Some(vec![value.clone()]),
3875        Value::List(values) | Value::Tuple(values) => {
3876            let mut items = Vec::new();
3877            for item in values {
3878                items.push(value_to_string(item)?);
3879            }
3880            Some(items)
3881        }
3882        _ => None,
3883    }
3884}
3885
3886fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3887    let Value::Dict(dict) = value else {
3888        return None;
3889    };
3890
3891    let mut pairs: Vec<(String, String)> = dict
3892        .iter()
3893        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3894        .collect::<Option<Vec<_>>>()?;
3895    pairs.sort_by(|left, right| left.0.cmp(&right.0));
3896    Some(pairs)
3897}
3898
3899fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3900    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3901    extract_requires_dist_dependencies(&requires_dist)
3902}
3903
3904pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3905    requires_dist
3906        .iter()
3907        .filter_map(|entry| build_rfc822_dependency(entry))
3908        .collect()
3909}
3910
3911fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3912    build_python_dependency(entry, "install", false, None)
3913}
3914
3915fn build_python_dependency(
3916    entry: &str,
3917    default_scope: &str,
3918    default_optional: bool,
3919    marker_override: Option<&str>,
3920) -> Option<Dependency> {
3921    let (requirement_part, marker_part) = entry
3922        .split_once(';')
3923        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3924        .unwrap_or((entry.trim(), None));
3925
3926    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3927    let requirement = normalize_rfc822_requirement(requirement_part);
3928    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3929        marker_part.or(marker_override),
3930        default_scope,
3931        default_optional,
3932    );
3933    let purl = build_python_dependency_purl(&name, None)?;
3934
3935    let is_pinned = requirement
3936        .as_deref()
3937        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3938    let purl = if is_pinned {
3939        requirement
3940            .as_deref()
3941            .map(|req| req.trim_start_matches('='))
3942            .and_then(|version| build_python_dependency_purl(&name, Some(version)))
3943            .unwrap_or(purl)
3944    } else {
3945        purl
3946    };
3947
3948    let mut extra_data = HashMap::new();
3949    extra_data.extend(marker_data);
3950    if let Some(marker) = marker {
3951        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3952    }
3953
3954    Some(Dependency {
3955        purl: Some(purl),
3956        extracted_requirement: requirement,
3957        scope: Some(scope),
3958        is_runtime: Some(true),
3959        is_optional: Some(is_optional),
3960        is_pinned: Some(is_pinned),
3961        is_direct: Some(true),
3962        resolved_package: None,
3963        extra_data: if extra_data.is_empty() {
3964            None
3965        } else {
3966            Some(extra_data)
3967        },
3968    })
3969}
3970
3971fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3972    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3973    let trimmed = requirement_part.trim();
3974    let mut remainder = trimmed[name.len()..].trim();
3975
3976    if let Some(stripped) = remainder.strip_prefix('[')
3977        && let Some(end_idx) = stripped.find(']')
3978    {
3979        remainder = stripped[end_idx + 1..].trim();
3980    }
3981
3982    let remainder = remainder
3983        .strip_prefix('(')
3984        .and_then(|value| value.strip_suffix(')'))
3985        .unwrap_or(remainder)
3986        .trim();
3987
3988    if remainder.is_empty() {
3989        return None;
3990    }
3991
3992    let mut specifiers: Vec<String> = remainder
3993        .split(',')
3994        .map(|specifier| specifier.trim().replace(' ', ""))
3995        .filter(|specifier| !specifier.is_empty())
3996        .collect();
3997    specifiers.sort();
3998    Some(specifiers.join(","))
3999}
4000
/// Percent-encodes every `*` in a version string as `%2A`; all other
/// characters are copied unchanged.
fn encode_python_dependency_purl_version(version: &str) -> String {
    let mut encoded = String::new();
    for ch in version.chars() {
        if ch == '*' {
            encoded.push_str("%2A");
        } else {
            encoded.push(ch);
        }
    }
    encoded
}
4004
4005fn build_python_dependency_purl(name: &str, version: Option<&str>) -> Option<String> {
4006    let normalized_name = normalize_python_dependency_name(name);
4007
4008    PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &normalized_name)
4009        .ok()
4010        .map(|_| match version {
4011            Some(version) => {
4012                format!(
4013                    "pkg:pypi/{normalized_name}@{}",
4014                    encode_python_dependency_purl_version(version)
4015                )
4016            }
4017            None => format!("pkg:pypi/{normalized_name}"),
4018        })
4019}
4020
/// Normalizes a package name for use in a purl: surrounding whitespace is
/// removed, ASCII letters are lowercased and `_` becomes `-`.
fn normalize_python_dependency_name(name: &str) -> String {
    name.trim()
        .chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
4024
4025fn parse_rfc822_marker(
4026    marker_part: Option<&str>,
4027    default_scope: &str,
4028    default_optional: bool,
4029) -> (
4030    String,
4031    bool,
4032    Option<String>,
4033    HashMap<String, serde_json::Value>,
4034) {
4035    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
4036        return (
4037            default_scope.to_string(),
4038            default_optional,
4039            None,
4040            HashMap::new(),
4041        );
4042    };
4043
4044    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
4045        .expect("extra marker regex should compile");
4046    let mut extra_data = HashMap::new();
4047
4048    if let Some(python_version) = extract_marker_field(marker, "python_version") {
4049        extra_data.insert(
4050            "python_version".to_string(),
4051            serde_json::Value::String(python_version),
4052        );
4053    }
4054    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
4055        extra_data.insert(
4056            "sys_platform".to_string(),
4057            serde_json::Value::String(sys_platform),
4058        );
4059    }
4060
4061    if let Some(captures) = extra_re.captures(marker)
4062        && let Some(scope) = captures.get(1)
4063    {
4064        return (
4065            scope.as_str().to_string(),
4066            true,
4067            Some(marker.trim().to_string()),
4068            extra_data,
4069        );
4070    }
4071
4072    (
4073        default_scope.to_string(),
4074        default_optional,
4075        Some(marker.trim().to_string()),
4076        extra_data,
4077    )
4078}
4079
4080fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
4081    let re = Regex::new(&format!(
4082        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
4083        field
4084    ))
4085    .ok()?;
4086    let captures = re.captures(marker)?;
4087    let operator = captures.get(1)?.as_str();
4088    let value = captures.get(2)?.as_str();
4089    Some(format!("{} {}", operator, value))
4090}
4091
4092fn parse_requires_txt(content: &str) -> Vec<Dependency> {
4093    let mut dependencies = Vec::new();
4094    let mut current_scope = "install".to_string();
4095    let mut current_optional = false;
4096    let mut current_marker: Option<String> = None;
4097
4098    for line in content.lines() {
4099        let trimmed = line.trim();
4100        if trimmed.is_empty() || trimmed.starts_with('#') {
4101            continue;
4102        }
4103
4104        if trimmed.starts_with('[') && trimmed.ends_with(']') {
4105            let inner = &trimmed[1..trimmed.len() - 1];
4106            if let Some(rest) = inner.strip_prefix(':') {
4107                current_scope = "install".to_string();
4108                current_optional = false;
4109                current_marker = Some(rest.trim().to_string());
4110            } else if let Some((scope, marker)) = inner.split_once(':') {
4111                current_scope = scope.trim().to_string();
4112                current_optional = true;
4113                current_marker = Some(marker.trim().to_string());
4114            } else {
4115                current_scope = inner.trim().to_string();
4116                current_optional = true;
4117                current_marker = None;
4118            }
4119            continue;
4120        }
4121
4122        if let Some(dependency) = build_python_dependency(
4123            trimmed,
4124            &current_scope,
4125            current_optional,
4126            current_marker.as_deref(),
4127        ) {
4128            dependencies.push(dependency);
4129        }
4130    }
4131
4132    dependencies
4133}
4134
/// Reports whether any classifier equals `Private :: Do Not Upload`,
/// compared ASCII case-insensitively.
fn has_private_classifier(classifiers: &[String]) -> bool {
    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case("Private :: Do Not Upload") {
            return true;
        }
    }
    false
}
4140
4141fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
4142    let name = name?;
4143    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
4144    if let Some(version) = version {
4145        package_url.with_version(version).ok()?;
4146    }
4147    Some(package_url.to_string())
4148}
4149
4150fn extract_from_setup_py_regex(content: &str) -> PackageData {
4151    let name = extract_setup_value(content, "name");
4152    let version = extract_setup_value(content, "version");
4153    let license_expression = extract_setup_value(content, "license");
4154
4155    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4156        normalize_spdx_declared_license(license_expression.as_deref());
4157    let extracted_license_statement = license_expression.clone();
4158
4159    let dependencies = extract_setup_py_dependencies(content);
4160    let homepage_url = extract_setup_value(content, "url");
4161    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
4162
4163    PackageData {
4164        package_type: Some(PythonParser::PACKAGE_TYPE),
4165        namespace: None,
4166        name,
4167        version,
4168        qualifiers: None,
4169        subpath: None,
4170        primary_language: Some("Python".to_string()),
4171        description: None,
4172        release_date: None,
4173        parties: Vec::new(),
4174        keywords: Vec::new(),
4175        homepage_url,
4176        download_url: None,
4177        size: None,
4178        sha1: None,
4179        md5: None,
4180        sha256: None,
4181        sha512: None,
4182        bug_tracking_url: None,
4183        code_view_url: None,
4184        vcs_url: None,
4185        copyright: None,
4186        holder: None,
4187        declared_license_expression,
4188        declared_license_expression_spdx,
4189        license_detections,
4190        other_license_expression: None,
4191        other_license_expression_spdx: None,
4192        other_license_detections: Vec::new(),
4193        extracted_license_statement,
4194        notice_text: None,
4195        source_packages: Vec::new(),
4196        file_references: Vec::new(),
4197        is_private: false,
4198        is_virtual: false,
4199        extra_data: None,
4200        dependencies,
4201        repository_homepage_url: None,
4202        repository_download_url: None,
4203        api_data_url: None,
4204        datasource_id: Some(DatasourceId::PypiSetupPy),
4205        purl,
4206    }
4207}
4208
4209fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
4210    crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
4211}
4212
4213fn extract_from_pypi_json(path: &Path) -> PackageData {
4214    let default = PackageData {
4215        package_type: Some(PythonParser::PACKAGE_TYPE),
4216        datasource_id: Some(DatasourceId::PypiJson),
4217        ..Default::default()
4218    };
4219
4220    let content = match read_file_to_string(path) {
4221        Ok(content) => content,
4222        Err(error) => {
4223            warn!("Failed to read pypi.json at {:?}: {}", path, error);
4224            return default;
4225        }
4226    };
4227
4228    let root: serde_json::Value = match serde_json::from_str(&content) {
4229        Ok(value) => value,
4230        Err(error) => {
4231            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
4232            return default;
4233        }
4234    };
4235
4236    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
4237        warn!("No info object found in pypi.json at {:?}", path);
4238        return default;
4239    };
4240
4241    let name = info
4242        .get("name")
4243        .and_then(|value| value.as_str())
4244        .map(ToOwned::to_owned);
4245    let version = info
4246        .get("version")
4247        .and_then(|value| value.as_str())
4248        .map(ToOwned::to_owned);
4249    let summary = info
4250        .get("summary")
4251        .and_then(|value| value.as_str())
4252        .map(ToOwned::to_owned);
4253    let description = info
4254        .get("description")
4255        .and_then(|value| value.as_str())
4256        .filter(|value| !value.trim().is_empty())
4257        .map(ToOwned::to_owned)
4258        .or(summary);
4259    let mut homepage_url = info
4260        .get("home_page")
4261        .and_then(|value| value.as_str())
4262        .map(ToOwned::to_owned);
4263    let author = info
4264        .get("author")
4265        .and_then(|value| value.as_str())
4266        .filter(|value| !value.trim().is_empty())
4267        .map(ToOwned::to_owned);
4268    let author_email = info
4269        .get("author_email")
4270        .and_then(|value| value.as_str())
4271        .filter(|value| !value.trim().is_empty())
4272        .map(ToOwned::to_owned);
4273    let license = info
4274        .get("license")
4275        .and_then(|value| value.as_str())
4276        .filter(|value| !value.trim().is_empty())
4277        .map(ToOwned::to_owned);
4278    let keywords = parse_setup_cfg_keywords(
4279        info.get("keywords")
4280            .and_then(|value| value.as_str())
4281            .map(ToOwned::to_owned),
4282    );
4283    let classifiers = info
4284        .get("classifiers")
4285        .and_then(|value| value.as_array())
4286        .map(|values| {
4287            values
4288                .iter()
4289                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
4290                .collect::<Vec<_>>()
4291        })
4292        .unwrap_or_default();
4293
4294    let mut parties = Vec::new();
4295    if author.is_some() || author_email.is_some() {
4296        parties.push(Party {
4297            r#type: Some("person".to_string()),
4298            role: Some("author".to_string()),
4299            name: author,
4300            email: author_email,
4301            url: None,
4302            organization: None,
4303            organization_url: None,
4304            timezone: None,
4305        });
4306    }
4307
4308    let mut bug_tracking_url = None;
4309    let mut code_view_url = None;
4310    let mut vcs_url = None;
4311    let mut extra_data = HashMap::new();
4312
4313    let parsed_project_urls = info
4314        .get("project_urls")
4315        .and_then(|value| value.as_object())
4316        .map(|map| {
4317            let mut pairs: Vec<(String, String)> = map
4318                .iter()
4319                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
4320                .collect();
4321            pairs.sort_by(|left, right| left.0.cmp(&right.0));
4322            pairs
4323        })
4324        .unwrap_or_default();
4325
4326    apply_project_url_mappings(
4327        &parsed_project_urls,
4328        &mut homepage_url,
4329        &mut bug_tracking_url,
4330        &mut code_view_url,
4331        &mut vcs_url,
4332        &mut extra_data,
4333    );
4334
4335    let (download_url, size, sha256) = root
4336        .get("urls")
4337        .and_then(|value| value.as_array())
4338        .map(|urls| select_pypi_json_artifact(urls))
4339        .unwrap_or((None, None, None));
4340
4341    let sha256 = sha256.and_then(|h| Sha256Digest::from_hex(&h).ok());
4342
4343    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4344        normalize_spdx_declared_license(license.as_deref());
4345    let dependencies = info
4346        .get("requires_dist")
4347        .and_then(|value| value.as_array())
4348        .map(|entries| {
4349            entries
4350                .iter()
4351                .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4352                .collect::<Vec<_>>()
4353        })
4354        .map(|entries| extract_requires_dist_dependencies(&entries))
4355        .unwrap_or_default();
4356
4357    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
4358        build_pypi_urls(name.as_deref(), version.as_deref());
4359
4360    PackageData {
4361        package_type: Some(PythonParser::PACKAGE_TYPE),
4362        namespace: None,
4363        name,
4364        version,
4365        qualifiers: None,
4366        subpath: None,
4367        primary_language: None,
4368        description,
4369        release_date: None,
4370        parties,
4371        keywords,
4372        homepage_url: homepage_url.or(repository_homepage_url.clone()),
4373        download_url,
4374        size,
4375        sha1: None,
4376        md5: None,
4377        sha256,
4378        sha512: None,
4379        bug_tracking_url,
4380        code_view_url,
4381        vcs_url,
4382        copyright: None,
4383        holder: None,
4384        declared_license_expression,
4385        declared_license_expression_spdx,
4386        license_detections,
4387        other_license_expression: None,
4388        other_license_expression_spdx: None,
4389        other_license_detections: Vec::new(),
4390        extracted_license_statement: license,
4391        notice_text: None,
4392        source_packages: Vec::new(),
4393        file_references: Vec::new(),
4394        is_private: has_private_classifier(&classifiers),
4395        is_virtual: false,
4396        extra_data: if extra_data.is_empty() {
4397            None
4398        } else {
4399            Some(extra_data)
4400        },
4401        dependencies,
4402        repository_homepage_url,
4403        repository_download_url,
4404        api_data_url,
4405        datasource_id: Some(DatasourceId::PypiJson),
4406        purl,
4407    }
4408}
4409
4410fn select_pypi_json_artifact(
4411    urls: &[serde_json::Value],
4412) -> (Option<String>, Option<u64>, Option<String>) {
4413    let selected = urls
4414        .iter()
4415        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4416        .or_else(|| urls.first());
4417
4418    let Some(entry) = selected else {
4419        return (None, None, None);
4420    };
4421
4422    let download_url = entry
4423        .get("url")
4424        .and_then(|value| value.as_str())
4425        .map(ToOwned::to_owned);
4426    let size = entry.get("size").and_then(|value| value.as_u64());
4427    let sha256 = entry
4428        .get("digests")
4429        .and_then(|value| value.as_object())
4430        .and_then(|digests| digests.get("sha256"))
4431        .and_then(|value| value.as_str())
4432        .map(ToOwned::to_owned);
4433
4434    (download_url, size, sha256)
4435}
4436
4437fn extract_from_pip_inspect(path: &Path) -> PackageData {
4438    let content = match read_file_to_string(path) {
4439        Ok(content) => content,
4440        Err(e) => {
4441            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4442            return default_package_data(path);
4443        }
4444    };
4445
4446    let root: serde_json::Value = match serde_json::from_str(&content) {
4447        Ok(value) => value,
4448        Err(e) => {
4449            warn!(
4450                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4451                path, e
4452            );
4453            return default_package_data(path);
4454        }
4455    };
4456
4457    let installed = match root.get("installed").and_then(|v| v.as_array()) {
4458        Some(arr) => arr,
4459        None => {
4460            warn!(
4461                "No 'installed' array found in pip-inspect.deplock at {:?}",
4462                path
4463            );
4464            return default_package_data(path);
4465        }
4466    };
4467
4468    let pip_version = root
4469        .get("pip_version")
4470        .and_then(|v| v.as_str())
4471        .map(String::from);
4472    let inspect_version = root
4473        .get("version")
4474        .and_then(|v| v.as_str())
4475        .map(String::from);
4476
4477    let mut main_package: Option<PackageData> = None;
4478    let mut dependencies: Vec<Dependency> = Vec::new();
4479
4480    for package_entry in installed {
4481        let metadata = match package_entry.get("metadata") {
4482            Some(m) => m,
4483            None => continue,
4484        };
4485
4486        let is_requested = package_entry
4487            .get("requested")
4488            .and_then(|v| v.as_bool())
4489            .unwrap_or(false);
4490        let has_direct_url = package_entry.get("direct_url").is_some();
4491
4492        let name = metadata
4493            .get("name")
4494            .and_then(|v| v.as_str())
4495            .map(String::from);
4496        let version = metadata
4497            .get("version")
4498            .and_then(|v| v.as_str())
4499            .map(String::from);
4500        let summary = metadata
4501            .get("summary")
4502            .and_then(|v| v.as_str())
4503            .map(String::from);
4504        let home_page = metadata
4505            .get("home_page")
4506            .and_then(|v| v.as_str())
4507            .map(String::from);
4508        let author = metadata
4509            .get("author")
4510            .and_then(|v| v.as_str())
4511            .map(String::from);
4512        let author_email = metadata
4513            .get("author_email")
4514            .and_then(|v| v.as_str())
4515            .map(String::from);
4516        let license = metadata
4517            .get("license")
4518            .and_then(|v| v.as_str())
4519            .map(String::from);
4520        let description = metadata
4521            .get("description")
4522            .and_then(|v| v.as_str())
4523            .map(String::from);
4524        let keywords = metadata
4525            .get("keywords")
4526            .and_then(|v| v.as_array())
4527            .map(|arr| {
4528                arr.iter()
4529                    .filter_map(|k| k.as_str().map(String::from))
4530                    .collect::<Vec<_>>()
4531            })
4532            .unwrap_or_default();
4533
4534        let mut parties = Vec::new();
4535        if author.is_some() || author_email.is_some() {
4536            parties.push(Party {
4537                r#type: Some("person".to_string()),
4538                role: Some("author".to_string()),
4539                name: author,
4540                email: author_email,
4541                url: None,
4542                organization: None,
4543                organization_url: None,
4544                timezone: None,
4545            });
4546        }
4547
4548        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4549            normalize_spdx_declared_license(license.as_deref());
4550        let extracted_license_statement = license.clone();
4551        let requires_dist = metadata
4552            .get("requires_dist")
4553            .and_then(|v| v.as_array())
4554            .map(|entries| {
4555                entries
4556                    .iter()
4557                    .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4558                    .collect::<Vec<_>>()
4559            })
4560            .unwrap_or_default();
4561        let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4562
4563        let purl = name.as_ref().and_then(|n| {
4564            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4565            if let Some(v) = &version {
4566                package_url.with_version(v).ok()?;
4567            }
4568            Some(package_url.to_string())
4569        });
4570
4571        if is_requested && has_direct_url {
4572            let mut extra_data = HashMap::new();
4573            if let Some(pv) = &pip_version {
4574                extra_data.insert(
4575                    "pip_version".to_string(),
4576                    serde_json::Value::String(pv.clone()),
4577                );
4578            }
4579            if let Some(iv) = &inspect_version {
4580                extra_data.insert(
4581                    "inspect_version".to_string(),
4582                    serde_json::Value::String(iv.clone()),
4583                );
4584            }
4585
4586            main_package = Some(PackageData {
4587                package_type: Some(PythonParser::PACKAGE_TYPE),
4588                namespace: None,
4589                name,
4590                version,
4591                qualifiers: None,
4592                subpath: None,
4593                primary_language: Some("Python".to_string()),
4594                description: description.or(summary),
4595                release_date: None,
4596                parties,
4597                keywords,
4598                homepage_url: home_page,
4599                download_url: None,
4600                size: None,
4601                sha1: None,
4602                md5: None,
4603                sha256: None,
4604                sha512: None,
4605                bug_tracking_url: None,
4606                code_view_url: None,
4607                vcs_url: None,
4608                copyright: None,
4609                holder: None,
4610                declared_license_expression,
4611                declared_license_expression_spdx,
4612                license_detections,
4613                other_license_expression: None,
4614                other_license_expression_spdx: None,
4615                other_license_detections: Vec::new(),
4616                extracted_license_statement,
4617                notice_text: None,
4618                source_packages: Vec::new(),
4619                file_references: Vec::new(),
4620                is_private: false,
4621                is_virtual: true,
4622                extra_data: if extra_data.is_empty() {
4623                    None
4624                } else {
4625                    Some(extra_data)
4626                },
4627                dependencies: parsed_dependencies,
4628                repository_homepage_url: None,
4629                repository_download_url: None,
4630                api_data_url: None,
4631                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4632                purl,
4633            });
4634        } else {
4635            let resolved_package = PackageData {
4636                package_type: Some(PythonParser::PACKAGE_TYPE),
4637                namespace: None,
4638                name: name.clone(),
4639                version: version.clone(),
4640                qualifiers: None,
4641                subpath: None,
4642                primary_language: Some("Python".to_string()),
4643                description: description.or(summary),
4644                release_date: None,
4645                parties,
4646                keywords,
4647                homepage_url: home_page,
4648                download_url: None,
4649                size: None,
4650                sha1: None,
4651                md5: None,
4652                sha256: None,
4653                sha512: None,
4654                bug_tracking_url: None,
4655                code_view_url: None,
4656                vcs_url: None,
4657                copyright: None,
4658                holder: None,
4659                declared_license_expression,
4660                declared_license_expression_spdx,
4661                license_detections,
4662                other_license_expression: None,
4663                other_license_expression_spdx: None,
4664                other_license_detections: Vec::new(),
4665                extracted_license_statement,
4666                notice_text: None,
4667                source_packages: Vec::new(),
4668                file_references: Vec::new(),
4669                is_private: false,
4670                is_virtual: true,
4671                extra_data: None,
4672                dependencies: parsed_dependencies,
4673                repository_homepage_url: None,
4674                repository_download_url: None,
4675                api_data_url: None,
4676                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4677                purl: purl.clone(),
4678            };
4679
4680            let resolved = package_data_to_resolved(&resolved_package);
4681            dependencies.push(Dependency {
4682                purl,
4683                extracted_requirement: None,
4684                scope: None,
4685                is_runtime: Some(true),
4686                is_optional: Some(false),
4687                is_pinned: Some(true),
4688                is_direct: Some(is_requested),
4689                resolved_package: Some(Box::new(resolved)),
4690                extra_data: None,
4691            });
4692        }
4693    }
4694
4695    if let Some(mut main_pkg) = main_package {
4696        let direct_requirement_purls: HashSet<String> = main_pkg
4697            .dependencies
4698            .iter()
4699            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4700            .collect();
4701
4702        let resolved_requirement_purls: HashSet<String> = dependencies
4703            .iter()
4704            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4705            .collect();
4706
4707        let unresolved_dependencies = main_pkg
4708            .dependencies
4709            .iter()
4710            .filter(|dep| {
4711                dep.purl.as_ref().is_some_and(|purl| {
4712                    !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4713                })
4714            })
4715            .cloned()
4716            .collect::<Vec<_>>();
4717
4718        for dependency in &mut dependencies {
4719            if dependency
4720                .purl
4721                .as_ref()
4722                .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4723            {
4724                dependency.is_direct = Some(true);
4725            }
4726        }
4727
4728        main_pkg.dependencies = dependencies;
4729        main_pkg.dependencies.extend(unresolved_dependencies);
4730        main_pkg
4731    } else {
4732        default_package_data(path)
4733    }
4734}
4735
/// Returns `purl` truncated just before its first `@`, i.e. without the version part.
///
/// A purl that contains no `@` is returned unchanged.
fn base_dependency_purl(purl: &str) -> String {
    let base = match purl.find('@') {
        Some(at) => &purl[..at],
        None => purl,
    };
    base.to_owned()
}
4741
/// Parsed `setup.cfg` contents as produced by `parse_setup_cfg`: lowercased section name →
/// lowercased key → values in file order (the inline value, if any, followed by one entry
/// per non-empty continuation line).
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4743
4744fn extract_from_setup_cfg(path: &Path) -> PackageData {
4745    let content = match read_file_to_string(path) {
4746        Ok(content) => content,
4747        Err(e) => {
4748            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4749            return default_package_data(path);
4750        }
4751    };
4752
4753    let sections = parse_setup_cfg(&content);
4754    let name = get_ini_value(&sections, "metadata", "name");
4755    let version = get_ini_value(&sections, "metadata", "version");
4756    let description = get_ini_value(&sections, "metadata", "description");
4757    let author = get_ini_value(&sections, "metadata", "author");
4758    let author_email = get_ini_value(&sections, "metadata", "author_email");
4759    let maintainer = get_ini_value(&sections, "metadata", "maintainer");
4760    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
4761    let license = get_ini_value(&sections, "metadata", "license");
4762    let mut homepage_url = get_ini_value(&sections, "metadata", "url");
4763    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
4764    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
4765    let python_requires = get_ini_value(&sections, "options", "python_requires");
4766    let parsed_project_urls =
4767        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
4768    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4769    let mut extra_data = HashMap::new();
4770
4771    let mut parties = Vec::new();
4772    if author.is_some() || author_email.is_some() {
4773        parties.push(Party {
4774            r#type: Some("person".to_string()),
4775            role: Some("author".to_string()),
4776            name: author,
4777            email: author_email,
4778            url: None,
4779            organization: None,
4780            organization_url: None,
4781            timezone: None,
4782        });
4783    }
4784
4785    if maintainer.is_some() || maintainer_email.is_some() {
4786        parties.push(Party {
4787            r#type: Some("person".to_string()),
4788            role: Some("maintainer".to_string()),
4789            name: maintainer,
4790            email: maintainer_email,
4791            url: None,
4792            organization: None,
4793            organization_url: None,
4794            timezone: None,
4795        });
4796    }
4797
4798    let declared_license_expression = None;
4799    let declared_license_expression_spdx = None;
4800    let license_detections = Vec::new();
4801    let extracted_license_statement = license.clone();
4802
4803    let dependencies = extract_setup_cfg_dependencies(&sections);
4804
4805    if let Some(value) = python_requires {
4806        extra_data.insert(
4807            "python_requires".to_string(),
4808            serde_json::Value::String(value),
4809        );
4810    }
4811
4812    apply_project_url_mappings(
4813        &parsed_project_urls,
4814        &mut homepage_url,
4815        &mut bug_tracking_url,
4816        &mut code_view_url,
4817        &mut vcs_url,
4818        &mut extra_data,
4819    );
4820
4821    let extra_data = if extra_data.is_empty() {
4822        None
4823    } else {
4824        Some(extra_data)
4825    };
4826
4827    let purl = name.as_ref().and_then(|n| {
4828        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4829        if let Some(v) = &version {
4830            package_url.with_version(v).ok()?;
4831        }
4832        Some(package_url.to_string())
4833    });
4834
4835    PackageData {
4836        package_type: Some(PythonParser::PACKAGE_TYPE),
4837        namespace: None,
4838        name,
4839        version,
4840        qualifiers: None,
4841        subpath: None,
4842        primary_language: Some("Python".to_string()),
4843        description,
4844        release_date: None,
4845        parties,
4846        keywords,
4847        homepage_url,
4848        download_url: None,
4849        size: None,
4850        sha1: None,
4851        md5: None,
4852        sha256: None,
4853        sha512: None,
4854        bug_tracking_url,
4855        code_view_url,
4856        vcs_url,
4857        copyright: None,
4858        holder: None,
4859        declared_license_expression,
4860        declared_license_expression_spdx,
4861        license_detections,
4862        other_license_expression: None,
4863        other_license_expression_spdx: None,
4864        other_license_detections: Vec::new(),
4865        extracted_license_statement,
4866        notice_text: None,
4867        source_packages: Vec::new(),
4868        file_references: Vec::new(),
4869        is_private: has_private_classifier(&classifiers),
4870        is_virtual: false,
4871        extra_data,
4872        dependencies,
4873        repository_homepage_url: None,
4874        repository_download_url: None,
4875        api_data_url: None,
4876        datasource_id: Some(DatasourceId::PypiSetupCfg),
4877        purl,
4878    }
4879}
4880
/// Splits a comma-separated `keywords` value into trimmed, non-empty keywords.
///
/// Returns an empty vector when `value` is `None`.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let raw = value.unwrap_or_default();

    let mut keywords = Vec::new();
    for candidate in raw.split(',') {
        let keyword = candidate.trim();
        if !keyword.is_empty() {
            keywords.push(keyword.to_string());
        }
    }
    keywords
}
4893
/// Parses `label = url` entries of a `project_urls` value into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both sides are trimmed. Entries without `=`, or
/// with an empty label or URL, are dropped. Input order is preserved.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::with_capacity(entries.len());
    for entry in entries {
        let Some((label, url)) = entry.split_once('=') else {
            continue;
        };
        let (label, url) = (label.trim(), url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }
    pairs
}
4909
4910fn apply_project_url_mappings(
4911    parsed_urls: &[(String, String)],
4912    homepage_url: &mut Option<String>,
4913    bug_tracking_url: &mut Option<String>,
4914    code_view_url: &mut Option<String>,
4915    vcs_url: &mut Option<String>,
4916    extra_data: &mut HashMap<String, serde_json::Value>,
4917) {
4918    for (label, url) in parsed_urls {
4919        let label_lower = label.to_lowercase();
4920
4921        if bug_tracking_url.is_none()
4922            && matches!(
4923                label_lower.as_str(),
4924                "tracker"
4925                    | "bug reports"
4926                    | "bug tracker"
4927                    | "issues"
4928                    | "issue tracker"
4929                    | "github: issues"
4930            )
4931        {
4932            *bug_tracking_url = Some(url.clone());
4933        } else if code_view_url.is_none()
4934            && matches!(label_lower.as_str(), "source" | "source code" | "code")
4935        {
4936            *code_view_url = Some(url.clone());
4937        } else if vcs_url.is_none()
4938            && matches!(
4939                label_lower.as_str(),
4940                "github" | "gitlab" | "github: repo" | "repository"
4941            )
4942        {
4943            *vcs_url = Some(url.clone());
4944        } else if homepage_url.is_none()
4945            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4946        {
4947            *homepage_url = Some(url.clone());
4948        } else if label_lower == "changelog" {
4949            extra_data.insert(
4950                "changelog_url".to_string(),
4951                serde_json::Value::String(url.clone()),
4952            );
4953        }
4954    }
4955
4956    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4957        .iter()
4958        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4959        .collect();
4960
4961    if !project_urls_json.is_empty() {
4962        extra_data.insert(
4963            "project_urls".to_string(),
4964            serde_json::Value::Object(project_urls_json),
4965        );
4966    }
4967}
4968
4969fn parse_setup_cfg(content: &str) -> IniSections {
4970    let mut sections: IniSections = HashMap::new();
4971    let mut current_section: Option<String> = None;
4972    let mut current_key: Option<String> = None;
4973
4974    for raw_line in content.lines() {
4975        let line = raw_line.trim_end_matches('\r');
4976        let trimmed = line.trim();
4977        if trimmed.is_empty() {
4978            continue;
4979        }
4980
4981        let stripped = line.trim_start();
4982        if stripped.starts_with('#') || stripped.starts_with(';') {
4983            continue;
4984        }
4985
4986        if stripped.starts_with('[') && stripped.ends_with(']') {
4987            let section_name = stripped
4988                .trim_start_matches('[')
4989                .trim_end_matches(']')
4990                .trim()
4991                .to_ascii_lowercase();
4992            current_section = if section_name.is_empty() {
4993                None
4994            } else {
4995                Some(section_name)
4996            };
4997            current_key = None;
4998            continue;
4999        }
5000
5001        if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
5002            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
5003                let value = stripped.trim();
5004                if !value.is_empty() {
5005                    sections
5006                        .entry(section.clone())
5007                        .or_default()
5008                        .entry(key.clone())
5009                        .or_default()
5010                        .push(value.to_string());
5011                }
5012            }
5013            continue;
5014        }
5015
5016        if let Some((key, value)) = stripped.split_once('=')
5017            && let Some(section) = current_section.as_ref()
5018        {
5019            let key_name = key.trim().to_ascii_lowercase();
5020            let value_trimmed = value.trim();
5021            let entry = sections
5022                .entry(section.clone())
5023                .or_default()
5024                .entry(key_name.clone())
5025                .or_default();
5026            if !value_trimmed.is_empty() {
5027                entry.push(value_trimmed.to_string());
5028            }
5029            current_key = Some(key_name);
5030        }
5031    }
5032
5033    sections
5034}
5035
5036fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
5037    sections
5038        .get(&section.to_ascii_lowercase())
5039        .and_then(|values| values.get(&key.to_ascii_lowercase()))
5040        .and_then(|entries| entries.first())
5041        .map(|value| value.trim().to_string())
5042}
5043
5044fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
5045    sections
5046        .get(&section.to_ascii_lowercase())
5047        .and_then(|values| values.get(&key.to_ascii_lowercase()))
5048        .cloned()
5049        .unwrap_or_default()
5050}
5051
5052fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
5053    let mut dependencies = Vec::new();
5054
5055    for (sub_section, scope) in [
5056        ("install_requires", "install"),
5057        ("tests_require", "test"),
5058        ("setup_requires", "setup"),
5059    ] {
5060        let reqs = get_ini_values(sections, "options", sub_section);
5061        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
5062    }
5063
5064    if let Some(extras) = sections.get("options.extras_require") {
5065        let mut extra_items: Vec<_> = extras.iter().collect();
5066        extra_items.sort_by_key(|(name, _)| *name);
5067        for (extra_name, reqs) in extra_items {
5068            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
5069        }
5070    }
5071
5072    dependencies
5073}
5074
5075fn parse_setup_cfg_requirements(
5076    reqs: &[String],
5077    scope: &str,
5078    is_optional: bool,
5079) -> Vec<Dependency> {
5080    reqs.iter()
5081        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
5082        .collect()
5083}
5084
5085fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
5086    let trimmed = req.trim();
5087    if trimmed.is_empty() || trimmed.starts_with('#') {
5088        return None;
5089    }
5090
5091    let name = extract_setup_cfg_dependency_name(trimmed)?;
5092    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5093
5094    Some(Dependency {
5095        purl: Some(purl.to_string()),
5096        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
5097        scope: Some(scope.to_string()),
5098        is_runtime: Some(true),
5099        is_optional: Some(is_optional),
5100        is_pinned: Some(false),
5101        is_direct: Some(true),
5102        resolved_package: None,
5103        extra_data: None,
5104    })
5105}
5106
/// Extracts the package name from the start of a requirement string.
///
/// The name is the prefix of the trimmed input up to the first whitespace character or the
/// first of `<`, `>`, `=`, `!`, `~`, `;`, `[`, `(`, `@`. `(` and `@` are included so that a
/// parenthesised version specifier (`requests(>=2.0)`) or a direct reference
/// (`pip@https://...`) written without whitespace does not leak into the name.
///
/// Returns `None` when the input is blank or starts with one of the terminators.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();

    let end = trimmed
        .find(|c: char| {
            c.is_whitespace()
                || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    // `trimmed` has no leading whitespace and is cut at the first whitespace character, so
    // the prefix needs no further trimming.
    let name = &trimmed[..end];
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
5123
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    req.split_whitespace().collect()
}
5127
/// Finds a quoted string assigned to `key` in `setup.py` source text by plain text search.
///
/// Double-quoted assignments are tried before single-quoted ones; within each quote style
/// the spacings `key=`, `key =`, `key= ` and `key = ` are tried in that order. The first
/// pattern found anywhere in `content` whose value has a closing quote decides the result.
///
/// The value ends at the next occurrence of the *same* quote character that opened it, so
/// an apostrophe inside a double-quoted value (or a double quote inside a single-quoted
/// one) does not truncate it. Backslash-escaped quotes are not interpreted.
///
/// Returns `None` when no pattern matches or no matching closing quote follows.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    // (text between key and `=`, text between `=` and the opening quote)
    const SPACINGS: [(&str, &str); 4] = [("", ""), (" ", ""), ("", " "), (" ", " ")];

    for quote in ['"', '\''] {
        for (before, after) in SPACINGS {
            let pattern = format!("{}{}={}{}", key, before, after, quote);
            let Some(start_idx) = content.find(&pattern) else {
                continue;
            };

            let remaining = &content[start_idx + pattern.len()..];
            if let Some(end_idx) = remaining.find(quote) {
                return Some(remaining[..end_idx].to_string());
            }
        }
    }

    None
}
5153
5154fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
5155    let mut dependencies = Vec::new();
5156
5157    if let Some(tests_deps) = extract_tests_require(content) {
5158        dependencies.extend(tests_deps);
5159    }
5160
5161    if let Some(extras_deps) = extract_extras_require(content) {
5162        dependencies.extend(extras_deps);
5163    }
5164
5165    dependencies
5166}
5167
5168fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
5169    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
5170    let re = Regex::new(pattern).ok()?;
5171    let captures = re.captures(content)?;
5172    let deps_str = captures.get(1)?.as_str();
5173
5174    let deps = parse_setup_py_dep_list(deps_str, "test", true);
5175    if deps.is_empty() { None } else { Some(deps) }
5176}
5177
5178fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
5179    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
5180    let re = Regex::new(pattern).ok()?;
5181    let captures = re.captures(content)?;
5182    let dict_content = captures.get(1)?.as_str();
5183
5184    let mut all_deps = Vec::new();
5185
5186    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
5187    let entry_re = Regex::new(entry_pattern).ok()?;
5188
5189    for entry_cap in entry_re.captures_iter(dict_content) {
5190        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
5191            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
5192            all_deps.extend(deps);
5193        }
5194    }
5195
5196    if all_deps.is_empty() {
5197        None
5198    } else {
5199        Some(all_deps)
5200    }
5201}
5202
5203fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
5204    let dep_pattern = r#"['"]([^'"]+)['"]"#;
5205    let re = match Regex::new(dep_pattern) {
5206        Ok(r) => r,
5207        Err(_) => return Vec::new(),
5208    };
5209
5210    re.captures_iter(deps_str)
5211        .filter_map(|cap| {
5212            let dep_str = cap.get(1)?.as_str().trim();
5213            if dep_str.is_empty() {
5214                return None;
5215            }
5216
5217            let name = extract_setup_cfg_dependency_name(dep_str)?;
5218            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5219
5220            Some(Dependency {
5221                purl: Some(purl.to_string()),
5222                extracted_requirement: Some(dep_str.to_string()),
5223                scope: Some(scope.to_string()),
5224                is_runtime: Some(true),
5225                is_optional: Some(is_optional),
5226                is_pinned: Some(false),
5227                is_direct: Some(true),
5228                resolved_package: None,
5229                extra_data: None,
5230            })
5231        })
5232        .collect()
5233}
5234
5235/// Reads and parses a TOML file
5236pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
5237    let content = read_file_to_string(path).map_err(|e| e.to_string())?;
5238    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
5239}
5240
5241/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
5242///
5243/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
5244/// Essential for SBOM compliance and package integrity verification.
5245///
5246/// # Returns
5247///
5248/// - `(Some(size), Some(hash))` on success
5249/// - `(None, None)` if file cannot be opened
5250/// - `(Some(size), None)` if hash calculation fails during read
5251fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<Sha256Digest>) {
5252    let mut file = match File::open(path) {
5253        Ok(f) => f,
5254        Err(_) => return (None, None),
5255    };
5256
5257    let metadata = match file.metadata() {
5258        Ok(m) => m,
5259        Err(_) => return (None, None),
5260    };
5261    let size = metadata.len();
5262
5263    let mut hasher = Sha256::new();
5264    let mut buffer = vec![0; 8192];
5265
5266    loop {
5267        match file.read(&mut buffer) {
5268            Ok(0) => break,
5269            Ok(n) => hasher.update(&buffer[..n]),
5270            Err(_) => return (Some(size), None),
5271        }
5272    }
5273
5274    let hash = Sha256Digest::from_bytes(hasher.finalize().into());
5275    (Some(size), Some(hash))
5276}
5277
5278fn default_package_data(path: &Path) -> PackageData {
5279    PackageData {
5280        package_type: Some(PythonParser::PACKAGE_TYPE),
5281        primary_language: Some("Python".to_string()),
5282        datasource_id: infer_python_datasource_id(path),
5283        ..Default::default()
5284    }
5285}
5286
5287fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
5288    let file_name = path.file_name().and_then(|name| name.to_str());
5289
5290    match file_name {
5291        Some("pyproject.toml") => {
5292            if read_toml_file(path)
5293                .ok()
5294                .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
5295                .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
5296                .is_some()
5297            {
5298                Some(DatasourceId::PypiPoetryPyprojectToml)
5299            } else {
5300                Some(DatasourceId::PypiPyprojectToml)
5301            }
5302        }
5303        Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
5304            Some(DatasourceId::PypiSetupPy)
5305        }
5306        Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
5307        Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
5308        Some("METADATA") if is_installed_wheel_metadata_path(path) => {
5309            Some(DatasourceId::PypiWheelMetadata)
5310        }
5311        Some("pypi.json") => Some(DatasourceId::PypiJson),
5312        Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
5313        Some("origin.json") if is_pip_cache_origin_json(path) => {
5314            Some(DatasourceId::PypiPipOriginJson)
5315        }
5316        _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
5317            Some(DatasourceId::PypiSdist)
5318        }
5319        _ if path
5320            .extension()
5321            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
5322        {
5323            Some(DatasourceId::PypiWheel)
5324        }
5325        _ if path
5326            .extension()
5327            .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
5328        {
5329            Some(DatasourceId::PypiEgg)
5330        }
5331        _ => None,
5332    }
5333}
5334
// Registers the Python parser's metadata through the crate-level
// `register_parser!` macro. The arguments are, in order: a human-readable
// description of the handled inputs, the glob patterns of handled paths, and
// then "pypi", "Python" and a documentation URL (presumably the package type,
// primary language and reference link; confirm against the macro definition).
//
// NOTE(review): the archive globs (`*.tar.gz`, `*.tgz`, `*.tar.bz2`, `*.tar.xz`,
// `*.zip`) are not Python-specific. `infer_python_datasource_id` only reports
// an sdist when `is_likely_python_sdist_filename` accepts the file name.
// NOTE(review): `infer_python_datasource_id` also recognises
// `pip-inspect.deplock`, which is not listed here; presumably it is registered
// by a separate parser. Verify.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);