Skip to main content

provenant/parsers/
python.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34use crate::models::{
35    DatasourceId, Dependency, FileReference, PackageData, PackageType, Party, Sha256Digest,
36};
37use crate::parser_warn as warn;
38use crate::parsers::utils::{read_file_to_string, split_name_email};
39use base64::Engine;
40use base64::engine::general_purpose::URL_SAFE_NO_PAD;
41use bzip2::read::BzDecoder;
42use csv::ReaderBuilder;
43use flate2::read::GzDecoder;
44use liblzma::read::XzDecoder;
45use packageurl::PackageUrl;
46use regex::Regex;
47use ruff_python_ast as ast;
48use ruff_python_parser::parse_module;
49use serde_json::{Map as JsonMap, Value as JsonValue};
50use sha2::{Digest, Sha256};
51use std::collections::{HashMap, HashSet};
52use std::fs::File;
53use std::io::Read;
54use std::path::{Component, Path, PathBuf};
55use tar::Archive;
56use toml::Value as TomlValue;
57use toml::map::Map as TomlMap;
58use zip::ZipArchive;
59
60use super::PackageParser;
61use super::license_normalization::{
62    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
63    normalize_spdx_expression,
64};
65use super::pep508::parse_pep508_requirement;
66
// Key names looked up in pyproject.toml tables.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

// Five optional URL strings carried together as one tuple.
// NOTE(review): the meaning of each slot is defined by the code that builds
// the tuple, which is not in this part of the file — confirm there.
type ProjectUrls = (
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
);

// Bounds applied while analysing setup.py sources.
const MAX_SETUP_PY_BYTES: usize = 1024 * 1024; // 1 MiB
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

// Archive safety limits.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024; // 100MB: on-disk sdist size and total extracted bytes
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB per archive entry
const MAX_COMPRESSION_RATIO: f64 = 100.0; // 100:1 uncompressed-to-compressed
97
/// Parser for Python package manifests, installed metadata and archives.
///
/// Handles pyproject.toml, setup.py, setup.cfg, PKG-INFO, METADATA,
/// pip-inspect lockfiles, and .whl/.egg archives, among others.
///
/// # Security
///
/// setup.py files are analysed through their AST and are never executed, so
/// scanning cannot trigger arbitrary code. See `extract_from_setup_py_ast`
/// for details.
pub struct PythonParser;
108
/// Archive flavours recognised for Python source distributions, keyed by
/// the file-name suffix that selects them.
#[derive(Debug, Clone, Copy)]
enum PythonSdistArchiveFormat {
    /// `.tar.gz`
    TarGz,
    /// `.tgz`
    Tgz,
    /// `.tar.bz2`
    TarBz2,
    /// `.tar.xz`
    TarXz,
    /// `.zip`
    Zip,
}
117
/// A zip entry that passed the archive safety checks.
#[derive(Debug, Clone)]
struct ValidatedZipEntry {
    /// Position of the entry inside the zip archive.
    index: usize,
    /// Normalized entry path.
    name: String,
}
123
124impl PackageParser for PythonParser {
125    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
126
127    fn extract_packages(path: &Path) -> Vec<PackageData> {
128        vec![
129            if path.file_name().unwrap_or_default() == "pyproject.toml" {
130                extract_from_pyproject_toml(path)
131            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
132                extract_from_setup_cfg(path)
133            } else if is_setup_py_like_path(path) {
134                return extract_setup_py_packages(path);
135            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
136                extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
137            } else if is_installed_wheel_metadata_path(path) {
138                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
139            } else if is_pip_cache_origin_json(path) {
140                extract_from_pip_origin_json(path)
141            } else if path.file_name().unwrap_or_default() == "pypi.json" {
142                extract_from_pypi_json(path)
143            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
144                extract_from_pip_inspect(path)
145            } else if is_python_sdist_archive_path(path) {
146                extract_from_sdist_archive(path)
147            } else if path
148                .extension()
149                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
150            {
151                extract_from_wheel_archive(path)
152            } else if path
153                .extension()
154                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
155            {
156                extract_from_egg_archive(path)
157            } else {
158                default_package_data(path)
159            },
160        ]
161    }
162
163    fn is_match(path: &Path) -> bool {
164        if let Some(filename) = path.file_name()
165            && (filename == "pyproject.toml"
166                || filename == "setup.cfg"
167                || is_setup_py_like_path(path)
168                || filename == "PKG-INFO"
169                || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
170                || filename == "pypi.json"
171                || filename == "pip-inspect.deplock"
172                || is_pip_cache_origin_json(path))
173        {
174            return true;
175        }
176
177        if let Some(extension) = path.extension() {
178            let ext = extension.to_string_lossy().to_lowercase();
179            if (ext == "whl" && is_valid_wheel_archive_path(path))
180                || ext == "egg"
181                || is_python_sdist_archive_path(path)
182            {
183                return true;
184            }
185        }
186
187        false
188    }
189}
190
/// Returns `true` when the file name is `setup.py` or ends with `_setup.py`.
///
/// Paths without a file name, or with a non-UTF-8 one, never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    match path.file_name().and_then(|raw| raw.to_str()) {
        Some(file_name) => file_name.ends_with("_setup.py") || file_name == "setup.py",
        None => false,
    }
}
196
/// Returns `true` for a file named `METADATA` whose immediate parent
/// directory name ends with `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    let named_metadata = path.file_name().and_then(|raw| raw.to_str()) == Some("METADATA");
    if !named_metadata {
        return false;
    }

    match path
        .parent()
        .and_then(|dir| dir.file_name())
        .and_then(|raw| raw.to_str())
    {
        Some(dir_name) => dir_name.ends_with(".dist-info"),
        None => false,
    }
}
205
/// Values read from the `WHEEL` file that sits next to an installed
/// `METADATA` file.
#[derive(Clone, Debug)]
struct InstalledWheelMetadata {
    /// Every `Tag` header value, in file order.
    wheel_tags: Vec<String>,
    /// First `Wheel-Version` header value, if present.
    wheel_version: Option<String>,
    /// First `Generator` header value, if present.
    wheel_generator: Option<String>,
    /// `Root-Is-Purelib` parsed as a boolean; `None` when absent or not
    /// `true`/`false`.
    root_is_purelib: Option<bool>,
    /// `wheel_tags` folded into one tag by `compress_wheel_tags`, when that
    /// is possible.
    compressed_tag: Option<String>,
}
214
215fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
216    let Some(parent) = path.parent() else {
217        return;
218    };
219
220    if !parent
221        .file_name()
222        .and_then(|name| name.to_str())
223        .is_some_and(|name| name.ends_with(".dist-info"))
224    {
225        return;
226    }
227
228    let wheel_path = parent.join("WHEEL");
229    if !wheel_path.exists() {
230        return;
231    }
232
233    let Ok(content) = read_file_to_string(&wheel_path) else {
234        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
235        return;
236    };
237
238    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
239        return;
240    };
241
242    apply_installed_wheel_metadata(package_data, &wheel_metadata);
243}
244
245fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
246    use super::rfc822::{get_header_all, get_header_first};
247
248    let metadata = super::rfc822::parse_rfc822_content(content);
249    let wheel_tags = get_header_all(&metadata.headers, "tag");
250    if wheel_tags.is_empty() {
251        return None;
252    }
253
254    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
255    let wheel_generator = get_header_first(&metadata.headers, "generator");
256    let root_is_purelib =
257        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
258            match value.to_ascii_lowercase().as_str() {
259                "true" => Some(true),
260                "false" => Some(false),
261                _ => None,
262            }
263        });
264
265    let compressed_tag = compress_wheel_tags(&wheel_tags);
266
267    Some(InstalledWheelMetadata {
268        wheel_tags,
269        wheel_version,
270        wheel_generator,
271        root_is_purelib,
272        compressed_tag,
273    })
274}
275
/// Folds several `python-abi-platform` wheel tags into one compressed tag.
///
/// * No tags: `None`.
/// * One tag: returned unchanged, without validation.
/// * Several tags: they must all split into three `-` separated parts and
///   share the same ABI and platform parts; the python parts are joined with
///   `.` (e.g. `py2.py3-none-any`). Anything else yields `None`.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    /// Splits a tag into (python, abi, platform); the platform keeps any
    /// further dashes.
    fn split_tag(tag: &str) -> Option<(&str, &str, &str)> {
        let mut pieces = tag.splitn(3, '-');
        Some((pieces.next()?, pieces.next()?, pieces.next()?))
    }

    match tags {
        [] => None,
        [only] => Some(only.clone()),
        [first, rest @ ..] => {
            let (first_python, shared_abi, shared_platform) = split_tag(first)?;
            let mut pythons = vec![first_python];

            for tag in rest {
                let (python, abi, platform) = split_tag(tag)?;
                if abi != shared_abi || platform != shared_platform {
                    return None;
                }
                pythons.push(python);
            }

            Some(format!(
                "{}-{}-{}",
                pythons.join("."),
                shared_abi,
                shared_platform
            ))
        }
    }
}
313
314fn apply_installed_wheel_metadata(
315    package_data: &mut PackageData,
316    wheel_metadata: &InstalledWheelMetadata,
317) {
318    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
319    extra_data.insert(
320        "wheel_tags".to_string(),
321        JsonValue::Array(
322            wheel_metadata
323                .wheel_tags
324                .iter()
325                .cloned()
326                .map(JsonValue::String)
327                .collect(),
328        ),
329    );
330
331    if let Some(wheel_version) = &wheel_metadata.wheel_version {
332        extra_data.insert(
333            "wheel_version".to_string(),
334            JsonValue::String(wheel_version.clone()),
335        );
336    }
337
338    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
339        extra_data.insert(
340            "wheel_generator".to_string(),
341            JsonValue::String(wheel_generator.clone()),
342        );
343    }
344
345    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
346        extra_data.insert(
347            "root_is_purelib".to_string(),
348            JsonValue::Bool(root_is_purelib),
349        );
350    }
351
352    if let (Some(name), Some(version), Some(extension)) = (
353        package_data.name.as_deref(),
354        package_data.version.as_deref(),
355        wheel_metadata.compressed_tag.as_deref(),
356    ) {
357        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
358    }
359}
360
/// Returns `true` for a file named `origin.json` that has a directory called
/// `wheels` (ASCII case-insensitive) somewhere among its ancestors.
fn is_pip_cache_origin_json(path: &Path) -> bool {
    if path.file_name().and_then(|raw| raw.to_str()) != Some("origin.json") {
        return false;
    }

    // skip(1): the first ancestor is the path itself.
    for dir in path.ancestors().skip(1) {
        let Some(dir_name) = dir.file_name().and_then(|raw| raw.to_str()) else {
            continue;
        };
        if dir_name.eq_ignore_ascii_case("wheels") {
            return true;
        }
    }

    false
}
370
371fn extract_from_pip_origin_json(path: &Path) -> PackageData {
372    let content = match read_file_to_string(path) {
373        Ok(content) => content,
374        Err(e) => {
375            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
376            return default_package_data(path);
377        }
378    };
379
380    let root: JsonValue = match serde_json::from_str(&content) {
381        Ok(root) => root,
382        Err(e) => {
383            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
384            return default_package_data(path);
385        }
386    };
387
388    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
389        warn!("No url found in pip cache origin.json at {:?}", path);
390        return default_package_data(path);
391    };
392
393    let sibling_wheel = find_sibling_cached_wheel(path);
394    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
395        sibling_wheel
396            .as_ref()
397            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
398    });
399
400    let Some((name, version)) = name_version else {
401        warn!(
402            "Failed to infer package name/version from pip cache origin.json at {:?}",
403            path
404        );
405        return default_package_data(path);
406    };
407
408    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
409        build_pypi_urls(Some(&name), Some(&version));
410    let purl = sibling_wheel
411        .as_ref()
412        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
413        .or(plain_purl);
414
415    PackageData {
416        package_type: Some(PythonParser::PACKAGE_TYPE),
417        primary_language: Some("Python".to_string()),
418        name: Some(name),
419        version: Some(version),
420        datasource_id: Some(DatasourceId::PypiPipOriginJson),
421        download_url: Some(download_url.to_string()),
422        sha256: extract_sha256_from_origin_json(&root)
423            .and_then(|h| Sha256Digest::from_hex(&h).ok()),
424        repository_homepage_url,
425        repository_download_url,
426        api_data_url,
427        purl,
428        ..Default::default()
429    }
430}
431
432fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
433    let parent = path.parent()?;
434    let entries = parent.read_dir().ok()?;
435
436    for entry in entries.flatten() {
437        let sibling_path = entry.path();
438        if sibling_path
439            .extension()
440            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
441            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
442        {
443            return Some(wheel_info);
444        }
445    }
446
447    None
448}
449
450fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
451    let file_name = url.rsplit('/').next()?;
452
453    if file_name.ends_with(".whl") {
454        return parse_wheel_filename(Path::new(file_name))
455            .map(|wheel_info| (wheel_info.name, wheel_info.version));
456    }
457
458    let stem = strip_python_archive_extension(file_name)?;
459    let (name, version) = stem.rsplit_once('-')?;
460    if name.is_empty() || version.is_empty() {
461        return None;
462    }
463
464    Some((name.replace('_', "-"), version.to_string()))
465}
466
/// Removes the first matching Python archive suffix from `file_name`.
///
/// Suffixes are tried in order and matched case-sensitively; `None` means
/// the name carries none of them.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] =
        [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES {
        if let Some(stem) = file_name.strip_suffix(suffix) {
            return Some(stem);
        }
    }

    None
}
472
473fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
474    root.pointer("/archive_info/hashes/sha256")
475        .and_then(|value| value.as_str())
476        .map(ToOwned::to_owned)
477        .or_else(|| {
478            root.pointer("/archive_info/hash")
479                .and_then(|value| value.as_str())
480                .and_then(normalize_origin_hash)
481        })
482}
483
/// Extracts the digest part of a hash string.
///
/// Accepts `sha256=<digest>` and `sha256:<digest>` (the digest is returned
/// without validation) as well as a bare 64-character hexadecimal string.
/// Any other input yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed_digest = ["sha256=", "sha256:"]
        .iter()
        .find_map(|prefix| hash.strip_prefix(*prefix));

    match prefixed_digest {
        Some(digest) => Some(digest.to_string()),
        None if hash.len() == 64 && hash.bytes().all(|byte| byte.is_ascii_hexdigit()) => {
            Some(hash.to_string())
        }
        None => None,
    }
}
496
497fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
498    let content = match read_file_to_string(path) {
499        Ok(content) => content,
500        Err(e) => {
501            warn!("Failed to read metadata at {:?}: {}", path, e);
502            return default_package_data(path);
503        }
504    };
505
506    let metadata = super::rfc822::parse_rfc822_content(&content);
507    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
508    merge_sibling_metadata_dependencies(path, &mut package_data);
509    merge_sibling_metadata_file_references(path, &mut package_data);
510    if datasource_id == DatasourceId::PypiWheelMetadata {
511        merge_sibling_wheel_metadata(path, &mut package_data);
512    }
513    package_data
514}
515
516fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
517    let mut extra_dependencies = Vec::new();
518
519    if let Some(parent) = path.parent() {
520        let direct_requires = parent.join("requires.txt");
521        if direct_requires.exists()
522            && let Ok(content) = read_file_to_string(&direct_requires)
523        {
524            extra_dependencies.extend(parse_requires_txt(&content));
525        }
526
527        let sibling_egg_info_requires = parent
528            .read_dir()
529            .ok()
530            .into_iter()
531            .flatten()
532            .flatten()
533            .find_map(|entry| {
534                let child_path = entry.path();
535                if child_path.is_dir()
536                    && child_path
537                        .file_name()
538                        .and_then(|name| name.to_str())
539                        .is_some_and(|name| name.ends_with(".egg-info"))
540                {
541                    let requires = child_path.join("requires.txt");
542                    requires.exists().then_some(requires)
543                } else {
544                    None
545                }
546            });
547
548        if let Some(requires_path) = sibling_egg_info_requires
549            && let Ok(content) = read_file_to_string(&requires_path)
550        {
551            extra_dependencies.extend(parse_requires_txt(&content));
552        }
553    }
554
555    for dependency in extra_dependencies {
556        if !package_data.dependencies.iter().any(|existing| {
557            existing.purl == dependency.purl
558                && existing.scope == dependency.scope
559                && existing.extracted_requirement == dependency.extracted_requirement
560                && existing.extra_data == dependency.extra_data
561        }) {
562            package_data.dependencies.push(dependency);
563        }
564    }
565}
566
567fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
568    let mut extra_refs = Vec::new();
569
570    if let Some(parent) = path.parent() {
571        let record_path = parent.join("RECORD");
572        if record_path.exists()
573            && let Ok(content) = read_file_to_string(&record_path)
574        {
575            extra_refs.extend(parse_record_csv(&content));
576        }
577
578        let installed_files_path = parent.join("installed-files.txt");
579        if installed_files_path.exists()
580            && let Ok(content) = read_file_to_string(&installed_files_path)
581        {
582            extra_refs.extend(parse_installed_files_txt(&content));
583        }
584
585        let sources_path = parent.join("SOURCES.txt");
586        if sources_path.exists()
587            && let Ok(content) = read_file_to_string(&sources_path)
588        {
589            extra_refs.extend(parse_sources_txt(&content));
590        }
591    }
592
593    for file_ref in extra_refs {
594        if !package_data
595            .file_references
596            .iter()
597            .any(|existing| existing.path == file_ref.path)
598        {
599            package_data.file_references.push(file_ref);
600        }
601    }
602}
603
604fn collect_validated_zip_entries<R: Read + std::io::Seek>(
605    archive: &mut ZipArchive<R>,
606    path: &Path,
607    archive_type: &str,
608) -> Result<Vec<ValidatedZipEntry>, String> {
609    let mut total_extracted = 0u64;
610    let mut entries = Vec::new();
611
612    for i in 0..archive.len() {
613        if let Ok(file) = archive.by_index_raw(i) {
614            let compressed_size = file.compressed_size();
615            let uncompressed_size = file.size();
616            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
617                warn!(
618                    "Skipping unsafe path in {} {:?}: {}",
619                    archive_type,
620                    path,
621                    file.name()
622                );
623                continue;
624            };
625
626            if compressed_size > 0 {
627                let ratio = uncompressed_size as f64 / compressed_size as f64;
628                if ratio > MAX_COMPRESSION_RATIO {
629                    warn!(
630                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
631                        archive_type, path, ratio
632                    );
633                    continue;
634                }
635            }
636
637            if uncompressed_size > MAX_FILE_SIZE {
638                warn!(
639                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
640                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
641                );
642                continue;
643            }
644
645            total_extracted += uncompressed_size;
646            if total_extracted > MAX_ARCHIVE_SIZE {
647                let msg = format!(
648                    "Total extracted size exceeds limit for {} {:?}",
649                    archive_type, path
650                );
651                warn!("{}", msg);
652                return Err(msg);
653            }
654
655            entries.push(ValidatedZipEntry {
656                index: i,
657                name: entry_name,
658            });
659        }
660    }
661
662    Ok(entries)
663}
664
665fn is_python_sdist_archive_path(path: &Path) -> bool {
666    detect_python_sdist_archive_format(path).is_some()
667}
668
669fn is_valid_wheel_archive_path(path: &Path) -> bool {
670    if !path.is_file() {
671        return true;
672    }
673
674    let file = match File::open(path) {
675        Ok(file) => file,
676        Err(_) => return false,
677    };
678    let mut archive = match ZipArchive::new(file) {
679        Ok(archive) => archive,
680        Err(_) => return false,
681    };
682
683    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
684        Ok(entries) => entries,
685        Err(_) => return false,
686    };
687
688    find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA").is_some()
689}
690
691fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
692    let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
693
694    if !is_likely_python_sdist_filename(&file_name) {
695        return None;
696    }
697
698    if file_name.ends_with(".tar.gz") {
699        tar_gz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarGz)
700    } else if file_name.ends_with(".tgz") {
701        tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
702    } else if file_name.ends_with(".tar.bz2") {
703        tar_bz2_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarBz2)
704    } else if file_name.ends_with(".tar.xz") {
705        tar_xz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarXz)
706    } else if file_name.ends_with(".zip") {
707        zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
708    } else {
709        None
710    }
711}
712
713fn tar_gz_sdist_contains_pkg_info(path: &Path) -> bool {
714    let Some(compressed_size) = compressed_archive_size(path) else {
715        return false;
716    };
717    let file = match File::open(path) {
718        Ok(file) => file,
719        Err(_) => return false,
720    };
721    let decoder = GzDecoder::new(file);
722    tar_sdist_contains_pkg_info(path, decoder, "tar.gz", compressed_size)
723}
724
725fn tar_bz2_sdist_contains_pkg_info(path: &Path) -> bool {
726    let Some(compressed_size) = compressed_archive_size(path) else {
727        return false;
728    };
729    let file = match File::open(path) {
730        Ok(file) => file,
731        Err(_) => return false,
732    };
733    let decoder = BzDecoder::new(file);
734    tar_sdist_contains_pkg_info(path, decoder, "tar.bz2", compressed_size)
735}
736
737fn tar_xz_sdist_contains_pkg_info(path: &Path) -> bool {
738    let Some(compressed_size) = compressed_archive_size(path) else {
739        return false;
740    };
741    let file = match File::open(path) {
742        Ok(file) => file,
743        Err(_) => return false,
744    };
745    let decoder = XzDecoder::new(file);
746    tar_sdist_contains_pkg_info(path, decoder, "tar.xz", compressed_size)
747}
748
/// Returns the on-disk size of `path` in bytes, or `None` when its file
/// system metadata cannot be read.
fn compressed_archive_size(path: &Path) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(info) => Some(info.len()),
        Err(_) => None,
    }
}
752
753fn tar_sdist_contains_pkg_info<R: Read>(
754    path: &Path,
755    reader: R,
756    archive_type: &str,
757    compressed_size: u64,
758) -> bool {
759    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
760    else {
761        return false;
762    };
763
764    select_sdist_pkginfo_entry(path, &entries).is_some()
765}
766
767fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
768    if !path.is_file() {
769        return true;
770    }
771
772    let Some(compressed_size) = compressed_archive_size(path) else {
773        return false;
774    };
775    let file = match File::open(path) {
776        Ok(file) => file,
777        Err(_) => return false,
778    };
779    let decoder = GzDecoder::new(file);
780    tar_sdist_contains_pkg_info(path, decoder, "tgz", compressed_size)
781}
782
783fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
784    if !path.is_file() {
785        return true;
786    }
787
788    let file = match File::open(path) {
789        Ok(file) => file,
790        Err(_) => return false,
791    };
792    let mut archive = match ZipArchive::new(file) {
793        Ok(archive) => archive,
794        Err(_) => return false,
795    };
796
797    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
798        Ok(entries) => entries,
799        Err(_) => return false,
800    };
801    let metadata_entries: Vec<_> = validated_entries
802        .iter()
803        .filter(|entry| entry.name.ends_with("/PKG-INFO"))
804        .filter_map(|entry| {
805            read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
806                .ok()
807                .map(|content| (entry.name.clone(), content))
808        })
809        .collect();
810
811    has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
812}
813
814fn is_likely_python_sdist_filename(file_name: &str) -> bool {
815    let Some(stem) = strip_python_archive_extension(file_name) else {
816        return false;
817    };
818
819    let Some((name, version)) = stem.rsplit_once('-') else {
820        return false;
821    };
822
823    !name.is_empty()
824        && !version.is_empty()
825        && version.chars().any(|ch| ch.is_ascii_digit())
826        && name
827            .chars()
828            .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
829}
830
831fn extract_from_sdist_archive(path: &Path) -> PackageData {
832    let metadata = match std::fs::metadata(path) {
833        Ok(m) => m,
834        Err(e) => {
835            warn!(
836                "Failed to read metadata for sdist archive {:?}: {}",
837                path, e
838            );
839            return default_package_data(path);
840        }
841    };
842
843    if metadata.len() > MAX_ARCHIVE_SIZE {
844        warn!(
845            "sdist archive too large: {} bytes (limit: {} bytes)",
846            metadata.len(),
847            MAX_ARCHIVE_SIZE
848        );
849        return default_package_data(path);
850    }
851
852    let Some(format) = detect_python_sdist_archive_format(path) else {
853        return default_package_data(path);
854    };
855
856    let mut package_data = match format {
857        PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
858            let file = match File::open(path) {
859                Ok(file) => file,
860                Err(e) => {
861                    warn!("Failed to open sdist archive {:?}: {}", path, e);
862                    return default_package_data(path);
863                }
864            };
865            let decoder = GzDecoder::new(file);
866            extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
867        }
868        PythonSdistArchiveFormat::TarBz2 => {
869            let file = match File::open(path) {
870                Ok(file) => file,
871                Err(e) => {
872                    warn!("Failed to open sdist archive {:?}: {}", path, e);
873                    return default_package_data(path);
874                }
875            };
876            let decoder = BzDecoder::new(file);
877            extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
878        }
879        PythonSdistArchiveFormat::TarXz => {
880            let file = match File::open(path) {
881                Ok(file) => file,
882                Err(e) => {
883                    warn!("Failed to open sdist archive {:?}: {}", path, e);
884                    return default_package_data(path);
885                }
886            };
887            let decoder = XzDecoder::new(file);
888            extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
889        }
890        PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
891    };
892
893    if package_data.package_type.is_some() {
894        let (size, sha256) = calculate_file_checksums(path);
895        package_data.size = size;
896        package_data.sha256 = sha256;
897    }
898
899    package_data
900}
901
902fn extract_from_tar_sdist_archive<R: Read>(
903    path: &Path,
904    reader: R,
905    archive_type: &str,
906    compressed_size: u64,
907) -> PackageData {
908    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
909    else {
910        return default_package_data(path);
911    };
912
913    build_sdist_package_data(path, entries)
914}
915
916fn collect_tar_sdist_entries<R: Read>(
917    path: &Path,
918    reader: R,
919    archive_type: &str,
920    compressed_size: u64,
921) -> Option<Vec<(String, String)>> {
922    let mut archive = Archive::new(reader);
923    let archive_entries = match archive.entries() {
924        Ok(entries) => entries,
925        Err(e) => {
926            warn!(
927                "Failed to read {} sdist archive {:?}: {}",
928                archive_type, path, e
929            );
930            return None;
931        }
932    };
933
934    let mut total_extracted = 0u64;
935    let mut entries = Vec::new();
936
937    for entry_result in archive_entries {
938        let mut entry = match entry_result {
939            Ok(entry) => entry,
940            Err(e) => {
941                warn!(
942                    "Failed to read {} sdist entry from {:?}: {}",
943                    archive_type, path, e
944                );
945                continue;
946            }
947        };
948
949        let entry_size = entry.size();
950        if entry_size > MAX_FILE_SIZE {
951            warn!(
952                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
953                archive_type, path, entry_size, MAX_FILE_SIZE
954            );
955            continue;
956        }
957
958        total_extracted += entry_size;
959        if total_extracted > MAX_ARCHIVE_SIZE {
960            warn!(
961                "Total extracted size exceeds limit for {} sdist {:?}",
962                archive_type, path
963            );
964            return None;
965        }
966
967        if compressed_size > 0 {
968            let ratio = total_extracted as f64 / compressed_size as f64;
969            if ratio > MAX_COMPRESSION_RATIO {
970                warn!(
971                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
972                    archive_type, path, ratio
973                );
974                return None;
975            }
976        }
977
978        let entry_path = match entry.path() {
979            Ok(path) => path.to_string_lossy().replace('\\', "/"),
980            Err(e) => {
981                warn!(
982                    "Failed to get {} sdist entry path from {:?}: {}",
983                    archive_type, path, e
984                );
985                continue;
986            }
987        };
988
989        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
990            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
991            continue;
992        };
993
994        if !is_relevant_sdist_text_entry(&entry_path) {
995            continue;
996        }
997
998        if let Ok(content) = read_limited_utf8(
999            &mut entry,
1000            MAX_FILE_SIZE,
1001            &format!("{} entry {}", archive_type, entry_path),
1002        ) {
1003            entries.push((entry_path, content));
1004        }
1005    }
1006
1007    Some(entries)
1008}
1009
1010fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
1011    let file = match File::open(path) {
1012        Ok(file) => file,
1013        Err(e) => {
1014            warn!("Failed to open zip sdist archive {:?}: {}", path, e);
1015            return default_package_data(path);
1016        }
1017    };
1018
1019    let mut archive = match ZipArchive::new(file) {
1020        Ok(archive) => archive,
1021        Err(e) => {
1022            warn!("Failed to read zip sdist archive {:?}: {}", path, e);
1023            return default_package_data(path);
1024        }
1025    };
1026
1027    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
1028        Ok(entries) => entries,
1029        Err(_) => return default_package_data(path),
1030    };
1031
1032    let mut entries = Vec::new();
1033    for entry in validated_entries.iter() {
1034        if !is_relevant_sdist_text_entry(&entry.name) {
1035            continue;
1036        }
1037
1038        if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
1039            entries.push((entry.name.clone(), content));
1040        }
1041    }
1042
1043    build_sdist_package_data(path, entries)
1044}
1045
/// Returns `true` when `entry_path` names one of the sdist metadata files this
/// parser reads: a `PKG-INFO`, `requires.txt` or `SOURCES.txt` that sits inside
/// some directory (the leading `/` is part of the match).
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
1051
1052fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
1053    let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
1054        warn!("No PKG-INFO file found in sdist archive {:?}", path);
1055        return default_package_data(path);
1056    };
1057
1058    let mut package_data =
1059        python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
1060    merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
1061    merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
1062    apply_sdist_name_version_fallback(path, &mut package_data);
1063    package_data.datasource_id = Some(DatasourceId::PypiSdist);
1064    package_data
1065}
1066
1067fn select_sdist_pkginfo_entry(
1068    archive_path: &Path,
1069    entries: &[(String, String)],
1070) -> Option<(String, String)> {
1071    let expected_name = sdist_archive_expected_name(archive_path);
1072
1073    entries
1074        .iter()
1075        .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
1076        .min_by_key(|(entry_path, content)| {
1077            let components: Vec<_> = entry_path
1078                .split('/')
1079                .filter(|part| !part.is_empty())
1080                .collect();
1081            let candidate_name = sdist_pkginfo_candidate_name(content);
1082            let name_rank = if candidate_name == expected_name {
1083                0
1084            } else {
1085                1
1086            };
1087            let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1088
1089            (name_rank, kind_rank, components.len(), entry_path.clone())
1090        })
1091        .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1092}
1093
1094fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1095    let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1096        return false;
1097    };
1098
1099    entries.iter().any(|(entry_path, content)| {
1100        sdist_pkginfo_kind_rank(entry_path) < 3
1101            && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1102    })
1103}
1104
1105fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1106    archive_path
1107        .file_name()
1108        .and_then(|name| name.to_str())
1109        .and_then(strip_python_archive_extension)
1110        .and_then(|stem| {
1111            stem.rsplit_once('-')
1112                .map(|(name, _)| normalize_python_package_name(name))
1113        })
1114}
1115
1116fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1117    let metadata = super::rfc822::parse_rfc822_content(content);
1118    super::rfc822::get_header_first(&metadata.headers, "name")
1119        .map(|name| normalize_python_package_name(&name))
1120}
1121
/// Ranks a PKG-INFO path by where it sits in the archive (lower is preferred).
///
/// * `0` - `<root>/<something>.egg-info/PKG-INFO`
/// * `1` - `<root>/PKG-INFO`
/// * `2` - any other path ending in `.egg-info/PKG-INFO`
/// * `3` - everything else
///
/// Empty path segments are ignored when counting components.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let segments: Vec<&str> = entry_path
        .split('/')
        .filter(|segment| !segment.is_empty())
        .collect();

    match segments.as_slice() {
        [_, dir, "PKG-INFO"] if dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1139
1140fn merge_sdist_archive_dependencies(
1141    entries: &[(String, String)],
1142    metadata_path: &str,
1143    package_data: &mut PackageData,
1144) {
1145    let metadata_dir = metadata_path
1146        .rsplit_once('/')
1147        .map(|(dir, _)| dir)
1148        .unwrap_or("");
1149    let archive_root = metadata_path.split('/').next().unwrap_or("");
1150    let matched_egg_info_dir =
1151        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1152    let mut extra_dependencies = Vec::new();
1153
1154    for (entry_path, content) in entries {
1155        let is_direct_requires =
1156            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1157        let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1158            entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1159        });
1160
1161        if is_direct_requires || is_egg_info_requires {
1162            extra_dependencies.extend(parse_requires_txt(content));
1163        }
1164    }
1165
1166    for dependency in extra_dependencies {
1167        if !package_data.dependencies.iter().any(|existing| {
1168            existing.purl == dependency.purl
1169                && existing.scope == dependency.scope
1170                && existing.extracted_requirement == dependency.extracted_requirement
1171                && existing.extra_data == dependency.extra_data
1172        }) {
1173            package_data.dependencies.push(dependency);
1174        }
1175    }
1176}
1177
1178fn merge_sdist_archive_file_references(
1179    entries: &[(String, String)],
1180    metadata_path: &str,
1181    package_data: &mut PackageData,
1182) {
1183    let metadata_dir = metadata_path
1184        .rsplit_once('/')
1185        .map(|(dir, _)| dir)
1186        .unwrap_or("");
1187    let archive_root = metadata_path.split('/').next().unwrap_or("");
1188    let matched_egg_info_dir =
1189        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1190    let mut extra_refs = Vec::new();
1191
1192    for (entry_path, content) in entries {
1193        let is_direct_sources =
1194            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1195        let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1196            entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1197        });
1198
1199        if is_direct_sources || is_egg_info_sources {
1200            extra_refs.extend(parse_sources_txt(content));
1201        }
1202    }
1203
1204    for file_ref in extra_refs {
1205        if !package_data
1206            .file_references
1207            .iter()
1208            .any(|existing| existing.path == file_ref.path)
1209        {
1210            package_data.file_references.push(file_ref);
1211        }
1212    }
1213}
1214
1215fn select_matching_sdist_egg_info_dir(
1216    entries: &[(String, String)],
1217    archive_root: &str,
1218    package_name: Option<&str>,
1219) -> Option<String> {
1220    let normalized_package_name = package_name.map(normalize_python_package_name);
1221
1222    entries
1223        .iter()
1224        .filter_map(|(entry_path, _)| {
1225            let components: Vec<_> = entry_path
1226                .split('/')
1227                .filter(|part| !part.is_empty())
1228                .collect();
1229            if components.len() == 3
1230                && components[0] == archive_root
1231                && components[1].ends_with(".egg-info")
1232            {
1233                Some(components[1].to_string())
1234            } else {
1235                None
1236            }
1237        })
1238        .min_by_key(|egg_info_dir| {
1239            let normalized_dir_name =
1240                normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1241            let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1242                0
1243            } else {
1244                1
1245            };
1246
1247            (name_rank, egg_info_dir.clone())
1248        })
1249}
1250
/// Normalizes a package name for comparison: ASCII letters are lowercased and
/// every `_` becomes `-`. All other characters are left untouched.
fn normalize_python_package_name(name: &str) -> String {
    name.chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
1254
1255fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1256    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1257        return;
1258    };
1259
1260    let Some(stem) = strip_python_archive_extension(file_name) else {
1261        return;
1262    };
1263
1264    let Some((name, version)) = stem.rsplit_once('-') else {
1265        return;
1266    };
1267
1268    if package_data.name.is_none() {
1269        package_data.name = Some(name.replace('_', "-"));
1270    }
1271    if package_data.version.is_none() {
1272        package_data.version = Some(version.to_string());
1273    }
1274
1275    if package_data.purl.is_none()
1276        || package_data.repository_homepage_url.is_none()
1277        || package_data.repository_download_url.is_none()
1278        || package_data.api_data_url.is_none()
1279    {
1280        let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1281            build_pypi_urls(
1282                package_data.name.as_deref(),
1283                package_data.version.as_deref(),
1284            );
1285
1286        if package_data.repository_homepage_url.is_none() {
1287            package_data.repository_homepage_url = repository_homepage_url;
1288        }
1289        if package_data.repository_download_url.is_none() {
1290            package_data.repository_download_url = repository_download_url;
1291        }
1292        if package_data.api_data_url.is_none() {
1293            package_data.api_data_url = api_data_url;
1294        }
1295        if package_data.purl.is_none() {
1296            package_data.purl = purl;
1297        }
1298    }
1299}
1300
1301fn extract_from_wheel_archive(path: &Path) -> PackageData {
1302    let metadata = match std::fs::metadata(path) {
1303        Ok(m) => m,
1304        Err(e) => {
1305            warn!(
1306                "Failed to read metadata for wheel archive {:?}: {}",
1307                path, e
1308            );
1309            return default_package_data(path);
1310        }
1311    };
1312
1313    if metadata.len() > MAX_ARCHIVE_SIZE {
1314        warn!(
1315            "Wheel archive too large: {} bytes (limit: {} bytes)",
1316            metadata.len(),
1317            MAX_ARCHIVE_SIZE
1318        );
1319        return default_package_data(path);
1320    }
1321
1322    let file = match File::open(path) {
1323        Ok(f) => f,
1324        Err(e) => {
1325            warn!("Failed to open wheel archive {:?}: {}", path, e);
1326            return default_package_data(path);
1327        }
1328    };
1329
1330    let mut archive = match ZipArchive::new(file) {
1331        Ok(a) => a,
1332        Err(e) => {
1333            warn!("Failed to read wheel archive {:?}: {}", path, e);
1334            return default_package_data(path);
1335        }
1336    };
1337
1338    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1339        Ok(entries) => entries,
1340        Err(_) => return default_package_data(path),
1341    };
1342
1343    let metadata_entry =
1344        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1345            Some(entry) => entry,
1346            None => {
1347                warn!("No METADATA file found in wheel archive {:?}", path);
1348                return default_package_data(path);
1349            }
1350        };
1351
1352    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1353        Ok(c) => c,
1354        Err(e) => {
1355            warn!("Failed to read METADATA from {:?}: {}", path, e);
1356            return default_package_data(path);
1357        }
1358    };
1359
1360    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1361
1362    let (size, sha256) = calculate_file_checksums(path);
1363    package_data.size = size;
1364    package_data.sha256 = sha256;
1365
1366    if let Some(record_entry) =
1367        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1368        && let Ok(record_content) =
1369            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1370    {
1371        package_data.file_references = parse_record_csv(&record_content);
1372    }
1373
1374    if let Some(wheel_info) = parse_wheel_filename(path) {
1375        if package_data.name.is_none() {
1376            package_data.name = Some(wheel_info.name.clone());
1377        }
1378        if package_data.version.is_none() {
1379            package_data.version = Some(wheel_info.version.clone());
1380        }
1381
1382        package_data.qualifiers = Some(std::collections::HashMap::from([(
1383            "extension".to_string(),
1384            format!(
1385                "{}-{}-{}",
1386                wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1387            ),
1388        )]));
1389
1390        package_data.purl = build_wheel_purl(
1391            package_data.name.as_deref(),
1392            package_data.version.as_deref(),
1393            &wheel_info,
1394        );
1395
1396        let mut extra_data = package_data.extra_data.unwrap_or_default();
1397        extra_data.insert(
1398            "python_requires".to_string(),
1399            serde_json::Value::String(wheel_info.python_tag.clone()),
1400        );
1401        extra_data.insert(
1402            "abi_tag".to_string(),
1403            serde_json::Value::String(wheel_info.abi_tag.clone()),
1404        );
1405        extra_data.insert(
1406            "platform_tag".to_string(),
1407            serde_json::Value::String(wheel_info.platform_tag.clone()),
1408        );
1409        package_data.extra_data = Some(extra_data);
1410    }
1411
1412    package_data
1413}
1414
1415fn extract_from_egg_archive(path: &Path) -> PackageData {
1416    let metadata = match std::fs::metadata(path) {
1417        Ok(m) => m,
1418        Err(e) => {
1419            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1420            return default_package_data(path);
1421        }
1422    };
1423
1424    if metadata.len() > MAX_ARCHIVE_SIZE {
1425        warn!(
1426            "Egg archive too large: {} bytes (limit: {} bytes)",
1427            metadata.len(),
1428            MAX_ARCHIVE_SIZE
1429        );
1430        return default_package_data(path);
1431    }
1432
1433    let file = match File::open(path) {
1434        Ok(f) => f,
1435        Err(e) => {
1436            warn!("Failed to open egg archive {:?}: {}", path, e);
1437            return default_package_data(path);
1438        }
1439    };
1440
1441    let mut archive = match ZipArchive::new(file) {
1442        Ok(a) => a,
1443        Err(e) => {
1444            warn!("Failed to read egg archive {:?}: {}", path, e);
1445            return default_package_data(path);
1446        }
1447    };
1448
1449    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1450        Ok(entries) => entries,
1451        Err(_) => return default_package_data(path),
1452    };
1453
1454    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1455        &validated_entries,
1456        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1457    ) {
1458        Some(entry) => entry,
1459        None => {
1460            warn!("No PKG-INFO file found in egg archive {:?}", path);
1461            return default_package_data(path);
1462        }
1463    };
1464
1465    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1466        Ok(c) => c,
1467        Err(e) => {
1468            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1469            return default_package_data(path);
1470        }
1471    };
1472
1473    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1474
1475    let (size, sha256) = calculate_file_checksums(path);
1476    package_data.size = size;
1477    package_data.sha256 = sha256;
1478
1479    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1480        &validated_entries,
1481        &[
1482            "EGG-INFO/installed-files.txt",
1483            ".egg-info/installed-files.txt",
1484        ],
1485    ) && let Ok(installed_files_content) =
1486        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1487    {
1488        package_data.file_references = parse_installed_files_txt(&installed_files_content);
1489    }
1490
1491    if let Some(egg_info) = parse_egg_filename(path) {
1492        if package_data.name.is_none() {
1493            package_data.name = Some(egg_info.name.clone());
1494        }
1495        if package_data.version.is_none() {
1496            package_data.version = Some(egg_info.version.clone());
1497        }
1498
1499        if let Some(python_version) = &egg_info.python_version {
1500            let mut extra_data = package_data.extra_data.unwrap_or_default();
1501            extra_data.insert(
1502                "python_version".to_string(),
1503                serde_json::Value::String(python_version.clone()),
1504            );
1505            package_data.extra_data = Some(extra_data);
1506        }
1507    }
1508
1509    package_data.purl = build_egg_purl(
1510        package_data.name.as_deref(),
1511        package_data.version.as_deref(),
1512    );
1513
1514    package_data
1515}
1516
1517fn find_validated_zip_entry_by_suffix<'a>(
1518    entries: &'a [ValidatedZipEntry],
1519    suffix: &str,
1520) -> Option<&'a ValidatedZipEntry> {
1521    entries.iter().find(|entry| entry.name.ends_with(suffix))
1522}
1523
1524fn find_validated_zip_entry_by_any_suffix<'a>(
1525    entries: &'a [ValidatedZipEntry],
1526    suffixes: &[&str],
1527) -> Option<&'a ValidatedZipEntry> {
1528    entries
1529        .iter()
1530        .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1531}
1532
1533fn read_validated_zip_entry<R: Read + std::io::Seek>(
1534    archive: &mut ZipArchive<R>,
1535    entry: &ValidatedZipEntry,
1536    path: &Path,
1537    archive_type: &str,
1538) -> Result<String, String> {
1539    let mut file = archive
1540        .by_index(entry.index)
1541        .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1542
1543    let compressed_size = file.compressed_size();
1544    let uncompressed_size = file.size();
1545
1546    if compressed_size > 0 {
1547        let ratio = uncompressed_size as f64 / compressed_size as f64;
1548        if ratio > MAX_COMPRESSION_RATIO {
1549            return Err(format!(
1550                "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1551                archive_type, path, ratio
1552            ));
1553        }
1554    }
1555
1556    if uncompressed_size > MAX_FILE_SIZE {
1557        return Err(format!(
1558            "Rejected oversized entry in {} {:?}: {} bytes",
1559            archive_type, path, uncompressed_size
1560        ));
1561    }
1562
1563    read_limited_utf8(
1564        &mut file,
1565        MAX_FILE_SIZE,
1566        &format!("{} entry {}", archive_type, entry.name),
1567    )
1568}
1569
/// Reads at most `max_bytes` bytes from `reader` and decodes them as UTF-8.
///
/// `context` is a human-readable label that is embedded in every error message.
///
/// # Errors
/// Returns a message when the underlying read fails, when the stream holds more
/// than `max_bytes` bytes, or when the bytes are not valid UTF-8.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    // Allow one byte past the limit so an over-long stream can be told apart from
    // one that is exactly `max_bytes` long. Saturate so that a limit of `u64::MAX`
    // cannot overflow (which would panic in debug builds and wrap to a zero-byte
    // read in release builds).
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if bytes.len() as u64 > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1590
/// Normalizes an archive member path to a safe, `/`-separated relative path.
///
/// Backslashes are treated as separators and `.` segments are dropped. The path
/// is rejected (`None`) when it starts with a drive prefix such as `C:/`, is
/// absolute, contains a `..` segment, or has no normal segment left.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // `Path` only recognises drive prefixes on Windows, so check for one by hand.
    if let [drive, b':', b'/', ..] = unified.as_bytes() {
        if drive.is_ascii_alphabetic() {
            return None;
        }
    }

    let mut segments: Vec<String> = Vec::new();
    for part in Path::new(&unified).components() {
        match part {
            Component::CurDir => continue,
            Component::Normal(segment) => segments.push(segment.to_string_lossy().into_owned()),
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1612
1613/// Parses RECORD CSV format from wheel archives (PEP 427).
1614/// Format: path,hash,size (3 columns, no header)
1615/// Hash format: sha256=urlsafe_base64_hash or empty
1616/// Size: bytes as u64 or empty
1617pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1618    let mut reader = ReaderBuilder::new()
1619        .has_headers(false)
1620        .from_reader(content.as_bytes());
1621
1622    let mut file_references = Vec::new();
1623
1624    for result in reader.records() {
1625        match result {
1626            Ok(record) => {
1627                if record.len() < 3 {
1628                    continue;
1629                }
1630
1631                let path = record.get(0).unwrap_or("").trim().to_string();
1632                if path.is_empty() {
1633                    continue;
1634                }
1635
1636                let hash_field = record.get(1).unwrap_or("").trim();
1637                let size_field = record.get(2).unwrap_or("").trim();
1638
1639                // Parse hash: format is "algorithm=value"
1640                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1641                    let parts: Vec<&str> = hash_field.split('=').collect();
1642                    if parts.len() == 2 && parts[0] == "sha256" {
1643                        match URL_SAFE_NO_PAD.decode(parts[1]) {
1644                            Ok(decoded) => {
1645                                let hex = decoded
1646                                    .iter()
1647                                    .map(|b| format!("{:02x}", b))
1648                                    .collect::<String>();
1649                                Sha256Digest::from_hex(&hex).ok()
1650                            }
1651                            Err(_) => None,
1652                        }
1653                    } else {
1654                        None
1655                    }
1656                } else {
1657                    None
1658                };
1659
1660                // Parse size
1661                let size = if !size_field.is_empty() && size_field != "-" {
1662                    size_field.parse::<u64>().ok()
1663                } else {
1664                    None
1665                };
1666
1667                file_references.push(FileReference {
1668                    path,
1669                    size,
1670                    sha1: None,
1671                    md5: None,
1672                    sha256,
1673                    sha512: None,
1674                    extra_data: None,
1675                });
1676            }
1677            Err(e) => {
1678                warn!("Failed to parse RECORD CSV row: {}", e);
1679                continue;
1680            }
1681        }
1682    }
1683
1684    file_references
1685}
1686
1687/// Parses installed-files.txt format from egg archives (PEP 376).
1688/// Format: one file path per line, no headers, no hash, no size
1689pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1690    content
1691        .lines()
1692        .map(|line| line.trim())
1693        .filter(|line| !line.is_empty())
1694        .map(|path| FileReference {
1695            path: path.to_string(),
1696            size: None,
1697            sha1: None,
1698            md5: None,
1699            sha256: None,
1700            sha512: None,
1701            extra_data: None,
1702        })
1703        .collect()
1704}
1705
1706pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1707    content
1708        .lines()
1709        .map(str::trim)
1710        .filter(|line| !line.is_empty())
1711        .map(|path| FileReference {
1712            path: path.to_string(),
1713            size: None,
1714            sha1: None,
1715            md5: None,
1716            sha256: None,
1717            sha512: None,
1718            extra_data: None,
1719        })
1720        .collect()
1721}
1722
/// Fields taken from a wheel file name, as produced by `parse_wheel_filename`.
struct WheelInfo {
    /// Distribution name from the file name, with `_` replaced by `-`.
    name: String,
    /// Version field of the file name.
    version: String,
    /// Python tag field of the file name.
    python_tag: String,
    /// ABI tag field of the file name.
    abi_tag: String,
    /// Platform tag field of the file name.
    platform_tag: String,
}
1730
1731fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
1732    let stem = path.file_stem()?.to_string_lossy();
1733    let parts: Vec<&str> = stem.split('-').collect();
1734
1735    if parts.len() >= 5 {
1736        Some(WheelInfo {
1737            name: parts[0].replace('_', "-"),
1738            version: parts[1].to_string(),
1739            python_tag: parts[2].to_string(),
1740            abi_tag: parts[3].to_string(),
1741            platform_tag: parts[4..].join("-"),
1742        })
1743    } else {
1744        None
1745    }
1746}
1747
/// Fields taken from an egg file name, as produced by `parse_egg_filename`.
struct EggInfo {
    /// First `-`-separated field of the file stem, with `_` replaced by `-`.
    name: String,
    /// Second `-`-separated field of the file stem.
    version: String,
    /// Third `-`-separated field of the file stem, when present.
    python_version: Option<String>,
}
1753
1754fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
1755    let stem = path.file_stem()?.to_string_lossy();
1756    let parts: Vec<&str> = stem.split('-').collect();
1757
1758    if parts.len() >= 2 {
1759        Some(EggInfo {
1760            name: parts[0].replace('_', "-"),
1761            version: parts[1].to_string(),
1762            python_version: parts.get(2).map(|s| s.to_string()),
1763        })
1764    } else {
1765        None
1766    }
1767}
1768
1769fn build_wheel_purl(
1770    name: Option<&str>,
1771    version: Option<&str>,
1772    wheel_info: &WheelInfo,
1773) -> Option<String> {
1774    let name = name?;
1775    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1776
1777    if let Some(ver) = version {
1778        package_url.with_version(ver).ok()?;
1779    }
1780
1781    let extension = format!(
1782        "{}-{}-{}",
1783        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1784    );
1785    package_url.add_qualifier("extension", extension).ok()?;
1786
1787    Some(package_url.to_string())
1788}
1789
1790fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1791    let name = name?;
1792    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1793
1794    if let Some(ver) = version {
1795        package_url.with_version(ver).ok()?;
1796    }
1797
1798    package_url.add_qualifier("type", "egg").ok()?;
1799
1800    Some(package_url.to_string())
1801}
1802
1803fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1804    let metadata = super::rfc822::parse_rfc822_content(content);
1805    build_package_data_from_rfc822(&metadata, datasource_id)
1806}
1807
1808/// Builds PackageData from parsed RFC822 metadata.
1809///
1810/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
1811/// and `python_parse_rfc822_content` (content-based) functions.
1812fn build_package_data_from_rfc822(
1813    metadata: &super::rfc822::Rfc822Metadata,
1814    datasource_id: DatasourceId,
1815) -> PackageData {
1816    use super::rfc822::{get_header_all, get_header_first};
1817
1818    let name = get_header_first(&metadata.headers, "name");
1819    let version = get_header_first(&metadata.headers, "version");
1820    let summary = get_header_first(&metadata.headers, "summary");
1821    let mut homepage_url = get_header_first(&metadata.headers, "home-page");
1822    let author = get_header_first(&metadata.headers, "author");
1823    let author_email = get_header_first(&metadata.headers, "author-email");
1824    let license = get_header_first(&metadata.headers, "license");
1825    let license_expression = get_header_first(&metadata.headers, "license-expression");
1826    let download_url = get_header_first(&metadata.headers, "download-url");
1827    let platform = get_header_first(&metadata.headers, "platform");
1828    let requires_python = get_header_first(&metadata.headers, "requires-python");
1829    let classifiers = get_header_all(&metadata.headers, "classifier");
1830    let license_files = get_header_all(&metadata.headers, "license-file");
1831
1832    let description_body = if metadata.body.is_empty() {
1833        get_header_first(&metadata.headers, "description").unwrap_or_default()
1834    } else {
1835        metadata.body.clone()
1836    };
1837
1838    let description = build_description(summary.as_deref(), &description_body);
1839
1840    let mut parties = Vec::new();
1841    if author.is_some() || author_email.is_some() {
1842        parties.push(Party {
1843            r#type: Some("person".to_string()),
1844            role: Some("author".to_string()),
1845            name: author,
1846            email: author_email,
1847            url: None,
1848            organization: None,
1849            organization_url: None,
1850            timezone: None,
1851        });
1852    }
1853
1854    let (keywords, license_classifiers) = split_classifiers(&classifiers);
1855    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1856    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1857        license_expression
1858            .as_deref()
1859            .and_then(normalize_spdx_expression)
1860            .map(|normalized| {
1861                build_declared_license_data(
1862                    normalized,
1863                    DeclaredLicenseMatchMetadata::single_line(
1864                        license_expression.as_deref().unwrap_or_default(),
1865                    )
1866                    .with_referenced_filenames(&referenced_license_files),
1867                )
1868            })
1869            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1870
1871    let extracted_license_statement = license_expression
1872        .clone()
1873        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1874
1875    let mut extra_data = HashMap::new();
1876    if let Some(platform_value) = platform
1877        && !platform_value.eq_ignore_ascii_case("unknown")
1878        && !platform_value.is_empty()
1879    {
1880        extra_data.insert(
1881            "platform".to_string(),
1882            serde_json::Value::String(platform_value),
1883        );
1884    }
1885
1886    if let Some(requires_python_value) = requires_python
1887        && !requires_python_value.is_empty()
1888    {
1889        extra_data.insert(
1890            "requires_python".to_string(),
1891            serde_json::Value::String(requires_python_value),
1892        );
1893    }
1894
1895    if !license_files.is_empty() {
1896        extra_data.insert(
1897            "license_files".to_string(),
1898            serde_json::Value::Array(
1899                license_files
1900                    .iter()
1901                    .cloned()
1902                    .map(serde_json::Value::String)
1903                    .collect(),
1904            ),
1905        );
1906    }
1907
1908    let file_references = license_files
1909        .iter()
1910        .map(|path| FileReference {
1911            path: path.clone(),
1912            size: None,
1913            sha1: None,
1914            md5: None,
1915            sha256: None,
1916            sha512: None,
1917            extra_data: None,
1918        })
1919        .collect();
1920
1921    let project_urls = get_header_all(&metadata.headers, "project-url");
1922    let dependencies = extract_rfc822_dependencies(&metadata.headers);
1923    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1924
1925    if !project_urls.is_empty() {
1926        let parsed_urls = parse_project_urls(&project_urls);
1927
1928        for (label, url) in &parsed_urls {
1929            let label_lower = label.to_lowercase();
1930
1931            if bug_tracking_url.is_none()
1932                && matches!(
1933                    label_lower.as_str(),
1934                    "tracker"
1935                        | "bug reports"
1936                        | "bug tracker"
1937                        | "issues"
1938                        | "issue tracker"
1939                        | "github: issues"
1940                )
1941            {
1942                bug_tracking_url = Some(url.clone());
1943            } else if code_view_url.is_none()
1944                && matches!(label_lower.as_str(), "source" | "source code" | "code")
1945            {
1946                code_view_url = Some(url.clone());
1947            } else if vcs_url.is_none()
1948                && matches!(
1949                    label_lower.as_str(),
1950                    "github" | "gitlab" | "github: repo" | "repository"
1951                )
1952            {
1953                vcs_url = Some(url.clone());
1954            } else if homepage_url.is_none()
1955                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1956            {
1957                homepage_url = Some(url.clone());
1958            } else if label_lower == "changelog" {
1959                extra_data.insert(
1960                    "changelog_url".to_string(),
1961                    serde_json::Value::String(url.clone()),
1962                );
1963            }
1964        }
1965
1966        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
1967            .iter()
1968            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
1969            .collect();
1970
1971        if !project_urls_json.is_empty() {
1972            extra_data.insert(
1973                "project_urls".to_string(),
1974                serde_json::Value::Object(project_urls_json),
1975            );
1976        }
1977    }
1978
1979    let extra_data = if extra_data.is_empty() {
1980        None
1981    } else {
1982        Some(extra_data)
1983    };
1984
1985    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1986        build_pypi_urls(name.as_deref(), version.as_deref());
1987
1988    PackageData {
1989        package_type: Some(PythonParser::PACKAGE_TYPE),
1990        namespace: None,
1991        name,
1992        version,
1993        qualifiers: None,
1994        subpath: None,
1995        primary_language: Some("Python".to_string()),
1996        description,
1997        release_date: None,
1998        parties,
1999        keywords,
2000        homepage_url,
2001        download_url,
2002        size: None,
2003        sha1: None,
2004        md5: None,
2005        sha256: None,
2006        sha512: None,
2007        bug_tracking_url,
2008        code_view_url,
2009        vcs_url,
2010        copyright: None,
2011        holder: None,
2012        declared_license_expression,
2013        declared_license_expression_spdx,
2014        license_detections,
2015        other_license_expression: None,
2016        other_license_expression_spdx: None,
2017        other_license_detections: Vec::new(),
2018        extracted_license_statement,
2019        notice_text: None,
2020        source_packages: Vec::new(),
2021        file_references,
2022        is_private: false,
2023        is_virtual: false,
2024        extra_data,
2025        dependencies,
2026        repository_homepage_url,
2027        repository_download_url,
2028        api_data_url,
2029        datasource_id: Some(datasource_id),
2030        purl,
2031    }
2032}
2033
/// Parses `Project-URL` header values of the form `Label, URL` into
/// `(label, url)` pairs.
///
/// Each entry is split on its first comma and both halves are trimmed, so
/// `Label,URL` (no space after the comma) is accepted as well as
/// `Label, URL`. Entries without a comma, or whose label or URL is empty
/// after trimming, are skipped. Input order is preserved.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|entry| {
            // Split on the bare comma rather than ", " so that missing or
            // extra whitespace around the separator does not drop the entry.
            let (label, url) = entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());
            (!label.is_empty() && !url.is_empty()).then(|| (label.to_string(), url.to_string()))
        })
        .collect()
}
2049
/// Combines an optional summary and a body into one description.
///
/// Both parts are trimmed; parts that are empty after trimming are dropped
/// and the remaining ones are joined with a newline (summary first).
/// Returns `None` when nothing remains.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let sections: Vec<&str> = summary
        .into_iter()
        .chain(std::iter::once(body))
        .map(str::trim)
        .filter(|section| !section.is_empty())
        .collect();

    match sections.as_slice() {
        [] => None,
        present => Some(present.join("\n")),
    }
}
2068
/// Partitions trove classifiers into `(keywords, license_classifiers)`.
///
/// Classifiers starting with `License ::` go to the second list; all others
/// go to the first. Relative order is preserved within each list.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|classifier| classifier.starts_with("License ::"));

    (keywords, license_classifiers)
}
2083
/// Renders the raw license field and license classifiers as a small
/// newline-terminated text block.
///
/// A non-blank `license` becomes a `license: <value>` line (value trimmed).
/// A non-empty classifier list becomes a `classifiers:` line followed by one
/// `  - '<classifier>'` line per entry. Returns `None` when neither part
/// produces any line.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let mut statement = String::new();

    if let Some(text) = license.map(str::trim).filter(|text| !text.is_empty()) {
        statement.push_str("license: ");
        statement.push_str(text);
        statement.push('\n');
    }

    if !license_classifiers.is_empty() {
        statement.push_str("classifiers:\n");
        for classifier in license_classifiers {
            statement.push_str("  - '");
            statement.push_str(classifier);
            statement.push_str("'\n");
        }
    }

    // Every emitted line is non-empty, so an empty buffer means no lines.
    if statement.is_empty() {
        None
    } else {
        Some(statement)
    }
}
2109
2110pub(crate) fn build_pypi_urls(
2111    name: Option<&str>,
2112    version: Option<&str>,
2113) -> (
2114    Option<String>,
2115    Option<String>,
2116    Option<String>,
2117    Option<String>,
2118) {
2119    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2120
2121    let repository_download_url = name.and_then(|value| {
2122        version.map(|ver| {
2123            format!(
2124                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2125                &value[..1.min(value.len())],
2126                value,
2127                value,
2128                ver
2129            )
2130        })
2131    });
2132
2133    let api_data_url = name.map(|value| {
2134        if let Some(ver) = version {
2135            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2136        } else {
2137            format!("https://pypi.org/pypi/{}/json", value)
2138        }
2139    });
2140
2141    let purl = name.and_then(|value| {
2142        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2143        if let Some(ver) = version {
2144            package_url.with_version(ver).ok()?;
2145        }
2146        Some(package_url.to_string())
2147    });
2148
2149    (
2150        repository_homepage_url,
2151        repository_download_url,
2152        api_data_url,
2153        purl,
2154    )
2155}
2156
2157fn build_pypi_purl_with_extension(
2158    name: &str,
2159    version: Option<&str>,
2160    extension: &str,
2161) -> Option<String> {
2162    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2163    if let Some(ver) = version {
2164        package_url.with_version(ver).ok()?;
2165    }
2166    package_url.add_qualifier("extension", extension).ok()?;
2167    Some(package_url.to_string())
2168}
2169
2170fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2171    let toml_content = match read_toml_file(path) {
2172        Ok(content) => content,
2173        Err(e) => {
2174            warn!(
2175                "Failed to read or parse pyproject.toml at {:?}: {}",
2176                path, e
2177            );
2178            return default_package_data(path);
2179        }
2180    };
2181
2182    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2183    let is_poetry_pyproject = tool_table
2184        .and_then(|tool| tool.get("poetry"))
2185        .and_then(|value| value.as_table())
2186        .is_some();
2187
2188    // Handle both PEP 621 (project table) and poetry formats
2189    let project_table =
2190        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2191            // Standard PEP 621 format with [project] table
2192            project.clone()
2193        } else if let Some(tool) = tool_table {
2194            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2195                // Poetry format with [tool.poetry] table
2196                poetry.clone()
2197            } else {
2198                return default_package_data(path);
2199            }
2200        } else if toml_content.get(FIELD_NAME).is_some() {
2201            // Other format with top-level fields
2202            match toml_content.as_table() {
2203                Some(table) => table.clone(),
2204                None => {
2205                    warn!("Failed to convert TOML content to table in {:?}", path);
2206                    return default_package_data(path);
2207                }
2208            }
2209        } else {
2210            return default_package_data(path);
2211        };
2212
2213    let name = project_table
2214        .get(FIELD_NAME)
2215        .and_then(|v| v.as_str())
2216        .map(String::from);
2217
2218    let version = project_table
2219        .get(FIELD_VERSION)
2220        .and_then(|v| v.as_str())
2221        .map(String::from);
2222    let classifiers = project_table
2223        .get("classifiers")
2224        .and_then(|value| value.as_array())
2225        .map(|values| {
2226            values
2227                .iter()
2228                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2229                .collect::<Vec<_>>()
2230        })
2231        .unwrap_or_default();
2232    let (classifier_keywords, license_classifiers) = split_classifiers(&classifiers);
2233
2234    let extracted_license_statement = extract_raw_license_string(&project_table);
2235    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2236        normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2237
2238    let description = project_table
2239        .get(FIELD_DESCRIPTION)
2240        .and_then(|value| value.as_str())
2241        .map(|value| value.to_string());
2242    let mut keywords = project_table
2243        .get(FIELD_KEYWORDS)
2244        .and_then(|value| value.as_array())
2245        .map(|values| {
2246            values
2247                .iter()
2248                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2249                .collect::<Vec<_>>()
2250        })
2251        .unwrap_or_default();
2252    for classifier in classifier_keywords {
2253        if !keywords.contains(&classifier) {
2254            keywords.push(classifier);
2255        }
2256    }
2257
2258    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
2259    let mut extra_data = extract_pyproject_extra_data(&toml_content).unwrap_or_default();
2260    let (homepage_url, download_url, bug_tracking_url, code_view_url, repository_url) =
2261        extract_urls(&project_table, &mut extra_data);
2262
2263    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2264
2265    // Create package URL
2266    let purl = name.as_ref().and_then(|n| {
2267        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2268            Ok(p) => p,
2269            Err(e) => {
2270                warn!(
2271                    "Failed to create PackageUrl for Python package '{}': {}",
2272                    n, e
2273                );
2274                return None;
2275            }
2276        };
2277
2278        if let Some(v) = &version
2279            && let Err(e) = package_url.with_version(v)
2280        {
2281            warn!(
2282                "Failed to set version '{}' for Python package '{}': {}",
2283                v, n, e
2284            );
2285            return None;
2286        }
2287
2288        Some(package_url.to_string())
2289    });
2290
2291    let api_data_url = name.as_ref().map(|n| {
2292        if let Some(v) = &version {
2293            format!("https://pypi.org/pypi/{}/{}/json", n, v)
2294        } else {
2295            format!("https://pypi.org/pypi/{}/json", n)
2296        }
2297    });
2298
2299    let pypi_homepage_url = name
2300        .as_ref()
2301        .map(|n| format!("https://pypi.org/project/{}", n));
2302
2303    let pypi_download_url = name.as_ref().and_then(|n| {
2304        version.as_ref().map(|v| {
2305            format!(
2306                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2307                &n[..1.min(n.len())],
2308                n,
2309                n,
2310                v
2311            )
2312        })
2313    });
2314
2315    PackageData {
2316        package_type: Some(PythonParser::PACKAGE_TYPE),
2317        namespace: None,
2318        name,
2319        version,
2320        qualifiers: None,
2321        subpath: None,
2322        primary_language: None,
2323        description,
2324        release_date: None,
2325        parties: extract_parties(&project_table),
2326        keywords,
2327        homepage_url: homepage_url.or(pypi_homepage_url),
2328        download_url: download_url
2329            .or_else(|| repository_url.clone())
2330            .or(pypi_download_url),
2331        size: None,
2332        sha1: None,
2333        md5: None,
2334        sha256: None,
2335        sha512: None,
2336        bug_tracking_url,
2337        code_view_url,
2338        vcs_url: repository_url,
2339        copyright: None,
2340        holder: None,
2341        declared_license_expression,
2342        declared_license_expression_spdx,
2343        license_detections,
2344        other_license_expression: None,
2345        other_license_expression_spdx: None,
2346        other_license_detections: Vec::new(),
2347        extracted_license_statement: extracted_license_statement
2348            .or_else(|| build_extracted_license_statement(None, &license_classifiers)),
2349        notice_text: None,
2350        source_packages: Vec::new(),
2351        file_references: Vec::new(),
2352        is_private: has_private_classifier(&classifiers),
2353        is_virtual: false,
2354        extra_data: if extra_data.is_empty() {
2355            None
2356        } else {
2357            Some(extra_data)
2358        },
2359        dependencies: [dependencies, optional_dependencies].concat(),
2360        repository_homepage_url: None,
2361        repository_download_url: None,
2362        api_data_url,
2363        datasource_id: Some(if is_poetry_pyproject {
2364            DatasourceId::PypiPoetryPyprojectToml
2365        } else {
2366            DatasourceId::PypiPyprojectToml
2367        }),
2368        purl,
2369    }
2370}
2371
2372fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2373    let path_str = path.to_string_lossy().replace('\\', "/");
2374    if path_str.contains("/EGG-INFO/PKG-INFO") {
2375        DatasourceId::PypiEggPkginfo
2376    } else if path_str.ends_with(".egg-info/PKG-INFO") {
2377        DatasourceId::PypiEditableEggPkginfo
2378    } else {
2379        DatasourceId::PypiSdistPkginfo
2380    }
2381}
2382
2383fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2384    project
2385        .get(FIELD_LICENSE)
2386        .and_then(|license_value| match license_value {
2387            TomlValue::String(license_str) => Some(license_str.clone()),
2388            TomlValue::Table(license_table) => license_table
2389                .get("text")
2390                .and_then(|v| v.as_str())
2391                .map(|s| s.to_string())
2392                .or_else(|| {
2393                    license_table
2394                        .get("expression")
2395                        .and_then(|v| v.as_str())
2396                        .map(|expr| expr.to_string())
2397                }),
2398            _ => None,
2399        })
2400}
2401
2402fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2403    match project.get(FIELD_LICENSE) {
2404        Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2405        Some(TomlValue::Table(license_table)) => license_table
2406            .get("expression")
2407            .and_then(|value| value.as_str()),
2408        _ => None,
2409    }
2410}
2411
2412fn extract_urls(
2413    project: &TomlMap<String, TomlValue>,
2414    extra_data: &mut HashMap<String, serde_json::Value>,
2415) -> ProjectUrls {
2416    let mut homepage_url = None;
2417    let mut download_url = None;
2418    let mut bug_tracking_url = None;
2419    let mut code_view_url = None;
2420    let mut repository_url = None;
2421
2422    // Check for URLs table
2423    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2424        let parsed_urls: Vec<(String, String)> = urls
2425            .iter()
2426            .filter_map(|(label, value)| {
2427                value
2428                    .as_str()
2429                    .map(|url| (label.to_string(), url.to_string()))
2430            })
2431            .collect();
2432        apply_project_url_mappings(
2433            &parsed_urls,
2434            &mut homepage_url,
2435            &mut bug_tracking_url,
2436            &mut code_view_url,
2437            &mut repository_url,
2438            extra_data,
2439        );
2440
2441        download_url = urls
2442            .get("Downloads")
2443            .or_else(|| urls.get("downloads"))
2444            .and_then(|v| v.as_str())
2445            .map(String::from);
2446
2447        if homepage_url.is_none() {
2448            homepage_url = urls
2449                .get(FIELD_HOMEPAGE)
2450                .and_then(|v| v.as_str())
2451                .map(String::from);
2452        }
2453        if repository_url.is_none() {
2454            repository_url = urls
2455                .get(FIELD_REPOSITORY)
2456                .and_then(|v| v.as_str())
2457                .map(String::from);
2458        }
2459    }
2460
2461    // If not found in URLs table, check for top-level keys
2462    if homepage_url.is_none() {
2463        homepage_url = project
2464            .get(FIELD_HOMEPAGE)
2465            .and_then(|v| v.as_str())
2466            .map(String::from);
2467    }
2468
2469    if repository_url.is_none() {
2470        repository_url = project
2471            .get(FIELD_REPOSITORY)
2472            .and_then(|v| v.as_str())
2473            .map(String::from);
2474    }
2475
2476    (
2477        homepage_url,
2478        download_url,
2479        bug_tracking_url,
2480        code_view_url,
2481        repository_url,
2482    )
2483}
2484
2485fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2486    let mut parties = Vec::new();
2487
2488    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2489        for author in authors {
2490            if let Some(author_str) = author.as_str() {
2491                let (name, email) = split_name_email(author_str);
2492                parties.push(Party {
2493                    r#type: None,
2494                    role: Some("author".to_string()),
2495                    name,
2496                    email,
2497                    url: None,
2498                    organization: None,
2499                    organization_url: None,
2500                    timezone: None,
2501                });
2502            } else if let Some(author_table) = author.as_table() {
2503                let name = author_table
2504                    .get("name")
2505                    .and_then(|value| value.as_str())
2506                    .map(|value| value.to_string());
2507                let email = author_table
2508                    .get("email")
2509                    .and_then(|value| value.as_str())
2510                    .map(|value| value.to_string());
2511                if name.is_some() || email.is_some() {
2512                    parties.push(Party {
2513                        r#type: None,
2514                        role: Some("author".to_string()),
2515                        name,
2516                        email,
2517                        url: None,
2518                        organization: None,
2519                        organization_url: None,
2520                        timezone: None,
2521                    });
2522                }
2523            }
2524        }
2525    }
2526
2527    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2528        for maintainer in maintainers {
2529            if let Some(maintainer_str) = maintainer.as_str() {
2530                let (name, email) = split_name_email(maintainer_str);
2531                parties.push(Party {
2532                    r#type: None,
2533                    role: Some("maintainer".to_string()),
2534                    name,
2535                    email,
2536                    url: None,
2537                    organization: None,
2538                    organization_url: None,
2539                    timezone: None,
2540                });
2541            } else if let Some(maintainer_table) = maintainer.as_table() {
2542                let name = maintainer_table
2543                    .get("name")
2544                    .and_then(|value| value.as_str())
2545                    .map(|value| value.to_string());
2546                let email = maintainer_table
2547                    .get("email")
2548                    .and_then(|value| value.as_str())
2549                    .map(|value| value.to_string());
2550                if name.is_some() || email.is_some() {
2551                    parties.push(Party {
2552                        r#type: None,
2553                        role: Some("maintainer".to_string()),
2554                        name,
2555                        email,
2556                        url: None,
2557                        organization: None,
2558                        organization_url: None,
2559                        timezone: None,
2560                    });
2561                }
2562            }
2563        }
2564    }
2565
2566    parties
2567}
2568
2569fn extract_dependencies(
2570    project: &TomlMap<String, TomlValue>,
2571    toml_content: &TomlValue,
2572) -> (Vec<Dependency>, Vec<Dependency>) {
2573    let mut dependencies = Vec::new();
2574    let mut optional_dependencies = Vec::new();
2575
2576    // Handle dependencies - can be array or table format
2577    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2578        match deps_value {
2579            TomlValue::Array(arr) => {
2580                dependencies = parse_dependency_array(arr, false, None);
2581            }
2582            TomlValue::Table(table) => {
2583                dependencies = parse_dependency_table(table, false, None);
2584            }
2585            _ => {}
2586        }
2587    }
2588
2589    // Handle PEP 621 optional-dependencies with scope
2590    if let Some(opt_deps_table) = project
2591        .get(FIELD_OPTIONAL_DEPENDENCIES)
2592        .and_then(|v| v.as_table())
2593    {
2594        for (extra_name, deps) in opt_deps_table {
2595            match deps {
2596                TomlValue::Array(arr) => {
2597                    optional_dependencies.extend(parse_dependency_array(
2598                        arr,
2599                        true,
2600                        Some(extra_name),
2601                    ));
2602                }
2603                TomlValue::Table(table) => {
2604                    optional_dependencies.extend(parse_dependency_table(
2605                        table,
2606                        true,
2607                        Some(extra_name),
2608                    ));
2609                }
2610                _ => {}
2611            }
2612        }
2613    }
2614
2615    // Handle Poetry dev-dependencies
2616    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2617        match dev_deps_value {
2618            TomlValue::Array(arr) => {
2619                optional_dependencies.extend(parse_dependency_array(
2620                    arr,
2621                    true,
2622                    Some(FIELD_DEV_DEPENDENCIES),
2623                ));
2624            }
2625            TomlValue::Table(table) => {
2626                optional_dependencies.extend(parse_dependency_table(
2627                    table,
2628                    true,
2629                    Some(FIELD_DEV_DEPENDENCIES),
2630                ));
2631            }
2632            _ => {}
2633        }
2634    }
2635
2636    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
2637    if let Some(groups_table) = project.get("group").and_then(|v| v.as_table()) {
2638        for (group_name, group_data) in groups_table {
2639            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2640                match group_deps {
2641                    TomlValue::Array(arr) => {
2642                        optional_dependencies.extend(parse_dependency_array(
2643                            arr,
2644                            true,
2645                            Some(group_name),
2646                        ));
2647                    }
2648                    TomlValue::Table(table) => {
2649                        optional_dependencies.extend(parse_dependency_table(
2650                            table,
2651                            true,
2652                            Some(group_name),
2653                        ));
2654                    }
2655                    _ => {}
2656                }
2657            }
2658        }
2659    }
2660
2661    if let Some(groups_table) = toml_content
2662        .get(FIELD_DEPENDENCY_GROUPS)
2663        .and_then(|value| value.as_table())
2664    {
2665        for (group_name, deps) in groups_table {
2666            match deps {
2667                TomlValue::Array(arr) => {
2668                    optional_dependencies.extend(parse_dependency_array(
2669                        arr,
2670                        true,
2671                        Some(group_name),
2672                    ));
2673                }
2674                TomlValue::Table(table) => {
2675                    optional_dependencies.extend(parse_dependency_table(
2676                        table,
2677                        true,
2678                        Some(group_name),
2679                    ));
2680                }
2681                _ => {}
2682            }
2683        }
2684    }
2685
2686    if let Some(dev_deps_value) = toml_content
2687        .get("tool")
2688        .and_then(|value| value.as_table())
2689        .and_then(|tool| tool.get("uv"))
2690        .and_then(|value| value.as_table())
2691        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2692    {
2693        match dev_deps_value {
2694            TomlValue::Array(arr) => {
2695                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2696            }
2697            TomlValue::Table(table) => {
2698                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2699            }
2700            _ => {}
2701        }
2702    }
2703
2704    (dependencies, optional_dependencies)
2705}
2706
2707fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2708    let mut extra_data = HashMap::new();
2709
2710    if let Some(tool_uv) = toml_content
2711        .get("tool")
2712        .and_then(|value| value.as_table())
2713        .and_then(|tool| tool.get("uv"))
2714    {
2715        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2716    }
2717
2718    if extra_data.is_empty() {
2719        None
2720    } else {
2721        Some(extra_data)
2722    }
2723}
2724
2725fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2726    match value {
2727        TomlValue::String(value) => JsonValue::String(value.clone()),
2728        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2729        TomlValue::Float(value) => JsonValue::String(value.to_string()),
2730        TomlValue::Boolean(value) => JsonValue::Bool(*value),
2731        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2732        TomlValue::Array(values) => {
2733            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2734        }
2735        TomlValue::Table(values) => JsonValue::Object(
2736            values
2737                .iter()
2738                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2739                .collect::<JsonMap<String, JsonValue>>(),
2740        ),
2741    }
2742}
2743
2744fn parse_dependency_table(
2745    table: &TomlMap<String, TomlValue>,
2746    is_optional: bool,
2747    scope: Option<&str>,
2748) -> Vec<Dependency> {
2749    table
2750        .iter()
2751        .filter_map(|(name, version)| {
2752            let version_str = version.as_str().map(|s| s.to_string());
2753            let mut package_url =
2754                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2755
2756            if let Some(v) = &version_str {
2757                package_url.with_version(v).ok()?;
2758            }
2759
2760            Some(Dependency {
2761                purl: Some(package_url.to_string()),
2762                extracted_requirement: None,
2763                scope: scope.map(|s| s.to_string()),
2764                is_runtime: Some(!is_optional),
2765                is_optional: Some(is_optional),
2766                is_pinned: None,
2767                is_direct: Some(true),
2768                resolved_package: None,
2769                extra_data: None,
2770            })
2771        })
2772        .collect()
2773}
2774
2775fn parse_dependency_array(
2776    array: &[TomlValue],
2777    is_optional: bool,
2778    scope: Option<&str>,
2779) -> Vec<Dependency> {
2780    array
2781        .iter()
2782        .filter_map(|dep| {
2783            let dep_str = dep.as_str()?;
2784            build_pyproject_array_dependency(dep_str, is_optional, scope)
2785        })
2786        .collect()
2787}
2788
2789fn build_pyproject_array_dependency(
2790    dep_str: &str,
2791    is_optional: bool,
2792    scope: Option<&str>,
2793) -> Option<Dependency> {
2794    let parsed = parse_pep508_requirement(dep_str)?;
2795    let name = normalize_python_package_name(&parsed.name);
2796    let pinned_version = parsed
2797        .specifiers
2798        .as_deref()
2799        .and_then(extract_exact_pinned_version);
2800
2801    let purl = build_python_dependency_purl(&name, pinned_version.as_deref())?;
2802
2803    let mut extra_data = HashMap::new();
2804    if let Some(marker) = parsed.marker {
2805        extra_data.insert("marker".to_string(), JsonValue::String(marker));
2806    }
2807    if !parsed.extras.is_empty() {
2808        extra_data.insert(
2809            "extras".to_string(),
2810            JsonValue::Array(parsed.extras.into_iter().map(JsonValue::String).collect()),
2811        );
2812    }
2813
2814    let extracted_requirement = parsed.specifiers.or(parsed.url);
2815
2816    Some(Dependency {
2817        purl: Some(purl),
2818        extracted_requirement: extracted_requirement.clone(),
2819        scope: scope.map(|s| s.to_string()),
2820        is_runtime: Some(!is_optional),
2821        is_optional: Some(is_optional),
2822        is_pinned: Some(pinned_version.is_some()),
2823        is_direct: Some(true),
2824        resolved_package: None,
2825        extra_data: if extra_data.is_empty() {
2826            None
2827        } else {
2828            Some(extra_data)
2829        },
2830    })
2831}
2832
/// Returns the version named by a single exact-equality specifier.
///
/// Accepts `==<version>` and `===<version>` (surrounding whitespace is
/// ignored) and returns the trimmed version text. Returns `None` when:
///
/// - the specifier set has more than one clause (contains a comma),
/// - the operator is anything other than `==` / `===`,
/// - the version part is empty, or
/// - the `==` clause uses a wildcard such as `==1.4.*`. Per PEP 440 that is a
///   prefix match covering a whole release series, not a pin to one version.
///   `===` compares the text literally, so it is returned unchanged.
fn extract_exact_pinned_version(specifiers: &str) -> Option<String> {
    let trimmed = specifiers.trim();
    if trimmed.contains(',') {
        return None;
    }

    // `===` has to be tested first because it also starts with `==`.
    let (version, is_arbitrary_equality) = if let Some(rest) = trimmed.strip_prefix("===") {
        (rest.trim(), true)
    } else if let Some(rest) = trimmed.strip_prefix("==") {
        (rest.trim(), false)
    } else {
        return None;
    };

    if version.is_empty() {
        return None;
    }

    if !is_arbitrary_equality && version.contains('*') {
        return None;
    }

    Some(version.to_string())
}
2854
/// A literal value produced while statically evaluating setup.py expressions.
///
/// Mirrors the Python literal kinds the evaluator understands. All numbers are
/// held as `f64`, and dictionaries are keyed by strings.
#[derive(Clone, Debug)]
enum Value {
    /// A string literal.
    String(String),
    /// A numeric literal, stored as a float.
    Number(f64),
    /// `True` or `False`.
    Bool(bool),
    /// Python's `None`.
    None,
    /// A list of evaluated items.
    List(Vec<Value>),
    /// A tuple of evaluated items.
    Tuple(Vec<Value>),
    /// A dictionary with string keys.
    Dict(HashMap<String, Value>),
}
2865
2866struct LiteralEvaluator {
2867    constants: HashMap<String, Value>,
2868    max_depth: usize,
2869    max_nodes: usize,
2870    nodes_visited: usize,
2871}
2872
2873impl LiteralEvaluator {
2874    fn new(constants: HashMap<String, Value>) -> Self {
2875        Self {
2876            constants,
2877            max_depth: MAX_SETUP_PY_AST_DEPTH,
2878            max_nodes: MAX_SETUP_PY_AST_NODES,
2879            nodes_visited: 0,
2880        }
2881    }
2882
2883    fn insert_constant(&mut self, name: String, value: Value) {
2884        self.constants.insert(name, value);
2885    }
2886
2887    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
2888        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
2889            return None;
2890        }
2891        self.nodes_visited += 1;
2892
2893        match expr {
2894            ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
2895                Some(Value::String(value.to_str().to_string()))
2896            }
2897            ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
2898                Some(Value::Bool(*value))
2899            }
2900            ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
2901                self.evaluate_number(value)
2902            }
2903            ast::Expr::NoneLiteral(_) => Some(Value::None),
2904            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
2905            ast::Expr::List(ast::ExprList { elts, .. }) => {
2906                let mut values = Vec::new();
2907                for elt in elts {
2908                    values.push(self.evaluate_expr(elt, depth + 1)?);
2909                }
2910                Some(Value::List(values))
2911            }
2912            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
2913                let mut values = Vec::new();
2914                for elt in elts {
2915                    values.push(self.evaluate_expr(elt, depth + 1)?);
2916                }
2917                Some(Value::Tuple(values))
2918            }
2919            ast::Expr::Dict(ast::ExprDict { items, .. }) => {
2920                let mut dict = HashMap::new();
2921                for item in items {
2922                    let key_expr = item.key.as_ref()?;
2923                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
2924                    let key = value_to_string(&key_value)?;
2925                    let value = self.evaluate_expr(&item.value, depth + 1)?;
2926                    dict.insert(key, value);
2927                }
2928                Some(Value::Dict(dict))
2929            }
2930            ast::Expr::Call(ast::ExprCall {
2931                func, arguments, ..
2932            }) => {
2933                let args = arguments.args.as_ref();
2934                let keywords = arguments.keywords.as_ref();
2935                if keywords.is_empty()
2936                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
2937                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
2938                {
2939                    return self.evaluate_ordered_dict(args, depth + 1);
2940                }
2941
2942                if !args.is_empty() {
2943                    return None;
2944                }
2945
2946                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
2947                    && id == "dict"
2948                {
2949                    let mut dict = HashMap::new();
2950                    for keyword in keywords {
2951                        let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
2952                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
2953                        dict.insert(key.to_string(), value);
2954                    }
2955                    return Some(Value::Dict(dict));
2956                }
2957
2958                None
2959            }
2960            _ => None,
2961        }
2962    }
2963
2964    fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
2965        match number {
2966            ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
2967            ast::Number::Float(value) => Some(Value::Number(*value)),
2968            ast::Number::Complex { .. } => None,
2969        }
2970    }
2971
2972    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
2973        if args.len() != 1 {
2974            return None;
2975        }
2976
2977        let items = match self.evaluate_expr(&args[0], depth)? {
2978            Value::List(items) | Value::Tuple(items) => items,
2979            _ => return None,
2980        };
2981
2982        let mut dict = HashMap::new();
2983        for item in items {
2984            let Value::Tuple(values) = item else {
2985                return None;
2986            };
2987            if values.len() != 2 {
2988                return None;
2989            }
2990            let key = value_to_string(&values[0])?;
2991            dict.insert(key, values[1].clone());
2992        }
2993
2994        Some(Value::Dict(dict))
2995    }
2996}
2997
/// Names under which `setup` can be reached in a setup.py module.
#[derive(Default)]
struct SetupAliases {
    /// Bare names that refer to the `setup` function.
    setup_names: HashSet<String>,
    /// Local module alias mapped to the real module name it was imported from.
    module_aliases: HashMap<String, String>,
}
3003
3004fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
3005    extract_from_setup_py(path).into_iter().collect()
3006}
3007
3008fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
3009    let content = match read_file_to_string(path) {
3010        Ok(content) => content,
3011        Err(e) => {
3012            warn!("Failed to read setup.py at {:?}: {}", path, e);
3013            return Some(default_package_data(path));
3014        }
3015    };
3016
3017    if content.len() > MAX_SETUP_PY_BYTES {
3018        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
3019        let package_data = extract_from_setup_py_regex(&content);
3020        return should_emit_setup_py_package(&package_data).then_some(package_data);
3021    }
3022
3023    let mut package_data = match extract_from_setup_py_ast(&content) {
3024        Ok(Some(data)) => data,
3025        Ok(None) => return Some(default_package_data(path)),
3026        Err(e) => {
3027            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
3028            extract_from_setup_py_regex(&content)
3029        }
3030    };
3031
3032    if package_data.name.is_none() {
3033        package_data.name = extract_setup_value(&content, "name");
3034    }
3035
3036    if package_data.version.is_none() {
3037        package_data.version = extract_setup_value(&content, "version");
3038    }
3039
3040    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
3041
3042    if package_data.purl.is_none() {
3043        package_data.purl = build_setup_py_purl(
3044            package_data.name.as_deref(),
3045            package_data.version.as_deref(),
3046        );
3047    }
3048
3049    if should_emit_setup_py_package(&package_data) {
3050        Some(package_data)
3051    } else {
3052        Some(default_package_data(path))
3053    }
3054}
3055
3056fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
3057    package_data.name.is_some()
3058        || package_data.version.is_some()
3059        || package_data.purl.is_some()
3060        || !package_data.dependencies.is_empty()
3061        || package_data.extracted_license_statement.is_some()
3062        || !package_data.license_detections.is_empty()
3063        || !package_data.parties.is_empty()
3064        || package_data.description.is_some()
3065        || package_data.homepage_url.is_some()
3066        || package_data.bug_tracking_url.is_some()
3067        || package_data.code_view_url.is_some()
3068        || package_data.vcs_url.is_some()
3069}
3070
3071fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
3072    if package_data.version.is_some()
3073        && package_data.extracted_license_statement.is_some()
3074        && package_data
3075            .parties
3076            .iter()
3077            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
3078    {
3079        return;
3080    }
3081
3082    let Some(root) = path.parent() else {
3083        return;
3084    };
3085
3086    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
3087
3088    if package_data.version.is_none() {
3089        package_data.version = dunder_metadata.version;
3090    }
3091
3092    if package_data.extracted_license_statement.is_none() {
3093        package_data.extracted_license_statement = dunder_metadata.license;
3094    }
3095
3096    let has_author = package_data
3097        .parties
3098        .iter()
3099        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
3100
3101    if !has_author && let Some(author) = dunder_metadata.author {
3102        package_data.parties.push(Party {
3103            r#type: Some("person".to_string()),
3104            role: Some("author".to_string()),
3105            name: Some(author),
3106            email: None,
3107            url: None,
3108            organization: None,
3109            organization_url: None,
3110            timezone: None,
3111        });
3112    }
3113}
3114
/// Values of module-level dunder assignments found in imported sibling modules.
#[derive(Default)]
struct DunderMetadata {
    /// Value assigned to `__version__`, if found.
    version: Option<String>,
    /// Value assigned to `__author__`, if found.
    author: Option<String>,
    /// Value assigned to `__license__`, if found.
    license: Option<String>,
}
3121
3122fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
3123    let statements = match parse_module(content) {
3124        Ok(parsed) => parsed.into_suite(),
3125        Err(_) => return DunderMetadata::default(),
3126    };
3127
3128    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3129    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3130    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3131    let mut metadata = DunderMetadata::default();
3132
3133    for module in imported_dunder_modules(&statements) {
3134        let Some(path) = resolve_imported_module_path(root, &module) else {
3135            continue;
3136        };
3137        let Ok(module_content) = read_file_to_string(&path) else {
3138            continue;
3139        };
3140
3141        if metadata.version.is_none() {
3142            metadata.version = version_re
3143                .as_ref()
3144                .and_then(|regex| regex.captures(&module_content))
3145                .and_then(|captures| captures.get(1))
3146                .map(|match_| match_.as_str().to_string());
3147        }
3148
3149        if metadata.author.is_none() {
3150            metadata.author = author_re
3151                .as_ref()
3152                .and_then(|regex| regex.captures(&module_content))
3153                .and_then(|captures| captures.get(1))
3154                .map(|match_| match_.as_str().to_string());
3155        }
3156
3157        if metadata.license.is_none() {
3158            metadata.license = license_re
3159                .as_ref()
3160                .and_then(|regex| regex.captures(&module_content))
3161                .and_then(|captures| captures.get(1))
3162                .map(|match_| match_.as_str().to_string());
3163        }
3164
3165        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
3166            return metadata;
3167        }
3168    }
3169
3170    metadata
3171}
3172
3173fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
3174    let mut modules = Vec::new();
3175
3176    for statement in statements {
3177        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
3178            continue;
3179        };
3180        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
3181            continue;
3182        };
3183        let imports_dunder = names.iter().any(|alias| {
3184            matches!(
3185                alias.name.as_str(),
3186                "__version__" | "__author__" | "__license__"
3187            )
3188        });
3189        if imports_dunder {
3190            modules.push(module.to_string());
3191        }
3192    }
3193
3194    modules
3195}
3196
/// Resolves a dotted module name to a source file below `root`.
///
/// Candidates are tried in this order and the first one that is a regular
/// file is returned:
///
/// 1. `<root>/<module path>.py`
/// 2. `<root>/<module path>/__init__.py`
/// 3. `<root>/src/<module path>.py`
/// 4. `<root>/src/<module path>/__init__.py`
///
/// Directories are never returned: the result is meant to be read as a file,
/// and accepting a directory (for example one named `pkg.py`) would hide the
/// valid candidates that follow it.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let relative = PathBuf::from_iter(module.split('.'));
    let src_root = root.join("src");

    [
        root.join(relative.with_extension("py")),
        root.join(&relative).join("__init__.py"),
        src_root.join(relative.with_extension("py")),
        src_root.join(&relative).join("__init__.py"),
    ]
    .into_iter()
    .find(|candidate| candidate.is_file())
}
3208
3209/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
3210///
3211/// # Security Model
3212///
3213/// This function parses setup.py as a Python AST and evaluates only literal values
3214/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
3215/// arbitrary code execution during scanning.
3216///
3217/// # DoS Prevention
3218///
3219/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
3220/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
3221/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
3222///
3223/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
3224fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
3225    let statements = parse_module(content)
3226        .map(|parsed| parsed.into_suite())
3227        .map_err(|e| e.to_string())?;
3228    let aliases = collect_setup_aliases(&statements);
3229    let mut evaluator = LiteralEvaluator::new(HashMap::new());
3230    build_setup_py_constants(&statements, &mut evaluator);
3231
3232    let setup_call = find_setup_call(&statements, &aliases);
3233    let Some(call_expr) = setup_call else {
3234        return Ok(None);
3235    };
3236
3237    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3238    Ok(Some(build_setup_py_package_data(&setup_values)))
3239}
3240
3241fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3242    for stmt in statements {
3243        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3244            if targets.len() != 1 {
3245                continue;
3246            }
3247
3248            let Some(name) = extract_assign_name(&targets[0]) else {
3249                continue;
3250            };
3251
3252            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3253                evaluator.insert_constant(name, value);
3254            }
3255        }
3256    }
3257}
3258
3259fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3260    match target {
3261        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3262        _ => None,
3263    }
3264}
3265
3266fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3267    let mut aliases = SetupAliases::default();
3268    aliases.setup_names.insert("setup".to_string());
3269
3270    for stmt in statements {
3271        match stmt {
3272            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3273                for alias in names {
3274                    let module_name = alias.name.as_str();
3275                    if !is_setup_module(module_name) {
3276                        continue;
3277                    }
3278                    let alias_name = alias
3279                        .asname
3280                        .as_ref()
3281                        .map(|name| name.as_str())
3282                        .unwrap_or(module_name);
3283                    aliases
3284                        .module_aliases
3285                        .insert(alias_name.to_string(), module_name.to_string());
3286                }
3287            }
3288            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3289                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3290                    continue;
3291                };
3292                if !is_setup_module(module_name) {
3293                    continue;
3294                }
3295                for alias in names {
3296                    if alias.name.as_str() != "setup" {
3297                        continue;
3298                    }
3299                    let alias_name = alias
3300                        .asname
3301                        .as_ref()
3302                        .map(|name| name.as_str())
3303                        .unwrap_or("setup");
3304                    aliases.setup_names.insert(alias_name.to_string());
3305                }
3306            }
3307            _ => {}
3308        }
3309    }
3310
3311    aliases
3312}
3313
/// Reports whether `module_name` is one of the modules recognised as
/// providing `setup`: `setuptools`, `distutils` or `distutils.core`.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3317
3318fn find_setup_call<'a>(
3319    statements: &'a [ast::Stmt],
3320    aliases: &'a SetupAliases,
3321) -> Option<&'a ast::Expr> {
3322    let mut finder = SetupCallFinder {
3323        aliases,
3324        called_function_names: collect_top_level_called_function_names(statements),
3325        nodes_visited: 0,
3326    };
3327    finder.find_in_statements(statements)
3328}
3329
3330fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3331    let mut called = HashSet::new();
3332    collect_called_function_names_in_statements(statements, &mut called);
3333    called
3334}
3335
3336fn collect_called_function_names_in_statements(
3337    statements: &[ast::Stmt],
3338    called: &mut HashSet<String>,
3339) {
3340    for stmt in statements {
3341        match stmt {
3342            ast::Stmt::Expr(ast::StmtExpr { value, .. })
3343            | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3344                collect_called_function_names_in_expr(value.as_ref(), called);
3345            }
3346            ast::Stmt::If(ast::StmtIf {
3347                body,
3348                elif_else_clauses,
3349                ..
3350            }) => {
3351                collect_called_function_names_in_statements(body, called);
3352                for clause in elif_else_clauses {
3353                    collect_called_function_names_in_statements(&clause.body, called);
3354                }
3355            }
3356            ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3357            | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3358                collect_called_function_names_in_statements(body, called);
3359                collect_called_function_names_in_statements(orelse, called);
3360            }
3361            ast::Stmt::With(ast::StmtWith { body, .. }) => {
3362                collect_called_function_names_in_statements(body, called);
3363            }
3364            ast::Stmt::Try(ast::StmtTry {
3365                body,
3366                orelse,
3367                finalbody,
3368                handlers,
3369                ..
3370            }) => {
3371                collect_called_function_names_in_statements(body, called);
3372                collect_called_function_names_in_statements(orelse, called);
3373                collect_called_function_names_in_statements(finalbody, called);
3374                for handler in handlers {
3375                    let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3376                        body,
3377                        ..
3378                    }) = handler;
3379                    collect_called_function_names_in_statements(body, called);
3380                }
3381            }
3382            _ => {}
3383        }
3384    }
3385}
3386
3387fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3388    if let ast::Expr::Call(ast::ExprCall {
3389        func, arguments, ..
3390    }) = expr
3391    {
3392        if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3393            called.insert(id.as_str().to_string());
3394        }
3395
3396        for arg in arguments.args.iter() {
3397            collect_called_function_names_in_expr(arg, called);
3398        }
3399        for keyword in arguments.keywords.iter() {
3400            collect_called_function_names_in_expr(&keyword.value, called);
3401        }
3402    }
3403}
3404
3405struct SetupCallFinder<'a> {
3406    aliases: &'a SetupAliases,
3407    called_function_names: HashSet<String>,
3408    nodes_visited: usize,
3409}
3410
3411impl<'a> SetupCallFinder<'a> {
3412    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3413        for stmt in statements {
3414            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3415                return None;
3416            }
3417            self.nodes_visited += 1;
3418
3419            let found = match stmt {
3420                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3421                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3422                ast::Stmt::If(ast::StmtIf {
3423                    body,
3424                    elif_else_clauses,
3425                    ..
3426                }) => self.find_in_statements(body).or_else(|| {
3427                    for clause in elif_else_clauses {
3428                        if let Some(found) = self.find_in_statements(&clause.body) {
3429                            return Some(found);
3430                        }
3431                    }
3432                    None
3433                }),
3434                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3435                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3436                    .find_in_statements(body)
3437                    .or_else(|| self.find_in_statements(orelse)),
3438                ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3439                    .called_function_names
3440                    .contains(name.as_str())
3441                    .then(|| self.find_in_statements(body))
3442                    .flatten(),
3443                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3444                ast::Stmt::Try(ast::StmtTry {
3445                    body,
3446                    orelse,
3447                    finalbody,
3448                    handlers,
3449                    ..
3450                }) => self
3451                    .find_in_statements(body)
3452                    .or_else(|| self.find_in_statements(orelse))
3453                    .or_else(|| self.find_in_statements(finalbody))
3454                    .or_else(|| {
3455                        for handler in handlers {
3456                            let ast::ExceptHandler::ExceptHandler(
3457                                ast::ExceptHandlerExceptHandler { body, .. },
3458                            ) = handler;
3459                            if let Some(found) = self.find_in_statements(body) {
3460                                return Some(found);
3461                            }
3462                        }
3463                        None
3464                    }),
3465                _ => None,
3466            };
3467
3468            if found.is_some() {
3469                return found;
3470            }
3471        }
3472
3473        None
3474    }
3475
3476    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3477        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3478            return None;
3479        }
3480        self.nodes_visited += 1;
3481
3482        match expr {
3483            ast::Expr::Call(ast::ExprCall { func, .. })
3484                if is_setup_call(func.as_ref(), self.aliases) =>
3485            {
3486                Some(expr)
3487            }
3488            _ => None,
3489        }
3490    }
3491}
3492
3493fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3494    let Some(dotted) = dotted_name(func, 0) else {
3495        return false;
3496    };
3497
3498    if aliases.setup_names.contains(&dotted) {
3499        return true;
3500    }
3501
3502    let Some(module) = dotted.strip_suffix(".setup") else {
3503        return false;
3504    };
3505
3506    let resolved = resolve_module_alias(module, aliases);
3507    is_setup_module(&resolved)
3508}
3509
3510fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3511    if depth >= MAX_SETUP_PY_AST_DEPTH {
3512        return None;
3513    }
3514
3515    match expr {
3516        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3517        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3518            let base = dotted_name(value.as_ref(), depth + 1)?;
3519            Some(format!("{}.{}", base, attr.as_str()))
3520        }
3521        _ => None,
3522    }
3523}
3524
3525fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3526    if let Some(mapped) = aliases.module_aliases.get(module) {
3527        return mapped.clone();
3528    }
3529
3530    let Some((base, rest)) = module.split_once('.') else {
3531        return module.to_string();
3532    };
3533
3534    if let Some(mapped) = aliases.module_aliases.get(base) {
3535        return format!("{}.{}", mapped, rest);
3536    }
3537
3538    module.to_string()
3539}
3540
3541fn extract_setup_keywords(
3542    call_expr: &ast::Expr,
3543    evaluator: &mut LiteralEvaluator,
3544) -> HashMap<String, Value> {
3545    let mut values = HashMap::new();
3546    let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3547        return values;
3548    };
3549
3550    for keyword in arguments.keywords.iter() {
3551        if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3552            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3553                values.insert(arg.to_string(), value);
3554            }
3555        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3556            for (key, value) in dict {
3557                values.insert(key, value);
3558            }
3559        }
3560    }
3561
3562    values
3563}
3564
3565fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3566    let name = get_value_string(values, "name");
3567    let version = get_value_string(values, "version");
3568    let description =
3569        get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
3570    let homepage_url =
3571        get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
3572    let author = get_value_string(values, "author");
3573    let author_email = get_value_string(values, "author_email");
3574    let maintainer = get_value_string(values, "maintainer");
3575    let maintainer_email = get_value_string(values, "maintainer_email");
3576    let license = get_value_string(values, "license");
3577    let classifiers = values
3578        .get("classifiers")
3579        .and_then(value_to_string_list)
3580        .unwrap_or_default();
3581
3582    let mut parties = Vec::new();
3583    if author.is_some() || author_email.is_some() {
3584        parties.push(Party {
3585            r#type: Some("person".to_string()),
3586            role: Some("author".to_string()),
3587            name: author,
3588            email: author_email,
3589            url: None,
3590            organization: None,
3591            organization_url: None,
3592            timezone: None,
3593        });
3594    }
3595
3596    if maintainer.is_some() || maintainer_email.is_some() {
3597        parties.push(Party {
3598            r#type: Some("person".to_string()),
3599            role: Some("maintainer".to_string()),
3600            name: maintainer,
3601            email: maintainer_email,
3602            url: None,
3603            organization: None,
3604            organization_url: None,
3605            timezone: None,
3606        });
3607    }
3608
3609    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3610        normalize_spdx_declared_license(license.as_deref());
3611    let extracted_license_statement = license.clone();
3612
3613    let dependencies = build_setup_py_dependencies(values);
3614    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3615    let mut homepage_from_project_urls = None;
3616    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3617    let mut extra_data = HashMap::new();
3618
3619    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3620        apply_project_url_mappings(
3621            &parsed_project_urls,
3622            &mut homepage_from_project_urls,
3623            &mut bug_tracking_url,
3624            &mut code_view_url,
3625            &mut vcs_url,
3626            &mut extra_data,
3627        );
3628    }
3629
3630    let extra_data = if extra_data.is_empty() {
3631        None
3632    } else {
3633        Some(extra_data)
3634    };
3635
3636    PackageData {
3637        package_type: Some(PythonParser::PACKAGE_TYPE),
3638        namespace: None,
3639        name,
3640        version,
3641        qualifiers: None,
3642        subpath: None,
3643        primary_language: Some("Python".to_string()),
3644        description,
3645        release_date: None,
3646        parties,
3647        keywords: Vec::new(),
3648        homepage_url: homepage_url.or(homepage_from_project_urls),
3649        download_url: None,
3650        size: None,
3651        sha1: None,
3652        md5: None,
3653        sha256: None,
3654        sha512: None,
3655        bug_tracking_url,
3656        code_view_url,
3657        vcs_url,
3658        copyright: None,
3659        holder: None,
3660        declared_license_expression,
3661        declared_license_expression_spdx,
3662        license_detections,
3663        other_license_expression: None,
3664        other_license_expression_spdx: None,
3665        other_license_detections: Vec::new(),
3666        extracted_license_statement,
3667        notice_text: None,
3668        source_packages: Vec::new(),
3669        file_references: Vec::new(),
3670        is_private: has_private_classifier(&classifiers),
3671        is_virtual: false,
3672        extra_data,
3673        dependencies,
3674        repository_homepage_url: None,
3675        repository_download_url: None,
3676        api_data_url: None,
3677        datasource_id: Some(DatasourceId::PypiSetupPy),
3678        purl,
3679    }
3680}
3681
3682fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3683    let mut dependencies = Vec::new();
3684
3685    if let Some(reqs) = values
3686        .get("install_requires")
3687        .and_then(value_to_string_list)
3688    {
3689        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3690    }
3691
3692    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3693        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3694    }
3695
3696    if let Some(Value::Dict(extras)) = values.get("extras_require") {
3697        let mut extra_items: Vec<_> = extras.iter().collect();
3698        extra_items.sort_by_key(|(name, _)| *name);
3699        for (extra_name, extra_value) in extra_items {
3700            if let Some(reqs) = value_to_string_list(extra_value) {
3701                dependencies.extend(build_setup_py_dependency_list(
3702                    reqs.as_slice(),
3703                    extra_name,
3704                    true,
3705                ));
3706            }
3707        }
3708    }
3709
3710    dependencies
3711}
3712
3713fn build_setup_py_dependency_list(
3714    reqs: &[String],
3715    scope: &str,
3716    is_optional: bool,
3717) -> Vec<Dependency> {
3718    reqs.iter()
3719        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3720        .collect()
3721}
3722
3723fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3724    values.get(key).and_then(value_to_string)
3725}
3726
3727fn value_to_string(value: &Value) -> Option<String> {
3728    match value {
3729        Value::String(value) => Some(value.clone()),
3730        Value::Number(value) => Some(value.to_string()),
3731        Value::Bool(value) => Some(value.to_string()),
3732        _ => None,
3733    }
3734}
3735
3736fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3737    match value {
3738        Value::String(value) => Some(vec![value.clone()]),
3739        Value::List(values) | Value::Tuple(values) => {
3740            let mut items = Vec::new();
3741            for item in values {
3742                items.push(value_to_string(item)?);
3743            }
3744            Some(items)
3745        }
3746        _ => None,
3747    }
3748}
3749
3750fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3751    let Value::Dict(dict) = value else {
3752        return None;
3753    };
3754
3755    let mut pairs: Vec<(String, String)> = dict
3756        .iter()
3757        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3758        .collect::<Option<Vec<_>>>()?;
3759    pairs.sort_by(|left, right| left.0.cmp(&right.0));
3760    Some(pairs)
3761}
3762
3763fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3764    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3765    extract_requires_dist_dependencies(&requires_dist)
3766}
3767
3768pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3769    requires_dist
3770        .iter()
3771        .filter_map(|entry| build_rfc822_dependency(entry))
3772        .collect()
3773}
3774
3775fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3776    build_python_dependency(entry, "install", false, None)
3777}
3778
3779fn build_python_dependency(
3780    entry: &str,
3781    default_scope: &str,
3782    default_optional: bool,
3783    marker_override: Option<&str>,
3784) -> Option<Dependency> {
3785    let (requirement_part, marker_part) = entry
3786        .split_once(';')
3787        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3788        .unwrap_or((entry.trim(), None));
3789
3790    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3791    let requirement = normalize_rfc822_requirement(requirement_part);
3792    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3793        marker_part.or(marker_override),
3794        default_scope,
3795        default_optional,
3796    );
3797    let purl = build_python_dependency_purl(&name, None)?;
3798
3799    let is_pinned = requirement
3800        .as_deref()
3801        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3802    let purl = if is_pinned {
3803        requirement
3804            .as_deref()
3805            .map(|req| req.trim_start_matches('='))
3806            .and_then(|version| build_python_dependency_purl(&name, Some(version)))
3807            .unwrap_or(purl)
3808    } else {
3809        purl
3810    };
3811
3812    let mut extra_data = HashMap::new();
3813    extra_data.extend(marker_data);
3814    if let Some(marker) = marker {
3815        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3816    }
3817
3818    Some(Dependency {
3819        purl: Some(purl),
3820        extracted_requirement: requirement,
3821        scope: Some(scope),
3822        is_runtime: Some(true),
3823        is_optional: Some(is_optional),
3824        is_pinned: Some(is_pinned),
3825        is_direct: Some(true),
3826        resolved_package: None,
3827        extra_data: if extra_data.is_empty() {
3828            None
3829        } else {
3830            Some(extra_data)
3831        },
3832    })
3833}
3834
3835fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3836    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3837    let trimmed = requirement_part.trim();
3838    let mut remainder = trimmed[name.len()..].trim();
3839
3840    if let Some(stripped) = remainder.strip_prefix('[')
3841        && let Some(end_idx) = stripped.find(']')
3842    {
3843        remainder = stripped[end_idx + 1..].trim();
3844    }
3845
3846    let remainder = remainder
3847        .strip_prefix('(')
3848        .and_then(|value| value.strip_suffix(')'))
3849        .unwrap_or(remainder)
3850        .trim();
3851
3852    if remainder.is_empty() {
3853        return None;
3854    }
3855
3856    let mut specifiers: Vec<String> = remainder
3857        .split(',')
3858        .map(|specifier| specifier.trim().replace(' ', ""))
3859        .filter(|specifier| !specifier.is_empty())
3860        .collect();
3861    specifiers.sort();
3862    Some(specifiers.join(","))
3863}
3864
/// Percent-encodes every `*` in `version` as `%2A`; all other characters are
/// copied through unchanged.
fn encode_python_dependency_purl_version(version: &str) -> String {
    let mut encoded = String::with_capacity(version.len());
    for ch in version.chars() {
        if ch == '*' {
            encoded.push_str("%2A");
        } else {
            encoded.push(ch);
        }
    }
    encoded
}
3868
3869fn build_python_dependency_purl(name: &str, version: Option<&str>) -> Option<String> {
3870    let normalized_name = normalize_python_dependency_name(name);
3871
3872    PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &normalized_name)
3873        .ok()
3874        .map(|_| match version {
3875            Some(version) => {
3876                format!(
3877                    "pkg:pypi/{normalized_name}@{}",
3878                    encode_python_dependency_purl_version(version)
3879                )
3880            }
3881            None => format!("pkg:pypi/{normalized_name}"),
3882        })
3883}
3884
/// Normalizes a dependency name: surrounding whitespace is removed, ASCII
/// letters are lowercased and every `_` becomes `-`.
fn normalize_python_dependency_name(name: &str) -> String {
    name.trim()
        .chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
3888
3889fn parse_rfc822_marker(
3890    marker_part: Option<&str>,
3891    default_scope: &str,
3892    default_optional: bool,
3893) -> (
3894    String,
3895    bool,
3896    Option<String>,
3897    HashMap<String, serde_json::Value>,
3898) {
3899    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
3900        return (
3901            default_scope.to_string(),
3902            default_optional,
3903            None,
3904            HashMap::new(),
3905        );
3906    };
3907
3908    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
3909        .expect("extra marker regex should compile");
3910    let mut extra_data = HashMap::new();
3911
3912    if let Some(python_version) = extract_marker_field(marker, "python_version") {
3913        extra_data.insert(
3914            "python_version".to_string(),
3915            serde_json::Value::String(python_version),
3916        );
3917    }
3918    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
3919        extra_data.insert(
3920            "sys_platform".to_string(),
3921            serde_json::Value::String(sys_platform),
3922        );
3923    }
3924
3925    if let Some(captures) = extra_re.captures(marker)
3926        && let Some(scope) = captures.get(1)
3927    {
3928        return (
3929            scope.as_str().to_string(),
3930            true,
3931            Some(marker.trim().to_string()),
3932            extra_data,
3933        );
3934    }
3935
3936    (
3937        default_scope.to_string(),
3938        default_optional,
3939        Some(marker.trim().to_string()),
3940        extra_data,
3941    )
3942}
3943
3944fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
3945    let re = Regex::new(&format!(
3946        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
3947        field
3948    ))
3949    .ok()?;
3950    let captures = re.captures(marker)?;
3951    let operator = captures.get(1)?.as_str();
3952    let value = captures.get(2)?.as_str();
3953    Some(format!("{} {}", operator, value))
3954}
3955
3956fn parse_requires_txt(content: &str) -> Vec<Dependency> {
3957    let mut dependencies = Vec::new();
3958    let mut current_scope = "install".to_string();
3959    let mut current_optional = false;
3960    let mut current_marker: Option<String> = None;
3961
3962    for line in content.lines() {
3963        let trimmed = line.trim();
3964        if trimmed.is_empty() || trimmed.starts_with('#') {
3965            continue;
3966        }
3967
3968        if trimmed.starts_with('[') && trimmed.ends_with(']') {
3969            let inner = &trimmed[1..trimmed.len() - 1];
3970            if let Some(rest) = inner.strip_prefix(':') {
3971                current_scope = "install".to_string();
3972                current_optional = false;
3973                current_marker = Some(rest.trim().to_string());
3974            } else if let Some((scope, marker)) = inner.split_once(':') {
3975                current_scope = scope.trim().to_string();
3976                current_optional = true;
3977                current_marker = Some(marker.trim().to_string());
3978            } else {
3979                current_scope = inner.trim().to_string();
3980                current_optional = true;
3981                current_marker = None;
3982            }
3983            continue;
3984        }
3985
3986        if let Some(dependency) = build_python_dependency(
3987            trimmed,
3988            &current_scope,
3989            current_optional,
3990            current_marker.as_deref(),
3991        ) {
3992            dependencies.push(dependency);
3993        }
3994    }
3995
3996    dependencies
3997}
3998
/// Reports whether any classifier equals `Private :: Do Not Upload`, compared
/// without regard to ASCII case.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
4004
4005fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
4006    let name = name?;
4007    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
4008    if let Some(version) = version {
4009        package_url.with_version(version).ok()?;
4010    }
4011    Some(package_url.to_string())
4012}
4013
4014fn extract_from_setup_py_regex(content: &str) -> PackageData {
4015    let name = extract_setup_value(content, "name");
4016    let version = extract_setup_value(content, "version");
4017    let license_expression = extract_setup_value(content, "license");
4018
4019    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4020        normalize_spdx_declared_license(license_expression.as_deref());
4021    let extracted_license_statement = license_expression.clone();
4022
4023    let dependencies = extract_setup_py_dependencies(content);
4024    let homepage_url = extract_setup_value(content, "url");
4025    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
4026
4027    PackageData {
4028        package_type: Some(PythonParser::PACKAGE_TYPE),
4029        namespace: None,
4030        name,
4031        version,
4032        qualifiers: None,
4033        subpath: None,
4034        primary_language: Some("Python".to_string()),
4035        description: None,
4036        release_date: None,
4037        parties: Vec::new(),
4038        keywords: Vec::new(),
4039        homepage_url,
4040        download_url: None,
4041        size: None,
4042        sha1: None,
4043        md5: None,
4044        sha256: None,
4045        sha512: None,
4046        bug_tracking_url: None,
4047        code_view_url: None,
4048        vcs_url: None,
4049        copyright: None,
4050        holder: None,
4051        declared_license_expression,
4052        declared_license_expression_spdx,
4053        license_detections,
4054        other_license_expression: None,
4055        other_license_expression_spdx: None,
4056        other_license_detections: Vec::new(),
4057        extracted_license_statement,
4058        notice_text: None,
4059        source_packages: Vec::new(),
4060        file_references: Vec::new(),
4061        is_private: false,
4062        is_virtual: false,
4063        extra_data: None,
4064        dependencies,
4065        repository_homepage_url: None,
4066        repository_download_url: None,
4067        api_data_url: None,
4068        datasource_id: Some(DatasourceId::PypiSetupPy),
4069        purl,
4070    }
4071}
4072
4073fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
4074    crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
4075}
4076
4077fn extract_from_pypi_json(path: &Path) -> PackageData {
4078    let default = PackageData {
4079        package_type: Some(PythonParser::PACKAGE_TYPE),
4080        datasource_id: Some(DatasourceId::PypiJson),
4081        ..Default::default()
4082    };
4083
4084    let content = match read_file_to_string(path) {
4085        Ok(content) => content,
4086        Err(error) => {
4087            warn!("Failed to read pypi.json at {:?}: {}", path, error);
4088            return default;
4089        }
4090    };
4091
4092    let root: serde_json::Value = match serde_json::from_str(&content) {
4093        Ok(value) => value,
4094        Err(error) => {
4095            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
4096            return default;
4097        }
4098    };
4099
4100    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
4101        warn!("No info object found in pypi.json at {:?}", path);
4102        return default;
4103    };
4104
4105    let name = info
4106        .get("name")
4107        .and_then(|value| value.as_str())
4108        .map(ToOwned::to_owned);
4109    let version = info
4110        .get("version")
4111        .and_then(|value| value.as_str())
4112        .map(ToOwned::to_owned);
4113    let summary = info
4114        .get("summary")
4115        .and_then(|value| value.as_str())
4116        .map(ToOwned::to_owned);
4117    let description = info
4118        .get("description")
4119        .and_then(|value| value.as_str())
4120        .filter(|value| !value.trim().is_empty())
4121        .map(ToOwned::to_owned)
4122        .or(summary);
4123    let mut homepage_url = info
4124        .get("home_page")
4125        .and_then(|value| value.as_str())
4126        .map(ToOwned::to_owned);
4127    let author = info
4128        .get("author")
4129        .and_then(|value| value.as_str())
4130        .filter(|value| !value.trim().is_empty())
4131        .map(ToOwned::to_owned);
4132    let author_email = info
4133        .get("author_email")
4134        .and_then(|value| value.as_str())
4135        .filter(|value| !value.trim().is_empty())
4136        .map(ToOwned::to_owned);
4137    let license = info
4138        .get("license")
4139        .and_then(|value| value.as_str())
4140        .filter(|value| !value.trim().is_empty())
4141        .map(ToOwned::to_owned);
4142    let keywords = parse_setup_cfg_keywords(
4143        info.get("keywords")
4144            .and_then(|value| value.as_str())
4145            .map(ToOwned::to_owned),
4146    );
4147    let classifiers = info
4148        .get("classifiers")
4149        .and_then(|value| value.as_array())
4150        .map(|values| {
4151            values
4152                .iter()
4153                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
4154                .collect::<Vec<_>>()
4155        })
4156        .unwrap_or_default();
4157
4158    let mut parties = Vec::new();
4159    if author.is_some() || author_email.is_some() {
4160        parties.push(Party {
4161            r#type: Some("person".to_string()),
4162            role: Some("author".to_string()),
4163            name: author,
4164            email: author_email,
4165            url: None,
4166            organization: None,
4167            organization_url: None,
4168            timezone: None,
4169        });
4170    }
4171
4172    let mut bug_tracking_url = None;
4173    let mut code_view_url = None;
4174    let mut vcs_url = None;
4175    let mut extra_data = HashMap::new();
4176
4177    let parsed_project_urls = info
4178        .get("project_urls")
4179        .and_then(|value| value.as_object())
4180        .map(|map| {
4181            let mut pairs: Vec<(String, String)> = map
4182                .iter()
4183                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
4184                .collect();
4185            pairs.sort_by(|left, right| left.0.cmp(&right.0));
4186            pairs
4187        })
4188        .unwrap_or_default();
4189
4190    apply_project_url_mappings(
4191        &parsed_project_urls,
4192        &mut homepage_url,
4193        &mut bug_tracking_url,
4194        &mut code_view_url,
4195        &mut vcs_url,
4196        &mut extra_data,
4197    );
4198
4199    let (download_url, size, sha256) = root
4200        .get("urls")
4201        .and_then(|value| value.as_array())
4202        .map(|urls| select_pypi_json_artifact(urls))
4203        .unwrap_or((None, None, None));
4204
4205    let sha256 = sha256.and_then(|h| Sha256Digest::from_hex(&h).ok());
4206
4207    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4208        normalize_spdx_declared_license(license.as_deref());
4209    let dependencies = info
4210        .get("requires_dist")
4211        .and_then(|value| value.as_array())
4212        .map(|entries| {
4213            entries
4214                .iter()
4215                .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4216                .collect::<Vec<_>>()
4217        })
4218        .map(|entries| extract_requires_dist_dependencies(&entries))
4219        .unwrap_or_default();
4220
4221    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
4222        build_pypi_urls(name.as_deref(), version.as_deref());
4223
4224    PackageData {
4225        package_type: Some(PythonParser::PACKAGE_TYPE),
4226        namespace: None,
4227        name,
4228        version,
4229        qualifiers: None,
4230        subpath: None,
4231        primary_language: None,
4232        description,
4233        release_date: None,
4234        parties,
4235        keywords,
4236        homepage_url: homepage_url.or(repository_homepage_url.clone()),
4237        download_url,
4238        size,
4239        sha1: None,
4240        md5: None,
4241        sha256,
4242        sha512: None,
4243        bug_tracking_url,
4244        code_view_url,
4245        vcs_url,
4246        copyright: None,
4247        holder: None,
4248        declared_license_expression,
4249        declared_license_expression_spdx,
4250        license_detections,
4251        other_license_expression: None,
4252        other_license_expression_spdx: None,
4253        other_license_detections: Vec::new(),
4254        extracted_license_statement: license,
4255        notice_text: None,
4256        source_packages: Vec::new(),
4257        file_references: Vec::new(),
4258        is_private: has_private_classifier(&classifiers),
4259        is_virtual: false,
4260        extra_data: if extra_data.is_empty() {
4261            None
4262        } else {
4263            Some(extra_data)
4264        },
4265        dependencies,
4266        repository_homepage_url,
4267        repository_download_url,
4268        api_data_url,
4269        datasource_id: Some(DatasourceId::PypiJson),
4270        purl,
4271    }
4272}
4273
4274fn select_pypi_json_artifact(
4275    urls: &[serde_json::Value],
4276) -> (Option<String>, Option<u64>, Option<String>) {
4277    let selected = urls
4278        .iter()
4279        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4280        .or_else(|| urls.first());
4281
4282    let Some(entry) = selected else {
4283        return (None, None, None);
4284    };
4285
4286    let download_url = entry
4287        .get("url")
4288        .and_then(|value| value.as_str())
4289        .map(ToOwned::to_owned);
4290    let size = entry.get("size").and_then(|value| value.as_u64());
4291    let sha256 = entry
4292        .get("digests")
4293        .and_then(|value| value.as_object())
4294        .and_then(|digests| digests.get("sha256"))
4295        .and_then(|value| value.as_str())
4296        .map(ToOwned::to_owned);
4297
4298    (download_url, size, sha256)
4299}
4300
4301fn extract_from_pip_inspect(path: &Path) -> PackageData {
4302    let content = match read_file_to_string(path) {
4303        Ok(content) => content,
4304        Err(e) => {
4305            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4306            return default_package_data(path);
4307        }
4308    };
4309
4310    let root: serde_json::Value = match serde_json::from_str(&content) {
4311        Ok(value) => value,
4312        Err(e) => {
4313            warn!(
4314                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4315                path, e
4316            );
4317            return default_package_data(path);
4318        }
4319    };
4320
4321    let installed = match root.get("installed").and_then(|v| v.as_array()) {
4322        Some(arr) => arr,
4323        None => {
4324            warn!(
4325                "No 'installed' array found in pip-inspect.deplock at {:?}",
4326                path
4327            );
4328            return default_package_data(path);
4329        }
4330    };
4331
4332    let pip_version = root
4333        .get("pip_version")
4334        .and_then(|v| v.as_str())
4335        .map(String::from);
4336    let inspect_version = root
4337        .get("version")
4338        .and_then(|v| v.as_str())
4339        .map(String::from);
4340
4341    let mut main_package: Option<PackageData> = None;
4342    let mut dependencies: Vec<Dependency> = Vec::new();
4343
4344    for package_entry in installed {
4345        let metadata = match package_entry.get("metadata") {
4346            Some(m) => m,
4347            None => continue,
4348        };
4349
4350        let is_requested = package_entry
4351            .get("requested")
4352            .and_then(|v| v.as_bool())
4353            .unwrap_or(false);
4354        let has_direct_url = package_entry.get("direct_url").is_some();
4355
4356        let name = metadata
4357            .get("name")
4358            .and_then(|v| v.as_str())
4359            .map(String::from);
4360        let version = metadata
4361            .get("version")
4362            .and_then(|v| v.as_str())
4363            .map(String::from);
4364        let summary = metadata
4365            .get("summary")
4366            .and_then(|v| v.as_str())
4367            .map(String::from);
4368        let home_page = metadata
4369            .get("home_page")
4370            .and_then(|v| v.as_str())
4371            .map(String::from);
4372        let author = metadata
4373            .get("author")
4374            .and_then(|v| v.as_str())
4375            .map(String::from);
4376        let author_email = metadata
4377            .get("author_email")
4378            .and_then(|v| v.as_str())
4379            .map(String::from);
4380        let license = metadata
4381            .get("license")
4382            .and_then(|v| v.as_str())
4383            .map(String::from);
4384        let description = metadata
4385            .get("description")
4386            .and_then(|v| v.as_str())
4387            .map(String::from);
4388        let keywords = metadata
4389            .get("keywords")
4390            .and_then(|v| v.as_array())
4391            .map(|arr| {
4392                arr.iter()
4393                    .filter_map(|k| k.as_str().map(String::from))
4394                    .collect::<Vec<_>>()
4395            })
4396            .unwrap_or_default();
4397
4398        let mut parties = Vec::new();
4399        if author.is_some() || author_email.is_some() {
4400            parties.push(Party {
4401                r#type: Some("person".to_string()),
4402                role: Some("author".to_string()),
4403                name: author,
4404                email: author_email,
4405                url: None,
4406                organization: None,
4407                organization_url: None,
4408                timezone: None,
4409            });
4410        }
4411
4412        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4413            normalize_spdx_declared_license(license.as_deref());
4414        let extracted_license_statement = license.clone();
4415        let requires_dist = metadata
4416            .get("requires_dist")
4417            .and_then(|v| v.as_array())
4418            .map(|entries| {
4419                entries
4420                    .iter()
4421                    .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4422                    .collect::<Vec<_>>()
4423            })
4424            .unwrap_or_default();
4425        let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4426
4427        let purl = name.as_ref().and_then(|n| {
4428            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4429            if let Some(v) = &version {
4430                package_url.with_version(v).ok()?;
4431            }
4432            Some(package_url.to_string())
4433        });
4434
4435        if is_requested && has_direct_url {
4436            let mut extra_data = HashMap::new();
4437            if let Some(pv) = &pip_version {
4438                extra_data.insert(
4439                    "pip_version".to_string(),
4440                    serde_json::Value::String(pv.clone()),
4441                );
4442            }
4443            if let Some(iv) = &inspect_version {
4444                extra_data.insert(
4445                    "inspect_version".to_string(),
4446                    serde_json::Value::String(iv.clone()),
4447                );
4448            }
4449
4450            main_package = Some(PackageData {
4451                package_type: Some(PythonParser::PACKAGE_TYPE),
4452                namespace: None,
4453                name,
4454                version,
4455                qualifiers: None,
4456                subpath: None,
4457                primary_language: Some("Python".to_string()),
4458                description: description.or(summary),
4459                release_date: None,
4460                parties,
4461                keywords,
4462                homepage_url: home_page,
4463                download_url: None,
4464                size: None,
4465                sha1: None,
4466                md5: None,
4467                sha256: None,
4468                sha512: None,
4469                bug_tracking_url: None,
4470                code_view_url: None,
4471                vcs_url: None,
4472                copyright: None,
4473                holder: None,
4474                declared_license_expression,
4475                declared_license_expression_spdx,
4476                license_detections,
4477                other_license_expression: None,
4478                other_license_expression_spdx: None,
4479                other_license_detections: Vec::new(),
4480                extracted_license_statement,
4481                notice_text: None,
4482                source_packages: Vec::new(),
4483                file_references: Vec::new(),
4484                is_private: false,
4485                is_virtual: true,
4486                extra_data: if extra_data.is_empty() {
4487                    None
4488                } else {
4489                    Some(extra_data)
4490                },
4491                dependencies: parsed_dependencies,
4492                repository_homepage_url: None,
4493                repository_download_url: None,
4494                api_data_url: None,
4495                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4496                purl,
4497            });
4498        } else {
4499            let resolved_package = PackageData {
4500                package_type: Some(PythonParser::PACKAGE_TYPE),
4501                namespace: None,
4502                name: name.clone(),
4503                version: version.clone(),
4504                qualifiers: None,
4505                subpath: None,
4506                primary_language: Some("Python".to_string()),
4507                description: description.or(summary),
4508                release_date: None,
4509                parties,
4510                keywords,
4511                homepage_url: home_page,
4512                download_url: None,
4513                size: None,
4514                sha1: None,
4515                md5: None,
4516                sha256: None,
4517                sha512: None,
4518                bug_tracking_url: None,
4519                code_view_url: None,
4520                vcs_url: None,
4521                copyright: None,
4522                holder: None,
4523                declared_license_expression,
4524                declared_license_expression_spdx,
4525                license_detections,
4526                other_license_expression: None,
4527                other_license_expression_spdx: None,
4528                other_license_detections: Vec::new(),
4529                extracted_license_statement,
4530                notice_text: None,
4531                source_packages: Vec::new(),
4532                file_references: Vec::new(),
4533                is_private: false,
4534                is_virtual: true,
4535                extra_data: None,
4536                dependencies: parsed_dependencies,
4537                repository_homepage_url: None,
4538                repository_download_url: None,
4539                api_data_url: None,
4540                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4541                purl: purl.clone(),
4542            };
4543
4544            let resolved = package_data_to_resolved(&resolved_package);
4545            dependencies.push(Dependency {
4546                purl,
4547                extracted_requirement: None,
4548                scope: None,
4549                is_runtime: Some(true),
4550                is_optional: Some(false),
4551                is_pinned: Some(true),
4552                is_direct: Some(is_requested),
4553                resolved_package: Some(Box::new(resolved)),
4554                extra_data: None,
4555            });
4556        }
4557    }
4558
4559    if let Some(mut main_pkg) = main_package {
4560        let direct_requirement_purls: HashSet<String> = main_pkg
4561            .dependencies
4562            .iter()
4563            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4564            .collect();
4565
4566        let resolved_requirement_purls: HashSet<String> = dependencies
4567            .iter()
4568            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4569            .collect();
4570
4571        let unresolved_dependencies = main_pkg
4572            .dependencies
4573            .iter()
4574            .filter(|dep| {
4575                dep.purl.as_ref().is_some_and(|purl| {
4576                    !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4577                })
4578            })
4579            .cloned()
4580            .collect::<Vec<_>>();
4581
4582        for dependency in &mut dependencies {
4583            if dependency
4584                .purl
4585                .as_ref()
4586                .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4587            {
4588                dependency.is_direct = Some(true);
4589            }
4590        }
4591
4592        main_pkg.dependencies = dependencies;
4593        main_pkg.dependencies.extend(unresolved_dependencies);
4594        main_pkg
4595    } else {
4596        default_package_data(path)
4597    }
4598}
4599
/// Returns the part of `purl` that precedes its first `@`.
///
/// When the string contains no `@`, the whole input is returned unchanged.
fn base_dependency_purl(purl: &str) -> String {
    let base = match purl.find('@') {
        Some(at) => &purl[..at],
        None => purl,
    };
    base.to_owned()
}
4605
/// Parsed INI content as produced by `parse_setup_cfg`:
/// section name -> option name -> the value lines collected for that option.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4607
4608fn extract_from_setup_cfg(path: &Path) -> PackageData {
4609    let content = match read_file_to_string(path) {
4610        Ok(content) => content,
4611        Err(e) => {
4612            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4613            return default_package_data(path);
4614        }
4615    };
4616
4617    let sections = parse_setup_cfg(&content);
4618    let name = get_ini_value(&sections, "metadata", "name");
4619    let version = get_ini_value(&sections, "metadata", "version");
4620    let description = get_ini_value(&sections, "metadata", "description");
4621    let author = get_ini_value(&sections, "metadata", "author");
4622    let author_email = get_ini_value(&sections, "metadata", "author_email");
4623    let maintainer = get_ini_value(&sections, "metadata", "maintainer");
4624    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
4625    let license = get_ini_value(&sections, "metadata", "license");
4626    let mut homepage_url = get_ini_value(&sections, "metadata", "url");
4627    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
4628    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
4629    let python_requires = get_ini_value(&sections, "options", "python_requires");
4630    let parsed_project_urls =
4631        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
4632    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4633    let mut extra_data = HashMap::new();
4634
4635    let mut parties = Vec::new();
4636    if author.is_some() || author_email.is_some() {
4637        parties.push(Party {
4638            r#type: Some("person".to_string()),
4639            role: Some("author".to_string()),
4640            name: author,
4641            email: author_email,
4642            url: None,
4643            organization: None,
4644            organization_url: None,
4645            timezone: None,
4646        });
4647    }
4648
4649    if maintainer.is_some() || maintainer_email.is_some() {
4650        parties.push(Party {
4651            r#type: Some("person".to_string()),
4652            role: Some("maintainer".to_string()),
4653            name: maintainer,
4654            email: maintainer_email,
4655            url: None,
4656            organization: None,
4657            organization_url: None,
4658            timezone: None,
4659        });
4660    }
4661
4662    let declared_license_expression = None;
4663    let declared_license_expression_spdx = None;
4664    let license_detections = Vec::new();
4665    let extracted_license_statement = license.clone();
4666
4667    let dependencies = extract_setup_cfg_dependencies(&sections);
4668
4669    if let Some(value) = python_requires {
4670        extra_data.insert(
4671            "python_requires".to_string(),
4672            serde_json::Value::String(value),
4673        );
4674    }
4675
4676    apply_project_url_mappings(
4677        &parsed_project_urls,
4678        &mut homepage_url,
4679        &mut bug_tracking_url,
4680        &mut code_view_url,
4681        &mut vcs_url,
4682        &mut extra_data,
4683    );
4684
4685    let extra_data = if extra_data.is_empty() {
4686        None
4687    } else {
4688        Some(extra_data)
4689    };
4690
4691    let purl = name.as_ref().and_then(|n| {
4692        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4693        if let Some(v) = &version {
4694            package_url.with_version(v).ok()?;
4695        }
4696        Some(package_url.to_string())
4697    });
4698
4699    PackageData {
4700        package_type: Some(PythonParser::PACKAGE_TYPE),
4701        namespace: None,
4702        name,
4703        version,
4704        qualifiers: None,
4705        subpath: None,
4706        primary_language: Some("Python".to_string()),
4707        description,
4708        release_date: None,
4709        parties,
4710        keywords,
4711        homepage_url,
4712        download_url: None,
4713        size: None,
4714        sha1: None,
4715        md5: None,
4716        sha256: None,
4717        sha512: None,
4718        bug_tracking_url,
4719        code_view_url,
4720        vcs_url,
4721        copyright: None,
4722        holder: None,
4723        declared_license_expression,
4724        declared_license_expression_spdx,
4725        license_detections,
4726        other_license_expression: None,
4727        other_license_expression_spdx: None,
4728        other_license_detections: Vec::new(),
4729        extracted_license_statement,
4730        notice_text: None,
4731        source_packages: Vec::new(),
4732        file_references: Vec::new(),
4733        is_private: has_private_classifier(&classifiers),
4734        is_virtual: false,
4735        extra_data,
4736        dependencies,
4737        repository_homepage_url: None,
4738        repository_download_url: None,
4739        api_data_url: None,
4740        datasource_id: Some(DatasourceId::PypiSetupCfg),
4741        purl,
4742    }
4743}
4744
/// Splits a comma-separated `keywords` value into trimmed, non-empty entries.
///
/// `None` yields an empty list.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut keywords = Vec::new();
    if let Some(raw) = value {
        for piece in raw.split(',') {
            let piece = piece.trim();
            if !piece.is_empty() {
                keywords.push(piece.to_string());
            }
        }
    }
    keywords
}
4757
/// Parses `label = url` entries into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both sides are trimmed. Entries
/// without `=`, or with an empty label or URL, are dropped.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::with_capacity(entries.len());
    for entry in entries {
        let Some(separator) = entry.find('=') else {
            continue;
        };
        let label = entry[..separator].trim();
        let url = entry[separator + 1..].trim();
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_owned(), url.to_owned()));
        }
    }
    pairs
}
4773
4774fn apply_project_url_mappings(
4775    parsed_urls: &[(String, String)],
4776    homepage_url: &mut Option<String>,
4777    bug_tracking_url: &mut Option<String>,
4778    code_view_url: &mut Option<String>,
4779    vcs_url: &mut Option<String>,
4780    extra_data: &mut HashMap<String, serde_json::Value>,
4781) {
4782    for (label, url) in parsed_urls {
4783        let label_lower = label.to_lowercase();
4784
4785        if bug_tracking_url.is_none()
4786            && matches!(
4787                label_lower.as_str(),
4788                "tracker"
4789                    | "bug reports"
4790                    | "bug tracker"
4791                    | "issues"
4792                    | "issue tracker"
4793                    | "github: issues"
4794            )
4795        {
4796            *bug_tracking_url = Some(url.clone());
4797        } else if code_view_url.is_none()
4798            && matches!(label_lower.as_str(), "source" | "source code" | "code")
4799        {
4800            *code_view_url = Some(url.clone());
4801        } else if vcs_url.is_none()
4802            && matches!(
4803                label_lower.as_str(),
4804                "github" | "gitlab" | "github: repo" | "repository"
4805            )
4806        {
4807            *vcs_url = Some(url.clone());
4808        } else if homepage_url.is_none()
4809            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4810        {
4811            *homepage_url = Some(url.clone());
4812        } else if label_lower == "changelog" {
4813            extra_data.insert(
4814                "changelog_url".to_string(),
4815                serde_json::Value::String(url.clone()),
4816            );
4817        }
4818    }
4819
4820    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4821        .iter()
4822        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4823        .collect();
4824
4825    if !project_urls_json.is_empty() {
4826        extra_data.insert(
4827            "project_urls".to_string(),
4828            serde_json::Value::Object(project_urls_json),
4829        );
4830    }
4831}
4832
4833fn parse_setup_cfg(content: &str) -> IniSections {
4834    let mut sections: IniSections = HashMap::new();
4835    let mut current_section: Option<String> = None;
4836    let mut current_key: Option<String> = None;
4837
4838    for raw_line in content.lines() {
4839        let line = raw_line.trim_end_matches('\r');
4840        let trimmed = line.trim();
4841        if trimmed.is_empty() {
4842            continue;
4843        }
4844
4845        let stripped = line.trim_start();
4846        if stripped.starts_with('#') || stripped.starts_with(';') {
4847            continue;
4848        }
4849
4850        if stripped.starts_with('[') && stripped.ends_with(']') {
4851            let section_name = stripped
4852                .trim_start_matches('[')
4853                .trim_end_matches(']')
4854                .trim()
4855                .to_ascii_lowercase();
4856            current_section = if section_name.is_empty() {
4857                None
4858            } else {
4859                Some(section_name)
4860            };
4861            current_key = None;
4862            continue;
4863        }
4864
4865        if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
4866            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
4867                let value = stripped.trim();
4868                if !value.is_empty() {
4869                    sections
4870                        .entry(section.clone())
4871                        .or_default()
4872                        .entry(key.clone())
4873                        .or_default()
4874                        .push(value.to_string());
4875                }
4876            }
4877            continue;
4878        }
4879
4880        if let Some((key, value)) = stripped.split_once('=')
4881            && let Some(section) = current_section.as_ref()
4882        {
4883            let key_name = key.trim().to_ascii_lowercase();
4884            let value_trimmed = value.trim();
4885            let entry = sections
4886                .entry(section.clone())
4887                .or_default()
4888                .entry(key_name.clone())
4889                .or_default();
4890            if !value_trimmed.is_empty() {
4891                entry.push(value_trimmed.to_string());
4892            }
4893            current_key = Some(key_name);
4894        }
4895    }
4896
4897    sections
4898}
4899
4900fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
4901    sections
4902        .get(&section.to_ascii_lowercase())
4903        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4904        .and_then(|entries| entries.first())
4905        .map(|value| value.trim().to_string())
4906}
4907
4908fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
4909    sections
4910        .get(&section.to_ascii_lowercase())
4911        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4912        .cloned()
4913        .unwrap_or_default()
4914}
4915
4916fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
4917    let mut dependencies = Vec::new();
4918
4919    for (sub_section, scope) in [
4920        ("install_requires", "install"),
4921        ("tests_require", "test"),
4922        ("setup_requires", "setup"),
4923    ] {
4924        let reqs = get_ini_values(sections, "options", sub_section);
4925        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
4926    }
4927
4928    if let Some(extras) = sections.get("options.extras_require") {
4929        let mut extra_items: Vec<_> = extras.iter().collect();
4930        extra_items.sort_by_key(|(name, _)| *name);
4931        for (extra_name, reqs) in extra_items {
4932            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
4933        }
4934    }
4935
4936    dependencies
4937}
4938
4939fn parse_setup_cfg_requirements(
4940    reqs: &[String],
4941    scope: &str,
4942    is_optional: bool,
4943) -> Vec<Dependency> {
4944    reqs.iter()
4945        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
4946        .collect()
4947}
4948
4949fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
4950    let trimmed = req.trim();
4951    if trimmed.is_empty() || trimmed.starts_with('#') {
4952        return None;
4953    }
4954
4955    let name = extract_setup_cfg_dependency_name(trimmed)?;
4956    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4957
4958    Some(Dependency {
4959        purl: Some(purl.to_string()),
4960        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
4961        scope: Some(scope.to_string()),
4962        is_runtime: Some(true),
4963        is_optional: Some(is_optional),
4964        is_pinned: Some(false),
4965        is_direct: Some(true),
4966        resolved_package: None,
4967        extra_data: None,
4968    })
4969}
4970
/// Extracts the distribution name from a PEP 508 style requirement string.
///
/// The name is the leading run of characters up to the first whitespace or
/// the first character that starts another part of the requirement: a version
/// operator (`<`, `>`, `=`, `!`, `~`), a parenthesised version list (`(`),
/// extras (`[`), a marker (`;`) or a direct-reference URL (`@`).
///
/// Returns `None` when the requirement is blank or starts with one of those
/// characters.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();
    let end = trimmed
        .find(|c: char| {
            c.is_whitespace()
                || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    // `trimmed` has no leading whitespace and the scan stops at the first
    // whitespace, so the slice needs no further trimming.
    let name = &trimmed[..end];
    (!name.is_empty()).then(|| name.to_string())
}
4987
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    req.split_whitespace().collect()
}
4991
/// Finds a quoted `key = "value"` / `key = 'value'` assignment in `content`
/// using plain text search and returns the text between the quotes.
///
/// Spellings are tried in a fixed order: double-quoted values first, then
/// single-quoted ones, each with the four spacing variants `key=`, `key =`,
/// `key= ` and `key = `. For each spelling the occurrences are scanned left
/// to right, and the first one that qualifies is returned.
///
/// Two conditions must hold for an occurrence to qualify:
/// - it is not the tail of a longer identifier, so looking up `url` does not
///   pick up `download_url="..."`;
/// - a closing quote of the same kind as the opening quote follows, so an
///   apostrophe inside a double-quoted value does not truncate it.
///
/// Returns `None` when no such assignment exists.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    const ASSIGNMENTS: [&str; 4] = ["=", " =", "= ", " = "];

    for quote in ['"', '\''] {
        for assignment in ASSIGNMENTS {
            let pattern = format!("{key}{assignment}{quote}");

            for (start_idx, _) in content.match_indices(pattern.as_str()) {
                let continues_identifier = content[..start_idx]
                    .chars()
                    .next_back()
                    .is_some_and(|c| c.is_alphanumeric() || c == '_');
                if continues_identifier {
                    continue;
                }

                let remaining = &content[start_idx + pattern.len()..];
                if let Some(end_idx) = remaining.find(quote) {
                    return Some(remaining[..end_idx].to_string());
                }
            }
        }
    }

    None
}
5017
5018fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
5019    let mut dependencies = Vec::new();
5020
5021    if let Some(tests_deps) = extract_tests_require(content) {
5022        dependencies.extend(tests_deps);
5023    }
5024
5025    if let Some(extras_deps) = extract_extras_require(content) {
5026        dependencies.extend(extras_deps);
5027    }
5028
5029    dependencies
5030}
5031
5032fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
5033    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
5034    let re = Regex::new(pattern).ok()?;
5035    let captures = re.captures(content)?;
5036    let deps_str = captures.get(1)?.as_str();
5037
5038    let deps = parse_setup_py_dep_list(deps_str, "test", true);
5039    if deps.is_empty() { None } else { Some(deps) }
5040}
5041
5042fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
5043    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
5044    let re = Regex::new(pattern).ok()?;
5045    let captures = re.captures(content)?;
5046    let dict_content = captures.get(1)?.as_str();
5047
5048    let mut all_deps = Vec::new();
5049
5050    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
5051    let entry_re = Regex::new(entry_pattern).ok()?;
5052
5053    for entry_cap in entry_re.captures_iter(dict_content) {
5054        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
5055            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
5056            all_deps.extend(deps);
5057        }
5058    }
5059
5060    if all_deps.is_empty() {
5061        None
5062    } else {
5063        Some(all_deps)
5064    }
5065}
5066
5067fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
5068    let dep_pattern = r#"['"]([^'"]+)['"]"#;
5069    let re = match Regex::new(dep_pattern) {
5070        Ok(r) => r,
5071        Err(_) => return Vec::new(),
5072    };
5073
5074    re.captures_iter(deps_str)
5075        .filter_map(|cap| {
5076            let dep_str = cap.get(1)?.as_str().trim();
5077            if dep_str.is_empty() {
5078                return None;
5079            }
5080
5081            let name = extract_setup_cfg_dependency_name(dep_str)?;
5082            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5083
5084            Some(Dependency {
5085                purl: Some(purl.to_string()),
5086                extracted_requirement: Some(dep_str.to_string()),
5087                scope: Some(scope.to_string()),
5088                is_runtime: Some(true),
5089                is_optional: Some(is_optional),
5090                is_pinned: Some(false),
5091                is_direct: Some(true),
5092                resolved_package: None,
5093                extra_data: None,
5094            })
5095        })
5096        .collect()
5097}
5098
5099/// Reads and parses a TOML file
5100pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
5101    let content = read_file_to_string(path).map_err(|e| e.to_string())?;
5102    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
5103}
5104
5105/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
5106///
5107/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
5108/// Essential for SBOM compliance and package integrity verification.
5109///
5110/// # Returns
5111///
5112/// - `(Some(size), Some(hash))` on success
5113/// - `(None, None)` if file cannot be opened
5114/// - `(Some(size), None)` if hash calculation fails during read
5115fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<Sha256Digest>) {
5116    let mut file = match File::open(path) {
5117        Ok(f) => f,
5118        Err(_) => return (None, None),
5119    };
5120
5121    let metadata = match file.metadata() {
5122        Ok(m) => m,
5123        Err(_) => return (None, None),
5124    };
5125    let size = metadata.len();
5126
5127    let mut hasher = Sha256::new();
5128    let mut buffer = vec![0; 8192];
5129
5130    loop {
5131        match file.read(&mut buffer) {
5132            Ok(0) => break,
5133            Ok(n) => hasher.update(&buffer[..n]),
5134            Err(_) => return (Some(size), None),
5135        }
5136    }
5137
5138    let hash = Sha256Digest::from_bytes(hasher.finalize().into());
5139    (Some(size), Some(hash))
5140}
5141
5142fn default_package_data(path: &Path) -> PackageData {
5143    PackageData {
5144        package_type: Some(PythonParser::PACKAGE_TYPE),
5145        primary_language: Some("Python".to_string()),
5146        datasource_id: infer_python_datasource_id(path),
5147        ..Default::default()
5148    }
5149}
5150
5151fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
5152    let file_name = path.file_name().and_then(|name| name.to_str());
5153
5154    match file_name {
5155        Some("pyproject.toml") => {
5156            if read_toml_file(path)
5157                .ok()
5158                .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
5159                .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
5160                .is_some()
5161            {
5162                Some(DatasourceId::PypiPoetryPyprojectToml)
5163            } else {
5164                Some(DatasourceId::PypiPyprojectToml)
5165            }
5166        }
5167        Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
5168            Some(DatasourceId::PypiSetupPy)
5169        }
5170        Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
5171        Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
5172        Some("METADATA") if is_installed_wheel_metadata_path(path) => {
5173            Some(DatasourceId::PypiWheelMetadata)
5174        }
5175        Some("pypi.json") => Some(DatasourceId::PypiJson),
5176        Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
5177        Some("origin.json") if is_pip_cache_origin_json(path) => {
5178            Some(DatasourceId::PypiPipOriginJson)
5179        }
5180        _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
5181            Some(DatasourceId::PypiSdist)
5182        }
5183        _ if path
5184            .extension()
5185            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
5186        {
5187            Some(DatasourceId::PypiWheel)
5188        }
5189        _ if path
5190            .extension()
5191            .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
5192        {
5193            Some(DatasourceId::PypiEgg)
5194        }
5195        _ => None,
5196    }
5197}
5198
// Registration of the Python parser through the crate-level `register_parser!` macro.
// NOTE(review): the macro definition is not visible here; judging by the values, the
// arguments are a human-readable description, the path glob patterns to match, the
// package type, the primary language and a documentation URL — confirm against the macro.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);