Skip to main content

provenant/parsers/
python.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
//! # Implementation Notes
//! - Each supported format is handled by its own extraction routine
//! - Every dependency declared in a manifest is reported as a direct dependency
//! - Read and parse failures log a warning and fall back to default package data
33
34use crate::models::{
35    DatasourceId, Dependency, FileReference, PackageData, PackageType, Party, Sha256Digest,
36};
37use crate::parser_warn as warn;
38use crate::parsers::utils::{
39    MAX_ITERATION_COUNT, read_file_to_string, split_name_email, truncate_field,
40};
41use base64::Engine;
42use base64::engine::general_purpose::URL_SAFE_NO_PAD;
43use bzip2::read::BzDecoder;
44use csv::ReaderBuilder;
45use flate2::read::GzDecoder;
46use liblzma::read::XzDecoder;
47use packageurl::PackageUrl;
48use regex::Regex;
49use ruff_python_ast as ast;
50use ruff_python_parser::parse_module;
51use serde_json::{Map as JsonMap, Value as JsonValue};
52use sha2::{Digest, Sha256};
53use std::collections::{HashMap, HashSet};
54use std::fs::File;
55use std::io::Read;
56use std::path::{Component, Path, PathBuf};
57use tar::Archive;
58use toml::Value as TomlValue;
59use toml::map::Map as TomlMap;
60use zip::ZipArchive;
61
62use super::PackageParser;
63use super::license_normalization::{
64    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
65    normalize_spdx_expression,
66};
67use super::pep508::parse_pep508_requirement;
68
// Table and key names used when reading pyproject.toml.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_EXTRAS: &str = "extras";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

// Limits named after setup.py handling (1 MiB of source, AST node count, AST depth).
const MAX_SETUP_PY_BYTES: usize = 1024 * 1024;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

// Archive safety limits.
/// 100 MiB cap, applied both to an archive's on-disk size and to the running total of
/// uncompressed entry sizes.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;
/// 50 MiB cap on the uncompressed size of a single archive entry.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
/// Largest accepted uncompressed:compressed ratio for one entry (100:1).
const MAX_COMPRESSION_RATIO: f64 = 100.0;

/// Tuple of five optional URL strings.
///
/// NOTE(review): the meaning of each slot is set by the code that builds the tuple,
/// which is not defined next to this alias — confirm against that code.
type ProjectUrls = (
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
);
100
/// Entry point for extracting package metadata from Python packaging files.
///
/// The [`PackageParser`] implementation dispatches on file name or extension to the
/// format-specific extractors: pyproject.toml, setup.cfg, setup.py, PKG-INFO, installed
/// wheel METADATA, pip cache origin.json, pypi.json, pip-inspect.deplock, sdist archives,
/// and .whl/.egg archives.
///
/// # Security
///
/// setup.py files are analysed through their AST instead of being executed, so scanning
/// never runs arbitrary project code. See `extract_from_setup_py_ast` for details.
pub struct PythonParser;
111
/// Archive container formats recognised as Python source distributions.
#[derive(Debug, Clone, Copy)]
enum PythonSdistArchiveFormat {
    /// File name ends with `.tar.gz`.
    TarGz,
    /// File name ends with `.tgz`.
    Tgz,
    /// File name ends with `.tar.bz2`.
    TarBz2,
    /// File name ends with `.tar.xz`.
    TarXz,
    /// File name ends with `.zip`.
    Zip,
}
120
/// A zip entry that passed the archive safety checks.
#[derive(Debug, Clone)]
struct ValidatedZipEntry {
    /// Position of the entry inside the zip archive.
    index: usize,
    /// Normalized entry path.
    name: String,
}
126
127impl PackageParser for PythonParser {
128    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
129
130    fn extract_packages(path: &Path) -> Vec<PackageData> {
131        vec![
132            if path.file_name().unwrap_or_default() == "pyproject.toml" {
133                extract_from_pyproject_toml(path)
134            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
135                extract_from_setup_cfg(path)
136            } else if is_setup_py_like_path(path) {
137                return extract_setup_py_packages(path);
138            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
139                extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
140            } else if is_installed_wheel_metadata_path(path) {
141                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
142            } else if is_pip_cache_origin_json(path) {
143                extract_from_pip_origin_json(path)
144            } else if path.file_name().unwrap_or_default() == "pypi.json" {
145                extract_from_pypi_json(path)
146            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
147                extract_from_pip_inspect(path)
148            } else if is_python_sdist_archive_path(path) {
149                extract_from_sdist_archive(path)
150            } else if path
151                .extension()
152                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
153            {
154                extract_from_wheel_archive(path)
155            } else if path
156                .extension()
157                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
158            {
159                extract_from_egg_archive(path)
160            } else {
161                default_package_data(path)
162            },
163        ]
164    }
165
166    fn is_match(path: &Path) -> bool {
167        if let Some(filename) = path.file_name()
168            && (filename == "pyproject.toml"
169                || filename == "setup.cfg"
170                || is_setup_py_like_path(path)
171                || filename == "PKG-INFO"
172                || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
173                || filename == "pypi.json"
174                || filename == "pip-inspect.deplock"
175                || is_pip_cache_origin_json(path))
176        {
177            return true;
178        }
179
180        if let Some(extension) = path.extension() {
181            let ext = extension.to_string_lossy().to_lowercase();
182            if (ext == "whl" && is_valid_wheel_archive_path(path))
183                || ext == "egg"
184                || is_python_sdist_archive_path(path)
185            {
186                return true;
187            }
188        }
189
190        false
191    }
192}
193
/// Returns `true` when the file name is `setup.py` or ends with `_setup.py`.
///
/// Paths without a file name, or with a non-UTF-8 file name, never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(file_name) => file_name == "setup.py" || file_name.ends_with("_setup.py"),
        None => false,
    }
}
199
/// Returns `true` for a file named `METADATA` whose parent directory name ends with
/// `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    let is_metadata_file = path.file_name().and_then(|name| name.to_str()) == Some("METADATA");
    if !is_metadata_file {
        return false;
    }

    let parent_dir_name = path
        .parent()
        .and_then(|dir| dir.file_name())
        .and_then(|name| name.to_str());

    match parent_dir_name {
        Some(dir_name) => dir_name.ends_with(".dist-info"),
        None => false,
    }
}
208
/// Values read from the `WHEEL` file that sits next to an installed wheel's `METADATA`.
#[derive(Clone, Debug)]
struct InstalledWheelMetadata {
    /// Every `Tag` header value, in file order.
    wheel_tags: Vec<String>,
    /// First `Wheel-Version` header value, if present.
    wheel_version: Option<String>,
    /// First `Generator` header value, if present.
    wheel_generator: Option<String>,
    /// `Root-Is-Purelib` parsed as a boolean; `None` when absent or not `true`/`false`.
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`.
    compressed_tag: Option<String>,
}
217
218fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
219    let Some(parent) = path.parent() else {
220        return;
221    };
222
223    if !parent
224        .file_name()
225        .and_then(|name| name.to_str())
226        .is_some_and(|name| name.ends_with(".dist-info"))
227    {
228        return;
229    }
230
231    let wheel_path = parent.join("WHEEL");
232    if !wheel_path.exists() {
233        return;
234    }
235
236    let Ok(content) = read_file_to_string(&wheel_path, None) else {
237        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
238        return;
239    };
240
241    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
242        return;
243    };
244
245    apply_installed_wheel_metadata(package_data, &wheel_metadata);
246}
247
248fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
249    use super::rfc822::{get_header_all, get_header_first};
250
251    let metadata = super::rfc822::parse_rfc822_content(content);
252    let wheel_tags = get_header_all(&metadata.headers, "tag");
253    if wheel_tags.is_empty() {
254        return None;
255    }
256
257    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
258    let wheel_generator = get_header_first(&metadata.headers, "generator");
259    let root_is_purelib =
260        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
261            match value.to_ascii_lowercase().as_str() {
262                "true" => Some(true),
263                "false" => Some(false),
264                _ => None,
265            }
266        });
267
268    let compressed_tag = compress_wheel_tags(&wheel_tags);
269
270    Some(InstalledWheelMetadata {
271        wheel_tags,
272        wheel_version,
273        wheel_generator,
274        root_is_purelib,
275        compressed_tag,
276    })
277}
278
/// Collapses expanded wheel tags (`python-abi-platform`) into one compressed tag.
///
/// * No tags: `None`.
/// * Exactly one tag: returned unchanged, without validating its shape.
/// * Several tags: every tag must split into three `-`-separated parts and all tags must
///   share the same ABI and platform parts; otherwise `None`. The interpreter parts are
///   joined with `.` in first-seen order, e.g. `py2-none-any` + `py3-none-any` becomes
///   `py2.py3-none-any`.
///
/// Repeated interpreter parts are emitted once, so duplicated `Tag` lines no longer
/// produce output such as `py3.py3-none-any`.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => return None,
        [only] => return Some(only.clone()),
        _ => {}
    }

    let mut python_tags: Vec<&str> = Vec::new();
    let mut shared_suffix: Option<(&str, &str)> = None;

    for tag in tags {
        // splitn(3) keeps any further '-' inside the platform part.
        let mut parts = tag.splitn(3, '-');
        let python = parts.next()?;
        let abi = parts.next()?;
        let platform = parts.next()?;

        match shared_suffix {
            Some(existing) if existing != (abi, platform) => return None,
            _ => shared_suffix = Some((abi, platform)),
        }

        // Tag lists are tiny, so a linear membership check is enough.
        if !python_tags.contains(&python) {
            python_tags.push(python);
        }
    }

    let (abi, platform) = shared_suffix?;
    Some(format!("{}-{}-{}", python_tags.join("."), abi, platform))
}
316
317fn apply_installed_wheel_metadata(
318    package_data: &mut PackageData,
319    wheel_metadata: &InstalledWheelMetadata,
320) {
321    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
322    extra_data.insert(
323        "wheel_tags".to_string(),
324        JsonValue::Array(
325            wheel_metadata
326                .wheel_tags
327                .iter()
328                .cloned()
329                .map(JsonValue::String)
330                .collect(),
331        ),
332    );
333
334    if let Some(wheel_version) = &wheel_metadata.wheel_version {
335        extra_data.insert(
336            "wheel_version".to_string(),
337            JsonValue::String(wheel_version.clone()),
338        );
339    }
340
341    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
342        extra_data.insert(
343            "wheel_generator".to_string(),
344            JsonValue::String(wheel_generator.clone()),
345        );
346    }
347
348    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
349        extra_data.insert(
350            "root_is_purelib".to_string(),
351            JsonValue::Bool(root_is_purelib),
352        );
353    }
354
355    if let (Some(name), Some(version), Some(extension)) = (
356        package_data.name.as_deref(),
357        package_data.version.as_deref(),
358        wheel_metadata.compressed_tag.as_deref(),
359    ) {
360        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
361    }
362}
363
/// Returns `true` for a file named `origin.json` that has an ancestor directory named
/// `wheels` (compared ASCII case-insensitively).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    if path.file_name().and_then(|name| name.to_str()) != Some("origin.json") {
        return false;
    }

    // skip(1) drops the path itself so only enclosing directories are examined.
    for ancestor in path.ancestors().skip(1) {
        let dir_name = ancestor.file_name().and_then(|name| name.to_str());
        if dir_name.is_some_and(|name| name.eq_ignore_ascii_case("wheels")) {
            return true;
        }
    }

    false
}
373
374fn extract_from_pip_origin_json(path: &Path) -> PackageData {
375    let content = match read_file_to_string(path, None) {
376        Ok(content) => content,
377        Err(e) => {
378            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
379            return default_package_data(path);
380        }
381    };
382
383    let root: JsonValue = match serde_json::from_str(&content) {
384        Ok(root) => root,
385        Err(e) => {
386            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
387            return default_package_data(path);
388        }
389    };
390
391    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
392        warn!("No url found in pip cache origin.json at {:?}", path);
393        return default_package_data(path);
394    };
395
396    let sibling_wheel = find_sibling_cached_wheel(path);
397    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
398        sibling_wheel
399            .as_ref()
400            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
401    });
402
403    let Some((name, version)) = name_version else {
404        warn!(
405            "Failed to infer package name/version from pip cache origin.json at {:?}",
406            path
407        );
408        return default_package_data(path);
409    };
410
411    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
412        build_pypi_urls(Some(&name), Some(&version));
413    let purl = sibling_wheel
414        .as_ref()
415        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
416        .or(plain_purl);
417
418    PackageData {
419        package_type: Some(PythonParser::PACKAGE_TYPE),
420        primary_language: Some("Python".to_string()),
421        name: Some(truncate_field(name)),
422        version: Some(version),
423        datasource_id: Some(DatasourceId::PypiPipOriginJson),
424        download_url: Some(truncate_field(download_url.to_string())),
425        sha256: extract_sha256_from_origin_json(&root)
426            .and_then(|h| Sha256Digest::from_hex(&h).ok()),
427        repository_homepage_url,
428        repository_download_url,
429        api_data_url,
430        purl,
431        ..Default::default()
432    }
433}
434
435fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
436    let parent = path.parent()?;
437    let entries = parent.read_dir().ok()?;
438
439    for entry in entries.flatten() {
440        let sibling_path = entry.path();
441        if sibling_path
442            .extension()
443            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
444            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
445        {
446            return Some(wheel_info);
447        }
448    }
449
450    None
451}
452
453fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
454    let file_name = url.rsplit('/').next()?;
455
456    if file_name.ends_with(".whl") {
457        return parse_wheel_filename(Path::new(file_name))
458            .map(|wheel_info| (wheel_info.name, wheel_info.version));
459    }
460
461    let stem = strip_python_archive_extension(file_name)?;
462    let (name, version) = stem.rsplit_once('-')?;
463    if name.is_empty() || version.is_empty() {
464        return None;
465    }
466
467    Some((name.replace('_', "-"), version.to_string()))
468}
469
/// Removes a known Python archive suffix from `file_name`.
///
/// Suffixes are tried in order: `.tar.gz`, `.tar.bz2`, `.tar.xz`, `.tgz`, `.zip`, `.whl`.
/// Matching is case-sensitive. Returns `None` when no suffix matches.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] = [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES.iter() {
        if let Some(stem) = file_name.strip_suffix(*suffix) {
            return Some(stem);
        }
    }
    None
}
475
476fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
477    root.pointer("/archive_info/hashes/sha256")
478        .and_then(|value| value.as_str())
479        .map(ToOwned::to_owned)
480        .or_else(|| {
481            root.pointer("/archive_info/hash")
482                .and_then(|value| value.as_str())
483                .and_then(normalize_origin_hash)
484        })
485}
486
/// Extracts the digest part of a hash string.
///
/// Accepts `sha256=<value>` and `sha256:<value>` (the value is returned without further
/// validation) as well as a bare 64-character hexadecimal string. Anything else gives
/// `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = ["sha256=", "sha256:"]
        .iter()
        .find_map(|prefix| hash.strip_prefix(*prefix));
    if let Some(digest) = prefixed {
        return Some(digest.to_string());
    }

    let is_bare_hex = hash.len() == 64 && hash.chars().all(|ch| ch.is_ascii_hexdigit());
    if is_bare_hex {
        Some(hash.to_string())
    } else {
        None
    }
}
499
500fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
501    let content = match read_file_to_string(path, None) {
502        Ok(content) => content,
503        Err(e) => {
504            warn!("Failed to read metadata at {:?}: {}", path, e);
505            return default_package_data(path);
506        }
507    };
508
509    let metadata = super::rfc822::parse_rfc822_content(&content);
510    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
511    merge_sibling_metadata_dependencies(path, &mut package_data);
512    merge_sibling_metadata_file_references(path, &mut package_data);
513    if datasource_id == DatasourceId::PypiWheelMetadata {
514        merge_sibling_wheel_metadata(path, &mut package_data);
515    }
516    package_data
517}
518
519fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
520    let mut extra_dependencies = Vec::new();
521
522    if let Some(parent) = path.parent() {
523        let direct_requires = parent.join("requires.txt");
524        if direct_requires.exists()
525            && let Ok(content) = read_file_to_string(&direct_requires, None)
526        {
527            extra_dependencies.extend(parse_requires_txt(&content));
528        }
529
530        let sibling_egg_info_requires = parent
531            .read_dir()
532            .ok()
533            .into_iter()
534            .flatten()
535            .flatten()
536            .find_map(|entry| {
537                let child_path = entry.path();
538                if child_path.is_dir()
539                    && child_path
540                        .file_name()
541                        .and_then(|name| name.to_str())
542                        .is_some_and(|name| name.ends_with(".egg-info"))
543                {
544                    let requires = child_path.join("requires.txt");
545                    requires.exists().then_some(requires)
546                } else {
547                    None
548                }
549            });
550
551        if let Some(requires_path) = sibling_egg_info_requires
552            && let Ok(content) = read_file_to_string(&requires_path, None)
553        {
554            extra_dependencies.extend(parse_requires_txt(&content));
555        }
556    }
557
558    for dependency in extra_dependencies {
559        if !package_data.dependencies.iter().any(|existing| {
560            existing.purl == dependency.purl
561                && existing.scope == dependency.scope
562                && existing.extracted_requirement == dependency.extracted_requirement
563                && existing.extra_data == dependency.extra_data
564        }) {
565            package_data.dependencies.push(dependency);
566        }
567    }
568}
569
570fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
571    let mut extra_refs = Vec::new();
572
573    if let Some(parent) = path.parent() {
574        let record_path = parent.join("RECORD");
575        if record_path.exists()
576            && let Ok(content) = read_file_to_string(&record_path, None)
577        {
578            extra_refs.extend(parse_record_csv(&content));
579        }
580
581        let installed_files_path = parent.join("installed-files.txt");
582        if installed_files_path.exists()
583            && let Ok(content) = read_file_to_string(&installed_files_path, None)
584        {
585            extra_refs.extend(parse_installed_files_txt(&content));
586        }
587
588        let sources_path = parent.join("SOURCES.txt");
589        if sources_path.exists()
590            && let Ok(content) = read_file_to_string(&sources_path, None)
591        {
592            extra_refs.extend(parse_sources_txt(&content));
593        }
594    }
595
596    for file_ref in extra_refs {
597        if !package_data
598            .file_references
599            .iter()
600            .any(|existing| existing.path == file_ref.path)
601        {
602            package_data.file_references.push(file_ref);
603        }
604    }
605}
606
607fn collect_validated_zip_entries<R: Read + std::io::Seek>(
608    archive: &mut ZipArchive<R>,
609    path: &Path,
610    archive_type: &str,
611) -> Result<Vec<ValidatedZipEntry>, String> {
612    let mut total_extracted = 0u64;
613    let mut entries = Vec::new();
614    let mut entry_count = 0usize;
615
616    for i in 0..archive.len() {
617        entry_count += 1;
618        if entry_count > MAX_ITERATION_COUNT {
619            warn!(
620                "Exceeded max entry count in {} {:?}; stopping at {} entries",
621                archive_type, path, MAX_ITERATION_COUNT
622            );
623            break;
624        }
625        if let Ok(file) = archive.by_index_raw(i) {
626            let compressed_size = file.compressed_size();
627            let uncompressed_size = file.size();
628            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
629                warn!(
630                    "Skipping unsafe path in {} {:?}: {}",
631                    archive_type,
632                    path,
633                    file.name()
634                );
635                continue;
636            };
637
638            if compressed_size > 0 {
639                let ratio = uncompressed_size as f64 / compressed_size as f64;
640                if ratio > MAX_COMPRESSION_RATIO {
641                    warn!(
642                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
643                        archive_type, path, ratio
644                    );
645                    continue;
646                }
647            }
648
649            if uncompressed_size > MAX_FILE_SIZE {
650                warn!(
651                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
652                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
653                );
654                continue;
655            }
656
657            total_extracted += uncompressed_size;
658            if total_extracted > MAX_ARCHIVE_SIZE {
659                let msg = format!(
660                    "Total extracted size exceeds limit for {} {:?}",
661                    archive_type, path
662                );
663                warn!("{}", msg);
664                return Err(msg);
665            }
666
667            entries.push(ValidatedZipEntry {
668                index: i,
669                name: entry_name,
670            });
671        }
672    }
673
674    Ok(entries)
675}
676
677fn is_python_sdist_archive_path(path: &Path) -> bool {
678    detect_python_sdist_archive_format(path).is_some()
679}
680
681fn is_valid_wheel_archive_path(path: &Path) -> bool {
682    if !path.is_file() {
683        return true;
684    }
685
686    let file = match File::open(path) {
687        Ok(file) => file,
688        Err(_) => return false,
689    };
690    let mut archive = match ZipArchive::new(file) {
691        Ok(archive) => archive,
692        Err(_) => return false,
693    };
694
695    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
696        Ok(entries) => entries,
697        Err(_) => return false,
698    };
699
700    find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA").is_some()
701}
702
703fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
704    let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
705
706    if !is_likely_python_sdist_filename(&file_name) {
707        return None;
708    }
709
710    if file_name.ends_with(".tar.gz") {
711        tar_gz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarGz)
712    } else if file_name.ends_with(".tgz") {
713        tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
714    } else if file_name.ends_with(".tar.bz2") {
715        tar_bz2_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarBz2)
716    } else if file_name.ends_with(".tar.xz") {
717        tar_xz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarXz)
718    } else if file_name.ends_with(".zip") {
719        zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
720    } else {
721        None
722    }
723}
724
725fn tar_gz_sdist_contains_pkg_info(path: &Path) -> bool {
726    let Some(compressed_size) = compressed_archive_size(path) else {
727        return false;
728    };
729    let file = match File::open(path) {
730        Ok(file) => file,
731        Err(_) => return false,
732    };
733    let decoder = GzDecoder::new(file);
734    tar_sdist_contains_pkg_info(path, decoder, "tar.gz", compressed_size)
735}
736
737fn tar_bz2_sdist_contains_pkg_info(path: &Path) -> bool {
738    let Some(compressed_size) = compressed_archive_size(path) else {
739        return false;
740    };
741    let file = match File::open(path) {
742        Ok(file) => file,
743        Err(_) => return false,
744    };
745    let decoder = BzDecoder::new(file);
746    tar_sdist_contains_pkg_info(path, decoder, "tar.bz2", compressed_size)
747}
748
749fn tar_xz_sdist_contains_pkg_info(path: &Path) -> bool {
750    let Some(compressed_size) = compressed_archive_size(path) else {
751        return false;
752    };
753    let file = match File::open(path) {
754        Ok(file) => file,
755        Err(_) => return false,
756    };
757    let decoder = XzDecoder::new(file);
758    tar_sdist_contains_pkg_info(path, decoder, "tar.xz", compressed_size)
759}
760
/// Returns the on-disk size of `path` in bytes, or `None` when its metadata cannot be
/// read.
fn compressed_archive_size(path: &Path) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(info) => Some(info.len()),
        Err(_) => None,
    }
}
764
765fn tar_sdist_contains_pkg_info<R: Read>(
766    path: &Path,
767    reader: R,
768    archive_type: &str,
769    compressed_size: u64,
770) -> bool {
771    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
772    else {
773        return false;
774    };
775
776    select_sdist_pkginfo_entry(path, &entries).is_some()
777}
778
779fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
780    if !path.is_file() {
781        return true;
782    }
783
784    let Some(compressed_size) = compressed_archive_size(path) else {
785        return false;
786    };
787    let file = match File::open(path) {
788        Ok(file) => file,
789        Err(_) => return false,
790    };
791    let decoder = GzDecoder::new(file);
792    tar_sdist_contains_pkg_info(path, decoder, "tgz", compressed_size)
793}
794
795fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
796    if !path.is_file() {
797        return true;
798    }
799
800    let file = match File::open(path) {
801        Ok(file) => file,
802        Err(_) => return false,
803    };
804    let mut archive = match ZipArchive::new(file) {
805        Ok(archive) => archive,
806        Err(_) => return false,
807    };
808
809    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
810        Ok(entries) => entries,
811        Err(_) => return false,
812    };
813    let metadata_entries: Vec<_> = validated_entries
814        .iter()
815        .filter(|entry| entry.name.ends_with("/PKG-INFO"))
816        .filter_map(|entry| {
817            read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
818                .ok()
819                .map(|content| (entry.name.clone(), content))
820        })
821        .collect();
822
823    has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
824}
825
826fn is_likely_python_sdist_filename(file_name: &str) -> bool {
827    let Some(stem) = strip_python_archive_extension(file_name) else {
828        return false;
829    };
830
831    let Some((name, version)) = stem.rsplit_once('-') else {
832        return false;
833    };
834
835    !name.is_empty()
836        && !version.is_empty()
837        && version.chars().any(|ch| ch.is_ascii_digit())
838        && name
839            .chars()
840            .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
841}
842
843fn extract_from_sdist_archive(path: &Path) -> PackageData {
844    let metadata = match std::fs::metadata(path) {
845        Ok(m) => m,
846        Err(e) => {
847            warn!(
848                "Failed to read metadata for sdist archive {:?}: {}",
849                path, e
850            );
851            return default_package_data(path);
852        }
853    };
854
855    if metadata.len() > MAX_ARCHIVE_SIZE {
856        warn!(
857            "sdist archive too large: {} bytes (limit: {} bytes)",
858            metadata.len(),
859            MAX_ARCHIVE_SIZE
860        );
861        return default_package_data(path);
862    }
863
864    let Some(format) = detect_python_sdist_archive_format(path) else {
865        return default_package_data(path);
866    };
867
868    let mut package_data = match format {
869        PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
870            let file = match File::open(path) {
871                Ok(file) => file,
872                Err(e) => {
873                    warn!("Failed to open sdist archive {:?}: {}", path, e);
874                    return default_package_data(path);
875                }
876            };
877            let decoder = GzDecoder::new(file);
878            extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
879        }
880        PythonSdistArchiveFormat::TarBz2 => {
881            let file = match File::open(path) {
882                Ok(file) => file,
883                Err(e) => {
884                    warn!("Failed to open sdist archive {:?}: {}", path, e);
885                    return default_package_data(path);
886                }
887            };
888            let decoder = BzDecoder::new(file);
889            extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
890        }
891        PythonSdistArchiveFormat::TarXz => {
892            let file = match File::open(path) {
893                Ok(file) => file,
894                Err(e) => {
895                    warn!("Failed to open sdist archive {:?}: {}", path, e);
896                    return default_package_data(path);
897                }
898            };
899            let decoder = XzDecoder::new(file);
900            extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
901        }
902        PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
903    };
904
905    if package_data.package_type.is_some() {
906        let (size, sha256) = calculate_file_checksums(path);
907        package_data.size = size;
908        package_data.sha256 = sha256;
909    }
910
911    package_data
912}
913
914fn extract_from_tar_sdist_archive<R: Read>(
915    path: &Path,
916    reader: R,
917    archive_type: &str,
918    compressed_size: u64,
919) -> PackageData {
920    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
921    else {
922        return default_package_data(path);
923    };
924
925    build_sdist_package_data(path, entries)
926}
927
928fn collect_tar_sdist_entries<R: Read>(
929    path: &Path,
930    reader: R,
931    archive_type: &str,
932    compressed_size: u64,
933) -> Option<Vec<(String, String)>> {
934    let mut archive = Archive::new(reader);
935    let archive_entries = match archive.entries() {
936        Ok(entries) => entries,
937        Err(e) => {
938            warn!(
939                "Failed to read {} sdist archive {:?}: {}",
940                archive_type, path, e
941            );
942            return None;
943        }
944    };
945
946    let mut total_extracted = 0u64;
947    let mut entries = Vec::new();
948    let mut entry_count = 0usize;
949
950    for entry_result in archive_entries {
951        entry_count += 1;
952        if entry_count > MAX_ITERATION_COUNT {
953            warn!(
954                "Exceeded max entry count in {} sdist {:?}; stopping at {} entries",
955                archive_type, path, MAX_ITERATION_COUNT
956            );
957            break;
958        }
959
960        let mut entry = match entry_result {
961            Ok(entry) => entry,
962            Err(e) => {
963                warn!(
964                    "Failed to read {} sdist entry from {:?}: {}",
965                    archive_type, path, e
966                );
967                continue;
968            }
969        };
970
971        let entry_size = entry.size();
972        if entry_size > MAX_FILE_SIZE {
973            warn!(
974                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
975                archive_type, path, entry_size, MAX_FILE_SIZE
976            );
977            continue;
978        }
979
980        total_extracted += entry_size;
981        if total_extracted > MAX_ARCHIVE_SIZE {
982            warn!(
983                "Total extracted size exceeds limit for {} sdist {:?}",
984                archive_type, path
985            );
986            return None;
987        }
988
989        if compressed_size > 0 {
990            let ratio = total_extracted as f64 / compressed_size as f64;
991            if ratio > MAX_COMPRESSION_RATIO {
992                warn!(
993                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
994                    archive_type, path, ratio
995                );
996                return None;
997            }
998        }
999
1000        let entry_path = match entry.path() {
1001            Ok(path) => path.to_string_lossy().replace('\\', "/"),
1002            Err(e) => {
1003                warn!(
1004                    "Failed to get {} sdist entry path from {:?}: {}",
1005                    archive_type, path, e
1006                );
1007                continue;
1008            }
1009        };
1010
1011        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
1012            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
1013            continue;
1014        };
1015
1016        if !is_relevant_sdist_text_entry(&entry_path) {
1017            continue;
1018        }
1019
1020        if let Ok(content) = read_limited_utf8(
1021            &mut entry,
1022            MAX_FILE_SIZE,
1023            &format!("{} entry {}", archive_type, entry_path),
1024        ) {
1025            entries.push((entry_path, content));
1026        }
1027    }
1028
1029    Some(entries)
1030}
1031
1032fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
1033    let file = match File::open(path) {
1034        Ok(file) => file,
1035        Err(e) => {
1036            warn!("Failed to open zip sdist archive {:?}: {}", path, e);
1037            return default_package_data(path);
1038        }
1039    };
1040
1041    let mut archive = match ZipArchive::new(file) {
1042        Ok(archive) => archive,
1043        Err(e) => {
1044            warn!("Failed to read zip sdist archive {:?}: {}", path, e);
1045            return default_package_data(path);
1046        }
1047    };
1048
1049    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
1050        Ok(entries) => entries,
1051        Err(_) => return default_package_data(path),
1052    };
1053
1054    let mut entries = Vec::new();
1055    for entry in validated_entries.iter() {
1056        if !is_relevant_sdist_text_entry(&entry.name) {
1057            continue;
1058        }
1059
1060        if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
1061            entries.push((entry.name.clone(), content));
1062        }
1063    }
1064
1065    build_sdist_package_data(path, entries)
1066}
1067
/// Returns `true` when `entry_path` names one of the sdist metadata files this parser reads:
/// a `PKG-INFO`, `requires.txt` or `SOURCES.txt` located inside some directory.
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    // The leading slash means a bare file name at the archive root does not qualify.
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
1073
1074fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
1075    let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
1076        warn!("No PKG-INFO file found in sdist archive {:?}", path);
1077        return default_package_data(path);
1078    };
1079
1080    let mut package_data =
1081        python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
1082    merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
1083    merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
1084    apply_sdist_name_version_fallback(path, &mut package_data);
1085    package_data.datasource_id = Some(DatasourceId::PypiSdist);
1086    package_data
1087}
1088
1089fn select_sdist_pkginfo_entry(
1090    archive_path: &Path,
1091    entries: &[(String, String)],
1092) -> Option<(String, String)> {
1093    let expected_name = sdist_archive_expected_name(archive_path);
1094
1095    entries
1096        .iter()
1097        .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
1098        .min_by_key(|(entry_path, content)| {
1099            let components: Vec<_> = entry_path
1100                .split('/')
1101                .filter(|part| !part.is_empty())
1102                .collect();
1103            let candidate_name = sdist_pkginfo_candidate_name(content);
1104            let name_rank = if candidate_name == expected_name {
1105                0
1106            } else {
1107                1
1108            };
1109            let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1110
1111            (name_rank, kind_rank, components.len(), entry_path.clone())
1112        })
1113        .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1114}
1115
1116fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1117    let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1118        return false;
1119    };
1120
1121    entries.iter().any(|(entry_path, content)| {
1122        sdist_pkginfo_kind_rank(entry_path) < 3
1123            && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1124    })
1125}
1126
1127fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1128    archive_path
1129        .file_name()
1130        .and_then(|name| name.to_str())
1131        .and_then(strip_python_archive_extension)
1132        .and_then(|stem| {
1133            stem.rsplit_once('-')
1134                .map(|(name, _)| normalize_python_package_name(name))
1135        })
1136}
1137
1138fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1139    let metadata = super::rfc822::parse_rfc822_content(content);
1140    super::rfc822::get_header_first(&metadata.headers, "name")
1141        .map(|name| normalize_python_package_name(&name))
1142}
1143
/// Ranks a PKG-INFO entry path by where it sits in the archive (lower is preferred).
///
/// * `0` - `<root>/<something>.egg-info/PKG-INFO`
/// * `1` - `<root>/PKG-INFO`
/// * `2` - any other path ending in `.egg-info/PKG-INFO`
/// * `3` - everything else
///
/// Empty path components (doubled or leading slashes) are ignored when counting depth.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let parts: Vec<&str> = entry_path
        .split('/')
        .filter(|segment| !segment.is_empty())
        .collect();

    match parts.as_slice() {
        [_, dir, "PKG-INFO"] if dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1161
1162fn merge_sdist_archive_dependencies(
1163    entries: &[(String, String)],
1164    metadata_path: &str,
1165    package_data: &mut PackageData,
1166) {
1167    let metadata_dir = metadata_path
1168        .rsplit_once('/')
1169        .map(|(dir, _)| dir)
1170        .unwrap_or("");
1171    let archive_root = metadata_path.split('/').next().unwrap_or("");
1172    let matched_egg_info_dir =
1173        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1174    let mut extra_dependencies = Vec::new();
1175
1176    for (entry_path, content) in entries {
1177        let is_direct_requires =
1178            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1179        let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1180            entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1181        });
1182
1183        if is_direct_requires || is_egg_info_requires {
1184            extra_dependencies.extend(parse_requires_txt(content));
1185        }
1186    }
1187
1188    for dependency in extra_dependencies {
1189        if !package_data.dependencies.iter().any(|existing| {
1190            existing.purl == dependency.purl
1191                && existing.scope == dependency.scope
1192                && existing.extracted_requirement == dependency.extracted_requirement
1193                && existing.extra_data == dependency.extra_data
1194        }) {
1195            package_data.dependencies.push(dependency);
1196        }
1197    }
1198}
1199
1200fn merge_sdist_archive_file_references(
1201    entries: &[(String, String)],
1202    metadata_path: &str,
1203    package_data: &mut PackageData,
1204) {
1205    let metadata_dir = metadata_path
1206        .rsplit_once('/')
1207        .map(|(dir, _)| dir)
1208        .unwrap_or("");
1209    let archive_root = metadata_path.split('/').next().unwrap_or("");
1210    let matched_egg_info_dir =
1211        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1212    let mut extra_refs = Vec::new();
1213
1214    for (entry_path, content) in entries {
1215        let is_direct_sources =
1216            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1217        let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1218            entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1219        });
1220
1221        if is_direct_sources || is_egg_info_sources {
1222            extra_refs.extend(parse_sources_txt(content));
1223        }
1224    }
1225
1226    for file_ref in extra_refs {
1227        if !package_data
1228            .file_references
1229            .iter()
1230            .any(|existing| existing.path == file_ref.path)
1231        {
1232            package_data.file_references.push(file_ref);
1233        }
1234    }
1235}
1236
1237fn select_matching_sdist_egg_info_dir(
1238    entries: &[(String, String)],
1239    archive_root: &str,
1240    package_name: Option<&str>,
1241) -> Option<String> {
1242    let normalized_package_name = package_name.map(normalize_python_package_name);
1243
1244    entries
1245        .iter()
1246        .filter_map(|(entry_path, _)| {
1247            let components: Vec<_> = entry_path
1248                .split('/')
1249                .filter(|part| !part.is_empty())
1250                .collect();
1251            if components.len() == 3
1252                && components[0] == archive_root
1253                && components[1].ends_with(".egg-info")
1254            {
1255                Some(components[1].to_string())
1256            } else {
1257                None
1258            }
1259        })
1260        .min_by_key(|egg_info_dir| {
1261            let normalized_dir_name =
1262                normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1263            let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1264                0
1265            } else {
1266                1
1267            };
1268
1269            (name_rank, egg_info_dir.clone())
1270        })
1271}
1272
/// Normalizes a package name for comparison: ASCII letters are lowercased and every `_`
/// becomes `-`. All other characters are kept as they are.
fn normalize_python_package_name(name: &str) -> String {
    name.chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
1276
1277fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1278    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1279        return;
1280    };
1281
1282    let Some(stem) = strip_python_archive_extension(file_name) else {
1283        return;
1284    };
1285
1286    let Some((name, version)) = stem.rsplit_once('-') else {
1287        return;
1288    };
1289
1290    if package_data.name.is_none() {
1291        package_data.name = Some(name.replace('_', "-"));
1292    }
1293    if package_data.version.is_none() {
1294        package_data.version = Some(version.to_string());
1295    }
1296
1297    if package_data.purl.is_none()
1298        || package_data.repository_homepage_url.is_none()
1299        || package_data.repository_download_url.is_none()
1300        || package_data.api_data_url.is_none()
1301    {
1302        let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1303            build_pypi_urls(
1304                package_data.name.as_deref(),
1305                package_data.version.as_deref(),
1306            );
1307
1308        if package_data.repository_homepage_url.is_none() {
1309            package_data.repository_homepage_url = repository_homepage_url;
1310        }
1311        if package_data.repository_download_url.is_none() {
1312            package_data.repository_download_url = repository_download_url;
1313        }
1314        if package_data.api_data_url.is_none() {
1315            package_data.api_data_url = api_data_url;
1316        }
1317        if package_data.purl.is_none() {
1318            package_data.purl = purl;
1319        }
1320    }
1321}
1322
1323fn extract_from_wheel_archive(path: &Path) -> PackageData {
1324    let metadata = match std::fs::metadata(path) {
1325        Ok(m) => m,
1326        Err(e) => {
1327            warn!(
1328                "Failed to read metadata for wheel archive {:?}: {}",
1329                path, e
1330            );
1331            return default_package_data(path);
1332        }
1333    };
1334
1335    if metadata.len() > MAX_ARCHIVE_SIZE {
1336        warn!(
1337            "Wheel archive too large: {} bytes (limit: {} bytes)",
1338            metadata.len(),
1339            MAX_ARCHIVE_SIZE
1340        );
1341        return default_package_data(path);
1342    }
1343
1344    let file = match File::open(path) {
1345        Ok(f) => f,
1346        Err(e) => {
1347            warn!("Failed to open wheel archive {:?}: {}", path, e);
1348            return default_package_data(path);
1349        }
1350    };
1351
1352    let mut archive = match ZipArchive::new(file) {
1353        Ok(a) => a,
1354        Err(e) => {
1355            warn!("Failed to read wheel archive {:?}: {}", path, e);
1356            return default_package_data(path);
1357        }
1358    };
1359
1360    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1361        Ok(entries) => entries,
1362        Err(_) => return default_package_data(path),
1363    };
1364
1365    let metadata_entry =
1366        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1367            Some(entry) => entry,
1368            None => {
1369                warn!("No METADATA file found in wheel archive {:?}", path);
1370                return default_package_data(path);
1371            }
1372        };
1373
1374    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1375        Ok(c) => c,
1376        Err(e) => {
1377            warn!("Failed to read METADATA from {:?}: {}", path, e);
1378            return default_package_data(path);
1379        }
1380    };
1381
1382    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1383
1384    let (size, sha256) = calculate_file_checksums(path);
1385    package_data.size = size;
1386    package_data.sha256 = sha256;
1387
1388    if let Some(record_entry) =
1389        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1390        && let Ok(record_content) =
1391            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1392    {
1393        package_data.file_references = parse_record_csv(&record_content);
1394    }
1395
1396    if let Some(wheel_info) = parse_wheel_filename(path) {
1397        if package_data.name.is_none() {
1398            package_data.name = Some(wheel_info.name.clone());
1399        }
1400        if package_data.version.is_none() {
1401            package_data.version = Some(wheel_info.version.clone());
1402        }
1403
1404        package_data.qualifiers = Some(std::collections::HashMap::from([(
1405            "extension".to_string(),
1406            format!(
1407                "{}-{}-{}",
1408                wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1409            ),
1410        )]));
1411
1412        package_data.purl = build_wheel_purl(
1413            package_data.name.as_deref(),
1414            package_data.version.as_deref(),
1415            &wheel_info,
1416        );
1417
1418        let mut extra_data = package_data.extra_data.unwrap_or_default();
1419        extra_data.insert(
1420            "python_requires".to_string(),
1421            serde_json::Value::String(wheel_info.python_tag.clone()),
1422        );
1423        extra_data.insert(
1424            "abi_tag".to_string(),
1425            serde_json::Value::String(wheel_info.abi_tag.clone()),
1426        );
1427        extra_data.insert(
1428            "platform_tag".to_string(),
1429            serde_json::Value::String(wheel_info.platform_tag.clone()),
1430        );
1431        package_data.extra_data = Some(extra_data);
1432    }
1433
1434    package_data
1435}
1436
1437fn extract_from_egg_archive(path: &Path) -> PackageData {
1438    let metadata = match std::fs::metadata(path) {
1439        Ok(m) => m,
1440        Err(e) => {
1441            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1442            return default_package_data(path);
1443        }
1444    };
1445
1446    if metadata.len() > MAX_ARCHIVE_SIZE {
1447        warn!(
1448            "Egg archive too large: {} bytes (limit: {} bytes)",
1449            metadata.len(),
1450            MAX_ARCHIVE_SIZE
1451        );
1452        return default_package_data(path);
1453    }
1454
1455    let file = match File::open(path) {
1456        Ok(f) => f,
1457        Err(e) => {
1458            warn!("Failed to open egg archive {:?}: {}", path, e);
1459            return default_package_data(path);
1460        }
1461    };
1462
1463    let mut archive = match ZipArchive::new(file) {
1464        Ok(a) => a,
1465        Err(e) => {
1466            warn!("Failed to read egg archive {:?}: {}", path, e);
1467            return default_package_data(path);
1468        }
1469    };
1470
1471    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1472        Ok(entries) => entries,
1473        Err(_) => return default_package_data(path),
1474    };
1475
1476    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1477        &validated_entries,
1478        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1479    ) {
1480        Some(entry) => entry,
1481        None => {
1482            warn!("No PKG-INFO file found in egg archive {:?}", path);
1483            return default_package_data(path);
1484        }
1485    };
1486
1487    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1488        Ok(c) => c,
1489        Err(e) => {
1490            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1491            return default_package_data(path);
1492        }
1493    };
1494
1495    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1496
1497    let (size, sha256) = calculate_file_checksums(path);
1498    package_data.size = size;
1499    package_data.sha256 = sha256;
1500
1501    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1502        &validated_entries,
1503        &[
1504            "EGG-INFO/installed-files.txt",
1505            ".egg-info/installed-files.txt",
1506        ],
1507    ) && let Ok(installed_files_content) =
1508        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1509    {
1510        package_data.file_references = parse_installed_files_txt(&installed_files_content);
1511    }
1512
1513    if let Some(egg_info) = parse_egg_filename(path) {
1514        if package_data.name.is_none() {
1515            package_data.name = Some(egg_info.name.clone());
1516        }
1517        if package_data.version.is_none() {
1518            package_data.version = Some(egg_info.version.clone());
1519        }
1520
1521        if let Some(python_version) = &egg_info.python_version {
1522            let mut extra_data = package_data.extra_data.unwrap_or_default();
1523            extra_data.insert(
1524                "python_version".to_string(),
1525                serde_json::Value::String(python_version.clone()),
1526            );
1527            package_data.extra_data = Some(extra_data);
1528        }
1529    }
1530
1531    package_data.purl = build_egg_purl(
1532        package_data.name.as_deref(),
1533        package_data.version.as_deref(),
1534    );
1535
1536    package_data
1537}
1538
1539fn find_validated_zip_entry_by_suffix<'a>(
1540    entries: &'a [ValidatedZipEntry],
1541    suffix: &str,
1542) -> Option<&'a ValidatedZipEntry> {
1543    entries.iter().find(|entry| entry.name.ends_with(suffix))
1544}
1545
1546fn find_validated_zip_entry_by_any_suffix<'a>(
1547    entries: &'a [ValidatedZipEntry],
1548    suffixes: &[&str],
1549) -> Option<&'a ValidatedZipEntry> {
1550    entries
1551        .iter()
1552        .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1553}
1554
1555fn read_validated_zip_entry<R: Read + std::io::Seek>(
1556    archive: &mut ZipArchive<R>,
1557    entry: &ValidatedZipEntry,
1558    path: &Path,
1559    archive_type: &str,
1560) -> Result<String, String> {
1561    let mut file = archive
1562        .by_index(entry.index)
1563        .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1564
1565    let compressed_size = file.compressed_size();
1566    let uncompressed_size = file.size();
1567
1568    if compressed_size > 0 {
1569        let ratio = uncompressed_size as f64 / compressed_size as f64;
1570        if ratio > MAX_COMPRESSION_RATIO {
1571            return Err(format!(
1572                "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1573                archive_type, path, ratio
1574            ));
1575        }
1576    }
1577
1578    if uncompressed_size > MAX_FILE_SIZE {
1579        return Err(format!(
1580            "Rejected oversized entry in {} {:?}: {} bytes",
1581            archive_type, path, uncompressed_size
1582        ));
1583    }
1584
1585    read_limited_utf8(
1586        &mut file,
1587        MAX_FILE_SIZE,
1588        &format!("{} entry {}", archive_type, entry.name),
1589    )
1590}
1591
1592fn read_limited_utf8<R: Read>(
1593    reader: &mut R,
1594    max_bytes: u64,
1595    context: &str,
1596) -> Result<String, String> {
1597    let mut limited = reader.take(max_bytes + 1);
1598    let mut bytes = Vec::new();
1599    limited
1600        .read_to_end(&mut bytes)
1601        .map_err(|e| format!("Failed to read {}: {}", context, e))?;
1602
1603    if bytes.len() as u64 > max_bytes {
1604        return Err(format!(
1605            "{} exceeded {} byte limit while reading",
1606            context, max_bytes
1607        ));
1608    }
1609
1610    match String::from_utf8(bytes) {
1611        Ok(s) => Ok(s),
1612        Err(err) => {
1613            let bytes = err.into_bytes();
1614            warn!("Invalid UTF-8 in archive entry; using lossy conversion");
1615            Ok(String::from_utf8_lossy(&bytes).into_owned())
1616        }
1617    }
1618}
1619
/// Normalizes an archive entry path to a safe, `/`-separated relative path.
///
/// Backslashes are treated as separators and `.` components are dropped. Returns `None` for
/// paths that start with a drive prefix such as `C:/`, contain a root or `..` component, or
/// have no normal components left.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // `Path` does not recognize drive prefixes on every platform, so check for them by hand.
    if let [drive, b':', b'/', ..] = unified.as_bytes() {
        if drive.is_ascii_alphabetic() {
            return None;
        }
    }

    let segments = Path::new(&unified)
        .components()
        .filter(|component| !matches!(component, Component::CurDir))
        .map(|component| match component {
            Component::Normal(segment) => Some(segment.to_string_lossy().to_string()),
            // Root, parent and prefix components all make the path unsafe.
            _ => None,
        })
        .collect::<Option<Vec<String>>>()?;

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1641
1642/// Parses RECORD CSV format from wheel archives (PEP 427).
1643/// Format: path,hash,size (3 columns, no header)
1644/// Hash format: sha256=urlsafe_base64_hash or empty
1645/// Size: bytes as u64 or empty
1646pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1647    let mut reader = ReaderBuilder::new()
1648        .has_headers(false)
1649        .from_reader(content.as_bytes());
1650
1651    let mut file_references = Vec::new();
1652    let mut record_count = 0usize;
1653
1654    for result in reader.records() {
1655        record_count += 1;
1656        if record_count > MAX_ITERATION_COUNT {
1657            warn!(
1658                "Exceeded max record count in RECORD CSV; stopping at {} records",
1659                MAX_ITERATION_COUNT
1660            );
1661            break;
1662        }
1663        match result {
1664            Ok(record) => {
1665                if record.len() < 3 {
1666                    continue;
1667                }
1668
1669                let path = record.get(0).unwrap_or("").trim().to_string();
1670                if path.is_empty() {
1671                    continue;
1672                }
1673
1674                let hash_field = record.get(1).unwrap_or("").trim();
1675                let size_field = record.get(2).unwrap_or("").trim();
1676
1677                // Parse hash: format is "algorithm=value"
1678                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1679                    let parts: Vec<&str> = hash_field.split('=').collect();
1680                    if parts.len() == 2 && parts[0] == "sha256" {
1681                        match URL_SAFE_NO_PAD.decode(parts[1]) {
1682                            Ok(decoded) => {
1683                                let hex = decoded
1684                                    .iter()
1685                                    .map(|b| format!("{:02x}", b))
1686                                    .collect::<String>();
1687                                Sha256Digest::from_hex(&hex).ok()
1688                            }
1689                            Err(_) => None,
1690                        }
1691                    } else {
1692                        None
1693                    }
1694                } else {
1695                    None
1696                };
1697
1698                // Parse size
1699                let size = if !size_field.is_empty() && size_field != "-" {
1700                    size_field.parse::<u64>().ok()
1701                } else {
1702                    None
1703                };
1704
1705                file_references.push(FileReference {
1706                    path,
1707                    size,
1708                    sha1: None,
1709                    md5: None,
1710                    sha256,
1711                    sha512: None,
1712                    extra_data: None,
1713                });
1714            }
1715            Err(e) => {
1716                warn!("Failed to parse RECORD CSV row: {}", e);
1717                continue;
1718            }
1719        }
1720    }
1721
1722    file_references
1723}
1724
1725/// Parses installed-files.txt format from egg archives (PEP 376).
1726/// Format: one file path per line, no headers, no hash, no size
1727pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1728    content
1729        .lines()
1730        .map(|line| line.trim())
1731        .filter(|line| !line.is_empty())
1732        .map(|path| FileReference {
1733            path: path.to_string(),
1734            size: None,
1735            sha1: None,
1736            md5: None,
1737            sha256: None,
1738            sha512: None,
1739            extra_data: None,
1740        })
1741        .collect()
1742}
1743
1744pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1745    content
1746        .lines()
1747        .map(str::trim)
1748        .filter(|line| !line.is_empty())
1749        .map(|path| FileReference {
1750            path: path.to_string(),
1751            size: None,
1752            sha1: None,
1753            md5: None,
1754            sha256: None,
1755            sha512: None,
1756            extra_data: None,
1757        })
1758        .collect()
1759}
1760
/// Components extracted from a wheel file name.
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component, as written in the file name.
    version: String,
    /// Python tag (e.g. `py3`, `cp39`).
    python_tag: String,
    /// ABI tag (e.g. `none`, `cp39`).
    abi_tag: String,
    /// Platform tag (e.g. `any`, `linux_x86_64`).
    platform_tag: String,
}

/// Splits a wheel file name into its components.
///
/// The expected layout is
/// `{name}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl`.
/// Returns `None` when the path has no file stem or the stem has fewer than five
/// `-`-separated components.
///
/// The optional build tag is recognized (and skipped) when there are at least six
/// components and the third one starts with an ASCII digit. Any components after the ABI tag
/// are joined with `-` into the platform tag.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    // Build tags must start with a digit, which python tags never do. Without this check
    // the build tag was taken as the python tag and every later tag was shifted by one.
    let has_build_tag =
        parts.len() >= 6 && parts[2].starts_with(|ch: char| ch.is_ascii_digit());
    let tags_start = if has_build_tag { 3 } else { 2 };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: parts[tags_start].to_string(),
        abi_tag: parts[tags_start + 1].to_string(),
        platform_tag: parts[tags_start + 2..].join("-"),
    })
}
1785
/// Components extracted from an egg file name.
struct EggInfo {
    /// First `-`-separated component with `_` replaced by `-`.
    name: String,
    /// Second `-`-separated component.
    version: String,
    /// Third `-`-separated component, when present.
    python_version: Option<String>,
}

/// Splits an egg file name (without its extension) on `-` into name, version and an optional
/// python version. Returns `None` when the path has no file stem or the stem has fewer than
/// two components. Components beyond the third are ignored.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut fields = stem.split('-');

    let raw_name = fields.next()?;
    let raw_version = fields.next()?;
    let python_version = fields.next().map(str::to_string);

    Some(EggInfo {
        name: raw_name.replace('_', "-"),
        version: raw_version.to_string(),
        python_version,
    })
}
1806
1807fn build_wheel_purl(
1808    name: Option<&str>,
1809    version: Option<&str>,
1810    wheel_info: &WheelInfo,
1811) -> Option<String> {
1812    let name = name?;
1813    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1814
1815    if let Some(ver) = version {
1816        package_url.with_version(ver).ok()?;
1817    }
1818
1819    let extension = format!(
1820        "{}-{}-{}",
1821        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1822    );
1823    package_url.add_qualifier("extension", extension).ok()?;
1824
1825    Some(package_url.to_string())
1826}
1827
1828fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1829    let name = name?;
1830    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1831
1832    if let Some(ver) = version {
1833        package_url.with_version(ver).ok()?;
1834    }
1835
1836    package_url.add_qualifier("type", "egg").ok()?;
1837
1838    Some(package_url.to_string())
1839}
1840
1841fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1842    let metadata = super::rfc822::parse_rfc822_content(content);
1843    build_package_data_from_rfc822(&metadata, datasource_id)
1844}
1845
1846/// Builds PackageData from parsed RFC822 metadata.
1847///
1848/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
1849/// and `python_parse_rfc822_content` (content-based) functions.
1850fn build_package_data_from_rfc822(
1851    metadata: &super::rfc822::Rfc822Metadata,
1852    datasource_id: DatasourceId,
1853) -> PackageData {
1854    use super::rfc822::{get_header_all, get_header_first};
1855
1856    let name = get_header_first(&metadata.headers, "name").map(truncate_field);
1857    let version = get_header_first(&metadata.headers, "version").map(truncate_field);
1858    let summary = get_header_first(&metadata.headers, "summary").map(truncate_field);
1859    let mut homepage_url = get_header_first(&metadata.headers, "home-page").map(truncate_field);
1860    let author = get_header_first(&metadata.headers, "author").map(truncate_field);
1861    let author_email = get_header_first(&metadata.headers, "author-email").map(truncate_field);
1862    let license = get_header_first(&metadata.headers, "license").map(truncate_field);
1863    let license_expression = get_header_first(&metadata.headers, "license-expression");
1864    let download_url = get_header_first(&metadata.headers, "download-url");
1865    let platform = get_header_first(&metadata.headers, "platform");
1866    let requires_python = get_header_first(&metadata.headers, "requires-python");
1867    let classifiers = get_header_all(&metadata.headers, "classifier");
1868    let license_files = get_header_all(&metadata.headers, "license-file");
1869
1870    let description_body = if metadata.body.is_empty() {
1871        get_header_first(&metadata.headers, "description").unwrap_or_default()
1872    } else {
1873        metadata.body.clone()
1874    };
1875
1876    let description = build_description(summary.as_deref(), &description_body).map(truncate_field);
1877
1878    let mut parties = Vec::new();
1879    if author.is_some() || author_email.is_some() {
1880        parties.push(Party {
1881            r#type: Some("person".to_string()),
1882            role: Some("author".to_string()),
1883            name: author,
1884            email: author_email,
1885            url: None,
1886            organization: None,
1887            organization_url: None,
1888            timezone: None,
1889        });
1890    }
1891
1892    let (keywords, license_classifiers) = split_classifiers(&classifiers);
1893    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1894    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1895        license_expression
1896            .as_deref()
1897            .and_then(normalize_spdx_expression)
1898            .map(|normalized| {
1899                build_declared_license_data(
1900                    normalized,
1901                    DeclaredLicenseMatchMetadata::single_line(
1902                        license_expression.as_deref().unwrap_or_default(),
1903                    )
1904                    .with_referenced_filenames(&referenced_license_files),
1905                )
1906            })
1907            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1908
1909    let extracted_license_statement = license_expression
1910        .clone()
1911        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1912
1913    let mut extra_data = HashMap::new();
1914    if let Some(platform_value) = platform
1915        && !platform_value.eq_ignore_ascii_case("unknown")
1916        && !platform_value.is_empty()
1917    {
1918        extra_data.insert(
1919            "platform".to_string(),
1920            serde_json::Value::String(platform_value),
1921        );
1922    }
1923
1924    if let Some(requires_python_value) = requires_python
1925        && !requires_python_value.is_empty()
1926    {
1927        extra_data.insert(
1928            "requires_python".to_string(),
1929            serde_json::Value::String(requires_python_value),
1930        );
1931    }
1932
1933    if !license_files.is_empty() {
1934        extra_data.insert(
1935            "license_files".to_string(),
1936            serde_json::Value::Array(
1937                license_files
1938                    .iter()
1939                    .cloned()
1940                    .map(serde_json::Value::String)
1941                    .collect(),
1942            ),
1943        );
1944    }
1945
1946    let file_references = license_files
1947        .iter()
1948        .map(|path| FileReference {
1949            path: path.clone(),
1950            size: None,
1951            sha1: None,
1952            md5: None,
1953            sha256: None,
1954            sha512: None,
1955            extra_data: None,
1956        })
1957        .collect();
1958
1959    let project_urls = get_header_all(&metadata.headers, "project-url");
1960    let dependencies = extract_rfc822_dependencies(&metadata.headers);
1961    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1962
1963    if !project_urls.is_empty() {
1964        let parsed_urls = parse_project_urls(&project_urls);
1965
1966        for (label, url) in &parsed_urls {
1967            let label_lower = label.to_lowercase();
1968
1969            if bug_tracking_url.is_none()
1970                && matches!(
1971                    label_lower.as_str(),
1972                    "tracker"
1973                        | "bug reports"
1974                        | "bug tracker"
1975                        | "issues"
1976                        | "issue tracker"
1977                        | "github: issues"
1978                )
1979            {
1980                bug_tracking_url = Some(url.clone());
1981            } else if code_view_url.is_none()
1982                && matches!(label_lower.as_str(), "source" | "source code" | "code")
1983            {
1984                code_view_url = Some(url.clone());
1985            } else if vcs_url.is_none()
1986                && matches!(
1987                    label_lower.as_str(),
1988                    "github" | "gitlab" | "github: repo" | "repository"
1989                )
1990            {
1991                vcs_url = Some(url.clone());
1992            } else if homepage_url.is_none()
1993                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1994            {
1995                homepage_url = Some(url.clone());
1996            } else if label_lower == "changelog" {
1997                extra_data.insert(
1998                    "changelog_url".to_string(),
1999                    serde_json::Value::String(url.clone()),
2000                );
2001            }
2002        }
2003
2004        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
2005            .iter()
2006            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
2007            .collect();
2008
2009        if !project_urls_json.is_empty() {
2010            extra_data.insert(
2011                "project_urls".to_string(),
2012                serde_json::Value::Object(project_urls_json),
2013            );
2014        }
2015    }
2016
2017    let extra_data = if extra_data.is_empty() {
2018        None
2019    } else {
2020        Some(extra_data)
2021    };
2022
2023    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
2024        build_pypi_urls(name.as_deref(), version.as_deref());
2025
2026    PackageData {
2027        package_type: Some(PythonParser::PACKAGE_TYPE),
2028        namespace: None,
2029        name,
2030        version,
2031        qualifiers: None,
2032        subpath: None,
2033        primary_language: Some("Python".to_string()),
2034        description,
2035        release_date: None,
2036        parties,
2037        keywords,
2038        homepage_url,
2039        download_url,
2040        size: None,
2041        sha1: None,
2042        md5: None,
2043        sha256: None,
2044        sha512: None,
2045        bug_tracking_url,
2046        code_view_url,
2047        vcs_url,
2048        copyright: None,
2049        holder: None,
2050        declared_license_expression,
2051        declared_license_expression_spdx,
2052        license_detections,
2053        other_license_expression: None,
2054        other_license_expression_spdx: None,
2055        other_license_detections: Vec::new(),
2056        extracted_license_statement,
2057        notice_text: None,
2058        source_packages: Vec::new(),
2059        file_references,
2060        is_private: false,
2061        is_virtual: false,
2062        extra_data,
2063        dependencies,
2064        repository_homepage_url,
2065        repository_download_url,
2066        api_data_url,
2067        datasource_id: Some(datasource_id),
2068        purl,
2069    }
2070}
2071
/// Splits `Project-URL` header values into `(label, url)` pairs.
///
/// Each entry is expected to look like `Label, https://example.org`. The
/// entry is split at the first `", "`; when that separator is absent it is
/// split at the first bare `,` instead, so `Label,https://example.org` is
/// accepted as well. Both halves are trimmed, and entries with no comma or
/// with an empty label or URL are dropped.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|url_entry| {
            // Prefer the conventional ", " separator so existing inputs keep
            // their exact split point; fall back to a lone comma otherwise.
            let (label, url) = url_entry
                .split_once(", ")
                .or_else(|| url_entry.split_once(','))?;
            let (label, url) = (label.trim(), url.trim());

            (!label.is_empty() && !url.is_empty()).then(|| (label.to_string(), url.to_string()))
        })
        .collect()
}
2087
/// Combines an optional summary and a body into one description.
///
/// Both pieces are trimmed; pieces that are empty after trimming are left
/// out. The remaining pieces are joined with a newline, summary first.
/// Returns `None` when nothing remains.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let sections: Vec<&str> = summary
        .into_iter()
        .chain(std::iter::once(body))
        .map(str::trim)
        .filter(|section| !section.is_empty())
        .collect();

    if sections.is_empty() {
        return None;
    }
    Some(sections.join("\n"))
}
2106
/// Separates classifiers into `(keywords, license_classifiers)`.
///
/// A classifier starting with `License ::` goes to the second vector; every
/// other classifier goes to the first. Relative order is preserved in both.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|entry| entry.starts_with("License ::"));

    (keywords, license_classifiers)
}
2121
/// Renders the declared license and license classifiers as a small text block.
///
/// A non-blank `license` contributes a `license: <trimmed value>` line. A
/// non-empty classifier list contributes a `classifiers:` line followed by
/// one quoted `  - '<classifier>'` line per entry. Lines are newline-joined
/// and the result ends with a trailing newline. Returns `None` when there is
/// nothing to render.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let license_line = license
        .map(str::trim)
        .filter(|value| !value.is_empty())
        .map(|value| format!("license: {}", value));

    let classifier_header = (!license_classifiers.is_empty()).then(|| "classifiers:".to_string());
    let classifier_lines = license_classifiers
        .iter()
        .map(|classifier| format!("  - '{}'", classifier));

    let lines: Vec<String> = license_line
        .into_iter()
        .chain(classifier_header)
        .chain(classifier_lines)
        .collect();

    if lines.is_empty() {
        return None;
    }
    Some(format!("{}\n", lines.join("\n")))
}
2147
2148pub(crate) fn build_pypi_urls(
2149    name: Option<&str>,
2150    version: Option<&str>,
2151) -> (
2152    Option<String>,
2153    Option<String>,
2154    Option<String>,
2155    Option<String>,
2156) {
2157    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2158
2159    let repository_download_url = name.and_then(|value| {
2160        version.map(|ver| {
2161            format!(
2162                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2163                &value[..1.min(value.len())],
2164                value,
2165                value,
2166                ver
2167            )
2168        })
2169    });
2170
2171    let api_data_url = name.map(|value| {
2172        if let Some(ver) = version {
2173            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2174        } else {
2175            format!("https://pypi.org/pypi/{}/json", value)
2176        }
2177    });
2178
2179    let purl = name.and_then(|value| {
2180        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2181        if let Some(ver) = version {
2182            package_url.with_version(ver).ok()?;
2183        }
2184        Some(package_url.to_string())
2185    });
2186
2187    (
2188        repository_homepage_url,
2189        repository_download_url,
2190        api_data_url,
2191        purl,
2192    )
2193}
2194
2195fn build_pypi_purl_with_extension(
2196    name: &str,
2197    version: Option<&str>,
2198    extension: &str,
2199) -> Option<String> {
2200    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2201    if let Some(ver) = version {
2202        package_url.with_version(ver).ok()?;
2203    }
2204    package_url.add_qualifier("extension", extension).ok()?;
2205    Some(package_url.to_string())
2206}
2207
2208fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2209    let toml_content = match read_toml_file(path) {
2210        Ok(content) => content,
2211        Err(e) => {
2212            warn!(
2213                "Failed to read or parse pyproject.toml at {:?}: {}",
2214                path, e
2215            );
2216            return default_package_data(path);
2217        }
2218    };
2219
2220    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2221    let is_poetry_pyproject = tool_table
2222        .and_then(|tool| tool.get("poetry"))
2223        .and_then(|value| value.as_table())
2224        .is_some();
2225
2226    // Handle both PEP 621 (project table) and poetry formats
2227    let project_table =
2228        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2229            // Standard PEP 621 format with [project] table
2230            project.clone()
2231        } else if let Some(tool) = tool_table {
2232            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2233                // Poetry format with [tool.poetry] table
2234                poetry.clone()
2235            } else {
2236                return default_package_data(path);
2237            }
2238        } else if toml_content.get(FIELD_NAME).is_some() {
2239            // Other format with top-level fields
2240            match toml_content.as_table() {
2241                Some(table) => table.clone(),
2242                None => {
2243                    warn!("Failed to convert TOML content to table in {:?}", path);
2244                    return default_package_data(path);
2245                }
2246            }
2247        } else {
2248            return default_package_data(path);
2249        };
2250
2251    let name = project_table
2252        .get(FIELD_NAME)
2253        .and_then(|v| v.as_str())
2254        .map(|v| truncate_field(v.to_string()));
2255
2256    let version = project_table
2257        .get(FIELD_VERSION)
2258        .and_then(|v| v.as_str())
2259        .map(String::from);
2260    let classifiers = project_table
2261        .get("classifiers")
2262        .and_then(|value| value.as_array())
2263        .map(|values| {
2264            values
2265                .iter()
2266                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2267                .collect::<Vec<_>>()
2268        })
2269        .unwrap_or_default();
2270    let (classifier_keywords, license_classifiers) = split_classifiers(&classifiers);
2271
2272    let extracted_license_statement = extract_raw_license_string(&project_table);
2273    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2274        normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2275
2276    let description = project_table
2277        .get(FIELD_DESCRIPTION)
2278        .and_then(|value| value.as_str())
2279        .map(|value| truncate_field(value.to_string()));
2280    let mut keywords = project_table
2281        .get(FIELD_KEYWORDS)
2282        .and_then(|value| value.as_array())
2283        .map(|values| {
2284            values
2285                .iter()
2286                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2287                .collect::<Vec<_>>()
2288        })
2289        .unwrap_or_default();
2290    for classifier in classifier_keywords {
2291        if !keywords.contains(&classifier) {
2292            keywords.push(classifier);
2293        }
2294    }
2295
2296    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
2297    let mut extra_data = extract_pyproject_extra_data(&toml_content).unwrap_or_default();
2298    let (homepage_url, download_url, bug_tracking_url, code_view_url, repository_url) =
2299        extract_urls(&project_table, &mut extra_data);
2300
2301    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2302
2303    // Create package URL
2304    let purl = name.as_ref().and_then(|n| {
2305        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2306            Ok(p) => p,
2307            Err(e) => {
2308                warn!(
2309                    "Failed to create PackageUrl for Python package '{}': {}",
2310                    n, e
2311                );
2312                return None;
2313            }
2314        };
2315
2316        if let Some(v) = &version
2317            && let Err(e) = package_url.with_version(v)
2318        {
2319            warn!(
2320                "Failed to set version '{}' for Python package '{}': {}",
2321                v, n, e
2322            );
2323            return None;
2324        }
2325
2326        Some(package_url.to_string())
2327    });
2328
2329    let api_data_url = name.as_ref().map(|n| {
2330        if let Some(v) = &version {
2331            format!("https://pypi.org/pypi/{}/{}/json", n, v)
2332        } else {
2333            format!("https://pypi.org/pypi/{}/json", n)
2334        }
2335    });
2336
2337    let pypi_homepage_url = name
2338        .as_ref()
2339        .map(|n| format!("https://pypi.org/project/{}", n));
2340
2341    let pypi_download_url = name.as_ref().and_then(|n| {
2342        version.as_ref().map(|v| {
2343            format!(
2344                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2345                &n[..1.min(n.len())],
2346                n,
2347                n,
2348                v
2349            )
2350        })
2351    });
2352
2353    PackageData {
2354        package_type: Some(PythonParser::PACKAGE_TYPE),
2355        namespace: None,
2356        name,
2357        version,
2358        qualifiers: None,
2359        subpath: None,
2360        primary_language: None,
2361        description,
2362        release_date: None,
2363        parties: extract_parties(&project_table),
2364        keywords,
2365        homepage_url: homepage_url.or(pypi_homepage_url),
2366        download_url: download_url
2367            .or_else(|| repository_url.clone())
2368            .or(pypi_download_url),
2369        size: None,
2370        sha1: None,
2371        md5: None,
2372        sha256: None,
2373        sha512: None,
2374        bug_tracking_url,
2375        code_view_url,
2376        vcs_url: repository_url,
2377        copyright: None,
2378        holder: None,
2379        declared_license_expression,
2380        declared_license_expression_spdx,
2381        license_detections,
2382        other_license_expression: None,
2383        other_license_expression_spdx: None,
2384        other_license_detections: Vec::new(),
2385        extracted_license_statement: extracted_license_statement
2386            .or_else(|| build_extracted_license_statement(None, &license_classifiers)),
2387        notice_text: None,
2388        source_packages: Vec::new(),
2389        file_references: Vec::new(),
2390        is_private: has_private_classifier(&classifiers),
2391        is_virtual: false,
2392        extra_data: if extra_data.is_empty() {
2393            None
2394        } else {
2395            Some(extra_data)
2396        },
2397        dependencies: [dependencies, optional_dependencies].concat(),
2398        repository_homepage_url: None,
2399        repository_download_url: None,
2400        api_data_url,
2401        datasource_id: Some(if is_poetry_pyproject {
2402            DatasourceId::PypiPoetryPyprojectToml
2403        } else {
2404            DatasourceId::PypiPyprojectToml
2405        }),
2406        purl,
2407    }
2408}
2409
2410fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2411    let path_str = path.to_string_lossy().replace('\\', "/");
2412    if path_str.contains("/EGG-INFO/PKG-INFO") {
2413        DatasourceId::PypiEggPkginfo
2414    } else if path_str.ends_with(".egg-info/PKG-INFO") {
2415        DatasourceId::PypiEditableEggPkginfo
2416    } else {
2417        DatasourceId::PypiSdistPkginfo
2418    }
2419}
2420
2421fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2422    project
2423        .get(FIELD_LICENSE)
2424        .and_then(|license_value| match license_value {
2425            TomlValue::String(license_str) => Some(license_str.clone()),
2426            TomlValue::Table(license_table) => license_table
2427                .get("text")
2428                .and_then(|v| v.as_str())
2429                .map(|s| s.to_string())
2430                .or_else(|| {
2431                    license_table
2432                        .get("expression")
2433                        .and_then(|v| v.as_str())
2434                        .map(|expr| expr.to_string())
2435                }),
2436            _ => None,
2437        })
2438}
2439
2440fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2441    match project.get(FIELD_LICENSE) {
2442        Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2443        Some(TomlValue::Table(license_table)) => license_table
2444            .get("expression")
2445            .and_then(|value| value.as_str()),
2446        _ => None,
2447    }
2448}
2449
2450fn extract_urls(
2451    project: &TomlMap<String, TomlValue>,
2452    extra_data: &mut HashMap<String, serde_json::Value>,
2453) -> ProjectUrls {
2454    let mut homepage_url = None;
2455    let mut download_url = None;
2456    let mut bug_tracking_url = None;
2457    let mut code_view_url = None;
2458    let mut repository_url = None;
2459
2460    // Check for URLs table
2461    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2462        let parsed_urls: Vec<(String, String)> = urls
2463            .iter()
2464            .filter_map(|(label, value)| {
2465                value
2466                    .as_str()
2467                    .map(|url| (label.to_string(), url.to_string()))
2468            })
2469            .collect();
2470        apply_project_url_mappings(
2471            &parsed_urls,
2472            &mut homepage_url,
2473            &mut bug_tracking_url,
2474            &mut code_view_url,
2475            &mut repository_url,
2476            extra_data,
2477        );
2478
2479        download_url = urls
2480            .get("Downloads")
2481            .or_else(|| urls.get("downloads"))
2482            .and_then(|v| v.as_str())
2483            .map(String::from);
2484
2485        if homepage_url.is_none() {
2486            homepage_url = urls
2487                .get(FIELD_HOMEPAGE)
2488                .and_then(|v| v.as_str())
2489                .map(String::from);
2490        }
2491        if repository_url.is_none() {
2492            repository_url = urls
2493                .get(FIELD_REPOSITORY)
2494                .and_then(|v| v.as_str())
2495                .map(String::from);
2496        }
2497    }
2498
2499    // If not found in URLs table, check for top-level keys
2500    if homepage_url.is_none() {
2501        homepage_url = project
2502            .get(FIELD_HOMEPAGE)
2503            .and_then(|v| v.as_str())
2504            .map(String::from);
2505    }
2506
2507    if repository_url.is_none() {
2508        repository_url = project
2509            .get(FIELD_REPOSITORY)
2510            .and_then(|v| v.as_str())
2511            .map(String::from);
2512    }
2513
2514    (
2515        homepage_url,
2516        download_url,
2517        bug_tracking_url,
2518        code_view_url,
2519        repository_url,
2520    )
2521}
2522
2523fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2524    let mut parties = Vec::new();
2525
2526    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2527        for author in authors {
2528            if let Some(author_str) = author.as_str() {
2529                let (name, email) = split_name_email(author_str);
2530                parties.push(Party {
2531                    r#type: None,
2532                    role: Some("author".to_string()),
2533                    name,
2534                    email,
2535                    url: None,
2536                    organization: None,
2537                    organization_url: None,
2538                    timezone: None,
2539                });
2540            } else if let Some(author_table) = author.as_table() {
2541                let name = author_table
2542                    .get("name")
2543                    .and_then(|value| value.as_str())
2544                    .map(|value| value.to_string());
2545                let email = author_table
2546                    .get("email")
2547                    .and_then(|value| value.as_str())
2548                    .map(|value| value.to_string());
2549                if name.is_some() || email.is_some() {
2550                    parties.push(Party {
2551                        r#type: None,
2552                        role: Some("author".to_string()),
2553                        name,
2554                        email,
2555                        url: None,
2556                        organization: None,
2557                        organization_url: None,
2558                        timezone: None,
2559                    });
2560                }
2561            }
2562        }
2563    }
2564
2565    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2566        for maintainer in maintainers {
2567            if let Some(maintainer_str) = maintainer.as_str() {
2568                let (name, email) = split_name_email(maintainer_str);
2569                parties.push(Party {
2570                    r#type: None,
2571                    role: Some("maintainer".to_string()),
2572                    name,
2573                    email,
2574                    url: None,
2575                    organization: None,
2576                    organization_url: None,
2577                    timezone: None,
2578                });
2579            } else if let Some(maintainer_table) = maintainer.as_table() {
2580                let name = maintainer_table
2581                    .get("name")
2582                    .and_then(|value| value.as_str())
2583                    .map(|value| value.to_string());
2584                let email = maintainer_table
2585                    .get("email")
2586                    .and_then(|value| value.as_str())
2587                    .map(|value| value.to_string());
2588                if name.is_some() || email.is_some() {
2589                    parties.push(Party {
2590                        r#type: None,
2591                        role: Some("maintainer".to_string()),
2592                        name,
2593                        email,
2594                        url: None,
2595                        organization: None,
2596                        organization_url: None,
2597                        timezone: None,
2598                    });
2599                }
2600            }
2601        }
2602    }
2603
2604    parties
2605}
2606
2607fn extract_dependencies(
2608    project: &TomlMap<String, TomlValue>,
2609    toml_content: &TomlValue,
2610) -> (Vec<Dependency>, Vec<Dependency>) {
2611    let mut dependencies = Vec::new();
2612    let mut optional_dependencies = Vec::new();
2613
2614    // Handle dependencies - can be array or table format
2615    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2616        match deps_value {
2617            TomlValue::Array(arr) => {
2618                dependencies = parse_dependency_array(arr, false, None);
2619            }
2620            TomlValue::Table(table) => {
2621                dependencies = parse_dependency_table(table, false, None);
2622            }
2623            _ => {}
2624        }
2625    }
2626
2627    // Handle PEP 621 optional-dependencies with scope
2628    if let Some(opt_deps_table) = project
2629        .get(FIELD_OPTIONAL_DEPENDENCIES)
2630        .and_then(|v| v.as_table())
2631    {
2632        for (extra_name, deps) in opt_deps_table {
2633            match deps {
2634                TomlValue::Array(arr) => {
2635                    optional_dependencies.extend(parse_dependency_array(
2636                        arr,
2637                        true,
2638                        Some(extra_name),
2639                    ));
2640                }
2641                TomlValue::Table(table) => {
2642                    optional_dependencies.extend(parse_dependency_table(
2643                        table,
2644                        true,
2645                        Some(extra_name),
2646                    ));
2647                }
2648                _ => {}
2649            }
2650        }
2651    }
2652
2653    // Handle Poetry dev-dependencies
2654    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2655        match dev_deps_value {
2656            TomlValue::Array(arr) => {
2657                optional_dependencies.extend(parse_dependency_array(
2658                    arr,
2659                    true,
2660                    Some(FIELD_DEV_DEPENDENCIES),
2661                ));
2662            }
2663            TomlValue::Table(table) => {
2664                optional_dependencies.extend(parse_dependency_table(
2665                    table,
2666                    true,
2667                    Some(FIELD_DEV_DEPENDENCIES),
2668                ));
2669            }
2670            _ => {}
2671        }
2672    }
2673
2674    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
2675    if let Some(groups_table) = toml_content
2676        .get("tool")
2677        .and_then(|value| value.as_table())
2678        .and_then(|tool| tool.get("poetry"))
2679        .and_then(|value| value.as_table())
2680        .and_then(|poetry| poetry.get("group"))
2681        .and_then(|value| value.as_table())
2682    {
2683        for (group_name, group_data) in groups_table {
2684            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2685                match group_deps {
2686                    TomlValue::Array(arr) => {
2687                        optional_dependencies.extend(parse_dependency_array(
2688                            arr,
2689                            true,
2690                            Some(group_name),
2691                        ));
2692                    }
2693                    TomlValue::Table(table) => {
2694                        optional_dependencies.extend(parse_poetry_group_dependency_table(
2695                            table,
2696                            true,
2697                            Some(group_name),
2698                        ));
2699                    }
2700                    _ => {}
2701                }
2702            }
2703        }
2704    }
2705
2706    if let Some(groups_table) = toml_content
2707        .get(FIELD_DEPENDENCY_GROUPS)
2708        .and_then(|value| value.as_table())
2709    {
2710        for (group_name, deps) in groups_table {
2711            match deps {
2712                TomlValue::Array(arr) => {
2713                    optional_dependencies.extend(parse_dependency_array(
2714                        arr,
2715                        true,
2716                        Some(group_name),
2717                    ));
2718                }
2719                TomlValue::Table(table) => {
2720                    optional_dependencies.extend(parse_dependency_table(
2721                        table,
2722                        true,
2723                        Some(group_name),
2724                    ));
2725                }
2726                _ => {}
2727            }
2728        }
2729    }
2730
2731    if let Some(dev_deps_value) = toml_content
2732        .get("tool")
2733        .and_then(|value| value.as_table())
2734        .and_then(|tool| tool.get("uv"))
2735        .and_then(|value| value.as_table())
2736        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2737    {
2738        match dev_deps_value {
2739            TomlValue::Array(arr) => {
2740                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2741            }
2742            TomlValue::Table(table) => {
2743                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2744            }
2745            _ => {}
2746        }
2747    }
2748
2749    (dependencies, optional_dependencies)
2750}
2751
2752fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2753    let mut extra_data = HashMap::new();
2754
2755    if let Some(tool_uv) = toml_content
2756        .get("tool")
2757        .and_then(|value| value.as_table())
2758        .and_then(|tool| tool.get("uv"))
2759    {
2760        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2761    }
2762
2763    if extra_data.is_empty() {
2764        None
2765    } else {
2766        Some(extra_data)
2767    }
2768}
2769
2770fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2771    match value {
2772        TomlValue::String(value) => JsonValue::String(value.clone()),
2773        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2774        TomlValue::Float(value) => JsonValue::String(value.to_string()),
2775        TomlValue::Boolean(value) => JsonValue::Bool(*value),
2776        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2777        TomlValue::Array(values) => {
2778            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2779        }
2780        TomlValue::Table(values) => JsonValue::Object(
2781            values
2782                .iter()
2783                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2784                .collect::<JsonMap<String, JsonValue>>(),
2785        ),
2786    }
2787}
2788
2789fn parse_dependency_table(
2790    table: &TomlMap<String, TomlValue>,
2791    is_optional: bool,
2792    scope: Option<&str>,
2793) -> Vec<Dependency> {
2794    table
2795        .iter()
2796        .filter_map(|(name, version)| {
2797            let version_str = version.as_str().map(|s| s.to_string());
2798            let mut package_url =
2799                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2800
2801            if let Some(v) = &version_str {
2802                package_url.with_version(v).ok()?;
2803            }
2804
2805            Some(Dependency {
2806                purl: Some(package_url.to_string()),
2807                extracted_requirement: None,
2808                scope: scope.map(|s| s.to_string()),
2809                is_runtime: Some(!is_optional),
2810                is_optional: Some(is_optional),
2811                is_pinned: None,
2812                is_direct: Some(true),
2813                resolved_package: None,
2814                extra_data: None,
2815            })
2816        })
2817        .collect()
2818}
2819
2820fn parse_poetry_group_dependency_table(
2821    table: &TomlMap<String, TomlValue>,
2822    is_optional: bool,
2823    scope: Option<&str>,
2824) -> Vec<Dependency> {
2825    table
2826        .iter()
2827        .filter_map(|(name, value)| build_poetry_group_dependency(name, value, is_optional, scope))
2828        .collect()
2829}
2830
2831fn build_poetry_group_dependency(
2832    name: &str,
2833    value: &TomlValue,
2834    is_optional: bool,
2835    scope: Option<&str>,
2836) -> Option<Dependency> {
2837    let normalized_name = normalize_python_dependency_name(name);
2838    let (version_spec, extras, marker) = match value {
2839        TomlValue::String(spec) => (Some(spec.trim().to_string()), Vec::new(), None),
2840        TomlValue::Table(table) => {
2841            let version_spec = table
2842                .get(FIELD_VERSION)
2843                .and_then(|value| value.as_str())
2844                .map(str::trim)
2845                .filter(|value| !value.is_empty())
2846                .map(ToOwned::to_owned);
2847            let extras = table
2848                .get(FIELD_EXTRAS)
2849                .and_then(|value| value.as_array())
2850                .map(|values| {
2851                    values
2852                        .iter()
2853                        .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2854                        .collect::<Vec<_>>()
2855                })
2856                .unwrap_or_default();
2857            let marker = table
2858                .get("markers")
2859                .and_then(|value| value.as_str())
2860                .map(str::trim)
2861                .filter(|value| !value.is_empty())
2862                .map(ToOwned::to_owned);
2863
2864            (version_spec, extras, marker)
2865        }
2866        _ => return None,
2867    };
2868
2869    let pinned_version = version_spec
2870        .as_deref()
2871        .and_then(extract_exact_pinned_version);
2872    let purl = build_python_dependency_purl(&normalized_name, pinned_version.as_deref())?;
2873
2874    let mut extra_data = HashMap::new();
2875    if let Some(marker) = marker {
2876        extra_data.insert("marker".to_string(), JsonValue::String(marker));
2877    }
2878    if !extras.is_empty() {
2879        extra_data.insert(
2880            "extras".to_string(),
2881            JsonValue::Array(extras.into_iter().map(JsonValue::String).collect()),
2882        );
2883    }
2884
2885    Some(Dependency {
2886        purl: Some(purl),
2887        extracted_requirement: version_spec,
2888        scope: scope.map(|value| value.to_string()),
2889        is_runtime: Some(!is_optional),
2890        is_optional: Some(is_optional),
2891        is_pinned: Some(pinned_version.is_some()),
2892        is_direct: Some(true),
2893        resolved_package: None,
2894        extra_data: if extra_data.is_empty() {
2895            None
2896        } else {
2897            Some(extra_data)
2898        },
2899    })
2900}
2901
2902fn parse_dependency_array(
2903    array: &[TomlValue],
2904    is_optional: bool,
2905    scope: Option<&str>,
2906) -> Vec<Dependency> {
2907    array
2908        .iter()
2909        .filter_map(|dep| {
2910            let dep_str = dep.as_str()?;
2911            build_pyproject_array_dependency(dep_str, is_optional, scope)
2912        })
2913        .collect()
2914}
2915
2916fn build_pyproject_array_dependency(
2917    dep_str: &str,
2918    is_optional: bool,
2919    scope: Option<&str>,
2920) -> Option<Dependency> {
2921    let parsed = parse_pep508_requirement(dep_str)?;
2922    let name = normalize_python_package_name(&parsed.name);
2923    let pinned_version = parsed
2924        .specifiers
2925        .as_deref()
2926        .and_then(extract_exact_pinned_version);
2927
2928    let purl = build_python_dependency_purl(&name, pinned_version.as_deref())?;
2929
2930    let mut extra_data = HashMap::new();
2931    if let Some(marker) = parsed.marker {
2932        extra_data.insert("marker".to_string(), JsonValue::String(marker));
2933    }
2934    if !parsed.extras.is_empty() {
2935        extra_data.insert(
2936            "extras".to_string(),
2937            JsonValue::Array(parsed.extras.into_iter().map(JsonValue::String).collect()),
2938        );
2939    }
2940
2941    let extracted_requirement = parsed.specifiers.or(parsed.url);
2942
2943    Some(Dependency {
2944        purl: Some(purl),
2945        extracted_requirement: extracted_requirement.clone(),
2946        scope: scope.map(|s| s.to_string()),
2947        is_runtime: Some(!is_optional),
2948        is_optional: Some(is_optional),
2949        is_pinned: Some(pinned_version.is_some()),
2950        is_direct: Some(true),
2951        resolved_package: None,
2952        extra_data: if extra_data.is_empty() {
2953            None
2954        } else {
2955            Some(extra_data)
2956        },
2957    })
2958}
2959
/// Returns the version named by a specifier that pins exactly one release.
///
/// Accepts a single `==<version>` or `===<version>` clause, with optional
/// surrounding whitespace. Returns `None` for:
///
/// - compound specifiers (anything containing a comma),
/// - any other operator (`>=`, `~=`, ...),
/// - an empty version,
/// - `==` clauses using PEP 440 prefix matching (`==1.4.*`), which select a
///   range of releases rather than a single one.
fn extract_exact_pinned_version(specifiers: &str) -> Option<String> {
    let trimmed = specifiers.trim();
    if trimmed.contains(',') {
        return None;
    }

    // `===` must be tested first because such a clause also starts with `==`.
    let version = if let Some(arbitrary) = trimmed.strip_prefix("===") {
        arbitrary.trim()
    } else {
        let candidate = trimmed.strip_prefix("==")?.trim();
        // A wildcard never appears in a concrete version, so `==X.*` is a
        // prefix match and must not be reported as a pin.
        if candidate.contains('*') {
            return None;
        }
        candidate
    };

    if version.is_empty() {
        None
    } else {
        Some(version.to_string())
    }
}
2981
/// A Python literal value produced by [`LiteralEvaluator`] while statically
/// evaluating setup.py expressions.
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// A numeric literal; integers and floats are both stored as `f64`.
    Number(f64),
    /// `True` or `False`.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list literal with its evaluated elements.
    List(Vec<Value>),
    /// A tuple literal with its evaluated elements.
    Tuple(Vec<Value>),
    /// A dict with string keys, built from a dict literal, a keyword-only
    /// `dict(...)` call or an `OrderedDict(...)` call. Insertion order is not
    /// preserved.
    Dict(HashMap<String, Value>),
}
2992
2993struct LiteralEvaluator {
2994    constants: HashMap<String, Value>,
2995    max_depth: usize,
2996    max_nodes: usize,
2997    nodes_visited: usize,
2998}
2999
3000impl LiteralEvaluator {
3001    fn new(constants: HashMap<String, Value>) -> Self {
3002        Self {
3003            constants,
3004            max_depth: MAX_SETUP_PY_AST_DEPTH,
3005            max_nodes: MAX_SETUP_PY_AST_NODES,
3006            nodes_visited: 0,
3007        }
3008    }
3009
3010    fn insert_constant(&mut self, name: String, value: Value) {
3011        self.constants.insert(name, value);
3012    }
3013
3014    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
3015        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
3016            return None;
3017        }
3018        self.nodes_visited += 1;
3019
3020        match expr {
3021            ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
3022                Some(Value::String(value.to_str().to_string()))
3023            }
3024            ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
3025                Some(Value::Bool(*value))
3026            }
3027            ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
3028                self.evaluate_number(value)
3029            }
3030            ast::Expr::NoneLiteral(_) => Some(Value::None),
3031            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
3032            ast::Expr::List(ast::ExprList { elts, .. }) => {
3033                let mut values = Vec::new();
3034                for elt in elts {
3035                    values.push(self.evaluate_expr(elt, depth + 1)?);
3036                }
3037                Some(Value::List(values))
3038            }
3039            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
3040                let mut values = Vec::new();
3041                for elt in elts {
3042                    values.push(self.evaluate_expr(elt, depth + 1)?);
3043                }
3044                Some(Value::Tuple(values))
3045            }
3046            ast::Expr::Dict(ast::ExprDict { items, .. }) => {
3047                let mut dict = HashMap::new();
3048                for item in items {
3049                    let key_expr = item.key.as_ref()?;
3050                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
3051                    let key = value_to_string(&key_value)?;
3052                    let value = self.evaluate_expr(&item.value, depth + 1)?;
3053                    dict.insert(key, value);
3054                }
3055                Some(Value::Dict(dict))
3056            }
3057            ast::Expr::Call(ast::ExprCall {
3058                func, arguments, ..
3059            }) => {
3060                let args = arguments.args.as_ref();
3061                let keywords = arguments.keywords.as_ref();
3062                if keywords.is_empty()
3063                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
3064                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
3065                {
3066                    return self.evaluate_ordered_dict(args, depth + 1);
3067                }
3068
3069                if !args.is_empty() {
3070                    return None;
3071                }
3072
3073                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
3074                    && id == "dict"
3075                {
3076                    let mut dict = HashMap::new();
3077                    for keyword in keywords {
3078                        let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
3079                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
3080                        dict.insert(key.to_string(), value);
3081                    }
3082                    return Some(Value::Dict(dict));
3083                }
3084
3085                None
3086            }
3087            _ => None,
3088        }
3089    }
3090
3091    fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
3092        match number {
3093            ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
3094            ast::Number::Float(value) => Some(Value::Number(*value)),
3095            ast::Number::Complex { .. } => None,
3096        }
3097    }
3098
3099    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
3100        if args.len() != 1 {
3101            return None;
3102        }
3103
3104        let items = match self.evaluate_expr(&args[0], depth)? {
3105            Value::List(items) | Value::Tuple(items) => items,
3106            _ => return None,
3107        };
3108
3109        let mut dict = HashMap::new();
3110        for item in items {
3111            let Value::Tuple(values) = item else {
3112                return None;
3113            };
3114            if values.len() != 2 {
3115                return None;
3116            }
3117            let key = value_to_string(&values[0])?;
3118            dict.insert(key, values[1].clone());
3119        }
3120
3121        Some(Value::Dict(dict))
3122    }
3123}
3124
/// Names under which `setup` and the modules providing it are visible in a
/// setup.py file.
#[derive(Default)]
struct SetupAliases {
    /// Local names bound to the `setup` function, e.g. `setup` itself or the
    /// alias from `from setuptools import setup as <alias>`.
    setup_names: HashSet<String>,
    /// Maps a local module alias to the imported setup module name, e.g.
    /// `st -> setuptools` for `import setuptools as st`.
    module_aliases: HashMap<String, String>,
}
3130
3131fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
3132    extract_from_setup_py(path).into_iter().collect()
3133}
3134
3135fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
3136    let content = match read_file_to_string(path, None) {
3137        Ok(content) => content,
3138        Err(e) => {
3139            warn!("Failed to read setup.py at {:?}: {}", path, e);
3140            return Some(default_package_data(path));
3141        }
3142    };
3143
3144    if content.len() > MAX_SETUP_PY_BYTES {
3145        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
3146        let package_data = extract_from_setup_py_regex(&content);
3147        return should_emit_setup_py_package(&package_data).then_some(package_data);
3148    }
3149
3150    let mut package_data = match extract_from_setup_py_ast(&content) {
3151        Ok(Some(data)) => data,
3152        Ok(None) => return Some(default_package_data(path)),
3153        Err(e) => {
3154            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
3155            extract_from_setup_py_regex(&content)
3156        }
3157    };
3158
3159    if package_data.name.is_none() {
3160        package_data.name = extract_setup_value(&content, "name");
3161    }
3162
3163    if package_data.version.is_none() {
3164        package_data.version = extract_setup_value(&content, "version");
3165    }
3166
3167    if package_data
3168        .version
3169        .as_deref()
3170        .is_some_and(|version| version.trim().is_empty())
3171    {
3172        package_data.version = None;
3173    }
3174
3175    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
3176    package_data.purl = build_setup_py_purl(
3177        package_data.name.as_deref(),
3178        package_data.version.as_deref(),
3179    );
3180
3181    if should_emit_setup_py_package(&package_data) {
3182        Some(package_data)
3183    } else {
3184        Some(default_package_data(path))
3185    }
3186}
3187
3188fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
3189    package_data.name.is_some()
3190        || package_data.version.is_some()
3191        || package_data.purl.is_some()
3192        || !package_data.dependencies.is_empty()
3193        || package_data.extracted_license_statement.is_some()
3194        || !package_data.license_detections.is_empty()
3195        || !package_data.parties.is_empty()
3196        || package_data.description.is_some()
3197        || package_data.homepage_url.is_some()
3198        || package_data.bug_tracking_url.is_some()
3199        || package_data.code_view_url.is_some()
3200        || package_data.vcs_url.is_some()
3201}
3202
3203fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
3204    if package_data.version.is_some()
3205        && package_data.extracted_license_statement.is_some()
3206        && package_data
3207            .parties
3208            .iter()
3209            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
3210    {
3211        return;
3212    }
3213
3214    let Some(root) = path.parent() else {
3215        return;
3216    };
3217
3218    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
3219
3220    if package_data.version.is_none() {
3221        package_data.version = dunder_metadata.version;
3222    }
3223
3224    if package_data.extracted_license_statement.is_none() {
3225        package_data.extracted_license_statement = dunder_metadata.license;
3226    }
3227
3228    let has_author = package_data
3229        .parties
3230        .iter()
3231        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
3232
3233    if !has_author && let Some(author) = dunder_metadata.author {
3234        package_data.parties.push(Party {
3235            r#type: Some("person".to_string()),
3236            role: Some("author".to_string()),
3237            name: Some(author),
3238            email: None,
3239            url: None,
3240            organization: None,
3241            organization_url: None,
3242            timezone: None,
3243        });
3244    }
3245}
3246
/// Values of module-level dunder assignments gathered from files next to a
/// setup.py; each field stays `None` until a matching assignment is found.
#[derive(Default)]
struct DunderMetadata {
    /// Value of `__version__`.
    version: Option<String>,
    /// Value of `__author__`.
    author: Option<String>,
    /// Value of `__license__`.
    license: Option<String>,
}
3253
3254fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
3255    let statements = match parse_module(content) {
3256        Ok(parsed) => parsed.into_suite(),
3257        Err(_) => return DunderMetadata::default(),
3258    };
3259
3260    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3261    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3262    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3263    let mut metadata = DunderMetadata::default();
3264    let mut candidate_paths = Vec::new();
3265
3266    for module in imported_dunder_modules(&statements) {
3267        let Some(path) = resolve_imported_module_path(root, &module) else {
3268            continue;
3269        };
3270
3271        candidate_paths.push(path);
3272    }
3273
3274    candidate_paths.extend(referenced_dunder_init_paths(root, content));
3275
3276    let mut seen_paths = HashSet::new();
3277    for path in candidate_paths {
3278        if !seen_paths.insert(path.clone()) {
3279            continue;
3280        }
3281
3282        let Ok(module_content) = read_file_to_string(&path, None) else {
3283            continue;
3284        };
3285
3286        if metadata.version.is_none() {
3287            metadata.version = version_re
3288                .as_ref()
3289                .and_then(|regex| regex.captures(&module_content))
3290                .and_then(|captures| captures.get(1))
3291                .map(|match_| match_.as_str().to_string());
3292        }
3293
3294        if metadata.author.is_none() {
3295            metadata.author = author_re
3296                .as_ref()
3297                .and_then(|regex| regex.captures(&module_content))
3298                .and_then(|captures| captures.get(1))
3299                .map(|match_| match_.as_str().to_string());
3300        }
3301
3302        if metadata.license.is_none() {
3303            metadata.license = license_re
3304                .as_ref()
3305                .and_then(|regex| regex.captures(&module_content))
3306                .and_then(|captures| captures.get(1))
3307                .map(|match_| match_.as_str().to_string());
3308        }
3309
3310        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
3311            return metadata;
3312        }
3313    }
3314
3315    metadata
3316}
3317
3318fn referenced_dunder_init_paths(root: &Path, content: &str) -> Vec<PathBuf> {
3319    let open_re = match Regex::new(r#"open\(\s*['\"]([^'\"]+__init__\.py)['\"]"#) {
3320        Ok(regex) => regex,
3321        Err(_) => return Vec::new(),
3322    };
3323
3324    open_re
3325        .captures_iter(content)
3326        .filter_map(|captures| captures.get(1).map(|m| m.as_str()))
3327        .filter_map(|relative| {
3328            let relative_path = PathBuf::from(relative);
3329            if relative_path.is_absolute()
3330                || relative_path.components().any(|component| {
3331                    matches!(
3332                        component,
3333                        Component::ParentDir | Component::RootDir | Component::Prefix(_)
3334                    )
3335                })
3336            {
3337                return None;
3338            }
3339
3340            let candidate = root.join(relative_path);
3341            candidate.exists().then_some(candidate)
3342        })
3343        .collect()
3344}
3345
3346fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
3347    let mut modules = Vec::new();
3348
3349    for statement in statements {
3350        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
3351            continue;
3352        };
3353        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
3354            continue;
3355        };
3356        let imports_dunder = names.iter().any(|alias| {
3357            matches!(
3358                alias.name.as_str(),
3359                "__version__" | "__author__" | "__license__"
3360            )
3361        });
3362        if imports_dunder {
3363            modules.push(module.to_string());
3364        }
3365    }
3366
3367    modules
3368}
3369
/// Resolves a dotted module name to an existing file below `root`.
///
/// Candidates are tried in this order and the first existing one is returned:
/// `<root>/<module>.py`, `<root>/<module>/__init__.py`,
/// `<root>/src/<module>.py`, `<root>/src/<module>/__init__.py`
/// (with dots in `module` turned into path separators).
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let mut package_dir = PathBuf::new();
    for part in module.split('.') {
        package_dir.push(part);
    }
    let module_file = package_dir.with_extension("py");
    let src_root = root.join("src");

    [
        root.join(&module_file),
        root.join(&package_dir).join("__init__.py"),
        src_root.join(&module_file),
        src_root.join(&package_dir).join("__init__.py"),
    ]
    .into_iter()
    .find(|candidate| candidate.exists())
}
3381
3382/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
3383///
3384/// # Security Model
3385///
3386/// This function parses setup.py as a Python AST and evaluates only literal values
3387/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
3388/// arbitrary code execution during scanning.
3389///
3390/// # DoS Prevention
3391///
3392/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
3393/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
3394/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
3395///
3396/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
3397fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
3398    let statements = parse_module(content)
3399        .map(|parsed| parsed.into_suite())
3400        .map_err(|e| e.to_string())?;
3401    let aliases = collect_setup_aliases(&statements);
3402    let mut evaluator = LiteralEvaluator::new(HashMap::new());
3403    build_setup_py_constants(&statements, &mut evaluator);
3404
3405    let setup_call = find_setup_call(&statements, &aliases);
3406    let Some(call_expr) = setup_call else {
3407        return Ok(None);
3408    };
3409
3410    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3411    Ok(Some(build_setup_py_package_data(&setup_values)))
3412}
3413
3414fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3415    for stmt in statements {
3416        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3417            if targets.len() != 1 {
3418                continue;
3419            }
3420
3421            let Some(name) = extract_assign_name(&targets[0]) else {
3422                continue;
3423            };
3424
3425            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3426                evaluator.insert_constant(name, value);
3427            }
3428        }
3429    }
3430}
3431
3432fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3433    match target {
3434        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3435        _ => None,
3436    }
3437}
3438
3439fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3440    let mut aliases = SetupAliases::default();
3441    aliases.setup_names.insert("setup".to_string());
3442
3443    for stmt in statements {
3444        match stmt {
3445            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3446                for alias in names {
3447                    let module_name = alias.name.as_str();
3448                    if !is_setup_module(module_name) {
3449                        continue;
3450                    }
3451                    let alias_name = alias
3452                        .asname
3453                        .as_ref()
3454                        .map(|name| name.as_str())
3455                        .unwrap_or(module_name);
3456                    aliases
3457                        .module_aliases
3458                        .insert(alias_name.to_string(), module_name.to_string());
3459                }
3460            }
3461            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3462                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3463                    continue;
3464                };
3465                if !is_setup_module(module_name) {
3466                    continue;
3467                }
3468                for alias in names {
3469                    if alias.name.as_str() != "setup" {
3470                        continue;
3471                    }
3472                    let alias_name = alias
3473                        .asname
3474                        .as_ref()
3475                        .map(|name| name.as_str())
3476                        .unwrap_or("setup");
3477                    aliases.setup_names.insert(alias_name.to_string());
3478                }
3479            }
3480            _ => {}
3481        }
3482    }
3483
3484    aliases
3485}
3486
/// Reports whether `module_name` is one of the modules recognized as
/// providing `setup`: `setuptools`, `distutils` or `distutils.core`.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3490
3491fn find_setup_call<'a>(
3492    statements: &'a [ast::Stmt],
3493    aliases: &'a SetupAliases,
3494) -> Option<&'a ast::Expr> {
3495    let mut finder = SetupCallFinder {
3496        aliases,
3497        called_function_names: collect_top_level_called_function_names(statements),
3498        nodes_visited: 0,
3499    };
3500    finder.find_in_statements(statements)
3501}
3502
3503fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3504    let mut called = HashSet::new();
3505    collect_called_function_names_in_statements(statements, &mut called);
3506    called
3507}
3508
3509fn collect_called_function_names_in_statements(
3510    statements: &[ast::Stmt],
3511    called: &mut HashSet<String>,
3512) {
3513    for stmt in statements {
3514        match stmt {
3515            ast::Stmt::Expr(ast::StmtExpr { value, .. })
3516            | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3517                collect_called_function_names_in_expr(value.as_ref(), called);
3518            }
3519            ast::Stmt::If(ast::StmtIf {
3520                body,
3521                elif_else_clauses,
3522                ..
3523            }) => {
3524                collect_called_function_names_in_statements(body, called);
3525                for clause in elif_else_clauses {
3526                    collect_called_function_names_in_statements(&clause.body, called);
3527                }
3528            }
3529            ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3530            | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3531                collect_called_function_names_in_statements(body, called);
3532                collect_called_function_names_in_statements(orelse, called);
3533            }
3534            ast::Stmt::With(ast::StmtWith { body, .. }) => {
3535                collect_called_function_names_in_statements(body, called);
3536            }
3537            ast::Stmt::Try(ast::StmtTry {
3538                body,
3539                orelse,
3540                finalbody,
3541                handlers,
3542                ..
3543            }) => {
3544                collect_called_function_names_in_statements(body, called);
3545                collect_called_function_names_in_statements(orelse, called);
3546                collect_called_function_names_in_statements(finalbody, called);
3547                for handler in handlers {
3548                    let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3549                        body,
3550                        ..
3551                    }) = handler;
3552                    collect_called_function_names_in_statements(body, called);
3553                }
3554            }
3555            _ => {}
3556        }
3557    }
3558}
3559
3560fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3561    if let ast::Expr::Call(ast::ExprCall {
3562        func, arguments, ..
3563    }) = expr
3564    {
3565        if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3566            called.insert(id.as_str().to_string());
3567        }
3568
3569        for arg in arguments.args.iter() {
3570            collect_called_function_names_in_expr(arg, called);
3571        }
3572        for keyword in arguments.keywords.iter() {
3573            collect_called_function_names_in_expr(&keyword.value, called);
3574        }
3575    }
3576}
3577
3578struct SetupCallFinder<'a> {
3579    aliases: &'a SetupAliases,
3580    called_function_names: HashSet<String>,
3581    nodes_visited: usize,
3582}
3583
3584impl<'a> SetupCallFinder<'a> {
3585    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3586        for stmt in statements {
3587            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3588                return None;
3589            }
3590            self.nodes_visited += 1;
3591
3592            let found = match stmt {
3593                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3594                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3595                ast::Stmt::If(ast::StmtIf {
3596                    body,
3597                    elif_else_clauses,
3598                    ..
3599                }) => self.find_in_statements(body).or_else(|| {
3600                    for clause in elif_else_clauses {
3601                        if let Some(found) = self.find_in_statements(&clause.body) {
3602                            return Some(found);
3603                        }
3604                    }
3605                    None
3606                }),
3607                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3608                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3609                    .find_in_statements(body)
3610                    .or_else(|| self.find_in_statements(orelse)),
3611                ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3612                    .called_function_names
3613                    .contains(name.as_str())
3614                    .then(|| self.find_in_statements(body))
3615                    .flatten(),
3616                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3617                ast::Stmt::Try(ast::StmtTry {
3618                    body,
3619                    orelse,
3620                    finalbody,
3621                    handlers,
3622                    ..
3623                }) => self
3624                    .find_in_statements(body)
3625                    .or_else(|| self.find_in_statements(orelse))
3626                    .or_else(|| self.find_in_statements(finalbody))
3627                    .or_else(|| {
3628                        for handler in handlers {
3629                            let ast::ExceptHandler::ExceptHandler(
3630                                ast::ExceptHandlerExceptHandler { body, .. },
3631                            ) = handler;
3632                            if let Some(found) = self.find_in_statements(body) {
3633                                return Some(found);
3634                            }
3635                        }
3636                        None
3637                    }),
3638                _ => None,
3639            };
3640
3641            if found.is_some() {
3642                return found;
3643            }
3644        }
3645
3646        None
3647    }
3648
3649    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3650        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3651            return None;
3652        }
3653        self.nodes_visited += 1;
3654
3655        match expr {
3656            ast::Expr::Call(ast::ExprCall { func, .. })
3657                if is_setup_call(func.as_ref(), self.aliases) =>
3658            {
3659                Some(expr)
3660            }
3661            _ => None,
3662        }
3663    }
3664}
3665
3666fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3667    let Some(dotted) = dotted_name(func, 0) else {
3668        return false;
3669    };
3670
3671    if aliases.setup_names.contains(&dotted) {
3672        return true;
3673    }
3674
3675    let Some(module) = dotted.strip_suffix(".setup") else {
3676        return false;
3677    };
3678
3679    let resolved = resolve_module_alias(module, aliases);
3680    is_setup_module(&resolved)
3681}
3682
3683fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3684    if depth >= MAX_SETUP_PY_AST_DEPTH {
3685        return None;
3686    }
3687
3688    match expr {
3689        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3690        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3691            let base = dotted_name(value.as_ref(), depth + 1)?;
3692            Some(format!("{}.{}", base, attr.as_str()))
3693        }
3694        _ => None,
3695    }
3696}
3697
3698fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3699    if let Some(mapped) = aliases.module_aliases.get(module) {
3700        return mapped.clone();
3701    }
3702
3703    let Some((base, rest)) = module.split_once('.') else {
3704        return module.to_string();
3705    };
3706
3707    if let Some(mapped) = aliases.module_aliases.get(base) {
3708        return format!("{}.{}", mapped, rest);
3709    }
3710
3711    module.to_string()
3712}
3713
3714fn extract_setup_keywords(
3715    call_expr: &ast::Expr,
3716    evaluator: &mut LiteralEvaluator,
3717) -> HashMap<String, Value> {
3718    let mut values = HashMap::new();
3719    let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3720        return values;
3721    };
3722
3723    for keyword in arguments.keywords.iter() {
3724        if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3725            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3726                values.insert(arg.to_string(), value);
3727            }
3728        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3729            for (key, value) in dict {
3730                values.insert(key, value);
3731            }
3732        }
3733    }
3734
3735    values
3736}
3737
3738fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3739    let name = get_value_string(values, "name").map(truncate_field);
3740    let version = get_value_string(values, "version").map(truncate_field);
3741    let description = get_value_string(values, "description")
3742        .or_else(|| get_value_string(values, "summary"))
3743        .map(truncate_field);
3744    let homepage_url = get_value_string(values, "url")
3745        .or_else(|| get_value_string(values, "home_page"))
3746        .map(truncate_field);
3747    let author = get_value_string(values, "author").map(truncate_field);
3748    let author_email = get_value_string(values, "author_email");
3749    let maintainer = get_value_string(values, "maintainer").map(truncate_field);
3750    let maintainer_email = get_value_string(values, "maintainer_email");
3751    let license = get_value_string(values, "license").map(truncate_field);
3752    let classifiers = values
3753        .get("classifiers")
3754        .and_then(value_to_string_list)
3755        .unwrap_or_default();
3756
3757    let mut parties = Vec::new();
3758    if author.is_some() || author_email.is_some() {
3759        parties.push(Party {
3760            r#type: Some("person".to_string()),
3761            role: Some("author".to_string()),
3762            name: author,
3763            email: author_email,
3764            url: None,
3765            organization: None,
3766            organization_url: None,
3767            timezone: None,
3768        });
3769    }
3770
3771    if maintainer.is_some() || maintainer_email.is_some() {
3772        parties.push(Party {
3773            r#type: Some("person".to_string()),
3774            role: Some("maintainer".to_string()),
3775            name: maintainer,
3776            email: maintainer_email,
3777            url: None,
3778            organization: None,
3779            organization_url: None,
3780            timezone: None,
3781        });
3782    }
3783
3784    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3785        normalize_spdx_declared_license(license.as_deref());
3786    let extracted_license_statement = license.clone();
3787
3788    let dependencies = build_setup_py_dependencies(values);
3789    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3790    let mut homepage_from_project_urls = None;
3791    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3792    let mut extra_data = HashMap::new();
3793
3794    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3795        apply_project_url_mappings(
3796            &parsed_project_urls,
3797            &mut homepage_from_project_urls,
3798            &mut bug_tracking_url,
3799            &mut code_view_url,
3800            &mut vcs_url,
3801            &mut extra_data,
3802        );
3803    }
3804
3805    let extra_data = if extra_data.is_empty() {
3806        None
3807    } else {
3808        Some(extra_data)
3809    };
3810
3811    PackageData {
3812        package_type: Some(PythonParser::PACKAGE_TYPE),
3813        namespace: None,
3814        name,
3815        version,
3816        qualifiers: None,
3817        subpath: None,
3818        primary_language: Some("Python".to_string()),
3819        description,
3820        release_date: None,
3821        parties,
3822        keywords: Vec::new(),
3823        homepage_url: homepage_url.or(homepage_from_project_urls),
3824        download_url: None,
3825        size: None,
3826        sha1: None,
3827        md5: None,
3828        sha256: None,
3829        sha512: None,
3830        bug_tracking_url,
3831        code_view_url,
3832        vcs_url,
3833        copyright: None,
3834        holder: None,
3835        declared_license_expression,
3836        declared_license_expression_spdx,
3837        license_detections,
3838        other_license_expression: None,
3839        other_license_expression_spdx: None,
3840        other_license_detections: Vec::new(),
3841        extracted_license_statement,
3842        notice_text: None,
3843        source_packages: Vec::new(),
3844        file_references: Vec::new(),
3845        is_private: has_private_classifier(&classifiers),
3846        is_virtual: false,
3847        extra_data,
3848        dependencies,
3849        repository_homepage_url: None,
3850        repository_download_url: None,
3851        api_data_url: None,
3852        datasource_id: Some(DatasourceId::PypiSetupPy),
3853        purl,
3854    }
3855}
3856
3857fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3858    let mut dependencies = Vec::new();
3859
3860    if let Some(reqs) = values
3861        .get("install_requires")
3862        .and_then(value_to_string_list)
3863    {
3864        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3865    }
3866
3867    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3868        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3869    }
3870
3871    if let Some(Value::Dict(extras)) = values.get("extras_require") {
3872        let mut extra_items: Vec<_> = extras.iter().collect();
3873        extra_items.sort_by_key(|(name, _)| *name);
3874        for (extra_name, extra_value) in extra_items {
3875            if let Some(reqs) = value_to_string_list(extra_value) {
3876                dependencies.extend(build_setup_py_dependency_list(
3877                    reqs.as_slice(),
3878                    extra_name,
3879                    true,
3880                ));
3881            }
3882        }
3883    }
3884
3885    dependencies
3886}
3887
3888fn build_setup_py_dependency_list(
3889    reqs: &[String],
3890    scope: &str,
3891    is_optional: bool,
3892) -> Vec<Dependency> {
3893    reqs.iter()
3894        .filter_map(|req| build_python_dependency(req, scope, is_optional, None))
3895        .collect()
3896}
3897
3898fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3899    values.get(key).and_then(value_to_string)
3900}
3901
3902fn value_to_string(value: &Value) -> Option<String> {
3903    match value {
3904        Value::String(value) => Some(value.clone()),
3905        Value::Number(value) => Some(value.to_string()),
3906        Value::Bool(value) => Some(value.to_string()),
3907        _ => None,
3908    }
3909}
3910
3911fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3912    match value {
3913        Value::String(value) => Some(vec![value.clone()]),
3914        Value::List(values) | Value::Tuple(values) => {
3915            let mut items = Vec::new();
3916            for item in values {
3917                items.push(value_to_string(item)?);
3918            }
3919            Some(items)
3920        }
3921        _ => None,
3922    }
3923}
3924
3925fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3926    let Value::Dict(dict) = value else {
3927        return None;
3928    };
3929
3930    let mut pairs: Vec<(String, String)> = dict
3931        .iter()
3932        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3933        .collect::<Option<Vec<_>>>()?;
3934    pairs.sort_by(|left, right| left.0.cmp(&right.0));
3935    Some(pairs)
3936}
3937
3938fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3939    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3940    extract_requires_dist_dependencies(&requires_dist)
3941}
3942
3943pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3944    requires_dist
3945        .iter()
3946        .filter_map(|entry| build_rfc822_dependency(entry))
3947        .collect()
3948}
3949
3950fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3951    build_python_dependency(entry, "install", false, None)
3952}
3953
3954fn build_python_dependency(
3955    entry: &str,
3956    default_scope: &str,
3957    default_optional: bool,
3958    marker_override: Option<&str>,
3959) -> Option<Dependency> {
3960    let (requirement_part, marker_part) = entry
3961        .split_once(';')
3962        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3963        .unwrap_or((entry.trim(), None));
3964
3965    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3966    let requirement = normalize_rfc822_requirement(requirement_part);
3967    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3968        marker_part.or(marker_override),
3969        default_scope,
3970        default_optional,
3971    );
3972    let purl = build_python_dependency_purl(&name, None)?;
3973
3974    let is_pinned = requirement
3975        .as_deref()
3976        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3977    let purl = if is_pinned {
3978        requirement
3979            .as_deref()
3980            .map(|req| req.trim_start_matches('='))
3981            .and_then(|version| build_python_dependency_purl(&name, Some(version)))
3982            .unwrap_or(purl)
3983    } else {
3984        purl
3985    };
3986
3987    let mut extra_data = HashMap::new();
3988    extra_data.extend(marker_data);
3989    if let Some(marker) = marker {
3990        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3991    }
3992
3993    Some(Dependency {
3994        purl: Some(purl),
3995        extracted_requirement: requirement,
3996        scope: Some(scope),
3997        is_runtime: Some(true),
3998        is_optional: Some(is_optional),
3999        is_pinned: Some(is_pinned),
4000        is_direct: Some(true),
4001        resolved_package: None,
4002        extra_data: if extra_data.is_empty() {
4003            None
4004        } else {
4005            Some(extra_data)
4006        },
4007    })
4008}
4009
4010fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
4011    let name = extract_setup_cfg_dependency_name(requirement_part)?;
4012    let trimmed = requirement_part.trim();
4013    let mut remainder = trimmed[name.len()..].trim();
4014
4015    if let Some(stripped) = remainder.strip_prefix('[')
4016        && let Some(end_idx) = stripped.find(']')
4017    {
4018        remainder = stripped[end_idx + 1..].trim();
4019    }
4020
4021    let remainder = remainder
4022        .strip_prefix('(')
4023        .and_then(|value| value.strip_suffix(')'))
4024        .unwrap_or(remainder)
4025        .trim();
4026
4027    if remainder.is_empty() {
4028        return None;
4029    }
4030
4031    let mut specifiers: Vec<String> = remainder
4032        .split(',')
4033        .map(|specifier| specifier.trim().replace(' ', ""))
4034        .filter(|specifier| !specifier.is_empty())
4035        .collect();
4036    specifiers.sort();
4037    Some(specifiers.join(","))
4038}
4039
/// Percent-encodes every `*` in `version` as `%2A`; all other characters are
/// copied unchanged.
fn encode_python_dependency_purl_version(version: &str) -> String {
    let mut encoded = String::with_capacity(version.len());
    for ch in version.chars() {
        if ch == '*' {
            encoded.push_str("%2A");
        } else {
            encoded.push(ch);
        }
    }
    encoded
}
4043
4044fn build_python_dependency_purl(name: &str, version: Option<&str>) -> Option<String> {
4045    let normalized_name = normalize_python_dependency_name(name);
4046
4047    PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &normalized_name)
4048        .ok()
4049        .map(|_| match version {
4050            Some(version) => {
4051                format!(
4052                    "pkg:pypi/{normalized_name}@{}",
4053                    encode_python_dependency_purl_version(version)
4054                )
4055            }
4056            None => format!("pkg:pypi/{normalized_name}"),
4057        })
4058}
4059
/// Normalizes a dependency name: surrounding whitespace is trimmed, ASCII
/// letters are lower-cased and every `_` becomes `-`.
fn normalize_python_dependency_name(name: &str) -> String {
    name.trim()
        .chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
4063
4064fn parse_rfc822_marker(
4065    marker_part: Option<&str>,
4066    default_scope: &str,
4067    default_optional: bool,
4068) -> (
4069    String,
4070    bool,
4071    Option<String>,
4072    HashMap<String, serde_json::Value>,
4073) {
4074    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
4075        return (
4076            default_scope.to_string(),
4077            default_optional,
4078            None,
4079            HashMap::new(),
4080        );
4081    };
4082
4083    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
4084        .expect("extra marker regex should compile");
4085    let mut extra_data = HashMap::new();
4086
4087    if let Some(python_version) = extract_marker_field(marker, "python_version") {
4088        extra_data.insert(
4089            "python_version".to_string(),
4090            serde_json::Value::String(python_version),
4091        );
4092    }
4093    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
4094        extra_data.insert(
4095            "sys_platform".to_string(),
4096            serde_json::Value::String(sys_platform),
4097        );
4098    }
4099
4100    if let Some(captures) = extra_re.captures(marker)
4101        && let Some(scope) = captures.get(1)
4102    {
4103        return (
4104            scope.as_str().to_string(),
4105            true,
4106            Some(marker.trim().to_string()),
4107            extra_data,
4108        );
4109    }
4110
4111    (
4112        default_scope.to_string(),
4113        default_optional,
4114        Some(marker.trim().to_string()),
4115        extra_data,
4116    )
4117}
4118
4119fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
4120    let re = Regex::new(&format!(
4121        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
4122        field
4123    ))
4124    .ok()?;
4125    let captures = re.captures(marker)?;
4126    let operator = captures.get(1)?.as_str();
4127    let value = captures.get(2)?.as_str();
4128    Some(format!("{} {}", operator, value))
4129}
4130
4131fn parse_requires_txt(content: &str) -> Vec<Dependency> {
4132    let mut dependencies = Vec::new();
4133    let mut current_scope = "install".to_string();
4134    let mut current_optional = false;
4135    let mut current_marker: Option<String> = None;
4136    let mut line_count = 0usize;
4137
4138    for line in content.lines() {
4139        line_count += 1;
4140        if line_count > MAX_ITERATION_COUNT {
4141            warn!(
4142                "Exceeded max line count in requires.txt; stopping at {} lines",
4143                MAX_ITERATION_COUNT
4144            );
4145            break;
4146        }
4147        let trimmed = line.trim();
4148        if trimmed.is_empty() || trimmed.starts_with('#') {
4149            continue;
4150        }
4151
4152        if trimmed.starts_with('[') && trimmed.ends_with(']') {
4153            let inner = &trimmed[1..trimmed.len() - 1];
4154            if let Some(rest) = inner.strip_prefix(':') {
4155                current_scope = "install".to_string();
4156                current_optional = false;
4157                current_marker = Some(rest.trim().to_string());
4158            } else if let Some((scope, marker)) = inner.split_once(':') {
4159                current_scope = scope.trim().to_string();
4160                current_optional = true;
4161                current_marker = Some(marker.trim().to_string());
4162            } else {
4163                current_scope = inner.trim().to_string();
4164                current_optional = true;
4165                current_marker = None;
4166            }
4167            continue;
4168        }
4169
4170        if let Some(dependency) = build_python_dependency(
4171            trimmed,
4172            &current_scope,
4173            current_optional,
4174            current_marker.as_deref(),
4175        ) {
4176            dependencies.push(dependency);
4177        }
4178    }
4179
4180    dependencies
4181}
4182
/// Reports whether any classifier equals `Private :: Do Not Upload`,
/// compared ASCII-case-insensitively and without trimming.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
4188
4189fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
4190    let name = name?;
4191    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
4192    if let Some(version) = version {
4193        package_url.with_version(version).ok()?;
4194    }
4195    Some(package_url.to_string())
4196}
4197
4198fn extract_from_setup_py_regex(content: &str) -> PackageData {
4199    let name = extract_setup_value(content, "name").map(truncate_field);
4200    let version = extract_setup_value(content, "version").map(truncate_field);
4201    let license_expression = extract_setup_value(content, "license").map(truncate_field);
4202
4203    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4204        normalize_spdx_declared_license(license_expression.as_deref());
4205    let extracted_license_statement = license_expression.clone();
4206
4207    let dependencies = extract_setup_py_dependencies(content);
4208    let homepage_url = extract_setup_value(content, "url").map(truncate_field);
4209    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
4210
4211    PackageData {
4212        package_type: Some(PythonParser::PACKAGE_TYPE),
4213        namespace: None,
4214        name,
4215        version,
4216        qualifiers: None,
4217        subpath: None,
4218        primary_language: Some("Python".to_string()),
4219        description: None,
4220        release_date: None,
4221        parties: Vec::new(),
4222        keywords: Vec::new(),
4223        homepage_url,
4224        download_url: None,
4225        size: None,
4226        sha1: None,
4227        md5: None,
4228        sha256: None,
4229        sha512: None,
4230        bug_tracking_url: None,
4231        code_view_url: None,
4232        vcs_url: None,
4233        copyright: None,
4234        holder: None,
4235        declared_license_expression,
4236        declared_license_expression_spdx,
4237        license_detections,
4238        other_license_expression: None,
4239        other_license_expression_spdx: None,
4240        other_license_detections: Vec::new(),
4241        extracted_license_statement,
4242        notice_text: None,
4243        source_packages: Vec::new(),
4244        file_references: Vec::new(),
4245        is_private: false,
4246        is_virtual: false,
4247        extra_data: None,
4248        dependencies,
4249        repository_homepage_url: None,
4250        repository_download_url: None,
4251        api_data_url: None,
4252        datasource_id: Some(DatasourceId::PypiSetupPy),
4253        purl,
4254    }
4255}
4256
4257fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
4258    crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
4259}
4260
4261fn extract_from_pypi_json(path: &Path) -> PackageData {
4262    let default = PackageData {
4263        package_type: Some(PythonParser::PACKAGE_TYPE),
4264        datasource_id: Some(DatasourceId::PypiJson),
4265        ..Default::default()
4266    };
4267
4268    let content = match read_file_to_string(path, None) {
4269        Ok(content) => content,
4270        Err(error) => {
4271            warn!("Failed to read pypi.json at {:?}: {}", path, error);
4272            return default;
4273        }
4274    };
4275
4276    let root: serde_json::Value = match serde_json::from_str(&content) {
4277        Ok(value) => value,
4278        Err(error) => {
4279            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
4280            return default;
4281        }
4282    };
4283
4284    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
4285        warn!("No info object found in pypi.json at {:?}", path);
4286        return default;
4287    };
4288
4289    let name = info
4290        .get("name")
4291        .and_then(|value| value.as_str())
4292        .map(|v| truncate_field(v.to_owned()));
4293    let version = info
4294        .get("version")
4295        .and_then(|value| value.as_str())
4296        .map(ToOwned::to_owned);
4297    let summary = info
4298        .get("summary")
4299        .and_then(|value| value.as_str())
4300        .map(|v| truncate_field(v.to_owned()));
4301    let description = info
4302        .get("description")
4303        .and_then(|value| value.as_str())
4304        .filter(|value| !value.trim().is_empty())
4305        .map(|v| truncate_field(v.to_owned()))
4306        .or(summary);
4307    let mut homepage_url = info
4308        .get("home_page")
4309        .and_then(|value| value.as_str())
4310        .map(|v| truncate_field(v.to_owned()));
4311    let author = info
4312        .get("author")
4313        .and_then(|value| value.as_str())
4314        .filter(|value| !value.trim().is_empty())
4315        .map(|v| truncate_field(v.to_owned()));
4316    let author_email = info
4317        .get("author_email")
4318        .and_then(|value| value.as_str())
4319        .filter(|value| !value.trim().is_empty())
4320        .map(ToOwned::to_owned);
4321    let license = info
4322        .get("license")
4323        .and_then(|value| value.as_str())
4324        .filter(|value| !value.trim().is_empty())
4325        .map(ToOwned::to_owned);
4326    let keywords = parse_setup_cfg_keywords(
4327        info.get("keywords")
4328            .and_then(|value| value.as_str())
4329            .map(ToOwned::to_owned),
4330    );
4331    let classifiers = info
4332        .get("classifiers")
4333        .and_then(|value| value.as_array())
4334        .map(|values| {
4335            values
4336                .iter()
4337                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
4338                .collect::<Vec<_>>()
4339        })
4340        .unwrap_or_default();
4341
4342    let mut parties = Vec::new();
4343    if author.is_some() || author_email.is_some() {
4344        parties.push(Party {
4345            r#type: Some("person".to_string()),
4346            role: Some("author".to_string()),
4347            name: author,
4348            email: author_email,
4349            url: None,
4350            organization: None,
4351            organization_url: None,
4352            timezone: None,
4353        });
4354    }
4355
4356    let mut bug_tracking_url = None;
4357    let mut code_view_url = None;
4358    let mut vcs_url = None;
4359    let mut extra_data = HashMap::new();
4360
4361    let parsed_project_urls = info
4362        .get("project_urls")
4363        .and_then(|value| value.as_object())
4364        .map(|map| {
4365            let mut pairs: Vec<(String, String)> = map
4366                .iter()
4367                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
4368                .collect();
4369            pairs.sort_by(|left, right| left.0.cmp(&right.0));
4370            pairs
4371        })
4372        .unwrap_or_default();
4373
4374    apply_project_url_mappings(
4375        &parsed_project_urls,
4376        &mut homepage_url,
4377        &mut bug_tracking_url,
4378        &mut code_view_url,
4379        &mut vcs_url,
4380        &mut extra_data,
4381    );
4382
4383    let (download_url, size, sha256) = root
4384        .get("urls")
4385        .and_then(|value| value.as_array())
4386        .map(|urls| select_pypi_json_artifact(urls))
4387        .unwrap_or((None, None, None));
4388
4389    let sha256 = sha256.and_then(|h| Sha256Digest::from_hex(&h).ok());
4390
4391    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4392        normalize_spdx_declared_license(license.as_deref());
4393    let dependencies = info
4394        .get("requires_dist")
4395        .and_then(|value| value.as_array())
4396        .map(|entries| {
4397            entries
4398                .iter()
4399                .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4400                .collect::<Vec<_>>()
4401        })
4402        .map(|entries| extract_requires_dist_dependencies(&entries))
4403        .unwrap_or_default();
4404
4405    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
4406        build_pypi_urls(name.as_deref(), version.as_deref());
4407
4408    PackageData {
4409        package_type: Some(PythonParser::PACKAGE_TYPE),
4410        namespace: None,
4411        name,
4412        version,
4413        qualifiers: None,
4414        subpath: None,
4415        primary_language: None,
4416        description,
4417        release_date: None,
4418        parties,
4419        keywords,
4420        homepage_url: homepage_url.or(repository_homepage_url.clone()),
4421        download_url,
4422        size,
4423        sha1: None,
4424        md5: None,
4425        sha256,
4426        sha512: None,
4427        bug_tracking_url,
4428        code_view_url,
4429        vcs_url,
4430        copyright: None,
4431        holder: None,
4432        declared_license_expression,
4433        declared_license_expression_spdx,
4434        license_detections,
4435        other_license_expression: None,
4436        other_license_expression_spdx: None,
4437        other_license_detections: Vec::new(),
4438        extracted_license_statement: license,
4439        notice_text: None,
4440        source_packages: Vec::new(),
4441        file_references: Vec::new(),
4442        is_private: has_private_classifier(&classifiers),
4443        is_virtual: false,
4444        extra_data: if extra_data.is_empty() {
4445            None
4446        } else {
4447            Some(extra_data)
4448        },
4449        dependencies,
4450        repository_homepage_url,
4451        repository_download_url,
4452        api_data_url,
4453        datasource_id: Some(DatasourceId::PypiJson),
4454        purl,
4455    }
4456}
4457
4458fn select_pypi_json_artifact(
4459    urls: &[serde_json::Value],
4460) -> (Option<String>, Option<u64>, Option<String>) {
4461    let selected = urls
4462        .iter()
4463        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4464        .or_else(|| urls.first());
4465
4466    let Some(entry) = selected else {
4467        return (None, None, None);
4468    };
4469
4470    let download_url = entry
4471        .get("url")
4472        .and_then(|value| value.as_str())
4473        .map(ToOwned::to_owned);
4474    let size = entry.get("size").and_then(|value| value.as_u64());
4475    let sha256 = entry
4476        .get("digests")
4477        .and_then(|value| value.as_object())
4478        .and_then(|digests| digests.get("sha256"))
4479        .and_then(|value| value.as_str())
4480        .map(ToOwned::to_owned);
4481
4482    (download_url, size, sha256)
4483}
4484
4485fn extract_from_pip_inspect(path: &Path) -> PackageData {
4486    let content = match read_file_to_string(path, None) {
4487        Ok(content) => content,
4488        Err(e) => {
4489            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4490            return default_package_data(path);
4491        }
4492    };
4493
4494    let root: serde_json::Value = match serde_json::from_str(&content) {
4495        Ok(value) => value,
4496        Err(e) => {
4497            warn!(
4498                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4499                path, e
4500            );
4501            return default_package_data(path);
4502        }
4503    };
4504
4505    let installed = match root.get("installed").and_then(|v| v.as_array()) {
4506        Some(arr) => arr,
4507        None => {
4508            warn!(
4509                "No 'installed' array found in pip-inspect.deplock at {:?}",
4510                path
4511            );
4512            return default_package_data(path);
4513        }
4514    };
4515
4516    let pip_version = root
4517        .get("pip_version")
4518        .and_then(|v| v.as_str())
4519        .map(String::from);
4520    let inspect_version = root
4521        .get("version")
4522        .and_then(|v| v.as_str())
4523        .map(String::from);
4524
4525    let mut main_package: Option<PackageData> = None;
4526    let mut dependencies: Vec<Dependency> = Vec::new();
4527
4528    for package_entry in installed {
4529        let metadata = match package_entry.get("metadata") {
4530            Some(m) => m,
4531            None => continue,
4532        };
4533
4534        let is_requested = package_entry
4535            .get("requested")
4536            .and_then(|v| v.as_bool())
4537            .unwrap_or(false);
4538        let has_direct_url = package_entry.get("direct_url").is_some();
4539
4540        let name = metadata
4541            .get("name")
4542            .and_then(|v| v.as_str())
4543            .map(|v| truncate_field(v.to_string()));
4544        let version = metadata
4545            .get("version")
4546            .and_then(|v| v.as_str())
4547            .map(String::from);
4548        let summary = metadata
4549            .get("summary")
4550            .and_then(|v| v.as_str())
4551            .map(|v| truncate_field(v.to_string()));
4552        let home_page = metadata
4553            .get("home_page")
4554            .and_then(|v| v.as_str())
4555            .map(|v| truncate_field(v.to_string()));
4556        let author = metadata
4557            .get("author")
4558            .and_then(|v| v.as_str())
4559            .map(|v| truncate_field(v.to_string()));
4560        let author_email = metadata
4561            .get("author_email")
4562            .and_then(|v| v.as_str())
4563            .map(String::from);
4564        let license = metadata
4565            .get("license")
4566            .and_then(|v| v.as_str())
4567            .map(|v| truncate_field(v.to_string()));
4568        let description = metadata
4569            .get("description")
4570            .and_then(|v| v.as_str())
4571            .map(|v| truncate_field(v.to_string()));
4572        let keywords = metadata
4573            .get("keywords")
4574            .and_then(|v| v.as_array())
4575            .map(|arr| {
4576                arr.iter()
4577                    .filter_map(|k| k.as_str().map(String::from))
4578                    .collect::<Vec<_>>()
4579            })
4580            .unwrap_or_default();
4581
4582        let mut parties = Vec::new();
4583        if author.is_some() || author_email.is_some() {
4584            parties.push(Party {
4585                r#type: Some("person".to_string()),
4586                role: Some("author".to_string()),
4587                name: author,
4588                email: author_email,
4589                url: None,
4590                organization: None,
4591                organization_url: None,
4592                timezone: None,
4593            });
4594        }
4595
4596        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4597            normalize_spdx_declared_license(license.as_deref());
4598        let extracted_license_statement = license.clone();
4599        let requires_dist = metadata
4600            .get("requires_dist")
4601            .and_then(|v| v.as_array())
4602            .map(|entries| {
4603                entries
4604                    .iter()
4605                    .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4606                    .collect::<Vec<_>>()
4607            })
4608            .unwrap_or_default();
4609        let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4610
4611        let purl = name.as_ref().and_then(|n| {
4612            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4613            if let Some(v) = &version {
4614                package_url.with_version(v).ok()?;
4615            }
4616            Some(package_url.to_string())
4617        });
4618
4619        if is_requested && has_direct_url {
4620            let mut extra_data = HashMap::new();
4621            if let Some(pv) = &pip_version {
4622                extra_data.insert(
4623                    "pip_version".to_string(),
4624                    serde_json::Value::String(pv.clone()),
4625                );
4626            }
4627            if let Some(iv) = &inspect_version {
4628                extra_data.insert(
4629                    "inspect_version".to_string(),
4630                    serde_json::Value::String(iv.clone()),
4631                );
4632            }
4633
4634            main_package = Some(PackageData {
4635                package_type: Some(PythonParser::PACKAGE_TYPE),
4636                namespace: None,
4637                name,
4638                version,
4639                qualifiers: None,
4640                subpath: None,
4641                primary_language: Some("Python".to_string()),
4642                description: description.or(summary),
4643                release_date: None,
4644                parties,
4645                keywords,
4646                homepage_url: home_page,
4647                download_url: None,
4648                size: None,
4649                sha1: None,
4650                md5: None,
4651                sha256: None,
4652                sha512: None,
4653                bug_tracking_url: None,
4654                code_view_url: None,
4655                vcs_url: None,
4656                copyright: None,
4657                holder: None,
4658                declared_license_expression,
4659                declared_license_expression_spdx,
4660                license_detections,
4661                other_license_expression: None,
4662                other_license_expression_spdx: None,
4663                other_license_detections: Vec::new(),
4664                extracted_license_statement,
4665                notice_text: None,
4666                source_packages: Vec::new(),
4667                file_references: Vec::new(),
4668                is_private: false,
4669                is_virtual: true,
4670                extra_data: if extra_data.is_empty() {
4671                    None
4672                } else {
4673                    Some(extra_data)
4674                },
4675                dependencies: parsed_dependencies,
4676                repository_homepage_url: None,
4677                repository_download_url: None,
4678                api_data_url: None,
4679                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4680                purl,
4681            });
4682        } else {
4683            let resolved_package = PackageData {
4684                package_type: Some(PythonParser::PACKAGE_TYPE),
4685                namespace: None,
4686                name: name.clone(),
4687                version: version.clone(),
4688                qualifiers: None,
4689                subpath: None,
4690                primary_language: Some("Python".to_string()),
4691                description: description.or(summary),
4692                release_date: None,
4693                parties,
4694                keywords,
4695                homepage_url: home_page,
4696                download_url: None,
4697                size: None,
4698                sha1: None,
4699                md5: None,
4700                sha256: None,
4701                sha512: None,
4702                bug_tracking_url: None,
4703                code_view_url: None,
4704                vcs_url: None,
4705                copyright: None,
4706                holder: None,
4707                declared_license_expression,
4708                declared_license_expression_spdx,
4709                license_detections,
4710                other_license_expression: None,
4711                other_license_expression_spdx: None,
4712                other_license_detections: Vec::new(),
4713                extracted_license_statement,
4714                notice_text: None,
4715                source_packages: Vec::new(),
4716                file_references: Vec::new(),
4717                is_private: false,
4718                is_virtual: true,
4719                extra_data: None,
4720                dependencies: parsed_dependencies,
4721                repository_homepage_url: None,
4722                repository_download_url: None,
4723                api_data_url: None,
4724                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4725                purl: purl.clone(),
4726            };
4727
4728            let resolved = package_data_to_resolved(&resolved_package);
4729            dependencies.push(Dependency {
4730                purl,
4731                extracted_requirement: None,
4732                scope: None,
4733                is_runtime: Some(true),
4734                is_optional: Some(false),
4735                is_pinned: Some(true),
4736                is_direct: Some(is_requested),
4737                resolved_package: Some(Box::new(resolved)),
4738                extra_data: None,
4739            });
4740        }
4741    }
4742
4743    if let Some(mut main_pkg) = main_package {
4744        let direct_requirement_purls: HashSet<String> = main_pkg
4745            .dependencies
4746            .iter()
4747            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4748            .collect();
4749
4750        let resolved_requirement_purls: HashSet<String> = dependencies
4751            .iter()
4752            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4753            .collect();
4754
4755        let unresolved_dependencies = main_pkg
4756            .dependencies
4757            .iter()
4758            .filter(|dep| {
4759                dep.purl.as_ref().is_some_and(|purl| {
4760                    !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4761                })
4762            })
4763            .cloned()
4764            .collect::<Vec<_>>();
4765
4766        for dependency in &mut dependencies {
4767            if dependency
4768                .purl
4769                .as_ref()
4770                .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4771            {
4772                dependency.is_direct = Some(true);
4773            }
4774        }
4775
4776        main_pkg.dependencies = dependencies;
4777        main_pkg.dependencies.extend(unresolved_dependencies);
4778        main_pkg
4779    } else {
4780        default_package_data(path)
4781    }
4782}
4783
/// Returns the part of a package URL that precedes its version.
///
/// Everything from the first `@` onward is dropped; a string without `@` is returned unchanged.
fn base_dependency_purl(purl: &str) -> String {
    match purl.find('@') {
        Some(at) => purl[..at].to_string(),
        None => purl.to_string(),
    }
}
4789
/// Parsed `setup.cfg` contents: section name → key → values in the order they were read.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4791
4792fn extract_from_setup_cfg(path: &Path) -> PackageData {
4793    let content = match read_file_to_string(path, None) {
4794        Ok(content) => content,
4795        Err(e) => {
4796            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4797            return default_package_data(path);
4798        }
4799    };
4800
4801    let sections = parse_setup_cfg(&content);
4802    let name = get_ini_value(&sections, "metadata", "name").map(truncate_field);
4803    let version = get_ini_value(&sections, "metadata", "version").map(truncate_field);
4804    let description = get_ini_value(&sections, "metadata", "description").map(truncate_field);
4805    let author = get_ini_value(&sections, "metadata", "author").map(truncate_field);
4806    let author_email = get_ini_value(&sections, "metadata", "author_email");
4807    let maintainer = get_ini_value(&sections, "metadata", "maintainer").map(truncate_field);
4808    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
4809    let license = get_ini_value(&sections, "metadata", "license").map(truncate_field);
4810    let mut homepage_url = get_ini_value(&sections, "metadata", "url").map(truncate_field);
4811    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
4812    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
4813    let python_requires = get_ini_value(&sections, "options", "python_requires");
4814    let parsed_project_urls =
4815        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
4816    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4817    let mut extra_data = HashMap::new();
4818
4819    let mut parties = Vec::new();
4820    if author.is_some() || author_email.is_some() {
4821        parties.push(Party {
4822            r#type: Some("person".to_string()),
4823            role: Some("author".to_string()),
4824            name: author,
4825            email: author_email,
4826            url: None,
4827            organization: None,
4828            organization_url: None,
4829            timezone: None,
4830        });
4831    }
4832
4833    if maintainer.is_some() || maintainer_email.is_some() {
4834        parties.push(Party {
4835            r#type: Some("person".to_string()),
4836            role: Some("maintainer".to_string()),
4837            name: maintainer,
4838            email: maintainer_email,
4839            url: None,
4840            organization: None,
4841            organization_url: None,
4842            timezone: None,
4843        });
4844    }
4845
4846    let declared_license_expression = None;
4847    let declared_license_expression_spdx = None;
4848    let license_detections = Vec::new();
4849    let extracted_license_statement = license.clone();
4850
4851    let dependencies = extract_setup_cfg_dependencies(&sections);
4852
4853    if let Some(value) = python_requires {
4854        extra_data.insert(
4855            "python_requires".to_string(),
4856            serde_json::Value::String(value),
4857        );
4858    }
4859
4860    apply_project_url_mappings(
4861        &parsed_project_urls,
4862        &mut homepage_url,
4863        &mut bug_tracking_url,
4864        &mut code_view_url,
4865        &mut vcs_url,
4866        &mut extra_data,
4867    );
4868
4869    let extra_data = if extra_data.is_empty() {
4870        None
4871    } else {
4872        Some(extra_data)
4873    };
4874
4875    let purl = name.as_ref().and_then(|n| {
4876        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4877        if let Some(v) = &version {
4878            package_url.with_version(v).ok()?;
4879        }
4880        Some(package_url.to_string())
4881    });
4882
4883    PackageData {
4884        package_type: Some(PythonParser::PACKAGE_TYPE),
4885        namespace: None,
4886        name,
4887        version,
4888        qualifiers: None,
4889        subpath: None,
4890        primary_language: Some("Python".to_string()),
4891        description,
4892        release_date: None,
4893        parties,
4894        keywords,
4895        homepage_url,
4896        download_url: None,
4897        size: None,
4898        sha1: None,
4899        md5: None,
4900        sha256: None,
4901        sha512: None,
4902        bug_tracking_url,
4903        code_view_url,
4904        vcs_url,
4905        copyright: None,
4906        holder: None,
4907        declared_license_expression,
4908        declared_license_expression_spdx,
4909        license_detections,
4910        other_license_expression: None,
4911        other_license_expression_spdx: None,
4912        other_license_detections: Vec::new(),
4913        extracted_license_statement,
4914        notice_text: None,
4915        source_packages: Vec::new(),
4916        file_references: Vec::new(),
4917        is_private: has_private_classifier(&classifiers),
4918        is_virtual: false,
4919        extra_data,
4920        dependencies,
4921        repository_homepage_url: None,
4922        repository_download_url: None,
4923        api_data_url: None,
4924        datasource_id: Some(DatasourceId::PypiSetupCfg),
4925        purl,
4926    }
4927}
4928
/// Splits a comma-separated `keywords` value into trimmed, non-empty keywords.
///
/// Returns an empty vector when `value` is `None`.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut keywords = Vec::new();
    if let Some(raw) = value {
        for piece in raw.split(',') {
            let keyword = piece.trim();
            if !keyword.is_empty() {
                keywords.push(keyword.to_string());
            }
        }
    }
    keywords
}
4941
/// Turns `Label = URL` lines into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries without `=`, or with
/// an empty label or URL, are skipped.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();
    for entry in entries {
        if let Some((raw_label, raw_url)) = entry.split_once('=') {
            let (label, url) = (raw_label.trim(), raw_url.trim());
            if !label.is_empty() && !url.is_empty() {
                pairs.push((label.to_string(), url.to_string()));
            }
        }
    }
    pairs
}
4957
4958fn apply_project_url_mappings(
4959    parsed_urls: &[(String, String)],
4960    homepage_url: &mut Option<String>,
4961    bug_tracking_url: &mut Option<String>,
4962    code_view_url: &mut Option<String>,
4963    vcs_url: &mut Option<String>,
4964    extra_data: &mut HashMap<String, serde_json::Value>,
4965) {
4966    for (label, url) in parsed_urls {
4967        let label_lower = label.to_lowercase();
4968
4969        if bug_tracking_url.is_none()
4970            && matches!(
4971                label_lower.as_str(),
4972                "tracker"
4973                    | "bug reports"
4974                    | "bug tracker"
4975                    | "issues"
4976                    | "issue tracker"
4977                    | "github: issues"
4978            )
4979        {
4980            *bug_tracking_url = Some(url.clone());
4981        } else if code_view_url.is_none()
4982            && matches!(label_lower.as_str(), "source" | "source code" | "code")
4983        {
4984            *code_view_url = Some(url.clone());
4985        } else if vcs_url.is_none()
4986            && matches!(
4987                label_lower.as_str(),
4988                "github" | "gitlab" | "github: repo" | "repository"
4989            )
4990        {
4991            *vcs_url = Some(url.clone());
4992        } else if homepage_url.is_none()
4993            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4994        {
4995            *homepage_url = Some(url.clone());
4996        } else if label_lower == "changelog" {
4997            extra_data.insert(
4998                "changelog_url".to_string(),
4999                serde_json::Value::String(url.clone()),
5000            );
5001        }
5002    }
5003
5004    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
5005        .iter()
5006        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
5007        .collect();
5008
5009    if !project_urls_json.is_empty() {
5010        extra_data.insert(
5011            "project_urls".to_string(),
5012            serde_json::Value::Object(project_urls_json),
5013        );
5014    }
5015}
5016
5017fn parse_setup_cfg(content: &str) -> IniSections {
5018    let mut sections: IniSections = HashMap::new();
5019    let mut current_section: Option<String> = None;
5020    let mut current_key: Option<String> = None;
5021
5022    for raw_line in content.lines() {
5023        let line = raw_line.trim_end_matches('\r');
5024        let trimmed = line.trim();
5025        if trimmed.is_empty() {
5026            continue;
5027        }
5028
5029        let stripped = line.trim_start();
5030        if stripped.starts_with('#') || stripped.starts_with(';') {
5031            continue;
5032        }
5033
5034        if stripped.starts_with('[') && stripped.ends_with(']') {
5035            let section_name = stripped
5036                .trim_start_matches('[')
5037                .trim_end_matches(']')
5038                .trim()
5039                .to_ascii_lowercase();
5040            current_section = if section_name.is_empty() {
5041                None
5042            } else {
5043                Some(section_name)
5044            };
5045            current_key = None;
5046            continue;
5047        }
5048
5049        if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
5050            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
5051                let value = stripped.trim();
5052                if !value.is_empty() {
5053                    sections
5054                        .entry(section.clone())
5055                        .or_default()
5056                        .entry(key.clone())
5057                        .or_default()
5058                        .push(value.to_string());
5059                }
5060            }
5061            continue;
5062        }
5063
5064        if let Some((key, value)) = stripped.split_once('=')
5065            && let Some(section) = current_section.as_ref()
5066        {
5067            let key_name = key.trim().to_ascii_lowercase();
5068            let value_trimmed = value.trim();
5069            let entry = sections
5070                .entry(section.clone())
5071                .or_default()
5072                .entry(key_name.clone())
5073                .or_default();
5074            if !value_trimmed.is_empty() {
5075                entry.push(value_trimmed.to_string());
5076            }
5077            current_key = Some(key_name);
5078        }
5079    }
5080
5081    sections
5082}
5083
5084fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
5085    sections
5086        .get(&section.to_ascii_lowercase())
5087        .and_then(|values| values.get(&key.to_ascii_lowercase()))
5088        .and_then(|entries| entries.first())
5089        .map(|value| value.trim().to_string())
5090}
5091
5092fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
5093    sections
5094        .get(&section.to_ascii_lowercase())
5095        .and_then(|values| values.get(&key.to_ascii_lowercase()))
5096        .cloned()
5097        .unwrap_or_default()
5098}
5099
5100fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
5101    let mut dependencies = Vec::new();
5102
5103    for (sub_section, scope) in [
5104        ("install_requires", "install"),
5105        ("tests_require", "test"),
5106        ("setup_requires", "setup"),
5107    ] {
5108        let reqs = get_ini_values(sections, "options", sub_section);
5109        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
5110    }
5111
5112    if let Some(extras) = sections.get("options.extras_require") {
5113        let mut extra_items: Vec<_> = extras.iter().collect();
5114        extra_items.sort_by_key(|(name, _)| *name);
5115        for (extra_name, reqs) in extra_items {
5116            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
5117        }
5118    }
5119
5120    dependencies
5121}
5122
5123fn parse_setup_cfg_requirements(
5124    reqs: &[String],
5125    scope: &str,
5126    is_optional: bool,
5127) -> Vec<Dependency> {
5128    reqs.iter()
5129        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
5130        .collect()
5131}
5132
5133fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
5134    let trimmed = req.trim();
5135    if trimmed.is_empty() || trimmed.starts_with('#') {
5136        return None;
5137    }
5138
5139    let name = extract_setup_cfg_dependency_name(trimmed)?;
5140    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5141
5142    Some(Dependency {
5143        purl: Some(purl.to_string()),
5144        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
5145        scope: Some(scope.to_string()),
5146        is_runtime: Some(true),
5147        is_optional: Some(is_optional),
5148        is_pinned: Some(false),
5149        is_direct: Some(true),
5150        resolved_package: None,
5151        extra_data: None,
5152    })
5153}
5154
/// Extracts the distribution name from a requirement string.
///
/// The name is the leading run of characters up to the first whitespace or the first character
/// that starts another part of a requirement: a version operator (`<`, `>`, `=`, `!`, `~`), a
/// parenthesised version (`(`), extras (`[`), a marker (`;`), or a direct URL reference (`@`).
///
/// Returns `None` when the requirement is blank or does not start with a name.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let name = req
        .trim()
        .split(|c: char| {
            c.is_whitespace()
                || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .next()?;

    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
5171
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    req.split_whitespace().collect()
}
5175
/// Finds a quoted `key = "value"` assignment in `setup.py` text by plain substring search.
///
/// All combinations of zero or one space around `=` are tried, double-quoted forms before
/// single-quoted ones. The first form that occurs in `content` and whose value is terminated
/// by the same quote character that opened it wins, so a value may contain the other kind of
/// quote (e.g. `name="it's"`).
///
/// Returns `None` when no such assignment is found.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    const SPACINGS: [(&str, &str); 4] = [("", ""), (" ", ""), ("", " "), (" ", " ")];

    for quote in ['"', '\''] {
        for (before, after) in SPACINGS {
            let pattern = format!("{key}{before}={after}{quote}");
            let Some(start) = content.find(&pattern) else {
                continue;
            };

            let remaining = &content[start + pattern.len()..];
            // Only the opening quote character can close the literal.
            if let Some(end) = remaining.find(quote) {
                return Some(remaining[..end].to_string());
            }
        }
    }

    None
}
5201
5202fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
5203    let mut dependencies = Vec::new();
5204
5205    if let Some(tests_deps) = extract_tests_require(content) {
5206        dependencies.extend(tests_deps);
5207    }
5208
5209    if let Some(extras_deps) = extract_extras_require(content) {
5210        dependencies.extend(extras_deps);
5211    }
5212
5213    dependencies
5214}
5215
5216fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
5217    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
5218    let re = Regex::new(pattern).ok()?;
5219    let captures = re.captures(content)?;
5220    let deps_str = captures.get(1)?.as_str();
5221
5222    let deps = parse_setup_py_dep_list(deps_str, "test", true);
5223    if deps.is_empty() { None } else { Some(deps) }
5224}
5225
5226fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
5227    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
5228    let re = Regex::new(pattern).ok()?;
5229    let captures = re.captures(content)?;
5230    let dict_content = captures.get(1)?.as_str();
5231
5232    let mut all_deps = Vec::new();
5233
5234    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
5235    let entry_re = Regex::new(entry_pattern).ok()?;
5236
5237    for entry_cap in entry_re.captures_iter(dict_content) {
5238        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
5239            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
5240            all_deps.extend(deps);
5241        }
5242    }
5243
5244    if all_deps.is_empty() {
5245        None
5246    } else {
5247        Some(all_deps)
5248    }
5249}
5250
5251fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
5252    let dep_pattern = r#"['"]([^'"]+)['"]"#;
5253    let re = match Regex::new(dep_pattern) {
5254        Ok(r) => r,
5255        Err(_) => return Vec::new(),
5256    };
5257
5258    re.captures_iter(deps_str)
5259        .filter_map(|cap| {
5260            let dep_str = cap.get(1)?.as_str().trim();
5261            if dep_str.is_empty() {
5262                return None;
5263            }
5264
5265            let name = extract_setup_cfg_dependency_name(dep_str)?;
5266            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5267
5268            Some(Dependency {
5269                purl: Some(purl.to_string()),
5270                extracted_requirement: Some(dep_str.to_string()),
5271                scope: Some(scope.to_string()),
5272                is_runtime: Some(true),
5273                is_optional: Some(is_optional),
5274                is_pinned: Some(false),
5275                is_direct: Some(true),
5276                resolved_package: None,
5277                extra_data: None,
5278            })
5279        })
5280        .collect()
5281}
5282
5283/// Reads and parses a TOML file
5284pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
5285    let content = read_file_to_string(path, None).map_err(|e| e.to_string())?;
5286    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
5287}
5288
5289/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
5290///
5291/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
5292/// Essential for SBOM compliance and package integrity verification.
5293///
5294/// # Returns
5295///
5296/// - `(Some(size), Some(hash))` on success
5297/// - `(None, None)` if file cannot be opened
5298/// - `(Some(size), None)` if hash calculation fails during read
5299fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<Sha256Digest>) {
5300    let mut file = match File::open(path) {
5301        Ok(f) => f,
5302        Err(_) => return (None, None),
5303    };
5304
5305    let metadata = match file.metadata() {
5306        Ok(m) => m,
5307        Err(_) => return (None, None),
5308    };
5309    let size = metadata.len();
5310
5311    let mut hasher = Sha256::new();
5312    let mut buffer = vec![0; 8192];
5313
5314    loop {
5315        match file.read(&mut buffer) {
5316            Ok(0) => break,
5317            Ok(n) => hasher.update(&buffer[..n]),
5318            Err(_) => return (Some(size), None),
5319        }
5320    }
5321
5322    let hash = Sha256Digest::from_bytes(hasher.finalize().into());
5323    (Some(size), Some(hash))
5324}
5325
5326fn default_package_data(path: &Path) -> PackageData {
5327    PackageData {
5328        package_type: Some(PythonParser::PACKAGE_TYPE),
5329        primary_language: Some("Python".to_string()),
5330        datasource_id: infer_python_datasource_id(path),
5331        ..Default::default()
5332    }
5333}
5334
5335fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
5336    let file_name = path.file_name().and_then(|name| name.to_str());
5337
5338    match file_name {
5339        Some("pyproject.toml") => {
5340            if read_toml_file(path)
5341                .ok()
5342                .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
5343                .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
5344                .is_some()
5345            {
5346                Some(DatasourceId::PypiPoetryPyprojectToml)
5347            } else {
5348                Some(DatasourceId::PypiPyprojectToml)
5349            }
5350        }
5351        Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
5352            Some(DatasourceId::PypiSetupPy)
5353        }
5354        Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
5355        Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
5356        Some("METADATA") if is_installed_wheel_metadata_path(path) => {
5357            Some(DatasourceId::PypiWheelMetadata)
5358        }
5359        Some("pypi.json") => Some(DatasourceId::PypiJson),
5360        Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
5361        Some("origin.json") if is_pip_cache_origin_json(path) => {
5362            Some(DatasourceId::PypiPipOriginJson)
5363        }
5364        _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
5365            Some(DatasourceId::PypiSdist)
5366        }
5367        _ if path
5368            .extension()
5369            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
5370        {
5371            Some(DatasourceId::PypiWheel)
5372        }
5373        _ if path
5374            .extension()
5375            .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
5376        {
5377            Some(DatasourceId::PypiEgg)
5378        }
5379        _ => None,
5380    }
5381}
5382
// Registration entry for the Python parser, passed to the crate's `register_parser!` macro:
// a human-readable description, the path globs below, and then "pypi", "Python" and a
// documentation URL.
// NOTE(review): the meaning of each positional argument is defined by the macro, which is
// not visible here — presumably package type, primary language and docs link; confirm.
// Several globs are deliberately broad (e.g. `**/origin.json`, `**/*.zip`, `**/*.tar.gz`);
// `infer_python_datasource_id` applies narrower checks for `origin.json` and for
// sdist-like archive names.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);