Skip to main content

provenant/parsers/
python.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34use crate::models::{
35    DatasourceId, Dependency, FileReference, PackageData, PackageType, Party, Sha256Digest,
36};
37use crate::parser_warn as warn;
38use crate::parsers::utils::{
39    MAX_ITERATION_COUNT, read_file_to_string, split_name_email, truncate_field,
40};
41use base64::Engine;
42use base64::engine::general_purpose::URL_SAFE_NO_PAD;
43use bzip2::read::BzDecoder;
44use csv::ReaderBuilder;
45use flate2::read::GzDecoder;
46use liblzma::read::XzDecoder;
47use packageurl::PackageUrl;
48use regex::Regex;
49use ruff_python_ast as ast;
50use ruff_python_parser::parse_module;
51use serde_json::{Map as JsonMap, Value as JsonValue};
52use sha2::{Digest, Sha256};
53use std::collections::{HashMap, HashSet};
54use std::fs::File;
55use std::io::Read;
56use std::path::{Component, Path, PathBuf};
57use tar::Archive;
58use toml::Value as TomlValue;
59use toml::map::Map as TomlMap;
60use zip::ZipArchive;
61
62use super::PackageParser;
63use super::license_normalization::{
64    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
65    normalize_spdx_expression,
66};
67use super::pep508::parse_pep508_requirement;
68
// Table and key names looked up in pyproject.toml.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_EXTRAS: &str = "extras";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

/// Five optional URL strings carried around as one value.
///
/// NOTE(review): the meaning of each slot is fixed by the code that builds and
/// destructures this tuple, which lives outside this section — confirm there.
type ProjectUrls = (
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
);

// setup.py processing limits (the consumers are outside this section).
const MAX_SETUP_PY_BYTES: usize = 1024 * 1024; // 1 MiB
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

// Archive safety limits.
/// Upper bound for both the on-disk archive size and the summed size of its entries (100MB).
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;
/// Upper bound for a single archive entry (50MB).
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
/// Largest accepted uncompressed:compressed ratio for an entry (100:1).
const MAX_COMPRESSION_RATIO: f64 = 100.0;
100
/// Parser for Python package manifests, metadata files and archives.
///
/// Handles pyproject.toml, setup.py, setup.cfg, PKG-INFO, METADATA, pip-inspect
/// lockfiles, pypi.json, pip cache `origin.json`, and .whl/.egg/sdist archives.
///
/// # Security
///
/// setup.py is inspected through AST analysis and is never executed, so scanning
/// a package cannot run arbitrary code. See `extract_from_setup_py_ast` for details.
pub struct PythonParser;
111
/// Container formats recognised for Python source distributions.
#[derive(Debug, Clone, Copy)]
enum PythonSdistArchiveFormat {
    /// `.tar.gz`
    TarGz,
    /// `.tgz`
    Tgz,
    /// `.tar.bz2`
    TarBz2,
    /// `.tar.xz`
    TarXz,
    /// `.zip`
    Zip,
}
120
/// A zip entry that passed the archive safety checks.
#[derive(Debug, Clone)]
struct ValidatedZipEntry {
    /// Position of the entry inside the zip archive.
    index: usize,
    /// Normalized entry path.
    name: String,
}
126
127impl PackageParser for PythonParser {
128    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
129
130    fn extract_packages(path: &Path) -> Vec<PackageData> {
131        vec![
132            if path.file_name().unwrap_or_default() == "pyproject.toml" {
133                extract_from_pyproject_toml(path)
134            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
135                extract_from_setup_cfg(path)
136            } else if is_setup_py_like_path(path) {
137                return extract_setup_py_packages(path);
138            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
139                extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
140            } else if is_installed_wheel_metadata_path(path) {
141                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
142            } else if is_pip_cache_origin_json(path) {
143                extract_from_pip_origin_json(path)
144            } else if path.file_name().unwrap_or_default() == "pypi.json" {
145                extract_from_pypi_json(path)
146            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
147                extract_from_pip_inspect(path)
148            } else if is_python_sdist_archive_path(path) {
149                extract_from_sdist_archive(path)
150            } else if path
151                .extension()
152                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
153            {
154                extract_from_wheel_archive(path)
155            } else if path
156                .extension()
157                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
158            {
159                extract_from_egg_archive(path)
160            } else {
161                default_package_data(path)
162            },
163        ]
164    }
165
166    fn is_match(path: &Path) -> bool {
167        if let Some(filename) = path.file_name()
168            && (filename == "pyproject.toml"
169                || filename == "setup.cfg"
170                || is_setup_py_like_path(path)
171                || filename == "PKG-INFO"
172                || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
173                || filename == "pypi.json"
174                || filename == "pip-inspect.deplock"
175                || is_pip_cache_origin_json(path))
176        {
177            return true;
178        }
179
180        if let Some(extension) = path.extension() {
181            let ext = extension.to_string_lossy().to_lowercase();
182            if (ext == "whl" && is_valid_wheel_archive_path(path))
183                || ext == "egg"
184                || is_python_sdist_archive_path(path)
185            {
186                return true;
187            }
188        }
189
190        false
191    }
192}
193
/// Returns `true` for files named `setup.py` or ending in `_setup.py`.
///
/// File names that are not valid UTF-8 never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    match path.file_name().and_then(|raw| raw.to_str()) {
        Some(file_name) => file_name == "setup.py" || file_name.ends_with("_setup.py"),
        None => false,
    }
}
199
/// Returns `true` when `path` is a `METADATA` file located directly inside a
/// directory whose name ends with `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    if path.file_name().and_then(|raw| raw.to_str()) != Some("METADATA") {
        return false;
    }

    let containing_dir = path
        .parent()
        .and_then(|dir| dir.file_name())
        .and_then(|raw| raw.to_str());

    match containing_dir {
        Some(dir_name) => dir_name.ends_with(".dist-info"),
        None => false,
    }
}
208
/// Values read from the `WHEEL` file of an installed distribution.
#[derive(Clone, Debug)]
struct InstalledWheelMetadata {
    /// Every `Tag` header value, in file order.
    wheel_tags: Vec<String>,
    /// `Wheel-Version` header, when present.
    wheel_version: Option<String>,
    /// `Generator` header, when present.
    wheel_generator: Option<String>,
    /// `Root-Is-Purelib` header parsed as a boolean, when present and recognised.
    root_is_purelib: Option<bool>,
    /// Single tag string derived from `wheel_tags`, when they can be combined.
    compressed_tag: Option<String>,
}
217
218fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
219    let Some(parent) = path.parent() else {
220        return;
221    };
222
223    if !parent
224        .file_name()
225        .and_then(|name| name.to_str())
226        .is_some_and(|name| name.ends_with(".dist-info"))
227    {
228        return;
229    }
230
231    let wheel_path = parent.join("WHEEL");
232    if !wheel_path.exists() {
233        return;
234    }
235
236    let Ok(content) = read_file_to_string(&wheel_path, None) else {
237        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
238        return;
239    };
240
241    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
242        return;
243    };
244
245    apply_installed_wheel_metadata(package_data, &wheel_metadata);
246}
247
248fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
249    use super::rfc822::{get_header_all, get_header_first};
250
251    let metadata = super::rfc822::parse_rfc822_content(content);
252    let wheel_tags = get_header_all(&metadata.headers, "tag");
253    if wheel_tags.is_empty() {
254        return None;
255    }
256
257    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
258    let wheel_generator = get_header_first(&metadata.headers, "generator");
259    let root_is_purelib =
260        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
261            match value.to_ascii_lowercase().as_str() {
262                "true" => Some(true),
263                "false" => Some(false),
264                _ => None,
265            }
266        });
267
268    let compressed_tag = compress_wheel_tags(&wheel_tags);
269
270    Some(InstalledWheelMetadata {
271        wheel_tags,
272        wheel_version,
273        wheel_generator,
274        root_is_purelib,
275        compressed_tag,
276    })
277}
278
/// Collapses several `python-abi-platform` tags into one tag string.
///
/// * no tags → `None`
/// * one tag → that tag, unchanged
/// * several tags → `py1.py2-abi-platform`, provided every tag splits into
///   three `-`-separated parts and all of them share the same abi and platform;
///   otherwise `None`.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => return None,
        [only] => return Some(only.clone()),
        _ => {}
    }

    let mut interpreters: Vec<&str> = Vec::with_capacity(tags.len());
    let mut shared: Option<(&str, &str)> = None;

    for tag in tags {
        // At most three pieces: anything after the second '-' belongs to the platform.
        let mut pieces = tag.splitn(3, '-');
        let (python, abi, platform) = (pieces.next()?, pieces.next()?, pieces.next()?);

        match shared {
            Some((seen_abi, seen_platform)) if seen_abi != abi || seen_platform != platform => {
                return None;
            }
            _ => shared = Some((abi, platform)),
        }

        interpreters.push(python);
    }

    let (abi, platform) = shared?;
    Some(format!("{}-{}-{}", interpreters.join("."), abi, platform))
}
316
317fn apply_installed_wheel_metadata(
318    package_data: &mut PackageData,
319    wheel_metadata: &InstalledWheelMetadata,
320) {
321    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
322    extra_data.insert(
323        "wheel_tags".to_string(),
324        JsonValue::Array(
325            wheel_metadata
326                .wheel_tags
327                .iter()
328                .cloned()
329                .map(JsonValue::String)
330                .collect(),
331        ),
332    );
333
334    if let Some(wheel_version) = &wheel_metadata.wheel_version {
335        extra_data.insert(
336            "wheel_version".to_string(),
337            JsonValue::String(wheel_version.clone()),
338        );
339    }
340
341    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
342        extra_data.insert(
343            "wheel_generator".to_string(),
344            JsonValue::String(wheel_generator.clone()),
345        );
346    }
347
348    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
349        extra_data.insert(
350            "root_is_purelib".to_string(),
351            JsonValue::Bool(root_is_purelib),
352        );
353    }
354
355    if let (Some(name), Some(version), Some(extension)) = (
356        package_data.name.as_deref(),
357        package_data.version.as_deref(),
358        wheel_metadata.compressed_tag.as_deref(),
359    ) {
360        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
361    }
362}
363
/// Returns `true` for an `origin.json` file that has an ancestor directory
/// named `wheels` (compared ASCII case-insensitively).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    if path.file_name().and_then(|raw| raw.to_str()) != Some("origin.json") {
        return false;
    }

    // skip(1): `ancestors()` yields the path itself first.
    for ancestor in path.ancestors().skip(1) {
        let dir_name = ancestor.file_name().and_then(|raw| raw.to_str());
        if dir_name.is_some_and(|dir_name| dir_name.eq_ignore_ascii_case("wheels")) {
            return true;
        }
    }

    false
}
373
374fn extract_from_pip_origin_json(path: &Path) -> PackageData {
375    let content = match read_file_to_string(path, None) {
376        Ok(content) => content,
377        Err(e) => {
378            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
379            return default_package_data(path);
380        }
381    };
382
383    let root: JsonValue = match serde_json::from_str(&content) {
384        Ok(root) => root,
385        Err(e) => {
386            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
387            return default_package_data(path);
388        }
389    };
390
391    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
392        warn!("No url found in pip cache origin.json at {:?}", path);
393        return default_package_data(path);
394    };
395
396    let sibling_wheel = find_sibling_cached_wheel(path);
397    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
398        sibling_wheel
399            .as_ref()
400            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
401    });
402
403    let Some((name, version)) = name_version else {
404        warn!(
405            "Failed to infer package name/version from pip cache origin.json at {:?}",
406            path
407        );
408        return default_package_data(path);
409    };
410
411    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
412        build_pypi_urls(Some(&name), Some(&version));
413    let purl = sibling_wheel
414        .as_ref()
415        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
416        .or(plain_purl);
417
418    PackageData {
419        package_type: Some(PythonParser::PACKAGE_TYPE),
420        primary_language: Some("Python".to_string()),
421        name: Some(truncate_field(name)),
422        version: Some(version),
423        datasource_id: Some(DatasourceId::PypiPipOriginJson),
424        download_url: Some(truncate_field(download_url.to_string())),
425        sha256: extract_sha256_from_origin_json(&root)
426            .and_then(|h| Sha256Digest::from_hex(&h).ok()),
427        repository_homepage_url,
428        repository_download_url,
429        api_data_url,
430        purl,
431        ..Default::default()
432    }
433}
434
435fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
436    let parent = path.parent()?;
437    let entries = parent.read_dir().ok()?;
438
439    for entry in entries.flatten() {
440        let sibling_path = entry.path();
441        if sibling_path
442            .extension()
443            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
444            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
445        {
446            return Some(wheel_info);
447        }
448    }
449
450    None
451}
452
453fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
454    let file_name = url.rsplit('/').next()?;
455
456    if file_name.ends_with(".whl") {
457        return parse_wheel_filename(Path::new(file_name))
458            .map(|wheel_info| (wheel_info.name, wheel_info.version));
459    }
460
461    let stem = strip_python_archive_extension(file_name)?;
462    let (name, version) = stem.rsplit_once('-')?;
463    if name.is_empty() || version.is_empty() {
464        return None;
465    }
466
467    Some((name.replace('_', "-"), version.to_string()))
468}
469
/// Removes a trailing Python archive extension from `file_name`.
///
/// Suffixes are tried in the listed order; returns `None` when none applies.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] = [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES {
        if let Some(stem) = file_name.strip_suffix(suffix) {
            return Some(stem);
        }
    }

    None
}
475
476fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
477    root.pointer("/archive_info/hashes/sha256")
478        .and_then(|value| value.as_str())
479        .map(ToOwned::to_owned)
480        .or_else(|| {
481            root.pointer("/archive_info/hash")
482                .and_then(|value| value.as_str())
483                .and_then(normalize_origin_hash)
484        })
485}
486
/// Extracts the digest from an `origin.json` hash string.
///
/// Accepts `sha256=<digest>`, `sha256:<digest>`, or a bare 64-character hex
/// string. A prefixed digest is returned as-is, without validation. Anything
/// else yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = hash
        .strip_prefix("sha256=")
        .or_else(|| hash.strip_prefix("sha256:"));

    match prefixed {
        Some(digest) => Some(digest.to_string()),
        None if hash.len() == 64 && hash.bytes().all(|byte| byte.is_ascii_hexdigit()) => {
            Some(hash.to_string())
        }
        None => None,
    }
}
499
500fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
501    let content = match read_file_to_string(path, None) {
502        Ok(content) => content,
503        Err(e) => {
504            warn!("Failed to read metadata at {:?}: {}", path, e);
505            return default_package_data(path);
506        }
507    };
508
509    let metadata = super::rfc822::parse_rfc822_content(&content);
510    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
511    merge_sibling_metadata_dependencies(path, &mut package_data);
512    merge_sibling_metadata_file_references(path, &mut package_data);
513    if datasource_id == DatasourceId::PypiWheelMetadata {
514        merge_sibling_wheel_metadata(path, &mut package_data);
515    }
516    package_data
517}
518
519fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
520    let mut extra_dependencies = Vec::new();
521
522    if let Some(parent) = path.parent() {
523        let direct_requires = parent.join("requires.txt");
524        if direct_requires.exists()
525            && let Ok(content) = read_file_to_string(&direct_requires, None)
526        {
527            extra_dependencies.extend(parse_requires_txt(&content));
528        }
529
530        let sibling_egg_info_requires = parent
531            .read_dir()
532            .ok()
533            .into_iter()
534            .flatten()
535            .flatten()
536            .find_map(|entry| {
537                let child_path = entry.path();
538                if child_path.is_dir()
539                    && child_path
540                        .file_name()
541                        .and_then(|name| name.to_str())
542                        .is_some_and(|name| name.ends_with(".egg-info"))
543                {
544                    let requires = child_path.join("requires.txt");
545                    requires.exists().then_some(requires)
546                } else {
547                    None
548                }
549            });
550
551        if let Some(requires_path) = sibling_egg_info_requires
552            && let Ok(content) = read_file_to_string(&requires_path, None)
553        {
554            extra_dependencies.extend(parse_requires_txt(&content));
555        }
556    }
557
558    for dependency in extra_dependencies {
559        if !package_data.dependencies.iter().any(|existing| {
560            existing.purl == dependency.purl
561                && existing.scope == dependency.scope
562                && existing.extracted_requirement == dependency.extracted_requirement
563                && existing.extra_data == dependency.extra_data
564        }) {
565            package_data.dependencies.push(dependency);
566        }
567    }
568}
569
570fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
571    let mut extra_refs = Vec::new();
572
573    if let Some(parent) = path.parent() {
574        let record_path = parent.join("RECORD");
575        if record_path.exists()
576            && let Ok(content) = read_file_to_string(&record_path, None)
577        {
578            extra_refs.extend(parse_record_csv(&content));
579        }
580
581        let installed_files_path = parent.join("installed-files.txt");
582        if installed_files_path.exists()
583            && let Ok(content) = read_file_to_string(&installed_files_path, None)
584        {
585            extra_refs.extend(parse_installed_files_txt(&content));
586        }
587
588        let sources_path = parent.join("SOURCES.txt");
589        if sources_path.exists()
590            && let Ok(content) = read_file_to_string(&sources_path, None)
591        {
592            extra_refs.extend(parse_sources_txt(&content));
593        }
594    }
595
596    for file_ref in extra_refs {
597        if !package_data
598            .file_references
599            .iter()
600            .any(|existing| existing.path == file_ref.path)
601        {
602            package_data.file_references.push(file_ref);
603        }
604    }
605}
606
607fn collect_validated_zip_entries<R: Read + std::io::Seek>(
608    archive: &mut ZipArchive<R>,
609    path: &Path,
610    archive_type: &str,
611) -> Result<Vec<ValidatedZipEntry>, String> {
612    let mut total_extracted = 0u64;
613    let mut entries = Vec::new();
614    let mut entry_count = 0usize;
615
616    for i in 0..archive.len() {
617        entry_count += 1;
618        if entry_count > MAX_ITERATION_COUNT {
619            warn!(
620                "Exceeded max entry count in {} {:?}; stopping at {} entries",
621                archive_type, path, MAX_ITERATION_COUNT
622            );
623            break;
624        }
625        if let Ok(file) = archive.by_index_raw(i) {
626            let compressed_size = file.compressed_size();
627            let uncompressed_size = file.size();
628            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
629                warn!(
630                    "Skipping unsafe path in {} {:?}: {}",
631                    archive_type,
632                    path,
633                    file.name()
634                );
635                continue;
636            };
637
638            if compressed_size > 0 {
639                let ratio = uncompressed_size as f64 / compressed_size as f64;
640                if ratio > MAX_COMPRESSION_RATIO {
641                    warn!(
642                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
643                        archive_type, path, ratio
644                    );
645                    continue;
646                }
647            }
648
649            if uncompressed_size > MAX_FILE_SIZE {
650                warn!(
651                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
652                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
653                );
654                continue;
655            }
656
657            total_extracted += uncompressed_size;
658            if total_extracted > MAX_ARCHIVE_SIZE {
659                let msg = format!(
660                    "Total extracted size exceeds limit for {} {:?}",
661                    archive_type, path
662                );
663                warn!("{}", msg);
664                return Err(msg);
665            }
666
667            entries.push(ValidatedZipEntry {
668                index: i,
669                name: entry_name,
670            });
671        }
672    }
673
674    Ok(entries)
675}
676
677fn is_python_sdist_archive_path(path: &Path) -> bool {
678    detect_python_sdist_archive_format(path).is_some()
679}
680
681fn is_valid_wheel_archive_path(path: &Path) -> bool {
682    if !path.is_file() {
683        return true;
684    }
685
686    let file = match File::open(path) {
687        Ok(file) => file,
688        Err(_) => return false,
689    };
690    let mut archive = match ZipArchive::new(file) {
691        Ok(archive) => archive,
692        Err(_) => return false,
693    };
694
695    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
696        Ok(entries) => entries,
697        Err(_) => return false,
698    };
699
700    find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA").is_some()
701}
702
703fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
704    let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
705
706    if !is_likely_python_sdist_filename(&file_name) {
707        return None;
708    }
709
710    if file_name.ends_with(".tar.gz") {
711        tar_gz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarGz)
712    } else if file_name.ends_with(".tgz") {
713        tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
714    } else if file_name.ends_with(".tar.bz2") {
715        tar_bz2_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarBz2)
716    } else if file_name.ends_with(".tar.xz") {
717        tar_xz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarXz)
718    } else if file_name.ends_with(".zip") {
719        zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
720    } else {
721        None
722    }
723}
724
725fn tar_gz_sdist_contains_pkg_info(path: &Path) -> bool {
726    let Some(compressed_size) = compressed_archive_size(path) else {
727        return false;
728    };
729    let file = match File::open(path) {
730        Ok(file) => file,
731        Err(_) => return false,
732    };
733    let decoder = GzDecoder::new(file);
734    tar_sdist_contains_pkg_info(path, decoder, "tar.gz", compressed_size)
735}
736
737fn tar_bz2_sdist_contains_pkg_info(path: &Path) -> bool {
738    let Some(compressed_size) = compressed_archive_size(path) else {
739        return false;
740    };
741    let file = match File::open(path) {
742        Ok(file) => file,
743        Err(_) => return false,
744    };
745    let decoder = BzDecoder::new(file);
746    tar_sdist_contains_pkg_info(path, decoder, "tar.bz2", compressed_size)
747}
748
749fn tar_xz_sdist_contains_pkg_info(path: &Path) -> bool {
750    let Some(compressed_size) = compressed_archive_size(path) else {
751        return false;
752    };
753    let file = match File::open(path) {
754        Ok(file) => file,
755        Err(_) => return false,
756    };
757    let decoder = XzDecoder::new(file);
758    tar_sdist_contains_pkg_info(path, decoder, "tar.xz", compressed_size)
759}
760
/// Returns the on-disk size of `path` in bytes, or `None` when its metadata
/// cannot be read.
fn compressed_archive_size(path: &Path) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(info) => Some(info.len()),
        Err(_) => None,
    }
}
764
765fn tar_sdist_contains_pkg_info<R: Read>(
766    path: &Path,
767    reader: R,
768    archive_type: &str,
769    compressed_size: u64,
770) -> bool {
771    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
772    else {
773        return false;
774    };
775
776    select_sdist_pkginfo_entry(path, &entries).is_some()
777}
778
779fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
780    if !path.is_file() {
781        return true;
782    }
783
784    let Some(compressed_size) = compressed_archive_size(path) else {
785        return false;
786    };
787    let file = match File::open(path) {
788        Ok(file) => file,
789        Err(_) => return false,
790    };
791    let decoder = GzDecoder::new(file);
792    tar_sdist_contains_pkg_info(path, decoder, "tgz", compressed_size)
793}
794
795fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
796    if !path.is_file() {
797        return true;
798    }
799
800    let file = match File::open(path) {
801        Ok(file) => file,
802        Err(_) => return false,
803    };
804    let mut archive = match ZipArchive::new(file) {
805        Ok(archive) => archive,
806        Err(_) => return false,
807    };
808
809    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
810        Ok(entries) => entries,
811        Err(_) => return false,
812    };
813    let metadata_entries: Vec<_> = validated_entries
814        .iter()
815        .filter(|entry| entry.name.ends_with("/PKG-INFO"))
816        .filter_map(|entry| {
817            read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
818                .ok()
819                .map(|content| (entry.name.clone(), content))
820        })
821        .collect();
822
823    has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
824}
825
826fn is_likely_python_sdist_filename(file_name: &str) -> bool {
827    let Some(stem) = strip_python_archive_extension(file_name) else {
828        return false;
829    };
830
831    let Some((name, version)) = stem.rsplit_once('-') else {
832        return false;
833    };
834
835    !name.is_empty()
836        && !version.is_empty()
837        && version.chars().any(|ch| ch.is_ascii_digit())
838        && name
839            .chars()
840            .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
841}
842
843fn extract_from_sdist_archive(path: &Path) -> PackageData {
844    let metadata = match std::fs::metadata(path) {
845        Ok(m) => m,
846        Err(e) => {
847            warn!(
848                "Failed to read metadata for sdist archive {:?}: {}",
849                path, e
850            );
851            return default_package_data(path);
852        }
853    };
854
855    if metadata.len() > MAX_ARCHIVE_SIZE {
856        warn!(
857            "sdist archive too large: {} bytes (limit: {} bytes)",
858            metadata.len(),
859            MAX_ARCHIVE_SIZE
860        );
861        return default_package_data(path);
862    }
863
864    let Some(format) = detect_python_sdist_archive_format(path) else {
865        return default_package_data(path);
866    };
867
868    let mut package_data = match format {
869        PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
870            let file = match File::open(path) {
871                Ok(file) => file,
872                Err(e) => {
873                    warn!("Failed to open sdist archive {:?}: {}", path, e);
874                    return default_package_data(path);
875                }
876            };
877            let decoder = GzDecoder::new(file);
878            extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
879        }
880        PythonSdistArchiveFormat::TarBz2 => {
881            let file = match File::open(path) {
882                Ok(file) => file,
883                Err(e) => {
884                    warn!("Failed to open sdist archive {:?}: {}", path, e);
885                    return default_package_data(path);
886                }
887            };
888            let decoder = BzDecoder::new(file);
889            extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
890        }
891        PythonSdistArchiveFormat::TarXz => {
892            let file = match File::open(path) {
893                Ok(file) => file,
894                Err(e) => {
895                    warn!("Failed to open sdist archive {:?}: {}", path, e);
896                    return default_package_data(path);
897                }
898            };
899            let decoder = XzDecoder::new(file);
900            extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
901        }
902        PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
903    };
904
905    if package_data.package_type.is_some() {
906        let (size, sha256) = calculate_file_checksums(path);
907        package_data.size = size;
908        package_data.sha256 = sha256;
909    }
910
911    package_data
912}
913
914fn extract_from_tar_sdist_archive<R: Read>(
915    path: &Path,
916    reader: R,
917    archive_type: &str,
918    compressed_size: u64,
919) -> PackageData {
920    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
921    else {
922        return default_package_data(path);
923    };
924
925    build_sdist_package_data(path, entries)
926}
927
/// Walks a tar stream and collects the text of the sdist metadata entries.
///
/// Returns `(normalized entry path, content)` pairs for every entry accepted by
/// `is_relevant_sdist_text_entry`.
///
/// Returns `None` when the archive cannot be iterated, when the accumulated
/// entry sizes exceed `MAX_ARCHIVE_SIZE`, or when the accumulated size divided
/// by `compressed_size` exceeds `MAX_COMPRESSION_RATIO`.
///
/// Individual entries are skipped (not fatal) when they fail to read, are
/// larger than `MAX_FILE_SIZE`, have an unreadable or unsafe path, or are not
/// relevant. Iteration stops early, keeping what was collected so far, once
/// more than `MAX_ITERATION_COUNT` entries have been seen.
fn collect_tar_sdist_entries<R: Read>(
    path: &Path,
    reader: R,
    archive_type: &str,
    compressed_size: u64,
) -> Option<Vec<(String, String)>> {
    let mut archive = Archive::new(reader);
    let archive_entries = match archive.entries() {
        Ok(entries) => entries,
        Err(e) => {
            warn!(
                "Failed to read {} sdist archive {:?}: {}",
                archive_type, path, e
            );
            return None;
        }
    };

    // Running total of the sizes reported by the entries seen so far.
    let mut total_extracted = 0u64;
    let mut entries = Vec::new();
    let mut entry_count = 0usize;

    for entry_result in archive_entries {
        // Bound the amount of work; a partial result is still returned.
        entry_count += 1;
        if entry_count > MAX_ITERATION_COUNT {
            warn!(
                "Exceeded max entry count in {} sdist {:?}; stopping at {} entries",
                archive_type, path, MAX_ITERATION_COUNT
            );
            break;
        }

        let mut entry = match entry_result {
            Ok(entry) => entry,
            Err(e) => {
                warn!(
                    "Failed to read {} sdist entry from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        // Oversized entries are skipped and do NOT count towards the total.
        let entry_size = entry.size();
        if entry_size > MAX_FILE_SIZE {
            warn!(
                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
                archive_type, path, entry_size, MAX_FILE_SIZE
            );
            continue;
        }

        // Every remaining entry counts, including ones that are later found
        // to be irrelevant; exceeding the archive limit aborts the whole scan.
        total_extracted += entry_size;
        if total_extracted > MAX_ARCHIVE_SIZE {
            warn!(
                "Total extracted size exceeds limit for {} sdist {:?}",
                archive_type, path
            );
            return None;
        }

        // The ratio check is skipped when the compressed size is unknown (0),
        // which also avoids dividing by zero.
        if compressed_size > 0 {
            let ratio = total_extracted as f64 / compressed_size as f64;
            if ratio > MAX_COMPRESSION_RATIO {
                warn!(
                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
                    archive_type, path, ratio
                );
                return None;
            }
        }

        let entry_path = match entry.path() {
            Ok(path) => path.to_string_lossy().replace('\\', "/"),
            Err(e) => {
                warn!(
                    "Failed to get {} sdist entry path from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        // Reject absolute paths, parent-directory components and drive prefixes.
        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
            continue;
        };

        if !is_relevant_sdist_text_entry(&entry_path) {
            continue;
        }

        // A failed read drops the entry without logging the returned error.
        if let Ok(content) = read_limited_utf8(
            &mut entry,
            MAX_FILE_SIZE,
            &format!("{} entry {}", archive_type, entry_path),
        ) {
            entries.push((entry_path, content));
        }
    }

    Some(entries)
}
1031
1032fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
1033    let file = match File::open(path) {
1034        Ok(file) => file,
1035        Err(e) => {
1036            warn!("Failed to open zip sdist archive {:?}: {}", path, e);
1037            return default_package_data(path);
1038        }
1039    };
1040
1041    let mut archive = match ZipArchive::new(file) {
1042        Ok(archive) => archive,
1043        Err(e) => {
1044            warn!("Failed to read zip sdist archive {:?}: {}", path, e);
1045            return default_package_data(path);
1046        }
1047    };
1048
1049    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
1050        Ok(entries) => entries,
1051        Err(_) => return default_package_data(path),
1052    };
1053
1054    let mut entries = Vec::new();
1055    for entry in validated_entries.iter() {
1056        if !is_relevant_sdist_text_entry(&entry.name) {
1057            continue;
1058        }
1059
1060        if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
1061            entries.push((entry.name.clone(), content));
1062        }
1063    }
1064
1065    build_sdist_package_data(path, entries)
1066}
1067
/// Returns `true` when `entry_path` names one of the sdist metadata text files
/// this parser reads: a `PKG-INFO`, `requires.txt` or `SOURCES.txt` file that
/// sits inside some directory (the leading `/` is part of the match).
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
1073
1074fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
1075    let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
1076        warn!("No PKG-INFO file found in sdist archive {:?}", path);
1077        return default_package_data(path);
1078    };
1079
1080    let mut package_data =
1081        python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
1082    merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
1083    merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
1084    apply_sdist_name_version_fallback(path, &mut package_data);
1085    package_data.datasource_id = Some(DatasourceId::PypiSdist);
1086    package_data
1087}
1088
1089fn select_sdist_pkginfo_entry(
1090    archive_path: &Path,
1091    entries: &[(String, String)],
1092) -> Option<(String, String)> {
1093    let expected_name = sdist_archive_expected_name(archive_path);
1094
1095    entries
1096        .iter()
1097        .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
1098        .min_by_key(|(entry_path, content)| {
1099            let components: Vec<_> = entry_path
1100                .split('/')
1101                .filter(|part| !part.is_empty())
1102                .collect();
1103            let candidate_name = sdist_pkginfo_candidate_name(content);
1104            let name_rank = if candidate_name == expected_name {
1105                0
1106            } else {
1107                1
1108            };
1109            let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1110
1111            (name_rank, kind_rank, components.len(), entry_path.clone())
1112        })
1113        .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1114}
1115
1116fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1117    let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1118        return false;
1119    };
1120
1121    entries.iter().any(|(entry_path, content)| {
1122        sdist_pkginfo_kind_rank(entry_path) < 3
1123            && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1124    })
1125}
1126
1127fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1128    archive_path
1129        .file_name()
1130        .and_then(|name| name.to_str())
1131        .and_then(strip_python_archive_extension)
1132        .and_then(|stem| {
1133            stem.rsplit_once('-')
1134                .map(|(name, _)| normalize_python_package_name(name))
1135        })
1136}
1137
1138fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1139    let metadata = super::rfc822::parse_rfc822_content(content);
1140    super::rfc822::get_header_first(&metadata.headers, "name")
1141        .map(|name| normalize_python_package_name(&name))
1142}
1143
/// Ranks a `PKG-INFO` path by how canonical its location is (lower is better).
///
/// * `0` – `<root>/<name>.egg-info/PKG-INFO`
/// * `1` – `<root>/PKG-INFO`
/// * `2` – any other path ending in `.egg-info/PKG-INFO`
/// * `3` – everything else
///
/// Empty path segments (for example from doubled slashes) are ignored when
/// counting components.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let segments: Vec<&str> = entry_path
        .split('/')
        .filter(|segment| !segment.is_empty())
        .collect();

    match segments.as_slice() {
        [_, dir, "PKG-INFO"] if dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1161
1162fn merge_sdist_archive_dependencies(
1163    entries: &[(String, String)],
1164    metadata_path: &str,
1165    package_data: &mut PackageData,
1166) {
1167    let metadata_dir = metadata_path
1168        .rsplit_once('/')
1169        .map(|(dir, _)| dir)
1170        .unwrap_or("");
1171    let archive_root = metadata_path.split('/').next().unwrap_or("");
1172    let matched_egg_info_dir =
1173        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1174    let mut extra_dependencies = Vec::new();
1175
1176    for (entry_path, content) in entries {
1177        let is_direct_requires =
1178            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1179        let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1180            entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1181        });
1182
1183        if is_direct_requires || is_egg_info_requires {
1184            extra_dependencies.extend(parse_requires_txt(content));
1185        }
1186    }
1187
1188    for dependency in extra_dependencies {
1189        if !package_data.dependencies.iter().any(|existing| {
1190            existing.purl == dependency.purl
1191                && existing.scope == dependency.scope
1192                && existing.extracted_requirement == dependency.extracted_requirement
1193                && existing.extra_data == dependency.extra_data
1194        }) {
1195            package_data.dependencies.push(dependency);
1196        }
1197    }
1198}
1199
1200fn merge_sdist_archive_file_references(
1201    entries: &[(String, String)],
1202    metadata_path: &str,
1203    package_data: &mut PackageData,
1204) {
1205    let metadata_dir = metadata_path
1206        .rsplit_once('/')
1207        .map(|(dir, _)| dir)
1208        .unwrap_or("");
1209    let archive_root = metadata_path.split('/').next().unwrap_or("");
1210    let matched_egg_info_dir =
1211        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1212    let mut extra_refs = Vec::new();
1213
1214    for (entry_path, content) in entries {
1215        let is_direct_sources =
1216            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1217        let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1218            entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1219        });
1220
1221        if is_direct_sources || is_egg_info_sources {
1222            extra_refs.extend(parse_sources_txt(content));
1223        }
1224    }
1225
1226    for file_ref in extra_refs {
1227        if !package_data
1228            .file_references
1229            .iter()
1230            .any(|existing| existing.path == file_ref.path)
1231        {
1232            package_data.file_references.push(file_ref);
1233        }
1234    }
1235}
1236
1237fn select_matching_sdist_egg_info_dir(
1238    entries: &[(String, String)],
1239    archive_root: &str,
1240    package_name: Option<&str>,
1241) -> Option<String> {
1242    let normalized_package_name = package_name.map(normalize_python_package_name);
1243
1244    entries
1245        .iter()
1246        .filter_map(|(entry_path, _)| {
1247            let components: Vec<_> = entry_path
1248                .split('/')
1249                .filter(|part| !part.is_empty())
1250                .collect();
1251            if components.len() == 3
1252                && components[0] == archive_root
1253                && components[1].ends_with(".egg-info")
1254            {
1255                Some(components[1].to_string())
1256            } else {
1257                None
1258            }
1259        })
1260        .min_by_key(|egg_info_dir| {
1261            let normalized_dir_name =
1262                normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1263            let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1264                0
1265            } else {
1266                1
1267            };
1268
1269            (name_rank, egg_info_dir.clone())
1270        })
1271}
1272
/// Normalizes a Python distribution name for comparison (PEP 503 style).
///
/// ASCII letters are lower-cased and every run of `-`, `_` or `.` collapses
/// into a single `-`, so `Zope.Interface`, `zope_interface` and
/// `zope--interface` all normalize to `zope-interface`. Treating `.` and
/// repeated separators as equivalent matters because archive file names and
/// `PKG-INFO` `Name` headers frequently spell the same project differently.
fn normalize_python_package_name(name: &str) -> String {
    let mut normalized = String::with_capacity(name.len());
    let mut in_separator_run = false;

    for ch in name.chars() {
        if matches!(ch, '-' | '_' | '.') {
            // Emit one dash for the whole run of separators.
            if !in_separator_run {
                normalized.push('-');
            }
            in_separator_run = true;
        } else {
            normalized.push(ch.to_ascii_lowercase());
            in_separator_run = false;
        }
    }

    normalized
}
1276
1277fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1278    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1279        return;
1280    };
1281
1282    let Some(stem) = strip_python_archive_extension(file_name) else {
1283        return;
1284    };
1285
1286    let Some((name, version)) = stem.rsplit_once('-') else {
1287        return;
1288    };
1289
1290    if package_data.name.is_none() {
1291        package_data.name = Some(name.replace('_', "-"));
1292    }
1293    if package_data.version.is_none() {
1294        package_data.version = Some(version.to_string());
1295    }
1296
1297    if package_data.purl.is_none()
1298        || package_data.repository_homepage_url.is_none()
1299        || package_data.repository_download_url.is_none()
1300        || package_data.api_data_url.is_none()
1301    {
1302        let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1303            build_pypi_urls(
1304                package_data.name.as_deref(),
1305                package_data.version.as_deref(),
1306            );
1307
1308        if package_data.repository_homepage_url.is_none() {
1309            package_data.repository_homepage_url = repository_homepage_url;
1310        }
1311        if package_data.repository_download_url.is_none() {
1312            package_data.repository_download_url = repository_download_url;
1313        }
1314        if package_data.api_data_url.is_none() {
1315            package_data.api_data_url = api_data_url;
1316        }
1317        if package_data.purl.is_none() {
1318            package_data.purl = purl;
1319        }
1320    }
1321}
1322
/// Extracts package data from a `.whl` archive at `path`.
///
/// Steps, in order:
/// 1. Reject files larger than `MAX_ARCHIVE_SIZE` (checked via filesystem metadata).
/// 2. Open the zip and validate its entries with `collect_validated_zip_entries`.
/// 3. Parse the first entry ending in `.dist-info/METADATA` as RFC 822 metadata.
/// 4. Record the archive's size and SHA-256.
/// 5. If a `.dist-info/RECORD` entry can be read, use it for `file_references`.
/// 6. If the file name parses as a wheel name, fill missing name/version,
///    set the `extension` qualifier, rebuild the purl and store the tags in
///    `extra_data`.
///
/// Any failure in steps 1-3 returns `default_package_data(path)`.
fn extract_from_wheel_archive(path: &Path) -> PackageData {
    let metadata = match std::fs::metadata(path) {
        Ok(m) => m,
        Err(e) => {
            warn!(
                "Failed to read metadata for wheel archive {:?}: {}",
                path, e
            );
            return default_package_data(path);
        }
    };

    if metadata.len() > MAX_ARCHIVE_SIZE {
        warn!(
            "Wheel archive too large: {} bytes (limit: {} bytes)",
            metadata.len(),
            MAX_ARCHIVE_SIZE
        );
        return default_package_data(path);
    }

    let file = match File::open(path) {
        Ok(f) => f,
        Err(e) => {
            warn!("Failed to open wheel archive {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    let mut archive = match ZipArchive::new(file) {
        Ok(a) => a,
        Err(e) => {
            warn!("Failed to read wheel archive {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    // No warning is logged here on failure; NOTE(review): presumably the
    // helper reports the reason itself — confirm.
    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
        Ok(entries) => entries,
        Err(_) => return default_package_data(path),
    };

    let metadata_entry =
        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
            Some(entry) => entry,
            None => {
                warn!("No METADATA file found in wheel archive {:?}", path);
                return default_package_data(path);
            }
        };

    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
        Ok(c) => c,
        Err(e) => {
            warn!("Failed to read METADATA from {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);

    let (size, sha256) = calculate_file_checksums(path);
    package_data.size = size;
    package_data.sha256 = sha256;

    // RECORD is optional: a missing or unreadable RECORD leaves the file
    // references produced by the metadata parse untouched.
    if let Some(record_entry) =
        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
        && let Ok(record_content) =
            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
    {
        package_data.file_references = parse_record_csv(&record_content);
    }

    if let Some(wheel_info) = parse_wheel_filename(path) {
        // File-name values are only a fallback for name and version.
        if package_data.name.is_none() {
            package_data.name = Some(wheel_info.name.clone());
        }
        if package_data.version.is_none() {
            package_data.version = Some(wheel_info.version.clone());
        }

        package_data.qualifiers = Some(std::collections::HashMap::from([(
            "extension".to_string(),
            format!(
                "{}-{}-{}",
                wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
            ),
        )]));

        // The purl is replaced unconditionally so that it carries the
        // wheel-specific qualifier.
        package_data.purl = build_wheel_purl(
            package_data.name.as_deref(),
            package_data.version.as_deref(),
            &wheel_info,
        );

        // Note: the "python_requires" key stores the wheel's python tag.
        let mut extra_data = package_data.extra_data.unwrap_or_default();
        extra_data.insert(
            "python_requires".to_string(),
            serde_json::Value::String(wheel_info.python_tag.clone()),
        );
        extra_data.insert(
            "abi_tag".to_string(),
            serde_json::Value::String(wheel_info.abi_tag.clone()),
        );
        extra_data.insert(
            "platform_tag".to_string(),
            serde_json::Value::String(wheel_info.platform_tag.clone()),
        );
        package_data.extra_data = Some(extra_data);
    }

    package_data
}
1436
/// Extracts package data from a `.egg` archive at `path`.
///
/// Steps, in order:
/// 1. Reject files larger than `MAX_ARCHIVE_SIZE` (checked via filesystem metadata).
/// 2. Open the zip and validate its entries with `collect_validated_zip_entries`.
/// 3. Parse the first entry ending in `EGG-INFO/PKG-INFO` or
///    `.egg-info/PKG-INFO` as RFC 822 metadata.
/// 4. Record the archive's size and SHA-256.
/// 5. If an `installed-files.txt` entry can be read, use it for `file_references`.
/// 6. If the file name parses as an egg name, fill missing name/version and
///    store the python version (when present) in `extra_data`.
/// 7. Rebuild the purl from the final name and version.
///
/// Any failure in steps 1-3 returns `default_package_data(path)`.
fn extract_from_egg_archive(path: &Path) -> PackageData {
    let metadata = match std::fs::metadata(path) {
        Ok(m) => m,
        Err(e) => {
            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    if metadata.len() > MAX_ARCHIVE_SIZE {
        warn!(
            "Egg archive too large: {} bytes (limit: {} bytes)",
            metadata.len(),
            MAX_ARCHIVE_SIZE
        );
        return default_package_data(path);
    }

    let file = match File::open(path) {
        Ok(f) => f,
        Err(e) => {
            warn!("Failed to open egg archive {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    let mut archive = match ZipArchive::new(file) {
        Ok(a) => a,
        Err(e) => {
            warn!("Failed to read egg archive {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    // No warning is logged here on failure; NOTE(review): presumably the
    // helper reports the reason itself — confirm.
    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
        Ok(entries) => entries,
        Err(_) => return default_package_data(path),
    };

    // The first entry (in archive order) matching either suffix is used.
    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
        &validated_entries,
        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
    ) {
        Some(entry) => entry,
        None => {
            warn!("No PKG-INFO file found in egg archive {:?}", path);
            return default_package_data(path);
        }
    };

    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
        Ok(c) => c,
        Err(e) => {
            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
            return default_package_data(path);
        }
    };

    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);

    let (size, sha256) = calculate_file_checksums(path);
    package_data.size = size;
    package_data.sha256 = sha256;

    // installed-files.txt is optional: when it is missing or unreadable the
    // file references produced by the metadata parse are left untouched.
    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
        &validated_entries,
        &[
            "EGG-INFO/installed-files.txt",
            ".egg-info/installed-files.txt",
        ],
    ) && let Ok(installed_files_content) =
        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
    {
        package_data.file_references = parse_installed_files_txt(&installed_files_content);
    }

    if let Some(egg_info) = parse_egg_filename(path) {
        // File-name values are only a fallback for name and version.
        if package_data.name.is_none() {
            package_data.name = Some(egg_info.name.clone());
        }
        if package_data.version.is_none() {
            package_data.version = Some(egg_info.version.clone());
        }

        if let Some(python_version) = &egg_info.python_version {
            let mut extra_data = package_data.extra_data.unwrap_or_default();
            extra_data.insert(
                "python_version".to_string(),
                serde_json::Value::String(python_version.clone()),
            );
            package_data.extra_data = Some(extra_data);
        }
    }

    // The purl is rebuilt unconditionally, even when the file name did not
    // parse, so that it reflects the final name and version.
    package_data.purl = build_egg_purl(
        package_data.name.as_deref(),
        package_data.version.as_deref(),
    );

    package_data
}
1538
1539fn find_validated_zip_entry_by_suffix<'a>(
1540    entries: &'a [ValidatedZipEntry],
1541    suffix: &str,
1542) -> Option<&'a ValidatedZipEntry> {
1543    entries.iter().find(|entry| entry.name.ends_with(suffix))
1544}
1545
1546fn find_validated_zip_entry_by_any_suffix<'a>(
1547    entries: &'a [ValidatedZipEntry],
1548    suffixes: &[&str],
1549) -> Option<&'a ValidatedZipEntry> {
1550    entries
1551        .iter()
1552        .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1553}
1554
1555fn read_validated_zip_entry<R: Read + std::io::Seek>(
1556    archive: &mut ZipArchive<R>,
1557    entry: &ValidatedZipEntry,
1558    path: &Path,
1559    archive_type: &str,
1560) -> Result<String, String> {
1561    let mut file = archive
1562        .by_index(entry.index)
1563        .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1564
1565    let compressed_size = file.compressed_size();
1566    let uncompressed_size = file.size();
1567
1568    if compressed_size > 0 {
1569        let ratio = uncompressed_size as f64 / compressed_size as f64;
1570        if ratio > MAX_COMPRESSION_RATIO {
1571            return Err(format!(
1572                "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1573                archive_type, path, ratio
1574            ));
1575        }
1576    }
1577
1578    if uncompressed_size > MAX_FILE_SIZE {
1579        return Err(format!(
1580            "Rejected oversized entry in {} {:?}: {} bytes",
1581            archive_type, path, uncompressed_size
1582        ));
1583    }
1584
1585    read_limited_utf8(
1586        &mut file,
1587        MAX_FILE_SIZE,
1588        &format!("{} entry {}", archive_type, entry.name),
1589    )
1590}
1591
1592fn read_limited_utf8<R: Read>(
1593    reader: &mut R,
1594    max_bytes: u64,
1595    context: &str,
1596) -> Result<String, String> {
1597    let mut limited = reader.take(max_bytes + 1);
1598    let mut bytes = Vec::new();
1599    limited
1600        .read_to_end(&mut bytes)
1601        .map_err(|e| format!("Failed to read {}: {}", context, e))?;
1602
1603    if bytes.len() as u64 > max_bytes {
1604        return Err(format!(
1605            "{} exceeded {} byte limit while reading",
1606            context, max_bytes
1607        ));
1608    }
1609
1610    match String::from_utf8(bytes) {
1611        Ok(s) => Ok(s),
1612        Err(err) => {
1613            let bytes = err.into_bytes();
1614            warn!("Invalid UTF-8 in archive entry; using lossy conversion");
1615            Ok(String::from_utf8_lossy(&bytes).into_owned())
1616        }
1617    }
1618}
1619
/// Converts an archive entry path into a safe, `/`-separated relative path.
///
/// Backslashes are treated as separators and `.` components are dropped.
/// Returns `None` for anything that could escape the archive root: paths that
/// start with a drive prefix such as `C:/`, absolute paths, paths containing
/// `..`, and paths with no usable component at all.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // `Path` only recognises drive prefixes on Windows, so check by hand.
    if let [drive, b':', b'/', ..] = unified.as_bytes() {
        if drive.is_ascii_alphabetic() {
            return None;
        }
    }

    let mut segments: Vec<String> = Vec::new();
    for part in Path::new(&unified).components() {
        match part {
            Component::CurDir => continue,
            Component::Normal(name) => segments.push(name.to_string_lossy().into_owned()),
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1641
1642/// Parses RECORD CSV format from wheel archives (PEP 427).
1643/// Format: path,hash,size (3 columns, no header)
1644/// Hash format: sha256=urlsafe_base64_hash or empty
1645/// Size: bytes as u64 or empty
1646pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1647    let mut reader = ReaderBuilder::new()
1648        .has_headers(false)
1649        .from_reader(content.as_bytes());
1650
1651    let mut file_references = Vec::new();
1652    let mut record_count = 0usize;
1653
1654    for result in reader.records() {
1655        record_count += 1;
1656        if record_count > MAX_ITERATION_COUNT {
1657            warn!(
1658                "Exceeded max record count in RECORD CSV; stopping at {} records",
1659                MAX_ITERATION_COUNT
1660            );
1661            break;
1662        }
1663        match result {
1664            Ok(record) => {
1665                if record.len() < 3 {
1666                    continue;
1667                }
1668
1669                let path = record.get(0).unwrap_or("").trim().to_string();
1670                if path.is_empty() {
1671                    continue;
1672                }
1673
1674                let hash_field = record.get(1).unwrap_or("").trim();
1675                let size_field = record.get(2).unwrap_or("").trim();
1676
1677                // Parse hash: format is "algorithm=value"
1678                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1679                    let parts: Vec<&str> = hash_field.split('=').collect();
1680                    if parts.len() == 2 && parts[0] == "sha256" {
1681                        match URL_SAFE_NO_PAD.decode(parts[1]) {
1682                            Ok(decoded) => {
1683                                let hex = decoded
1684                                    .iter()
1685                                    .map(|b| format!("{:02x}", b))
1686                                    .collect::<String>();
1687                                Sha256Digest::from_hex(&hex).ok()
1688                            }
1689                            Err(_) => None,
1690                        }
1691                    } else {
1692                        None
1693                    }
1694                } else {
1695                    None
1696                };
1697
1698                // Parse size
1699                let size = if !size_field.is_empty() && size_field != "-" {
1700                    size_field.parse::<u64>().ok()
1701                } else {
1702                    None
1703                };
1704
1705                file_references.push(FileReference {
1706                    path,
1707                    size,
1708                    sha1: None,
1709                    md5: None,
1710                    sha256,
1711                    sha512: None,
1712                    extra_data: None,
1713                });
1714            }
1715            Err(e) => {
1716                warn!("Failed to parse RECORD CSV row: {}", e);
1717                continue;
1718            }
1719        }
1720    }
1721
1722    file_references
1723}
1724
1725/// Parses installed-files.txt format from egg archives (PEP 376).
1726/// Format: one file path per line, no headers, no hash, no size
1727pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1728    content
1729        .lines()
1730        .take(MAX_ITERATION_COUNT)
1731        .map(|line| line.trim())
1732        .filter(|line| !line.is_empty())
1733        .map(|path| FileReference {
1734            path: path.to_string(),
1735            size: None,
1736            sha1: None,
1737            md5: None,
1738            sha256: None,
1739            sha512: None,
1740            extra_data: None,
1741        })
1742        .collect()
1743}
1744
1745pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1746    content
1747        .lines()
1748        .take(MAX_ITERATION_COUNT)
1749        .map(str::trim)
1750        .filter(|line| !line.is_empty())
1751        .map(|path| FileReference {
1752            path: path.to_string(),
1753            size: None,
1754            sha1: None,
1755            md5: None,
1756            sha256: None,
1757            sha512: None,
1758            extra_data: None,
1759        })
1760        .collect()
1761}
1762
/// Components parsed from a wheel file name
/// (`{name}-{version}(-{build})?-{python}-{abi}-{platform}.whl`).
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version exactly as it appears in the file name.
    version: String,
    /// Python tag, e.g. `py3` or `cp39`.
    python_tag: String,
    /// ABI tag, e.g. `none` or `cp39`.
    abi_tag: String,
    /// Platform tag, e.g. `any` or `linux_x86_64`.
    platform_tag: String,
}

/// Parses the wheel file name of `path` into its components.
///
/// The wheel naming convention allows an optional build tag between the
/// version and the python tag; build tags must start with a digit, which
/// python tags never do. When such a tag is present it is skipped so that the
/// python/abi/platform tags are not shifted by one position. Any segments left
/// over after the ABI tag are joined with `-` into the platform tag.
///
/// Returns `None` when the path has no file stem or the stem has fewer than
/// five `-`-separated segments.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    // A sixth segment is only treated as a build tag when it looks like one.
    let has_build_tag =
        parts.len() >= 6 && parts[2].starts_with(|c: char| c.is_ascii_digit());
    let tags = if has_build_tag {
        &parts[3..]
    } else {
        &parts[2..]
    };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: tags[0].to_string(),
        abi_tag: tags[1].to_string(),
        platform_tag: tags[2..].join("-"),
    })
}
1787
/// Components parsed from an egg file name (`{name}-{version}[-{python}]...`).
struct EggInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version exactly as it appears in the file name.
    version: String,
    /// Third `-`-separated segment of the file stem, when present.
    python_version: Option<String>,
}

/// Parses the egg file name of `path` into its components.
///
/// The file stem is split on `-`: the first segment is the name, the second
/// the version and the third, if any, the python version. Returns `None` when
/// the path has no file stem or the stem has fewer than two segments.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut fields = stem.split('-');

    let raw_name = fields.next()?;
    let raw_version = fields.next()?;

    Some(EggInfo {
        name: raw_name.replace('_', "-"),
        version: raw_version.to_string(),
        python_version: fields.next().map(str::to_string),
    })
}
1808
1809fn build_wheel_purl(
1810    name: Option<&str>,
1811    version: Option<&str>,
1812    wheel_info: &WheelInfo,
1813) -> Option<String> {
1814    let name = name?;
1815    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1816
1817    if let Some(ver) = version {
1818        package_url.with_version(ver).ok()?;
1819    }
1820
1821    let extension = format!(
1822        "{}-{}-{}",
1823        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1824    );
1825    package_url.add_qualifier("extension", extension).ok()?;
1826
1827    Some(package_url.to_string())
1828}
1829
1830fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1831    let name = name?;
1832    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1833
1834    if let Some(ver) = version {
1835        package_url.with_version(ver).ok()?;
1836    }
1837
1838    package_url.add_qualifier("type", "egg").ok()?;
1839
1840    Some(package_url.to_string())
1841}
1842
1843fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1844    let metadata = super::rfc822::parse_rfc822_content(content);
1845    build_package_data_from_rfc822(&metadata, datasource_id)
1846}
1847
1848/// Builds PackageData from parsed RFC822 metadata.
1849///
1850/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
1851/// and `python_parse_rfc822_content` (content-based) functions.
1852fn build_package_data_from_rfc822(
1853    metadata: &super::rfc822::Rfc822Metadata,
1854    datasource_id: DatasourceId,
1855) -> PackageData {
1856    use super::rfc822::{get_header_all, get_header_first};
1857
1858    let name = get_header_first(&metadata.headers, "name").map(truncate_field);
1859    let version = get_header_first(&metadata.headers, "version").map(truncate_field);
1860    let summary = get_header_first(&metadata.headers, "summary").map(truncate_field);
1861    let mut homepage_url = get_header_first(&metadata.headers, "home-page").map(truncate_field);
1862    let author = get_header_first(&metadata.headers, "author").map(truncate_field);
1863    let author_email = get_header_first(&metadata.headers, "author-email").map(truncate_field);
1864    let license = get_header_first(&metadata.headers, "license").map(truncate_field);
1865    let license_expression = get_header_first(&metadata.headers, "license-expression");
1866    let download_url = get_header_first(&metadata.headers, "download-url");
1867    let platform = get_header_first(&metadata.headers, "platform");
1868    let requires_python = get_header_first(&metadata.headers, "requires-python");
1869    let classifiers = get_header_all(&metadata.headers, "classifier");
1870    let license_files = get_header_all(&metadata.headers, "license-file");
1871
1872    let description_body = if metadata.body.is_empty() {
1873        get_header_first(&metadata.headers, "description").unwrap_or_default()
1874    } else {
1875        metadata.body.clone()
1876    };
1877
1878    let description = build_description(summary.as_deref(), &description_body).map(truncate_field);
1879
1880    let mut parties = Vec::new();
1881    if author.is_some() || author_email.is_some() {
1882        parties.push(Party {
1883            r#type: Some("person".to_string()),
1884            role: Some("author".to_string()),
1885            name: author,
1886            email: author_email,
1887            url: None,
1888            organization: None,
1889            organization_url: None,
1890            timezone: None,
1891        });
1892    }
1893
1894    let (keywords, license_classifiers) = split_classifiers(&classifiers);
1895    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1896    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1897        license_expression
1898            .as_deref()
1899            .and_then(normalize_spdx_expression)
1900            .map(|normalized| {
1901                build_declared_license_data(
1902                    normalized,
1903                    DeclaredLicenseMatchMetadata::single_line(
1904                        license_expression.as_deref().unwrap_or_default(),
1905                    )
1906                    .with_referenced_filenames(&referenced_license_files),
1907                )
1908            })
1909            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1910
1911    let extracted_license_statement = license_expression
1912        .clone()
1913        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1914
1915    let mut extra_data = HashMap::new();
1916    if let Some(platform_value) = platform
1917        && !platform_value.eq_ignore_ascii_case("unknown")
1918        && !platform_value.is_empty()
1919    {
1920        extra_data.insert(
1921            "platform".to_string(),
1922            serde_json::Value::String(platform_value),
1923        );
1924    }
1925
1926    if let Some(requires_python_value) = requires_python
1927        && !requires_python_value.is_empty()
1928    {
1929        extra_data.insert(
1930            "requires_python".to_string(),
1931            serde_json::Value::String(requires_python_value),
1932        );
1933    }
1934
1935    if !license_files.is_empty() {
1936        extra_data.insert(
1937            "license_files".to_string(),
1938            serde_json::Value::Array(
1939                license_files
1940                    .iter()
1941                    .cloned()
1942                    .map(serde_json::Value::String)
1943                    .collect(),
1944            ),
1945        );
1946    }
1947
1948    let file_references = license_files
1949        .iter()
1950        .map(|path| FileReference {
1951            path: path.clone(),
1952            size: None,
1953            sha1: None,
1954            md5: None,
1955            sha256: None,
1956            sha512: None,
1957            extra_data: None,
1958        })
1959        .collect();
1960
1961    let project_urls = get_header_all(&metadata.headers, "project-url");
1962    let dependencies = extract_rfc822_dependencies(&metadata.headers);
1963    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1964
1965    if !project_urls.is_empty() {
1966        let parsed_urls = parse_project_urls(&project_urls);
1967
1968        for (label, url) in &parsed_urls {
1969            let label_lower = label.to_lowercase();
1970
1971            if bug_tracking_url.is_none()
1972                && matches!(
1973                    label_lower.as_str(),
1974                    "tracker"
1975                        | "bug reports"
1976                        | "bug tracker"
1977                        | "issues"
1978                        | "issue tracker"
1979                        | "github: issues"
1980                )
1981            {
1982                bug_tracking_url = Some(url.clone());
1983            } else if code_view_url.is_none()
1984                && matches!(label_lower.as_str(), "source" | "source code" | "code")
1985            {
1986                code_view_url = Some(url.clone());
1987            } else if vcs_url.is_none()
1988                && matches!(
1989                    label_lower.as_str(),
1990                    "github" | "gitlab" | "github: repo" | "repository"
1991                )
1992            {
1993                vcs_url = Some(url.clone());
1994            } else if homepage_url.is_none()
1995                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1996            {
1997                homepage_url = Some(url.clone());
1998            } else if label_lower == "changelog" {
1999                extra_data.insert(
2000                    "changelog_url".to_string(),
2001                    serde_json::Value::String(url.clone()),
2002                );
2003            }
2004        }
2005
2006        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
2007            .iter()
2008            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
2009            .collect();
2010
2011        if !project_urls_json.is_empty() {
2012            extra_data.insert(
2013                "project_urls".to_string(),
2014                serde_json::Value::Object(project_urls_json),
2015            );
2016        }
2017    }
2018
2019    let extra_data = if extra_data.is_empty() {
2020        None
2021    } else {
2022        Some(extra_data)
2023    };
2024
2025    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
2026        build_pypi_urls(name.as_deref(), version.as_deref());
2027
2028    PackageData {
2029        package_type: Some(PythonParser::PACKAGE_TYPE),
2030        namespace: None,
2031        name,
2032        version,
2033        qualifiers: None,
2034        subpath: None,
2035        primary_language: Some("Python".to_string()),
2036        description,
2037        release_date: None,
2038        parties,
2039        keywords,
2040        homepage_url,
2041        download_url,
2042        size: None,
2043        sha1: None,
2044        md5: None,
2045        sha256: None,
2046        sha512: None,
2047        bug_tracking_url,
2048        code_view_url,
2049        vcs_url,
2050        copyright: None,
2051        holder: None,
2052        declared_license_expression,
2053        declared_license_expression_spdx,
2054        license_detections,
2055        other_license_expression: None,
2056        other_license_expression_spdx: None,
2057        other_license_detections: Vec::new(),
2058        extracted_license_statement,
2059        notice_text: None,
2060        source_packages: Vec::new(),
2061        file_references,
2062        is_private: false,
2063        is_virtual: false,
2064        extra_data,
2065        dependencies,
2066        repository_homepage_url,
2067        repository_download_url,
2068        api_data_url,
2069        datasource_id: Some(datasource_id),
2070        purl,
2071    }
2072}
2073
/// Splits `Project-URL` header values into `(label, url)` pairs.
///
/// Each entry is split at its first comma and both halves are trimmed, so
/// `"Label, https://..."` and `"Label,https://..."` are both accepted. The
/// core metadata format separates label and URL with a comma; the space after
/// it is conventional, not mandatory. Entries without a comma, or with an
/// empty label or URL after trimming, are dropped.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|entry| {
            let (label, url) = entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());
            (!label.is_empty() && !url.is_empty()).then(|| (label.to_string(), url.to_string()))
        })
        .collect()
}
2089
/// Combines a summary and a body into one description.
///
/// Both inputs are trimmed; whichever of them is non-empty afterwards is
/// kept, summary first, joined by a single newline. Returns `None` when
/// neither contributes any text.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let headline = summary.map(str::trim).filter(|text| !text.is_empty());
    let details = Some(body.trim()).filter(|text| !text.is_empty());

    let sections: Vec<&str> = headline.into_iter().chain(details).collect();
    if sections.is_empty() {
        return None;
    }

    Some(sections.join("\n"))
}
2108
/// Separates classifiers into `(keywords, license_classifiers)`.
///
/// A classifier that starts with `License ::` goes into the second list;
/// every other classifier goes into the first. Input order is preserved
/// within each list.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|entry| entry.starts_with("License ::"));

    (keywords, license_classifiers)
}
2123
/// Renders the declared license text and license classifiers as a small
/// newline-terminated text block.
///
/// The output has an optional `license: <text>` line (the text is trimmed and
/// skipped when blank) followed by an optional `classifiers:` section with one
/// quoted, indented list item per classifier. Returns `None` when neither part
/// produces a line.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let license_line = license
        .map(str::trim)
        .filter(|text| !text.is_empty())
        .map(|text| format!("license: {}", text));

    let classifier_section = (!license_classifiers.is_empty()).then(|| {
        std::iter::once("classifiers:".to_string()).chain(
            license_classifiers
                .iter()
                .map(|entry| format!("  - '{}'", entry)),
        )
    });

    let rendered: Vec<String> = license_line
        .into_iter()
        .chain(classifier_section.into_iter().flatten())
        .collect();

    if rendered.is_empty() {
        return None;
    }

    Some(format!("{}\n", rendered.join("\n")))
}
2149
2150pub(crate) fn build_pypi_urls(
2151    name: Option<&str>,
2152    version: Option<&str>,
2153) -> (
2154    Option<String>,
2155    Option<String>,
2156    Option<String>,
2157    Option<String>,
2158) {
2159    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2160
2161    let repository_download_url = name.and_then(|value| {
2162        version.map(|ver| {
2163            format!(
2164                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2165                &value[..1.min(value.len())],
2166                value,
2167                value,
2168                ver
2169            )
2170        })
2171    });
2172
2173    let api_data_url = name.map(|value| {
2174        if let Some(ver) = version {
2175            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2176        } else {
2177            format!("https://pypi.org/pypi/{}/json", value)
2178        }
2179    });
2180
2181    let purl = name.and_then(|value| {
2182        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2183        if let Some(ver) = version {
2184            package_url.with_version(ver).ok()?;
2185        }
2186        Some(package_url.to_string())
2187    });
2188
2189    (
2190        repository_homepage_url,
2191        repository_download_url,
2192        api_data_url,
2193        purl,
2194    )
2195}
2196
2197fn build_pypi_purl_with_extension(
2198    name: &str,
2199    version: Option<&str>,
2200    extension: &str,
2201) -> Option<String> {
2202    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2203    if let Some(ver) = version {
2204        package_url.with_version(ver).ok()?;
2205    }
2206    package_url.add_qualifier("extension", extension).ok()?;
2207    Some(package_url.to_string())
2208}
2209
2210fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2211    let toml_content = match read_toml_file(path) {
2212        Ok(content) => content,
2213        Err(e) => {
2214            warn!(
2215                "Failed to read or parse pyproject.toml at {:?}: {}",
2216                path, e
2217            );
2218            return default_package_data(path);
2219        }
2220    };
2221
2222    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2223    let is_poetry_pyproject = tool_table
2224        .and_then(|tool| tool.get("poetry"))
2225        .and_then(|value| value.as_table())
2226        .is_some();
2227
2228    // Handle both PEP 621 (project table) and poetry formats
2229    let project_table =
2230        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2231            // Standard PEP 621 format with [project] table
2232            project.clone()
2233        } else if let Some(tool) = tool_table {
2234            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2235                // Poetry format with [tool.poetry] table
2236                poetry.clone()
2237            } else {
2238                return default_package_data(path);
2239            }
2240        } else if toml_content.get(FIELD_NAME).is_some() {
2241            // Other format with top-level fields
2242            match toml_content.as_table() {
2243                Some(table) => table.clone(),
2244                None => {
2245                    warn!("Failed to convert TOML content to table in {:?}", path);
2246                    return default_package_data(path);
2247                }
2248            }
2249        } else {
2250            return default_package_data(path);
2251        };
2252
2253    let name = project_table
2254        .get(FIELD_NAME)
2255        .and_then(|v| v.as_str())
2256        .map(|v| truncate_field(v.to_string()));
2257
2258    let version = project_table
2259        .get(FIELD_VERSION)
2260        .and_then(|v| v.as_str())
2261        .map(String::from);
2262    let classifiers = project_table
2263        .get("classifiers")
2264        .and_then(|value| value.as_array())
2265        .map(|values| {
2266            values
2267                .iter()
2268                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2269                .collect::<Vec<_>>()
2270        })
2271        .unwrap_or_default();
2272    let (classifier_keywords, license_classifiers) = split_classifiers(&classifiers);
2273
2274    let extracted_license_statement = extract_raw_license_string(&project_table);
2275    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2276        normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2277
2278    let description = project_table
2279        .get(FIELD_DESCRIPTION)
2280        .and_then(|value| value.as_str())
2281        .map(|value| truncate_field(value.to_string()));
2282    let mut keywords = project_table
2283        .get(FIELD_KEYWORDS)
2284        .and_then(|value| value.as_array())
2285        .map(|values| {
2286            values
2287                .iter()
2288                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2289                .collect::<Vec<_>>()
2290        })
2291        .unwrap_or_default();
2292    for classifier in classifier_keywords {
2293        if !keywords.contains(&classifier) {
2294            keywords.push(classifier);
2295        }
2296    }
2297
2298    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
2299    let mut extra_data = extract_pyproject_extra_data(&toml_content).unwrap_or_default();
2300    let (homepage_url, download_url, bug_tracking_url, code_view_url, repository_url) =
2301        extract_urls(&project_table, &mut extra_data);
2302
2303    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2304
2305    // Create package URL
2306    let purl = name.as_ref().and_then(|n| {
2307        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2308            Ok(p) => p,
2309            Err(e) => {
2310                warn!(
2311                    "Failed to create PackageUrl for Python package '{}': {}",
2312                    n, e
2313                );
2314                return None;
2315            }
2316        };
2317
2318        if let Some(v) = &version
2319            && let Err(e) = package_url.with_version(v)
2320        {
2321            warn!(
2322                "Failed to set version '{}' for Python package '{}': {}",
2323                v, n, e
2324            );
2325            return None;
2326        }
2327
2328        Some(package_url.to_string())
2329    });
2330
2331    let api_data_url = name.as_ref().map(|n| {
2332        if let Some(v) = &version {
2333            format!("https://pypi.org/pypi/{}/{}/json", n, v)
2334        } else {
2335            format!("https://pypi.org/pypi/{}/json", n)
2336        }
2337    });
2338
2339    let pypi_homepage_url = name
2340        .as_ref()
2341        .map(|n| format!("https://pypi.org/project/{}", n));
2342
2343    let pypi_download_url = name.as_ref().and_then(|n| {
2344        version.as_ref().map(|v| {
2345            format!(
2346                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2347                &n[..1.min(n.len())],
2348                n,
2349                n,
2350                v
2351            )
2352        })
2353    });
2354
2355    PackageData {
2356        package_type: Some(PythonParser::PACKAGE_TYPE),
2357        namespace: None,
2358        name,
2359        version,
2360        qualifiers: None,
2361        subpath: None,
2362        primary_language: None,
2363        description,
2364        release_date: None,
2365        parties: extract_parties(&project_table),
2366        keywords,
2367        homepage_url: homepage_url.or(pypi_homepage_url),
2368        download_url: download_url
2369            .or_else(|| repository_url.clone())
2370            .or(pypi_download_url),
2371        size: None,
2372        sha1: None,
2373        md5: None,
2374        sha256: None,
2375        sha512: None,
2376        bug_tracking_url,
2377        code_view_url,
2378        vcs_url: repository_url,
2379        copyright: None,
2380        holder: None,
2381        declared_license_expression,
2382        declared_license_expression_spdx,
2383        license_detections,
2384        other_license_expression: None,
2385        other_license_expression_spdx: None,
2386        other_license_detections: Vec::new(),
2387        extracted_license_statement: extracted_license_statement
2388            .or_else(|| build_extracted_license_statement(None, &license_classifiers)),
2389        notice_text: None,
2390        source_packages: Vec::new(),
2391        file_references: Vec::new(),
2392        is_private: has_private_classifier(&classifiers),
2393        is_virtual: false,
2394        extra_data: if extra_data.is_empty() {
2395            None
2396        } else {
2397            Some(extra_data)
2398        },
2399        dependencies: [dependencies, optional_dependencies].concat(),
2400        repository_homepage_url: None,
2401        repository_download_url: None,
2402        api_data_url,
2403        datasource_id: Some(if is_poetry_pyproject {
2404            DatasourceId::PypiPoetryPyprojectToml
2405        } else {
2406            DatasourceId::PypiPyprojectToml
2407        }),
2408        purl,
2409    }
2410}
2411
2412fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2413    let path_str = path.to_string_lossy().replace('\\', "/");
2414    if path_str.contains("/EGG-INFO/PKG-INFO") {
2415        DatasourceId::PypiEggPkginfo
2416    } else if path_str.ends_with(".egg-info/PKG-INFO") {
2417        DatasourceId::PypiEditableEggPkginfo
2418    } else {
2419        DatasourceId::PypiSdistPkginfo
2420    }
2421}
2422
2423fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2424    project
2425        .get(FIELD_LICENSE)
2426        .and_then(|license_value| match license_value {
2427            TomlValue::String(license_str) => Some(license_str.clone()),
2428            TomlValue::Table(license_table) => license_table
2429                .get("text")
2430                .and_then(|v| v.as_str())
2431                .map(|s| s.to_string())
2432                .or_else(|| {
2433                    license_table
2434                        .get("expression")
2435                        .and_then(|v| v.as_str())
2436                        .map(|expr| expr.to_string())
2437                }),
2438            _ => None,
2439        })
2440}
2441
2442fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2443    match project.get(FIELD_LICENSE) {
2444        Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2445        Some(TomlValue::Table(license_table)) => license_table
2446            .get("expression")
2447            .and_then(|value| value.as_str()),
2448        _ => None,
2449    }
2450}
2451
2452fn extract_urls(
2453    project: &TomlMap<String, TomlValue>,
2454    extra_data: &mut HashMap<String, serde_json::Value>,
2455) -> ProjectUrls {
2456    let mut homepage_url = None;
2457    let mut download_url = None;
2458    let mut bug_tracking_url = None;
2459    let mut code_view_url = None;
2460    let mut repository_url = None;
2461
2462    // Check for URLs table
2463    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2464        let parsed_urls: Vec<(String, String)> = urls
2465            .iter()
2466            .filter_map(|(label, value)| {
2467                value
2468                    .as_str()
2469                    .map(|url| (label.to_string(), url.to_string()))
2470            })
2471            .collect();
2472        apply_project_url_mappings(
2473            &parsed_urls,
2474            &mut homepage_url,
2475            &mut bug_tracking_url,
2476            &mut code_view_url,
2477            &mut repository_url,
2478            extra_data,
2479        );
2480
2481        download_url = urls
2482            .get("Downloads")
2483            .or_else(|| urls.get("downloads"))
2484            .and_then(|v| v.as_str())
2485            .map(String::from);
2486
2487        if homepage_url.is_none() {
2488            homepage_url = urls
2489                .get(FIELD_HOMEPAGE)
2490                .and_then(|v| v.as_str())
2491                .map(String::from);
2492        }
2493        if repository_url.is_none() {
2494            repository_url = urls
2495                .get(FIELD_REPOSITORY)
2496                .and_then(|v| v.as_str())
2497                .map(String::from);
2498        }
2499    }
2500
2501    // If not found in URLs table, check for top-level keys
2502    if homepage_url.is_none() {
2503        homepage_url = project
2504            .get(FIELD_HOMEPAGE)
2505            .and_then(|v| v.as_str())
2506            .map(String::from);
2507    }
2508
2509    if repository_url.is_none() {
2510        repository_url = project
2511            .get(FIELD_REPOSITORY)
2512            .and_then(|v| v.as_str())
2513            .map(String::from);
2514    }
2515
2516    (
2517        homepage_url,
2518        download_url,
2519        bug_tracking_url,
2520        code_view_url,
2521        repository_url,
2522    )
2523}
2524
2525fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2526    let mut parties = Vec::new();
2527
2528    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2529        for author in authors {
2530            if let Some(author_str) = author.as_str() {
2531                let (name, email) = split_name_email(author_str);
2532                parties.push(Party {
2533                    r#type: None,
2534                    role: Some("author".to_string()),
2535                    name,
2536                    email,
2537                    url: None,
2538                    organization: None,
2539                    organization_url: None,
2540                    timezone: None,
2541                });
2542            } else if let Some(author_table) = author.as_table() {
2543                let name = author_table
2544                    .get("name")
2545                    .and_then(|value| value.as_str())
2546                    .map(|value| value.to_string());
2547                let email = author_table
2548                    .get("email")
2549                    .and_then(|value| value.as_str())
2550                    .map(|value| value.to_string());
2551                if name.is_some() || email.is_some() {
2552                    parties.push(Party {
2553                        r#type: None,
2554                        role: Some("author".to_string()),
2555                        name,
2556                        email,
2557                        url: None,
2558                        organization: None,
2559                        organization_url: None,
2560                        timezone: None,
2561                    });
2562                }
2563            }
2564        }
2565    }
2566
2567    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2568        for maintainer in maintainers {
2569            if let Some(maintainer_str) = maintainer.as_str() {
2570                let (name, email) = split_name_email(maintainer_str);
2571                parties.push(Party {
2572                    r#type: None,
2573                    role: Some("maintainer".to_string()),
2574                    name,
2575                    email,
2576                    url: None,
2577                    organization: None,
2578                    organization_url: None,
2579                    timezone: None,
2580                });
2581            } else if let Some(maintainer_table) = maintainer.as_table() {
2582                let name = maintainer_table
2583                    .get("name")
2584                    .and_then(|value| value.as_str())
2585                    .map(|value| value.to_string());
2586                let email = maintainer_table
2587                    .get("email")
2588                    .and_then(|value| value.as_str())
2589                    .map(|value| value.to_string());
2590                if name.is_some() || email.is_some() {
2591                    parties.push(Party {
2592                        r#type: None,
2593                        role: Some("maintainer".to_string()),
2594                        name,
2595                        email,
2596                        url: None,
2597                        organization: None,
2598                        organization_url: None,
2599                        timezone: None,
2600                    });
2601                }
2602            }
2603        }
2604    }
2605
2606    parties
2607}
2608
2609fn extract_dependencies(
2610    project: &TomlMap<String, TomlValue>,
2611    toml_content: &TomlValue,
2612) -> (Vec<Dependency>, Vec<Dependency>) {
2613    let mut dependencies = Vec::new();
2614    let mut optional_dependencies = Vec::new();
2615
2616    // Handle dependencies - can be array or table format
2617    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2618        match deps_value {
2619            TomlValue::Array(arr) => {
2620                dependencies = parse_dependency_array(arr, false, None);
2621            }
2622            TomlValue::Table(table) => {
2623                dependencies = parse_dependency_table(table, false, None);
2624            }
2625            _ => {}
2626        }
2627    }
2628
2629    // Handle PEP 621 optional-dependencies with scope
2630    if let Some(opt_deps_table) = project
2631        .get(FIELD_OPTIONAL_DEPENDENCIES)
2632        .and_then(|v| v.as_table())
2633    {
2634        for (extra_name, deps) in opt_deps_table {
2635            match deps {
2636                TomlValue::Array(arr) => {
2637                    optional_dependencies.extend(parse_dependency_array(
2638                        arr,
2639                        true,
2640                        Some(extra_name),
2641                    ));
2642                }
2643                TomlValue::Table(table) => {
2644                    optional_dependencies.extend(parse_dependency_table(
2645                        table,
2646                        true,
2647                        Some(extra_name),
2648                    ));
2649                }
2650                _ => {}
2651            }
2652        }
2653    }
2654
2655    // Handle Poetry dev-dependencies
2656    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2657        match dev_deps_value {
2658            TomlValue::Array(arr) => {
2659                optional_dependencies.extend(parse_dependency_array(
2660                    arr,
2661                    true,
2662                    Some(FIELD_DEV_DEPENDENCIES),
2663                ));
2664            }
2665            TomlValue::Table(table) => {
2666                optional_dependencies.extend(parse_dependency_table(
2667                    table,
2668                    true,
2669                    Some(FIELD_DEV_DEPENDENCIES),
2670                ));
2671            }
2672            _ => {}
2673        }
2674    }
2675
2676    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
2677    if let Some(groups_table) = toml_content
2678        .get("tool")
2679        .and_then(|value| value.as_table())
2680        .and_then(|tool| tool.get("poetry"))
2681        .and_then(|value| value.as_table())
2682        .and_then(|poetry| poetry.get("group"))
2683        .and_then(|value| value.as_table())
2684    {
2685        for (group_name, group_data) in groups_table {
2686            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2687                match group_deps {
2688                    TomlValue::Array(arr) => {
2689                        optional_dependencies.extend(parse_dependency_array(
2690                            arr,
2691                            true,
2692                            Some(group_name),
2693                        ));
2694                    }
2695                    TomlValue::Table(table) => {
2696                        optional_dependencies.extend(parse_poetry_group_dependency_table(
2697                            table,
2698                            true,
2699                            Some(group_name),
2700                        ));
2701                    }
2702                    _ => {}
2703                }
2704            }
2705        }
2706    }
2707
2708    if let Some(groups_table) = toml_content
2709        .get(FIELD_DEPENDENCY_GROUPS)
2710        .and_then(|value| value.as_table())
2711    {
2712        for (group_name, deps) in groups_table {
2713            match deps {
2714                TomlValue::Array(arr) => {
2715                    optional_dependencies.extend(parse_dependency_array(
2716                        arr,
2717                        true,
2718                        Some(group_name),
2719                    ));
2720                }
2721                TomlValue::Table(table) => {
2722                    optional_dependencies.extend(parse_dependency_table(
2723                        table,
2724                        true,
2725                        Some(group_name),
2726                    ));
2727                }
2728                _ => {}
2729            }
2730        }
2731    }
2732
2733    if let Some(dev_deps_value) = toml_content
2734        .get("tool")
2735        .and_then(|value| value.as_table())
2736        .and_then(|tool| tool.get("uv"))
2737        .and_then(|value| value.as_table())
2738        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2739    {
2740        match dev_deps_value {
2741            TomlValue::Array(arr) => {
2742                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2743            }
2744            TomlValue::Table(table) => {
2745                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2746            }
2747            _ => {}
2748        }
2749    }
2750
2751    (dependencies, optional_dependencies)
2752}
2753
2754fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2755    let mut extra_data = HashMap::new();
2756
2757    if let Some(tool_uv) = toml_content
2758        .get("tool")
2759        .and_then(|value| value.as_table())
2760        .and_then(|tool| tool.get("uv"))
2761    {
2762        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2763    }
2764
2765    if extra_data.is_empty() {
2766        None
2767    } else {
2768        Some(extra_data)
2769    }
2770}
2771
2772fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2773    match value {
2774        TomlValue::String(value) => JsonValue::String(value.clone()),
2775        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2776        TomlValue::Float(value) => JsonValue::String(value.to_string()),
2777        TomlValue::Boolean(value) => JsonValue::Bool(*value),
2778        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2779        TomlValue::Array(values) => {
2780            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2781        }
2782        TomlValue::Table(values) => JsonValue::Object(
2783            values
2784                .iter()
2785                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2786                .collect::<JsonMap<String, JsonValue>>(),
2787        ),
2788    }
2789}
2790
2791fn parse_dependency_table(
2792    table: &TomlMap<String, TomlValue>,
2793    is_optional: bool,
2794    scope: Option<&str>,
2795) -> Vec<Dependency> {
2796    table
2797        .iter()
2798        .filter_map(|(name, version)| {
2799            let version_str = version.as_str().map(|s| s.to_string());
2800            let mut package_url =
2801                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2802
2803            if let Some(v) = &version_str {
2804                package_url.with_version(v).ok()?;
2805            }
2806
2807            Some(Dependency {
2808                purl: Some(package_url.to_string()),
2809                extracted_requirement: None,
2810                scope: scope.map(|s| s.to_string()),
2811                is_runtime: Some(!is_optional),
2812                is_optional: Some(is_optional),
2813                is_pinned: None,
2814                is_direct: Some(true),
2815                resolved_package: None,
2816                extra_data: None,
2817            })
2818        })
2819        .collect()
2820}
2821
2822fn parse_poetry_group_dependency_table(
2823    table: &TomlMap<String, TomlValue>,
2824    is_optional: bool,
2825    scope: Option<&str>,
2826) -> Vec<Dependency> {
2827    table
2828        .iter()
2829        .filter_map(|(name, value)| build_poetry_group_dependency(name, value, is_optional, scope))
2830        .collect()
2831}
2832
2833fn build_poetry_group_dependency(
2834    name: &str,
2835    value: &TomlValue,
2836    is_optional: bool,
2837    scope: Option<&str>,
2838) -> Option<Dependency> {
2839    let normalized_name = normalize_python_dependency_name(name);
2840    let (version_spec, extras, marker) = match value {
2841        TomlValue::String(spec) => (Some(spec.trim().to_string()), Vec::new(), None),
2842        TomlValue::Table(table) => {
2843            let version_spec = table
2844                .get(FIELD_VERSION)
2845                .and_then(|value| value.as_str())
2846                .map(str::trim)
2847                .filter(|value| !value.is_empty())
2848                .map(ToOwned::to_owned);
2849            let extras = table
2850                .get(FIELD_EXTRAS)
2851                .and_then(|value| value.as_array())
2852                .map(|values| {
2853                    values
2854                        .iter()
2855                        .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2856                        .collect::<Vec<_>>()
2857                })
2858                .unwrap_or_default();
2859            let marker = table
2860                .get("markers")
2861                .and_then(|value| value.as_str())
2862                .map(str::trim)
2863                .filter(|value| !value.is_empty())
2864                .map(ToOwned::to_owned);
2865
2866            (version_spec, extras, marker)
2867        }
2868        _ => return None,
2869    };
2870
2871    let pinned_version = version_spec
2872        .as_deref()
2873        .and_then(extract_exact_pinned_version);
2874    let purl = build_python_dependency_purl(&normalized_name, pinned_version.as_deref())?;
2875
2876    let mut extra_data = HashMap::new();
2877    if let Some(marker) = marker {
2878        extra_data.insert("marker".to_string(), JsonValue::String(marker));
2879    }
2880    if !extras.is_empty() {
2881        extra_data.insert(
2882            "extras".to_string(),
2883            JsonValue::Array(extras.into_iter().map(JsonValue::String).collect()),
2884        );
2885    }
2886
2887    Some(Dependency {
2888        purl: Some(purl),
2889        extracted_requirement: version_spec,
2890        scope: scope.map(|value| value.to_string()),
2891        is_runtime: Some(!is_optional),
2892        is_optional: Some(is_optional),
2893        is_pinned: Some(pinned_version.is_some()),
2894        is_direct: Some(true),
2895        resolved_package: None,
2896        extra_data: if extra_data.is_empty() {
2897            None
2898        } else {
2899            Some(extra_data)
2900        },
2901    })
2902}
2903
2904fn parse_dependency_array(
2905    array: &[TomlValue],
2906    is_optional: bool,
2907    scope: Option<&str>,
2908) -> Vec<Dependency> {
2909    array
2910        .iter()
2911        .filter_map(|dep| {
2912            let dep_str = dep.as_str()?;
2913            build_pyproject_array_dependency(dep_str, is_optional, scope)
2914        })
2915        .collect()
2916}
2917
2918fn build_pyproject_array_dependency(
2919    dep_str: &str,
2920    is_optional: bool,
2921    scope: Option<&str>,
2922) -> Option<Dependency> {
2923    let parsed = parse_pep508_requirement(dep_str)?;
2924    let name = normalize_python_package_name(&parsed.name);
2925    let pinned_version = parsed
2926        .specifiers
2927        .as_deref()
2928        .and_then(extract_exact_pinned_version);
2929
2930    let purl = build_python_dependency_purl(&name, pinned_version.as_deref())?;
2931
2932    let mut extra_data = HashMap::new();
2933    if let Some(marker) = parsed.marker {
2934        extra_data.insert("marker".to_string(), JsonValue::String(marker));
2935    }
2936    if !parsed.extras.is_empty() {
2937        extra_data.insert(
2938            "extras".to_string(),
2939            JsonValue::Array(parsed.extras.into_iter().map(JsonValue::String).collect()),
2940        );
2941    }
2942
2943    let extracted_requirement = parsed.specifiers.or(parsed.url);
2944
2945    Some(Dependency {
2946        purl: Some(purl),
2947        extracted_requirement: extracted_requirement.clone(),
2948        scope: scope.map(|s| s.to_string()),
2949        is_runtime: Some(!is_optional),
2950        is_optional: Some(is_optional),
2951        is_pinned: Some(pinned_version.is_some()),
2952        is_direct: Some(true),
2953        resolved_package: None,
2954        extra_data: if extra_data.is_empty() {
2955            None
2956        } else {
2957            Some(extra_data)
2958        },
2959    })
2960}
2961
/// Returns the version a specifier string pins exactly, if any.
///
/// A specifier is an exact pin when it consists of a single `==` or `===`
/// clause with a non-empty version. Compound specifiers (containing `,`) and
/// any other operator yield `None`.
///
/// `==` with a trailing `.*` (for example `==1.4.*`) is PEP 440 prefix
/// matching and selects a whole release series, so it is not a pin. `===`
/// compares the version as an opaque string and is always accepted.
fn extract_exact_pinned_version(specifiers: &str) -> Option<String> {
    let trimmed = specifiers.trim();
    if trimmed.contains(',') {
        return None;
    }

    // `===` must be tested first because `==` is a prefix of it.
    let version = if let Some(rest) = trimmed.strip_prefix("===") {
        rest.trim()
    } else {
        let rest = trimmed.strip_prefix("==")?.trim();
        if rest.ends_with(".*") {
            return None;
        }
        rest
    };

    (!version.is_empty()).then(|| version.to_string())
}
2983
/// A Python literal value as understood by [`LiteralEvaluator`].
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// A numeric literal; integers and floats are both stored as `f64`.
    Number(f64),
    /// `True` or `False`.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list display with every element evaluated.
    List(Vec<Value>),
    /// A tuple display with every element evaluated.
    Tuple(Vec<Value>),
    /// A dictionary whose keys have been converted to strings.
    Dict(HashMap<String, Value>),
}
2994
/// Evaluates literal Python expressions without executing any code.
struct LiteralEvaluator {
    /// Known name-to-value bindings used to resolve `Name` expressions.
    constants: HashMap<String, Value>,
    /// Evaluation stops once the expression depth reaches this limit.
    max_depth: usize,
    /// Evaluation stops once this many expression nodes have been visited.
    max_nodes: usize,
    /// Running count of expression nodes evaluated so far.
    nodes_visited: usize,
}
3001
3002impl LiteralEvaluator {
3003    fn new(constants: HashMap<String, Value>) -> Self {
3004        Self {
3005            constants,
3006            max_depth: MAX_SETUP_PY_AST_DEPTH,
3007            max_nodes: MAX_SETUP_PY_AST_NODES,
3008            nodes_visited: 0,
3009        }
3010    }
3011
3012    fn insert_constant(&mut self, name: String, value: Value) {
3013        self.constants.insert(name, value);
3014    }
3015
3016    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
3017        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
3018            return None;
3019        }
3020        self.nodes_visited += 1;
3021
3022        match expr {
3023            ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
3024                Some(Value::String(value.to_str().to_string()))
3025            }
3026            ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
3027                Some(Value::Bool(*value))
3028            }
3029            ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
3030                self.evaluate_number(value)
3031            }
3032            ast::Expr::NoneLiteral(_) => Some(Value::None),
3033            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
3034            ast::Expr::List(ast::ExprList { elts, .. }) => {
3035                let mut values = Vec::new();
3036                for elt in elts {
3037                    values.push(self.evaluate_expr(elt, depth + 1)?);
3038                }
3039                Some(Value::List(values))
3040            }
3041            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
3042                let mut values = Vec::new();
3043                for elt in elts {
3044                    values.push(self.evaluate_expr(elt, depth + 1)?);
3045                }
3046                Some(Value::Tuple(values))
3047            }
3048            ast::Expr::Dict(ast::ExprDict { items, .. }) => {
3049                let mut dict = HashMap::new();
3050                for item in items {
3051                    let key_expr = item.key.as_ref()?;
3052                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
3053                    let key = value_to_string(&key_value)?;
3054                    let value = self.evaluate_expr(&item.value, depth + 1)?;
3055                    dict.insert(key, value);
3056                }
3057                Some(Value::Dict(dict))
3058            }
3059            ast::Expr::Call(ast::ExprCall {
3060                func, arguments, ..
3061            }) => {
3062                let args = arguments.args.as_ref();
3063                let keywords = arguments.keywords.as_ref();
3064                if keywords.is_empty()
3065                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
3066                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
3067                {
3068                    return self.evaluate_ordered_dict(args, depth + 1);
3069                }
3070
3071                if !args.is_empty() {
3072                    return None;
3073                }
3074
3075                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
3076                    && id == "dict"
3077                {
3078                    let mut dict = HashMap::new();
3079                    for keyword in keywords {
3080                        let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
3081                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
3082                        dict.insert(key.to_string(), value);
3083                    }
3084                    return Some(Value::Dict(dict));
3085                }
3086
3087                None
3088            }
3089            _ => None,
3090        }
3091    }
3092
3093    fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
3094        match number {
3095            ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
3096            ast::Number::Float(value) => Some(Value::Number(*value)),
3097            ast::Number::Complex { .. } => None,
3098        }
3099    }
3100
3101    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
3102        if args.len() != 1 {
3103            return None;
3104        }
3105
3106        let items = match self.evaluate_expr(&args[0], depth)? {
3107            Value::List(items) | Value::Tuple(items) => items,
3108            _ => return None,
3109        };
3110
3111        let mut dict = HashMap::new();
3112        for item in items {
3113            let Value::Tuple(values) = item else {
3114                return None;
3115            };
3116            if values.len() != 2 {
3117                return None;
3118            }
3119            let key = value_to_string(&values[0])?;
3120            dict.insert(key, values[1].clone());
3121        }
3122
3123        Some(Value::Dict(dict))
3124    }
3125}
3126
/// Names under which `setup` and its providing modules are reachable in a
/// setup.py file.
#[derive(Default)]
struct SetupAliases {
    /// Local names bound to the `setup` function (always includes `setup`).
    setup_names: HashSet<String>,
    /// Local module alias mapped to the imported setup module name.
    module_aliases: HashMap<String, String>,
}
3132
3133fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
3134    extract_from_setup_py(path).into_iter().collect()
3135}
3136
3137fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
3138    let content = match read_file_to_string(path, None) {
3139        Ok(content) => content,
3140        Err(e) => {
3141            warn!("Failed to read setup.py at {:?}: {}", path, e);
3142            return Some(default_package_data(path));
3143        }
3144    };
3145
3146    if content.len() > MAX_SETUP_PY_BYTES {
3147        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
3148        let package_data = extract_from_setup_py_regex(&content);
3149        return should_emit_setup_py_package(&package_data).then_some(package_data);
3150    }
3151
3152    let mut package_data = match extract_from_setup_py_ast(&content) {
3153        Ok(Some(data)) => data,
3154        Ok(None) => return Some(default_package_data(path)),
3155        Err(e) => {
3156            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
3157            extract_from_setup_py_regex(&content)
3158        }
3159    };
3160
3161    if package_data.name.is_none() {
3162        package_data.name = extract_setup_value(&content, "name");
3163    }
3164
3165    if package_data.version.is_none() {
3166        package_data.version = extract_setup_value(&content, "version");
3167    }
3168
3169    if package_data
3170        .version
3171        .as_deref()
3172        .is_some_and(|version| version.trim().is_empty())
3173    {
3174        package_data.version = None;
3175    }
3176
3177    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
3178    package_data.purl = build_setup_py_purl(
3179        package_data.name.as_deref(),
3180        package_data.version.as_deref(),
3181    );
3182
3183    if should_emit_setup_py_package(&package_data) {
3184        Some(package_data)
3185    } else {
3186        Some(default_package_data(path))
3187    }
3188}
3189
3190fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
3191    package_data.name.is_some()
3192        || package_data.version.is_some()
3193        || package_data.purl.is_some()
3194        || !package_data.dependencies.is_empty()
3195        || package_data.extracted_license_statement.is_some()
3196        || !package_data.license_detections.is_empty()
3197        || !package_data.parties.is_empty()
3198        || package_data.description.is_some()
3199        || package_data.homepage_url.is_some()
3200        || package_data.bug_tracking_url.is_some()
3201        || package_data.code_view_url.is_some()
3202        || package_data.vcs_url.is_some()
3203}
3204
3205fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
3206    if package_data.version.is_some()
3207        && package_data.extracted_license_statement.is_some()
3208        && package_data
3209            .parties
3210            .iter()
3211            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
3212    {
3213        return;
3214    }
3215
3216    let Some(root) = path.parent() else {
3217        return;
3218    };
3219
3220    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
3221
3222    if package_data.version.is_none() {
3223        package_data.version = dunder_metadata.version;
3224    }
3225
3226    if package_data.extracted_license_statement.is_none() {
3227        package_data.extracted_license_statement = dunder_metadata.license;
3228    }
3229
3230    let has_author = package_data
3231        .parties
3232        .iter()
3233        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
3234
3235    if !has_author && let Some(author) = dunder_metadata.author {
3236        package_data.parties.push(Party {
3237            r#type: Some("person".to_string()),
3238            role: Some("author".to_string()),
3239            name: Some(author),
3240            email: None,
3241            url: None,
3242            organization: None,
3243            organization_url: None,
3244            timezone: None,
3245        });
3246    }
3247}
3248
/// Metadata recovered from dunder assignments in modules next to setup.py.
#[derive(Default)]
struct DunderMetadata {
    /// Value of the first `__version__` string assignment found.
    version: Option<String>,
    /// Value of the first `__author__` string assignment found.
    author: Option<String>,
    /// Value of the first `__license__` string assignment found.
    license: Option<String>,
}
3255
3256fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
3257    let statements = match parse_module(content) {
3258        Ok(parsed) => parsed.into_suite(),
3259        Err(_) => return DunderMetadata::default(),
3260    };
3261
3262    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3263    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3264    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3265    let mut metadata = DunderMetadata::default();
3266    let mut candidate_paths = Vec::new();
3267
3268    for module in imported_dunder_modules(&statements) {
3269        let Some(path) = resolve_imported_module_path(root, &module) else {
3270            continue;
3271        };
3272
3273        candidate_paths.push(path);
3274    }
3275
3276    candidate_paths.extend(referenced_dunder_init_paths(root, content));
3277
3278    let mut seen_paths = HashSet::new();
3279    for path in candidate_paths {
3280        if !seen_paths.insert(path.clone()) {
3281            continue;
3282        }
3283
3284        let Ok(module_content) = read_file_to_string(&path, None) else {
3285            continue;
3286        };
3287
3288        if metadata.version.is_none() {
3289            metadata.version = version_re
3290                .as_ref()
3291                .and_then(|regex| regex.captures(&module_content))
3292                .and_then(|captures| captures.get(1))
3293                .map(|match_| match_.as_str().to_string());
3294        }
3295
3296        if metadata.author.is_none() {
3297            metadata.author = author_re
3298                .as_ref()
3299                .and_then(|regex| regex.captures(&module_content))
3300                .and_then(|captures| captures.get(1))
3301                .map(|match_| match_.as_str().to_string());
3302        }
3303
3304        if metadata.license.is_none() {
3305            metadata.license = license_re
3306                .as_ref()
3307                .and_then(|regex| regex.captures(&module_content))
3308                .and_then(|captures| captures.get(1))
3309                .map(|match_| match_.as_str().to_string());
3310        }
3311
3312        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
3313            return metadata;
3314        }
3315    }
3316
3317    metadata
3318}
3319
3320fn referenced_dunder_init_paths(root: &Path, content: &str) -> Vec<PathBuf> {
3321    let open_re = match Regex::new(r#"open\(\s*['\"]([^'\"]+__init__\.py)['\"]"#) {
3322        Ok(regex) => regex,
3323        Err(_) => return Vec::new(),
3324    };
3325
3326    open_re
3327        .captures_iter(content)
3328        .filter_map(|captures| captures.get(1).map(|m| m.as_str()))
3329        .filter_map(|relative| {
3330            let relative_path = PathBuf::from(relative);
3331            if relative_path.is_absolute()
3332                || relative_path.components().any(|component| {
3333                    matches!(
3334                        component,
3335                        Component::ParentDir | Component::RootDir | Component::Prefix(_)
3336                    )
3337                })
3338            {
3339                return None;
3340            }
3341
3342            let candidate = root.join(relative_path);
3343            candidate.exists().then_some(candidate)
3344        })
3345        .collect()
3346}
3347
3348fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
3349    let mut modules = Vec::new();
3350
3351    for statement in statements {
3352        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
3353            continue;
3354        };
3355        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
3356            continue;
3357        };
3358        let imports_dunder = names.iter().any(|alias| {
3359            matches!(
3360                alias.name.as_str(),
3361                "__version__" | "__author__" | "__license__"
3362            )
3363        });
3364        if imports_dunder {
3365            modules.push(module.to_string());
3366        }
3367    }
3368
3369    modules
3370}
3371
/// Resolves a dotted module name to an existing file below `root`.
///
/// Candidates are tried in this order: `<root>/<module>.py`,
/// `<root>/<module>/__init__.py`, then the same two below `<root>/src`.
/// Returns the first candidate that exists.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let relative: PathBuf = module.split('.').collect();
    let module_file = relative.with_extension("py");
    let package_init = relative.join("__init__.py");
    let src_root = root.join("src");

    [
        root.join(&module_file),
        root.join(&package_init),
        src_root.join(&module_file),
        src_root.join(&package_init),
    ]
    .into_iter()
    .find(|candidate| candidate.exists())
}
3383
3384/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
3385///
3386/// # Security Model
3387///
3388/// This function parses setup.py as a Python AST and evaluates only literal values
3389/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
3390/// arbitrary code execution during scanning.
3391///
3392/// # DoS Prevention
3393///
3394/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
3395/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
3396/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
3397///
3398/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
3399fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
3400    let statements = parse_module(content)
3401        .map(|parsed| parsed.into_suite())
3402        .map_err(|e| e.to_string())?;
3403    let aliases = collect_setup_aliases(&statements);
3404    let mut evaluator = LiteralEvaluator::new(HashMap::new());
3405    build_setup_py_constants(&statements, &mut evaluator);
3406
3407    let setup_call = find_setup_call(&statements, &aliases);
3408    let Some(call_expr) = setup_call else {
3409        return Ok(None);
3410    };
3411
3412    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3413    Ok(Some(build_setup_py_package_data(&setup_values)))
3414}
3415
3416fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3417    for stmt in statements {
3418        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3419            if targets.len() != 1 {
3420                continue;
3421            }
3422
3423            let Some(name) = extract_assign_name(&targets[0]) else {
3424                continue;
3425            };
3426
3427            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3428                evaluator.insert_constant(name, value);
3429            }
3430        }
3431    }
3432}
3433
3434fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3435    match target {
3436        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3437        _ => None,
3438    }
3439}
3440
3441fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3442    let mut aliases = SetupAliases::default();
3443    aliases.setup_names.insert("setup".to_string());
3444
3445    for stmt in statements {
3446        match stmt {
3447            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3448                for alias in names {
3449                    let module_name = alias.name.as_str();
3450                    if !is_setup_module(module_name) {
3451                        continue;
3452                    }
3453                    let alias_name = alias
3454                        .asname
3455                        .as_ref()
3456                        .map(|name| name.as_str())
3457                        .unwrap_or(module_name);
3458                    aliases
3459                        .module_aliases
3460                        .insert(alias_name.to_string(), module_name.to_string());
3461                }
3462            }
3463            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3464                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3465                    continue;
3466                };
3467                if !is_setup_module(module_name) {
3468                    continue;
3469                }
3470                for alias in names {
3471                    if alias.name.as_str() != "setup" {
3472                        continue;
3473                    }
3474                    let alias_name = alias
3475                        .asname
3476                        .as_ref()
3477                        .map(|name| name.as_str())
3478                        .unwrap_or("setup");
3479                    aliases.setup_names.insert(alias_name.to_string());
3480                }
3481            }
3482            _ => {}
3483        }
3484    }
3485
3486    aliases
3487}
3488
/// Reports whether `module_name` is one of the modules that provide `setup`.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3492
3493fn find_setup_call<'a>(
3494    statements: &'a [ast::Stmt],
3495    aliases: &'a SetupAliases,
3496) -> Option<&'a ast::Expr> {
3497    let mut finder = SetupCallFinder {
3498        aliases,
3499        called_function_names: collect_top_level_called_function_names(statements),
3500        nodes_visited: 0,
3501    };
3502    finder.find_in_statements(statements)
3503}
3504
3505fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3506    let mut called = HashSet::new();
3507    collect_called_function_names_in_statements(statements, &mut called);
3508    called
3509}
3510
3511fn collect_called_function_names_in_statements(
3512    statements: &[ast::Stmt],
3513    called: &mut HashSet<String>,
3514) {
3515    for stmt in statements {
3516        match stmt {
3517            ast::Stmt::Expr(ast::StmtExpr { value, .. })
3518            | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3519                collect_called_function_names_in_expr(value.as_ref(), called);
3520            }
3521            ast::Stmt::If(ast::StmtIf {
3522                body,
3523                elif_else_clauses,
3524                ..
3525            }) => {
3526                collect_called_function_names_in_statements(body, called);
3527                for clause in elif_else_clauses {
3528                    collect_called_function_names_in_statements(&clause.body, called);
3529                }
3530            }
3531            ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3532            | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3533                collect_called_function_names_in_statements(body, called);
3534                collect_called_function_names_in_statements(orelse, called);
3535            }
3536            ast::Stmt::With(ast::StmtWith { body, .. }) => {
3537                collect_called_function_names_in_statements(body, called);
3538            }
3539            ast::Stmt::Try(ast::StmtTry {
3540                body,
3541                orelse,
3542                finalbody,
3543                handlers,
3544                ..
3545            }) => {
3546                collect_called_function_names_in_statements(body, called);
3547                collect_called_function_names_in_statements(orelse, called);
3548                collect_called_function_names_in_statements(finalbody, called);
3549                for handler in handlers {
3550                    let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3551                        body,
3552                        ..
3553                    }) = handler;
3554                    collect_called_function_names_in_statements(body, called);
3555                }
3556            }
3557            _ => {}
3558        }
3559    }
3560}
3561
3562fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3563    if let ast::Expr::Call(ast::ExprCall {
3564        func, arguments, ..
3565    }) = expr
3566    {
3567        if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3568            called.insert(id.as_str().to_string());
3569        }
3570
3571        for arg in arguments.args.iter() {
3572            collect_called_function_names_in_expr(arg, called);
3573        }
3574        for keyword in arguments.keywords.iter() {
3575            collect_called_function_names_in_expr(&keyword.value, called);
3576        }
3577    }
3578}
3579
/// State for the search that locates a `setup(...)` call in a parsed `setup.py`.
struct SetupCallFinder<'a> {
    /// Alias information passed to `is_setup_call` when classifying a call expression.
    aliases: &'a SetupAliases,
    /// Function names considered "called"; only function definitions whose name
    /// is in this set have their bodies searched.
    called_function_names: HashSet<String>,
    /// Number of statements and expressions visited so far, checked against
    /// `MAX_SETUP_PY_AST_NODES` to bound the search.
    nodes_visited: usize,
}
3585
3586impl<'a> SetupCallFinder<'a> {
3587    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3588        for stmt in statements {
3589            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3590                return None;
3591            }
3592            self.nodes_visited += 1;
3593
3594            let found = match stmt {
3595                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3596                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3597                ast::Stmt::If(ast::StmtIf {
3598                    body,
3599                    elif_else_clauses,
3600                    ..
3601                }) => self.find_in_statements(body).or_else(|| {
3602                    for clause in elif_else_clauses {
3603                        if let Some(found) = self.find_in_statements(&clause.body) {
3604                            return Some(found);
3605                        }
3606                    }
3607                    None
3608                }),
3609                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3610                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3611                    .find_in_statements(body)
3612                    .or_else(|| self.find_in_statements(orelse)),
3613                ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3614                    .called_function_names
3615                    .contains(name.as_str())
3616                    .then(|| self.find_in_statements(body))
3617                    .flatten(),
3618                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3619                ast::Stmt::Try(ast::StmtTry {
3620                    body,
3621                    orelse,
3622                    finalbody,
3623                    handlers,
3624                    ..
3625                }) => self
3626                    .find_in_statements(body)
3627                    .or_else(|| self.find_in_statements(orelse))
3628                    .or_else(|| self.find_in_statements(finalbody))
3629                    .or_else(|| {
3630                        for handler in handlers {
3631                            let ast::ExceptHandler::ExceptHandler(
3632                                ast::ExceptHandlerExceptHandler { body, .. },
3633                            ) = handler;
3634                            if let Some(found) = self.find_in_statements(body) {
3635                                return Some(found);
3636                            }
3637                        }
3638                        None
3639                    }),
3640                _ => None,
3641            };
3642
3643            if found.is_some() {
3644                return found;
3645            }
3646        }
3647
3648        None
3649    }
3650
3651    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3652        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3653            return None;
3654        }
3655        self.nodes_visited += 1;
3656
3657        match expr {
3658            ast::Expr::Call(ast::ExprCall { func, .. })
3659                if is_setup_call(func.as_ref(), self.aliases) =>
3660            {
3661                Some(expr)
3662            }
3663            _ => None,
3664        }
3665    }
3666}
3667
3668fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3669    let Some(dotted) = dotted_name(func, 0) else {
3670        return false;
3671    };
3672
3673    if aliases.setup_names.contains(&dotted) {
3674        return true;
3675    }
3676
3677    let Some(module) = dotted.strip_suffix(".setup") else {
3678        return false;
3679    };
3680
3681    let resolved = resolve_module_alias(module, aliases);
3682    is_setup_module(&resolved)
3683}
3684
3685fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3686    if depth >= MAX_SETUP_PY_AST_DEPTH {
3687        return None;
3688    }
3689
3690    match expr {
3691        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3692        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3693            let base = dotted_name(value.as_ref(), depth + 1)?;
3694            Some(format!("{}.{}", base, attr.as_str()))
3695        }
3696        _ => None,
3697    }
3698}
3699
3700fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3701    if let Some(mapped) = aliases.module_aliases.get(module) {
3702        return mapped.clone();
3703    }
3704
3705    let Some((base, rest)) = module.split_once('.') else {
3706        return module.to_string();
3707    };
3708
3709    if let Some(mapped) = aliases.module_aliases.get(base) {
3710        return format!("{}.{}", mapped, rest);
3711    }
3712
3713    module.to_string()
3714}
3715
3716fn extract_setup_keywords(
3717    call_expr: &ast::Expr,
3718    evaluator: &mut LiteralEvaluator,
3719) -> HashMap<String, Value> {
3720    let mut values = HashMap::new();
3721    let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3722        return values;
3723    };
3724
3725    for keyword in arguments.keywords.iter() {
3726        if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3727            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3728                values.insert(arg.to_string(), value);
3729            }
3730        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3731            for (key, value) in dict {
3732                values.insert(key, value);
3733            }
3734        }
3735    }
3736
3737    values
3738}
3739
3740fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3741    let name = get_value_string(values, "name").map(truncate_field);
3742    let version = get_value_string(values, "version").map(truncate_field);
3743    let description = get_value_string(values, "description")
3744        .or_else(|| get_value_string(values, "summary"))
3745        .map(truncate_field);
3746    let homepage_url = get_value_string(values, "url")
3747        .or_else(|| get_value_string(values, "home_page"))
3748        .map(truncate_field);
3749    let author = get_value_string(values, "author").map(truncate_field);
3750    let author_email = get_value_string(values, "author_email");
3751    let maintainer = get_value_string(values, "maintainer").map(truncate_field);
3752    let maintainer_email = get_value_string(values, "maintainer_email");
3753    let license = get_value_string(values, "license").map(truncate_field);
3754    let classifiers = values
3755        .get("classifiers")
3756        .and_then(value_to_string_list)
3757        .unwrap_or_default();
3758
3759    let mut parties = Vec::new();
3760    if author.is_some() || author_email.is_some() {
3761        parties.push(Party {
3762            r#type: Some("person".to_string()),
3763            role: Some("author".to_string()),
3764            name: author,
3765            email: author_email,
3766            url: None,
3767            organization: None,
3768            organization_url: None,
3769            timezone: None,
3770        });
3771    }
3772
3773    if maintainer.is_some() || maintainer_email.is_some() {
3774        parties.push(Party {
3775            r#type: Some("person".to_string()),
3776            role: Some("maintainer".to_string()),
3777            name: maintainer,
3778            email: maintainer_email,
3779            url: None,
3780            organization: None,
3781            organization_url: None,
3782            timezone: None,
3783        });
3784    }
3785
3786    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3787        normalize_spdx_declared_license(license.as_deref());
3788    let extracted_license_statement = license.clone();
3789
3790    let dependencies = build_setup_py_dependencies(values);
3791    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3792    let mut homepage_from_project_urls = None;
3793    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3794    let mut extra_data = HashMap::new();
3795
3796    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3797        apply_project_url_mappings(
3798            &parsed_project_urls,
3799            &mut homepage_from_project_urls,
3800            &mut bug_tracking_url,
3801            &mut code_view_url,
3802            &mut vcs_url,
3803            &mut extra_data,
3804        );
3805    }
3806
3807    let extra_data = if extra_data.is_empty() {
3808        None
3809    } else {
3810        Some(extra_data)
3811    };
3812
3813    PackageData {
3814        package_type: Some(PythonParser::PACKAGE_TYPE),
3815        namespace: None,
3816        name,
3817        version,
3818        qualifiers: None,
3819        subpath: None,
3820        primary_language: Some("Python".to_string()),
3821        description,
3822        release_date: None,
3823        parties,
3824        keywords: Vec::new(),
3825        homepage_url: homepage_url.or(homepage_from_project_urls),
3826        download_url: None,
3827        size: None,
3828        sha1: None,
3829        md5: None,
3830        sha256: None,
3831        sha512: None,
3832        bug_tracking_url,
3833        code_view_url,
3834        vcs_url,
3835        copyright: None,
3836        holder: None,
3837        declared_license_expression,
3838        declared_license_expression_spdx,
3839        license_detections,
3840        other_license_expression: None,
3841        other_license_expression_spdx: None,
3842        other_license_detections: Vec::new(),
3843        extracted_license_statement,
3844        notice_text: None,
3845        source_packages: Vec::new(),
3846        file_references: Vec::new(),
3847        is_private: has_private_classifier(&classifiers),
3848        is_virtual: false,
3849        extra_data,
3850        dependencies,
3851        repository_homepage_url: None,
3852        repository_download_url: None,
3853        api_data_url: None,
3854        datasource_id: Some(DatasourceId::PypiSetupPy),
3855        purl,
3856    }
3857}
3858
3859fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3860    let mut dependencies = Vec::new();
3861
3862    if let Some(reqs) = values
3863        .get("install_requires")
3864        .and_then(value_to_string_list)
3865    {
3866        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3867    }
3868
3869    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3870        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3871    }
3872
3873    if let Some(Value::Dict(extras)) = values.get("extras_require") {
3874        let mut extra_items: Vec<_> = extras.iter().collect();
3875        extra_items.sort_by_key(|(name, _)| *name);
3876        for (extra_name, extra_value) in extra_items {
3877            if let Some(reqs) = value_to_string_list(extra_value) {
3878                dependencies.extend(build_setup_py_dependency_list(
3879                    reqs.as_slice(),
3880                    extra_name,
3881                    true,
3882                ));
3883            }
3884        }
3885    }
3886
3887    dependencies
3888}
3889
3890fn build_setup_py_dependency_list(
3891    reqs: &[String],
3892    scope: &str,
3893    is_optional: bool,
3894) -> Vec<Dependency> {
3895    reqs.iter()
3896        .filter_map(|req| build_python_dependency(req, scope, is_optional, None))
3897        .collect()
3898}
3899
3900fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3901    values.get(key).and_then(value_to_string)
3902}
3903
3904fn value_to_string(value: &Value) -> Option<String> {
3905    match value {
3906        Value::String(value) => Some(value.clone()),
3907        Value::Number(value) => Some(value.to_string()),
3908        Value::Bool(value) => Some(value.to_string()),
3909        _ => None,
3910    }
3911}
3912
3913fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3914    match value {
3915        Value::String(value) => Some(vec![value.clone()]),
3916        Value::List(values) | Value::Tuple(values) => {
3917            let mut items = Vec::new();
3918            for item in values {
3919                items.push(value_to_string(item)?);
3920            }
3921            Some(items)
3922        }
3923        _ => None,
3924    }
3925}
3926
3927fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3928    let Value::Dict(dict) = value else {
3929        return None;
3930    };
3931
3932    let mut pairs: Vec<(String, String)> = dict
3933        .iter()
3934        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3935        .collect::<Option<Vec<_>>>()?;
3936    pairs.sort_by(|left, right| left.0.cmp(&right.0));
3937    Some(pairs)
3938}
3939
3940fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3941    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3942    extract_requires_dist_dependencies(&requires_dist)
3943}
3944
3945pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3946    requires_dist
3947        .iter()
3948        .filter_map(|entry| build_rfc822_dependency(entry))
3949        .collect()
3950}
3951
/// Builds a dependency from a single requirement entry using the defaults for
/// metadata-declared requirements: scope `install`, not optional, and no
/// marker override.
fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
    build_python_dependency(entry, "install", false, None)
}
3955
3956fn build_python_dependency(
3957    entry: &str,
3958    default_scope: &str,
3959    default_optional: bool,
3960    marker_override: Option<&str>,
3961) -> Option<Dependency> {
3962    let (requirement_part, marker_part) = entry
3963        .split_once(';')
3964        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3965        .unwrap_or((entry.trim(), None));
3966
3967    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3968    let requirement = normalize_rfc822_requirement(requirement_part);
3969    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3970        marker_part.or(marker_override),
3971        default_scope,
3972        default_optional,
3973    );
3974    let purl = build_python_dependency_purl(&name, None)?;
3975
3976    let is_pinned = requirement
3977        .as_deref()
3978        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3979    let purl = if is_pinned {
3980        requirement
3981            .as_deref()
3982            .map(|req| req.trim_start_matches('='))
3983            .and_then(|version| build_python_dependency_purl(&name, Some(version)))
3984            .unwrap_or(purl)
3985    } else {
3986        purl
3987    };
3988
3989    let mut extra_data = HashMap::new();
3990    extra_data.extend(marker_data);
3991    if let Some(marker) = marker {
3992        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3993    }
3994
3995    Some(Dependency {
3996        purl: Some(purl),
3997        extracted_requirement: requirement,
3998        scope: Some(scope),
3999        is_runtime: Some(true),
4000        is_optional: Some(is_optional),
4001        is_pinned: Some(is_pinned),
4002        is_direct: Some(true),
4003        resolved_package: None,
4004        extra_data: if extra_data.is_empty() {
4005            None
4006        } else {
4007            Some(extra_data)
4008        },
4009    })
4010}
4011
4012fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
4013    let name = extract_setup_cfg_dependency_name(requirement_part)?;
4014    let trimmed = requirement_part.trim();
4015    let mut remainder = trimmed[name.len()..].trim();
4016
4017    if let Some(stripped) = remainder.strip_prefix('[')
4018        && let Some(end_idx) = stripped.find(']')
4019    {
4020        remainder = stripped[end_idx + 1..].trim();
4021    }
4022
4023    let remainder = remainder
4024        .strip_prefix('(')
4025        .and_then(|value| value.strip_suffix(')'))
4026        .unwrap_or(remainder)
4027        .trim();
4028
4029    if remainder.is_empty() {
4030        return None;
4031    }
4032
4033    let mut specifiers: Vec<String> = remainder
4034        .split(',')
4035        .map(|specifier| specifier.trim().replace(' ', ""))
4036        .filter(|specifier| !specifier.is_empty())
4037        .collect();
4038    specifiers.sort();
4039    Some(specifiers.join(","))
4040}
4041
/// Percent-encodes every `*` in `version` as `%2A`; all other characters are
/// left untouched.
fn encode_python_dependency_purl_version(version: &str) -> String {
    let pieces: Vec<&str> = version.split('*').collect();
    pieces.join("%2A")
}
4045
4046fn build_python_dependency_purl(name: &str, version: Option<&str>) -> Option<String> {
4047    let normalized_name = normalize_python_dependency_name(name);
4048
4049    PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &normalized_name)
4050        .ok()
4051        .map(|_| match version {
4052            Some(version) => {
4053                format!(
4054                    "pkg:pypi/{normalized_name}@{}",
4055                    encode_python_dependency_purl_version(version)
4056                )
4057            }
4058            None => format!("pkg:pypi/{normalized_name}"),
4059        })
4060}
4061
/// Normalizes a package name for use in a purl: surrounding whitespace is
/// trimmed, ASCII letters are lowercased, and `_` becomes `-`.
fn normalize_python_dependency_name(name: &str) -> String {
    name.trim()
        .chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
4065
4066fn parse_rfc822_marker(
4067    marker_part: Option<&str>,
4068    default_scope: &str,
4069    default_optional: bool,
4070) -> (
4071    String,
4072    bool,
4073    Option<String>,
4074    HashMap<String, serde_json::Value>,
4075) {
4076    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
4077        return (
4078            default_scope.to_string(),
4079            default_optional,
4080            None,
4081            HashMap::new(),
4082        );
4083    };
4084
4085    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
4086        .expect("extra marker regex should compile");
4087    let mut extra_data = HashMap::new();
4088
4089    if let Some(python_version) = extract_marker_field(marker, "python_version") {
4090        extra_data.insert(
4091            "python_version".to_string(),
4092            serde_json::Value::String(python_version),
4093        );
4094    }
4095    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
4096        extra_data.insert(
4097            "sys_platform".to_string(),
4098            serde_json::Value::String(sys_platform),
4099        );
4100    }
4101
4102    if let Some(captures) = extra_re.captures(marker)
4103        && let Some(scope) = captures.get(1)
4104    {
4105        return (
4106            scope.as_str().to_string(),
4107            true,
4108            Some(marker.trim().to_string()),
4109            extra_data,
4110        );
4111    }
4112
4113    (
4114        default_scope.to_string(),
4115        default_optional,
4116        Some(marker.trim().to_string()),
4117        extra_data,
4118    )
4119}
4120
4121fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
4122    let re = Regex::new(&format!(
4123        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
4124        field
4125    ))
4126    .ok()?;
4127    let captures = re.captures(marker)?;
4128    let operator = captures.get(1)?.as_str();
4129    let value = captures.get(2)?.as_str();
4130    Some(format!("{} {}", operator, value))
4131}
4132
4133fn parse_requires_txt(content: &str) -> Vec<Dependency> {
4134    let mut dependencies = Vec::new();
4135    let mut current_scope = "install".to_string();
4136    let mut current_optional = false;
4137    let mut current_marker: Option<String> = None;
4138    let mut line_count = 0usize;
4139
4140    for line in content.lines() {
4141        line_count += 1;
4142        if line_count > MAX_ITERATION_COUNT {
4143            warn!(
4144                "Exceeded max line count in requires.txt; stopping at {} lines",
4145                MAX_ITERATION_COUNT
4146            );
4147            break;
4148        }
4149        let trimmed = line.trim();
4150        if trimmed.is_empty() || trimmed.starts_with('#') {
4151            continue;
4152        }
4153
4154        if trimmed.starts_with('[') && trimmed.ends_with(']') {
4155            let inner = &trimmed[1..trimmed.len() - 1];
4156            if let Some(rest) = inner.strip_prefix(':') {
4157                current_scope = "install".to_string();
4158                current_optional = false;
4159                current_marker = Some(rest.trim().to_string());
4160            } else if let Some((scope, marker)) = inner.split_once(':') {
4161                current_scope = scope.trim().to_string();
4162                current_optional = true;
4163                current_marker = Some(marker.trim().to_string());
4164            } else {
4165                current_scope = inner.trim().to_string();
4166                current_optional = true;
4167                current_marker = None;
4168            }
4169            continue;
4170        }
4171
4172        if let Some(dependency) = build_python_dependency(
4173            trimmed,
4174            &current_scope,
4175            current_optional,
4176            current_marker.as_deref(),
4177        ) {
4178            dependencies.push(dependency);
4179        }
4180    }
4181
4182    dependencies
4183}
4184
/// Reports whether any classifier equals `Private :: Do Not Upload`, compared
/// ASCII case-insensitively.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
4190
4191fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
4192    let name = name?;
4193    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
4194    if let Some(version) = version {
4195        package_url.with_version(version).ok()?;
4196    }
4197    Some(package_url.to_string())
4198}
4199
4200fn extract_from_setup_py_regex(content: &str) -> PackageData {
4201    let name = extract_setup_value(content, "name").map(truncate_field);
4202    let version = extract_setup_value(content, "version").map(truncate_field);
4203    let license_expression = extract_setup_value(content, "license").map(truncate_field);
4204
4205    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4206        normalize_spdx_declared_license(license_expression.as_deref());
4207    let extracted_license_statement = license_expression.clone();
4208
4209    let dependencies = extract_setup_py_dependencies(content);
4210    let homepage_url = extract_setup_value(content, "url").map(truncate_field);
4211    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
4212
4213    PackageData {
4214        package_type: Some(PythonParser::PACKAGE_TYPE),
4215        namespace: None,
4216        name,
4217        version,
4218        qualifiers: None,
4219        subpath: None,
4220        primary_language: Some("Python".to_string()),
4221        description: None,
4222        release_date: None,
4223        parties: Vec::new(),
4224        keywords: Vec::new(),
4225        homepage_url,
4226        download_url: None,
4227        size: None,
4228        sha1: None,
4229        md5: None,
4230        sha256: None,
4231        sha512: None,
4232        bug_tracking_url: None,
4233        code_view_url: None,
4234        vcs_url: None,
4235        copyright: None,
4236        holder: None,
4237        declared_license_expression,
4238        declared_license_expression_spdx,
4239        license_detections,
4240        other_license_expression: None,
4241        other_license_expression_spdx: None,
4242        other_license_detections: Vec::new(),
4243        extracted_license_statement,
4244        notice_text: None,
4245        source_packages: Vec::new(),
4246        file_references: Vec::new(),
4247        is_private: false,
4248        is_virtual: false,
4249        extra_data: None,
4250        dependencies,
4251        repository_homepage_url: None,
4252        repository_download_url: None,
4253        api_data_url: None,
4254        datasource_id: Some(DatasourceId::PypiSetupPy),
4255        purl,
4256    }
4257}
4258
/// Converts `pkg` into a `ResolvedPackage` by delegating to
/// `ResolvedPackage::from_package_data` with the PyPI package type.
fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
    crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
}
4262
4263fn extract_from_pypi_json(path: &Path) -> PackageData {
4264    let default = PackageData {
4265        package_type: Some(PythonParser::PACKAGE_TYPE),
4266        datasource_id: Some(DatasourceId::PypiJson),
4267        ..Default::default()
4268    };
4269
4270    let content = match read_file_to_string(path, None) {
4271        Ok(content) => content,
4272        Err(error) => {
4273            warn!("Failed to read pypi.json at {:?}: {}", path, error);
4274            return default;
4275        }
4276    };
4277
4278    let root: serde_json::Value = match serde_json::from_str(&content) {
4279        Ok(value) => value,
4280        Err(error) => {
4281            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
4282            return default;
4283        }
4284    };
4285
4286    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
4287        warn!("No info object found in pypi.json at {:?}", path);
4288        return default;
4289    };
4290
4291    let name = info
4292        .get("name")
4293        .and_then(|value| value.as_str())
4294        .map(|v| truncate_field(v.to_owned()));
4295    let version = info
4296        .get("version")
4297        .and_then(|value| value.as_str())
4298        .map(ToOwned::to_owned);
4299    let summary = info
4300        .get("summary")
4301        .and_then(|value| value.as_str())
4302        .map(|v| truncate_field(v.to_owned()));
4303    let description = info
4304        .get("description")
4305        .and_then(|value| value.as_str())
4306        .filter(|value| !value.trim().is_empty())
4307        .map(|v| truncate_field(v.to_owned()))
4308        .or(summary);
4309    let mut homepage_url = info
4310        .get("home_page")
4311        .and_then(|value| value.as_str())
4312        .map(|v| truncate_field(v.to_owned()));
4313    let author = info
4314        .get("author")
4315        .and_then(|value| value.as_str())
4316        .filter(|value| !value.trim().is_empty())
4317        .map(|v| truncate_field(v.to_owned()));
4318    let author_email = info
4319        .get("author_email")
4320        .and_then(|value| value.as_str())
4321        .filter(|value| !value.trim().is_empty())
4322        .map(ToOwned::to_owned);
4323    let license = info
4324        .get("license")
4325        .and_then(|value| value.as_str())
4326        .filter(|value| !value.trim().is_empty())
4327        .map(ToOwned::to_owned);
4328    let keywords = parse_setup_cfg_keywords(
4329        info.get("keywords")
4330            .and_then(|value| value.as_str())
4331            .map(ToOwned::to_owned),
4332    );
4333    let classifiers = info
4334        .get("classifiers")
4335        .and_then(|value| value.as_array())
4336        .map(|values| {
4337            values
4338                .iter()
4339                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
4340                .collect::<Vec<_>>()
4341        })
4342        .unwrap_or_default();
4343
4344    let mut parties = Vec::new();
4345    if author.is_some() || author_email.is_some() {
4346        parties.push(Party {
4347            r#type: Some("person".to_string()),
4348            role: Some("author".to_string()),
4349            name: author,
4350            email: author_email,
4351            url: None,
4352            organization: None,
4353            organization_url: None,
4354            timezone: None,
4355        });
4356    }
4357
4358    let mut bug_tracking_url = None;
4359    let mut code_view_url = None;
4360    let mut vcs_url = None;
4361    let mut extra_data = HashMap::new();
4362
4363    let parsed_project_urls = info
4364        .get("project_urls")
4365        .and_then(|value| value.as_object())
4366        .map(|map| {
4367            let mut pairs: Vec<(String, String)> = map
4368                .iter()
4369                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
4370                .collect();
4371            pairs.sort_by(|left, right| left.0.cmp(&right.0));
4372            pairs
4373        })
4374        .unwrap_or_default();
4375
4376    apply_project_url_mappings(
4377        &parsed_project_urls,
4378        &mut homepage_url,
4379        &mut bug_tracking_url,
4380        &mut code_view_url,
4381        &mut vcs_url,
4382        &mut extra_data,
4383    );
4384
4385    let (download_url, size, sha256) = root
4386        .get("urls")
4387        .and_then(|value| value.as_array())
4388        .map(|urls| select_pypi_json_artifact(urls))
4389        .unwrap_or((None, None, None));
4390
4391    let sha256 = sha256.and_then(|h| Sha256Digest::from_hex(&h).ok());
4392
4393    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4394        normalize_spdx_declared_license(license.as_deref());
4395    let dependencies = info
4396        .get("requires_dist")
4397        .and_then(|value| value.as_array())
4398        .map(|entries| {
4399            entries
4400                .iter()
4401                .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4402                .collect::<Vec<_>>()
4403        })
4404        .map(|entries| extract_requires_dist_dependencies(&entries))
4405        .unwrap_or_default();
4406
4407    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
4408        build_pypi_urls(name.as_deref(), version.as_deref());
4409
4410    PackageData {
4411        package_type: Some(PythonParser::PACKAGE_TYPE),
4412        namespace: None,
4413        name,
4414        version,
4415        qualifiers: None,
4416        subpath: None,
4417        primary_language: None,
4418        description,
4419        release_date: None,
4420        parties,
4421        keywords,
4422        homepage_url: homepage_url.or(repository_homepage_url.clone()),
4423        download_url,
4424        size,
4425        sha1: None,
4426        md5: None,
4427        sha256,
4428        sha512: None,
4429        bug_tracking_url,
4430        code_view_url,
4431        vcs_url,
4432        copyright: None,
4433        holder: None,
4434        declared_license_expression,
4435        declared_license_expression_spdx,
4436        license_detections,
4437        other_license_expression: None,
4438        other_license_expression_spdx: None,
4439        other_license_detections: Vec::new(),
4440        extracted_license_statement: license,
4441        notice_text: None,
4442        source_packages: Vec::new(),
4443        file_references: Vec::new(),
4444        is_private: has_private_classifier(&classifiers),
4445        is_virtual: false,
4446        extra_data: if extra_data.is_empty() {
4447            None
4448        } else {
4449            Some(extra_data)
4450        },
4451        dependencies,
4452        repository_homepage_url,
4453        repository_download_url,
4454        api_data_url,
4455        datasource_id: Some(DatasourceId::PypiJson),
4456        purl,
4457    }
4458}
4459
4460fn select_pypi_json_artifact(
4461    urls: &[serde_json::Value],
4462) -> (Option<String>, Option<u64>, Option<String>) {
4463    let selected = urls
4464        .iter()
4465        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4466        .or_else(|| urls.first());
4467
4468    let Some(entry) = selected else {
4469        return (None, None, None);
4470    };
4471
4472    let download_url = entry
4473        .get("url")
4474        .and_then(|value| value.as_str())
4475        .map(ToOwned::to_owned);
4476    let size = entry.get("size").and_then(|value| value.as_u64());
4477    let sha256 = entry
4478        .get("digests")
4479        .and_then(|value| value.as_object())
4480        .and_then(|digests| digests.get("sha256"))
4481        .and_then(|value| value.as_str())
4482        .map(ToOwned::to_owned);
4483
4484    (download_url, size, sha256)
4485}
4486
4487fn extract_from_pip_inspect(path: &Path) -> PackageData {
4488    let content = match read_file_to_string(path, None) {
4489        Ok(content) => content,
4490        Err(e) => {
4491            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4492            return default_package_data(path);
4493        }
4494    };
4495
4496    let root: serde_json::Value = match serde_json::from_str(&content) {
4497        Ok(value) => value,
4498        Err(e) => {
4499            warn!(
4500                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4501                path, e
4502            );
4503            return default_package_data(path);
4504        }
4505    };
4506
4507    let installed = match root.get("installed").and_then(|v| v.as_array()) {
4508        Some(arr) => arr,
4509        None => {
4510            warn!(
4511                "No 'installed' array found in pip-inspect.deplock at {:?}",
4512                path
4513            );
4514            return default_package_data(path);
4515        }
4516    };
4517
4518    let pip_version = root
4519        .get("pip_version")
4520        .and_then(|v| v.as_str())
4521        .map(String::from);
4522    let inspect_version = root
4523        .get("version")
4524        .and_then(|v| v.as_str())
4525        .map(String::from);
4526
4527    let mut main_package: Option<PackageData> = None;
4528    let mut dependencies: Vec<Dependency> = Vec::new();
4529
4530    for package_entry in installed {
4531        let metadata = match package_entry.get("metadata") {
4532            Some(m) => m,
4533            None => continue,
4534        };
4535
4536        let is_requested = package_entry
4537            .get("requested")
4538            .and_then(|v| v.as_bool())
4539            .unwrap_or(false);
4540        let has_direct_url = package_entry.get("direct_url").is_some();
4541
4542        let name = metadata
4543            .get("name")
4544            .and_then(|v| v.as_str())
4545            .map(|v| truncate_field(v.to_string()));
4546        let version = metadata
4547            .get("version")
4548            .and_then(|v| v.as_str())
4549            .map(String::from);
4550        let summary = metadata
4551            .get("summary")
4552            .and_then(|v| v.as_str())
4553            .map(|v| truncate_field(v.to_string()));
4554        let home_page = metadata
4555            .get("home_page")
4556            .and_then(|v| v.as_str())
4557            .map(|v| truncate_field(v.to_string()));
4558        let author = metadata
4559            .get("author")
4560            .and_then(|v| v.as_str())
4561            .map(|v| truncate_field(v.to_string()));
4562        let author_email = metadata
4563            .get("author_email")
4564            .and_then(|v| v.as_str())
4565            .map(String::from);
4566        let license = metadata
4567            .get("license")
4568            .and_then(|v| v.as_str())
4569            .map(|v| truncate_field(v.to_string()));
4570        let description = metadata
4571            .get("description")
4572            .and_then(|v| v.as_str())
4573            .map(|v| truncate_field(v.to_string()));
4574        let keywords = metadata
4575            .get("keywords")
4576            .and_then(|v| v.as_array())
4577            .map(|arr| {
4578                arr.iter()
4579                    .filter_map(|k| k.as_str().map(String::from))
4580                    .collect::<Vec<_>>()
4581            })
4582            .unwrap_or_default();
4583
4584        let mut parties = Vec::new();
4585        if author.is_some() || author_email.is_some() {
4586            parties.push(Party {
4587                r#type: Some("person".to_string()),
4588                role: Some("author".to_string()),
4589                name: author,
4590                email: author_email,
4591                url: None,
4592                organization: None,
4593                organization_url: None,
4594                timezone: None,
4595            });
4596        }
4597
4598        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4599            normalize_spdx_declared_license(license.as_deref());
4600        let extracted_license_statement = license.clone();
4601        let requires_dist = metadata
4602            .get("requires_dist")
4603            .and_then(|v| v.as_array())
4604            .map(|entries| {
4605                entries
4606                    .iter()
4607                    .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4608                    .collect::<Vec<_>>()
4609            })
4610            .unwrap_or_default();
4611        let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4612
4613        let purl = name.as_ref().and_then(|n| {
4614            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4615            if let Some(v) = &version {
4616                package_url.with_version(v).ok()?;
4617            }
4618            Some(package_url.to_string())
4619        });
4620
4621        if is_requested && has_direct_url {
4622            let mut extra_data = HashMap::new();
4623            if let Some(pv) = &pip_version {
4624                extra_data.insert(
4625                    "pip_version".to_string(),
4626                    serde_json::Value::String(pv.clone()),
4627                );
4628            }
4629            if let Some(iv) = &inspect_version {
4630                extra_data.insert(
4631                    "inspect_version".to_string(),
4632                    serde_json::Value::String(iv.clone()),
4633                );
4634            }
4635
4636            main_package = Some(PackageData {
4637                package_type: Some(PythonParser::PACKAGE_TYPE),
4638                namespace: None,
4639                name,
4640                version,
4641                qualifiers: None,
4642                subpath: None,
4643                primary_language: Some("Python".to_string()),
4644                description: description.or(summary),
4645                release_date: None,
4646                parties,
4647                keywords,
4648                homepage_url: home_page,
4649                download_url: None,
4650                size: None,
4651                sha1: None,
4652                md5: None,
4653                sha256: None,
4654                sha512: None,
4655                bug_tracking_url: None,
4656                code_view_url: None,
4657                vcs_url: None,
4658                copyright: None,
4659                holder: None,
4660                declared_license_expression,
4661                declared_license_expression_spdx,
4662                license_detections,
4663                other_license_expression: None,
4664                other_license_expression_spdx: None,
4665                other_license_detections: Vec::new(),
4666                extracted_license_statement,
4667                notice_text: None,
4668                source_packages: Vec::new(),
4669                file_references: Vec::new(),
4670                is_private: false,
4671                is_virtual: true,
4672                extra_data: if extra_data.is_empty() {
4673                    None
4674                } else {
4675                    Some(extra_data)
4676                },
4677                dependencies: parsed_dependencies,
4678                repository_homepage_url: None,
4679                repository_download_url: None,
4680                api_data_url: None,
4681                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4682                purl,
4683            });
4684        } else {
4685            let resolved_package = PackageData {
4686                package_type: Some(PythonParser::PACKAGE_TYPE),
4687                namespace: None,
4688                name: name.clone(),
4689                version: version.clone(),
4690                qualifiers: None,
4691                subpath: None,
4692                primary_language: Some("Python".to_string()),
4693                description: description.or(summary),
4694                release_date: None,
4695                parties,
4696                keywords,
4697                homepage_url: home_page,
4698                download_url: None,
4699                size: None,
4700                sha1: None,
4701                md5: None,
4702                sha256: None,
4703                sha512: None,
4704                bug_tracking_url: None,
4705                code_view_url: None,
4706                vcs_url: None,
4707                copyright: None,
4708                holder: None,
4709                declared_license_expression,
4710                declared_license_expression_spdx,
4711                license_detections,
4712                other_license_expression: None,
4713                other_license_expression_spdx: None,
4714                other_license_detections: Vec::new(),
4715                extracted_license_statement,
4716                notice_text: None,
4717                source_packages: Vec::new(),
4718                file_references: Vec::new(),
4719                is_private: false,
4720                is_virtual: true,
4721                extra_data: None,
4722                dependencies: parsed_dependencies,
4723                repository_homepage_url: None,
4724                repository_download_url: None,
4725                api_data_url: None,
4726                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4727                purl: purl.clone(),
4728            };
4729
4730            let resolved = package_data_to_resolved(&resolved_package);
4731            dependencies.push(Dependency {
4732                purl,
4733                extracted_requirement: None,
4734                scope: None,
4735                is_runtime: Some(true),
4736                is_optional: Some(false),
4737                is_pinned: Some(true),
4738                is_direct: Some(is_requested),
4739                resolved_package: Some(Box::new(resolved)),
4740                extra_data: None,
4741            });
4742        }
4743    }
4744
4745    if let Some(mut main_pkg) = main_package {
4746        let direct_requirement_purls: HashSet<String> = main_pkg
4747            .dependencies
4748            .iter()
4749            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4750            .collect();
4751
4752        let resolved_requirement_purls: HashSet<String> = dependencies
4753            .iter()
4754            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4755            .collect();
4756
4757        let unresolved_dependencies = main_pkg
4758            .dependencies
4759            .iter()
4760            .filter(|dep| {
4761                dep.purl.as_ref().is_some_and(|purl| {
4762                    !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4763                })
4764            })
4765            .cloned()
4766            .collect::<Vec<_>>();
4767
4768        for dependency in &mut dependencies {
4769            if dependency
4770                .purl
4771                .as_ref()
4772                .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4773            {
4774                dependency.is_direct = Some(true);
4775            }
4776        }
4777
4778        main_pkg.dependencies = dependencies;
4779        main_pkg.dependencies.extend(unresolved_dependencies);
4780        main_pkg
4781    } else {
4782        default_package_data(path)
4783    }
4784}
4785
/// Returns the part of `purl` before its first `@`, i.e. the purl without a
/// version suffix. A purl containing no `@` is returned unchanged.
fn base_dependency_purl(purl: &str) -> String {
    let base = match purl.find('@') {
        Some(at) => &purl[..at],
        None => purl,
    };
    base.to_string()
}
4791
/// Parsed `setup.cfg` contents: section name -> option name -> list of value lines.
///
/// As produced by `parse_setup_cfg`, section and option names are ASCII-lowercased
/// and every stored value is a trimmed, non-empty line.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4793
4794fn extract_from_setup_cfg(path: &Path) -> PackageData {
4795    let content = match read_file_to_string(path, None) {
4796        Ok(content) => content,
4797        Err(e) => {
4798            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4799            return default_package_data(path);
4800        }
4801    };
4802
4803    let sections = parse_setup_cfg(&content);
4804    let name = get_ini_value(&sections, "metadata", "name").map(truncate_field);
4805    let version = get_ini_value(&sections, "metadata", "version").map(truncate_field);
4806    let description = get_ini_value(&sections, "metadata", "description").map(truncate_field);
4807    let author = get_ini_value(&sections, "metadata", "author").map(truncate_field);
4808    let author_email = get_ini_value(&sections, "metadata", "author_email");
4809    let maintainer = get_ini_value(&sections, "metadata", "maintainer").map(truncate_field);
4810    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
4811    let license = get_ini_value(&sections, "metadata", "license").map(truncate_field);
4812    let mut homepage_url = get_ini_value(&sections, "metadata", "url").map(truncate_field);
4813    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
4814    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
4815    let python_requires = get_ini_value(&sections, "options", "python_requires");
4816    let parsed_project_urls =
4817        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
4818    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4819    let mut extra_data = HashMap::new();
4820
4821    let mut parties = Vec::new();
4822    if author.is_some() || author_email.is_some() {
4823        parties.push(Party {
4824            r#type: Some("person".to_string()),
4825            role: Some("author".to_string()),
4826            name: author,
4827            email: author_email,
4828            url: None,
4829            organization: None,
4830            organization_url: None,
4831            timezone: None,
4832        });
4833    }
4834
4835    if maintainer.is_some() || maintainer_email.is_some() {
4836        parties.push(Party {
4837            r#type: Some("person".to_string()),
4838            role: Some("maintainer".to_string()),
4839            name: maintainer,
4840            email: maintainer_email,
4841            url: None,
4842            organization: None,
4843            organization_url: None,
4844            timezone: None,
4845        });
4846    }
4847
4848    let declared_license_expression = None;
4849    let declared_license_expression_spdx = None;
4850    let license_detections = Vec::new();
4851    let extracted_license_statement = license.clone();
4852
4853    let dependencies = extract_setup_cfg_dependencies(&sections);
4854
4855    if let Some(value) = python_requires {
4856        extra_data.insert(
4857            "python_requires".to_string(),
4858            serde_json::Value::String(value),
4859        );
4860    }
4861
4862    apply_project_url_mappings(
4863        &parsed_project_urls,
4864        &mut homepage_url,
4865        &mut bug_tracking_url,
4866        &mut code_view_url,
4867        &mut vcs_url,
4868        &mut extra_data,
4869    );
4870
4871    let extra_data = if extra_data.is_empty() {
4872        None
4873    } else {
4874        Some(extra_data)
4875    };
4876
4877    let purl = name.as_ref().and_then(|n| {
4878        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4879        if let Some(v) = &version {
4880            package_url.with_version(v).ok()?;
4881        }
4882        Some(package_url.to_string())
4883    });
4884
4885    PackageData {
4886        package_type: Some(PythonParser::PACKAGE_TYPE),
4887        namespace: None,
4888        name,
4889        version,
4890        qualifiers: None,
4891        subpath: None,
4892        primary_language: Some("Python".to_string()),
4893        description,
4894        release_date: None,
4895        parties,
4896        keywords,
4897        homepage_url,
4898        download_url: None,
4899        size: None,
4900        sha1: None,
4901        md5: None,
4902        sha256: None,
4903        sha512: None,
4904        bug_tracking_url,
4905        code_view_url,
4906        vcs_url,
4907        copyright: None,
4908        holder: None,
4909        declared_license_expression,
4910        declared_license_expression_spdx,
4911        license_detections,
4912        other_license_expression: None,
4913        other_license_expression_spdx: None,
4914        other_license_detections: Vec::new(),
4915        extracted_license_statement,
4916        notice_text: None,
4917        source_packages: Vec::new(),
4918        file_references: Vec::new(),
4919        is_private: has_private_classifier(&classifiers),
4920        is_virtual: false,
4921        extra_data,
4922        dependencies,
4923        repository_homepage_url: None,
4924        repository_download_url: None,
4925        api_data_url: None,
4926        datasource_id: Some(DatasourceId::PypiSetupCfg),
4927        purl,
4928    }
4929}
4930
/// Splits a comma-separated `keywords` value into trimmed, non-empty keywords.
///
/// Returns an empty list when `value` is `None`.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    match value {
        None => Vec::new(),
        Some(raw) => {
            let mut keywords = Vec::new();
            for piece in raw.split(',') {
                let piece = piece.trim();
                if !piece.is_empty() {
                    keywords.push(piece.to_string());
                }
            }
            keywords
        }
    }
}
4943
/// Turns `label = url` lines into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries with
/// no `=`, an empty label or an empty URL are dropped. Input order is kept.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };

        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }

    pairs
}
4959
4960fn apply_project_url_mappings(
4961    parsed_urls: &[(String, String)],
4962    homepage_url: &mut Option<String>,
4963    bug_tracking_url: &mut Option<String>,
4964    code_view_url: &mut Option<String>,
4965    vcs_url: &mut Option<String>,
4966    extra_data: &mut HashMap<String, serde_json::Value>,
4967) {
4968    for (label, url) in parsed_urls {
4969        let label_lower = label.to_lowercase();
4970
4971        if bug_tracking_url.is_none()
4972            && matches!(
4973                label_lower.as_str(),
4974                "tracker"
4975                    | "bug reports"
4976                    | "bug tracker"
4977                    | "issues"
4978                    | "issue tracker"
4979                    | "github: issues"
4980            )
4981        {
4982            *bug_tracking_url = Some(url.clone());
4983        } else if code_view_url.is_none()
4984            && matches!(label_lower.as_str(), "source" | "source code" | "code")
4985        {
4986            *code_view_url = Some(url.clone());
4987        } else if vcs_url.is_none()
4988            && matches!(
4989                label_lower.as_str(),
4990                "github" | "gitlab" | "github: repo" | "repository"
4991            )
4992        {
4993            *vcs_url = Some(url.clone());
4994        } else if homepage_url.is_none()
4995            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4996        {
4997            *homepage_url = Some(url.clone());
4998        } else if label_lower == "changelog" {
4999            extra_data.insert(
5000                "changelog_url".to_string(),
5001                serde_json::Value::String(url.clone()),
5002            );
5003        }
5004    }
5005
5006    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
5007        .iter()
5008        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
5009        .collect();
5010
5011    if !project_urls_json.is_empty() {
5012        extra_data.insert(
5013            "project_urls".to_string(),
5014            serde_json::Value::Object(project_urls_json),
5015        );
5016    }
5017}
5018
5019fn parse_setup_cfg(content: &str) -> IniSections {
5020    let mut sections: IniSections = HashMap::new();
5021    let mut current_section: Option<String> = None;
5022    let mut current_key: Option<String> = None;
5023
5024    for raw_line in content.lines() {
5025        let line = raw_line.trim_end_matches('\r');
5026        let trimmed = line.trim();
5027        if trimmed.is_empty() {
5028            continue;
5029        }
5030
5031        let stripped = line.trim_start();
5032        if stripped.starts_with('#') || stripped.starts_with(';') {
5033            continue;
5034        }
5035
5036        if stripped.starts_with('[') && stripped.ends_with(']') {
5037            let section_name = stripped
5038                .trim_start_matches('[')
5039                .trim_end_matches(']')
5040                .trim()
5041                .to_ascii_lowercase();
5042            current_section = if section_name.is_empty() {
5043                None
5044            } else {
5045                Some(section_name)
5046            };
5047            current_key = None;
5048            continue;
5049        }
5050
5051        if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
5052            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
5053                let value = stripped.trim();
5054                if !value.is_empty() {
5055                    sections
5056                        .entry(section.clone())
5057                        .or_default()
5058                        .entry(key.clone())
5059                        .or_default()
5060                        .push(value.to_string());
5061                }
5062            }
5063            continue;
5064        }
5065
5066        if let Some((key, value)) = stripped.split_once('=')
5067            && let Some(section) = current_section.as_ref()
5068        {
5069            let key_name = key.trim().to_ascii_lowercase();
5070            let value_trimmed = value.trim();
5071            let entry = sections
5072                .entry(section.clone())
5073                .or_default()
5074                .entry(key_name.clone())
5075                .or_default();
5076            if !value_trimmed.is_empty() {
5077                entry.push(value_trimmed.to_string());
5078            }
5079            current_key = Some(key_name);
5080        }
5081    }
5082
5083    sections
5084}
5085
5086fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
5087    sections
5088        .get(&section.to_ascii_lowercase())
5089        .and_then(|values| values.get(&key.to_ascii_lowercase()))
5090        .and_then(|entries| entries.first())
5091        .map(|value| value.trim().to_string())
5092}
5093
5094fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
5095    sections
5096        .get(&section.to_ascii_lowercase())
5097        .and_then(|values| values.get(&key.to_ascii_lowercase()))
5098        .cloned()
5099        .unwrap_or_default()
5100}
5101
5102fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
5103    let mut dependencies = Vec::new();
5104
5105    for (sub_section, scope) in [
5106        ("install_requires", "install"),
5107        ("tests_require", "test"),
5108        ("setup_requires", "setup"),
5109    ] {
5110        let reqs = get_ini_values(sections, "options", sub_section);
5111        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
5112    }
5113
5114    if let Some(extras) = sections.get("options.extras_require") {
5115        let mut extra_items: Vec<_> = extras.iter().collect();
5116        extra_items.sort_by_key(|(name, _)| *name);
5117        for (extra_name, reqs) in extra_items {
5118            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
5119        }
5120    }
5121
5122    dependencies
5123}
5124
5125fn parse_setup_cfg_requirements(
5126    reqs: &[String],
5127    scope: &str,
5128    is_optional: bool,
5129) -> Vec<Dependency> {
5130    reqs.iter()
5131        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
5132        .collect()
5133}
5134
5135fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
5136    let trimmed = req.trim();
5137    if trimmed.is_empty() || trimmed.starts_with('#') {
5138        return None;
5139    }
5140
5141    let name = extract_setup_cfg_dependency_name(trimmed)?;
5142    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5143
5144    Some(Dependency {
5145        purl: Some(purl.to_string()),
5146        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
5147        scope: Some(scope.to_string()),
5148        is_runtime: Some(true),
5149        is_optional: Some(is_optional),
5150        is_pinned: Some(false),
5151        is_direct: Some(true),
5152        resolved_package: None,
5153        extra_data: None,
5154    })
5155}
5156
/// Extracts the distribution name from the front of a requirement string.
///
/// The name ends at the first whitespace character or at the first character
/// that starts another part of a PEP 508 requirement: a version operator
/// (`<`, `>`, `=`, `!`, `~`), a parenthesised version spec (`(`), extras (`[`),
/// an environment marker (`;`) or a direct-reference URL (`@`). None of these
/// characters is valid inside a distribution name.
///
/// Returns `None` when the input is blank or starts with one of those
/// characters.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();
    if trimmed.is_empty() {
        return None;
    }

    let end = trimmed
        .find(|c: char| {
            c.is_whitespace()
                || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    let name = &trimmed[..end];
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
5173
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    let mut compact = req.to_string();
    compact.retain(|c| !c.is_whitespace());
    compact
}
5177
/// Finds a quoted `key=value` assignment in `setup.py` source text and returns
/// the text between the quotes.
///
/// Double-quoted forms are tried before single-quoted ones, and for each quote
/// style the spacings `key="`, `key ="`, `key= "` and `key = "` are tried in
/// that order. For each form only its first occurrence in `content` is
/// examined. The value ends at the next occurrence of the *same* quote
/// character that opened it, so an apostrophe inside a double-quoted value (or
/// a double quote inside a single-quoted one) does not cut the value short.
///
/// This is a plain text search: escape sequences are not interpreted and `key`
/// is matched as a substring. Returns `None` when no form yields a terminated
/// value.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    for quote in ['"', '\''] {
        for separator in ["=", " =", "= ", " = "] {
            let pattern = format!("{key}{separator}{quote}");

            let Some(start_idx) = content.find(&pattern) else {
                continue;
            };
            let remaining = &content[start_idx + pattern.len()..];

            if let Some(end_idx) = remaining.find(quote) {
                return Some(remaining[..end_idx].to_string());
            }
        }
    }

    None
}
5203
5204fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
5205    let mut dependencies = Vec::new();
5206
5207    if let Some(tests_deps) = extract_tests_require(content) {
5208        dependencies.extend(tests_deps);
5209    }
5210
5211    if let Some(extras_deps) = extract_extras_require(content) {
5212        dependencies.extend(extras_deps);
5213    }
5214
5215    dependencies
5216}
5217
5218fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
5219    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
5220    let re = Regex::new(pattern).ok()?;
5221    let captures = re.captures(content)?;
5222    let deps_str = captures.get(1)?.as_str();
5223
5224    let deps = parse_setup_py_dep_list(deps_str, "test", true);
5225    if deps.is_empty() { None } else { Some(deps) }
5226}
5227
5228fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
5229    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
5230    let re = Regex::new(pattern).ok()?;
5231    let captures = re.captures(content)?;
5232    let dict_content = captures.get(1)?.as_str();
5233
5234    let mut all_deps = Vec::new();
5235
5236    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
5237    let entry_re = Regex::new(entry_pattern).ok()?;
5238
5239    for entry_cap in entry_re.captures_iter(dict_content) {
5240        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
5241            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
5242            all_deps.extend(deps);
5243        }
5244    }
5245
5246    if all_deps.is_empty() {
5247        None
5248    } else {
5249        Some(all_deps)
5250    }
5251}
5252
5253fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
5254    let dep_pattern = r#"['"]([^'"]+)['"]"#;
5255    let re = match Regex::new(dep_pattern) {
5256        Ok(r) => r,
5257        Err(_) => return Vec::new(),
5258    };
5259
5260    re.captures_iter(deps_str)
5261        .filter_map(|cap| {
5262            let dep_str = cap.get(1)?.as_str().trim();
5263            if dep_str.is_empty() {
5264                return None;
5265            }
5266
5267            let name = extract_setup_cfg_dependency_name(dep_str)?;
5268            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5269
5270            Some(Dependency {
5271                purl: Some(purl.to_string()),
5272                extracted_requirement: Some(dep_str.to_string()),
5273                scope: Some(scope.to_string()),
5274                is_runtime: Some(true),
5275                is_optional: Some(is_optional),
5276                is_pinned: Some(false),
5277                is_direct: Some(true),
5278                resolved_package: None,
5279                extra_data: None,
5280            })
5281        })
5282        .collect()
5283}
5284
5285/// Reads and parses a TOML file
5286pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
5287    let content = read_file_to_string(path, None).map_err(|e| e.to_string())?;
5288    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
5289}
5290
5291/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
5292///
5293/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
5294/// Essential for SBOM compliance and package integrity verification.
5295///
5296/// # Returns
5297///
5298/// - `(Some(size), Some(hash))` on success
5299/// - `(None, None)` if file cannot be opened
5300/// - `(Some(size), None)` if hash calculation fails during read
5301fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<Sha256Digest>) {
5302    let mut file = match File::open(path) {
5303        Ok(f) => f,
5304        Err(_) => return (None, None),
5305    };
5306
5307    let metadata = match file.metadata() {
5308        Ok(m) => m,
5309        Err(_) => return (None, None),
5310    };
5311    let size = metadata.len();
5312
5313    let mut hasher = Sha256::new();
5314    let mut buffer = vec![0; 8192];
5315
5316    loop {
5317        match file.read(&mut buffer) {
5318            Ok(0) => break,
5319            Ok(n) => hasher.update(&buffer[..n]),
5320            Err(_) => return (Some(size), None),
5321        }
5322    }
5323
5324    let hash = Sha256Digest::from_bytes(hasher.finalize().into());
5325    (Some(size), Some(hash))
5326}
5327
5328fn default_package_data(path: &Path) -> PackageData {
5329    PackageData {
5330        package_type: Some(PythonParser::PACKAGE_TYPE),
5331        primary_language: Some("Python".to_string()),
5332        datasource_id: infer_python_datasource_id(path),
5333        ..Default::default()
5334    }
5335}
5336
5337fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
5338    let file_name = path.file_name().and_then(|name| name.to_str());
5339
5340    match file_name {
5341        Some("pyproject.toml") => {
5342            if read_toml_file(path)
5343                .ok()
5344                .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
5345                .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
5346                .is_some()
5347            {
5348                Some(DatasourceId::PypiPoetryPyprojectToml)
5349            } else {
5350                Some(DatasourceId::PypiPyprojectToml)
5351            }
5352        }
5353        Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
5354            Some(DatasourceId::PypiSetupPy)
5355        }
5356        Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
5357        Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
5358        Some("METADATA") if is_installed_wheel_metadata_path(path) => {
5359            Some(DatasourceId::PypiWheelMetadata)
5360        }
5361        Some("pypi.json") => Some(DatasourceId::PypiJson),
5362        Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
5363        Some("origin.json") if is_pip_cache_origin_json(path) => {
5364            Some(DatasourceId::PypiPipOriginJson)
5365        }
5366        _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
5367            Some(DatasourceId::PypiSdist)
5368        }
5369        _ if path
5370            .extension()
5371            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
5372        {
5373            Some(DatasourceId::PypiWheel)
5374        }
5375        _ if path
5376            .extension()
5377            .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
5378        {
5379            Some(DatasourceId::PypiEgg)
5380        }
5381        _ => None,
5382    }
5383}
5384
// Registers the Python parser's metadata through `register_parser!`.
// The arguments are, in order: a human-readable description, the glob patterns
// of candidate files, and then presumably the package type, the primary
// language and a documentation URL (meanings inferred from the values; confirm
// against the macro definition).
//
// NOTE(review): several globs are broader than what `infer_python_datasource_id`
// accepts. `**/origin.json` matches any `origin.json`, while that function only
// assigns a datasource when `is_pip_cache_origin_json` holds. The archive globs
// match any tarball/zip, while sdist detection also requires
// `is_likely_python_sdist_filename`.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);