Skip to main content

provenant/parsers/
python.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34use crate::models::{
35    DatasourceId, Dependency, FileReference, PackageData, PackageType, Party, Sha256Digest,
36};
37use crate::parser_warn as warn;
38use crate::parsers::utils::{
39    MAX_ITERATION_COUNT, read_file_to_string, split_name_email, truncate_field,
40};
41use base64::Engine;
42use base64::engine::general_purpose::URL_SAFE_NO_PAD;
43use bzip2::read::BzDecoder;
44use csv::ReaderBuilder;
45use flate2::read::GzDecoder;
46use liblzma::read::XzDecoder;
47use packageurl::PackageUrl;
48use regex::Regex;
49use ruff_python_ast as ast;
50use ruff_python_parser::parse_module;
51use serde_json::{Map as JsonMap, Value as JsonValue};
52use sha2::{Digest, Sha256};
53use std::collections::{HashMap, HashSet};
54use std::fs::File;
55use std::io::Read;
56use std::path::{Component, Path, PathBuf};
57use tar::Archive;
58use toml::Value as TomlValue;
59use toml::map::Map as TomlMap;
60use zip::ZipArchive;
61
62use super::PackageParser;
63use super::license_normalization::{
64    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
65    normalize_spdx_expression,
66};
67use super::pep508::parse_pep508_requirement;
68
// Table and key names used when reading pyproject.toml.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_EXTRAS: &str = "extras";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

/// Five optional URL strings handled together as one tuple.
///
/// NOTE(review): the meaning of each position is fixed by the code that builds
/// and destructures this tuple elsewhere in the file - confirm there.
type ProjectUrls = (
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
);

// Limits for setup.py analysis (byte size, AST node count, AST nesting depth).
const MAX_SETUP_PY_BYTES: usize = 1_048_576;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

// Archive safety limits.
// 100MB: caps the on-disk size of an sdist archive and the running total of
// declared uncompressed entry sizes inside a zip archive.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;
// 50MB: caps the declared uncompressed size of a single archive entry.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
// 100:1: largest accepted uncompressed/compressed ratio for one entry.
const MAX_COMPRESSION_RATIO: f64 = 100.0;
100
/// Entry point for parsing Python packaging metadata.
///
/// Its `PackageParser` implementation dispatches on the file name or extension
/// to one of eleven extractors: pyproject.toml, setup.cfg, setup.py-like files,
/// PKG-INFO, installed-wheel METADATA, pip cache origin.json, pypi.json,
/// pip-inspect.deplock, sdist archives, `.whl` archives and `.egg` archives.
///
/// # Security
///
/// setup.py files are analysed through their AST and are never executed, so
/// scanning cannot run arbitrary code. See `extract_from_setup_py_ast` for details.
pub struct PythonParser;
111
/// Container and compression combination detected for a source distribution archive.
#[derive(Debug, Clone, Copy)]
enum PythonSdistArchiveFormat {
    /// File name ends with `.tar.gz`.
    TarGz,
    /// File name ends with `.tgz`.
    Tgz,
    /// File name ends with `.tar.bz2`.
    TarBz2,
    /// File name ends with `.tar.xz`.
    TarXz,
    /// File name ends with `.zip`.
    Zip,
}
120
/// A zip entry that passed the archive safety checks.
#[derive(Debug, Clone)]
struct ValidatedZipEntry {
    /// Position of the entry inside the zip archive.
    index: usize,
    /// Entry path after normalization.
    name: String,
}
126
127impl PackageParser for PythonParser {
128    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
129
130    fn extract_packages(path: &Path) -> Vec<PackageData> {
131        vec![
132            if path.file_name().unwrap_or_default() == "pyproject.toml" {
133                extract_from_pyproject_toml(path)
134            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
135                extract_from_setup_cfg(path)
136            } else if is_setup_py_like_path(path) {
137                return extract_setup_py_packages(path);
138            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
139                extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
140            } else if is_installed_wheel_metadata_path(path) {
141                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
142            } else if is_pip_cache_origin_json(path) {
143                extract_from_pip_origin_json(path)
144            } else if path.file_name().unwrap_or_default() == "pypi.json" {
145                extract_from_pypi_json(path)
146            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
147                extract_from_pip_inspect(path)
148            } else if is_python_sdist_archive_path(path) {
149                extract_from_sdist_archive(path)
150            } else if path
151                .extension()
152                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
153            {
154                extract_from_wheel_archive(path)
155            } else if path
156                .extension()
157                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
158            {
159                extract_from_egg_archive(path)
160            } else {
161                default_package_data(path)
162            },
163        ]
164    }
165
166    fn is_match(path: &Path) -> bool {
167        if let Some(filename) = path.file_name()
168            && (filename == "pyproject.toml"
169                || filename == "setup.cfg"
170                || is_setup_py_like_path(path)
171                || filename == "PKG-INFO"
172                || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
173                || filename == "pypi.json"
174                || filename == "pip-inspect.deplock"
175                || is_pip_cache_origin_json(path))
176        {
177            return true;
178        }
179
180        if let Some(extension) = path.extension() {
181            let ext = extension.to_string_lossy().to_lowercase();
182            if (ext == "whl" && is_valid_wheel_archive_path(path))
183                || ext == "egg"
184                || is_python_sdist_archive_path(path)
185            {
186                return true;
187            }
188        }
189
190        false
191    }
192}
193
/// Returns `true` when the file name is `setup.py` or ends with `_setup.py`
/// or `-setup.py`.
///
/// Paths without a file name, or whose file name is not valid UTF-8, do not match.
fn is_setup_py_like_path(path: &Path) -> bool {
    let Some(file_name) = path.file_name().and_then(|raw| raw.to_str()) else {
        return false;
    };

    file_name == "setup.py"
        || ["_setup.py", "-setup.py"]
            .iter()
            .any(|suffix| file_name.ends_with(*suffix))
}
201
/// Returns `true` for a file named `METADATA` whose parent directory name ends
/// with `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    if path.file_name().and_then(|raw| raw.to_str()) != Some("METADATA") {
        return false;
    }

    let Some(directory_name) = path
        .parent()
        .and_then(Path::file_name)
        .and_then(|raw| raw.to_str())
    else {
        return false;
    };

    directory_name.ends_with(".dist-info")
}
210
/// Values read from the `WHEEL` file that sits next to an installed `METADATA` file.
#[derive(Clone, Debug)]
struct InstalledWheelMetadata {
    /// Every `Tag` header value.
    wheel_tags: Vec<String>,
    /// First `Wheel-Version` header value, if any.
    wheel_version: Option<String>,
    /// First `Generator` header value, if any.
    wheel_generator: Option<String>,
    /// `Root-Is-Purelib` header parsed as a boolean; `None` when absent or not
    /// `true`/`false`.
    root_is_purelib: Option<bool>,
    /// Compressed tag derived from `wheel_tags`, when one can be built.
    compressed_tag: Option<String>,
}
219
220fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
221    let Some(parent) = path.parent() else {
222        return;
223    };
224
225    if !parent
226        .file_name()
227        .and_then(|name| name.to_str())
228        .is_some_and(|name| name.ends_with(".dist-info"))
229    {
230        return;
231    }
232
233    let wheel_path = parent.join("WHEEL");
234    if !wheel_path.exists() {
235        return;
236    }
237
238    let Ok(content) = read_file_to_string(&wheel_path, None) else {
239        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
240        return;
241    };
242
243    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
244        return;
245    };
246
247    apply_installed_wheel_metadata(package_data, &wheel_metadata);
248}
249
250fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
251    use super::rfc822::{get_header_all, get_header_first};
252
253    let metadata = super::rfc822::parse_rfc822_content(content);
254    let wheel_tags = get_header_all(&metadata.headers, "tag");
255    if wheel_tags.is_empty() {
256        return None;
257    }
258
259    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
260    let wheel_generator = get_header_first(&metadata.headers, "generator");
261    let root_is_purelib =
262        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
263            match value.to_ascii_lowercase().as_str() {
264                "true" => Some(true),
265                "false" => Some(false),
266                _ => None,
267            }
268        });
269
270    let compressed_tag = compress_wheel_tags(&wheel_tags);
271
272    Some(InstalledWheelMetadata {
273        wheel_tags,
274        wheel_version,
275        wheel_generator,
276        root_is_purelib,
277        compressed_tag,
278    })
279}
280
/// Builds the compressed tag set (for example `py2.py3-none-any`) for a list
/// of expanded `python-abi-platform` wheel tags.
///
/// A compressed tag set stands for the Cartesian product of its python, ABI
/// and platform components, so the tags are only compressed when the distinct
/// input tags are exactly that product. Components keep first-seen order and
/// repeated tags are counted once.
///
/// Returns `None` when `tags` is empty, when two or more tags are given and
/// any of them does not have three `-`-separated parts, or when the tags do
/// not form a full product. A single tag is returned verbatim.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    /// Appends `value` to `seen` unless it is already there, keeping first-seen order.
    fn push_unique<'a>(seen: &mut Vec<&'a str>, value: &'a str) {
        if !seen.contains(&value) {
            seen.push(value);
        }
    }

    match tags {
        [] => return None,
        // A lone tag is passed through without checking its shape.
        [only] => return Some(only.clone()),
        _ => {}
    }

    let mut python_tags: Vec<&str> = Vec::new();
    let mut abi_tags: Vec<&str> = Vec::new();
    let mut platform_tags: Vec<&str> = Vec::new();
    let mut distinct_tags: HashSet<&str> = HashSet::new();

    for tag in tags {
        // Platform tags never contain '-', so three parts are always enough.
        let mut parts = tag.splitn(3, '-');
        let python = parts.next()?;
        let abi = parts.next()?;
        let platform = parts.next()?;

        push_unique(&mut python_tags, python);
        push_unique(&mut abi_tags, abi);
        push_unique(&mut platform_tags, platform);
        distinct_tags.insert(tag.as_str());
    }

    // Every distinct tag is a member of the product; equal sizes therefore
    // mean the tag list is the whole product and compression is lossless.
    let product_size = python_tags.len() * abi_tags.len() * platform_tags.len();
    if distinct_tags.len() != product_size {
        return None;
    }

    Some(format!(
        "{}-{}-{}",
        python_tags.join("."),
        abi_tags.join("."),
        platform_tags.join(".")
    ))
}
318
319fn apply_installed_wheel_metadata(
320    package_data: &mut PackageData,
321    wheel_metadata: &InstalledWheelMetadata,
322) {
323    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
324    extra_data.insert(
325        "wheel_tags".to_string(),
326        JsonValue::Array(
327            wheel_metadata
328                .wheel_tags
329                .iter()
330                .cloned()
331                .map(JsonValue::String)
332                .collect(),
333        ),
334    );
335
336    if let Some(wheel_version) = &wheel_metadata.wheel_version {
337        extra_data.insert(
338            "wheel_version".to_string(),
339            JsonValue::String(wheel_version.clone()),
340        );
341    }
342
343    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
344        extra_data.insert(
345            "wheel_generator".to_string(),
346            JsonValue::String(wheel_generator.clone()),
347        );
348    }
349
350    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
351        extra_data.insert(
352            "root_is_purelib".to_string(),
353            JsonValue::Bool(root_is_purelib),
354        );
355    }
356
357    if let (Some(name), Some(version), Some(extension)) = (
358        package_data.name.as_deref(),
359        package_data.version.as_deref(),
360        wheel_metadata.compressed_tag.as_deref(),
361    ) {
362        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
363    }
364}
365
/// Returns `true` for a file named `origin.json` that has an ancestor
/// directory called `wheels` (compared ignoring ASCII case).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    if path.file_name().and_then(|raw| raw.to_str()) != Some("origin.json") {
        return false;
    }

    // The first ancestor is the path itself, so it is skipped.
    for ancestor in path.ancestors().skip(1) {
        let named_wheels = matches!(
            ancestor.file_name().and_then(|raw| raw.to_str()),
            Some(directory) if directory.eq_ignore_ascii_case("wheels")
        );
        if named_wheels {
            return true;
        }
    }

    false
}
375
376fn extract_from_pip_origin_json(path: &Path) -> PackageData {
377    let content = match read_file_to_string(path, None) {
378        Ok(content) => content,
379        Err(e) => {
380            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
381            return default_package_data(path);
382        }
383    };
384
385    let root: JsonValue = match serde_json::from_str(&content) {
386        Ok(root) => root,
387        Err(e) => {
388            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
389            return default_package_data(path);
390        }
391    };
392
393    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
394        warn!("No url found in pip cache origin.json at {:?}", path);
395        return default_package_data(path);
396    };
397
398    let sibling_wheel = find_sibling_cached_wheel(path);
399    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
400        sibling_wheel
401            .as_ref()
402            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
403    });
404
405    let Some((name, version)) = name_version else {
406        warn!(
407            "Failed to infer package name/version from pip cache origin.json at {:?}",
408            path
409        );
410        return default_package_data(path);
411    };
412
413    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
414        build_pypi_urls(Some(&name), Some(&version));
415    let purl = sibling_wheel
416        .as_ref()
417        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
418        .or(plain_purl);
419
420    PackageData {
421        package_type: Some(PythonParser::PACKAGE_TYPE),
422        primary_language: Some("Python".to_string()),
423        name: Some(truncate_field(name)),
424        version: Some(version),
425        datasource_id: Some(DatasourceId::PypiPipOriginJson),
426        download_url: Some(truncate_field(download_url.to_string())),
427        sha256: extract_sha256_from_origin_json(&root)
428            .and_then(|h| Sha256Digest::from_hex(&h).ok()),
429        repository_homepage_url,
430        repository_download_url,
431        api_data_url,
432        purl,
433        ..Default::default()
434    }
435}
436
437fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
438    let parent = path.parent()?;
439    let entries = parent.read_dir().ok()?;
440
441    for entry in entries.flatten() {
442        let sibling_path = entry.path();
443        if sibling_path
444            .extension()
445            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
446            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
447        {
448            return Some(wheel_info);
449        }
450    }
451
452    None
453}
454
455fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
456    let file_name = url.rsplit('/').next()?;
457
458    if file_name.ends_with(".whl") {
459        return parse_wheel_filename(Path::new(file_name))
460            .map(|wheel_info| (wheel_info.name, wheel_info.version));
461    }
462
463    let stem = strip_python_archive_extension(file_name)?;
464    let (name, version) = stem.rsplit_once('-')?;
465    if name.is_empty() || version.is_empty() {
466        return None;
467    }
468
469    Some((name.replace('_', "-"), version.to_string()))
470}
471
/// Removes a known Python archive extension from `file_name`.
///
/// Recognised suffixes are `.tar.gz`, `.tar.bz2`, `.tar.xz`, `.tgz`, `.zip`
/// and `.whl`, matched case-sensitively. Returns `None` when none of them
/// terminates the name.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] =
        [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES.iter() {
        if let Some(stem) = file_name.strip_suffix(*suffix) {
            return Some(stem);
        }
    }

    None
}
477
478fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
479    root.pointer("/archive_info/hashes/sha256")
480        .and_then(|value| value.as_str())
481        .map(ToOwned::to_owned)
482        .or_else(|| {
483            root.pointer("/archive_info/hash")
484                .and_then(|value| value.as_str())
485                .and_then(normalize_origin_hash)
486        })
487}
488
/// Extracts the digest part of a hash string.
///
/// Accepts `sha256=<value>` and `sha256:<value>` (the value is returned
/// without further checks) as well as a bare 64-character hexadecimal string.
/// Anything else yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = hash
        .strip_prefix("sha256=")
        .or_else(|| hash.strip_prefix("sha256:"));

    match prefixed {
        Some(digest) => Some(digest.to_string()),
        None => {
            let is_bare_digest = hash.len() == 64 && hash.bytes().all(|b| b.is_ascii_hexdigit());
            if is_bare_digest {
                Some(hash.to_string())
            } else {
                None
            }
        }
    }
}
501
502fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
503    let content = match read_file_to_string(path, None) {
504        Ok(content) => content,
505        Err(e) => {
506            warn!("Failed to read metadata at {:?}: {}", path, e);
507            return default_package_data(path);
508        }
509    };
510
511    let metadata = super::rfc822::parse_rfc822_content(&content);
512    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
513    merge_sibling_metadata_dependencies(path, &mut package_data);
514    merge_sibling_metadata_file_references(path, &mut package_data);
515    if datasource_id == DatasourceId::PypiWheelMetadata {
516        merge_sibling_wheel_metadata(path, &mut package_data);
517    }
518    package_data
519}
520
521fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
522    let mut extra_dependencies = Vec::new();
523
524    if let Some(parent) = path.parent() {
525        let direct_requires = parent.join("requires.txt");
526        if direct_requires.exists()
527            && let Ok(content) = read_file_to_string(&direct_requires, None)
528        {
529            extra_dependencies.extend(parse_requires_txt(&content));
530        }
531
532        let sibling_egg_info_requires = parent
533            .read_dir()
534            .ok()
535            .into_iter()
536            .flatten()
537            .flatten()
538            .find_map(|entry| {
539                let child_path = entry.path();
540                if child_path.is_dir()
541                    && child_path
542                        .file_name()
543                        .and_then(|name| name.to_str())
544                        .is_some_and(|name| name.ends_with(".egg-info"))
545                {
546                    let requires = child_path.join("requires.txt");
547                    requires.exists().then_some(requires)
548                } else {
549                    None
550                }
551            });
552
553        if let Some(requires_path) = sibling_egg_info_requires
554            && let Ok(content) = read_file_to_string(&requires_path, None)
555        {
556            extra_dependencies.extend(parse_requires_txt(&content));
557        }
558    }
559
560    for dependency in extra_dependencies {
561        if !package_data.dependencies.iter().any(|existing| {
562            existing.purl == dependency.purl
563                && existing.scope == dependency.scope
564                && existing.extracted_requirement == dependency.extracted_requirement
565                && existing.extra_data == dependency.extra_data
566        }) {
567            package_data.dependencies.push(dependency);
568        }
569    }
570}
571
572fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
573    let mut extra_refs = Vec::new();
574
575    if let Some(parent) = path.parent() {
576        let record_path = parent.join("RECORD");
577        if record_path.exists()
578            && let Ok(content) = read_file_to_string(&record_path, None)
579        {
580            extra_refs.extend(parse_record_csv(&content));
581        }
582
583        let installed_files_path = parent.join("installed-files.txt");
584        if installed_files_path.exists()
585            && let Ok(content) = read_file_to_string(&installed_files_path, None)
586        {
587            extra_refs.extend(parse_installed_files_txt(&content));
588        }
589
590        let sources_path = parent.join("SOURCES.txt");
591        if sources_path.exists()
592            && let Ok(content) = read_file_to_string(&sources_path, None)
593        {
594            extra_refs.extend(parse_sources_txt(&content));
595        }
596    }
597
598    for file_ref in extra_refs {
599        if !package_data
600            .file_references
601            .iter()
602            .any(|existing| existing.path == file_ref.path)
603        {
604            package_data.file_references.push(file_ref);
605        }
606    }
607}
608
609fn collect_validated_zip_entries<R: Read + std::io::Seek>(
610    archive: &mut ZipArchive<R>,
611    path: &Path,
612    archive_type: &str,
613) -> Result<Vec<ValidatedZipEntry>, String> {
614    let mut total_extracted = 0u64;
615    let mut entries = Vec::new();
616    let mut entry_count = 0usize;
617
618    for i in 0..archive.len() {
619        entry_count += 1;
620        if entry_count > MAX_ITERATION_COUNT {
621            warn!(
622                "Exceeded max entry count in {} {:?}; stopping at {} entries",
623                archive_type, path, MAX_ITERATION_COUNT
624            );
625            break;
626        }
627        if let Ok(file) = archive.by_index_raw(i) {
628            let compressed_size = file.compressed_size();
629            let uncompressed_size = file.size();
630            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
631                warn!(
632                    "Skipping unsafe path in {} {:?}: {}",
633                    archive_type,
634                    path,
635                    file.name()
636                );
637                continue;
638            };
639
640            if compressed_size > 0 {
641                let ratio = uncompressed_size as f64 / compressed_size as f64;
642                if ratio > MAX_COMPRESSION_RATIO {
643                    warn!(
644                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
645                        archive_type, path, ratio
646                    );
647                    continue;
648                }
649            }
650
651            if uncompressed_size > MAX_FILE_SIZE {
652                warn!(
653                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
654                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
655                );
656                continue;
657            }
658
659            total_extracted += uncompressed_size;
660            if total_extracted > MAX_ARCHIVE_SIZE {
661                let msg = format!(
662                    "Total extracted size exceeds limit for {} {:?}",
663                    archive_type, path
664                );
665                warn!("{}", msg);
666                return Err(msg);
667            }
668
669            entries.push(ValidatedZipEntry {
670                index: i,
671                name: entry_name,
672            });
673        }
674    }
675
676    Ok(entries)
677}
678
679fn is_python_sdist_archive_path(path: &Path) -> bool {
680    detect_python_sdist_archive_format(path).is_some()
681}
682
683fn is_valid_wheel_archive_path(path: &Path) -> bool {
684    if !path.is_file() {
685        return true;
686    }
687
688    let file = match File::open(path) {
689        Ok(file) => file,
690        Err(_) => return false,
691    };
692    let mut archive = match ZipArchive::new(file) {
693        Ok(archive) => archive,
694        Err(_) => return false,
695    };
696
697    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
698        Ok(entries) => entries,
699        Err(_) => return false,
700    };
701
702    find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA").is_some()
703}
704
705fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
706    let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
707
708    if !is_likely_python_sdist_filename(&file_name) {
709        return None;
710    }
711
712    if file_name.ends_with(".tar.gz") {
713        tar_gz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarGz)
714    } else if file_name.ends_with(".tgz") {
715        tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
716    } else if file_name.ends_with(".tar.bz2") {
717        tar_bz2_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarBz2)
718    } else if file_name.ends_with(".tar.xz") {
719        tar_xz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::TarXz)
720    } else if file_name.ends_with(".zip") {
721        zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
722    } else {
723        None
724    }
725}
726
727fn tar_gz_sdist_contains_pkg_info(path: &Path) -> bool {
728    let Some(compressed_size) = compressed_archive_size(path) else {
729        return false;
730    };
731    let file = match File::open(path) {
732        Ok(file) => file,
733        Err(_) => return false,
734    };
735    let decoder = GzDecoder::new(file);
736    tar_sdist_contains_pkg_info(path, decoder, "tar.gz", compressed_size)
737}
738
739fn tar_bz2_sdist_contains_pkg_info(path: &Path) -> bool {
740    let Some(compressed_size) = compressed_archive_size(path) else {
741        return false;
742    };
743    let file = match File::open(path) {
744        Ok(file) => file,
745        Err(_) => return false,
746    };
747    let decoder = BzDecoder::new(file);
748    tar_sdist_contains_pkg_info(path, decoder, "tar.bz2", compressed_size)
749}
750
751fn tar_xz_sdist_contains_pkg_info(path: &Path) -> bool {
752    let Some(compressed_size) = compressed_archive_size(path) else {
753        return false;
754    };
755    let file = match File::open(path) {
756        Ok(file) => file,
757        Err(_) => return false,
758    };
759    let decoder = XzDecoder::new(file);
760    tar_sdist_contains_pkg_info(path, decoder, "tar.xz", compressed_size)
761}
762
/// Returns the size in bytes reported by the file system for `path`, or
/// `None` when its metadata cannot be read.
fn compressed_archive_size(path: &Path) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(metadata) => Some(metadata.len()),
        Err(_) => None,
    }
}
766
767fn tar_sdist_contains_pkg_info<R: Read>(
768    path: &Path,
769    reader: R,
770    archive_type: &str,
771    compressed_size: u64,
772) -> bool {
773    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
774    else {
775        return false;
776    };
777
778    select_sdist_pkginfo_entry(path, &entries).is_some()
779}
780
781fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
782    if !path.is_file() {
783        return true;
784    }
785
786    let Some(compressed_size) = compressed_archive_size(path) else {
787        return false;
788    };
789    let file = match File::open(path) {
790        Ok(file) => file,
791        Err(_) => return false,
792    };
793    let decoder = GzDecoder::new(file);
794    tar_sdist_contains_pkg_info(path, decoder, "tgz", compressed_size)
795}
796
797fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
798    if !path.is_file() {
799        return true;
800    }
801
802    let file = match File::open(path) {
803        Ok(file) => file,
804        Err(_) => return false,
805    };
806    let mut archive = match ZipArchive::new(file) {
807        Ok(archive) => archive,
808        Err(_) => return false,
809    };
810
811    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
812        Ok(entries) => entries,
813        Err(_) => return false,
814    };
815    let metadata_entries: Vec<_> = validated_entries
816        .iter()
817        .filter(|entry| entry.name.ends_with("/PKG-INFO"))
818        .filter_map(|entry| {
819            read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
820                .ok()
821                .map(|content| (entry.name.clone(), content))
822        })
823        .collect();
824
825    has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
826}
827
828fn is_likely_python_sdist_filename(file_name: &str) -> bool {
829    let Some(stem) = strip_python_archive_extension(file_name) else {
830        return false;
831    };
832
833    let Some((name, version)) = stem.rsplit_once('-') else {
834        return false;
835    };
836
837    !name.is_empty()
838        && !version.is_empty()
839        && version.chars().any(|ch| ch.is_ascii_digit())
840        && name
841            .chars()
842            .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
843}
844
845fn extract_from_sdist_archive(path: &Path) -> PackageData {
846    let metadata = match std::fs::metadata(path) {
847        Ok(m) => m,
848        Err(e) => {
849            warn!(
850                "Failed to read metadata for sdist archive {:?}: {}",
851                path, e
852            );
853            return default_package_data(path);
854        }
855    };
856
857    if metadata.len() > MAX_ARCHIVE_SIZE {
858        warn!(
859            "sdist archive too large: {} bytes (limit: {} bytes)",
860            metadata.len(),
861            MAX_ARCHIVE_SIZE
862        );
863        return default_package_data(path);
864    }
865
866    let Some(format) = detect_python_sdist_archive_format(path) else {
867        return default_package_data(path);
868    };
869
870    let mut package_data = match format {
871        PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
872            let file = match File::open(path) {
873                Ok(file) => file,
874                Err(e) => {
875                    warn!("Failed to open sdist archive {:?}: {}", path, e);
876                    return default_package_data(path);
877                }
878            };
879            let decoder = GzDecoder::new(file);
880            extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
881        }
882        PythonSdistArchiveFormat::TarBz2 => {
883            let file = match File::open(path) {
884                Ok(file) => file,
885                Err(e) => {
886                    warn!("Failed to open sdist archive {:?}: {}", path, e);
887                    return default_package_data(path);
888                }
889            };
890            let decoder = BzDecoder::new(file);
891            extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
892        }
893        PythonSdistArchiveFormat::TarXz => {
894            let file = match File::open(path) {
895                Ok(file) => file,
896                Err(e) => {
897                    warn!("Failed to open sdist archive {:?}: {}", path, e);
898                    return default_package_data(path);
899                }
900            };
901            let decoder = XzDecoder::new(file);
902            extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
903        }
904        PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
905    };
906
907    if package_data.package_type.is_some() {
908        let (size, sha256) = calculate_file_checksums(path);
909        package_data.size = size;
910        package_data.sha256 = sha256;
911    }
912
913    package_data
914}
915
916fn extract_from_tar_sdist_archive<R: Read>(
917    path: &Path,
918    reader: R,
919    archive_type: &str,
920    compressed_size: u64,
921) -> PackageData {
922    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
923    else {
924        return default_package_data(path);
925    };
926
927    build_sdist_package_data(path, entries)
928}
929
930fn collect_tar_sdist_entries<R: Read>(
931    path: &Path,
932    reader: R,
933    archive_type: &str,
934    compressed_size: u64,
935) -> Option<Vec<(String, String)>> {
936    let mut archive = Archive::new(reader);
937    let archive_entries = match archive.entries() {
938        Ok(entries) => entries,
939        Err(e) => {
940            warn!(
941                "Failed to read {} sdist archive {:?}: {}",
942                archive_type, path, e
943            );
944            return None;
945        }
946    };
947
948    let mut total_extracted = 0u64;
949    let mut entries = Vec::new();
950    let mut entry_count = 0usize;
951
952    for entry_result in archive_entries {
953        entry_count += 1;
954        if entry_count > MAX_ITERATION_COUNT {
955            warn!(
956                "Exceeded max entry count in {} sdist {:?}; stopping at {} entries",
957                archive_type, path, MAX_ITERATION_COUNT
958            );
959            break;
960        }
961
962        let mut entry = match entry_result {
963            Ok(entry) => entry,
964            Err(e) => {
965                warn!(
966                    "Failed to read {} sdist entry from {:?}: {}",
967                    archive_type, path, e
968                );
969                continue;
970            }
971        };
972
973        let entry_size = entry.size();
974        if entry_size > MAX_FILE_SIZE {
975            warn!(
976                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
977                archive_type, path, entry_size, MAX_FILE_SIZE
978            );
979            continue;
980        }
981
982        total_extracted += entry_size;
983        if total_extracted > MAX_ARCHIVE_SIZE {
984            warn!(
985                "Total extracted size exceeds limit for {} sdist {:?}",
986                archive_type, path
987            );
988            return None;
989        }
990
991        if compressed_size > 0 {
992            let ratio = total_extracted as f64 / compressed_size as f64;
993            if ratio > MAX_COMPRESSION_RATIO {
994                warn!(
995                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
996                    archive_type, path, ratio
997                );
998                return None;
999            }
1000        }
1001
1002        let entry_path = match entry.path() {
1003            Ok(path) => path.to_string_lossy().replace('\\', "/"),
1004            Err(e) => {
1005                warn!(
1006                    "Failed to get {} sdist entry path from {:?}: {}",
1007                    archive_type, path, e
1008                );
1009                continue;
1010            }
1011        };
1012
1013        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
1014            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
1015            continue;
1016        };
1017
1018        if !is_relevant_sdist_text_entry(&entry_path) {
1019            continue;
1020        }
1021
1022        if let Ok(content) = read_limited_utf8(
1023            &mut entry,
1024            MAX_FILE_SIZE,
1025            &format!("{} entry {}", archive_type, entry_path),
1026        ) {
1027            entries.push((entry_path, content));
1028        }
1029    }
1030
1031    Some(entries)
1032}
1033
1034fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
1035    let file = match File::open(path) {
1036        Ok(file) => file,
1037        Err(e) => {
1038            warn!("Failed to open zip sdist archive {:?}: {}", path, e);
1039            return default_package_data(path);
1040        }
1041    };
1042
1043    let mut archive = match ZipArchive::new(file) {
1044        Ok(archive) => archive,
1045        Err(e) => {
1046            warn!("Failed to read zip sdist archive {:?}: {}", path, e);
1047            return default_package_data(path);
1048        }
1049    };
1050
1051    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
1052        Ok(entries) => entries,
1053        Err(_) => return default_package_data(path),
1054    };
1055
1056    let mut entries = Vec::new();
1057    for entry in validated_entries.iter() {
1058        if !is_relevant_sdist_text_entry(&entry.name) {
1059            continue;
1060        }
1061
1062        if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
1063            entries.push((entry.name.clone(), content));
1064        }
1065    }
1066
1067    build_sdist_package_data(path, entries)
1068}
1069
/// Reports whether an archive entry is one of the sdist metadata text files
/// this parser reads: `PKG-INFO`, `requires.txt`, or `SOURCES.txt`.
///
/// Each suffix starts with `/`, so the file must sit inside at least one
/// directory; a bare top-level `PKG-INFO` does not match.
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
1075
1076fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
1077    let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
1078        warn!("No PKG-INFO file found in sdist archive {:?}", path);
1079        return default_package_data(path);
1080    };
1081
1082    let mut package_data =
1083        python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
1084    merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
1085    merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
1086    apply_sdist_name_version_fallback(path, &mut package_data);
1087    package_data.datasource_id = Some(DatasourceId::PypiSdist);
1088    package_data
1089}
1090
1091fn select_sdist_pkginfo_entry(
1092    archive_path: &Path,
1093    entries: &[(String, String)],
1094) -> Option<(String, String)> {
1095    let expected_name = sdist_archive_expected_name(archive_path);
1096
1097    entries
1098        .iter()
1099        .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
1100        .min_by_key(|(entry_path, content)| {
1101            let components: Vec<_> = entry_path
1102                .split('/')
1103                .filter(|part| !part.is_empty())
1104                .collect();
1105            let candidate_name = sdist_pkginfo_candidate_name(content);
1106            let name_rank = if candidate_name == expected_name {
1107                0
1108            } else {
1109                1
1110            };
1111            let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1112
1113            (name_rank, kind_rank, components.len(), entry_path.clone())
1114        })
1115        .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1116}
1117
1118fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1119    let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1120        return false;
1121    };
1122
1123    entries.iter().any(|(entry_path, content)| {
1124        sdist_pkginfo_kind_rank(entry_path) < 3
1125            && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1126    })
1127}
1128
1129fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1130    archive_path
1131        .file_name()
1132        .and_then(|name| name.to_str())
1133        .and_then(strip_python_archive_extension)
1134        .and_then(|stem| {
1135            stem.rsplit_once('-')
1136                .map(|(name, _)| normalize_python_package_name(name))
1137        })
1138}
1139
1140fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1141    let metadata = super::rfc822::parse_rfc822_content(content);
1142    super::rfc822::get_header_first(&metadata.headers, "name")
1143        .map(|name| normalize_python_package_name(&name))
1144}
1145
/// Ranks a PKG-INFO path by how canonical its location is (lower is better).
///
/// * `0`: `<root>/<something>.egg-info/PKG-INFO`
/// * `1`: `<root>/PKG-INFO`
/// * `2`: any other path ending in `.egg-info/PKG-INFO`
/// * `3`: everything else
///
/// Empty path components (for example from doubled slashes) are ignored when
/// counting depth.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let parts: Vec<&str> = entry_path
        .split('/')
        .filter(|part| !part.is_empty())
        .collect();

    match parts.as_slice() {
        [_, egg_info_dir, "PKG-INFO"] if egg_info_dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1163
1164fn merge_sdist_archive_dependencies(
1165    entries: &[(String, String)],
1166    metadata_path: &str,
1167    package_data: &mut PackageData,
1168) {
1169    let metadata_dir = metadata_path
1170        .rsplit_once('/')
1171        .map(|(dir, _)| dir)
1172        .unwrap_or("");
1173    let archive_root = metadata_path.split('/').next().unwrap_or("");
1174    let matched_egg_info_dir =
1175        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1176    let mut extra_dependencies = Vec::new();
1177
1178    for (entry_path, content) in entries {
1179        let is_direct_requires =
1180            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1181        let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1182            entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1183        });
1184
1185        if is_direct_requires || is_egg_info_requires {
1186            extra_dependencies.extend(parse_requires_txt(content));
1187        }
1188    }
1189
1190    for dependency in extra_dependencies {
1191        if !package_data.dependencies.iter().any(|existing| {
1192            existing.purl == dependency.purl
1193                && existing.scope == dependency.scope
1194                && existing.extracted_requirement == dependency.extracted_requirement
1195                && existing.extra_data == dependency.extra_data
1196        }) {
1197            package_data.dependencies.push(dependency);
1198        }
1199    }
1200}
1201
1202fn merge_sdist_archive_file_references(
1203    entries: &[(String, String)],
1204    metadata_path: &str,
1205    package_data: &mut PackageData,
1206) {
1207    let metadata_dir = metadata_path
1208        .rsplit_once('/')
1209        .map(|(dir, _)| dir)
1210        .unwrap_or("");
1211    let archive_root = metadata_path.split('/').next().unwrap_or("");
1212    let matched_egg_info_dir =
1213        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1214    let mut extra_refs = Vec::new();
1215
1216    for (entry_path, content) in entries {
1217        let is_direct_sources =
1218            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1219        let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1220            entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1221        });
1222
1223        if is_direct_sources || is_egg_info_sources {
1224            extra_refs.extend(parse_sources_txt(content));
1225        }
1226    }
1227
1228    for file_ref in extra_refs {
1229        if !package_data
1230            .file_references
1231            .iter()
1232            .any(|existing| existing.path == file_ref.path)
1233        {
1234            package_data.file_references.push(file_ref);
1235        }
1236    }
1237}
1238
1239fn select_matching_sdist_egg_info_dir(
1240    entries: &[(String, String)],
1241    archive_root: &str,
1242    package_name: Option<&str>,
1243) -> Option<String> {
1244    let normalized_package_name = package_name.map(normalize_python_package_name);
1245
1246    entries
1247        .iter()
1248        .filter_map(|(entry_path, _)| {
1249            let components: Vec<_> = entry_path
1250                .split('/')
1251                .filter(|part| !part.is_empty())
1252                .collect();
1253            if components.len() == 3
1254                && components[0] == archive_root
1255                && components[1].ends_with(".egg-info")
1256            {
1257                Some(components[1].to_string())
1258            } else {
1259                None
1260            }
1261        })
1262        .min_by_key(|egg_info_dir| {
1263            let normalized_dir_name =
1264                normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1265            let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1266                0
1267            } else {
1268                1
1269            };
1270
1271            (name_rank, egg_info_dir.clone())
1272        })
1273}
1274
/// Normalizes a Python distribution name for comparison, following PEP 503:
/// ASCII letters are lowercased and every run of `-`, `_`, or `.` collapses to
/// a single `-`.
///
/// Treating `.` and repeated separators like `_` lets a metadata name such as
/// `zope.interface` compare equal to the `zope_interface` spelling used in
/// archive and egg-info names.
fn normalize_python_package_name(name: &str) -> String {
    let mut normalized = String::with_capacity(name.len());
    let mut in_separator_run = false;

    for ch in name.chars() {
        if matches!(ch, '-' | '_' | '.') {
            // Emit one dash for the whole run of separators.
            if !in_separator_run {
                normalized.push('-');
            }
            in_separator_run = true;
        } else {
            normalized.push(ch.to_ascii_lowercase());
            in_separator_run = false;
        }
    }

    normalized
}
1278
1279fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1280    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1281        return;
1282    };
1283
1284    let Some(stem) = strip_python_archive_extension(file_name) else {
1285        return;
1286    };
1287
1288    let Some((name, version)) = stem.rsplit_once('-') else {
1289        return;
1290    };
1291
1292    if package_data.name.is_none() {
1293        package_data.name = Some(name.replace('_', "-"));
1294    }
1295    if package_data.version.is_none() {
1296        package_data.version = Some(version.to_string());
1297    }
1298
1299    if package_data.purl.is_none()
1300        || package_data.repository_homepage_url.is_none()
1301        || package_data.repository_download_url.is_none()
1302        || package_data.api_data_url.is_none()
1303    {
1304        let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1305            build_pypi_urls(
1306                package_data.name.as_deref(),
1307                package_data.version.as_deref(),
1308            );
1309
1310        if package_data.repository_homepage_url.is_none() {
1311            package_data.repository_homepage_url = repository_homepage_url;
1312        }
1313        if package_data.repository_download_url.is_none() {
1314            package_data.repository_download_url = repository_download_url;
1315        }
1316        if package_data.api_data_url.is_none() {
1317            package_data.api_data_url = api_data_url;
1318        }
1319        if package_data.purl.is_none() {
1320            package_data.purl = purl;
1321        }
1322    }
1323}
1324
1325fn extract_from_wheel_archive(path: &Path) -> PackageData {
1326    let metadata = match std::fs::metadata(path) {
1327        Ok(m) => m,
1328        Err(e) => {
1329            warn!(
1330                "Failed to read metadata for wheel archive {:?}: {}",
1331                path, e
1332            );
1333            return default_package_data(path);
1334        }
1335    };
1336
1337    if metadata.len() > MAX_ARCHIVE_SIZE {
1338        warn!(
1339            "Wheel archive too large: {} bytes (limit: {} bytes)",
1340            metadata.len(),
1341            MAX_ARCHIVE_SIZE
1342        );
1343        return default_package_data(path);
1344    }
1345
1346    let file = match File::open(path) {
1347        Ok(f) => f,
1348        Err(e) => {
1349            warn!("Failed to open wheel archive {:?}: {}", path, e);
1350            return default_package_data(path);
1351        }
1352    };
1353
1354    let mut archive = match ZipArchive::new(file) {
1355        Ok(a) => a,
1356        Err(e) => {
1357            warn!("Failed to read wheel archive {:?}: {}", path, e);
1358            return default_package_data(path);
1359        }
1360    };
1361
1362    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1363        Ok(entries) => entries,
1364        Err(_) => return default_package_data(path),
1365    };
1366
1367    let metadata_entry =
1368        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1369            Some(entry) => entry,
1370            None => {
1371                warn!("No METADATA file found in wheel archive {:?}", path);
1372                return default_package_data(path);
1373            }
1374        };
1375
1376    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1377        Ok(c) => c,
1378        Err(e) => {
1379            warn!("Failed to read METADATA from {:?}: {}", path, e);
1380            return default_package_data(path);
1381        }
1382    };
1383
1384    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1385
1386    let (size, sha256) = calculate_file_checksums(path);
1387    package_data.size = size;
1388    package_data.sha256 = sha256;
1389
1390    if let Some(record_entry) =
1391        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1392        && let Ok(record_content) =
1393            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1394    {
1395        package_data.file_references = parse_record_csv(&record_content);
1396    }
1397
1398    if let Some(wheel_info) = parse_wheel_filename(path) {
1399        if package_data.name.is_none() {
1400            package_data.name = Some(wheel_info.name.clone());
1401        }
1402        if package_data.version.is_none() {
1403            package_data.version = Some(wheel_info.version.clone());
1404        }
1405
1406        package_data.qualifiers = Some(std::collections::HashMap::from([(
1407            "extension".to_string(),
1408            format!(
1409                "{}-{}-{}",
1410                wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1411            ),
1412        )]));
1413
1414        package_data.purl = build_wheel_purl(
1415            package_data.name.as_deref(),
1416            package_data.version.as_deref(),
1417            &wheel_info,
1418        );
1419
1420        let mut extra_data = package_data.extra_data.unwrap_or_default();
1421        extra_data.insert(
1422            "python_requires".to_string(),
1423            serde_json::Value::String(wheel_info.python_tag.clone()),
1424        );
1425        extra_data.insert(
1426            "abi_tag".to_string(),
1427            serde_json::Value::String(wheel_info.abi_tag.clone()),
1428        );
1429        extra_data.insert(
1430            "platform_tag".to_string(),
1431            serde_json::Value::String(wheel_info.platform_tag.clone()),
1432        );
1433        package_data.extra_data = Some(extra_data);
1434    }
1435
1436    package_data
1437}
1438
1439fn extract_from_egg_archive(path: &Path) -> PackageData {
1440    let metadata = match std::fs::metadata(path) {
1441        Ok(m) => m,
1442        Err(e) => {
1443            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1444            return default_package_data(path);
1445        }
1446    };
1447
1448    if metadata.len() > MAX_ARCHIVE_SIZE {
1449        warn!(
1450            "Egg archive too large: {} bytes (limit: {} bytes)",
1451            metadata.len(),
1452            MAX_ARCHIVE_SIZE
1453        );
1454        return default_package_data(path);
1455    }
1456
1457    let file = match File::open(path) {
1458        Ok(f) => f,
1459        Err(e) => {
1460            warn!("Failed to open egg archive {:?}: {}", path, e);
1461            return default_package_data(path);
1462        }
1463    };
1464
1465    let mut archive = match ZipArchive::new(file) {
1466        Ok(a) => a,
1467        Err(e) => {
1468            warn!("Failed to read egg archive {:?}: {}", path, e);
1469            return default_package_data(path);
1470        }
1471    };
1472
1473    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1474        Ok(entries) => entries,
1475        Err(_) => return default_package_data(path),
1476    };
1477
1478    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1479        &validated_entries,
1480        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1481    ) {
1482        Some(entry) => entry,
1483        None => {
1484            warn!("No PKG-INFO file found in egg archive {:?}", path);
1485            return default_package_data(path);
1486        }
1487    };
1488
1489    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1490        Ok(c) => c,
1491        Err(e) => {
1492            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1493            return default_package_data(path);
1494        }
1495    };
1496
1497    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1498
1499    let (size, sha256) = calculate_file_checksums(path);
1500    package_data.size = size;
1501    package_data.sha256 = sha256;
1502
1503    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1504        &validated_entries,
1505        &[
1506            "EGG-INFO/installed-files.txt",
1507            ".egg-info/installed-files.txt",
1508        ],
1509    ) && let Ok(installed_files_content) =
1510        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1511    {
1512        package_data.file_references = parse_installed_files_txt(&installed_files_content);
1513    }
1514
1515    if let Some(egg_info) = parse_egg_filename(path) {
1516        if package_data.name.is_none() {
1517            package_data.name = Some(egg_info.name.clone());
1518        }
1519        if package_data.version.is_none() {
1520            package_data.version = Some(egg_info.version.clone());
1521        }
1522
1523        if let Some(python_version) = &egg_info.python_version {
1524            let mut extra_data = package_data.extra_data.unwrap_or_default();
1525            extra_data.insert(
1526                "python_version".to_string(),
1527                serde_json::Value::String(python_version.clone()),
1528            );
1529            package_data.extra_data = Some(extra_data);
1530        }
1531    }
1532
1533    package_data.purl = build_egg_purl(
1534        package_data.name.as_deref(),
1535        package_data.version.as_deref(),
1536    );
1537
1538    package_data
1539}
1540
1541fn find_validated_zip_entry_by_suffix<'a>(
1542    entries: &'a [ValidatedZipEntry],
1543    suffix: &str,
1544) -> Option<&'a ValidatedZipEntry> {
1545    entries.iter().find(|entry| entry.name.ends_with(suffix))
1546}
1547
1548fn find_validated_zip_entry_by_any_suffix<'a>(
1549    entries: &'a [ValidatedZipEntry],
1550    suffixes: &[&str],
1551) -> Option<&'a ValidatedZipEntry> {
1552    entries
1553        .iter()
1554        .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1555}
1556
1557fn read_validated_zip_entry<R: Read + std::io::Seek>(
1558    archive: &mut ZipArchive<R>,
1559    entry: &ValidatedZipEntry,
1560    path: &Path,
1561    archive_type: &str,
1562) -> Result<String, String> {
1563    let mut file = archive
1564        .by_index(entry.index)
1565        .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1566
1567    let compressed_size = file.compressed_size();
1568    let uncompressed_size = file.size();
1569
1570    if compressed_size > 0 {
1571        let ratio = uncompressed_size as f64 / compressed_size as f64;
1572        if ratio > MAX_COMPRESSION_RATIO {
1573            return Err(format!(
1574                "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1575                archive_type, path, ratio
1576            ));
1577        }
1578    }
1579
1580    if uncompressed_size > MAX_FILE_SIZE {
1581        return Err(format!(
1582            "Rejected oversized entry in {} {:?}: {} bytes",
1583            archive_type, path, uncompressed_size
1584        ));
1585    }
1586
1587    read_limited_utf8(
1588        &mut file,
1589        MAX_FILE_SIZE,
1590        &format!("{} entry {}", archive_type, entry.name),
1591    )
1592}
1593
1594fn read_limited_utf8<R: Read>(
1595    reader: &mut R,
1596    max_bytes: u64,
1597    context: &str,
1598) -> Result<String, String> {
1599    let mut limited = reader.take(max_bytes + 1);
1600    let mut bytes = Vec::new();
1601    limited
1602        .read_to_end(&mut bytes)
1603        .map_err(|e| format!("Failed to read {}: {}", context, e))?;
1604
1605    if bytes.len() as u64 > max_bytes {
1606        return Err(format!(
1607            "{} exceeded {} byte limit while reading",
1608            context, max_bytes
1609        ));
1610    }
1611
1612    match String::from_utf8(bytes) {
1613        Ok(s) => Ok(s),
1614        Err(err) => {
1615            let bytes = err.into_bytes();
1616            warn!("Invalid UTF-8 in archive entry; using lossy conversion");
1617            Ok(String::from_utf8_lossy(&bytes).into_owned())
1618        }
1619    }
1620}
1621
/// Converts an archive entry path into a safe, `/`-separated relative path.
///
/// Backslashes are treated as separators and `.` components are dropped.
/// Returns `None` when the path starts with a drive prefix such as `C:/`,
/// contains a root, parent (`..`), or prefix component, or has no normal
/// components left.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // `Path` only recognises drive prefixes on Windows, so check for `X:/` by hand.
    let has_drive_prefix = matches!(
        unified.as_bytes(),
        [drive, b':', b'/', ..] if drive.is_ascii_alphabetic()
    );
    if has_drive_prefix {
        return None;
    }

    let mut segments: Vec<String> = Vec::new();
    for part in Path::new(&unified).components() {
        match part {
            Component::CurDir => continue,
            Component::Normal(segment) => segments.push(segment.to_string_lossy().into_owned()),
            Component::Prefix(_) | Component::RootDir | Component::ParentDir => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1643
1644/// Parses RECORD CSV format from wheel archives (PEP 427).
1645/// Format: path,hash,size (3 columns, no header)
1646/// Hash format: sha256=urlsafe_base64_hash or empty
1647/// Size: bytes as u64 or empty
1648pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1649    let mut reader = ReaderBuilder::new()
1650        .has_headers(false)
1651        .from_reader(content.as_bytes());
1652
1653    let mut file_references = Vec::new();
1654    let mut record_count = 0usize;
1655
1656    for result in reader.records() {
1657        record_count += 1;
1658        if record_count > MAX_ITERATION_COUNT {
1659            warn!(
1660                "Exceeded max record count in RECORD CSV; stopping at {} records",
1661                MAX_ITERATION_COUNT
1662            );
1663            break;
1664        }
1665        match result {
1666            Ok(record) => {
1667                if record.len() < 3 {
1668                    continue;
1669                }
1670
1671                let path = record.get(0).unwrap_or("").trim().to_string();
1672                if path.is_empty() {
1673                    continue;
1674                }
1675
1676                let hash_field = record.get(1).unwrap_or("").trim();
1677                let size_field = record.get(2).unwrap_or("").trim();
1678
1679                // Parse hash: format is "algorithm=value"
1680                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1681                    let parts: Vec<&str> = hash_field.split('=').collect();
1682                    if parts.len() == 2 && parts[0] == "sha256" {
1683                        match URL_SAFE_NO_PAD.decode(parts[1]) {
1684                            Ok(decoded) => {
1685                                let hex = decoded
1686                                    .iter()
1687                                    .map(|b| format!("{:02x}", b))
1688                                    .collect::<String>();
1689                                Sha256Digest::from_hex(&hex).ok()
1690                            }
1691                            Err(_) => None,
1692                        }
1693                    } else {
1694                        None
1695                    }
1696                } else {
1697                    None
1698                };
1699
1700                // Parse size
1701                let size = if !size_field.is_empty() && size_field != "-" {
1702                    size_field.parse::<u64>().ok()
1703                } else {
1704                    None
1705                };
1706
1707                file_references.push(FileReference {
1708                    path,
1709                    size,
1710                    sha1: None,
1711                    md5: None,
1712                    sha256,
1713                    sha512: None,
1714                    extra_data: None,
1715                });
1716            }
1717            Err(e) => {
1718                warn!("Failed to parse RECORD CSV row: {}", e);
1719                continue;
1720            }
1721        }
1722    }
1723
1724    file_references
1725}
1726
1727/// Parses installed-files.txt format from egg archives (PEP 376).
1728/// Format: one file path per line, no headers, no hash, no size
1729pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1730    content
1731        .lines()
1732        .take(MAX_ITERATION_COUNT)
1733        .map(|line| line.trim())
1734        .filter(|line| !line.is_empty())
1735        .map(|path| FileReference {
1736            path: path.to_string(),
1737            size: None,
1738            sha1: None,
1739            md5: None,
1740            sha256: None,
1741            sha512: None,
1742            extra_data: None,
1743        })
1744        .collect()
1745}
1746
1747pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1748    content
1749        .lines()
1750        .take(MAX_ITERATION_COUNT)
1751        .map(str::trim)
1752        .filter(|line| !line.is_empty())
1753        .map(|path| FileReference {
1754            path: path.to_string(),
1755            size: None,
1756            sha1: None,
1757            md5: None,
1758            sha256: None,
1759            sha512: None,
1760            extra_data: None,
1761        })
1762        .collect()
1763}
1764
/// Components of a wheel file name.
struct WheelInfo {
    /// Distribution name with underscores replaced by dashes.
    name: String,
    /// Version component, as written in the file name.
    version: String,
    /// Python tag, e.g. `py3`.
    python_tag: String,
    /// ABI tag, e.g. `none`.
    abi_tag: String,
    /// Platform tag; every remaining `-`-separated part, re-joined with `-`.
    platform_tag: String,
}

/// Parses a wheel file name of the PEP 427 form
/// `{name}-{version}(-{build})?-{python}-{abi}-{platform}.whl`.
///
/// The optional build tag is recognised by its leading ASCII digit and is
/// skipped, so it is not mistaken for the Python tag. File names without a
/// build tag are parsed positionally, with any surplus parts folded into the
/// platform tag.
///
/// Returns `None` when the path has no file stem or too few components.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    // Require six parts so a five-part name is never read as having a build tag.
    let has_build_tag =
        parts.len() >= 6 && parts[2].starts_with(|c: char| c.is_ascii_digit());
    let tag_start = if has_build_tag { 3 } else { 2 };

    if parts.len() < tag_start + 3 {
        return None;
    }

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: parts[tag_start].to_string(),
        abi_tag: parts[tag_start + 1].to_string(),
        platform_tag: parts[tag_start + 2..].join("-"),
    })
}
1789
/// Components of an egg file name.
struct EggInfo {
    /// Distribution name with underscores replaced by dashes.
    name: String,
    /// Version component, as written in the file name.
    version: String,
    /// Third `-`-separated component of the file name, when present.
    python_version: Option<String>,
}

/// Splits an egg file stem on `-` into name, version, and an optional third
/// component.
///
/// Returns `None` when the path has no file stem or the stem has fewer than
/// two `-`-separated parts. Parts beyond the third are ignored.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut fields = stem.split('-');

    let name = fields.next()?;
    let version = fields.next()?;
    let python_version = fields.next().map(str::to_owned);

    Some(EggInfo {
        name: name.replace('_', "-"),
        version: version.to_owned(),
        python_version,
    })
}
1810
1811fn build_wheel_purl(
1812    name: Option<&str>,
1813    version: Option<&str>,
1814    wheel_info: &WheelInfo,
1815) -> Option<String> {
1816    let name = name?;
1817    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1818
1819    if let Some(ver) = version {
1820        package_url.with_version(ver).ok()?;
1821    }
1822
1823    let extension = format!(
1824        "{}-{}-{}",
1825        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1826    );
1827    package_url.add_qualifier("extension", extension).ok()?;
1828
1829    Some(package_url.to_string())
1830}
1831
1832fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1833    let name = name?;
1834    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1835
1836    if let Some(ver) = version {
1837        package_url.with_version(ver).ok()?;
1838    }
1839
1840    package_url.add_qualifier("type", "egg").ok()?;
1841
1842    Some(package_url.to_string())
1843}
1844
1845fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1846    let metadata = super::rfc822::parse_rfc822_content(content);
1847    build_package_data_from_rfc822(&metadata, datasource_id)
1848}
1849
1850/// Builds PackageData from parsed RFC822 metadata.
1851///
1852/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
1853/// and `python_parse_rfc822_content` (content-based) functions.
1854fn build_package_data_from_rfc822(
1855    metadata: &super::rfc822::Rfc822Metadata,
1856    datasource_id: DatasourceId,
1857) -> PackageData {
1858    use super::rfc822::{get_header_all, get_header_first};
1859
1860    let name = get_header_first(&metadata.headers, "name").map(truncate_field);
1861    let version = get_header_first(&metadata.headers, "version").map(truncate_field);
1862    let summary = get_header_first(&metadata.headers, "summary").map(truncate_field);
1863    let mut homepage_url = get_header_first(&metadata.headers, "home-page").map(truncate_field);
1864    let author = get_header_first(&metadata.headers, "author").map(truncate_field);
1865    let author_email = get_header_first(&metadata.headers, "author-email").map(truncate_field);
1866    let license = get_header_first(&metadata.headers, "license").map(truncate_field);
1867    let license_expression = get_header_first(&metadata.headers, "license-expression");
1868    let download_url = get_header_first(&metadata.headers, "download-url");
1869    let platform = get_header_first(&metadata.headers, "platform");
1870    let requires_python = get_header_first(&metadata.headers, "requires-python");
1871    let classifiers = get_header_all(&metadata.headers, "classifier");
1872    let license_files = get_header_all(&metadata.headers, "license-file");
1873
1874    let description_body = if metadata.body.is_empty() {
1875        get_header_first(&metadata.headers, "description").unwrap_or_default()
1876    } else {
1877        metadata.body.clone()
1878    };
1879
1880    let description = build_description(summary.as_deref(), &description_body).map(truncate_field);
1881
1882    let mut parties = Vec::new();
1883    if author.is_some() || author_email.is_some() {
1884        parties.push(Party {
1885            r#type: Some("person".to_string()),
1886            role: Some("author".to_string()),
1887            name: author,
1888            email: author_email,
1889            url: None,
1890            organization: None,
1891            organization_url: None,
1892            timezone: None,
1893        });
1894    }
1895
1896    let (keywords, license_classifiers) = split_classifiers(&classifiers);
1897    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1898    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1899        license_expression
1900            .as_deref()
1901            .and_then(normalize_spdx_expression)
1902            .map(|normalized| {
1903                build_declared_license_data(
1904                    normalized,
1905                    DeclaredLicenseMatchMetadata::single_line(
1906                        license_expression.as_deref().unwrap_or_default(),
1907                    )
1908                    .with_referenced_filenames(&referenced_license_files),
1909                )
1910            })
1911            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1912
1913    let extracted_license_statement = license_expression
1914        .clone()
1915        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1916
1917    let mut extra_data = HashMap::new();
1918    if let Some(platform_value) = platform
1919        && !platform_value.eq_ignore_ascii_case("unknown")
1920        && !platform_value.is_empty()
1921    {
1922        extra_data.insert(
1923            "platform".to_string(),
1924            serde_json::Value::String(platform_value),
1925        );
1926    }
1927
1928    if let Some(requires_python_value) = requires_python
1929        && !requires_python_value.is_empty()
1930    {
1931        extra_data.insert(
1932            "requires_python".to_string(),
1933            serde_json::Value::String(requires_python_value),
1934        );
1935    }
1936
1937    if !license_files.is_empty() {
1938        extra_data.insert(
1939            "license_files".to_string(),
1940            serde_json::Value::Array(
1941                license_files
1942                    .iter()
1943                    .cloned()
1944                    .map(serde_json::Value::String)
1945                    .collect(),
1946            ),
1947        );
1948    }
1949
1950    let file_references = license_files
1951        .iter()
1952        .map(|path| FileReference {
1953            path: path.clone(),
1954            size: None,
1955            sha1: None,
1956            md5: None,
1957            sha256: None,
1958            sha512: None,
1959            extra_data: None,
1960        })
1961        .collect();
1962
1963    let project_urls = get_header_all(&metadata.headers, "project-url");
1964    let dependencies = extract_rfc822_dependencies(&metadata.headers);
1965    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1966
1967    if !project_urls.is_empty() {
1968        let parsed_urls = parse_project_urls(&project_urls);
1969
1970        for (label, url) in &parsed_urls {
1971            let label_lower = label.to_lowercase();
1972
1973            if bug_tracking_url.is_none()
1974                && matches!(
1975                    label_lower.as_str(),
1976                    "tracker"
1977                        | "bug reports"
1978                        | "bug tracker"
1979                        | "issues"
1980                        | "issue tracker"
1981                        | "github: issues"
1982                )
1983            {
1984                bug_tracking_url = Some(url.clone());
1985            } else if code_view_url.is_none()
1986                && matches!(label_lower.as_str(), "source" | "source code" | "code")
1987            {
1988                code_view_url = Some(url.clone());
1989            } else if vcs_url.is_none()
1990                && matches!(
1991                    label_lower.as_str(),
1992                    "github" | "gitlab" | "github: repo" | "repository"
1993                )
1994            {
1995                vcs_url = Some(url.clone());
1996            } else if homepage_url.is_none()
1997                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1998            {
1999                homepage_url = Some(url.clone());
2000            } else if label_lower == "changelog" {
2001                extra_data.insert(
2002                    "changelog_url".to_string(),
2003                    serde_json::Value::String(url.clone()),
2004                );
2005            }
2006        }
2007
2008        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
2009            .iter()
2010            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
2011            .collect();
2012
2013        if !project_urls_json.is_empty() {
2014            extra_data.insert(
2015                "project_urls".to_string(),
2016                serde_json::Value::Object(project_urls_json),
2017            );
2018        }
2019    }
2020
2021    let extra_data = if extra_data.is_empty() {
2022        None
2023    } else {
2024        Some(extra_data)
2025    };
2026
2027    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
2028        build_pypi_urls(name.as_deref(), version.as_deref());
2029
2030    PackageData {
2031        package_type: Some(PythonParser::PACKAGE_TYPE),
2032        namespace: None,
2033        name,
2034        version,
2035        qualifiers: None,
2036        subpath: None,
2037        primary_language: Some("Python".to_string()),
2038        description,
2039        release_date: None,
2040        parties,
2041        keywords,
2042        homepage_url,
2043        download_url,
2044        size: None,
2045        sha1: None,
2046        md5: None,
2047        sha256: None,
2048        sha512: None,
2049        bug_tracking_url,
2050        code_view_url,
2051        vcs_url,
2052        copyright: None,
2053        holder: None,
2054        declared_license_expression,
2055        declared_license_expression_spdx,
2056        license_detections,
2057        other_license_expression: None,
2058        other_license_expression_spdx: None,
2059        other_license_detections: Vec::new(),
2060        extracted_license_statement,
2061        notice_text: None,
2062        source_packages: Vec::new(),
2063        file_references,
2064        is_private: false,
2065        is_virtual: false,
2066        extra_data,
2067        dependencies,
2068        repository_homepage_url,
2069        repository_download_url,
2070        api_data_url,
2071        datasource_id: Some(datasource_id),
2072        purl,
2073    }
2074}
2075
/// Parses `Project-URL` header values into `(label, url)` pairs.
///
/// Each entry is expected to look like `Label, URL`. The label and URL are
/// separated at the first comma and both sides are trimmed, so the space after
/// the comma is optional (`Label,URL` is accepted too). Entries without a
/// comma, or with an empty label or URL, are skipped.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|entry| {
            // Split on the bare comma rather than ", " so that entries written
            // without whitespace after the separator are not silently dropped.
            let (label, url) = entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());
            (!label.is_empty() && !url.is_empty()).then(|| (label.to_string(), url.to_string()))
        })
        .collect()
}
2091
/// Combines an optional summary and a body into one description.
///
/// Both parts are trimmed, blank parts are dropped, and the remaining parts
/// are joined with a newline (summary first). Returns `None` when nothing is
/// left.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let headline = summary.map(str::trim).filter(|text| !text.is_empty());
    let details = Some(body.trim()).filter(|text| !text.is_empty());

    match (headline, details) {
        (None, None) => None,
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (Some(first), Some(second)) => Some(format!("{first}\n{second}")),
    }
}
2110
/// Separates classifiers into `(keywords, license_classifiers)`.
///
/// A classifier beginning with `License ::` goes into the second list; every
/// other classifier goes into the first. Relative order is preserved in both.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    classifiers
        .iter()
        .cloned()
        .partition(|entry| !entry.starts_with("License ::"))
}
2125
/// Renders the declared license and license classifiers as a small text block.
///
/// The output has an optional `license: <value>` line (value trimmed, omitted
/// when blank) followed by an optional `classifiers:` section listing each
/// classifier as `  - '<classifier>'`. Every line ends with a newline.
/// Returns `None` when there is nothing to report.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let mut statement = String::new();

    if let Some(text) = license.map(str::trim).filter(|text| !text.is_empty()) {
        statement.push_str("license: ");
        statement.push_str(text);
        statement.push('\n');
    }

    if !license_classifiers.is_empty() {
        statement.push_str("classifiers:\n");
        for classifier in license_classifiers {
            statement.push_str("  - '");
            statement.push_str(classifier);
            statement.push_str("'\n");
        }
    }

    if statement.is_empty() {
        None
    } else {
        Some(statement)
    }
}
2151
2152pub(crate) fn build_pypi_urls(
2153    name: Option<&str>,
2154    version: Option<&str>,
2155) -> (
2156    Option<String>,
2157    Option<String>,
2158    Option<String>,
2159    Option<String>,
2160) {
2161    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2162
2163    let repository_download_url = name.and_then(|value| {
2164        version.map(|ver| {
2165            format!(
2166                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2167                &value[..1.min(value.len())],
2168                value,
2169                value,
2170                ver
2171            )
2172        })
2173    });
2174
2175    let api_data_url = name.map(|value| {
2176        if let Some(ver) = version {
2177            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2178        } else {
2179            format!("https://pypi.org/pypi/{}/json", value)
2180        }
2181    });
2182
2183    let purl = name.and_then(|value| {
2184        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2185        if let Some(ver) = version {
2186            package_url.with_version(ver).ok()?;
2187        }
2188        Some(package_url.to_string())
2189    });
2190
2191    (
2192        repository_homepage_url,
2193        repository_download_url,
2194        api_data_url,
2195        purl,
2196    )
2197}
2198
2199fn build_pypi_purl_with_extension(
2200    name: &str,
2201    version: Option<&str>,
2202    extension: &str,
2203) -> Option<String> {
2204    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2205    if let Some(ver) = version {
2206        package_url.with_version(ver).ok()?;
2207    }
2208    package_url.add_qualifier("extension", extension).ok()?;
2209    Some(package_url.to_string())
2210}
2211
2212fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2213    let toml_content = match read_toml_file(path) {
2214        Ok(content) => content,
2215        Err(e) => {
2216            warn!(
2217                "Failed to read or parse pyproject.toml at {:?}: {}",
2218                path, e
2219            );
2220            return default_package_data(path);
2221        }
2222    };
2223
2224    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2225    let is_poetry_pyproject = tool_table
2226        .and_then(|tool| tool.get("poetry"))
2227        .and_then(|value| value.as_table())
2228        .is_some();
2229
2230    // Handle both PEP 621 (project table) and poetry formats
2231    let project_table =
2232        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2233            // Standard PEP 621 format with [project] table
2234            project.clone()
2235        } else if let Some(tool) = tool_table {
2236            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2237                // Poetry format with [tool.poetry] table
2238                poetry.clone()
2239            } else {
2240                return default_package_data(path);
2241            }
2242        } else if toml_content.get(FIELD_NAME).is_some() {
2243            // Other format with top-level fields
2244            match toml_content.as_table() {
2245                Some(table) => table.clone(),
2246                None => {
2247                    warn!("Failed to convert TOML content to table in {:?}", path);
2248                    return default_package_data(path);
2249                }
2250            }
2251        } else {
2252            return default_package_data(path);
2253        };
2254
2255    let name = project_table
2256        .get(FIELD_NAME)
2257        .and_then(|v| v.as_str())
2258        .map(|v| truncate_field(v.to_string()));
2259
2260    let version = project_table
2261        .get(FIELD_VERSION)
2262        .and_then(|v| v.as_str())
2263        .map(String::from);
2264    let classifiers = project_table
2265        .get("classifiers")
2266        .and_then(|value| value.as_array())
2267        .map(|values| {
2268            values
2269                .iter()
2270                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2271                .collect::<Vec<_>>()
2272        })
2273        .unwrap_or_default();
2274    let (classifier_keywords, license_classifiers) = split_classifiers(&classifiers);
2275
2276    let extracted_license_statement = extract_raw_license_string(&project_table);
2277    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2278        normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2279
2280    let description = project_table
2281        .get(FIELD_DESCRIPTION)
2282        .and_then(|value| value.as_str())
2283        .map(|value| truncate_field(value.to_string()));
2284    let mut keywords = project_table
2285        .get(FIELD_KEYWORDS)
2286        .and_then(|value| value.as_array())
2287        .map(|values| {
2288            values
2289                .iter()
2290                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2291                .collect::<Vec<_>>()
2292        })
2293        .unwrap_or_default();
2294    for classifier in classifier_keywords {
2295        if !keywords.contains(&classifier) {
2296            keywords.push(classifier);
2297        }
2298    }
2299
2300    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
2301    let mut extra_data = extract_pyproject_extra_data(&toml_content).unwrap_or_default();
2302    let (homepage_url, download_url, bug_tracking_url, code_view_url, repository_url) =
2303        extract_urls(&project_table, &mut extra_data);
2304
2305    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2306
2307    // Create package URL
2308    let purl = name.as_ref().and_then(|n| {
2309        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2310            Ok(p) => p,
2311            Err(e) => {
2312                warn!(
2313                    "Failed to create PackageUrl for Python package '{}': {}",
2314                    n, e
2315                );
2316                return None;
2317            }
2318        };
2319
2320        if let Some(v) = &version
2321            && let Err(e) = package_url.with_version(v)
2322        {
2323            warn!(
2324                "Failed to set version '{}' for Python package '{}': {}",
2325                v, n, e
2326            );
2327            return None;
2328        }
2329
2330        Some(package_url.to_string())
2331    });
2332
2333    let api_data_url = name.as_ref().map(|n| {
2334        if let Some(v) = &version {
2335            format!("https://pypi.org/pypi/{}/{}/json", n, v)
2336        } else {
2337            format!("https://pypi.org/pypi/{}/json", n)
2338        }
2339    });
2340
2341    let pypi_homepage_url = name
2342        .as_ref()
2343        .map(|n| format!("https://pypi.org/project/{}", n));
2344
2345    let pypi_download_url = name.as_ref().and_then(|n| {
2346        version.as_ref().map(|v| {
2347            format!(
2348                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2349                &n[..1.min(n.len())],
2350                n,
2351                n,
2352                v
2353            )
2354        })
2355    });
2356
2357    PackageData {
2358        package_type: Some(PythonParser::PACKAGE_TYPE),
2359        namespace: None,
2360        name,
2361        version,
2362        qualifiers: None,
2363        subpath: None,
2364        primary_language: None,
2365        description,
2366        release_date: None,
2367        parties: extract_parties(&project_table),
2368        keywords,
2369        homepage_url: homepage_url.or(pypi_homepage_url),
2370        download_url: download_url
2371            .or_else(|| repository_url.clone())
2372            .or(pypi_download_url),
2373        size: None,
2374        sha1: None,
2375        md5: None,
2376        sha256: None,
2377        sha512: None,
2378        bug_tracking_url,
2379        code_view_url,
2380        vcs_url: repository_url,
2381        copyright: None,
2382        holder: None,
2383        declared_license_expression,
2384        declared_license_expression_spdx,
2385        license_detections,
2386        other_license_expression: None,
2387        other_license_expression_spdx: None,
2388        other_license_detections: Vec::new(),
2389        extracted_license_statement: extracted_license_statement
2390            .or_else(|| build_extracted_license_statement(None, &license_classifiers)),
2391        notice_text: None,
2392        source_packages: Vec::new(),
2393        file_references: Vec::new(),
2394        is_private: has_private_classifier(&classifiers),
2395        is_virtual: false,
2396        extra_data: if extra_data.is_empty() {
2397            None
2398        } else {
2399            Some(extra_data)
2400        },
2401        dependencies: [dependencies, optional_dependencies].concat(),
2402        repository_homepage_url: None,
2403        repository_download_url: None,
2404        api_data_url,
2405        datasource_id: Some(if is_poetry_pyproject {
2406            DatasourceId::PypiPoetryPyprojectToml
2407        } else {
2408            DatasourceId::PypiPyprojectToml
2409        }),
2410        purl,
2411    }
2412}
2413
2414fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2415    let path_str = path.to_string_lossy().replace('\\', "/");
2416    if path_str.contains("/EGG-INFO/PKG-INFO") {
2417        DatasourceId::PypiEggPkginfo
2418    } else if path_str.ends_with(".egg-info/PKG-INFO") {
2419        DatasourceId::PypiEditableEggPkginfo
2420    } else {
2421        DatasourceId::PypiSdistPkginfo
2422    }
2423}
2424
2425fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2426    project
2427        .get(FIELD_LICENSE)
2428        .and_then(|license_value| match license_value {
2429            TomlValue::String(license_str) => Some(license_str.clone()),
2430            TomlValue::Table(license_table) => license_table
2431                .get("text")
2432                .and_then(|v| v.as_str())
2433                .map(|s| s.to_string())
2434                .or_else(|| {
2435                    license_table
2436                        .get("expression")
2437                        .and_then(|v| v.as_str())
2438                        .map(|expr| expr.to_string())
2439                }),
2440            _ => None,
2441        })
2442}
2443
2444fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2445    match project.get(FIELD_LICENSE) {
2446        Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2447        Some(TomlValue::Table(license_table)) => license_table
2448            .get("expression")
2449            .and_then(|value| value.as_str()),
2450        _ => None,
2451    }
2452}
2453
2454fn extract_urls(
2455    project: &TomlMap<String, TomlValue>,
2456    extra_data: &mut HashMap<String, serde_json::Value>,
2457) -> ProjectUrls {
2458    let mut homepage_url = None;
2459    let mut download_url = None;
2460    let mut bug_tracking_url = None;
2461    let mut code_view_url = None;
2462    let mut repository_url = None;
2463
2464    // Check for URLs table
2465    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2466        let parsed_urls: Vec<(String, String)> = urls
2467            .iter()
2468            .filter_map(|(label, value)| {
2469                value
2470                    .as_str()
2471                    .map(|url| (label.to_string(), url.to_string()))
2472            })
2473            .collect();
2474        apply_project_url_mappings(
2475            &parsed_urls,
2476            &mut homepage_url,
2477            &mut bug_tracking_url,
2478            &mut code_view_url,
2479            &mut repository_url,
2480            extra_data,
2481        );
2482
2483        download_url = urls
2484            .get("Downloads")
2485            .or_else(|| urls.get("downloads"))
2486            .and_then(|v| v.as_str())
2487            .map(String::from);
2488
2489        if homepage_url.is_none() {
2490            homepage_url = urls
2491                .get(FIELD_HOMEPAGE)
2492                .and_then(|v| v.as_str())
2493                .map(String::from);
2494        }
2495        if repository_url.is_none() {
2496            repository_url = urls
2497                .get(FIELD_REPOSITORY)
2498                .and_then(|v| v.as_str())
2499                .map(String::from);
2500        }
2501    }
2502
2503    // If not found in URLs table, check for top-level keys
2504    if homepage_url.is_none() {
2505        homepage_url = project
2506            .get(FIELD_HOMEPAGE)
2507            .and_then(|v| v.as_str())
2508            .map(String::from);
2509    }
2510
2511    if repository_url.is_none() {
2512        repository_url = project
2513            .get(FIELD_REPOSITORY)
2514            .and_then(|v| v.as_str())
2515            .map(String::from);
2516    }
2517
2518    (
2519        homepage_url,
2520        download_url,
2521        bug_tracking_url,
2522        code_view_url,
2523        repository_url,
2524    )
2525}
2526
2527fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2528    let mut parties = Vec::new();
2529
2530    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2531        for author in authors {
2532            if let Some(author_str) = author.as_str() {
2533                let (name, email) = split_name_email(author_str);
2534                parties.push(Party {
2535                    r#type: None,
2536                    role: Some("author".to_string()),
2537                    name,
2538                    email,
2539                    url: None,
2540                    organization: None,
2541                    organization_url: None,
2542                    timezone: None,
2543                });
2544            } else if let Some(author_table) = author.as_table() {
2545                let name = author_table
2546                    .get("name")
2547                    .and_then(|value| value.as_str())
2548                    .map(|value| value.to_string());
2549                let email = author_table
2550                    .get("email")
2551                    .and_then(|value| value.as_str())
2552                    .map(|value| value.to_string());
2553                if name.is_some() || email.is_some() {
2554                    parties.push(Party {
2555                        r#type: None,
2556                        role: Some("author".to_string()),
2557                        name,
2558                        email,
2559                        url: None,
2560                        organization: None,
2561                        organization_url: None,
2562                        timezone: None,
2563                    });
2564                }
2565            }
2566        }
2567    }
2568
2569    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2570        for maintainer in maintainers {
2571            if let Some(maintainer_str) = maintainer.as_str() {
2572                let (name, email) = split_name_email(maintainer_str);
2573                parties.push(Party {
2574                    r#type: None,
2575                    role: Some("maintainer".to_string()),
2576                    name,
2577                    email,
2578                    url: None,
2579                    organization: None,
2580                    organization_url: None,
2581                    timezone: None,
2582                });
2583            } else if let Some(maintainer_table) = maintainer.as_table() {
2584                let name = maintainer_table
2585                    .get("name")
2586                    .and_then(|value| value.as_str())
2587                    .map(|value| value.to_string());
2588                let email = maintainer_table
2589                    .get("email")
2590                    .and_then(|value| value.as_str())
2591                    .map(|value| value.to_string());
2592                if name.is_some() || email.is_some() {
2593                    parties.push(Party {
2594                        r#type: None,
2595                        role: Some("maintainer".to_string()),
2596                        name,
2597                        email,
2598                        url: None,
2599                        organization: None,
2600                        organization_url: None,
2601                        timezone: None,
2602                    });
2603                }
2604            }
2605        }
2606    }
2607
2608    parties
2609}
2610
2611fn extract_dependencies(
2612    project: &TomlMap<String, TomlValue>,
2613    toml_content: &TomlValue,
2614) -> (Vec<Dependency>, Vec<Dependency>) {
2615    let mut dependencies = Vec::new();
2616    let mut optional_dependencies = Vec::new();
2617
2618    // Handle dependencies - can be array or table format
2619    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2620        match deps_value {
2621            TomlValue::Array(arr) => {
2622                dependencies = parse_dependency_array(arr, false, None);
2623            }
2624            TomlValue::Table(table) => {
2625                dependencies = parse_dependency_table(table, false, None);
2626            }
2627            _ => {}
2628        }
2629    }
2630
2631    // Handle PEP 621 optional-dependencies with scope
2632    if let Some(opt_deps_table) = project
2633        .get(FIELD_OPTIONAL_DEPENDENCIES)
2634        .and_then(|v| v.as_table())
2635    {
2636        for (extra_name, deps) in opt_deps_table {
2637            match deps {
2638                TomlValue::Array(arr) => {
2639                    optional_dependencies.extend(parse_dependency_array(
2640                        arr,
2641                        true,
2642                        Some(extra_name),
2643                    ));
2644                }
2645                TomlValue::Table(table) => {
2646                    optional_dependencies.extend(parse_dependency_table(
2647                        table,
2648                        true,
2649                        Some(extra_name),
2650                    ));
2651                }
2652                _ => {}
2653            }
2654        }
2655    }
2656
2657    // Handle Poetry dev-dependencies
2658    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2659        match dev_deps_value {
2660            TomlValue::Array(arr) => {
2661                optional_dependencies.extend(parse_dependency_array(
2662                    arr,
2663                    true,
2664                    Some(FIELD_DEV_DEPENDENCIES),
2665                ));
2666            }
2667            TomlValue::Table(table) => {
2668                optional_dependencies.extend(parse_dependency_table(
2669                    table,
2670                    true,
2671                    Some(FIELD_DEV_DEPENDENCIES),
2672                ));
2673            }
2674            _ => {}
2675        }
2676    }
2677
2678    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
2679    if let Some(groups_table) = toml_content
2680        .get("tool")
2681        .and_then(|value| value.as_table())
2682        .and_then(|tool| tool.get("poetry"))
2683        .and_then(|value| value.as_table())
2684        .and_then(|poetry| poetry.get("group"))
2685        .and_then(|value| value.as_table())
2686    {
2687        for (group_name, group_data) in groups_table {
2688            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2689                match group_deps {
2690                    TomlValue::Array(arr) => {
2691                        optional_dependencies.extend(parse_dependency_array(
2692                            arr,
2693                            true,
2694                            Some(group_name),
2695                        ));
2696                    }
2697                    TomlValue::Table(table) => {
2698                        optional_dependencies.extend(parse_poetry_group_dependency_table(
2699                            table,
2700                            true,
2701                            Some(group_name),
2702                        ));
2703                    }
2704                    _ => {}
2705                }
2706            }
2707        }
2708    }
2709
2710    if let Some(groups_table) = toml_content
2711        .get(FIELD_DEPENDENCY_GROUPS)
2712        .and_then(|value| value.as_table())
2713    {
2714        for (group_name, deps) in groups_table {
2715            match deps {
2716                TomlValue::Array(arr) => {
2717                    optional_dependencies.extend(parse_dependency_array(
2718                        arr,
2719                        true,
2720                        Some(group_name),
2721                    ));
2722                }
2723                TomlValue::Table(table) => {
2724                    optional_dependencies.extend(parse_dependency_table(
2725                        table,
2726                        true,
2727                        Some(group_name),
2728                    ));
2729                }
2730                _ => {}
2731            }
2732        }
2733    }
2734
2735    if let Some(dev_deps_value) = toml_content
2736        .get("tool")
2737        .and_then(|value| value.as_table())
2738        .and_then(|tool| tool.get("uv"))
2739        .and_then(|value| value.as_table())
2740        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2741    {
2742        match dev_deps_value {
2743            TomlValue::Array(arr) => {
2744                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2745            }
2746            TomlValue::Table(table) => {
2747                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2748            }
2749            _ => {}
2750        }
2751    }
2752
2753    (dependencies, optional_dependencies)
2754}
2755
2756fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2757    let mut extra_data = HashMap::new();
2758
2759    if let Some(tool_uv) = toml_content
2760        .get("tool")
2761        .and_then(|value| value.as_table())
2762        .and_then(|tool| tool.get("uv"))
2763    {
2764        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2765    }
2766
2767    if extra_data.is_empty() {
2768        None
2769    } else {
2770        Some(extra_data)
2771    }
2772}
2773
2774fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2775    match value {
2776        TomlValue::String(value) => JsonValue::String(value.clone()),
2777        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2778        TomlValue::Float(value) => JsonValue::String(value.to_string()),
2779        TomlValue::Boolean(value) => JsonValue::Bool(*value),
2780        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2781        TomlValue::Array(values) => {
2782            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2783        }
2784        TomlValue::Table(values) => JsonValue::Object(
2785            values
2786                .iter()
2787                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2788                .collect::<JsonMap<String, JsonValue>>(),
2789        ),
2790    }
2791}
2792
2793fn parse_dependency_table(
2794    table: &TomlMap<String, TomlValue>,
2795    is_optional: bool,
2796    scope: Option<&str>,
2797) -> Vec<Dependency> {
2798    table
2799        .iter()
2800        .filter_map(|(name, version)| {
2801            let version_str = version.as_str().map(|s| s.to_string());
2802            let mut package_url =
2803                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2804
2805            if let Some(v) = &version_str {
2806                package_url.with_version(v).ok()?;
2807            }
2808
2809            Some(Dependency {
2810                purl: Some(package_url.to_string()),
2811                extracted_requirement: None,
2812                scope: scope.map(|s| s.to_string()),
2813                is_runtime: Some(!is_optional),
2814                is_optional: Some(is_optional),
2815                is_pinned: None,
2816                is_direct: Some(true),
2817                resolved_package: None,
2818                extra_data: None,
2819            })
2820        })
2821        .collect()
2822}
2823
2824fn parse_poetry_group_dependency_table(
2825    table: &TomlMap<String, TomlValue>,
2826    is_optional: bool,
2827    scope: Option<&str>,
2828) -> Vec<Dependency> {
2829    table
2830        .iter()
2831        .filter_map(|(name, value)| build_poetry_group_dependency(name, value, is_optional, scope))
2832        .collect()
2833}
2834
2835fn build_poetry_group_dependency(
2836    name: &str,
2837    value: &TomlValue,
2838    is_optional: bool,
2839    scope: Option<&str>,
2840) -> Option<Dependency> {
2841    let normalized_name = normalize_python_dependency_name(name);
2842    let (version_spec, extras, marker) = match value {
2843        TomlValue::String(spec) => (Some(spec.trim().to_string()), Vec::new(), None),
2844        TomlValue::Table(table) => {
2845            let version_spec = table
2846                .get(FIELD_VERSION)
2847                .and_then(|value| value.as_str())
2848                .map(str::trim)
2849                .filter(|value| !value.is_empty())
2850                .map(ToOwned::to_owned);
2851            let extras = table
2852                .get(FIELD_EXTRAS)
2853                .and_then(|value| value.as_array())
2854                .map(|values| {
2855                    values
2856                        .iter()
2857                        .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2858                        .collect::<Vec<_>>()
2859                })
2860                .unwrap_or_default();
2861            let marker = table
2862                .get("markers")
2863                .and_then(|value| value.as_str())
2864                .map(str::trim)
2865                .filter(|value| !value.is_empty())
2866                .map(ToOwned::to_owned);
2867
2868            (version_spec, extras, marker)
2869        }
2870        _ => return None,
2871    };
2872
2873    let pinned_version = version_spec
2874        .as_deref()
2875        .and_then(extract_exact_pinned_version);
2876    let purl = build_python_dependency_purl(&normalized_name, pinned_version.as_deref())?;
2877
2878    let mut extra_data = HashMap::new();
2879    if let Some(marker) = marker {
2880        extra_data.insert("marker".to_string(), JsonValue::String(marker));
2881    }
2882    if !extras.is_empty() {
2883        extra_data.insert(
2884            "extras".to_string(),
2885            JsonValue::Array(extras.into_iter().map(JsonValue::String).collect()),
2886        );
2887    }
2888
2889    Some(Dependency {
2890        purl: Some(purl),
2891        extracted_requirement: version_spec,
2892        scope: scope.map(|value| value.to_string()),
2893        is_runtime: Some(!is_optional),
2894        is_optional: Some(is_optional),
2895        is_pinned: Some(pinned_version.is_some()),
2896        is_direct: Some(true),
2897        resolved_package: None,
2898        extra_data: if extra_data.is_empty() {
2899            None
2900        } else {
2901            Some(extra_data)
2902        },
2903    })
2904}
2905
2906fn parse_dependency_array(
2907    array: &[TomlValue],
2908    is_optional: bool,
2909    scope: Option<&str>,
2910) -> Vec<Dependency> {
2911    array
2912        .iter()
2913        .filter_map(|dep| {
2914            let dep_str = dep.as_str()?;
2915            build_pyproject_array_dependency(dep_str, is_optional, scope)
2916        })
2917        .collect()
2918}
2919
2920fn build_pyproject_array_dependency(
2921    dep_str: &str,
2922    is_optional: bool,
2923    scope: Option<&str>,
2924) -> Option<Dependency> {
2925    let parsed = parse_pep508_requirement(dep_str)?;
2926    let name = normalize_python_package_name(&parsed.name);
2927    let pinned_version = parsed
2928        .specifiers
2929        .as_deref()
2930        .and_then(extract_exact_pinned_version);
2931
2932    let purl = build_python_dependency_purl(&name, pinned_version.as_deref())?;
2933
2934    let mut extra_data = HashMap::new();
2935    if let Some(marker) = parsed.marker {
2936        extra_data.insert("marker".to_string(), JsonValue::String(marker));
2937    }
2938    if !parsed.extras.is_empty() {
2939        extra_data.insert(
2940            "extras".to_string(),
2941            JsonValue::Array(parsed.extras.into_iter().map(JsonValue::String).collect()),
2942        );
2943    }
2944
2945    let extracted_requirement = parsed.specifiers.or(parsed.url);
2946
2947    Some(Dependency {
2948        purl: Some(purl),
2949        extracted_requirement: extracted_requirement.clone(),
2950        scope: scope.map(|s| s.to_string()),
2951        is_runtime: Some(!is_optional),
2952        is_optional: Some(is_optional),
2953        is_pinned: Some(pinned_version.is_some()),
2954        is_direct: Some(true),
2955        resolved_package: None,
2956        extra_data: if extra_data.is_empty() {
2957            None
2958        } else {
2959            Some(extra_data)
2960        },
2961    })
2962}
2963
/// Returns the version named by an exact pin, or `None` when `specifiers` does not
/// pin a single concrete version.
///
/// A specifier counts as an exact pin when it is a single clause (no `,`) using
/// either `==` or `===` followed by a non-empty version. Surrounding whitespace and
/// whitespace between the operator and the version are ignored.
///
/// `==` clauses containing a wildcard (for example `==1.4.*`) are rejected: under
/// PEP 440 they are prefix matches covering a whole release series, not a single
/// version. `===` is arbitrary string equality, so its operand is returned verbatim.
fn extract_exact_pinned_version(specifiers: &str) -> Option<String> {
    let trimmed = specifiers.trim();
    // Multiple comma-separated clauses describe a range, never a single pin.
    if trimmed.contains(',') {
        return None;
    }

    // `===` must be tested first because it also starts with `==`.
    let (operand, wildcard_is_prefix_match) = if let Some(rest) = trimmed.strip_prefix("===") {
        (rest, false)
    } else if let Some(rest) = trimmed.strip_prefix("==") {
        (rest, true)
    } else {
        return None;
    };

    let version = operand.trim();
    if version.is_empty() {
        return None;
    }
    if wildcard_is_prefix_match && version.contains('*') {
        return None;
    }

    Some(version.to_string())
}
2985
/// A literal value recovered from a `setup.py` AST by [`LiteralEvaluator`].
#[derive(Clone, Debug)]
enum Value {
    /// A string literal.
    String(String),
    /// An integer or float literal; integers are converted to `f64`.
    Number(f64),
    /// `True` or `False`.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list display whose elements all evaluated to literals.
    List(Vec<Value>),
    /// A tuple display whose elements all evaluated to literals.
    Tuple(Vec<Value>),
    /// A mapping with stringified keys (dict display, `dict(...)` or `OrderedDict(...)`).
    Dict(HashMap<String, Value>),
}
2996
2997struct LiteralEvaluator {
2998    constants: HashMap<String, Value>,
2999    max_depth: usize,
3000    max_nodes: usize,
3001    nodes_visited: usize,
3002}
3003
3004impl LiteralEvaluator {
3005    fn new(constants: HashMap<String, Value>) -> Self {
3006        Self {
3007            constants,
3008            max_depth: MAX_SETUP_PY_AST_DEPTH,
3009            max_nodes: MAX_SETUP_PY_AST_NODES,
3010            nodes_visited: 0,
3011        }
3012    }
3013
3014    fn insert_constant(&mut self, name: String, value: Value) {
3015        self.constants.insert(name, value);
3016    }
3017
3018    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
3019        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
3020            return None;
3021        }
3022        self.nodes_visited += 1;
3023
3024        match expr {
3025            ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
3026                Some(Value::String(value.to_str().to_string()))
3027            }
3028            ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
3029                Some(Value::Bool(*value))
3030            }
3031            ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
3032                self.evaluate_number(value)
3033            }
3034            ast::Expr::NoneLiteral(_) => Some(Value::None),
3035            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
3036            ast::Expr::List(ast::ExprList { elts, .. }) => {
3037                let mut values = Vec::new();
3038                for elt in elts {
3039                    values.push(self.evaluate_expr(elt, depth + 1)?);
3040                }
3041                Some(Value::List(values))
3042            }
3043            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
3044                let mut values = Vec::new();
3045                for elt in elts {
3046                    values.push(self.evaluate_expr(elt, depth + 1)?);
3047                }
3048                Some(Value::Tuple(values))
3049            }
3050            ast::Expr::Dict(ast::ExprDict { items, .. }) => {
3051                let mut dict = HashMap::new();
3052                for item in items {
3053                    let key_expr = item.key.as_ref()?;
3054                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
3055                    let key = value_to_string(&key_value)?;
3056                    let value = self.evaluate_expr(&item.value, depth + 1)?;
3057                    dict.insert(key, value);
3058                }
3059                Some(Value::Dict(dict))
3060            }
3061            ast::Expr::Call(ast::ExprCall {
3062                func, arguments, ..
3063            }) => {
3064                let args = arguments.args.as_ref();
3065                let keywords = arguments.keywords.as_ref();
3066                if keywords.is_empty()
3067                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
3068                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
3069                {
3070                    return self.evaluate_ordered_dict(args, depth + 1);
3071                }
3072
3073                if !args.is_empty() {
3074                    return None;
3075                }
3076
3077                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
3078                    && id == "dict"
3079                {
3080                    let mut dict = HashMap::new();
3081                    for keyword in keywords {
3082                        let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
3083                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
3084                        dict.insert(key.to_string(), value);
3085                    }
3086                    return Some(Value::Dict(dict));
3087                }
3088
3089                None
3090            }
3091            _ => None,
3092        }
3093    }
3094
3095    fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
3096        match number {
3097            ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
3098            ast::Number::Float(value) => Some(Value::Number(*value)),
3099            ast::Number::Complex { .. } => None,
3100        }
3101    }
3102
3103    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
3104        if args.len() != 1 {
3105            return None;
3106        }
3107
3108        let items = match self.evaluate_expr(&args[0], depth)? {
3109            Value::List(items) | Value::Tuple(items) => items,
3110            _ => return None,
3111        };
3112
3113        let mut dict = HashMap::new();
3114        for item in items {
3115            let Value::Tuple(values) = item else {
3116                return None;
3117            };
3118            if values.len() != 2 {
3119                return None;
3120            }
3121            let key = value_to_string(&values[0])?;
3122            dict.insert(key, values[1].clone());
3123        }
3124
3125        Some(Value::Dict(dict))
3126    }
3127}
3128
/// Names under which `setup` and its providing modules are visible in a `setup.py`.
#[derive(Default)]
struct SetupAliases {
    /// Local names that refer to the `setup` function itself.
    setup_names: HashSet<String>,
    /// Local module name (or alias) mapped to the imported setup module.
    module_aliases: HashMap<String, String>,
}
3134
3135fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
3136    extract_from_setup_py(path).into_iter().collect()
3137}
3138
3139fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
3140    let content = match read_file_to_string(path, None) {
3141        Ok(content) => content,
3142        Err(e) => {
3143            warn!("Failed to read setup.py at {:?}: {}", path, e);
3144            return Some(default_package_data(path));
3145        }
3146    };
3147
3148    if content.len() > MAX_SETUP_PY_BYTES {
3149        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
3150        let package_data = extract_from_setup_py_regex(&content);
3151        return should_emit_setup_py_package(&package_data).then_some(package_data);
3152    }
3153
3154    let mut package_data = match extract_from_setup_py_ast(&content) {
3155        Ok(Some(data)) => data,
3156        Ok(None) => return Some(default_package_data(path)),
3157        Err(e) => {
3158            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
3159            extract_from_setup_py_regex(&content)
3160        }
3161    };
3162
3163    if package_data.name.is_none() {
3164        package_data.name = extract_setup_value(&content, "name");
3165    }
3166
3167    if package_data.version.is_none() {
3168        package_data.version = extract_setup_value(&content, "version");
3169    }
3170
3171    if package_data
3172        .version
3173        .as_deref()
3174        .is_some_and(|version| version.trim().is_empty())
3175    {
3176        package_data.version = None;
3177    }
3178
3179    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
3180    package_data.purl = build_setup_py_purl(
3181        package_data.name.as_deref(),
3182        package_data.version.as_deref(),
3183    );
3184
3185    if should_emit_setup_py_package(&package_data) {
3186        Some(package_data)
3187    } else {
3188        Some(default_package_data(path))
3189    }
3190}
3191
3192fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
3193    package_data.name.is_some()
3194        || package_data.version.is_some()
3195        || package_data.purl.is_some()
3196        || !package_data.dependencies.is_empty()
3197        || package_data.extracted_license_statement.is_some()
3198        || !package_data.license_detections.is_empty()
3199        || !package_data.parties.is_empty()
3200        || package_data.description.is_some()
3201        || package_data.homepage_url.is_some()
3202        || package_data.bug_tracking_url.is_some()
3203        || package_data.code_view_url.is_some()
3204        || package_data.vcs_url.is_some()
3205}
3206
3207fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
3208    if package_data.version.is_some()
3209        && package_data.extracted_license_statement.is_some()
3210        && package_data
3211            .parties
3212            .iter()
3213            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
3214    {
3215        return;
3216    }
3217
3218    let Some(root) = path.parent() else {
3219        return;
3220    };
3221
3222    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
3223
3224    if package_data.version.is_none() {
3225        package_data.version = dunder_metadata.version;
3226    }
3227
3228    if package_data.extracted_license_statement.is_none() {
3229        package_data.extracted_license_statement = dunder_metadata.license;
3230    }
3231
3232    let has_author = package_data
3233        .parties
3234        .iter()
3235        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
3236
3237    if !has_author && let Some(author) = dunder_metadata.author {
3238        package_data.parties.push(Party {
3239            r#type: Some("person".to_string()),
3240            role: Some("author".to_string()),
3241            name: Some(author),
3242            email: None,
3243            url: None,
3244            organization: None,
3245            organization_url: None,
3246            timezone: None,
3247        });
3248    }
3249}
3250
/// Metadata recovered from `__version__` / `__author__` / `__license__` assignments.
#[derive(Default)]
struct DunderMetadata {
    /// Value of the first `__version__` string assignment found.
    version: Option<String>,
    /// Value of the first `__author__` string assignment found.
    author: Option<String>,
    /// Value of the first `__license__` string assignment found.
    license: Option<String>,
}
3257
3258fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
3259    let statements = match parse_module(content) {
3260        Ok(parsed) => parsed.into_suite(),
3261        Err(_) => return DunderMetadata::default(),
3262    };
3263
3264    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3265    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3266    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
3267    let mut metadata = DunderMetadata::default();
3268    let mut candidate_paths = Vec::new();
3269
3270    for module in imported_dunder_modules(&statements) {
3271        let Some(path) = resolve_imported_module_path(root, &module) else {
3272            continue;
3273        };
3274
3275        candidate_paths.push(path);
3276    }
3277
3278    candidate_paths.extend(referenced_dunder_attribute_paths(root, content));
3279    candidate_paths.extend(referenced_dunder_init_paths(root, content));
3280
3281    let mut seen_paths = HashSet::new();
3282    for path in candidate_paths {
3283        if !seen_paths.insert(path.clone()) {
3284            continue;
3285        }
3286
3287        let Ok(module_content) = read_file_to_string(&path, None) else {
3288            continue;
3289        };
3290
3291        if metadata.version.is_none() {
3292            metadata.version = version_re
3293                .as_ref()
3294                .and_then(|regex| regex.captures(&module_content))
3295                .and_then(|captures| captures.get(1))
3296                .map(|match_| match_.as_str().to_string());
3297        }
3298
3299        if metadata.author.is_none() {
3300            metadata.author = author_re
3301                .as_ref()
3302                .and_then(|regex| regex.captures(&module_content))
3303                .and_then(|captures| captures.get(1))
3304                .map(|match_| match_.as_str().to_string());
3305        }
3306
3307        if metadata.license.is_none() {
3308            metadata.license = license_re
3309                .as_ref()
3310                .and_then(|regex| regex.captures(&module_content))
3311                .and_then(|captures| captures.get(1))
3312                .map(|match_| match_.as_str().to_string());
3313        }
3314
3315        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
3316            return metadata;
3317        }
3318    }
3319
3320    metadata
3321}
3322
3323fn referenced_dunder_init_paths(root: &Path, content: &str) -> Vec<PathBuf> {
3324    let open_re = match Regex::new(r#"open\(\s*['\"]([^'\"]+__init__\.py)['\"]"#) {
3325        Ok(regex) => regex,
3326        Err(_) => return Vec::new(),
3327    };
3328
3329    open_re
3330        .captures_iter(content)
3331        .filter_map(|captures| captures.get(1).map(|m| m.as_str()))
3332        .filter_map(|relative| {
3333            let relative_path = PathBuf::from(relative);
3334            if relative_path.is_absolute()
3335                || relative_path.components().any(|component| {
3336                    matches!(
3337                        component,
3338                        Component::ParentDir | Component::RootDir | Component::Prefix(_)
3339                    )
3340                })
3341            {
3342                return None;
3343            }
3344
3345            let candidate = root.join(relative_path);
3346            candidate.exists().then_some(candidate)
3347        })
3348        .collect()
3349}
3350
3351fn referenced_dunder_attribute_paths(root: &Path, content: &str) -> Vec<PathBuf> {
3352    let attr_re =
3353        match Regex::new(r#"\b([A-Za-z_][A-Za-z0-9_]*)\s*\.\s*__(?:version|author|license)__\b"#) {
3354            Ok(regex) => regex,
3355            Err(_) => return Vec::new(),
3356        };
3357
3358    let mut seen_modules = HashSet::new();
3359    attr_re
3360        .captures_iter(content)
3361        .filter_map(|captures| captures.get(1).map(|m| m.as_str().to_string()))
3362        .filter(|module| seen_modules.insert(module.clone()))
3363        .filter_map(|module| resolve_imported_module_path(root, &module))
3364        .collect()
3365}
3366
3367fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
3368    let mut modules = Vec::new();
3369
3370    for statement in statements {
3371        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
3372            continue;
3373        };
3374        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
3375            continue;
3376        };
3377        let imports_dunder = names.iter().any(|alias| {
3378            matches!(
3379                alias.name.as_str(),
3380                "__version__" | "__author__" | "__license__"
3381            )
3382        });
3383        if imports_dunder {
3384            modules.push(module.to_string());
3385        }
3386    }
3387
3388    modules
3389}
3390
/// Maps a dotted module name to an existing source file under `root`.
///
/// Candidates are probed in this order and the first one that exists is returned:
/// `<root>/<module>.py`, `<root>/<module>/__init__.py`, then the same two beneath
/// `<root>/src`. Dots in `module` become path separators.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let mut relative = PathBuf::new();
    for segment in module.split('.') {
        relative.push(segment);
    }
    let module_file = relative.with_extension("py");

    for base in [root.to_path_buf(), root.join("src")] {
        let file_candidate = base.join(&module_file);
        if file_candidate.exists() {
            return Some(file_candidate);
        }

        let package_candidate = base.join(&relative).join("__init__.py");
        if package_candidate.exists() {
            return Some(package_candidate);
        }
    }

    None
}
3402
3403/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
3404///
3405/// # Security Model
3406///
3407/// This function parses setup.py as a Python AST and evaluates only literal values
3408/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
3409/// arbitrary code execution during scanning.
3410///
3411/// # DoS Prevention
3412///
3413/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
3414/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
3415/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
3416///
3417/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
3418fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
3419    let statements = parse_module(content)
3420        .map(|parsed| parsed.into_suite())
3421        .map_err(|e| e.to_string())?;
3422    let aliases = collect_setup_aliases(&statements);
3423    let mut evaluator = LiteralEvaluator::new(HashMap::new());
3424    build_setup_py_constants(&statements, &mut evaluator);
3425
3426    let setup_call = find_setup_call(&statements, &aliases);
3427    let Some(call_expr) = setup_call else {
3428        return Ok(None);
3429    };
3430
3431    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3432    Ok(Some(build_setup_py_package_data(&setup_values)))
3433}
3434
3435fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3436    for stmt in statements {
3437        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3438            if targets.len() != 1 {
3439                continue;
3440            }
3441
3442            let Some(name) = extract_assign_name(&targets[0]) else {
3443                continue;
3444            };
3445
3446            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3447                evaluator.insert_constant(name, value);
3448            }
3449        }
3450    }
3451}
3452
3453fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3454    match target {
3455        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3456        _ => None,
3457    }
3458}
3459
3460fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3461    let mut aliases = SetupAliases::default();
3462    aliases.setup_names.insert("setup".to_string());
3463
3464    for stmt in statements {
3465        match stmt {
3466            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3467                for alias in names {
3468                    let module_name = alias.name.as_str();
3469                    if !is_setup_module(module_name) {
3470                        continue;
3471                    }
3472                    let alias_name = alias
3473                        .asname
3474                        .as_ref()
3475                        .map(|name| name.as_str())
3476                        .unwrap_or(module_name);
3477                    aliases
3478                        .module_aliases
3479                        .insert(alias_name.to_string(), module_name.to_string());
3480                }
3481            }
3482            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3483                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3484                    continue;
3485                };
3486                if !is_setup_module(module_name) {
3487                    continue;
3488                }
3489                for alias in names {
3490                    if alias.name.as_str() != "setup" {
3491                        continue;
3492                    }
3493                    let alias_name = alias
3494                        .asname
3495                        .as_ref()
3496                        .map(|name| name.as_str())
3497                        .unwrap_or("setup");
3498                    aliases.setup_names.insert(alias_name.to_string());
3499                }
3500            }
3501            _ => {}
3502        }
3503    }
3504
3505    aliases
3506}
3507
/// Reports whether `module_name` is one of the modules recognised as providing `setup`.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3511
3512fn find_setup_call<'a>(
3513    statements: &'a [ast::Stmt],
3514    aliases: &'a SetupAliases,
3515) -> Option<&'a ast::Expr> {
3516    let mut finder = SetupCallFinder {
3517        aliases,
3518        called_function_names: collect_top_level_called_function_names(statements),
3519        nodes_visited: 0,
3520    };
3521    finder.find_in_statements(statements)
3522}
3523
3524fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3525    let mut called = HashSet::new();
3526    collect_called_function_names_in_statements(statements, &mut called);
3527    called
3528}
3529
3530fn collect_called_function_names_in_statements(
3531    statements: &[ast::Stmt],
3532    called: &mut HashSet<String>,
3533) {
3534    for stmt in statements {
3535        match stmt {
3536            ast::Stmt::Expr(ast::StmtExpr { value, .. })
3537            | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3538                collect_called_function_names_in_expr(value.as_ref(), called);
3539            }
3540            ast::Stmt::If(ast::StmtIf {
3541                body,
3542                elif_else_clauses,
3543                ..
3544            }) => {
3545                collect_called_function_names_in_statements(body, called);
3546                for clause in elif_else_clauses {
3547                    collect_called_function_names_in_statements(&clause.body, called);
3548                }
3549            }
3550            ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3551            | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3552                collect_called_function_names_in_statements(body, called);
3553                collect_called_function_names_in_statements(orelse, called);
3554            }
3555            ast::Stmt::With(ast::StmtWith { body, .. }) => {
3556                collect_called_function_names_in_statements(body, called);
3557            }
3558            ast::Stmt::Try(ast::StmtTry {
3559                body,
3560                orelse,
3561                finalbody,
3562                handlers,
3563                ..
3564            }) => {
3565                collect_called_function_names_in_statements(body, called);
3566                collect_called_function_names_in_statements(orelse, called);
3567                collect_called_function_names_in_statements(finalbody, called);
3568                for handler in handlers {
3569                    let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3570                        body,
3571                        ..
3572                    }) = handler;
3573                    collect_called_function_names_in_statements(body, called);
3574                }
3575            }
3576            _ => {}
3577        }
3578    }
3579}
3580
3581fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3582    if let ast::Expr::Call(ast::ExprCall {
3583        func, arguments, ..
3584    }) = expr
3585    {
3586        if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3587            called.insert(id.as_str().to_string());
3588        }
3589
3590        for arg in arguments.args.iter() {
3591            collect_called_function_names_in_expr(arg, called);
3592        }
3593        for keyword in arguments.keywords.iter() {
3594            collect_called_function_names_in_expr(&keyword.value, called);
3595        }
3596    }
3597}
3598
3599struct SetupCallFinder<'a> {
3600    aliases: &'a SetupAliases,
3601    called_function_names: HashSet<String>,
3602    nodes_visited: usize,
3603}
3604
3605impl<'a> SetupCallFinder<'a> {
3606    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3607        for stmt in statements {
3608            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3609                return None;
3610            }
3611            self.nodes_visited += 1;
3612
3613            let found = match stmt {
3614                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3615                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3616                ast::Stmt::If(ast::StmtIf {
3617                    body,
3618                    elif_else_clauses,
3619                    ..
3620                }) => self.find_in_statements(body).or_else(|| {
3621                    for clause in elif_else_clauses {
3622                        if let Some(found) = self.find_in_statements(&clause.body) {
3623                            return Some(found);
3624                        }
3625                    }
3626                    None
3627                }),
3628                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3629                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3630                    .find_in_statements(body)
3631                    .or_else(|| self.find_in_statements(orelse)),
3632                ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3633                    .called_function_names
3634                    .contains(name.as_str())
3635                    .then(|| self.find_in_statements(body))
3636                    .flatten(),
3637                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3638                ast::Stmt::Try(ast::StmtTry {
3639                    body,
3640                    orelse,
3641                    finalbody,
3642                    handlers,
3643                    ..
3644                }) => self
3645                    .find_in_statements(body)
3646                    .or_else(|| self.find_in_statements(orelse))
3647                    .or_else(|| self.find_in_statements(finalbody))
3648                    .or_else(|| {
3649                        for handler in handlers {
3650                            let ast::ExceptHandler::ExceptHandler(
3651                                ast::ExceptHandlerExceptHandler { body, .. },
3652                            ) = handler;
3653                            if let Some(found) = self.find_in_statements(body) {
3654                                return Some(found);
3655                            }
3656                        }
3657                        None
3658                    }),
3659                _ => None,
3660            };
3661
3662            if found.is_some() {
3663                return found;
3664            }
3665        }
3666
3667        None
3668    }
3669
3670    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3671        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3672            return None;
3673        }
3674        self.nodes_visited += 1;
3675
3676        match expr {
3677            ast::Expr::Call(ast::ExprCall { func, .. })
3678                if is_setup_call(func.as_ref(), self.aliases) =>
3679            {
3680                Some(expr)
3681            }
3682            _ => None,
3683        }
3684    }
3685}
3686
3687fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3688    let Some(dotted) = dotted_name(func, 0) else {
3689        return false;
3690    };
3691
3692    if aliases.setup_names.contains(&dotted) {
3693        return true;
3694    }
3695
3696    let Some(module) = dotted.strip_suffix(".setup") else {
3697        return false;
3698    };
3699
3700    let resolved = resolve_module_alias(module, aliases);
3701    is_setup_module(&resolved)
3702}
3703
3704fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3705    if depth >= MAX_SETUP_PY_AST_DEPTH {
3706        return None;
3707    }
3708
3709    match expr {
3710        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3711        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3712            let base = dotted_name(value.as_ref(), depth + 1)?;
3713            Some(format!("{}.{}", base, attr.as_str()))
3714        }
3715        _ => None,
3716    }
3717}
3718
3719fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3720    if let Some(mapped) = aliases.module_aliases.get(module) {
3721        return mapped.clone();
3722    }
3723
3724    let Some((base, rest)) = module.split_once('.') else {
3725        return module.to_string();
3726    };
3727
3728    if let Some(mapped) = aliases.module_aliases.get(base) {
3729        return format!("{}.{}", mapped, rest);
3730    }
3731
3732    module.to_string()
3733}
3734
3735fn extract_setup_keywords(
3736    call_expr: &ast::Expr,
3737    evaluator: &mut LiteralEvaluator,
3738) -> HashMap<String, Value> {
3739    let mut values = HashMap::new();
3740    let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3741        return values;
3742    };
3743
3744    for keyword in arguments.keywords.iter() {
3745        if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3746            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3747                values.insert(arg.to_string(), value);
3748            }
3749        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3750            for (key, value) in dict {
3751                values.insert(key, value);
3752            }
3753        }
3754    }
3755
3756    values
3757}
3758
3759fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3760    let name = get_value_string(values, "name").map(truncate_field);
3761    let version = get_value_string(values, "version").map(truncate_field);
3762    let description = get_value_string(values, "description")
3763        .or_else(|| get_value_string(values, "summary"))
3764        .map(truncate_field);
3765    let homepage_url = get_value_string(values, "url")
3766        .or_else(|| get_value_string(values, "home_page"))
3767        .map(truncate_field);
3768    let author = get_value_string(values, "author").map(truncate_field);
3769    let author_email = get_value_string(values, "author_email");
3770    let maintainer = get_value_string(values, "maintainer").map(truncate_field);
3771    let maintainer_email = get_value_string(values, "maintainer_email");
3772    let license = get_value_string(values, "license").map(truncate_field);
3773    let classifiers = values
3774        .get("classifiers")
3775        .and_then(value_to_string_list)
3776        .unwrap_or_default();
3777
3778    let mut parties = Vec::new();
3779    if author.is_some() || author_email.is_some() {
3780        parties.push(Party {
3781            r#type: Some("person".to_string()),
3782            role: Some("author".to_string()),
3783            name: author,
3784            email: author_email,
3785            url: None,
3786            organization: None,
3787            organization_url: None,
3788            timezone: None,
3789        });
3790    }
3791
3792    if maintainer.is_some() || maintainer_email.is_some() {
3793        parties.push(Party {
3794            r#type: Some("person".to_string()),
3795            role: Some("maintainer".to_string()),
3796            name: maintainer,
3797            email: maintainer_email,
3798            url: None,
3799            organization: None,
3800            organization_url: None,
3801            timezone: None,
3802        });
3803    }
3804
3805    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3806        normalize_spdx_declared_license(license.as_deref());
3807    let extracted_license_statement = license.clone();
3808
3809    let dependencies = build_setup_py_dependencies(values);
3810    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3811    let mut homepage_from_project_urls = None;
3812    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3813    let mut extra_data = HashMap::new();
3814
3815    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3816        apply_project_url_mappings(
3817            &parsed_project_urls,
3818            &mut homepage_from_project_urls,
3819            &mut bug_tracking_url,
3820            &mut code_view_url,
3821            &mut vcs_url,
3822            &mut extra_data,
3823        );
3824    }
3825
3826    let extra_data = if extra_data.is_empty() {
3827        None
3828    } else {
3829        Some(extra_data)
3830    };
3831
3832    PackageData {
3833        package_type: Some(PythonParser::PACKAGE_TYPE),
3834        namespace: None,
3835        name,
3836        version,
3837        qualifiers: None,
3838        subpath: None,
3839        primary_language: Some("Python".to_string()),
3840        description,
3841        release_date: None,
3842        parties,
3843        keywords: Vec::new(),
3844        homepage_url: homepage_url.or(homepage_from_project_urls),
3845        download_url: None,
3846        size: None,
3847        sha1: None,
3848        md5: None,
3849        sha256: None,
3850        sha512: None,
3851        bug_tracking_url,
3852        code_view_url,
3853        vcs_url,
3854        copyright: None,
3855        holder: None,
3856        declared_license_expression,
3857        declared_license_expression_spdx,
3858        license_detections,
3859        other_license_expression: None,
3860        other_license_expression_spdx: None,
3861        other_license_detections: Vec::new(),
3862        extracted_license_statement,
3863        notice_text: None,
3864        source_packages: Vec::new(),
3865        file_references: Vec::new(),
3866        is_private: has_private_classifier(&classifiers),
3867        is_virtual: false,
3868        extra_data,
3869        dependencies,
3870        repository_homepage_url: None,
3871        repository_download_url: None,
3872        api_data_url: None,
3873        datasource_id: Some(DatasourceId::PypiSetupPy),
3874        purl,
3875    }
3876}
3877
3878fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3879    let mut dependencies = Vec::new();
3880
3881    if let Some(reqs) = values
3882        .get("install_requires")
3883        .and_then(value_to_string_list)
3884    {
3885        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3886    }
3887
3888    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3889        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3890    }
3891
3892    if let Some(Value::Dict(extras)) = values.get("extras_require") {
3893        let mut extra_items: Vec<_> = extras.iter().collect();
3894        extra_items.sort_by_key(|(name, _)| *name);
3895        for (extra_name, extra_value) in extra_items {
3896            if let Some(reqs) = value_to_string_list(extra_value) {
3897                dependencies.extend(build_setup_py_dependency_list(
3898                    reqs.as_slice(),
3899                    extra_name,
3900                    true,
3901                ));
3902            }
3903        }
3904    }
3905
3906    dependencies
3907}
3908
3909fn build_setup_py_dependency_list(
3910    reqs: &[String],
3911    scope: &str,
3912    is_optional: bool,
3913) -> Vec<Dependency> {
3914    reqs.iter()
3915        .filter_map(|req| build_python_dependency(req, scope, is_optional, None))
3916        .collect()
3917}
3918
3919fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3920    values.get(key).and_then(value_to_string)
3921}
3922
3923fn value_to_string(value: &Value) -> Option<String> {
3924    match value {
3925        Value::String(value) => Some(value.clone()),
3926        Value::Number(value) => Some(value.to_string()),
3927        Value::Bool(value) => Some(value.to_string()),
3928        _ => None,
3929    }
3930}
3931
3932fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3933    match value {
3934        Value::String(value) => Some(vec![value.clone()]),
3935        Value::List(values) | Value::Tuple(values) => {
3936            let mut items = Vec::new();
3937            for item in values {
3938                items.push(value_to_string(item)?);
3939            }
3940            Some(items)
3941        }
3942        _ => None,
3943    }
3944}
3945
3946fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3947    let Value::Dict(dict) = value else {
3948        return None;
3949    };
3950
3951    let mut pairs: Vec<(String, String)> = dict
3952        .iter()
3953        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3954        .collect::<Option<Vec<_>>>()?;
3955    pairs.sort_by(|left, right| left.0.cmp(&right.0));
3956    Some(pairs)
3957}
3958
3959fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3960    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3961    extract_requires_dist_dependencies(&requires_dist)
3962}
3963
3964pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3965    requires_dist
3966        .iter()
3967        .filter_map(|entry| build_rfc822_dependency(entry))
3968        .collect()
3969}
3970
3971fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3972    build_python_dependency(entry, "install", false, None)
3973}
3974
3975fn build_python_dependency(
3976    entry: &str,
3977    default_scope: &str,
3978    default_optional: bool,
3979    marker_override: Option<&str>,
3980) -> Option<Dependency> {
3981    let (requirement_part, marker_part) = entry
3982        .split_once(';')
3983        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3984        .unwrap_or((entry.trim(), None));
3985
3986    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3987    let requirement = normalize_rfc822_requirement(requirement_part);
3988    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3989        marker_part.or(marker_override),
3990        default_scope,
3991        default_optional,
3992    );
3993    let purl = build_python_dependency_purl(&name, None)?;
3994
3995    let is_pinned = requirement
3996        .as_deref()
3997        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3998    let purl = if is_pinned {
3999        requirement
4000            .as_deref()
4001            .map(|req| req.trim_start_matches('='))
4002            .and_then(|version| build_python_dependency_purl(&name, Some(version)))
4003            .unwrap_or(purl)
4004    } else {
4005        purl
4006    };
4007
4008    let mut extra_data = HashMap::new();
4009    extra_data.extend(marker_data);
4010    if let Some(marker) = marker {
4011        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
4012    }
4013
4014    Some(Dependency {
4015        purl: Some(purl),
4016        extracted_requirement: requirement,
4017        scope: Some(scope),
4018        is_runtime: Some(true),
4019        is_optional: Some(is_optional),
4020        is_pinned: Some(is_pinned),
4021        is_direct: Some(true),
4022        resolved_package: None,
4023        extra_data: if extra_data.is_empty() {
4024            None
4025        } else {
4026            Some(extra_data)
4027        },
4028    })
4029}
4030
4031fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
4032    let name = extract_setup_cfg_dependency_name(requirement_part)?;
4033    let trimmed = requirement_part.trim();
4034    let mut remainder = trimmed[name.len()..].trim();
4035
4036    if let Some(stripped) = remainder.strip_prefix('[')
4037        && let Some(end_idx) = stripped.find(']')
4038    {
4039        remainder = stripped[end_idx + 1..].trim();
4040    }
4041
4042    let remainder = remainder
4043        .strip_prefix('(')
4044        .and_then(|value| value.strip_suffix(')'))
4045        .unwrap_or(remainder)
4046        .trim();
4047
4048    if remainder.is_empty() {
4049        return None;
4050    }
4051
4052    let mut specifiers: Vec<String> = remainder
4053        .split(',')
4054        .map(|specifier| specifier.trim().replace(' ', ""))
4055        .filter(|specifier| !specifier.is_empty())
4056        .collect();
4057    specifiers.sort();
4058    Some(specifiers.join(","))
4059}
4060
/// Percent-encodes every `*` in `version` as `%2A`; all other characters are kept as-is.
fn encode_python_dependency_purl_version(version: &str) -> String {
    let pieces: Vec<&str> = version.split('*').collect();
    pieces.join("%2A")
}
4064
4065fn build_python_dependency_purl(name: &str, version: Option<&str>) -> Option<String> {
4066    let normalized_name = normalize_python_dependency_name(name);
4067
4068    PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &normalized_name)
4069        .ok()
4070        .map(|_| match version {
4071            Some(version) => {
4072                format!(
4073                    "pkg:pypi/{normalized_name}@{}",
4074                    encode_python_dependency_purl_version(version)
4075                )
4076            }
4077            None => format!("pkg:pypi/{normalized_name}"),
4078        })
4079}
4080
/// Normalizes a dependency name: surrounding whitespace is removed, ASCII letters are
/// lowercased and every `_` becomes `-`.
fn normalize_python_dependency_name(name: &str) -> String {
    name.trim()
        .chars()
        .map(|ch| if ch == '_' { '-' } else { ch.to_ascii_lowercase() })
        .collect()
}
4084
4085fn parse_rfc822_marker(
4086    marker_part: Option<&str>,
4087    default_scope: &str,
4088    default_optional: bool,
4089) -> (
4090    String,
4091    bool,
4092    Option<String>,
4093    HashMap<String, serde_json::Value>,
4094) {
4095    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
4096        return (
4097            default_scope.to_string(),
4098            default_optional,
4099            None,
4100            HashMap::new(),
4101        );
4102    };
4103
4104    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
4105        .expect("extra marker regex should compile");
4106    let mut extra_data = HashMap::new();
4107
4108    if let Some(python_version) = extract_marker_field(marker, "python_version") {
4109        extra_data.insert(
4110            "python_version".to_string(),
4111            serde_json::Value::String(python_version),
4112        );
4113    }
4114    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
4115        extra_data.insert(
4116            "sys_platform".to_string(),
4117            serde_json::Value::String(sys_platform),
4118        );
4119    }
4120
4121    if let Some(captures) = extra_re.captures(marker)
4122        && let Some(scope) = captures.get(1)
4123    {
4124        return (
4125            scope.as_str().to_string(),
4126            true,
4127            Some(marker.trim().to_string()),
4128            extra_data,
4129        );
4130    }
4131
4132    (
4133        default_scope.to_string(),
4134        default_optional,
4135        Some(marker.trim().to_string()),
4136        extra_data,
4137    )
4138}
4139
4140fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
4141    let re = Regex::new(&format!(
4142        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
4143        field
4144    ))
4145    .ok()?;
4146    let captures = re.captures(marker)?;
4147    let operator = captures.get(1)?.as_str();
4148    let value = captures.get(2)?.as_str();
4149    Some(format!("{} {}", operator, value))
4150}
4151
4152fn parse_requires_txt(content: &str) -> Vec<Dependency> {
4153    let mut dependencies = Vec::new();
4154    let mut current_scope = "install".to_string();
4155    let mut current_optional = false;
4156    let mut current_marker: Option<String> = None;
4157    let mut line_count = 0usize;
4158
4159    for line in content.lines() {
4160        line_count += 1;
4161        if line_count > MAX_ITERATION_COUNT {
4162            warn!(
4163                "Exceeded max line count in requires.txt; stopping at {} lines",
4164                MAX_ITERATION_COUNT
4165            );
4166            break;
4167        }
4168        let trimmed = line.trim();
4169        if trimmed.is_empty() || trimmed.starts_with('#') {
4170            continue;
4171        }
4172
4173        if trimmed.starts_with('[') && trimmed.ends_with(']') {
4174            let inner = &trimmed[1..trimmed.len() - 1];
4175            if let Some(rest) = inner.strip_prefix(':') {
4176                current_scope = "install".to_string();
4177                current_optional = false;
4178                current_marker = Some(rest.trim().to_string());
4179            } else if let Some((scope, marker)) = inner.split_once(':') {
4180                current_scope = scope.trim().to_string();
4181                current_optional = true;
4182                current_marker = Some(marker.trim().to_string());
4183            } else {
4184                current_scope = inner.trim().to_string();
4185                current_optional = true;
4186                current_marker = None;
4187            }
4188            continue;
4189        }
4190
4191        if let Some(dependency) = build_python_dependency(
4192            trimmed,
4193            &current_scope,
4194            current_optional,
4195            current_marker.as_deref(),
4196        ) {
4197            dependencies.push(dependency);
4198        }
4199    }
4200
4201    dependencies
4202}
4203
/// Reports whether any classifier equals `Private :: Do Not Upload`, ignoring ASCII case.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
4209
4210fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
4211    let name = name?;
4212    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
4213    if let Some(version) = version {
4214        package_url.with_version(version).ok()?;
4215    }
4216    Some(package_url.to_string())
4217}
4218
4219fn extract_from_setup_py_regex(content: &str) -> PackageData {
4220    let name = extract_setup_value(content, "name").map(truncate_field);
4221    let version = extract_setup_value(content, "version").map(truncate_field);
4222    let license_expression = extract_setup_value(content, "license").map(truncate_field);
4223
4224    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4225        normalize_spdx_declared_license(license_expression.as_deref());
4226    let extracted_license_statement = license_expression.clone();
4227
4228    let dependencies = extract_setup_py_dependencies(content);
4229    let homepage_url = extract_setup_value(content, "url").map(truncate_field);
4230    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
4231
4232    PackageData {
4233        package_type: Some(PythonParser::PACKAGE_TYPE),
4234        namespace: None,
4235        name,
4236        version,
4237        qualifiers: None,
4238        subpath: None,
4239        primary_language: Some("Python".to_string()),
4240        description: None,
4241        release_date: None,
4242        parties: Vec::new(),
4243        keywords: Vec::new(),
4244        homepage_url,
4245        download_url: None,
4246        size: None,
4247        sha1: None,
4248        md5: None,
4249        sha256: None,
4250        sha512: None,
4251        bug_tracking_url: None,
4252        code_view_url: None,
4253        vcs_url: None,
4254        copyright: None,
4255        holder: None,
4256        declared_license_expression,
4257        declared_license_expression_spdx,
4258        license_detections,
4259        other_license_expression: None,
4260        other_license_expression_spdx: None,
4261        other_license_detections: Vec::new(),
4262        extracted_license_statement,
4263        notice_text: None,
4264        source_packages: Vec::new(),
4265        file_references: Vec::new(),
4266        is_private: false,
4267        is_virtual: false,
4268        extra_data: None,
4269        dependencies,
4270        repository_homepage_url: None,
4271        repository_download_url: None,
4272        api_data_url: None,
4273        datasource_id: Some(DatasourceId::PypiSetupPy),
4274        purl,
4275    }
4276}
4277
4278fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
4279    crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
4280}
4281
4282fn extract_from_pypi_json(path: &Path) -> PackageData {
4283    let default = PackageData {
4284        package_type: Some(PythonParser::PACKAGE_TYPE),
4285        datasource_id: Some(DatasourceId::PypiJson),
4286        ..Default::default()
4287    };
4288
4289    let content = match read_file_to_string(path, None) {
4290        Ok(content) => content,
4291        Err(error) => {
4292            warn!("Failed to read pypi.json at {:?}: {}", path, error);
4293            return default;
4294        }
4295    };
4296
4297    let root: serde_json::Value = match serde_json::from_str(&content) {
4298        Ok(value) => value,
4299        Err(error) => {
4300            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
4301            return default;
4302        }
4303    };
4304
4305    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
4306        warn!("No info object found in pypi.json at {:?}", path);
4307        return default;
4308    };
4309
4310    let name = info
4311        .get("name")
4312        .and_then(|value| value.as_str())
4313        .map(|v| truncate_field(v.to_owned()));
4314    let version = info
4315        .get("version")
4316        .and_then(|value| value.as_str())
4317        .map(ToOwned::to_owned);
4318    let summary = info
4319        .get("summary")
4320        .and_then(|value| value.as_str())
4321        .map(|v| truncate_field(v.to_owned()));
4322    let description = info
4323        .get("description")
4324        .and_then(|value| value.as_str())
4325        .filter(|value| !value.trim().is_empty())
4326        .map(|v| truncate_field(v.to_owned()))
4327        .or(summary);
4328    let mut homepage_url = info
4329        .get("home_page")
4330        .and_then(|value| value.as_str())
4331        .map(|v| truncate_field(v.to_owned()));
4332    let author = info
4333        .get("author")
4334        .and_then(|value| value.as_str())
4335        .filter(|value| !value.trim().is_empty())
4336        .map(|v| truncate_field(v.to_owned()));
4337    let author_email = info
4338        .get("author_email")
4339        .and_then(|value| value.as_str())
4340        .filter(|value| !value.trim().is_empty())
4341        .map(ToOwned::to_owned);
4342    let license = info
4343        .get("license")
4344        .and_then(|value| value.as_str())
4345        .filter(|value| !value.trim().is_empty())
4346        .map(ToOwned::to_owned);
4347    let keywords = parse_setup_cfg_keywords(
4348        info.get("keywords")
4349            .and_then(|value| value.as_str())
4350            .map(ToOwned::to_owned),
4351    );
4352    let classifiers = info
4353        .get("classifiers")
4354        .and_then(|value| value.as_array())
4355        .map(|values| {
4356            values
4357                .iter()
4358                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
4359                .collect::<Vec<_>>()
4360        })
4361        .unwrap_or_default();
4362
4363    let mut parties = Vec::new();
4364    if author.is_some() || author_email.is_some() {
4365        parties.push(Party {
4366            r#type: Some("person".to_string()),
4367            role: Some("author".to_string()),
4368            name: author,
4369            email: author_email,
4370            url: None,
4371            organization: None,
4372            organization_url: None,
4373            timezone: None,
4374        });
4375    }
4376
4377    let mut bug_tracking_url = None;
4378    let mut code_view_url = None;
4379    let mut vcs_url = None;
4380    let mut extra_data = HashMap::new();
4381
4382    let parsed_project_urls = info
4383        .get("project_urls")
4384        .and_then(|value| value.as_object())
4385        .map(|map| {
4386            let mut pairs: Vec<(String, String)> = map
4387                .iter()
4388                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
4389                .collect();
4390            pairs.sort_by(|left, right| left.0.cmp(&right.0));
4391            pairs
4392        })
4393        .unwrap_or_default();
4394
4395    apply_project_url_mappings(
4396        &parsed_project_urls,
4397        &mut homepage_url,
4398        &mut bug_tracking_url,
4399        &mut code_view_url,
4400        &mut vcs_url,
4401        &mut extra_data,
4402    );
4403
4404    let (download_url, size, sha256) = root
4405        .get("urls")
4406        .and_then(|value| value.as_array())
4407        .map(|urls| select_pypi_json_artifact(urls))
4408        .unwrap_or((None, None, None));
4409
4410    let sha256 = sha256.and_then(|h| Sha256Digest::from_hex(&h).ok());
4411
4412    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4413        normalize_spdx_declared_license(license.as_deref());
4414    let dependencies = info
4415        .get("requires_dist")
4416        .and_then(|value| value.as_array())
4417        .map(|entries| {
4418            entries
4419                .iter()
4420                .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4421                .collect::<Vec<_>>()
4422        })
4423        .map(|entries| extract_requires_dist_dependencies(&entries))
4424        .unwrap_or_default();
4425
4426    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
4427        build_pypi_urls(name.as_deref(), version.as_deref());
4428
4429    PackageData {
4430        package_type: Some(PythonParser::PACKAGE_TYPE),
4431        namespace: None,
4432        name,
4433        version,
4434        qualifiers: None,
4435        subpath: None,
4436        primary_language: None,
4437        description,
4438        release_date: None,
4439        parties,
4440        keywords,
4441        homepage_url: homepage_url.or(repository_homepage_url.clone()),
4442        download_url,
4443        size,
4444        sha1: None,
4445        md5: None,
4446        sha256,
4447        sha512: None,
4448        bug_tracking_url,
4449        code_view_url,
4450        vcs_url,
4451        copyright: None,
4452        holder: None,
4453        declared_license_expression,
4454        declared_license_expression_spdx,
4455        license_detections,
4456        other_license_expression: None,
4457        other_license_expression_spdx: None,
4458        other_license_detections: Vec::new(),
4459        extracted_license_statement: license,
4460        notice_text: None,
4461        source_packages: Vec::new(),
4462        file_references: Vec::new(),
4463        is_private: has_private_classifier(&classifiers),
4464        is_virtual: false,
4465        extra_data: if extra_data.is_empty() {
4466            None
4467        } else {
4468            Some(extra_data)
4469        },
4470        dependencies,
4471        repository_homepage_url,
4472        repository_download_url,
4473        api_data_url,
4474        datasource_id: Some(DatasourceId::PypiJson),
4475        purl,
4476    }
4477}
4478
4479fn select_pypi_json_artifact(
4480    urls: &[serde_json::Value],
4481) -> (Option<String>, Option<u64>, Option<String>) {
4482    let selected = urls
4483        .iter()
4484        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4485        .or_else(|| urls.first());
4486
4487    let Some(entry) = selected else {
4488        return (None, None, None);
4489    };
4490
4491    let download_url = entry
4492        .get("url")
4493        .and_then(|value| value.as_str())
4494        .map(ToOwned::to_owned);
4495    let size = entry.get("size").and_then(|value| value.as_u64());
4496    let sha256 = entry
4497        .get("digests")
4498        .and_then(|value| value.as_object())
4499        .and_then(|digests| digests.get("sha256"))
4500        .and_then(|value| value.as_str())
4501        .map(ToOwned::to_owned);
4502
4503    (download_url, size, sha256)
4504}
4505
4506fn extract_from_pip_inspect(path: &Path) -> PackageData {
4507    let content = match read_file_to_string(path, None) {
4508        Ok(content) => content,
4509        Err(e) => {
4510            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4511            return default_package_data(path);
4512        }
4513    };
4514
4515    let root: serde_json::Value = match serde_json::from_str(&content) {
4516        Ok(value) => value,
4517        Err(e) => {
4518            warn!(
4519                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4520                path, e
4521            );
4522            return default_package_data(path);
4523        }
4524    };
4525
4526    let installed = match root.get("installed").and_then(|v| v.as_array()) {
4527        Some(arr) => arr,
4528        None => {
4529            warn!(
4530                "No 'installed' array found in pip-inspect.deplock at {:?}",
4531                path
4532            );
4533            return default_package_data(path);
4534        }
4535    };
4536
4537    let pip_version = root
4538        .get("pip_version")
4539        .and_then(|v| v.as_str())
4540        .map(String::from);
4541    let inspect_version = root
4542        .get("version")
4543        .and_then(|v| v.as_str())
4544        .map(String::from);
4545
4546    let mut main_package: Option<PackageData> = None;
4547    let mut dependencies: Vec<Dependency> = Vec::new();
4548
4549    for package_entry in installed {
4550        let metadata = match package_entry.get("metadata") {
4551            Some(m) => m,
4552            None => continue,
4553        };
4554
4555        let is_requested = package_entry
4556            .get("requested")
4557            .and_then(|v| v.as_bool())
4558            .unwrap_or(false);
4559        let has_direct_url = package_entry.get("direct_url").is_some();
4560
4561        let name = metadata
4562            .get("name")
4563            .and_then(|v| v.as_str())
4564            .map(|v| truncate_field(v.to_string()));
4565        let version = metadata
4566            .get("version")
4567            .and_then(|v| v.as_str())
4568            .map(String::from);
4569        let summary = metadata
4570            .get("summary")
4571            .and_then(|v| v.as_str())
4572            .map(|v| truncate_field(v.to_string()));
4573        let home_page = metadata
4574            .get("home_page")
4575            .and_then(|v| v.as_str())
4576            .map(|v| truncate_field(v.to_string()));
4577        let author = metadata
4578            .get("author")
4579            .and_then(|v| v.as_str())
4580            .map(|v| truncate_field(v.to_string()));
4581        let author_email = metadata
4582            .get("author_email")
4583            .and_then(|v| v.as_str())
4584            .map(String::from);
4585        let license = metadata
4586            .get("license")
4587            .and_then(|v| v.as_str())
4588            .map(|v| truncate_field(v.to_string()));
4589        let description = metadata
4590            .get("description")
4591            .and_then(|v| v.as_str())
4592            .map(|v| truncate_field(v.to_string()));
4593        let keywords = metadata
4594            .get("keywords")
4595            .and_then(|v| v.as_array())
4596            .map(|arr| {
4597                arr.iter()
4598                    .filter_map(|k| k.as_str().map(String::from))
4599                    .collect::<Vec<_>>()
4600            })
4601            .unwrap_or_default();
4602
4603        let mut parties = Vec::new();
4604        if author.is_some() || author_email.is_some() {
4605            parties.push(Party {
4606                r#type: Some("person".to_string()),
4607                role: Some("author".to_string()),
4608                name: author,
4609                email: author_email,
4610                url: None,
4611                organization: None,
4612                organization_url: None,
4613                timezone: None,
4614            });
4615        }
4616
4617        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4618            normalize_spdx_declared_license(license.as_deref());
4619        let extracted_license_statement = license.clone();
4620        let requires_dist = metadata
4621            .get("requires_dist")
4622            .and_then(|v| v.as_array())
4623            .map(|entries| {
4624                entries
4625                    .iter()
4626                    .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4627                    .collect::<Vec<_>>()
4628            })
4629            .unwrap_or_default();
4630        let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4631
4632        let purl = name.as_ref().and_then(|n| {
4633            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4634            if let Some(v) = &version {
4635                package_url.with_version(v).ok()?;
4636            }
4637            Some(package_url.to_string())
4638        });
4639
4640        if is_requested && has_direct_url {
4641            let mut extra_data = HashMap::new();
4642            if let Some(pv) = &pip_version {
4643                extra_data.insert(
4644                    "pip_version".to_string(),
4645                    serde_json::Value::String(pv.clone()),
4646                );
4647            }
4648            if let Some(iv) = &inspect_version {
4649                extra_data.insert(
4650                    "inspect_version".to_string(),
4651                    serde_json::Value::String(iv.clone()),
4652                );
4653            }
4654
4655            main_package = Some(PackageData {
4656                package_type: Some(PythonParser::PACKAGE_TYPE),
4657                namespace: None,
4658                name,
4659                version,
4660                qualifiers: None,
4661                subpath: None,
4662                primary_language: Some("Python".to_string()),
4663                description: description.or(summary),
4664                release_date: None,
4665                parties,
4666                keywords,
4667                homepage_url: home_page,
4668                download_url: None,
4669                size: None,
4670                sha1: None,
4671                md5: None,
4672                sha256: None,
4673                sha512: None,
4674                bug_tracking_url: None,
4675                code_view_url: None,
4676                vcs_url: None,
4677                copyright: None,
4678                holder: None,
4679                declared_license_expression,
4680                declared_license_expression_spdx,
4681                license_detections,
4682                other_license_expression: None,
4683                other_license_expression_spdx: None,
4684                other_license_detections: Vec::new(),
4685                extracted_license_statement,
4686                notice_text: None,
4687                source_packages: Vec::new(),
4688                file_references: Vec::new(),
4689                is_private: false,
4690                is_virtual: true,
4691                extra_data: if extra_data.is_empty() {
4692                    None
4693                } else {
4694                    Some(extra_data)
4695                },
4696                dependencies: parsed_dependencies,
4697                repository_homepage_url: None,
4698                repository_download_url: None,
4699                api_data_url: None,
4700                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4701                purl,
4702            });
4703        } else {
4704            let resolved_package = PackageData {
4705                package_type: Some(PythonParser::PACKAGE_TYPE),
4706                namespace: None,
4707                name: name.clone(),
4708                version: version.clone(),
4709                qualifiers: None,
4710                subpath: None,
4711                primary_language: Some("Python".to_string()),
4712                description: description.or(summary),
4713                release_date: None,
4714                parties,
4715                keywords,
4716                homepage_url: home_page,
4717                download_url: None,
4718                size: None,
4719                sha1: None,
4720                md5: None,
4721                sha256: None,
4722                sha512: None,
4723                bug_tracking_url: None,
4724                code_view_url: None,
4725                vcs_url: None,
4726                copyright: None,
4727                holder: None,
4728                declared_license_expression,
4729                declared_license_expression_spdx,
4730                license_detections,
4731                other_license_expression: None,
4732                other_license_expression_spdx: None,
4733                other_license_detections: Vec::new(),
4734                extracted_license_statement,
4735                notice_text: None,
4736                source_packages: Vec::new(),
4737                file_references: Vec::new(),
4738                is_private: false,
4739                is_virtual: true,
4740                extra_data: None,
4741                dependencies: parsed_dependencies,
4742                repository_homepage_url: None,
4743                repository_download_url: None,
4744                api_data_url: None,
4745                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4746                purl: purl.clone(),
4747            };
4748
4749            let resolved = package_data_to_resolved(&resolved_package);
4750            dependencies.push(Dependency {
4751                purl,
4752                extracted_requirement: None,
4753                scope: None,
4754                is_runtime: Some(true),
4755                is_optional: Some(false),
4756                is_pinned: Some(true),
4757                is_direct: Some(is_requested),
4758                resolved_package: Some(Box::new(resolved)),
4759                extra_data: None,
4760            });
4761        }
4762    }
4763
4764    if let Some(mut main_pkg) = main_package {
4765        let direct_requirement_purls: HashSet<String> = main_pkg
4766            .dependencies
4767            .iter()
4768            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4769            .collect();
4770
4771        let resolved_requirement_purls: HashSet<String> = dependencies
4772            .iter()
4773            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4774            .collect();
4775
4776        let unresolved_dependencies = main_pkg
4777            .dependencies
4778            .iter()
4779            .filter(|dep| {
4780                dep.purl.as_ref().is_some_and(|purl| {
4781                    !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4782                })
4783            })
4784            .cloned()
4785            .collect::<Vec<_>>();
4786
4787        for dependency in &mut dependencies {
4788            if dependency
4789                .purl
4790                .as_ref()
4791                .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4792            {
4793                dependency.is_direct = Some(true);
4794            }
4795        }
4796
4797        main_pkg.dependencies = dependencies;
4798        main_pkg.dependencies.extend(unresolved_dependencies);
4799        main_pkg
4800    } else {
4801        default_package_data(path)
4802    }
4803}
4804
/// Returns `purl` cut off before its first `@`, i.e. without the version part.
///
/// A purl that contains no `@` is returned unchanged.
fn base_dependency_purl(purl: &str) -> String {
    match purl.find('@') {
        Some(at) => purl[..at].to_owned(),
        None => purl.to_owned(),
    }
}
4810
/// Parsed INI content: section name -> option name -> list of values collected for that
/// option (the value on the option's own line, if any, followed by its continuation lines).
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4812
4813fn extract_from_setup_cfg(path: &Path) -> PackageData {
4814    let content = match read_file_to_string(path, None) {
4815        Ok(content) => content,
4816        Err(e) => {
4817            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4818            return default_package_data(path);
4819        }
4820    };
4821
4822    let sections = parse_setup_cfg(&content);
4823    let name = get_ini_value(&sections, "metadata", "name").map(truncate_field);
4824    let version = get_ini_value(&sections, "metadata", "version").map(truncate_field);
4825    let description = get_ini_value(&sections, "metadata", "description").map(truncate_field);
4826    let author = get_ini_value(&sections, "metadata", "author").map(truncate_field);
4827    let author_email = get_ini_value(&sections, "metadata", "author_email");
4828    let maintainer = get_ini_value(&sections, "metadata", "maintainer").map(truncate_field);
4829    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
4830    let license = get_ini_value(&sections, "metadata", "license").map(truncate_field);
4831    let mut homepage_url = get_ini_value(&sections, "metadata", "url").map(truncate_field);
4832    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
4833    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
4834    let python_requires = get_ini_value(&sections, "options", "python_requires");
4835    let parsed_project_urls =
4836        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
4837    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4838    let mut extra_data = HashMap::new();
4839
4840    let mut parties = Vec::new();
4841    if author.is_some() || author_email.is_some() {
4842        parties.push(Party {
4843            r#type: Some("person".to_string()),
4844            role: Some("author".to_string()),
4845            name: author,
4846            email: author_email,
4847            url: None,
4848            organization: None,
4849            organization_url: None,
4850            timezone: None,
4851        });
4852    }
4853
4854    if maintainer.is_some() || maintainer_email.is_some() {
4855        parties.push(Party {
4856            r#type: Some("person".to_string()),
4857            role: Some("maintainer".to_string()),
4858            name: maintainer,
4859            email: maintainer_email,
4860            url: None,
4861            organization: None,
4862            organization_url: None,
4863            timezone: None,
4864        });
4865    }
4866
4867    let declared_license_expression = None;
4868    let declared_license_expression_spdx = None;
4869    let license_detections = Vec::new();
4870    let extracted_license_statement = license.clone();
4871
4872    let dependencies = extract_setup_cfg_dependencies(&sections);
4873
4874    if let Some(value) = python_requires {
4875        extra_data.insert(
4876            "python_requires".to_string(),
4877            serde_json::Value::String(value),
4878        );
4879    }
4880
4881    apply_project_url_mappings(
4882        &parsed_project_urls,
4883        &mut homepage_url,
4884        &mut bug_tracking_url,
4885        &mut code_view_url,
4886        &mut vcs_url,
4887        &mut extra_data,
4888    );
4889
4890    let extra_data = if extra_data.is_empty() {
4891        None
4892    } else {
4893        Some(extra_data)
4894    };
4895
4896    let purl = name.as_ref().and_then(|n| {
4897        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4898        if let Some(v) = &version {
4899            package_url.with_version(v).ok()?;
4900        }
4901        Some(package_url.to_string())
4902    });
4903
4904    PackageData {
4905        package_type: Some(PythonParser::PACKAGE_TYPE),
4906        namespace: None,
4907        name,
4908        version,
4909        qualifiers: None,
4910        subpath: None,
4911        primary_language: Some("Python".to_string()),
4912        description,
4913        release_date: None,
4914        parties,
4915        keywords,
4916        homepage_url,
4917        download_url: None,
4918        size: None,
4919        sha1: None,
4920        md5: None,
4921        sha256: None,
4922        sha512: None,
4923        bug_tracking_url,
4924        code_view_url,
4925        vcs_url,
4926        copyright: None,
4927        holder: None,
4928        declared_license_expression,
4929        declared_license_expression_spdx,
4930        license_detections,
4931        other_license_expression: None,
4932        other_license_expression_spdx: None,
4933        other_license_detections: Vec::new(),
4934        extracted_license_statement,
4935        notice_text: None,
4936        source_packages: Vec::new(),
4937        file_references: Vec::new(),
4938        is_private: has_private_classifier(&classifiers),
4939        is_virtual: false,
4940        extra_data,
4941        dependencies,
4942        repository_homepage_url: None,
4943        repository_download_url: None,
4944        api_data_url: None,
4945        datasource_id: Some(DatasourceId::PypiSetupCfg),
4946        purl,
4947    }
4948}
4949
/// Splits a comma-separated `keywords` value into trimmed, non-empty keywords.
///
/// Returns an empty vector when `value` is `None`.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    value
        .map(|raw| {
            raw.split(',')
                .filter_map(|piece| {
                    let piece = piece.trim();
                    (!piece.is_empty()).then(|| piece.to_owned())
                })
                .collect()
        })
        .unwrap_or_default()
}
4962
/// Turns `label = url` lines into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries without `=`, or
/// with an empty label or URL, are dropped. Input order is preserved.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };
        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_owned(), url.to_owned()));
        }
    }

    pairs
}
4978
4979fn apply_project_url_mappings(
4980    parsed_urls: &[(String, String)],
4981    homepage_url: &mut Option<String>,
4982    bug_tracking_url: &mut Option<String>,
4983    code_view_url: &mut Option<String>,
4984    vcs_url: &mut Option<String>,
4985    extra_data: &mut HashMap<String, serde_json::Value>,
4986) {
4987    for (label, url) in parsed_urls {
4988        let label_lower = label.to_lowercase();
4989
4990        if bug_tracking_url.is_none()
4991            && matches!(
4992                label_lower.as_str(),
4993                "tracker"
4994                    | "bug reports"
4995                    | "bug tracker"
4996                    | "issues"
4997                    | "issue tracker"
4998                    | "github: issues"
4999            )
5000        {
5001            *bug_tracking_url = Some(url.clone());
5002        } else if code_view_url.is_none()
5003            && matches!(label_lower.as_str(), "source" | "source code" | "code")
5004        {
5005            *code_view_url = Some(url.clone());
5006        } else if vcs_url.is_none()
5007            && matches!(
5008                label_lower.as_str(),
5009                "github" | "gitlab" | "github: repo" | "repository"
5010            )
5011        {
5012            *vcs_url = Some(url.clone());
5013        } else if homepage_url.is_none()
5014            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
5015        {
5016            *homepage_url = Some(url.clone());
5017        } else if label_lower == "changelog" {
5018            extra_data.insert(
5019                "changelog_url".to_string(),
5020                serde_json::Value::String(url.clone()),
5021            );
5022        }
5023    }
5024
5025    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
5026        .iter()
5027        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
5028        .collect();
5029
5030    if !project_urls_json.is_empty() {
5031        extra_data.insert(
5032            "project_urls".to_string(),
5033            serde_json::Value::Object(project_urls_json),
5034        );
5035    }
5036}
5037
5038fn parse_setup_cfg(content: &str) -> IniSections {
5039    let mut sections: IniSections = HashMap::new();
5040    let mut current_section: Option<String> = None;
5041    let mut current_key: Option<String> = None;
5042
5043    for raw_line in content.lines() {
5044        let line = raw_line.trim_end_matches('\r');
5045        let trimmed = line.trim();
5046        if trimmed.is_empty() {
5047            continue;
5048        }
5049
5050        let stripped = line.trim_start();
5051        if stripped.starts_with('#') || stripped.starts_with(';') {
5052            continue;
5053        }
5054
5055        if stripped.starts_with('[') && stripped.ends_with(']') {
5056            let section_name = stripped
5057                .trim_start_matches('[')
5058                .trim_end_matches(']')
5059                .trim()
5060                .to_ascii_lowercase();
5061            current_section = if section_name.is_empty() {
5062                None
5063            } else {
5064                Some(section_name)
5065            };
5066            current_key = None;
5067            continue;
5068        }
5069
5070        if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
5071            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
5072                let value = stripped.trim();
5073                if !value.is_empty() {
5074                    sections
5075                        .entry(section.clone())
5076                        .or_default()
5077                        .entry(key.clone())
5078                        .or_default()
5079                        .push(value.to_string());
5080                }
5081            }
5082            continue;
5083        }
5084
5085        if let Some((key, value)) = stripped.split_once('=')
5086            && let Some(section) = current_section.as_ref()
5087        {
5088            let key_name = key.trim().to_ascii_lowercase();
5089            let value_trimmed = value.trim();
5090            let entry = sections
5091                .entry(section.clone())
5092                .or_default()
5093                .entry(key_name.clone())
5094                .or_default();
5095            if !value_trimmed.is_empty() {
5096                entry.push(value_trimmed.to_string());
5097            }
5098            current_key = Some(key_name);
5099        }
5100    }
5101
5102    sections
5103}
5104
5105fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
5106    sections
5107        .get(&section.to_ascii_lowercase())
5108        .and_then(|values| values.get(&key.to_ascii_lowercase()))
5109        .and_then(|entries| entries.first())
5110        .map(|value| value.trim().to_string())
5111}
5112
5113fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
5114    sections
5115        .get(&section.to_ascii_lowercase())
5116        .and_then(|values| values.get(&key.to_ascii_lowercase()))
5117        .cloned()
5118        .unwrap_or_default()
5119}
5120
5121fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
5122    let mut dependencies = Vec::new();
5123
5124    for (sub_section, scope) in [
5125        ("install_requires", "install"),
5126        ("tests_require", "test"),
5127        ("setup_requires", "setup"),
5128    ] {
5129        let reqs = get_ini_values(sections, "options", sub_section);
5130        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
5131    }
5132
5133    if let Some(extras) = sections.get("options.extras_require") {
5134        let mut extra_items: Vec<_> = extras.iter().collect();
5135        extra_items.sort_by_key(|(name, _)| *name);
5136        for (extra_name, reqs) in extra_items {
5137            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
5138        }
5139    }
5140
5141    dependencies
5142}
5143
5144fn parse_setup_cfg_requirements(
5145    reqs: &[String],
5146    scope: &str,
5147    is_optional: bool,
5148) -> Vec<Dependency> {
5149    reqs.iter()
5150        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
5151        .collect()
5152}
5153
5154fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
5155    let trimmed = req.trim();
5156    if trimmed.is_empty() || trimmed.starts_with('#') {
5157        return None;
5158    }
5159
5160    let name = extract_setup_cfg_dependency_name(trimmed)?;
5161    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5162
5163    Some(Dependency {
5164        purl: Some(purl.to_string()),
5165        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
5166        scope: Some(scope.to_string()),
5167        is_runtime: Some(true),
5168        is_optional: Some(is_optional),
5169        is_pinned: Some(false),
5170        is_direct: Some(true),
5171        resolved_package: None,
5172        extra_data: None,
5173    })
5174}
5175
/// Extracts the distribution name from a requirement string.
///
/// The name is everything before the first character that cannot belong to a PEP 508
/// name in this position: whitespace, a version-operator character (`<`, `>`, `=`, `!`,
/// `~`), `;` (marker), `[` (extras), `(` (parenthesized version spec) or `@` (direct URL
/// reference).
///
/// Returns `None` when the requirement is blank or starts with one of those characters.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();

    // `(` and `@` matter for forms written without a space, e.g. `requests(>=2.0)` or
    // `pip@https://...`; without them the delimiter would end up inside the name.
    let end = trimmed
        .find(|c: char| {
            c.is_whitespace() || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    let name = &trimmed[..end];
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
5192
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    // Concatenating the whitespace-separated pieces drops all whitespace.
    req.split_whitespace().collect()
}
5196
/// Finds a quoted `key = "value"` / `key = 'value'` assignment in `content` by plain text
/// search and returns the text between the quotes.
///
/// Double-quoted forms are tried before single-quoted ones, each with zero or one space
/// on either side of `=`. The value ends at the next occurrence of the *same* quote
/// character that opened it, so a value may contain the other quote kind
/// (`name="O'Reilly"`). A candidate whose closing quote is missing is skipped.
///
/// Returns `None` when no form matches.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    const QUOTES: [char; 2] = ['"', '\''];
    const SEPARATORS: [&str; 4] = ["=", " =", "= ", " = "];

    for quote in QUOTES {
        for separator in SEPARATORS {
            let pattern = format!("{key}{separator}{quote}");
            let Some(start_idx) = content.find(&pattern) else {
                continue;
            };

            let remaining = &content[start_idx + pattern.len()..];
            // Close on the opening quote kind only; the other kind is ordinary text.
            if let Some(end_idx) = remaining.find(quote) {
                return Some(remaining[..end_idx].to_string());
            }
        }
    }

    None
}
5222
5223fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
5224    let mut dependencies = Vec::new();
5225
5226    if let Some(tests_deps) = extract_tests_require(content) {
5227        dependencies.extend(tests_deps);
5228    }
5229
5230    if let Some(extras_deps) = extract_extras_require(content) {
5231        dependencies.extend(extras_deps);
5232    }
5233
5234    dependencies
5235}
5236
5237fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
5238    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
5239    let re = Regex::new(pattern).ok()?;
5240    let captures = re.captures(content)?;
5241    let deps_str = captures.get(1)?.as_str();
5242
5243    let deps = parse_setup_py_dep_list(deps_str, "test", true);
5244    if deps.is_empty() { None } else { Some(deps) }
5245}
5246
5247fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
5248    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
5249    let re = Regex::new(pattern).ok()?;
5250    let captures = re.captures(content)?;
5251    let dict_content = captures.get(1)?.as_str();
5252
5253    let mut all_deps = Vec::new();
5254
5255    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
5256    let entry_re = Regex::new(entry_pattern).ok()?;
5257
5258    for entry_cap in entry_re.captures_iter(dict_content) {
5259        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
5260            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
5261            all_deps.extend(deps);
5262        }
5263    }
5264
5265    if all_deps.is_empty() {
5266        None
5267    } else {
5268        Some(all_deps)
5269    }
5270}
5271
5272fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
5273    let dep_pattern = r#"['"]([^'"]+)['"]"#;
5274    let re = match Regex::new(dep_pattern) {
5275        Ok(r) => r,
5276        Err(_) => return Vec::new(),
5277    };
5278
5279    re.captures_iter(deps_str)
5280        .filter_map(|cap| {
5281            let dep_str = cap.get(1)?.as_str().trim();
5282            if dep_str.is_empty() {
5283                return None;
5284            }
5285
5286            let name = extract_setup_cfg_dependency_name(dep_str)?;
5287            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
5288
5289            Some(Dependency {
5290                purl: Some(purl.to_string()),
5291                extracted_requirement: Some(dep_str.to_string()),
5292                scope: Some(scope.to_string()),
5293                is_runtime: Some(true),
5294                is_optional: Some(is_optional),
5295                is_pinned: Some(false),
5296                is_direct: Some(true),
5297                resolved_package: None,
5298                extra_data: None,
5299            })
5300        })
5301        .collect()
5302}
5303
5304/// Reads and parses a TOML file
5305pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
5306    let content = read_file_to_string(path, None).map_err(|e| e.to_string())?;
5307    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
5308}
5309
5310/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
5311///
5312/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
5313/// Essential for SBOM compliance and package integrity verification.
5314///
5315/// # Returns
5316///
5317/// - `(Some(size), Some(hash))` on success
5318/// - `(None, None)` if file cannot be opened
5319/// - `(Some(size), None)` if hash calculation fails during read
5320fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<Sha256Digest>) {
5321    let mut file = match File::open(path) {
5322        Ok(f) => f,
5323        Err(_) => return (None, None),
5324    };
5325
5326    let metadata = match file.metadata() {
5327        Ok(m) => m,
5328        Err(_) => return (None, None),
5329    };
5330    let size = metadata.len();
5331
5332    let mut hasher = Sha256::new();
5333    let mut buffer = vec![0; 8192];
5334
5335    loop {
5336        match file.read(&mut buffer) {
5337            Ok(0) => break,
5338            Ok(n) => hasher.update(&buffer[..n]),
5339            Err(_) => return (Some(size), None),
5340        }
5341    }
5342
5343    let hash = Sha256Digest::from_bytes(hasher.finalize().into());
5344    (Some(size), Some(hash))
5345}
5346
5347fn default_package_data(path: &Path) -> PackageData {
5348    PackageData {
5349        package_type: Some(PythonParser::PACKAGE_TYPE),
5350        primary_language: Some("Python".to_string()),
5351        datasource_id: infer_python_datasource_id(path),
5352        ..Default::default()
5353    }
5354}
5355
5356fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
5357    let file_name = path.file_name().and_then(|name| name.to_str());
5358
5359    match file_name {
5360        Some("pyproject.toml") => {
5361            if read_toml_file(path)
5362                .ok()
5363                .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
5364                .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
5365                .is_some()
5366            {
5367                Some(DatasourceId::PypiPoetryPyprojectToml)
5368            } else {
5369                Some(DatasourceId::PypiPyprojectToml)
5370            }
5371        }
5372        Some(name)
5373            if name == "setup.py" || name.ends_with("_setup.py") || name.ends_with("-setup.py") =>
5374        {
5375            Some(DatasourceId::PypiSetupPy)
5376        }
5377        Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
5378        Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
5379        Some("METADATA") if is_installed_wheel_metadata_path(path) => {
5380            Some(DatasourceId::PypiWheelMetadata)
5381        }
5382        Some("pypi.json") => Some(DatasourceId::PypiJson),
5383        Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
5384        Some("origin.json") if is_pip_cache_origin_json(path) => {
5385            Some(DatasourceId::PypiPipOriginJson)
5386        }
5387        _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
5388            Some(DatasourceId::PypiSdist)
5389        }
5390        _ if path
5391            .extension()
5392            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
5393        {
5394            Some(DatasourceId::PypiWheel)
5395        }
5396        _ if path
5397            .extension()
5398            .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
5399        {
5400            Some(DatasourceId::PypiEgg)
5401        }
5402        _ => None,
5403    }
5404}
5405
5406crate::register_parser!(
5407    "Python package manifests (pyproject.toml, setup.py, suffixed setup.py variants, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
5408    &[
5409        "**/pyproject.toml",
5410        "**/setup.py",
5411        "**/*_setup.py",
5412        "**/*-setup.py",
5413        "**/setup.cfg",
5414        "**/pypi.json",
5415        "**/PKG-INFO",
5416        "**/*.dist-info/METADATA",
5417        "**/origin.json",
5418        "**/*.tar.gz",
5419        "**/*.tgz",
5420        "**/*.tar.bz2",
5421        "**/*.tar.xz",
5422        "**/*.zip",
5423        "**/*.whl",
5424        "**/*.egg"
5425    ],
5426    "pypi",
5427    "Python",
5428    Some("https://packaging.python.org/"),
5429);