Skip to main content

provenant/parsers/
python.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
35use crate::parser_warn as warn;
36use crate::parsers::utils::{read_file_to_string, split_name_email};
37use base64::Engine;
38use base64::engine::general_purpose::URL_SAFE_NO_PAD;
39use bzip2::read::BzDecoder;
40use csv::ReaderBuilder;
41use flate2::read::GzDecoder;
42use liblzma::read::XzDecoder;
43use packageurl::PackageUrl;
44use regex::Regex;
45use rustpython_parser::{Parse, ast};
46use serde_json::{Map as JsonMap, Value as JsonValue};
47use sha2::{Digest, Sha256};
48use std::collections::{HashMap, HashSet};
49use std::fs::File;
50use std::io::Read;
51use std::path::{Component, Path, PathBuf};
52use tar::Archive;
53use toml::Value as TomlValue;
54use toml::map::Map as TomlMap;
55use zip::ZipArchive;
56
57use super::PackageParser;
58use super::license_normalization::{
59    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
60    normalize_spdx_expression,
61};
62
// Field constants for pyproject.toml
// (table and key names; the lookups that use them live in the pyproject.toml parser).
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";
// Limits for setup.py analysis. NOTE(review): presumably these bound the source size and
// the AST walk (node count / nesting depth) — confirm against the setup.py parser.
const MAX_SETUP_PY_BYTES: usize = 1_048_576;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;
// Archive safety limits.
// MAX_ARCHIVE_SIZE caps both the on-disk size of an sdist archive and the running total of
// declared entry sizes while its entries are enumerated.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024; // 100MB
// Largest declared size accepted for a single archive entry.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB per file
// Highest accepted uncompressed:compressed ratio before an entry/archive is rejected.
const MAX_COMPRESSION_RATIO: f64 = 100.0; // 100:1 ratio
83
/// Parser for Python packaging manifests, metadata files and archives.
///
/// The [`PackageParser`] implementation dispatches on the file name or extension:
/// pyproject.toml, setup.py, setup.cfg, PKG-INFO, METADATA, pip cache `origin.json`,
/// `pypi.json`, `pip-inspect.deplock`, sdist archives, and `.whl` / `.egg` archives.
///
/// # Security
///
/// setup.py files are analysed through their AST and are never executed, so scanning
/// cannot run arbitrary code. See `extract_from_setup_py_ast` for details.
pub struct PythonParser;
94
/// Container format of a Python source distribution, as detected from its file name.
#[derive(Debug, Clone, Copy)]
enum PythonSdistArchiveFormat {
    /// `*.tar.gz`
    TarGz,
    /// `*.tgz`
    Tgz,
    /// `*.tar.bz2`
    TarBz2,
    /// `*.tar.xz`
    TarXz,
    /// `*.zip`
    Zip,
}
103
/// A zip entry that passed the path and size checks in `collect_validated_zip_entries`.
#[derive(Debug, Clone)]
struct ValidatedZipEntry {
    /// Position of the entry inside the archive.
    index: usize,
    /// Normalized entry path.
    name: String,
}
109
110impl PackageParser for PythonParser {
111    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
112
113    fn extract_packages(path: &Path) -> Vec<PackageData> {
114        vec![
115            if path.file_name().unwrap_or_default() == "pyproject.toml" {
116                extract_from_pyproject_toml(path)
117            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
118                extract_from_setup_cfg(path)
119            } else if path.file_name().unwrap_or_default() == "setup.py" {
120                extract_from_setup_py(path)
121            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
122                extract_from_rfc822_metadata(path, DatasourceId::PypiSdistPkginfo)
123            } else if path.file_name().unwrap_or_default() == "METADATA" {
124                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
125            } else if is_pip_cache_origin_json(path) {
126                extract_from_pip_origin_json(path)
127            } else if path.file_name().unwrap_or_default() == "pypi.json" {
128                extract_from_pypi_json(path)
129            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
130                extract_from_pip_inspect(path)
131            } else if is_python_sdist_archive_path(path) {
132                extract_from_sdist_archive(path)
133            } else if path
134                .extension()
135                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
136            {
137                extract_from_wheel_archive(path)
138            } else if path
139                .extension()
140                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
141            {
142                extract_from_egg_archive(path)
143            } else {
144                default_package_data(path)
145            },
146        ]
147    }
148
149    fn is_match(path: &Path) -> bool {
150        if let Some(filename) = path.file_name()
151            && (filename == "pyproject.toml"
152                || filename == "setup.cfg"
153                || filename == "setup.py"
154                || filename == "PKG-INFO"
155                || filename == "METADATA"
156                || filename == "pypi.json"
157                || filename == "pip-inspect.deplock"
158                || is_pip_cache_origin_json(path))
159        {
160            return true;
161        }
162
163        if let Some(extension) = path.extension() {
164            let ext = extension.to_string_lossy().to_lowercase();
165            if ext == "whl" || ext == "egg" || is_python_sdist_archive_path(path) {
166                return true;
167            }
168        }
169
170        false
171    }
172}
173
/// Values read from the `WHEEL` file that sits next to an installed `METADATA` file.
#[derive(Clone, Debug)]
struct InstalledWheelMetadata {
    /// Every `Tag` header, in file order.
    wheel_tags: Vec<String>,
    /// `Wheel-Version` header, if present.
    wheel_version: Option<String>,
    /// `Generator` header, if present.
    wheel_generator: Option<String>,
    /// `Root-Is-Purelib` header parsed as a boolean, if present and valid.
    root_is_purelib: Option<bool>,
    /// Single tag string derived from `wheel_tags` by `compress_wheel_tags`.
    compressed_tag: Option<String>,
}
182
183fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
184    let Some(parent) = path.parent() else {
185        return;
186    };
187
188    if !parent
189        .file_name()
190        .and_then(|name| name.to_str())
191        .is_some_and(|name| name.ends_with(".dist-info"))
192    {
193        return;
194    }
195
196    let wheel_path = parent.join("WHEEL");
197    if !wheel_path.exists() {
198        return;
199    }
200
201    let Ok(content) = read_file_to_string(&wheel_path) else {
202        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
203        return;
204    };
205
206    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
207        return;
208    };
209
210    apply_installed_wheel_metadata(package_data, &wheel_metadata);
211}
212
213fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
214    use super::rfc822::{get_header_all, get_header_first};
215
216    let metadata = super::rfc822::parse_rfc822_content(content);
217    let wheel_tags = get_header_all(&metadata.headers, "tag");
218    if wheel_tags.is_empty() {
219        return None;
220    }
221
222    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
223    let wheel_generator = get_header_first(&metadata.headers, "generator");
224    let root_is_purelib =
225        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
226            match value.to_ascii_lowercase().as_str() {
227                "true" => Some(true),
228                "false" => Some(false),
229                _ => None,
230            }
231        });
232
233    let compressed_tag = compress_wheel_tags(&wheel_tags);
234
235    Some(InstalledWheelMetadata {
236        wheel_tags,
237        wheel_version,
238        wheel_generator,
239        root_is_purelib,
240        compressed_tag,
241    })
242}
243
/// Collapses expanded wheel tags into one PEP 425 compressed tag set.
///
/// Each tag is expected to be `python-abi-platform`. The distinct python, ABI and
/// platform values are collected in order of first appearance and joined with `.`,
/// e.g. `py2-none-any` + `py3-none-any` becomes `py2.py3-none-any`.
///
/// Returns `None` when:
/// - `tags` is empty,
/// - any tag (in a multi-tag input) has fewer than three `-` separated parts, or
/// - the tags are not the full cartesian product of their components, in which case
///   no single compressed tag can describe them.
///
/// A single tag is returned unchanged without validation.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => return None,
        [only] => return Some(only.clone()),
        _ => {}
    }

    // Appends `value` unless it was already seen, preserving first-seen order.
    fn remember<'a>(values: &mut Vec<&'a str>, value: &'a str) {
        if !values.contains(&value) {
            values.push(value);
        }
    }

    let mut python_tags: Vec<&str> = Vec::new();
    let mut abi_tags: Vec<&str> = Vec::new();
    let mut platform_tags: Vec<&str> = Vec::new();
    let mut distinct: Vec<(&str, &str, &str)> = Vec::new();

    for tag in tags {
        let mut parts = tag.splitn(3, '-');
        let triple = (parts.next()?, parts.next()?, parts.next()?);

        // Repeated tags carry no extra information and must not be emitted twice.
        if distinct.contains(&triple) {
            continue;
        }
        distinct.push(triple);

        remember(&mut python_tags, triple.0);
        remember(&mut abi_tags, triple.1);
        remember(&mut platform_tags, triple.2);
    }

    // A compressed tag set expands to every python x abi x platform combination, so
    // it only round-trips when all of those combinations are actually present.
    if distinct.len() != python_tags.len() * abi_tags.len() * platform_tags.len() {
        return None;
    }

    Some(format!(
        "{}-{}-{}",
        python_tags.join("."),
        abi_tags.join("."),
        platform_tags.join(".")
    ))
}
281
282fn apply_installed_wheel_metadata(
283    package_data: &mut PackageData,
284    wheel_metadata: &InstalledWheelMetadata,
285) {
286    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
287    extra_data.insert(
288        "wheel_tags".to_string(),
289        JsonValue::Array(
290            wheel_metadata
291                .wheel_tags
292                .iter()
293                .cloned()
294                .map(JsonValue::String)
295                .collect(),
296        ),
297    );
298
299    if let Some(wheel_version) = &wheel_metadata.wheel_version {
300        extra_data.insert(
301            "wheel_version".to_string(),
302            JsonValue::String(wheel_version.clone()),
303        );
304    }
305
306    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
307        extra_data.insert(
308            "wheel_generator".to_string(),
309            JsonValue::String(wheel_generator.clone()),
310        );
311    }
312
313    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
314        extra_data.insert(
315            "root_is_purelib".to_string(),
316            JsonValue::Bool(root_is_purelib),
317        );
318    }
319
320    if let (Some(name), Some(version), Some(extension)) = (
321        package_data.name.as_deref(),
322        package_data.version.as_deref(),
323        wheel_metadata.compressed_tag.as_deref(),
324    ) {
325        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
326    }
327}
328
/// Tells whether `path` is an `origin.json` located somewhere below a directory named
/// `wheels` (compared ASCII case-insensitively).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    let named_origin_json =
        path.file_name().and_then(|name| name.to_str()) == Some("origin.json");
    if !named_origin_json {
        return false;
    }

    // `ancestors()` starts with the path itself, so skip it and look at the parents only.
    path.ancestors()
        .skip(1)
        .filter_map(|ancestor| ancestor.file_name())
        .filter_map(|name| name.to_str())
        .any(|name| name.eq_ignore_ascii_case("wheels"))
}
338
339fn extract_from_pip_origin_json(path: &Path) -> PackageData {
340    let content = match read_file_to_string(path) {
341        Ok(content) => content,
342        Err(e) => {
343            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
344            return default_package_data(path);
345        }
346    };
347
348    let root: JsonValue = match serde_json::from_str(&content) {
349        Ok(root) => root,
350        Err(e) => {
351            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
352            return default_package_data(path);
353        }
354    };
355
356    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
357        warn!("No url found in pip cache origin.json at {:?}", path);
358        return default_package_data(path);
359    };
360
361    let sibling_wheel = find_sibling_cached_wheel(path);
362    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
363        sibling_wheel
364            .as_ref()
365            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
366    });
367
368    let Some((name, version)) = name_version else {
369        warn!(
370            "Failed to infer package name/version from pip cache origin.json at {:?}",
371            path
372        );
373        return default_package_data(path);
374    };
375
376    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
377        build_pypi_urls(Some(&name), Some(&version));
378    let purl = sibling_wheel
379        .as_ref()
380        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
381        .or(plain_purl);
382
383    PackageData {
384        package_type: Some(PythonParser::PACKAGE_TYPE),
385        primary_language: Some("Python".to_string()),
386        name: Some(name),
387        version: Some(version),
388        datasource_id: Some(DatasourceId::PypiPipOriginJson),
389        download_url: Some(download_url.to_string()),
390        sha256: extract_sha256_from_origin_json(&root),
391        repository_homepage_url,
392        repository_download_url,
393        api_data_url,
394        purl,
395        ..Default::default()
396    }
397}
398
399fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
400    let parent = path.parent()?;
401    let entries = parent.read_dir().ok()?;
402
403    for entry in entries.flatten() {
404        let sibling_path = entry.path();
405        if sibling_path
406            .extension()
407            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
408            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
409        {
410            return Some(wheel_info);
411        }
412    }
413
414    None
415}
416
417fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
418    let file_name = url.rsplit('/').next()?;
419
420    if file_name.ends_with(".whl") {
421        return parse_wheel_filename(Path::new(file_name))
422            .map(|wheel_info| (wheel_info.name, wheel_info.version));
423    }
424
425    let stem = strip_python_archive_extension(file_name)?;
426    let (name, version) = stem.rsplit_once('-')?;
427    if name.is_empty() || version.is_empty() {
428        return None;
429    }
430
431    Some((name.replace('_', "-"), version.to_string()))
432}
433
/// Removes a known Python archive extension from `file_name`.
///
/// Matching is case-sensitive; `None` is returned when no known extension is present.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] =
        [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES.iter() {
        if let Some(stem) = file_name.strip_suffix(*suffix) {
            return Some(stem);
        }
    }

    None
}
439
440fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
441    root.pointer("/archive_info/hashes/sha256")
442        .and_then(|value| value.as_str())
443        .map(ToOwned::to_owned)
444        .or_else(|| {
445            root.pointer("/archive_info/hash")
446                .and_then(|value| value.as_str())
447                .and_then(normalize_origin_hash)
448        })
449}
450
/// Extracts a SHA-256 hex digest from a hash string.
///
/// Accepts `sha256=<digest>`, `sha256:<digest>` or a bare digest. In every form the
/// digest must be exactly 64 ASCII hex digits; anything else (other algorithms, empty
/// or truncated values) yields `None`. The digest is returned as written, without
/// changing its letter case.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let digest = hash
        .strip_prefix("sha256=")
        .or_else(|| hash.strip_prefix("sha256:"))
        .unwrap_or(hash);

    // Validate the prefixed forms too, so that e.g. "sha256=" cannot produce an
    // empty checksum.
    let well_formed = digest.len() == 64 && digest.bytes().all(|byte| byte.is_ascii_hexdigit());
    if well_formed {
        Some(digest.to_string())
    } else {
        None
    }
}
463
464fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
465    let content = match read_file_to_string(path) {
466        Ok(content) => content,
467        Err(e) => {
468            warn!("Failed to read metadata at {:?}: {}", path, e);
469            return default_package_data(path);
470        }
471    };
472
473    let metadata = super::rfc822::parse_rfc822_content(&content);
474    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
475    merge_sibling_metadata_dependencies(path, &mut package_data);
476    merge_sibling_metadata_file_references(path, &mut package_data);
477    if datasource_id == DatasourceId::PypiWheelMetadata {
478        merge_sibling_wheel_metadata(path, &mut package_data);
479    }
480    package_data
481}
482
483fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
484    let mut extra_dependencies = Vec::new();
485
486    if let Some(parent) = path.parent() {
487        let direct_requires = parent.join("requires.txt");
488        if direct_requires.exists()
489            && let Ok(content) = read_file_to_string(&direct_requires)
490        {
491            extra_dependencies.extend(parse_requires_txt(&content));
492        }
493
494        let sibling_egg_info_requires = parent
495            .read_dir()
496            .ok()
497            .into_iter()
498            .flatten()
499            .flatten()
500            .find_map(|entry| {
501                let child_path = entry.path();
502                if child_path.is_dir()
503                    && child_path
504                        .file_name()
505                        .and_then(|name| name.to_str())
506                        .is_some_and(|name| name.ends_with(".egg-info"))
507                {
508                    let requires = child_path.join("requires.txt");
509                    requires.exists().then_some(requires)
510                } else {
511                    None
512                }
513            });
514
515        if let Some(requires_path) = sibling_egg_info_requires
516            && let Ok(content) = read_file_to_string(&requires_path)
517        {
518            extra_dependencies.extend(parse_requires_txt(&content));
519        }
520    }
521
522    for dependency in extra_dependencies {
523        if !package_data.dependencies.iter().any(|existing| {
524            existing.purl == dependency.purl
525                && existing.scope == dependency.scope
526                && existing.extracted_requirement == dependency.extracted_requirement
527                && existing.extra_data == dependency.extra_data
528        }) {
529            package_data.dependencies.push(dependency);
530        }
531    }
532}
533
534fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
535    let mut extra_refs = Vec::new();
536
537    if let Some(parent) = path.parent() {
538        let record_path = parent.join("RECORD");
539        if record_path.exists()
540            && let Ok(content) = read_file_to_string(&record_path)
541        {
542            extra_refs.extend(parse_record_csv(&content));
543        }
544
545        let installed_files_path = parent.join("installed-files.txt");
546        if installed_files_path.exists()
547            && let Ok(content) = read_file_to_string(&installed_files_path)
548        {
549            extra_refs.extend(parse_installed_files_txt(&content));
550        }
551
552        let sources_path = parent.join("SOURCES.txt");
553        if sources_path.exists()
554            && let Ok(content) = read_file_to_string(&sources_path)
555        {
556            extra_refs.extend(parse_sources_txt(&content));
557        }
558    }
559
560    for file_ref in extra_refs {
561        if !package_data
562            .file_references
563            .iter()
564            .any(|existing| existing.path == file_ref.path)
565        {
566            package_data.file_references.push(file_ref);
567        }
568    }
569}
570
571fn collect_validated_zip_entries<R: Read + std::io::Seek>(
572    archive: &mut ZipArchive<R>,
573    path: &Path,
574    archive_type: &str,
575) -> Result<Vec<ValidatedZipEntry>, String> {
576    let mut total_extracted = 0u64;
577    let mut entries = Vec::new();
578
579    for i in 0..archive.len() {
580        if let Ok(file) = archive.by_index_raw(i) {
581            let compressed_size = file.compressed_size();
582            let uncompressed_size = file.size();
583            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
584                warn!(
585                    "Skipping unsafe path in {} {:?}: {}",
586                    archive_type,
587                    path,
588                    file.name()
589                );
590                continue;
591            };
592
593            if compressed_size > 0 {
594                let ratio = uncompressed_size as f64 / compressed_size as f64;
595                if ratio > MAX_COMPRESSION_RATIO {
596                    warn!(
597                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
598                        archive_type, path, ratio
599                    );
600                    continue;
601                }
602            }
603
604            if uncompressed_size > MAX_FILE_SIZE {
605                warn!(
606                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
607                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
608                );
609                continue;
610            }
611
612            total_extracted += uncompressed_size;
613            if total_extracted > MAX_ARCHIVE_SIZE {
614                let msg = format!(
615                    "Total extracted size exceeds limit for {} {:?}",
616                    archive_type, path
617                );
618                warn!("{}", msg);
619                return Err(msg);
620            }
621
622            entries.push(ValidatedZipEntry {
623                index: i,
624                name: entry_name,
625            });
626        }
627    }
628
629    Ok(entries)
630}
631
632fn is_python_sdist_archive_path(path: &Path) -> bool {
633    detect_python_sdist_archive_format(path).is_some()
634}
635
636fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
637    let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
638
639    if !is_likely_python_sdist_filename(&file_name) {
640        return None;
641    }
642
643    if file_name.ends_with(".tar.gz") {
644        Some(PythonSdistArchiveFormat::TarGz)
645    } else if file_name.ends_with(".tgz") {
646        Some(PythonSdistArchiveFormat::Tgz)
647    } else if file_name.ends_with(".tar.bz2") {
648        Some(PythonSdistArchiveFormat::TarBz2)
649    } else if file_name.ends_with(".tar.xz") {
650        Some(PythonSdistArchiveFormat::TarXz)
651    } else if file_name.ends_with(".zip") {
652        Some(PythonSdistArchiveFormat::Zip)
653    } else {
654        None
655    }
656}
657
658fn is_likely_python_sdist_filename(file_name: &str) -> bool {
659    let Some(stem) = strip_python_archive_extension(file_name) else {
660        return false;
661    };
662
663    let Some((name, version)) = stem.rsplit_once('-') else {
664        return false;
665    };
666
667    !name.is_empty()
668        && !version.is_empty()
669        && version.chars().any(|ch| ch.is_ascii_digit())
670        && name
671            .chars()
672            .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
673}
674
675fn extract_from_sdist_archive(path: &Path) -> PackageData {
676    let metadata = match std::fs::metadata(path) {
677        Ok(m) => m,
678        Err(e) => {
679            warn!(
680                "Failed to read metadata for sdist archive {:?}: {}",
681                path, e
682            );
683            return default_package_data(path);
684        }
685    };
686
687    if metadata.len() > MAX_ARCHIVE_SIZE {
688        warn!(
689            "sdist archive too large: {} bytes (limit: {} bytes)",
690            metadata.len(),
691            MAX_ARCHIVE_SIZE
692        );
693        return default_package_data(path);
694    }
695
696    let Some(format) = detect_python_sdist_archive_format(path) else {
697        return default_package_data(path);
698    };
699
700    let mut package_data = match format {
701        PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
702            let file = match File::open(path) {
703                Ok(file) => file,
704                Err(e) => {
705                    warn!("Failed to open sdist archive {:?}: {}", path, e);
706                    return default_package_data(path);
707                }
708            };
709            let decoder = GzDecoder::new(file);
710            extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
711        }
712        PythonSdistArchiveFormat::TarBz2 => {
713            let file = match File::open(path) {
714                Ok(file) => file,
715                Err(e) => {
716                    warn!("Failed to open sdist archive {:?}: {}", path, e);
717                    return default_package_data(path);
718                }
719            };
720            let decoder = BzDecoder::new(file);
721            extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
722        }
723        PythonSdistArchiveFormat::TarXz => {
724            let file = match File::open(path) {
725                Ok(file) => file,
726                Err(e) => {
727                    warn!("Failed to open sdist archive {:?}: {}", path, e);
728                    return default_package_data(path);
729                }
730            };
731            let decoder = XzDecoder::new(file);
732            extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
733        }
734        PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
735    };
736
737    if package_data.package_type.is_some() {
738        let (size, sha256) = calculate_file_checksums(path);
739        package_data.size = size;
740        package_data.sha256 = sha256;
741    }
742
743    package_data
744}
745
/// Streams a (already decompressing) tar reader and builds package data from the
/// metadata files it contains.
///
/// * `path` - archive location; used for log messages and passed on to
///   `build_sdist_package_data` / `default_package_data`.
/// * `reader` - decompressed tar byte stream.
/// * `archive_type` - label used in log messages (e.g. `"tar.gz"`).
/// * `compressed_size` - on-disk size of the archive, used for the ratio check.
///
/// Returns `default_package_data(path)` when the archive cannot be enumerated, when the
/// accumulated declared entry sizes exceed `MAX_ARCHIVE_SIZE`, or when their ratio to
/// `compressed_size` exceeds `MAX_COMPRESSION_RATIO`.
fn extract_from_tar_sdist_archive<R: Read>(
    path: &Path,
    reader: R,
    archive_type: &str,
    compressed_size: u64,
) -> PackageData {
    let mut archive = Archive::new(reader);
    let archive_entries = match archive.entries() {
        Ok(entries) => entries,
        Err(e) => {
            warn!(
                "Failed to read {} sdist archive {:?}: {}",
                archive_type, path, e
            );
            return default_package_data(path);
        }
    };

    // Running total of declared entry sizes, and the (path, content) pairs kept so far.
    let mut total_extracted = 0u64;
    let mut entries = Vec::new();

    for entry_result in archive_entries {
        let mut entry = match entry_result {
            Ok(entry) => entry,
            Err(e) => {
                warn!(
                    "Failed to read {} sdist entry from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        // Oversized entries are skipped and do not count towards the running total.
        let entry_size = entry.size();
        if entry_size > MAX_FILE_SIZE {
            warn!(
                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
                archive_type, path, entry_size, MAX_FILE_SIZE
            );
            continue;
        }

        // Every other entry counts, whether or not its content is read below.
        total_extracted += entry_size;
        if total_extracted > MAX_ARCHIVE_SIZE {
            warn!(
                "Total extracted size exceeds limit for {} sdist {:?}",
                archive_type, path
            );
            return default_package_data(path);
        }

        // The ratio compares the cumulative declared size with the whole compressed
        // archive, not a per-entry compressed size.
        if compressed_size > 0 {
            let ratio = total_extracted as f64 / compressed_size as f64;
            if ratio > MAX_COMPRESSION_RATIO {
                warn!(
                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
                    archive_type, path, ratio
                );
                return default_package_data(path);
            }
        }

        // Use forward slashes so the suffix checks below see one separator style.
        let entry_path = match entry.path() {
            Ok(path) => path.to_string_lossy().replace('\\', "/"),
            Err(e) => {
                warn!(
                    "Failed to get {} sdist entry path from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
            continue;
        };

        // Only PKG-INFO, requires.txt and SOURCES.txt entries are read.
        if !is_relevant_sdist_text_entry(&entry_path) {
            continue;
        }

        // Entries whose content cannot be read are dropped without failing the archive.
        if let Ok(content) = read_limited_utf8(
            &mut entry,
            MAX_FILE_SIZE,
            &format!("{} entry {}", archive_type, entry_path),
        ) {
            entries.push((entry_path, content));
        }
    }

    build_sdist_package_data(path, entries)
}
839
840fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
841    let file = match File::open(path) {
842        Ok(file) => file,
843        Err(e) => {
844            warn!("Failed to open zip sdist archive {:?}: {}", path, e);
845            return default_package_data(path);
846        }
847    };
848
849    let mut archive = match ZipArchive::new(file) {
850        Ok(archive) => archive,
851        Err(e) => {
852            warn!("Failed to read zip sdist archive {:?}: {}", path, e);
853            return default_package_data(path);
854        }
855    };
856
857    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
858        Ok(entries) => entries,
859        Err(_) => return default_package_data(path),
860    };
861
862    let mut entries = Vec::new();
863    for entry in validated_entries.iter() {
864        if !is_relevant_sdist_text_entry(&entry.name) {
865            continue;
866        }
867
868        if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
869            entries.push((entry.name.clone(), content));
870        }
871    }
872
873    build_sdist_package_data(path, entries)
874}
875
/// Returns `true` for archive entries named `PKG-INFO`, `requires.txt` or
/// `SOURCES.txt` that sit inside at least one directory.
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
881
882fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
883    let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
884        warn!("No PKG-INFO file found in sdist archive {:?}", path);
885        return default_package_data(path);
886    };
887
888    let mut package_data =
889        python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
890    merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
891    merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
892    apply_sdist_name_version_fallback(path, &mut package_data);
893    package_data
894}
895
896fn select_sdist_pkginfo_entry(
897    archive_path: &Path,
898    entries: &[(String, String)],
899) -> Option<(String, String)> {
900    let expected_name = archive_path
901        .file_name()
902        .and_then(|name| name.to_str())
903        .and_then(strip_python_archive_extension)
904        .and_then(|stem| {
905            stem.rsplit_once('-')
906                .map(|(name, _)| normalize_python_package_name(name))
907        });
908
909    entries
910        .iter()
911        .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
912        .min_by_key(|(entry_path, content)| {
913            let components: Vec<_> = entry_path
914                .split('/')
915                .filter(|part| !part.is_empty())
916                .collect();
917            let metadata = super::rfc822::parse_rfc822_content(content);
918            let candidate_name = super::rfc822::get_header_first(&metadata.headers, "name")
919                .map(|name| normalize_python_package_name(&name));
920            let name_rank = if candidate_name == expected_name {
921                0
922            } else {
923                1
924            };
925            let kind_rank = if components.len() == 3
926                && components[1].ends_with(".egg-info")
927                && components[2] == "PKG-INFO"
928            {
929                0
930            } else if components.len() == 2 && components[1] == "PKG-INFO" {
931                1
932            } else if entry_path.ends_with(".egg-info/PKG-INFO") {
933                2
934            } else {
935                3
936            };
937
938            (name_rank, kind_rank, components.len(), entry_path.clone())
939        })
940        .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
941}
942
943fn merge_sdist_archive_dependencies(
944    entries: &[(String, String)],
945    metadata_path: &str,
946    package_data: &mut PackageData,
947) {
948    let metadata_dir = metadata_path
949        .rsplit_once('/')
950        .map(|(dir, _)| dir)
951        .unwrap_or("");
952    let archive_root = metadata_path.split('/').next().unwrap_or("");
953    let matched_egg_info_dir =
954        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
955    let mut extra_dependencies = Vec::new();
956
957    for (entry_path, content) in entries {
958        let is_direct_requires =
959            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
960        let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
961            entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
962        });
963
964        if is_direct_requires || is_egg_info_requires {
965            extra_dependencies.extend(parse_requires_txt(content));
966        }
967    }
968
969    for dependency in extra_dependencies {
970        if !package_data.dependencies.iter().any(|existing| {
971            existing.purl == dependency.purl
972                && existing.scope == dependency.scope
973                && existing.extracted_requirement == dependency.extracted_requirement
974                && existing.extra_data == dependency.extra_data
975        }) {
976            package_data.dependencies.push(dependency);
977        }
978    }
979}
980
981fn merge_sdist_archive_file_references(
982    entries: &[(String, String)],
983    metadata_path: &str,
984    package_data: &mut PackageData,
985) {
986    let metadata_dir = metadata_path
987        .rsplit_once('/')
988        .map(|(dir, _)| dir)
989        .unwrap_or("");
990    let archive_root = metadata_path.split('/').next().unwrap_or("");
991    let matched_egg_info_dir =
992        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
993    let mut extra_refs = Vec::new();
994
995    for (entry_path, content) in entries {
996        let is_direct_sources =
997            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
998        let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
999            entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1000        });
1001
1002        if is_direct_sources || is_egg_info_sources {
1003            extra_refs.extend(parse_sources_txt(content));
1004        }
1005    }
1006
1007    for file_ref in extra_refs {
1008        if !package_data
1009            .file_references
1010            .iter()
1011            .any(|existing| existing.path == file_ref.path)
1012        {
1013            package_data.file_references.push(file_ref);
1014        }
1015    }
1016}
1017
1018fn select_matching_sdist_egg_info_dir(
1019    entries: &[(String, String)],
1020    archive_root: &str,
1021    package_name: Option<&str>,
1022) -> Option<String> {
1023    let normalized_package_name = package_name.map(normalize_python_package_name);
1024
1025    entries
1026        .iter()
1027        .filter_map(|(entry_path, _)| {
1028            let components: Vec<_> = entry_path
1029                .split('/')
1030                .filter(|part| !part.is_empty())
1031                .collect();
1032            if components.len() == 3
1033                && components[0] == archive_root
1034                && components[1].ends_with(".egg-info")
1035            {
1036                Some(components[1].to_string())
1037            } else {
1038                None
1039            }
1040        })
1041        .min_by_key(|egg_info_dir| {
1042            let normalized_dir_name =
1043                normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1044            let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1045                0
1046            } else {
1047                1
1048            };
1049
1050            (name_rank, egg_info_dir.clone())
1051        })
1052}
1053
/// Normalizes a Python distribution name for comparison, following the PEP 503
/// rule: ASCII letters are lowercased and every run of `-`, `_`, or `.` becomes
/// a single `-`.
///
/// Folding `.` and collapsing runs matters because the same project can be
/// spelled `zope.interface`, `zope_interface`, or `zope-interface` depending on
/// whether the name comes from metadata, an archive file name, or an egg-info
/// directory.
fn normalize_python_package_name(name: &str) -> String {
    let mut normalized = String::with_capacity(name.len());
    let mut previous_was_separator = false;

    for ch in name.chars() {
        if matches!(ch, '-' | '_' | '.') {
            // Emit one `-` per run of separators.
            if !previous_was_separator {
                normalized.push('-');
            }
            previous_was_separator = true;
        } else {
            normalized.push(ch.to_ascii_lowercase());
            previous_was_separator = false;
        }
    }

    normalized
}
1057
1058fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1059    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1060        return;
1061    };
1062
1063    let Some(stem) = strip_python_archive_extension(file_name) else {
1064        return;
1065    };
1066
1067    let Some((name, version)) = stem.rsplit_once('-') else {
1068        return;
1069    };
1070
1071    if package_data.name.is_none() {
1072        package_data.name = Some(name.replace('_', "-"));
1073    }
1074    if package_data.version.is_none() {
1075        package_data.version = Some(version.to_string());
1076    }
1077
1078    if package_data.purl.is_none()
1079        || package_data.repository_homepage_url.is_none()
1080        || package_data.repository_download_url.is_none()
1081        || package_data.api_data_url.is_none()
1082    {
1083        let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1084            build_pypi_urls(
1085                package_data.name.as_deref(),
1086                package_data.version.as_deref(),
1087            );
1088
1089        if package_data.repository_homepage_url.is_none() {
1090            package_data.repository_homepage_url = repository_homepage_url;
1091        }
1092        if package_data.repository_download_url.is_none() {
1093            package_data.repository_download_url = repository_download_url;
1094        }
1095        if package_data.api_data_url.is_none() {
1096            package_data.api_data_url = api_data_url;
1097        }
1098        if package_data.purl.is_none() {
1099            package_data.purl = purl;
1100        }
1101    }
1102}
1103
1104fn extract_from_wheel_archive(path: &Path) -> PackageData {
1105    let metadata = match std::fs::metadata(path) {
1106        Ok(m) => m,
1107        Err(e) => {
1108            warn!(
1109                "Failed to read metadata for wheel archive {:?}: {}",
1110                path, e
1111            );
1112            return default_package_data(path);
1113        }
1114    };
1115
1116    if metadata.len() > MAX_ARCHIVE_SIZE {
1117        warn!(
1118            "Wheel archive too large: {} bytes (limit: {} bytes)",
1119            metadata.len(),
1120            MAX_ARCHIVE_SIZE
1121        );
1122        return default_package_data(path);
1123    }
1124
1125    let file = match File::open(path) {
1126        Ok(f) => f,
1127        Err(e) => {
1128            warn!("Failed to open wheel archive {:?}: {}", path, e);
1129            return default_package_data(path);
1130        }
1131    };
1132
1133    let mut archive = match ZipArchive::new(file) {
1134        Ok(a) => a,
1135        Err(e) => {
1136            warn!("Failed to read wheel archive {:?}: {}", path, e);
1137            return default_package_data(path);
1138        }
1139    };
1140
1141    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1142        Ok(entries) => entries,
1143        Err(_) => return default_package_data(path),
1144    };
1145
1146    let metadata_entry =
1147        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1148            Some(entry) => entry,
1149            None => {
1150                warn!("No METADATA file found in wheel archive {:?}", path);
1151                return default_package_data(path);
1152            }
1153        };
1154
1155    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1156        Ok(c) => c,
1157        Err(e) => {
1158            warn!("Failed to read METADATA from {:?}: {}", path, e);
1159            return default_package_data(path);
1160        }
1161    };
1162
1163    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1164
1165    let (size, sha256) = calculate_file_checksums(path);
1166    package_data.size = size;
1167    package_data.sha256 = sha256;
1168
1169    if let Some(record_entry) =
1170        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1171        && let Ok(record_content) =
1172            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1173    {
1174        package_data.file_references = parse_record_csv(&record_content);
1175    }
1176
1177    if let Some(wheel_info) = parse_wheel_filename(path) {
1178        if package_data.name.is_none() {
1179            package_data.name = Some(wheel_info.name.clone());
1180        }
1181        if package_data.version.is_none() {
1182            package_data.version = Some(wheel_info.version.clone());
1183        }
1184
1185        package_data.qualifiers = Some(std::collections::HashMap::from([(
1186            "extension".to_string(),
1187            format!(
1188                "{}-{}-{}",
1189                wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1190            ),
1191        )]));
1192
1193        package_data.purl = build_wheel_purl(
1194            package_data.name.as_deref(),
1195            package_data.version.as_deref(),
1196            &wheel_info,
1197        );
1198
1199        let mut extra_data = package_data.extra_data.unwrap_or_default();
1200        extra_data.insert(
1201            "python_requires".to_string(),
1202            serde_json::Value::String(wheel_info.python_tag.clone()),
1203        );
1204        extra_data.insert(
1205            "abi_tag".to_string(),
1206            serde_json::Value::String(wheel_info.abi_tag.clone()),
1207        );
1208        extra_data.insert(
1209            "platform_tag".to_string(),
1210            serde_json::Value::String(wheel_info.platform_tag.clone()),
1211        );
1212        package_data.extra_data = Some(extra_data);
1213    }
1214
1215    package_data
1216}
1217
1218fn extract_from_egg_archive(path: &Path) -> PackageData {
1219    let metadata = match std::fs::metadata(path) {
1220        Ok(m) => m,
1221        Err(e) => {
1222            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1223            return default_package_data(path);
1224        }
1225    };
1226
1227    if metadata.len() > MAX_ARCHIVE_SIZE {
1228        warn!(
1229            "Egg archive too large: {} bytes (limit: {} bytes)",
1230            metadata.len(),
1231            MAX_ARCHIVE_SIZE
1232        );
1233        return default_package_data(path);
1234    }
1235
1236    let file = match File::open(path) {
1237        Ok(f) => f,
1238        Err(e) => {
1239            warn!("Failed to open egg archive {:?}: {}", path, e);
1240            return default_package_data(path);
1241        }
1242    };
1243
1244    let mut archive = match ZipArchive::new(file) {
1245        Ok(a) => a,
1246        Err(e) => {
1247            warn!("Failed to read egg archive {:?}: {}", path, e);
1248            return default_package_data(path);
1249        }
1250    };
1251
1252    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1253        Ok(entries) => entries,
1254        Err(_) => return default_package_data(path),
1255    };
1256
1257    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1258        &validated_entries,
1259        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1260    ) {
1261        Some(entry) => entry,
1262        None => {
1263            warn!("No PKG-INFO file found in egg archive {:?}", path);
1264            return default_package_data(path);
1265        }
1266    };
1267
1268    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1269        Ok(c) => c,
1270        Err(e) => {
1271            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1272            return default_package_data(path);
1273        }
1274    };
1275
1276    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1277
1278    let (size, sha256) = calculate_file_checksums(path);
1279    package_data.size = size;
1280    package_data.sha256 = sha256;
1281
1282    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1283        &validated_entries,
1284        &[
1285            "EGG-INFO/installed-files.txt",
1286            ".egg-info/installed-files.txt",
1287        ],
1288    ) && let Ok(installed_files_content) =
1289        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1290    {
1291        package_data.file_references = parse_installed_files_txt(&installed_files_content);
1292    }
1293
1294    if let Some(egg_info) = parse_egg_filename(path) {
1295        if package_data.name.is_none() {
1296            package_data.name = Some(egg_info.name.clone());
1297        }
1298        if package_data.version.is_none() {
1299            package_data.version = Some(egg_info.version.clone());
1300        }
1301
1302        if let Some(python_version) = &egg_info.python_version {
1303            let mut extra_data = package_data.extra_data.unwrap_or_default();
1304            extra_data.insert(
1305                "python_version".to_string(),
1306                serde_json::Value::String(python_version.clone()),
1307            );
1308            package_data.extra_data = Some(extra_data);
1309        }
1310    }
1311
1312    package_data.purl = build_egg_purl(
1313        package_data.name.as_deref(),
1314        package_data.version.as_deref(),
1315    );
1316
1317    package_data
1318}
1319
1320fn find_validated_zip_entry_by_suffix<'a>(
1321    entries: &'a [ValidatedZipEntry],
1322    suffix: &str,
1323) -> Option<&'a ValidatedZipEntry> {
1324    entries.iter().find(|entry| entry.name.ends_with(suffix))
1325}
1326
1327fn find_validated_zip_entry_by_any_suffix<'a>(
1328    entries: &'a [ValidatedZipEntry],
1329    suffixes: &[&str],
1330) -> Option<&'a ValidatedZipEntry> {
1331    entries
1332        .iter()
1333        .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1334}
1335
1336fn read_validated_zip_entry<R: Read + std::io::Seek>(
1337    archive: &mut ZipArchive<R>,
1338    entry: &ValidatedZipEntry,
1339    path: &Path,
1340    archive_type: &str,
1341) -> Result<String, String> {
1342    let mut file = archive
1343        .by_index(entry.index)
1344        .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1345
1346    let compressed_size = file.compressed_size();
1347    let uncompressed_size = file.size();
1348
1349    if compressed_size > 0 {
1350        let ratio = uncompressed_size as f64 / compressed_size as f64;
1351        if ratio > MAX_COMPRESSION_RATIO {
1352            return Err(format!(
1353                "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1354                archive_type, path, ratio
1355            ));
1356        }
1357    }
1358
1359    if uncompressed_size > MAX_FILE_SIZE {
1360        return Err(format!(
1361            "Rejected oversized entry in {} {:?}: {} bytes",
1362            archive_type, path, uncompressed_size
1363        ));
1364    }
1365
1366    read_limited_utf8(
1367        &mut file,
1368        MAX_FILE_SIZE,
1369        &format!("{} entry {}", archive_type, entry.name),
1370    )
1371}
1372
/// Reads at most `max_bytes` bytes from `reader` and decodes them as UTF-8.
///
/// One byte beyond the limit is requested so that an over-long stream can be
/// told apart from one that ends exactly at the limit. `context` names the
/// source in error messages.
///
/// # Errors
/// Returns a message when the read fails, when the stream holds more than
/// `max_bytes` bytes, or when the bytes are not valid UTF-8.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    // Saturate instead of `max_bytes + 1`: with `u64::MAX` the addition would
    // panic in debug builds and wrap to a zero-byte read in release builds.
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if bytes.len() as u64 > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1393
/// Normalizes an archive member path to a relative, `/`-separated form.
///
/// Backslashes are treated as separators and `.` segments are dropped. The path
/// is rejected (`None`) when it starts with a drive prefix such as `C:/`, is
/// absolute, contains a `..` segment, or has no normal segments left.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unix_style = entry_path.replace('\\', "/");

    // Reject `X:/...` explicitly; on non-Windows hosts `Path` would otherwise
    // treat the drive as an ordinary segment.
    let has_drive_prefix = matches!(
        unix_style.as_bytes(),
        [drive, b':', b'/', ..] if drive.is_ascii_alphabetic()
    );
    if has_drive_prefix {
        return None;
    }

    let mut segments: Vec<String> = Vec::new();
    for part in Path::new(&unix_style).components() {
        match part {
            Component::CurDir => continue,
            Component::Normal(segment) => segments.push(segment.to_string_lossy().to_string()),
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1415
1416/// Parses RECORD CSV format from wheel archives (PEP 427).
1417/// Format: path,hash,size (3 columns, no header)
1418/// Hash format: sha256=urlsafe_base64_hash or empty
1419/// Size: bytes as u64 or empty
1420pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1421    let mut reader = ReaderBuilder::new()
1422        .has_headers(false)
1423        .from_reader(content.as_bytes());
1424
1425    let mut file_references = Vec::new();
1426
1427    for result in reader.records() {
1428        match result {
1429            Ok(record) => {
1430                if record.len() < 3 {
1431                    continue;
1432                }
1433
1434                let path = record.get(0).unwrap_or("").trim().to_string();
1435                if path.is_empty() {
1436                    continue;
1437                }
1438
1439                let hash_field = record.get(1).unwrap_or("").trim();
1440                let size_field = record.get(2).unwrap_or("").trim();
1441
1442                // Parse hash: format is "algorithm=value"
1443                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1444                    let parts: Vec<&str> = hash_field.split('=').collect();
1445                    if parts.len() == 2 && parts[0] == "sha256" {
1446                        // Decode base64 to hex
1447                        match URL_SAFE_NO_PAD.decode(parts[1]) {
1448                            Ok(decoded) => {
1449                                let hex = decoded
1450                                    .iter()
1451                                    .map(|b| format!("{:02x}", b))
1452                                    .collect::<String>();
1453                                Some(hex)
1454                            }
1455                            Err(_) => None,
1456                        }
1457                    } else {
1458                        None
1459                    }
1460                } else {
1461                    None
1462                };
1463
1464                // Parse size
1465                let size = if !size_field.is_empty() && size_field != "-" {
1466                    size_field.parse::<u64>().ok()
1467                } else {
1468                    None
1469                };
1470
1471                file_references.push(FileReference {
1472                    path,
1473                    size,
1474                    sha1: None,
1475                    md5: None,
1476                    sha256,
1477                    sha512: None,
1478                    extra_data: None,
1479                });
1480            }
1481            Err(e) => {
1482                warn!("Failed to parse RECORD CSV row: {}", e);
1483                continue;
1484            }
1485        }
1486    }
1487
1488    file_references
1489}
1490
1491/// Parses installed-files.txt format from egg archives (PEP 376).
1492/// Format: one file path per line, no headers, no hash, no size
1493pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1494    content
1495        .lines()
1496        .map(|line| line.trim())
1497        .filter(|line| !line.is_empty())
1498        .map(|path| FileReference {
1499            path: path.to_string(),
1500            size: None,
1501            sha1: None,
1502            md5: None,
1503            sha256: None,
1504            sha512: None,
1505            extra_data: None,
1506        })
1507        .collect()
1508}
1509
1510pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1511    content
1512        .lines()
1513        .map(str::trim)
1514        .filter(|line| !line.is_empty())
1515        .map(|path| FileReference {
1516            path: path.to_string(),
1517            size: None,
1518            sha1: None,
1519            md5: None,
1520            sha256: None,
1521            sha512: None,
1522            extra_data: None,
1523        })
1524        .collect()
1525}
1526
/// Components parsed from a wheel file name.
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version segment, taken verbatim.
    version: String,
    /// Python tag, e.g. `py3`.
    python_tag: String,
    /// ABI tag, e.g. `none`.
    abi_tag: String,
    /// Platform tag, e.g. `any`.
    platform_tag: String,
}

/// Parses a wheel file name of the form
/// `{name}-{version}(-{build})?-{python}-{abi}-{platform}.whl`.
///
/// The optional build tag is recognized when the stem has exactly six
/// `-`-separated parts and the third starts with an ASCII digit; it is then
/// skipped so the three tags are read from the correct positions. Otherwise the
/// tags are read from the third part onward, and any surplus trailing parts are
/// joined into the platform tag.
///
/// Returns `None` when the path has no file stem or the stem has fewer than
/// five parts.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    let has_build_tag =
        parts.len() == 6 && parts[2].starts_with(|c: char| c.is_ascii_digit());
    let tags_start = if has_build_tag { 3 } else { 2 };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: parts[tags_start].to_string(),
        abi_tag: parts[tags_start + 1].to_string(),
        platform_tag: parts[tags_start + 2..].join("-"),
    })
}
1551
/// Components parsed from an egg file name.
struct EggInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version segment, taken verbatim.
    version: String,
    /// Third `-`-separated segment of the stem, when present.
    python_version: Option<String>,
}

/// Parses an egg file name by splitting its stem on `-`.
///
/// The first part is the name, the second the version, and the optional third
/// the Python version; any further parts are ignored. Returns `None` when the
/// path has no file stem or the stem has fewer than two parts.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut pieces = stem.split('-');

    let raw_name = pieces.next()?;
    let raw_version = pieces.next()?;
    let python_version = pieces.next().map(|piece| piece.to_string());

    Some(EggInfo {
        name: raw_name.replace('_', "-"),
        version: raw_version.to_string(),
        python_version,
    })
}
1572
1573fn build_wheel_purl(
1574    name: Option<&str>,
1575    version: Option<&str>,
1576    wheel_info: &WheelInfo,
1577) -> Option<String> {
1578    let name = name?;
1579    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1580
1581    if let Some(ver) = version {
1582        package_url.with_version(ver).ok()?;
1583    }
1584
1585    let extension = format!(
1586        "{}-{}-{}",
1587        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1588    );
1589    package_url.add_qualifier("extension", extension).ok()?;
1590
1591    Some(package_url.to_string())
1592}
1593
1594fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1595    let name = name?;
1596    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1597
1598    if let Some(ver) = version {
1599        package_url.with_version(ver).ok()?;
1600    }
1601
1602    package_url.add_qualifier("type", "egg").ok()?;
1603
1604    Some(package_url.to_string())
1605}
1606
1607fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1608    let metadata = super::rfc822::parse_rfc822_content(content);
1609    build_package_data_from_rfc822(&metadata, datasource_id)
1610}
1611
1612/// Builds PackageData from parsed RFC822 metadata.
1613///
1614/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
1615/// and `python_parse_rfc822_content` (content-based) functions.
1616fn build_package_data_from_rfc822(
1617    metadata: &super::rfc822::Rfc822Metadata,
1618    datasource_id: DatasourceId,
1619) -> PackageData {
1620    use super::rfc822::{get_header_all, get_header_first};
1621
1622    let name = get_header_first(&metadata.headers, "name");
1623    let version = get_header_first(&metadata.headers, "version");
1624    let summary = get_header_first(&metadata.headers, "summary");
1625    let mut homepage_url = get_header_first(&metadata.headers, "home-page");
1626    let author = get_header_first(&metadata.headers, "author");
1627    let author_email = get_header_first(&metadata.headers, "author-email");
1628    let license = get_header_first(&metadata.headers, "license");
1629    let license_expression = get_header_first(&metadata.headers, "license-expression");
1630    let download_url = get_header_first(&metadata.headers, "download-url");
1631    let platform = get_header_first(&metadata.headers, "platform");
1632    let requires_python = get_header_first(&metadata.headers, "requires-python");
1633    let classifiers = get_header_all(&metadata.headers, "classifier");
1634    let license_files = get_header_all(&metadata.headers, "license-file");
1635
1636    let description_body = if metadata.body.is_empty() {
1637        get_header_first(&metadata.headers, "description").unwrap_or_default()
1638    } else {
1639        metadata.body.clone()
1640    };
1641
1642    let description = build_description(summary.as_deref(), &description_body);
1643
1644    let mut parties = Vec::new();
1645    if author.is_some() || author_email.is_some() {
1646        parties.push(Party {
1647            r#type: Some("person".to_string()),
1648            role: Some("author".to_string()),
1649            name: author,
1650            email: author_email,
1651            url: None,
1652            organization: None,
1653            organization_url: None,
1654            timezone: None,
1655        });
1656    }
1657
1658    let (keywords, license_classifiers) = split_classifiers(&classifiers);
1659    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1660    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1661        license_expression
1662            .as_deref()
1663            .and_then(normalize_spdx_expression)
1664            .map(|normalized| {
1665                build_declared_license_data(
1666                    normalized,
1667                    DeclaredLicenseMatchMetadata::single_line(
1668                        license_expression.as_deref().unwrap_or_default(),
1669                    )
1670                    .with_referenced_filenames(&referenced_license_files),
1671                )
1672            })
1673            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1674
1675    let extracted_license_statement = license_expression
1676        .clone()
1677        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1678
1679    let mut extra_data = HashMap::new();
1680    if let Some(platform_value) = platform
1681        && !platform_value.eq_ignore_ascii_case("unknown")
1682        && !platform_value.is_empty()
1683    {
1684        extra_data.insert(
1685            "platform".to_string(),
1686            serde_json::Value::String(platform_value),
1687        );
1688    }
1689
1690    if let Some(requires_python_value) = requires_python
1691        && !requires_python_value.is_empty()
1692    {
1693        extra_data.insert(
1694            "requires_python".to_string(),
1695            serde_json::Value::String(requires_python_value),
1696        );
1697    }
1698
1699    if !license_files.is_empty() {
1700        extra_data.insert(
1701            "license_files".to_string(),
1702            serde_json::Value::Array(
1703                license_files
1704                    .iter()
1705                    .cloned()
1706                    .map(serde_json::Value::String)
1707                    .collect(),
1708            ),
1709        );
1710    }
1711
1712    let file_references = license_files
1713        .iter()
1714        .map(|path| FileReference {
1715            path: path.clone(),
1716            size: None,
1717            sha1: None,
1718            md5: None,
1719            sha256: None,
1720            sha512: None,
1721            extra_data: None,
1722        })
1723        .collect();
1724
1725    let project_urls = get_header_all(&metadata.headers, "project-url");
1726    let dependencies = extract_rfc822_dependencies(&metadata.headers);
1727    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1728
1729    if !project_urls.is_empty() {
1730        let parsed_urls = parse_project_urls(&project_urls);
1731
1732        for (label, url) in &parsed_urls {
1733            let label_lower = label.to_lowercase();
1734
1735            if bug_tracking_url.is_none()
1736                && matches!(
1737                    label_lower.as_str(),
1738                    "tracker"
1739                        | "bug reports"
1740                        | "bug tracker"
1741                        | "issues"
1742                        | "issue tracker"
1743                        | "github: issues"
1744                )
1745            {
1746                bug_tracking_url = Some(url.clone());
1747            } else if code_view_url.is_none()
1748                && matches!(label_lower.as_str(), "source" | "source code" | "code")
1749            {
1750                code_view_url = Some(url.clone());
1751            } else if vcs_url.is_none()
1752                && matches!(
1753                    label_lower.as_str(),
1754                    "github" | "gitlab" | "github: repo" | "repository"
1755                )
1756            {
1757                vcs_url = Some(url.clone());
1758            } else if homepage_url.is_none()
1759                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1760            {
1761                homepage_url = Some(url.clone());
1762            } else if label_lower == "changelog" {
1763                extra_data.insert(
1764                    "changelog_url".to_string(),
1765                    serde_json::Value::String(url.clone()),
1766                );
1767            }
1768        }
1769
1770        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
1771            .iter()
1772            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
1773            .collect();
1774
1775        if !project_urls_json.is_empty() {
1776            extra_data.insert(
1777                "project_urls".to_string(),
1778                serde_json::Value::Object(project_urls_json),
1779            );
1780        }
1781    }
1782
1783    let extra_data = if extra_data.is_empty() {
1784        None
1785    } else {
1786        Some(extra_data)
1787    };
1788
1789    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1790        build_pypi_urls(name.as_deref(), version.as_deref());
1791
1792    PackageData {
1793        package_type: Some(PythonParser::PACKAGE_TYPE),
1794        namespace: None,
1795        name,
1796        version,
1797        qualifiers: None,
1798        subpath: None,
1799        primary_language: Some("Python".to_string()),
1800        description,
1801        release_date: None,
1802        parties,
1803        keywords,
1804        homepage_url,
1805        download_url,
1806        size: None,
1807        sha1: None,
1808        md5: None,
1809        sha256: None,
1810        sha512: None,
1811        bug_tracking_url,
1812        code_view_url,
1813        vcs_url,
1814        copyright: None,
1815        holder: None,
1816        declared_license_expression,
1817        declared_license_expression_spdx,
1818        license_detections,
1819        other_license_expression: None,
1820        other_license_expression_spdx: None,
1821        other_license_detections: Vec::new(),
1822        extracted_license_statement,
1823        notice_text: None,
1824        source_packages: Vec::new(),
1825        file_references,
1826        is_private: false,
1827        is_virtual: false,
1828        extra_data,
1829        dependencies,
1830        repository_homepage_url,
1831        repository_download_url,
1832        api_data_url,
1833        datasource_id: Some(datasource_id),
1834        purl,
1835    }
1836}
1837
/// Splits raw `Project-URL` header values into `(label, url)` pairs.
///
/// Each entry is expected to look like `Label, https://example.org`. The entry
/// is split at the first comma only, so commas inside the URL are preserved,
/// and whitespace around either half is trimmed. Entries without a comma, or
/// with an empty label or URL after trimming, are dropped. Input order is kept.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|entry| {
            // Split on the bare comma rather than ", ": the space after the
            // comma is conventional but not guaranteed, and both halves are
            // trimmed below anyway.
            let (label, url) = entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());
            if label.is_empty() || url.is_empty() {
                return None;
            }
            Some((label.to_string(), url.to_string()))
        })
        .collect()
}
1853
/// Combines the one-line summary and the long description body.
///
/// Both pieces are trimmed; pieces that are blank after trimming are left
/// out. The remaining pieces are joined with a single newline, summary first.
/// Returns `None` when nothing is left.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let summary = summary.map(str::trim).filter(|text| !text.is_empty());
    let body = Some(body.trim()).filter(|text| !text.is_empty());

    let sections: Vec<&str> = summary.into_iter().chain(body).collect();
    if sections.is_empty() {
        return None;
    }
    Some(sections.join("\n"))
}
1872
/// Separates license classifiers from all other classifiers.
///
/// Returns `(keywords, license_classifiers)`: entries starting with
/// `License ::` go into the second vector, everything else into the first.
/// Relative order within each vector follows the input.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|entry| entry.starts_with("License ::"));

    (keywords, license_classifiers)
}
1887
/// Renders the declared license and license classifiers as one text block.
///
/// The output has a `license: <value>` line when `license` is non-blank after
/// trimming, followed by a `classifiers:` line and one `  - '<classifier>'`
/// line per entry when `license_classifiers` is non-empty. Every line ends
/// with a newline. Returns `None` when neither part contributes a line.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let mut statement = String::new();

    if let Some(text) = license.map(str::trim).filter(|text| !text.is_empty()) {
        statement.push_str("license: ");
        statement.push_str(text);
        statement.push('\n');
    }

    if !license_classifiers.is_empty() {
        statement.push_str("classifiers:\n");
        for entry in license_classifiers {
            statement.push_str("  - '");
            statement.push_str(entry);
            statement.push_str("'\n");
        }
    }

    if statement.is_empty() {
        None
    } else {
        Some(statement)
    }
}
1913
1914pub(crate) fn build_pypi_urls(
1915    name: Option<&str>,
1916    version: Option<&str>,
1917) -> (
1918    Option<String>,
1919    Option<String>,
1920    Option<String>,
1921    Option<String>,
1922) {
1923    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
1924
1925    let repository_download_url = name.and_then(|value| {
1926        version.map(|ver| {
1927            format!(
1928                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
1929                &value[..1.min(value.len())],
1930                value,
1931                value,
1932                ver
1933            )
1934        })
1935    });
1936
1937    let api_data_url = name.map(|value| {
1938        if let Some(ver) = version {
1939            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
1940        } else {
1941            format!("https://pypi.org/pypi/{}/json", value)
1942        }
1943    });
1944
1945    let purl = name.and_then(|value| {
1946        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
1947        if let Some(ver) = version {
1948            package_url.with_version(ver).ok()?;
1949        }
1950        Some(package_url.to_string())
1951    });
1952
1953    (
1954        repository_homepage_url,
1955        repository_download_url,
1956        api_data_url,
1957        purl,
1958    )
1959}
1960
1961fn build_pypi_purl_with_extension(
1962    name: &str,
1963    version: Option<&str>,
1964    extension: &str,
1965) -> Option<String> {
1966    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1967    if let Some(ver) = version {
1968        package_url.with_version(ver).ok()?;
1969    }
1970    package_url.add_qualifier("extension", extension).ok()?;
1971    Some(package_url.to_string())
1972}
1973
1974fn extract_from_pyproject_toml(path: &Path) -> PackageData {
1975    let toml_content = match read_toml_file(path) {
1976        Ok(content) => content,
1977        Err(e) => {
1978            warn!(
1979                "Failed to read or parse pyproject.toml at {:?}: {}",
1980                path, e
1981            );
1982            return default_package_data(path);
1983        }
1984    };
1985
1986    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
1987
1988    // Handle both PEP 621 (project table) and poetry formats
1989    let project_table =
1990        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
1991            // Standard PEP 621 format with [project] table
1992            project.clone()
1993        } else if let Some(tool) = tool_table {
1994            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
1995                // Poetry format with [tool.poetry] table
1996                poetry.clone()
1997            } else {
1998                warn!(
1999                    "No project or tool.poetry data found in pyproject.toml at {:?}",
2000                    path
2001                );
2002                return default_package_data(path);
2003            }
2004        } else if toml_content.get(FIELD_NAME).is_some() {
2005            // Other format with top-level fields
2006            match toml_content.as_table() {
2007                Some(table) => table.clone(),
2008                None => {
2009                    warn!("Failed to convert TOML content to table in {:?}", path);
2010                    return default_package_data(path);
2011                }
2012            }
2013        } else {
2014            warn!("No project data found in pyproject.toml at {:?}", path);
2015            return default_package_data(path);
2016        };
2017
2018    let name = project_table
2019        .get(FIELD_NAME)
2020        .and_then(|v| v.as_str())
2021        .map(String::from);
2022
2023    let version = project_table
2024        .get(FIELD_VERSION)
2025        .and_then(|v| v.as_str())
2026        .map(String::from);
2027    let classifiers = project_table
2028        .get("classifiers")
2029        .and_then(|value| value.as_array())
2030        .map(|values| {
2031            values
2032                .iter()
2033                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2034                .collect::<Vec<_>>()
2035        })
2036        .unwrap_or_default();
2037
2038    let extracted_license_statement = extract_raw_license_string(&project_table);
2039    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2040        normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2041
2042    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
2043    let (homepage_url, repository_url) = extract_urls(&project_table);
2044
2045    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2046    let extra_data = extract_pyproject_extra_data(&toml_content);
2047
2048    // Create package URL
2049    let purl = name.as_ref().and_then(|n| {
2050        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2051            Ok(p) => p,
2052            Err(e) => {
2053                warn!(
2054                    "Failed to create PackageUrl for Python package '{}': {}",
2055                    n, e
2056                );
2057                return None;
2058            }
2059        };
2060
2061        if let Some(v) = &version
2062            && let Err(e) = package_url.with_version(v)
2063        {
2064            warn!(
2065                "Failed to set version '{}' for Python package '{}': {}",
2066                v, n, e
2067            );
2068            return None;
2069        }
2070
2071        Some(package_url.to_string())
2072    });
2073
2074    let api_data_url = name.as_ref().map(|n| {
2075        if let Some(v) = &version {
2076            format!("https://pypi.org/pypi/{}/{}/json", n, v)
2077        } else {
2078            format!("https://pypi.org/pypi/{}/json", n)
2079        }
2080    });
2081
2082    let pypi_homepage_url = name
2083        .as_ref()
2084        .map(|n| format!("https://pypi.org/project/{}", n));
2085
2086    let pypi_download_url = name.as_ref().and_then(|n| {
2087        version.as_ref().map(|v| {
2088            format!(
2089                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2090                &n[..1.min(n.len())],
2091                n,
2092                n,
2093                v
2094            )
2095        })
2096    });
2097
2098    PackageData {
2099        package_type: Some(PythonParser::PACKAGE_TYPE),
2100        namespace: None,
2101        name,
2102        version,
2103        qualifiers: None,
2104        subpath: None,
2105        primary_language: None,
2106        description: None,
2107        release_date: None,
2108        parties: extract_parties(&project_table),
2109        keywords: Vec::new(),
2110        homepage_url: homepage_url.or(pypi_homepage_url),
2111        download_url: repository_url.clone().or(pypi_download_url),
2112        size: None,
2113        sha1: None,
2114        md5: None,
2115        sha256: None,
2116        sha512: None,
2117        bug_tracking_url: None,
2118        code_view_url: None,
2119        vcs_url: repository_url,
2120        copyright: None,
2121        holder: None,
2122        declared_license_expression,
2123        declared_license_expression_spdx,
2124        license_detections,
2125        other_license_expression: None,
2126        other_license_expression_spdx: None,
2127        other_license_detections: Vec::new(),
2128        extracted_license_statement,
2129        notice_text: None,
2130        source_packages: Vec::new(),
2131        file_references: Vec::new(),
2132        is_private: has_private_classifier(&classifiers),
2133        is_virtual: false,
2134        extra_data,
2135        dependencies: [dependencies, optional_dependencies].concat(),
2136        repository_homepage_url: None,
2137        repository_download_url: None,
2138        api_data_url,
2139        datasource_id: Some(DatasourceId::PypiPyprojectToml),
2140        purl,
2141    }
2142}
2143
2144fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2145    project
2146        .get(FIELD_LICENSE)
2147        .and_then(|license_value| match license_value {
2148            TomlValue::String(license_str) => Some(license_str.clone()),
2149            TomlValue::Table(license_table) => license_table
2150                .get("text")
2151                .and_then(|v| v.as_str())
2152                .map(|s| s.to_string())
2153                .or_else(|| {
2154                    license_table
2155                        .get("expression")
2156                        .and_then(|v| v.as_str())
2157                        .map(|expr| expr.to_string())
2158                }),
2159            _ => None,
2160        })
2161}
2162
2163fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2164    match project.get(FIELD_LICENSE) {
2165        Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2166        Some(TomlValue::Table(license_table)) => license_table
2167            .get("expression")
2168            .and_then(|value| value.as_str()),
2169        _ => None,
2170    }
2171}
2172
2173fn extract_urls(project: &TomlMap<String, TomlValue>) -> (Option<String>, Option<String>) {
2174    let mut homepage_url = None;
2175    let mut repository_url = None;
2176
2177    // Check for URLs table
2178    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2179        homepage_url = urls
2180            .get(FIELD_HOMEPAGE)
2181            .and_then(|v| v.as_str())
2182            .map(String::from);
2183        repository_url = urls
2184            .get(FIELD_REPOSITORY)
2185            .and_then(|v| v.as_str())
2186            .map(String::from);
2187    }
2188
2189    // If not found in URLs table, check for top-level keys
2190    if homepage_url.is_none() {
2191        homepage_url = project
2192            .get(FIELD_HOMEPAGE)
2193            .and_then(|v| v.as_str())
2194            .map(String::from);
2195    }
2196
2197    if repository_url.is_none() {
2198        repository_url = project
2199            .get(FIELD_REPOSITORY)
2200            .and_then(|v| v.as_str())
2201            .map(String::from);
2202    }
2203
2204    (homepage_url, repository_url)
2205}
2206
2207fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2208    let mut parties = Vec::new();
2209
2210    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2211        for author in authors {
2212            if let Some(author_str) = author.as_str() {
2213                let (name, email) = split_name_email(author_str);
2214                parties.push(Party {
2215                    r#type: None,
2216                    role: Some("author".to_string()),
2217                    name,
2218                    email,
2219                    url: None,
2220                    organization: None,
2221                    organization_url: None,
2222                    timezone: None,
2223                });
2224            }
2225        }
2226    }
2227
2228    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2229        for maintainer in maintainers {
2230            if let Some(maintainer_str) = maintainer.as_str() {
2231                let (name, email) = split_name_email(maintainer_str);
2232                parties.push(Party {
2233                    r#type: None,
2234                    role: Some("maintainer".to_string()),
2235                    name,
2236                    email,
2237                    url: None,
2238                    organization: None,
2239                    organization_url: None,
2240                    timezone: None,
2241                });
2242            }
2243        }
2244    }
2245
2246    parties
2247}
2248
2249fn extract_dependencies(
2250    project: &TomlMap<String, TomlValue>,
2251    toml_content: &TomlValue,
2252) -> (Vec<Dependency>, Vec<Dependency>) {
2253    let mut dependencies = Vec::new();
2254    let mut optional_dependencies = Vec::new();
2255
2256    // Handle dependencies - can be array or table format
2257    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2258        match deps_value {
2259            TomlValue::Array(arr) => {
2260                dependencies = parse_dependency_array(arr, false, None);
2261            }
2262            TomlValue::Table(table) => {
2263                dependencies = parse_dependency_table(table, false, None);
2264            }
2265            _ => {}
2266        }
2267    }
2268
2269    // Handle PEP 621 optional-dependencies with scope
2270    if let Some(opt_deps_table) = project
2271        .get(FIELD_OPTIONAL_DEPENDENCIES)
2272        .and_then(|v| v.as_table())
2273    {
2274        for (extra_name, deps) in opt_deps_table {
2275            match deps {
2276                TomlValue::Array(arr) => {
2277                    optional_dependencies.extend(parse_dependency_array(
2278                        arr,
2279                        true,
2280                        Some(extra_name),
2281                    ));
2282                }
2283                TomlValue::Table(table) => {
2284                    optional_dependencies.extend(parse_dependency_table(
2285                        table,
2286                        true,
2287                        Some(extra_name),
2288                    ));
2289                }
2290                _ => {}
2291            }
2292        }
2293    }
2294
2295    // Handle Poetry dev-dependencies
2296    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2297        match dev_deps_value {
2298            TomlValue::Array(arr) => {
2299                optional_dependencies.extend(parse_dependency_array(
2300                    arr,
2301                    true,
2302                    Some(FIELD_DEV_DEPENDENCIES),
2303                ));
2304            }
2305            TomlValue::Table(table) => {
2306                optional_dependencies.extend(parse_dependency_table(
2307                    table,
2308                    true,
2309                    Some(FIELD_DEV_DEPENDENCIES),
2310                ));
2311            }
2312            _ => {}
2313        }
2314    }
2315
2316    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
2317    if let Some(groups_table) = project.get("group").and_then(|v| v.as_table()) {
2318        for (group_name, group_data) in groups_table {
2319            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2320                match group_deps {
2321                    TomlValue::Array(arr) => {
2322                        optional_dependencies.extend(parse_dependency_array(
2323                            arr,
2324                            true,
2325                            Some(group_name),
2326                        ));
2327                    }
2328                    TomlValue::Table(table) => {
2329                        optional_dependencies.extend(parse_dependency_table(
2330                            table,
2331                            true,
2332                            Some(group_name),
2333                        ));
2334                    }
2335                    _ => {}
2336                }
2337            }
2338        }
2339    }
2340
2341    if let Some(groups_table) = toml_content
2342        .get(FIELD_DEPENDENCY_GROUPS)
2343        .and_then(|value| value.as_table())
2344    {
2345        for (group_name, deps) in groups_table {
2346            match deps {
2347                TomlValue::Array(arr) => {
2348                    optional_dependencies.extend(parse_dependency_array(
2349                        arr,
2350                        true,
2351                        Some(group_name),
2352                    ));
2353                }
2354                TomlValue::Table(table) => {
2355                    optional_dependencies.extend(parse_dependency_table(
2356                        table,
2357                        true,
2358                        Some(group_name),
2359                    ));
2360                }
2361                _ => {}
2362            }
2363        }
2364    }
2365
2366    if let Some(dev_deps_value) = toml_content
2367        .get("tool")
2368        .and_then(|value| value.as_table())
2369        .and_then(|tool| tool.get("uv"))
2370        .and_then(|value| value.as_table())
2371        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2372    {
2373        match dev_deps_value {
2374            TomlValue::Array(arr) => {
2375                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2376            }
2377            TomlValue::Table(table) => {
2378                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2379            }
2380            _ => {}
2381        }
2382    }
2383
2384    (dependencies, optional_dependencies)
2385}
2386
2387fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2388    let mut extra_data = HashMap::new();
2389
2390    if let Some(tool_uv) = toml_content
2391        .get("tool")
2392        .and_then(|value| value.as_table())
2393        .and_then(|tool| tool.get("uv"))
2394    {
2395        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2396    }
2397
2398    if extra_data.is_empty() {
2399        None
2400    } else {
2401        Some(extra_data)
2402    }
2403}
2404
2405fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2406    match value {
2407        TomlValue::String(value) => JsonValue::String(value.clone()),
2408        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2409        TomlValue::Float(value) => JsonValue::String(value.to_string()),
2410        TomlValue::Boolean(value) => JsonValue::Bool(*value),
2411        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2412        TomlValue::Array(values) => {
2413            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2414        }
2415        TomlValue::Table(values) => JsonValue::Object(
2416            values
2417                .iter()
2418                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2419                .collect::<JsonMap<String, JsonValue>>(),
2420        ),
2421    }
2422}
2423
2424fn parse_dependency_table(
2425    table: &TomlMap<String, TomlValue>,
2426    is_optional: bool,
2427    scope: Option<&str>,
2428) -> Vec<Dependency> {
2429    table
2430        .iter()
2431        .filter_map(|(name, version)| {
2432            let version_str = version.as_str().map(|s| s.to_string());
2433            let mut package_url =
2434                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2435
2436            if let Some(v) = &version_str {
2437                package_url.with_version(v).ok()?;
2438            }
2439
2440            Some(Dependency {
2441                purl: Some(package_url.to_string()),
2442                extracted_requirement: None,
2443                scope: scope.map(|s| s.to_string()),
2444                is_runtime: Some(!is_optional),
2445                is_optional: Some(is_optional),
2446                is_pinned: None,
2447                is_direct: Some(true),
2448                resolved_package: None,
2449                extra_data: None,
2450            })
2451        })
2452        .collect()
2453}
2454
2455fn parse_dependency_array(
2456    array: &[TomlValue],
2457    is_optional: bool,
2458    scope: Option<&str>,
2459) -> Vec<Dependency> {
2460    array
2461        .iter()
2462        .filter_map(|dep| {
2463            let dep_str = dep.as_str()?;
2464
2465            let mut parts = dep_str.split(['>', '=', '<', '~']);
2466            let name = parts.next()?.trim().to_string();
2467
2468            let version = parts.next().map(|v| v.trim().to_string());
2469
2470            let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name)
2471            {
2472                Ok(purl) => purl,
2473                Err(_) => return None,
2474            };
2475
2476            if let Some(ref v) = version {
2477                package_url.with_version(v).ok()?;
2478            }
2479
2480            Some(Dependency {
2481                purl: Some(package_url.to_string()),
2482                extracted_requirement: None,
2483                scope: scope.map(|s| s.to_string()),
2484                is_runtime: Some(!is_optional),
2485                is_optional: Some(is_optional),
2486                is_pinned: None,
2487                is_direct: Some(true),
2488                resolved_package: None,
2489                extra_data: None,
2490            })
2491        })
2492        .collect()
2493}
2494
/// Literal value produced by [`LiteralEvaluator`] from an AST expression.
#[derive(Clone, Debug)]
enum Value {
    /// A string literal.
    String(String),
    /// An integer or float literal, stored as `f64`.
    Number(f64),
    /// A boolean literal.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list display with every element evaluated.
    List(Vec<Value>),
    /// A tuple display with every element evaluated.
    Tuple(Vec<Value>),
    /// A mapping whose keys have been converted to strings.
    Dict(HashMap<String, Value>),
}
2505
2506struct LiteralEvaluator {
2507    constants: HashMap<String, Value>,
2508    max_depth: usize,
2509    max_nodes: usize,
2510    nodes_visited: usize,
2511}
2512
2513impl LiteralEvaluator {
2514    fn new(constants: HashMap<String, Value>) -> Self {
2515        Self {
2516            constants,
2517            max_depth: MAX_SETUP_PY_AST_DEPTH,
2518            max_nodes: MAX_SETUP_PY_AST_NODES,
2519            nodes_visited: 0,
2520        }
2521    }
2522
2523    fn insert_constant(&mut self, name: String, value: Value) {
2524        self.constants.insert(name, value);
2525    }
2526
2527    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
2528        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
2529            return None;
2530        }
2531        self.nodes_visited += 1;
2532
2533        match expr {
2534            ast::Expr::Constant(ast::ExprConstant { value, .. }) => self.evaluate_constant(value),
2535            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
2536            ast::Expr::List(ast::ExprList { elts, .. }) => {
2537                let mut values = Vec::new();
2538                for elt in elts {
2539                    values.push(self.evaluate_expr(elt, depth + 1)?);
2540                }
2541                Some(Value::List(values))
2542            }
2543            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
2544                let mut values = Vec::new();
2545                for elt in elts {
2546                    values.push(self.evaluate_expr(elt, depth + 1)?);
2547                }
2548                Some(Value::Tuple(values))
2549            }
2550            ast::Expr::Dict(ast::ExprDict { keys, values, .. }) => {
2551                let mut dict = HashMap::new();
2552                for (key_expr, value_expr) in keys.iter().zip(values.iter()) {
2553                    let key_expr = key_expr.as_ref()?;
2554                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
2555                    let key = value_to_string(&key_value)?;
2556                    let value = self.evaluate_expr(value_expr, depth + 1)?;
2557                    dict.insert(key, value);
2558                }
2559                Some(Value::Dict(dict))
2560            }
2561            ast::Expr::Call(ast::ExprCall {
2562                func,
2563                args,
2564                keywords,
2565                ..
2566            }) => {
2567                if keywords.is_empty()
2568                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
2569                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
2570                {
2571                    return self.evaluate_ordered_dict(args, depth + 1);
2572                }
2573
2574                if !args.is_empty() {
2575                    return None;
2576                }
2577
2578                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
2579                    && id == "dict"
2580                {
2581                    let mut dict = HashMap::new();
2582                    for keyword in keywords {
2583                        let key = keyword.arg.as_ref().map(|name| name.as_str())?;
2584                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
2585                        dict.insert(key.to_string(), value);
2586                    }
2587                    return Some(Value::Dict(dict));
2588                }
2589
2590                None
2591            }
2592            _ => None,
2593        }
2594    }
2595
2596    fn evaluate_constant(&self, constant: &ast::Constant) -> Option<Value> {
2597        match constant {
2598            ast::Constant::Str(value) => Some(Value::String(value.clone())),
2599            ast::Constant::Bool(value) => Some(Value::Bool(*value)),
2600            ast::Constant::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
2601            ast::Constant::Float(value) => Some(Value::Number(*value)),
2602            ast::Constant::None => Some(Value::None),
2603            _ => None,
2604        }
2605    }
2606
2607    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
2608        if args.len() != 1 {
2609            return None;
2610        }
2611
2612        let items = match self.evaluate_expr(&args[0], depth)? {
2613            Value::List(items) | Value::Tuple(items) => items,
2614            _ => return None,
2615        };
2616
2617        let mut dict = HashMap::new();
2618        for item in items {
2619            let Value::Tuple(values) = item else {
2620                return None;
2621            };
2622            if values.len() != 2 {
2623                return None;
2624            }
2625            let key = value_to_string(&values[0])?;
2626            dict.insert(key, values[1].clone());
2627        }
2628
2629        Some(Value::Dict(dict))
2630    }
2631}
2632
/// Name bookkeeping for locating `setup(...)` calls; both collections start
/// empty via `Default`.
#[derive(Default)]
struct SetupAliases {
    // NOTE(review): presumably the local names under which `setup` is
    // callable -- confirm against the code that fills this set.
    setup_names: HashSet<String>,
    // NOTE(review): presumably maps a local module alias to the module it
    // refers to -- confirm against the code that fills this map.
    module_aliases: HashMap<String, String>,
}
2638
2639fn extract_from_setup_py(path: &Path) -> PackageData {
2640    let content = match read_file_to_string(path) {
2641        Ok(content) => content,
2642        Err(e) => {
2643            warn!("Failed to read setup.py at {:?}: {}", path, e);
2644            return default_package_data(path);
2645        }
2646    };
2647
2648    if content.len() > MAX_SETUP_PY_BYTES {
2649        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
2650        return extract_from_setup_py_regex(&content);
2651    }
2652
2653    let mut package_data = match extract_from_setup_py_ast(&content) {
2654        Ok(Some(data)) => data,
2655        Ok(None) => extract_from_setup_py_regex(&content),
2656        Err(e) => {
2657            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
2658            extract_from_setup_py_regex(&content)
2659        }
2660    };
2661
2662    if package_data.name.is_none() {
2663        package_data.name = extract_setup_value(&content, "name");
2664    }
2665
2666    if package_data.version.is_none() {
2667        package_data.version = extract_setup_value(&content, "version");
2668    }
2669
2670    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
2671
2672    if package_data.purl.is_none() {
2673        package_data.purl = build_setup_py_purl(
2674            package_data.name.as_deref(),
2675            package_data.version.as_deref(),
2676        );
2677    }
2678
2679    package_data
2680}
2681
2682fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
2683    if package_data.version.is_some()
2684        && package_data.extracted_license_statement.is_some()
2685        && package_data
2686            .parties
2687            .iter()
2688            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
2689    {
2690        return;
2691    }
2692
2693    let Some(root) = path.parent() else {
2694        return;
2695    };
2696
2697    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
2698
2699    if package_data.version.is_none() {
2700        package_data.version = dunder_metadata.version;
2701    }
2702
2703    if package_data.extracted_license_statement.is_none() {
2704        package_data.extracted_license_statement = dunder_metadata.license;
2705    }
2706
2707    let has_author = package_data
2708        .parties
2709        .iter()
2710        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
2711
2712    if !has_author && let Some(author) = dunder_metadata.author {
2713        package_data.parties.push(Party {
2714            r#type: Some("person".to_string()),
2715            role: Some("author".to_string()),
2716            name: Some(author),
2717            email: None,
2718            url: None,
2719            organization: None,
2720            organization_url: None,
2721            timezone: None,
2722        });
2723    }
2724}
2725
/// Values of module-level dunder assignments collected from imported sibling modules.
#[derive(Default)]
struct DunderMetadata {
    /// First `__version__ = "..."` value found, if any.
    version: Option<String>,
    /// First `__author__ = "..."` value found, if any.
    author: Option<String>,
    /// First `__license__ = "..."` value found, if any.
    license: Option<String>,
}
2732
2733fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
2734    let statements = match ast::Suite::parse(content, "<setup.py>") {
2735        Ok(statements) => statements,
2736        Err(_) => return DunderMetadata::default(),
2737    };
2738
2739    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2740    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2741    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2742    let mut metadata = DunderMetadata::default();
2743
2744    for module in imported_dunder_modules(&statements) {
2745        let Some(path) = resolve_imported_module_path(root, &module) else {
2746            continue;
2747        };
2748        let Ok(module_content) = read_file_to_string(&path) else {
2749            continue;
2750        };
2751
2752        if metadata.version.is_none() {
2753            metadata.version = version_re
2754                .as_ref()
2755                .and_then(|regex| regex.captures(&module_content))
2756                .and_then(|captures| captures.get(1))
2757                .map(|match_| match_.as_str().to_string());
2758        }
2759
2760        if metadata.author.is_none() {
2761            metadata.author = author_re
2762                .as_ref()
2763                .and_then(|regex| regex.captures(&module_content))
2764                .and_then(|captures| captures.get(1))
2765                .map(|match_| match_.as_str().to_string());
2766        }
2767
2768        if metadata.license.is_none() {
2769            metadata.license = license_re
2770                .as_ref()
2771                .and_then(|regex| regex.captures(&module_content))
2772                .and_then(|captures| captures.get(1))
2773                .map(|match_| match_.as_str().to_string());
2774        }
2775
2776        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
2777            return metadata;
2778        }
2779    }
2780
2781    metadata
2782}
2783
2784fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
2785    let mut modules = Vec::new();
2786
2787    for statement in statements {
2788        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
2789            continue;
2790        };
2791        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
2792            continue;
2793        };
2794        let imports_dunder = names.iter().any(|alias| {
2795            matches!(
2796                alias.name.as_str(),
2797                "__version__" | "__author__" | "__license__"
2798            )
2799        });
2800        if imports_dunder {
2801            modules.push(module.to_string());
2802        }
2803    }
2804
2805    modules
2806}
2807
/// Resolves a dotted Python module name to a source file below `root`.
///
/// Candidates are tried in this order and the first regular file wins:
/// `<root>/<a/b>.py`, `<root>/<a/b>/__init__.py`,
/// `<root>/src/<a/b>.py`, `<root>/src/<a/b>/__init__.py`.
///
/// Only regular files are accepted: a directory that happens to be named like a
/// candidate (e.g. `pkg.py/`) must not shadow a later, readable candidate.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let relative: PathBuf = module.split('.').collect();
    let module_file = relative.with_extension("py");
    let src_root = root.join("src");

    [
        root.join(&module_file),
        root.join(&relative).join("__init__.py"),
        src_root.join(&module_file),
        src_root.join(&relative).join("__init__.py"),
    ]
    .into_iter()
    .find(|candidate| candidate.is_file())
}
2819
2820/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
2821///
2822/// # Security Model
2823///
2824/// This function parses setup.py as a Python AST and evaluates only literal values
2825/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
2826/// arbitrary code execution during scanning.
2827///
2828/// # DoS Prevention
2829///
2830/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
2831/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
2832/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
2833///
2834/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
2835fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
2836    let statements = ast::Suite::parse(content, "<setup.py>").map_err(|e| format!("{}", e))?;
2837    let aliases = collect_setup_aliases(&statements);
2838    let mut evaluator = LiteralEvaluator::new(HashMap::new());
2839    build_setup_py_constants(&statements, &mut evaluator);
2840
2841    let setup_call = find_setup_call(&statements, &aliases);
2842    let Some(call_expr) = setup_call else {
2843        return Ok(None);
2844    };
2845
2846    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
2847    Ok(Some(build_setup_py_package_data(&setup_values)))
2848}
2849
2850fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
2851    for stmt in statements {
2852        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
2853            if targets.len() != 1 {
2854                continue;
2855            }
2856
2857            let Some(name) = extract_assign_name(&targets[0]) else {
2858                continue;
2859            };
2860
2861            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
2862                evaluator.insert_constant(name, value);
2863            }
2864        }
2865    }
2866}
2867
2868fn extract_assign_name(target: &ast::Expr) -> Option<String> {
2869    match target {
2870        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
2871        _ => None,
2872    }
2873}
2874
2875fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
2876    let mut aliases = SetupAliases::default();
2877    aliases.setup_names.insert("setup".to_string());
2878
2879    for stmt in statements {
2880        match stmt {
2881            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
2882                for alias in names {
2883                    let module_name = alias.name.as_str();
2884                    if !is_setup_module(module_name) {
2885                        continue;
2886                    }
2887                    let alias_name = alias
2888                        .asname
2889                        .as_ref()
2890                        .map(|name| name.as_str())
2891                        .unwrap_or(module_name);
2892                    aliases
2893                        .module_aliases
2894                        .insert(alias_name.to_string(), module_name.to_string());
2895                }
2896            }
2897            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
2898                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
2899                    continue;
2900                };
2901                if !is_setup_module(module_name) {
2902                    continue;
2903                }
2904                for alias in names {
2905                    if alias.name.as_str() != "setup" {
2906                        continue;
2907                    }
2908                    let alias_name = alias
2909                        .asname
2910                        .as_ref()
2911                        .map(|name| name.as_str())
2912                        .unwrap_or("setup");
2913                    aliases.setup_names.insert(alias_name.to_string());
2914                }
2915            }
2916            _ => {}
2917        }
2918    }
2919
2920    aliases
2921}
2922
/// Returns `true` when `module_name` is exactly one of the modules recognised as
/// providing `setup()`: `setuptools`, `distutils` or `distutils.core`.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
2926
2927fn find_setup_call<'a>(
2928    statements: &'a [ast::Stmt],
2929    aliases: &'a SetupAliases,
2930) -> Option<&'a ast::Expr> {
2931    let mut finder = SetupCallFinder {
2932        aliases,
2933        nodes_visited: 0,
2934    };
2935    finder.find_in_statements(statements)
2936}
2937
2938struct SetupCallFinder<'a> {
2939    aliases: &'a SetupAliases,
2940    nodes_visited: usize,
2941}
2942
2943impl<'a> SetupCallFinder<'a> {
2944    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
2945        for stmt in statements {
2946            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
2947                return None;
2948            }
2949            self.nodes_visited += 1;
2950
2951            let found = match stmt {
2952                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
2953                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
2954                ast::Stmt::If(ast::StmtIf { body, orelse, .. }) => self
2955                    .find_in_statements(body)
2956                    .or_else(|| self.find_in_statements(orelse)),
2957                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
2958                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
2959                    .find_in_statements(body)
2960                    .or_else(|| self.find_in_statements(orelse)),
2961                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
2962                ast::Stmt::Try(ast::StmtTry {
2963                    body,
2964                    orelse,
2965                    finalbody,
2966                    handlers,
2967                    ..
2968                })
2969                | ast::Stmt::TryStar(ast::StmtTryStar {
2970                    body,
2971                    orelse,
2972                    finalbody,
2973                    handlers,
2974                    ..
2975                }) => self
2976                    .find_in_statements(body)
2977                    .or_else(|| self.find_in_statements(orelse))
2978                    .or_else(|| self.find_in_statements(finalbody))
2979                    .or_else(|| {
2980                        for handler in handlers {
2981                            let ast::ExceptHandler::ExceptHandler(
2982                                ast::ExceptHandlerExceptHandler { body, .. },
2983                            ) = handler;
2984                            if let Some(found) = self.find_in_statements(body) {
2985                                return Some(found);
2986                            }
2987                        }
2988                        None
2989                    }),
2990                _ => None,
2991            };
2992
2993            if found.is_some() {
2994                return found;
2995            }
2996        }
2997
2998        None
2999    }
3000
3001    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3002        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3003            return None;
3004        }
3005        self.nodes_visited += 1;
3006
3007        match expr {
3008            ast::Expr::Call(ast::ExprCall { func, .. })
3009                if is_setup_call(func.as_ref(), self.aliases) =>
3010            {
3011                Some(expr)
3012            }
3013            _ => None,
3014        }
3015    }
3016}
3017
3018fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3019    let Some(dotted) = dotted_name(func, 0) else {
3020        return false;
3021    };
3022
3023    if aliases.setup_names.contains(&dotted) {
3024        return true;
3025    }
3026
3027    let Some(module) = dotted.strip_suffix(".setup") else {
3028        return false;
3029    };
3030
3031    let resolved = resolve_module_alias(module, aliases);
3032    is_setup_module(&resolved)
3033}
3034
3035fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3036    if depth >= MAX_SETUP_PY_AST_DEPTH {
3037        return None;
3038    }
3039
3040    match expr {
3041        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3042        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3043            let base = dotted_name(value.as_ref(), depth + 1)?;
3044            Some(format!("{}.{}", base, attr.as_str()))
3045        }
3046        _ => None,
3047    }
3048}
3049
3050fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3051    if let Some(mapped) = aliases.module_aliases.get(module) {
3052        return mapped.clone();
3053    }
3054
3055    let Some((base, rest)) = module.split_once('.') else {
3056        return module.to_string();
3057    };
3058
3059    if let Some(mapped) = aliases.module_aliases.get(base) {
3060        return format!("{}.{}", mapped, rest);
3061    }
3062
3063    module.to_string()
3064}
3065
3066fn extract_setup_keywords(
3067    call_expr: &ast::Expr,
3068    evaluator: &mut LiteralEvaluator,
3069) -> HashMap<String, Value> {
3070    let mut values = HashMap::new();
3071    let ast::Expr::Call(ast::ExprCall { keywords, .. }) = call_expr else {
3072        return values;
3073    };
3074
3075    for keyword in keywords {
3076        if let Some(arg) = keyword.arg.as_ref().map(|name| name.as_str()) {
3077            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3078                values.insert(arg.to_string(), value);
3079            }
3080        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3081            for (key, value) in dict {
3082                values.insert(key, value);
3083            }
3084        }
3085    }
3086
3087    values
3088}
3089
3090fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3091    let name = get_value_string(values, "name");
3092    let version = get_value_string(values, "version");
3093    let description =
3094        get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
3095    let homepage_url =
3096        get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
3097    let author = get_value_string(values, "author");
3098    let author_email = get_value_string(values, "author_email");
3099    let maintainer = get_value_string(values, "maintainer");
3100    let maintainer_email = get_value_string(values, "maintainer_email");
3101    let license = get_value_string(values, "license");
3102    let classifiers = values
3103        .get("classifiers")
3104        .and_then(value_to_string_list)
3105        .unwrap_or_default();
3106
3107    let mut parties = Vec::new();
3108    if author.is_some() || author_email.is_some() {
3109        parties.push(Party {
3110            r#type: Some("person".to_string()),
3111            role: Some("author".to_string()),
3112            name: author,
3113            email: author_email,
3114            url: None,
3115            organization: None,
3116            organization_url: None,
3117            timezone: None,
3118        });
3119    }
3120
3121    if maintainer.is_some() || maintainer_email.is_some() {
3122        parties.push(Party {
3123            r#type: Some("person".to_string()),
3124            role: Some("maintainer".to_string()),
3125            name: maintainer,
3126            email: maintainer_email,
3127            url: None,
3128            organization: None,
3129            organization_url: None,
3130            timezone: None,
3131        });
3132    }
3133
3134    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3135        normalize_spdx_declared_license(license.as_deref());
3136    let extracted_license_statement = license.clone();
3137
3138    let dependencies = build_setup_py_dependencies(values);
3139    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3140    let mut homepage_from_project_urls = None;
3141    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3142    let mut extra_data = HashMap::new();
3143
3144    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3145        apply_project_url_mappings(
3146            &parsed_project_urls,
3147            &mut homepage_from_project_urls,
3148            &mut bug_tracking_url,
3149            &mut code_view_url,
3150            &mut vcs_url,
3151            &mut extra_data,
3152        );
3153    }
3154
3155    let extra_data = if extra_data.is_empty() {
3156        None
3157    } else {
3158        Some(extra_data)
3159    };
3160
3161    PackageData {
3162        package_type: Some(PythonParser::PACKAGE_TYPE),
3163        namespace: None,
3164        name,
3165        version,
3166        qualifiers: None,
3167        subpath: None,
3168        primary_language: Some("Python".to_string()),
3169        description,
3170        release_date: None,
3171        parties,
3172        keywords: Vec::new(),
3173        homepage_url: homepage_url.or(homepage_from_project_urls),
3174        download_url: None,
3175        size: None,
3176        sha1: None,
3177        md5: None,
3178        sha256: None,
3179        sha512: None,
3180        bug_tracking_url,
3181        code_view_url,
3182        vcs_url,
3183        copyright: None,
3184        holder: None,
3185        declared_license_expression,
3186        declared_license_expression_spdx,
3187        license_detections,
3188        other_license_expression: None,
3189        other_license_expression_spdx: None,
3190        other_license_detections: Vec::new(),
3191        extracted_license_statement,
3192        notice_text: None,
3193        source_packages: Vec::new(),
3194        file_references: Vec::new(),
3195        is_private: has_private_classifier(&classifiers),
3196        is_virtual: false,
3197        extra_data,
3198        dependencies,
3199        repository_homepage_url: None,
3200        repository_download_url: None,
3201        api_data_url: None,
3202        datasource_id: Some(DatasourceId::PypiSetupPy),
3203        purl,
3204    }
3205}
3206
3207fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3208    let mut dependencies = Vec::new();
3209
3210    if let Some(reqs) = values
3211        .get("install_requires")
3212        .and_then(value_to_string_list)
3213    {
3214        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3215    }
3216
3217    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3218        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3219    }
3220
3221    if let Some(Value::Dict(extras)) = values.get("extras_require") {
3222        let mut extra_items: Vec<_> = extras.iter().collect();
3223        extra_items.sort_by_key(|(name, _)| *name);
3224        for (extra_name, extra_value) in extra_items {
3225            if let Some(reqs) = value_to_string_list(extra_value) {
3226                dependencies.extend(build_setup_py_dependency_list(
3227                    reqs.as_slice(),
3228                    extra_name,
3229                    true,
3230                ));
3231            }
3232        }
3233    }
3234
3235    dependencies
3236}
3237
3238fn build_setup_py_dependency_list(
3239    reqs: &[String],
3240    scope: &str,
3241    is_optional: bool,
3242) -> Vec<Dependency> {
3243    reqs.iter()
3244        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3245        .collect()
3246}
3247
3248fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3249    values.get(key).and_then(value_to_string)
3250}
3251
3252fn value_to_string(value: &Value) -> Option<String> {
3253    match value {
3254        Value::String(value) => Some(value.clone()),
3255        Value::Number(value) => Some(value.to_string()),
3256        Value::Bool(value) => Some(value.to_string()),
3257        _ => None,
3258    }
3259}
3260
3261fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3262    match value {
3263        Value::String(value) => Some(vec![value.clone()]),
3264        Value::List(values) | Value::Tuple(values) => {
3265            let mut items = Vec::new();
3266            for item in values {
3267                items.push(value_to_string(item)?);
3268            }
3269            Some(items)
3270        }
3271        _ => None,
3272    }
3273}
3274
3275fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3276    let Value::Dict(dict) = value else {
3277        return None;
3278    };
3279
3280    let mut pairs: Vec<(String, String)> = dict
3281        .iter()
3282        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3283        .collect::<Option<Vec<_>>>()?;
3284    pairs.sort_by(|left, right| left.0.cmp(&right.0));
3285    Some(pairs)
3286}
3287
3288fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3289    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3290    extract_requires_dist_dependencies(&requires_dist)
3291}
3292
3293pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3294    requires_dist
3295        .iter()
3296        .filter_map(|entry| build_rfc822_dependency(entry))
3297        .collect()
3298}
3299
3300fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3301    build_python_dependency(entry, "install", false, None)
3302}
3303
3304fn build_python_dependency(
3305    entry: &str,
3306    default_scope: &str,
3307    default_optional: bool,
3308    marker_override: Option<&str>,
3309) -> Option<Dependency> {
3310    let (requirement_part, marker_part) = entry
3311        .split_once(';')
3312        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3313        .unwrap_or((entry.trim(), None));
3314
3315    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3316    let requirement = normalize_rfc822_requirement(requirement_part);
3317    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3318        marker_part.or(marker_override),
3319        default_scope,
3320        default_optional,
3321    );
3322    let mut purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
3323
3324    let is_pinned = requirement
3325        .as_deref()
3326        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3327    if is_pinned
3328        && let Some(version) = requirement
3329            .as_deref()
3330            .map(|req| req.trim_start_matches('='))
3331    {
3332        purl.with_version(version).ok()?;
3333    }
3334
3335    let mut extra_data = HashMap::new();
3336    extra_data.extend(marker_data);
3337    if let Some(marker) = marker {
3338        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3339    }
3340
3341    Some(Dependency {
3342        purl: Some(purl.to_string()),
3343        extracted_requirement: requirement,
3344        scope: Some(scope),
3345        is_runtime: Some(true),
3346        is_optional: Some(is_optional),
3347        is_pinned: Some(is_pinned),
3348        is_direct: Some(true),
3349        resolved_package: None,
3350        extra_data: if extra_data.is_empty() {
3351            None
3352        } else {
3353            Some(extra_data)
3354        },
3355    })
3356}
3357
3358fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3359    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3360    let trimmed = requirement_part.trim();
3361    let mut remainder = trimmed[name.len()..].trim();
3362
3363    if let Some(stripped) = remainder.strip_prefix('[')
3364        && let Some(end_idx) = stripped.find(']')
3365    {
3366        remainder = stripped[end_idx + 1..].trim();
3367    }
3368
3369    let remainder = remainder
3370        .strip_prefix('(')
3371        .and_then(|value| value.strip_suffix(')'))
3372        .unwrap_or(remainder)
3373        .trim();
3374
3375    if remainder.is_empty() {
3376        return None;
3377    }
3378
3379    let mut specifiers: Vec<String> = remainder
3380        .split(',')
3381        .map(|specifier| specifier.trim().replace(' ', ""))
3382        .filter(|specifier| !specifier.is_empty())
3383        .collect();
3384    specifiers.sort();
3385    Some(specifiers.join(","))
3386}
3387
3388fn parse_rfc822_marker(
3389    marker_part: Option<&str>,
3390    default_scope: &str,
3391    default_optional: bool,
3392) -> (
3393    String,
3394    bool,
3395    Option<String>,
3396    HashMap<String, serde_json::Value>,
3397) {
3398    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
3399        return (
3400            default_scope.to_string(),
3401            default_optional,
3402            None,
3403            HashMap::new(),
3404        );
3405    };
3406
3407    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
3408        .expect("extra marker regex should compile");
3409    let mut extra_data = HashMap::new();
3410
3411    if let Some(python_version) = extract_marker_field(marker, "python_version") {
3412        extra_data.insert(
3413            "python_version".to_string(),
3414            serde_json::Value::String(python_version),
3415        );
3416    }
3417    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
3418        extra_data.insert(
3419            "sys_platform".to_string(),
3420            serde_json::Value::String(sys_platform),
3421        );
3422    }
3423
3424    if let Some(captures) = extra_re.captures(marker)
3425        && let Some(scope) = captures.get(1)
3426    {
3427        return (
3428            scope.as_str().to_string(),
3429            true,
3430            Some(marker.trim().to_string()),
3431            extra_data,
3432        );
3433    }
3434
3435    (
3436        default_scope.to_string(),
3437        default_optional,
3438        Some(marker.trim().to_string()),
3439        extra_data,
3440    )
3441}
3442
3443fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
3444    let re = Regex::new(&format!(
3445        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
3446        field
3447    ))
3448    .ok()?;
3449    let captures = re.captures(marker)?;
3450    let operator = captures.get(1)?.as_str();
3451    let value = captures.get(2)?.as_str();
3452    Some(format!("{} {}", operator, value))
3453}
3454
3455fn parse_requires_txt(content: &str) -> Vec<Dependency> {
3456    let mut dependencies = Vec::new();
3457    let mut current_scope = "install".to_string();
3458    let mut current_optional = false;
3459    let mut current_marker: Option<String> = None;
3460
3461    for line in content.lines() {
3462        let trimmed = line.trim();
3463        if trimmed.is_empty() || trimmed.starts_with('#') {
3464            continue;
3465        }
3466
3467        if trimmed.starts_with('[') && trimmed.ends_with(']') {
3468            let inner = &trimmed[1..trimmed.len() - 1];
3469            if let Some(rest) = inner.strip_prefix(':') {
3470                current_scope = "install".to_string();
3471                current_optional = false;
3472                current_marker = Some(rest.trim().to_string());
3473            } else if let Some((scope, marker)) = inner.split_once(':') {
3474                current_scope = scope.trim().to_string();
3475                current_optional = true;
3476                current_marker = Some(marker.trim().to_string());
3477            } else {
3478                current_scope = inner.trim().to_string();
3479                current_optional = true;
3480                current_marker = None;
3481            }
3482            continue;
3483        }
3484
3485        if let Some(dependency) = build_python_dependency(
3486            trimmed,
3487            &current_scope,
3488            current_optional,
3489            current_marker.as_deref(),
3490        ) {
3491            dependencies.push(dependency);
3492        }
3493    }
3494
3495    dependencies
3496}
3497
/// Returns `true` when any classifier equals `Private :: Do Not Upload`,
/// compared ASCII case-insensitively against the whole string.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
3503
3504fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
3505    let name = name?;
3506    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
3507    if let Some(version) = version {
3508        package_url.with_version(version).ok()?;
3509    }
3510    Some(package_url.to_string())
3511}
3512
3513fn extract_from_setup_py_regex(content: &str) -> PackageData {
3514    let name = extract_setup_value(content, "name");
3515    let version = extract_setup_value(content, "version");
3516    let license_expression = extract_setup_value(content, "license");
3517
3518    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3519        normalize_spdx_declared_license(license_expression.as_deref());
3520    let extracted_license_statement = license_expression.clone();
3521
3522    let dependencies = extract_setup_py_dependencies(content);
3523    let homepage_url = extract_setup_value(content, "url");
3524    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3525
3526    PackageData {
3527        package_type: Some(PythonParser::PACKAGE_TYPE),
3528        namespace: None,
3529        name,
3530        version,
3531        qualifiers: None,
3532        subpath: None,
3533        primary_language: Some("Python".to_string()),
3534        description: None,
3535        release_date: None,
3536        parties: Vec::new(),
3537        keywords: Vec::new(),
3538        homepage_url,
3539        download_url: None,
3540        size: None,
3541        sha1: None,
3542        md5: None,
3543        sha256: None,
3544        sha512: None,
3545        bug_tracking_url: None,
3546        code_view_url: None,
3547        vcs_url: None,
3548        copyright: None,
3549        holder: None,
3550        declared_license_expression,
3551        declared_license_expression_spdx,
3552        license_detections,
3553        other_license_expression: None,
3554        other_license_expression_spdx: None,
3555        other_license_detections: Vec::new(),
3556        extracted_license_statement,
3557        notice_text: None,
3558        source_packages: Vec::new(),
3559        file_references: Vec::new(),
3560        is_private: false,
3561        is_virtual: false,
3562        extra_data: None,
3563        dependencies,
3564        repository_homepage_url: None,
3565        repository_download_url: None,
3566        api_data_url: None,
3567        datasource_id: Some(DatasourceId::PypiSetupPy),
3568        purl,
3569    }
3570}
3571
3572fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
3573    crate::models::ResolvedPackage {
3574        package_type: pkg.package_type.unwrap_or(PackageType::Pypi),
3575        namespace: pkg.namespace.clone().unwrap_or_default(),
3576        name: pkg.name.clone().unwrap_or_default(),
3577        version: pkg.version.clone().unwrap_or_default(),
3578        primary_language: pkg.primary_language.clone(),
3579        download_url: pkg.download_url.clone(),
3580        sha1: pkg.sha1.clone(),
3581        sha256: pkg.sha256.clone(),
3582        sha512: pkg.sha512.clone(),
3583        md5: pkg.md5.clone(),
3584        is_virtual: pkg.is_virtual,
3585        extra_data: None,
3586        dependencies: pkg.dependencies.clone(),
3587        repository_homepage_url: pkg.repository_homepage_url.clone(),
3588        repository_download_url: pkg.repository_download_url.clone(),
3589        api_data_url: pkg.api_data_url.clone(),
3590        datasource_id: pkg.datasource_id,
3591        purl: pkg.purl.clone(),
3592    }
3593}
3594
3595fn extract_from_pypi_json(path: &Path) -> PackageData {
3596    let default = PackageData {
3597        package_type: Some(PythonParser::PACKAGE_TYPE),
3598        datasource_id: Some(DatasourceId::PypiJson),
3599        ..Default::default()
3600    };
3601
3602    let content = match read_file_to_string(path) {
3603        Ok(content) => content,
3604        Err(error) => {
3605            warn!("Failed to read pypi.json at {:?}: {}", path, error);
3606            return default;
3607        }
3608    };
3609
3610    let root: serde_json::Value = match serde_json::from_str(&content) {
3611        Ok(value) => value,
3612        Err(error) => {
3613            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
3614            return default;
3615        }
3616    };
3617
3618    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
3619        warn!("No info object found in pypi.json at {:?}", path);
3620        return default;
3621    };
3622
3623    let name = info
3624        .get("name")
3625        .and_then(|value| value.as_str())
3626        .map(ToOwned::to_owned);
3627    let version = info
3628        .get("version")
3629        .and_then(|value| value.as_str())
3630        .map(ToOwned::to_owned);
3631    let summary = info
3632        .get("summary")
3633        .and_then(|value| value.as_str())
3634        .map(ToOwned::to_owned);
3635    let description = info
3636        .get("description")
3637        .and_then(|value| value.as_str())
3638        .filter(|value| !value.trim().is_empty())
3639        .map(ToOwned::to_owned)
3640        .or(summary);
3641    let mut homepage_url = info
3642        .get("home_page")
3643        .and_then(|value| value.as_str())
3644        .map(ToOwned::to_owned);
3645    let author = info
3646        .get("author")
3647        .and_then(|value| value.as_str())
3648        .filter(|value| !value.trim().is_empty())
3649        .map(ToOwned::to_owned);
3650    let author_email = info
3651        .get("author_email")
3652        .and_then(|value| value.as_str())
3653        .filter(|value| !value.trim().is_empty())
3654        .map(ToOwned::to_owned);
3655    let license = info
3656        .get("license")
3657        .and_then(|value| value.as_str())
3658        .filter(|value| !value.trim().is_empty())
3659        .map(ToOwned::to_owned);
3660    let keywords = parse_setup_cfg_keywords(
3661        info.get("keywords")
3662            .and_then(|value| value.as_str())
3663            .map(ToOwned::to_owned),
3664    );
3665    let classifiers = info
3666        .get("classifiers")
3667        .and_then(|value| value.as_array())
3668        .map(|values| {
3669            values
3670                .iter()
3671                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
3672                .collect::<Vec<_>>()
3673        })
3674        .unwrap_or_default();
3675
3676    let mut parties = Vec::new();
3677    if author.is_some() || author_email.is_some() {
3678        parties.push(Party {
3679            r#type: Some("person".to_string()),
3680            role: Some("author".to_string()),
3681            name: author,
3682            email: author_email,
3683            url: None,
3684            organization: None,
3685            organization_url: None,
3686            timezone: None,
3687        });
3688    }
3689
3690    let mut bug_tracking_url = None;
3691    let mut code_view_url = None;
3692    let mut vcs_url = None;
3693    let mut extra_data = HashMap::new();
3694
3695    let parsed_project_urls = info
3696        .get("project_urls")
3697        .and_then(|value| value.as_object())
3698        .map(|map| {
3699            let mut pairs: Vec<(String, String)> = map
3700                .iter()
3701                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
3702                .collect();
3703            pairs.sort_by(|left, right| left.0.cmp(&right.0));
3704            pairs
3705        })
3706        .unwrap_or_default();
3707
3708    apply_project_url_mappings(
3709        &parsed_project_urls,
3710        &mut homepage_url,
3711        &mut bug_tracking_url,
3712        &mut code_view_url,
3713        &mut vcs_url,
3714        &mut extra_data,
3715    );
3716
3717    let (download_url, size, sha256) = root
3718        .get("urls")
3719        .and_then(|value| value.as_array())
3720        .map(|urls| select_pypi_json_artifact(urls))
3721        .unwrap_or((None, None, None));
3722
3723    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3724        normalize_spdx_declared_license(license.as_deref());
3725    let dependencies = info
3726        .get("requires_dist")
3727        .and_then(|value| value.as_array())
3728        .map(|entries| {
3729            entries
3730                .iter()
3731                .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
3732                .collect::<Vec<_>>()
3733        })
3734        .map(|entries| extract_requires_dist_dependencies(&entries))
3735        .unwrap_or_default();
3736
3737    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
3738        build_pypi_urls(name.as_deref(), version.as_deref());
3739
3740    PackageData {
3741        package_type: Some(PythonParser::PACKAGE_TYPE),
3742        namespace: None,
3743        name,
3744        version,
3745        qualifiers: None,
3746        subpath: None,
3747        primary_language: None,
3748        description,
3749        release_date: None,
3750        parties,
3751        keywords,
3752        homepage_url: homepage_url.or(repository_homepage_url.clone()),
3753        download_url,
3754        size,
3755        sha1: None,
3756        md5: None,
3757        sha256,
3758        sha512: None,
3759        bug_tracking_url,
3760        code_view_url,
3761        vcs_url,
3762        copyright: None,
3763        holder: None,
3764        declared_license_expression,
3765        declared_license_expression_spdx,
3766        license_detections,
3767        other_license_expression: None,
3768        other_license_expression_spdx: None,
3769        other_license_detections: Vec::new(),
3770        extracted_license_statement: license,
3771        notice_text: None,
3772        source_packages: Vec::new(),
3773        file_references: Vec::new(),
3774        is_private: has_private_classifier(&classifiers),
3775        is_virtual: false,
3776        extra_data: if extra_data.is_empty() {
3777            None
3778        } else {
3779            Some(extra_data)
3780        },
3781        dependencies,
3782        repository_homepage_url,
3783        repository_download_url,
3784        api_data_url,
3785        datasource_id: Some(DatasourceId::PypiJson),
3786        purl,
3787    }
3788}
3789
3790fn select_pypi_json_artifact(
3791    urls: &[serde_json::Value],
3792) -> (Option<String>, Option<u64>, Option<String>) {
3793    let selected = urls
3794        .iter()
3795        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
3796        .or_else(|| urls.first());
3797
3798    let Some(entry) = selected else {
3799        return (None, None, None);
3800    };
3801
3802    let download_url = entry
3803        .get("url")
3804        .and_then(|value| value.as_str())
3805        .map(ToOwned::to_owned);
3806    let size = entry.get("size").and_then(|value| value.as_u64());
3807    let sha256 = entry
3808        .get("digests")
3809        .and_then(|value| value.as_object())
3810        .and_then(|digests| digests.get("sha256"))
3811        .and_then(|value| value.as_str())
3812        .map(ToOwned::to_owned);
3813
3814    (download_url, size, sha256)
3815}
3816
3817fn extract_from_pip_inspect(path: &Path) -> PackageData {
3818    let content = match read_file_to_string(path) {
3819        Ok(content) => content,
3820        Err(e) => {
3821            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
3822            return default_package_data(path);
3823        }
3824    };
3825
3826    let root: serde_json::Value = match serde_json::from_str(&content) {
3827        Ok(value) => value,
3828        Err(e) => {
3829            warn!(
3830                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
3831                path, e
3832            );
3833            return default_package_data(path);
3834        }
3835    };
3836
3837    let installed = match root.get("installed").and_then(|v| v.as_array()) {
3838        Some(arr) => arr,
3839        None => {
3840            warn!(
3841                "No 'installed' array found in pip-inspect.deplock at {:?}",
3842                path
3843            );
3844            return default_package_data(path);
3845        }
3846    };
3847
3848    let pip_version = root
3849        .get("pip_version")
3850        .and_then(|v| v.as_str())
3851        .map(String::from);
3852    let inspect_version = root
3853        .get("version")
3854        .and_then(|v| v.as_str())
3855        .map(String::from);
3856
3857    let mut main_package: Option<PackageData> = None;
3858    let mut dependencies: Vec<Dependency> = Vec::new();
3859
3860    for package_entry in installed {
3861        let metadata = match package_entry.get("metadata") {
3862            Some(m) => m,
3863            None => continue,
3864        };
3865
3866        let is_requested = package_entry
3867            .get("requested")
3868            .and_then(|v| v.as_bool())
3869            .unwrap_or(false);
3870        let has_direct_url = package_entry.get("direct_url").is_some();
3871
3872        let name = metadata
3873            .get("name")
3874            .and_then(|v| v.as_str())
3875            .map(String::from);
3876        let version = metadata
3877            .get("version")
3878            .and_then(|v| v.as_str())
3879            .map(String::from);
3880        let summary = metadata
3881            .get("summary")
3882            .and_then(|v| v.as_str())
3883            .map(String::from);
3884        let home_page = metadata
3885            .get("home_page")
3886            .and_then(|v| v.as_str())
3887            .map(String::from);
3888        let author = metadata
3889            .get("author")
3890            .and_then(|v| v.as_str())
3891            .map(String::from);
3892        let author_email = metadata
3893            .get("author_email")
3894            .and_then(|v| v.as_str())
3895            .map(String::from);
3896        let license = metadata
3897            .get("license")
3898            .and_then(|v| v.as_str())
3899            .map(String::from);
3900        let description = metadata
3901            .get("description")
3902            .and_then(|v| v.as_str())
3903            .map(String::from);
3904        let keywords = metadata
3905            .get("keywords")
3906            .and_then(|v| v.as_array())
3907            .map(|arr| {
3908                arr.iter()
3909                    .filter_map(|k| k.as_str().map(String::from))
3910                    .collect::<Vec<_>>()
3911            })
3912            .unwrap_or_default();
3913
3914        let mut parties = Vec::new();
3915        if author.is_some() || author_email.is_some() {
3916            parties.push(Party {
3917                r#type: Some("person".to_string()),
3918                role: Some("author".to_string()),
3919                name: author,
3920                email: author_email,
3921                url: None,
3922                organization: None,
3923                organization_url: None,
3924                timezone: None,
3925            });
3926        }
3927
3928        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3929            normalize_spdx_declared_license(license.as_deref());
3930        let extracted_license_statement = license.clone();
3931        let requires_dist = metadata
3932            .get("requires_dist")
3933            .and_then(|v| v.as_array())
3934            .map(|entries| {
3935                entries
3936                    .iter()
3937                    .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
3938                    .collect::<Vec<_>>()
3939            })
3940            .unwrap_or_default();
3941        let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
3942
3943        let purl = name.as_ref().and_then(|n| {
3944            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
3945            if let Some(v) = &version {
3946                package_url.with_version(v).ok()?;
3947            }
3948            Some(package_url.to_string())
3949        });
3950
3951        if is_requested && has_direct_url {
3952            let mut extra_data = HashMap::new();
3953            if let Some(pv) = &pip_version {
3954                extra_data.insert(
3955                    "pip_version".to_string(),
3956                    serde_json::Value::String(pv.clone()),
3957                );
3958            }
3959            if let Some(iv) = &inspect_version {
3960                extra_data.insert(
3961                    "inspect_version".to_string(),
3962                    serde_json::Value::String(iv.clone()),
3963                );
3964            }
3965
3966            main_package = Some(PackageData {
3967                package_type: Some(PythonParser::PACKAGE_TYPE),
3968                namespace: None,
3969                name,
3970                version,
3971                qualifiers: None,
3972                subpath: None,
3973                primary_language: Some("Python".to_string()),
3974                description: description.or(summary),
3975                release_date: None,
3976                parties,
3977                keywords,
3978                homepage_url: home_page,
3979                download_url: None,
3980                size: None,
3981                sha1: None,
3982                md5: None,
3983                sha256: None,
3984                sha512: None,
3985                bug_tracking_url: None,
3986                code_view_url: None,
3987                vcs_url: None,
3988                copyright: None,
3989                holder: None,
3990                declared_license_expression,
3991                declared_license_expression_spdx,
3992                license_detections,
3993                other_license_expression: None,
3994                other_license_expression_spdx: None,
3995                other_license_detections: Vec::new(),
3996                extracted_license_statement,
3997                notice_text: None,
3998                source_packages: Vec::new(),
3999                file_references: Vec::new(),
4000                is_private: false,
4001                is_virtual: true,
4002                extra_data: if extra_data.is_empty() {
4003                    None
4004                } else {
4005                    Some(extra_data)
4006                },
4007                dependencies: parsed_dependencies,
4008                repository_homepage_url: None,
4009                repository_download_url: None,
4010                api_data_url: None,
4011                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4012                purl,
4013            });
4014        } else {
4015            let resolved_package = PackageData {
4016                package_type: Some(PythonParser::PACKAGE_TYPE),
4017                namespace: None,
4018                name: name.clone(),
4019                version: version.clone(),
4020                qualifiers: None,
4021                subpath: None,
4022                primary_language: Some("Python".to_string()),
4023                description: description.or(summary),
4024                release_date: None,
4025                parties,
4026                keywords,
4027                homepage_url: home_page,
4028                download_url: None,
4029                size: None,
4030                sha1: None,
4031                md5: None,
4032                sha256: None,
4033                sha512: None,
4034                bug_tracking_url: None,
4035                code_view_url: None,
4036                vcs_url: None,
4037                copyright: None,
4038                holder: None,
4039                declared_license_expression,
4040                declared_license_expression_spdx,
4041                license_detections,
4042                other_license_expression: None,
4043                other_license_expression_spdx: None,
4044                other_license_detections: Vec::new(),
4045                extracted_license_statement,
4046                notice_text: None,
4047                source_packages: Vec::new(),
4048                file_references: Vec::new(),
4049                is_private: false,
4050                is_virtual: true,
4051                extra_data: None,
4052                dependencies: parsed_dependencies,
4053                repository_homepage_url: None,
4054                repository_download_url: None,
4055                api_data_url: None,
4056                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4057                purl: purl.clone(),
4058            };
4059
4060            let resolved = package_data_to_resolved(&resolved_package);
4061            dependencies.push(Dependency {
4062                purl,
4063                extracted_requirement: None,
4064                scope: None,
4065                is_runtime: Some(true),
4066                is_optional: Some(false),
4067                is_pinned: Some(true),
4068                is_direct: Some(is_requested),
4069                resolved_package: Some(Box::new(resolved)),
4070                extra_data: None,
4071            });
4072        }
4073    }
4074
4075    if let Some(mut main_pkg) = main_package {
4076        let direct_requirement_purls: HashSet<String> = main_pkg
4077            .dependencies
4078            .iter()
4079            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4080            .collect();
4081
4082        let resolved_requirement_purls: HashSet<String> = dependencies
4083            .iter()
4084            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4085            .collect();
4086
4087        let unresolved_dependencies = main_pkg
4088            .dependencies
4089            .iter()
4090            .filter(|dep| {
4091                dep.purl.as_ref().is_some_and(|purl| {
4092                    !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4093                })
4094            })
4095            .cloned()
4096            .collect::<Vec<_>>();
4097
4098        for dependency in &mut dependencies {
4099            if dependency
4100                .purl
4101                .as_ref()
4102                .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4103            {
4104                dependency.is_direct = Some(true);
4105            }
4106        }
4107
4108        main_pkg.dependencies = dependencies;
4109        main_pkg.dependencies.extend(unresolved_dependencies);
4110        main_pkg
4111    } else {
4112        default_package_data(path)
4113    }
4114}
4115
/// Returns the purl with its version, qualifiers and subpath removed.
///
/// A purl has the shape `scheme:type/namespace/name@version?qualifiers#subpath`,
/// so everything from the first `@`, `?` or `#` onwards is dropped. Cutting at
/// `?` and `#` as well as `@` makes an unversioned purl that carries
/// qualifiers or a subpath compare equal to the versioned form of the same
/// package, and keeps an `@` inside a qualifier value from being mistaken for
/// the version separator.
///
/// A purl containing none of these characters is returned unchanged.
fn base_dependency_purl(purl: &str) -> String {
    // All three delimiters are ASCII, so the index is a valid char boundary.
    let end = purl.find(['@', '?', '#']).unwrap_or(purl.len());
    purl[..end].to_string()
}
4121
/// Parsed INI content as produced by `parse_setup_cfg`: lowercased section
/// name → lowercased key → values for that key, in file order (the value on
/// the key's own line, if any, followed by its continuation lines).
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4123
4124fn extract_from_setup_cfg(path: &Path) -> PackageData {
4125    let content = match read_file_to_string(path) {
4126        Ok(content) => content,
4127        Err(e) => {
4128            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4129            return default_package_data(path);
4130        }
4131    };
4132
4133    let sections = parse_setup_cfg(&content);
4134    let name = get_ini_value(&sections, "metadata", "name");
4135    let version = get_ini_value(&sections, "metadata", "version");
4136    let description = get_ini_value(&sections, "metadata", "description");
4137    let author = get_ini_value(&sections, "metadata", "author");
4138    let author_email = get_ini_value(&sections, "metadata", "author_email");
4139    let maintainer = get_ini_value(&sections, "metadata", "maintainer");
4140    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
4141    let license = get_ini_value(&sections, "metadata", "license");
4142    let mut homepage_url = get_ini_value(&sections, "metadata", "url");
4143    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
4144    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
4145    let python_requires = get_ini_value(&sections, "options", "python_requires");
4146    let parsed_project_urls =
4147        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
4148    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4149    let mut extra_data = HashMap::new();
4150
4151    let mut parties = Vec::new();
4152    if author.is_some() || author_email.is_some() {
4153        parties.push(Party {
4154            r#type: Some("person".to_string()),
4155            role: Some("author".to_string()),
4156            name: author,
4157            email: author_email,
4158            url: None,
4159            organization: None,
4160            organization_url: None,
4161            timezone: None,
4162        });
4163    }
4164
4165    if maintainer.is_some() || maintainer_email.is_some() {
4166        parties.push(Party {
4167            r#type: Some("person".to_string()),
4168            role: Some("maintainer".to_string()),
4169            name: maintainer,
4170            email: maintainer_email,
4171            url: None,
4172            organization: None,
4173            organization_url: None,
4174            timezone: None,
4175        });
4176    }
4177
4178    let declared_license_expression = None;
4179    let declared_license_expression_spdx = None;
4180    let license_detections = Vec::new();
4181    let extracted_license_statement = license.clone();
4182
4183    let dependencies = extract_setup_cfg_dependencies(&sections);
4184
4185    if let Some(value) = python_requires {
4186        extra_data.insert(
4187            "python_requires".to_string(),
4188            serde_json::Value::String(value),
4189        );
4190    }
4191
4192    apply_project_url_mappings(
4193        &parsed_project_urls,
4194        &mut homepage_url,
4195        &mut bug_tracking_url,
4196        &mut code_view_url,
4197        &mut vcs_url,
4198        &mut extra_data,
4199    );
4200
4201    let extra_data = if extra_data.is_empty() {
4202        None
4203    } else {
4204        Some(extra_data)
4205    };
4206
4207    let purl = name.as_ref().and_then(|n| {
4208        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4209        if let Some(v) = &version {
4210            package_url.with_version(v).ok()?;
4211        }
4212        Some(package_url.to_string())
4213    });
4214
4215    PackageData {
4216        package_type: Some(PythonParser::PACKAGE_TYPE),
4217        namespace: None,
4218        name,
4219        version,
4220        qualifiers: None,
4221        subpath: None,
4222        primary_language: Some("Python".to_string()),
4223        description,
4224        release_date: None,
4225        parties,
4226        keywords,
4227        homepage_url,
4228        download_url: None,
4229        size: None,
4230        sha1: None,
4231        md5: None,
4232        sha256: None,
4233        sha512: None,
4234        bug_tracking_url,
4235        code_view_url,
4236        vcs_url,
4237        copyright: None,
4238        holder: None,
4239        declared_license_expression,
4240        declared_license_expression_spdx,
4241        license_detections,
4242        other_license_expression: None,
4243        other_license_expression_spdx: None,
4244        other_license_detections: Vec::new(),
4245        extracted_license_statement,
4246        notice_text: None,
4247        source_packages: Vec::new(),
4248        file_references: Vec::new(),
4249        is_private: has_private_classifier(&classifiers),
4250        is_virtual: false,
4251        extra_data,
4252        dependencies,
4253        repository_homepage_url: None,
4254        repository_download_url: None,
4255        api_data_url: None,
4256        datasource_id: Some(DatasourceId::PypiSetupCfg),
4257        purl,
4258    }
4259}
4260
/// Splits a comma-separated keyword string into trimmed, non-empty keywords.
///
/// `None` yields an empty list. Order and duplicates are preserved.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut parsed = Vec::new();

    if let Some(raw) = value {
        for piece in raw.split(',') {
            let keyword = piece.trim();
            if !keyword.is_empty() {
                parsed.push(keyword.to_owned());
            }
        }
    }

    parsed
}
4273
/// Turns `label = url` lines into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries
/// without `=`, or with an empty label or URL, are skipped. Input order is
/// preserved.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };

        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }

    pairs
}
4289
4290fn apply_project_url_mappings(
4291    parsed_urls: &[(String, String)],
4292    homepage_url: &mut Option<String>,
4293    bug_tracking_url: &mut Option<String>,
4294    code_view_url: &mut Option<String>,
4295    vcs_url: &mut Option<String>,
4296    extra_data: &mut HashMap<String, serde_json::Value>,
4297) {
4298    for (label, url) in parsed_urls {
4299        let label_lower = label.to_lowercase();
4300
4301        if bug_tracking_url.is_none()
4302            && matches!(
4303                label_lower.as_str(),
4304                "tracker"
4305                    | "bug reports"
4306                    | "bug tracker"
4307                    | "issues"
4308                    | "issue tracker"
4309                    | "github: issues"
4310            )
4311        {
4312            *bug_tracking_url = Some(url.clone());
4313        } else if code_view_url.is_none()
4314            && matches!(label_lower.as_str(), "source" | "source code" | "code")
4315        {
4316            *code_view_url = Some(url.clone());
4317        } else if vcs_url.is_none()
4318            && matches!(
4319                label_lower.as_str(),
4320                "github" | "gitlab" | "github: repo" | "repository"
4321            )
4322        {
4323            *vcs_url = Some(url.clone());
4324        } else if homepage_url.is_none()
4325            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4326        {
4327            *homepage_url = Some(url.clone());
4328        } else if label_lower == "changelog" {
4329            extra_data.insert(
4330                "changelog_url".to_string(),
4331                serde_json::Value::String(url.clone()),
4332            );
4333        }
4334    }
4335
4336    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4337        .iter()
4338        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4339        .collect();
4340
4341    if !project_urls_json.is_empty() {
4342        extra_data.insert(
4343            "project_urls".to_string(),
4344            serde_json::Value::Object(project_urls_json),
4345        );
4346    }
4347}
4348
/// Parses INI-style `setup.cfg` text into sections of multi-valued keys.
///
/// Rules applied line by line:
/// - blank lines and lines whose first non-blank character is `#` or `;` are
///   ignored;
/// - `[name]` starts a section; surrounding whitespace (including trailing
///   spaces or tabs after the `]`) is tolerated and the name is lowercased;
/// - a line starting with a space or tab continues the most recent key of the
///   current section and is appended to its values;
/// - any other line containing `=` inside a section defines a key
///   (lowercased); a non-empty value is appended to that key's values.
///
/// Key lines outside any section, and non-indented lines without `=`, are
/// ignored.
fn parse_setup_cfg(content: &str) -> IniSections {
    let mut sections: IniSections = HashMap::new();
    let mut current_section: Option<String> = None;
    let mut current_key: Option<String> = None;

    for raw_line in content.lines() {
        let line = raw_line.trim_end_matches('\r');
        let trimmed = line.trim();
        if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with(';') {
            continue;
        }

        // Match on the fully trimmed text so that trailing whitespace after
        // `]` does not hide a section header.
        if trimmed.starts_with('[') && trimmed.ends_with(']') {
            let section_name = trimmed
                .trim_start_matches('[')
                .trim_end_matches(']')
                .trim()
                .to_ascii_lowercase();
            current_section = if section_name.is_empty() {
                None
            } else {
                Some(section_name)
            };
            // A new section never inherits the previous section's open key.
            current_key = None;
            continue;
        }

        let is_indented = line.starts_with(' ') || line.starts_with('\t');
        if is_indented && current_key.is_some() {
            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
                sections
                    .entry(section.clone())
                    .or_default()
                    .entry(key.clone())
                    .or_default()
                    .push(trimmed.to_string());
            }
            continue;
        }

        let Some(section) = current_section.as_ref() else {
            continue;
        };
        let Some((key, value)) = trimmed.split_once('=') else {
            continue;
        };

        let key_name = key.trim().to_ascii_lowercase();
        let value = value.trim();
        // The key is registered even when its value is empty, so that
        // continuation lines have an entry to extend.
        let entry = sections
            .entry(section.clone())
            .or_default()
            .entry(key_name.clone())
            .or_default();
        if !value.is_empty() {
            entry.push(value.to_string());
        }
        current_key = Some(key_name);
    }

    sections
}
4415
4416fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
4417    sections
4418        .get(&section.to_ascii_lowercase())
4419        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4420        .and_then(|entries| entries.first())
4421        .map(|value| value.trim().to_string())
4422}
4423
4424fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
4425    sections
4426        .get(&section.to_ascii_lowercase())
4427        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4428        .cloned()
4429        .unwrap_or_default()
4430}
4431
4432fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
4433    let mut dependencies = Vec::new();
4434
4435    for (sub_section, scope) in [
4436        ("install_requires", "install"),
4437        ("tests_require", "test"),
4438        ("setup_requires", "setup"),
4439    ] {
4440        let reqs = get_ini_values(sections, "options", sub_section);
4441        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
4442    }
4443
4444    if let Some(extras) = sections.get("options.extras_require") {
4445        let mut extra_items: Vec<_> = extras.iter().collect();
4446        extra_items.sort_by_key(|(name, _)| *name);
4447        for (extra_name, reqs) in extra_items {
4448            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
4449        }
4450    }
4451
4452    dependencies
4453}
4454
4455fn parse_setup_cfg_requirements(
4456    reqs: &[String],
4457    scope: &str,
4458    is_optional: bool,
4459) -> Vec<Dependency> {
4460    reqs.iter()
4461        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
4462        .collect()
4463}
4464
4465fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
4466    let trimmed = req.trim();
4467    if trimmed.is_empty() || trimmed.starts_with('#') {
4468        return None;
4469    }
4470
4471    let name = extract_setup_cfg_dependency_name(trimmed)?;
4472    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4473
4474    Some(Dependency {
4475        purl: Some(purl.to_string()),
4476        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
4477        scope: Some(scope.to_string()),
4478        is_runtime: Some(true),
4479        is_optional: Some(is_optional),
4480        is_pinned: Some(false),
4481        is_direct: Some(true),
4482        resolved_package: None,
4483        extra_data: None,
4484    })
4485}
4486
/// Extracts the distribution name from a requirement string.
///
/// The name is the leading run of characters up to the first whitespace or the
/// first character that starts another part of the requirement: a version
/// operator (`<`, `>`, `=`, `!`, `~`), a parenthesised version list (`(`), an
/// extras list (`[`), an environment marker (`;`) or a direct URL reference
/// (`@`). None of these can appear inside a PEP 508 name, so stopping at them
/// also handles compact forms such as `pkg@https://...` and `pkg(>=1.0)`.
///
/// Returns `None` when the input is blank or begins with one of the
/// terminating characters (i.e. there is no name).
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();

    let end = trimmed
        .find(|c: char| {
            c.is_whitespace()
                || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    // `trimmed` has no leading whitespace and the slice stops before the first
    // whitespace character, so no further trimming is required.
    let name = &trimmed[..end];
    (!name.is_empty()).then(|| name.to_string())
}
4503
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    // Splitting on whitespace and re-joining with nothing drops all of it.
    req.split_whitespace().collect::<String>()
}
4507
/// Extracts a quoted string assigned to `key` from raw `setup.py` text.
///
/// The text is searched for `key`, an `=` with at most one space on either
/// side, and an opening quote. Double-quoted forms are tried before
/// single-quoted ones, and for each form only its first occurrence is examined.
/// The value runs up to the next occurrence of the *same* quote character, so a
/// value such as `"it's"` is returned intact instead of being cut at the
/// apostrophe.
///
/// Returns `None` when no form matches or no matched form has a closing quote.
/// This is a plain substring search: escapes, triple quotes and identifiers
/// that merely end in `key` are not distinguished.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    // (text before `=`, text after `=`), in the order the forms are tried.
    const SPACINGS: [(&str, &str); 4] = [("", ""), (" ", ""), ("", " "), (" ", " ")];

    for quote in ['"', '\''] {
        for (before, after) in SPACINGS {
            let pattern = format!("{key}{before}={after}{quote}");
            let Some(start_idx) = content.find(&pattern) else {
                continue;
            };

            let remaining = &content[start_idx + pattern.len()..];
            // Close on the quote that opened the literal, not on either kind.
            if let Some(end_idx) = remaining.find(quote) {
                return Some(remaining[..end_idx].to_string());
            }
        }
    }

    None
}
4533
4534fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
4535    let mut dependencies = Vec::new();
4536
4537    if let Some(tests_deps) = extract_tests_require(content) {
4538        dependencies.extend(tests_deps);
4539    }
4540
4541    if let Some(extras_deps) = extract_extras_require(content) {
4542        dependencies.extend(extras_deps);
4543    }
4544
4545    dependencies
4546}
4547
4548fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
4549    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
4550    let re = Regex::new(pattern).ok()?;
4551    let captures = re.captures(content)?;
4552    let deps_str = captures.get(1)?.as_str();
4553
4554    let deps = parse_setup_py_dep_list(deps_str, "test", true);
4555    if deps.is_empty() { None } else { Some(deps) }
4556}
4557
4558fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
4559    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
4560    let re = Regex::new(pattern).ok()?;
4561    let captures = re.captures(content)?;
4562    let dict_content = captures.get(1)?.as_str();
4563
4564    let mut all_deps = Vec::new();
4565
4566    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
4567    let entry_re = Regex::new(entry_pattern).ok()?;
4568
4569    for entry_cap in entry_re.captures_iter(dict_content) {
4570        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
4571            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
4572            all_deps.extend(deps);
4573        }
4574    }
4575
4576    if all_deps.is_empty() {
4577        None
4578    } else {
4579        Some(all_deps)
4580    }
4581}
4582
4583fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
4584    let dep_pattern = r#"['"]([^'"]+)['"]"#;
4585    let re = match Regex::new(dep_pattern) {
4586        Ok(r) => r,
4587        Err(_) => return Vec::new(),
4588    };
4589
4590    re.captures_iter(deps_str)
4591        .filter_map(|cap| {
4592            let dep_str = cap.get(1)?.as_str().trim();
4593            if dep_str.is_empty() {
4594                return None;
4595            }
4596
4597            let name = extract_setup_cfg_dependency_name(dep_str)?;
4598            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4599
4600            Some(Dependency {
4601                purl: Some(purl.to_string()),
4602                extracted_requirement: Some(dep_str.to_string()),
4603                scope: Some(scope.to_string()),
4604                is_runtime: Some(true),
4605                is_optional: Some(is_optional),
4606                is_pinned: Some(false),
4607                is_direct: Some(true),
4608                resolved_package: None,
4609                extra_data: None,
4610            })
4611        })
4612        .collect()
4613}
4614
4615/// Reads and parses a TOML file
4616pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
4617    let content = read_file_to_string(path).map_err(|e| e.to_string())?;
4618    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
4619}
4620
4621/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
4622///
4623/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
4624/// Essential for SBOM compliance and package integrity verification.
4625///
4626/// # Returns
4627///
4628/// - `(Some(size), Some(hash))` on success
4629/// - `(None, None)` if file cannot be opened
4630/// - `(Some(size), None)` if hash calculation fails during read
4631fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<String>) {
4632    let mut file = match File::open(path) {
4633        Ok(f) => f,
4634        Err(_) => return (None, None),
4635    };
4636
4637    let metadata = match file.metadata() {
4638        Ok(m) => m,
4639        Err(_) => return (None, None),
4640    };
4641    let size = metadata.len();
4642
4643    let mut hasher = Sha256::new();
4644    let mut buffer = vec![0; 8192];
4645
4646    loop {
4647        match file.read(&mut buffer) {
4648            Ok(0) => break,
4649            Ok(n) => hasher.update(&buffer[..n]),
4650            Err(_) => return (Some(size), None),
4651        }
4652    }
4653
4654    let hash = hex::encode(hasher.finalize());
4655    (Some(size), Some(hash))
4656}
4657
4658fn default_package_data(path: &Path) -> PackageData {
4659    PackageData {
4660        package_type: Some(PythonParser::PACKAGE_TYPE),
4661        primary_language: Some("Python".to_string()),
4662        datasource_id: infer_python_datasource_id(path),
4663        ..Default::default()
4664    }
4665}
4666
4667fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
4668    let file_name = path.file_name().and_then(|name| name.to_str());
4669
4670    match file_name {
4671        Some("pyproject.toml") => Some(DatasourceId::PypiPyprojectToml),
4672        Some("setup.py") => Some(DatasourceId::PypiSetupPy),
4673        Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
4674        Some("PKG-INFO") => Some(DatasourceId::PypiSdistPkginfo),
4675        Some("METADATA") => Some(DatasourceId::PypiWheelMetadata),
4676        Some("pypi.json") => Some(DatasourceId::PypiJson),
4677        Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
4678        Some("origin.json") if is_pip_cache_origin_json(path) => {
4679            Some(DatasourceId::PypiPipOriginJson)
4680        }
4681        _ if is_python_sdist_archive_path(path) => Some(DatasourceId::PypiSdistPkginfo),
4682        _ if path
4683            .extension()
4684            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
4685        {
4686            Some(DatasourceId::PypiWheel)
4687        }
4688        _ if path
4689            .extension()
4690            .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
4691        {
4692            Some(DatasourceId::PypiEgg)
4693        }
4694        _ => None,
4695    }
4696}
4697
// Registers this parser through the crate-level `register_parser!` macro.
// Arguments, in order: a human-readable description, the list of path glob
// patterns, and three further values ("pypi", "Python", and a documentation
// URL) — presumably package type, primary language and docs link; confirm
// against the macro definition.
// NOTE(review): `pip-inspect.deplock` is mapped in `infer_python_datasource_id`
// but has no glob here — presumably it is claimed by another parser; verify.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, setup.cfg, pypi.json, PKG-INFO, METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);