Skip to main content

provenant/parsers/
python.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
35use crate::parser_warn as warn;
36use crate::parsers::utils::{read_file_to_string, split_name_email};
37use base64::Engine;
38use base64::engine::general_purpose::URL_SAFE_NO_PAD;
39use bzip2::read::BzDecoder;
40use csv::ReaderBuilder;
41use flate2::read::GzDecoder;
42use liblzma::read::XzDecoder;
43use packageurl::PackageUrl;
44use regex::Regex;
45use ruff_python_ast as ast;
46use ruff_python_parser::parse_module;
47use serde_json::{Map as JsonMap, Value as JsonValue};
48use sha2::{Digest, Sha256};
49use std::collections::{HashMap, HashSet};
50use std::fs::File;
51use std::io::Read;
52use std::path::{Component, Path, PathBuf};
53use tar::Archive;
54use toml::Value as TomlValue;
55use toml::map::Map as TomlMap;
56use zip::ZipArchive;
57
58use super::PackageParser;
59use super::license_normalization::{
60    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
61    normalize_spdx_expression,
62};
63
// Field constants for pyproject.toml
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

// Limits for setup.py parsing.
// NOTE(review): presumably caps on input bytes, AST node count and AST nesting
// depth respectively -- confirm against the setup.py parsing code.
const MAX_SETUP_PY_BYTES: usize = 1_048_576;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

// Archive safety limits.
// MAX_ARCHIVE_SIZE is checked both against the on-disk size of an sdist archive
// and against the running sum of declared entry sizes while scanning an archive.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024; // 100MB uncompressed
// Entries whose declared size exceeds this are skipped with a warning.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB per file
// Upper bound on the uncompressed-to-compressed size ratio.
const MAX_COMPRESSION_RATIO: f64 = 100.0; // 100:1 ratio
84
/// Python package parser supporting 11 manifest formats.
///
/// Extracts metadata from Python package files including pyproject.toml, setup.py,
/// setup.cfg, PKG-INFO, METADATA, pip-inspect lockfiles, and .whl/.egg archives.
/// pip cache `origin.json` files, `pypi.json` files and sdist archives are
/// handled as well; the dispatch lives in the `PackageParser` implementation.
///
/// # Security
///
/// setup.py files are parsed using AST analysis rather than code execution to prevent
/// arbitrary code execution during scanning. See `extract_from_setup_py_ast` for details.
pub struct PythonParser;
95
/// Container format of a Python source distribution archive, as detected from
/// the (lower-cased) file name suffix.
#[derive(Clone, Copy, Debug)]
enum PythonSdistArchiveFormat {
    /// File name ends with `.tar.gz`.
    TarGz,
    /// File name ends with `.tgz`.
    Tgz,
    /// File name ends with `.tar.bz2`.
    TarBz2,
    /// File name ends with `.tar.xz`.
    TarXz,
    /// File name ends with `.zip`.
    Zip,
}
104
/// A zip entry that passed the path, compression-ratio and size checks in
/// `collect_validated_zip_entries`.
#[derive(Clone, Debug)]
struct ValidatedZipEntry {
    /// Position of the entry inside the archive.
    index: usize,
    /// Entry path as returned by `normalize_archive_entry_path`.
    name: String,
}
110
111impl PackageParser for PythonParser {
112    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
113
114    fn extract_packages(path: &Path) -> Vec<PackageData> {
115        vec![
116            if path.file_name().unwrap_or_default() == "pyproject.toml" {
117                extract_from_pyproject_toml(path)
118            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
119                extract_from_setup_cfg(path)
120            } else if path.file_name().unwrap_or_default() == "setup.py" {
121                extract_from_setup_py(path)
122            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
123                extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
124            } else if path.file_name().unwrap_or_default() == "METADATA" {
125                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
126            } else if is_pip_cache_origin_json(path) {
127                extract_from_pip_origin_json(path)
128            } else if path.file_name().unwrap_or_default() == "pypi.json" {
129                extract_from_pypi_json(path)
130            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
131                extract_from_pip_inspect(path)
132            } else if is_python_sdist_archive_path(path) {
133                extract_from_sdist_archive(path)
134            } else if path
135                .extension()
136                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
137            {
138                extract_from_wheel_archive(path)
139            } else if path
140                .extension()
141                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
142            {
143                extract_from_egg_archive(path)
144            } else {
145                default_package_data(path)
146            },
147        ]
148    }
149
150    fn is_match(path: &Path) -> bool {
151        if let Some(filename) = path.file_name()
152            && (filename == "pyproject.toml"
153                || filename == "setup.cfg"
154                || filename == "setup.py"
155                || filename == "PKG-INFO"
156                || filename == "METADATA"
157                || filename == "pypi.json"
158                || filename == "pip-inspect.deplock"
159                || is_pip_cache_origin_json(path))
160        {
161            return true;
162        }
163
164        if let Some(extension) = path.extension() {
165            let ext = extension.to_string_lossy().to_lowercase();
166            if ext == "whl" || ext == "egg" || is_python_sdist_archive_path(path) {
167                return true;
168            }
169        }
170
171        false
172    }
173}
174
/// Values read from the `WHEEL` file that sits next to an installed `METADATA`.
#[derive(Debug, Clone)]
struct InstalledWheelMetadata {
    /// Every `Tag` header value, in file order.
    wheel_tags: Vec<String>,
    /// First `Wheel-Version` header value, if present.
    wheel_version: Option<String>,
    /// First `Generator` header value, if present.
    wheel_generator: Option<String>,
    /// `Root-Is-Purelib` parsed as a boolean; `None` when absent or not `true`/`false`.
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`.
    compressed_tag: Option<String>,
}
183
184fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
185    let Some(parent) = path.parent() else {
186        return;
187    };
188
189    if !parent
190        .file_name()
191        .and_then(|name| name.to_str())
192        .is_some_and(|name| name.ends_with(".dist-info"))
193    {
194        return;
195    }
196
197    let wheel_path = parent.join("WHEEL");
198    if !wheel_path.exists() {
199        return;
200    }
201
202    let Ok(content) = read_file_to_string(&wheel_path) else {
203        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
204        return;
205    };
206
207    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
208        return;
209    };
210
211    apply_installed_wheel_metadata(package_data, &wheel_metadata);
212}
213
214fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
215    use super::rfc822::{get_header_all, get_header_first};
216
217    let metadata = super::rfc822::parse_rfc822_content(content);
218    let wheel_tags = get_header_all(&metadata.headers, "tag");
219    if wheel_tags.is_empty() {
220        return None;
221    }
222
223    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
224    let wheel_generator = get_header_first(&metadata.headers, "generator");
225    let root_is_purelib =
226        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
227            match value.to_ascii_lowercase().as_str() {
228                "true" => Some(true),
229                "false" => Some(false),
230                _ => None,
231            }
232        });
233
234    let compressed_tag = compress_wheel_tags(&wheel_tags);
235
236    Some(InstalledWheelMetadata {
237        wheel_tags,
238        wheel_version,
239        wheel_generator,
240        root_is_purelib,
241        compressed_tag,
242    })
243}
244
/// Builds the compressed tag set (PEP 425) for the `Tag` values of a WHEEL file.
///
/// * No tags yield `None`.
/// * A single tag is returned unchanged, without validation.
/// * Several tags are split into `python-abi-platform` components. The distinct
///   values of each component are joined with `.` in order of first appearance,
///   e.g. `py2-none-any` + `py3-none-any` becomes `py2.py3-none-any`, and
///   `cp39-cp39-manylinux_2_17_x86_64` + `cp39-cp39-manylinux2014_x86_64`
///   becomes `cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64`.
///
/// Returns `None` when a tag has fewer than three `-` separated components, or
/// when the distinct tags are not the full cross product of their components
/// (such a set cannot be written as one compressed tag). Repeated tags are
/// counted once.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    /// Appends `value` unless it was already recorded, keeping first-seen order.
    fn push_unique<'a>(seen: &mut Vec<&'a str>, value: &'a str) {
        if !seen.contains(&value) {
            seen.push(value);
        }
    }

    match tags {
        [] => return None,
        [only] => return Some(only.clone()),
        _ => {}
    }

    // Tag lists are tiny, so linear `contains` checks are cheaper than hashing.
    let mut distinct_tags: Vec<&str> = Vec::with_capacity(tags.len());
    let mut python_tags: Vec<&str> = Vec::new();
    let mut abi_tags: Vec<&str> = Vec::new();
    let mut platform_tags: Vec<&str> = Vec::new();

    for tag in tags {
        let mut parts = tag.splitn(3, '-');
        let python = parts.next()?;
        let abi = parts.next()?;
        let platform = parts.next()?;

        push_unique(&mut distinct_tags, tag);
        push_unique(&mut python_tags, python);
        push_unique(&mut abi_tags, abi);
        push_unique(&mut platform_tags, platform);
    }

    // A compressed tag expands to every python x abi x platform combination, so
    // it only represents the input when the input is exactly that product.
    let expanded_len = python_tags.len() * abi_tags.len() * platform_tags.len();
    if distinct_tags.len() != expanded_len {
        return None;
    }

    Some(format!(
        "{}-{}-{}",
        python_tags.join("."),
        abi_tags.join("."),
        platform_tags.join(".")
    ))
}
282
283fn apply_installed_wheel_metadata(
284    package_data: &mut PackageData,
285    wheel_metadata: &InstalledWheelMetadata,
286) {
287    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
288    extra_data.insert(
289        "wheel_tags".to_string(),
290        JsonValue::Array(
291            wheel_metadata
292                .wheel_tags
293                .iter()
294                .cloned()
295                .map(JsonValue::String)
296                .collect(),
297        ),
298    );
299
300    if let Some(wheel_version) = &wheel_metadata.wheel_version {
301        extra_data.insert(
302            "wheel_version".to_string(),
303            JsonValue::String(wheel_version.clone()),
304        );
305    }
306
307    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
308        extra_data.insert(
309            "wheel_generator".to_string(),
310            JsonValue::String(wheel_generator.clone()),
311        );
312    }
313
314    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
315        extra_data.insert(
316            "root_is_purelib".to_string(),
317            JsonValue::Bool(root_is_purelib),
318        );
319    }
320
321    if let (Some(name), Some(version), Some(extension)) = (
322        package_data.name.as_deref(),
323        package_data.version.as_deref(),
324        wheel_metadata.compressed_tag.as_deref(),
325    ) {
326        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
327    }
328}
329
/// Returns `true` for a file named exactly `origin.json` that has an ancestor
/// directory named `wheels` (ASCII case-insensitive).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    let named_origin_json = matches!(
        path.file_name().and_then(|name| name.to_str()),
        Some("origin.json")
    );
    if !named_origin_json {
        return false;
    }

    // `skip(1)` drops the path itself so only real ancestors are inspected.
    path.ancestors()
        .skip(1)
        .filter_map(|ancestor| ancestor.file_name())
        .filter_map(|dir_name| dir_name.to_str())
        .any(|dir_name| dir_name.eq_ignore_ascii_case("wheels"))
}
339
340fn extract_from_pip_origin_json(path: &Path) -> PackageData {
341    let content = match read_file_to_string(path) {
342        Ok(content) => content,
343        Err(e) => {
344            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
345            return default_package_data(path);
346        }
347    };
348
349    let root: JsonValue = match serde_json::from_str(&content) {
350        Ok(root) => root,
351        Err(e) => {
352            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
353            return default_package_data(path);
354        }
355    };
356
357    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
358        warn!("No url found in pip cache origin.json at {:?}", path);
359        return default_package_data(path);
360    };
361
362    let sibling_wheel = find_sibling_cached_wheel(path);
363    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
364        sibling_wheel
365            .as_ref()
366            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
367    });
368
369    let Some((name, version)) = name_version else {
370        warn!(
371            "Failed to infer package name/version from pip cache origin.json at {:?}",
372            path
373        );
374        return default_package_data(path);
375    };
376
377    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
378        build_pypi_urls(Some(&name), Some(&version));
379    let purl = sibling_wheel
380        .as_ref()
381        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
382        .or(plain_purl);
383
384    PackageData {
385        package_type: Some(PythonParser::PACKAGE_TYPE),
386        primary_language: Some("Python".to_string()),
387        name: Some(name),
388        version: Some(version),
389        datasource_id: Some(DatasourceId::PypiPipOriginJson),
390        download_url: Some(download_url.to_string()),
391        sha256: extract_sha256_from_origin_json(&root),
392        repository_homepage_url,
393        repository_download_url,
394        api_data_url,
395        purl,
396        ..Default::default()
397    }
398}
399
400fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
401    let parent = path.parent()?;
402    let entries = parent.read_dir().ok()?;
403
404    for entry in entries.flatten() {
405        let sibling_path = entry.path();
406        if sibling_path
407            .extension()
408            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
409            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
410        {
411            return Some(wheel_info);
412        }
413    }
414
415    None
416}
417
418fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
419    let file_name = url.rsplit('/').next()?;
420
421    if file_name.ends_with(".whl") {
422        return parse_wheel_filename(Path::new(file_name))
423            .map(|wheel_info| (wheel_info.name, wheel_info.version));
424    }
425
426    let stem = strip_python_archive_extension(file_name)?;
427    let (name, version) = stem.rsplit_once('-')?;
428    if name.is_empty() || version.is_empty() {
429        return None;
430    }
431
432    Some((name.replace('_', "-"), version.to_string()))
433}
434
/// Removes the first matching Python archive suffix from `file_name`.
///
/// Matching is case-sensitive; returns `None` when no known suffix applies.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] =
        [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES {
        if let Some(stem) = file_name.strip_suffix(suffix) {
            return Some(stem);
        }
    }
    None
}
440
441fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
442    root.pointer("/archive_info/hashes/sha256")
443        .and_then(|value| value.as_str())
444        .map(ToOwned::to_owned)
445        .or_else(|| {
446            root.pointer("/archive_info/hash")
447                .and_then(|value| value.as_str())
448                .and_then(normalize_origin_hash)
449        })
450}
451
/// Extracts a SHA-256 digest from a hash string.
///
/// Accepts `sha256=<value>` and `sha256:<value>` (the value is returned as-is,
/// unvalidated) as well as a bare string of exactly 64 hexadecimal characters.
/// Anything else yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = ["sha256=", "sha256:"]
        .iter()
        .find_map(|prefix| hash.strip_prefix(*prefix));
    if let Some(digest) = prefixed {
        return Some(digest.to_string());
    }

    // A 64-byte string made only of ASCII hex digits is a bare digest.
    let is_bare_digest = hash.len() == 64 && hash.bytes().all(|byte| byte.is_ascii_hexdigit());
    is_bare_digest.then(|| hash.to_string())
}
464
465fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
466    let content = match read_file_to_string(path) {
467        Ok(content) => content,
468        Err(e) => {
469            warn!("Failed to read metadata at {:?}: {}", path, e);
470            return default_package_data(path);
471        }
472    };
473
474    let metadata = super::rfc822::parse_rfc822_content(&content);
475    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
476    merge_sibling_metadata_dependencies(path, &mut package_data);
477    merge_sibling_metadata_file_references(path, &mut package_data);
478    if datasource_id == DatasourceId::PypiWheelMetadata {
479        merge_sibling_wheel_metadata(path, &mut package_data);
480    }
481    package_data
482}
483
484fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
485    let mut extra_dependencies = Vec::new();
486
487    if let Some(parent) = path.parent() {
488        let direct_requires = parent.join("requires.txt");
489        if direct_requires.exists()
490            && let Ok(content) = read_file_to_string(&direct_requires)
491        {
492            extra_dependencies.extend(parse_requires_txt(&content));
493        }
494
495        let sibling_egg_info_requires = parent
496            .read_dir()
497            .ok()
498            .into_iter()
499            .flatten()
500            .flatten()
501            .find_map(|entry| {
502                let child_path = entry.path();
503                if child_path.is_dir()
504                    && child_path
505                        .file_name()
506                        .and_then(|name| name.to_str())
507                        .is_some_and(|name| name.ends_with(".egg-info"))
508                {
509                    let requires = child_path.join("requires.txt");
510                    requires.exists().then_some(requires)
511                } else {
512                    None
513                }
514            });
515
516        if let Some(requires_path) = sibling_egg_info_requires
517            && let Ok(content) = read_file_to_string(&requires_path)
518        {
519            extra_dependencies.extend(parse_requires_txt(&content));
520        }
521    }
522
523    for dependency in extra_dependencies {
524        if !package_data.dependencies.iter().any(|existing| {
525            existing.purl == dependency.purl
526                && existing.scope == dependency.scope
527                && existing.extracted_requirement == dependency.extracted_requirement
528                && existing.extra_data == dependency.extra_data
529        }) {
530            package_data.dependencies.push(dependency);
531        }
532    }
533}
534
535fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
536    let mut extra_refs = Vec::new();
537
538    if let Some(parent) = path.parent() {
539        let record_path = parent.join("RECORD");
540        if record_path.exists()
541            && let Ok(content) = read_file_to_string(&record_path)
542        {
543            extra_refs.extend(parse_record_csv(&content));
544        }
545
546        let installed_files_path = parent.join("installed-files.txt");
547        if installed_files_path.exists()
548            && let Ok(content) = read_file_to_string(&installed_files_path)
549        {
550            extra_refs.extend(parse_installed_files_txt(&content));
551        }
552
553        let sources_path = parent.join("SOURCES.txt");
554        if sources_path.exists()
555            && let Ok(content) = read_file_to_string(&sources_path)
556        {
557            extra_refs.extend(parse_sources_txt(&content));
558        }
559    }
560
561    for file_ref in extra_refs {
562        if !package_data
563            .file_references
564            .iter()
565            .any(|existing| existing.path == file_ref.path)
566        {
567            package_data.file_references.push(file_ref);
568        }
569    }
570}
571
572fn collect_validated_zip_entries<R: Read + std::io::Seek>(
573    archive: &mut ZipArchive<R>,
574    path: &Path,
575    archive_type: &str,
576) -> Result<Vec<ValidatedZipEntry>, String> {
577    let mut total_extracted = 0u64;
578    let mut entries = Vec::new();
579
580    for i in 0..archive.len() {
581        if let Ok(file) = archive.by_index_raw(i) {
582            let compressed_size = file.compressed_size();
583            let uncompressed_size = file.size();
584            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
585                warn!(
586                    "Skipping unsafe path in {} {:?}: {}",
587                    archive_type,
588                    path,
589                    file.name()
590                );
591                continue;
592            };
593
594            if compressed_size > 0 {
595                let ratio = uncompressed_size as f64 / compressed_size as f64;
596                if ratio > MAX_COMPRESSION_RATIO {
597                    warn!(
598                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
599                        archive_type, path, ratio
600                    );
601                    continue;
602                }
603            }
604
605            if uncompressed_size > MAX_FILE_SIZE {
606                warn!(
607                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
608                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
609                );
610                continue;
611            }
612
613            total_extracted += uncompressed_size;
614            if total_extracted > MAX_ARCHIVE_SIZE {
615                let msg = format!(
616                    "Total extracted size exceeds limit for {} {:?}",
617                    archive_type, path
618                );
619                warn!("{}", msg);
620                return Err(msg);
621            }
622
623            entries.push(ValidatedZipEntry {
624                index: i,
625                name: entry_name,
626            });
627        }
628    }
629
630    Ok(entries)
631}
632
633fn is_python_sdist_archive_path(path: &Path) -> bool {
634    detect_python_sdist_archive_format(path).is_some()
635}
636
637fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
638    let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
639
640    if !is_likely_python_sdist_filename(&file_name) {
641        return None;
642    }
643
644    if file_name.ends_with(".tar.gz") {
645        Some(PythonSdistArchiveFormat::TarGz)
646    } else if file_name.ends_with(".tgz") {
647        Some(PythonSdistArchiveFormat::Tgz)
648    } else if file_name.ends_with(".tar.bz2") {
649        Some(PythonSdistArchiveFormat::TarBz2)
650    } else if file_name.ends_with(".tar.xz") {
651        Some(PythonSdistArchiveFormat::TarXz)
652    } else if file_name.ends_with(".zip") {
653        Some(PythonSdistArchiveFormat::Zip)
654    } else {
655        None
656    }
657}
658
659fn is_likely_python_sdist_filename(file_name: &str) -> bool {
660    let Some(stem) = strip_python_archive_extension(file_name) else {
661        return false;
662    };
663
664    let Some((name, version)) = stem.rsplit_once('-') else {
665        return false;
666    };
667
668    !name.is_empty()
669        && !version.is_empty()
670        && version.chars().any(|ch| ch.is_ascii_digit())
671        && name
672            .chars()
673            .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
674}
675
676fn extract_from_sdist_archive(path: &Path) -> PackageData {
677    let metadata = match std::fs::metadata(path) {
678        Ok(m) => m,
679        Err(e) => {
680            warn!(
681                "Failed to read metadata for sdist archive {:?}: {}",
682                path, e
683            );
684            return default_package_data(path);
685        }
686    };
687
688    if metadata.len() > MAX_ARCHIVE_SIZE {
689        warn!(
690            "sdist archive too large: {} bytes (limit: {} bytes)",
691            metadata.len(),
692            MAX_ARCHIVE_SIZE
693        );
694        return default_package_data(path);
695    }
696
697    let Some(format) = detect_python_sdist_archive_format(path) else {
698        return default_package_data(path);
699    };
700
701    let mut package_data = match format {
702        PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
703            let file = match File::open(path) {
704                Ok(file) => file,
705                Err(e) => {
706                    warn!("Failed to open sdist archive {:?}: {}", path, e);
707                    return default_package_data(path);
708                }
709            };
710            let decoder = GzDecoder::new(file);
711            extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
712        }
713        PythonSdistArchiveFormat::TarBz2 => {
714            let file = match File::open(path) {
715                Ok(file) => file,
716                Err(e) => {
717                    warn!("Failed to open sdist archive {:?}: {}", path, e);
718                    return default_package_data(path);
719                }
720            };
721            let decoder = BzDecoder::new(file);
722            extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
723        }
724        PythonSdistArchiveFormat::TarXz => {
725            let file = match File::open(path) {
726                Ok(file) => file,
727                Err(e) => {
728                    warn!("Failed to open sdist archive {:?}: {}", path, e);
729                    return default_package_data(path);
730                }
731            };
732            let decoder = XzDecoder::new(file);
733            extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
734        }
735        PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
736    };
737
738    if package_data.package_type.is_some() {
739        let (size, sha256) = calculate_file_checksums(path);
740        package_data.size = size;
741        package_data.sha256 = sha256;
742    }
743
744    package_data
745}
746
/// Streams a (decompressed) tar sdist archive, collects the text of the
/// relevant metadata entries and builds package data from them.
///
/// * `path` - archive location; used for log messages and passed on to
///   `build_sdist_package_data`.
/// * `reader` - decompressing reader positioned at the start of the tar stream.
/// * `archive_type` - label used in log messages (e.g. `"tar.gz"`).
/// * `compressed_size` - size used as the denominator of the compression-ratio
///   check; the caller in this file passes the archive's on-disk length.
///
/// Returns the default package data when the archive cannot be read, when the
/// summed entry sizes exceed `MAX_ARCHIVE_SIZE`, or when the cumulative
/// compression ratio exceeds `MAX_COMPRESSION_RATIO`.
fn extract_from_tar_sdist_archive<R: Read>(
    path: &Path,
    reader: R,
    archive_type: &str,
    compressed_size: u64,
) -> PackageData {
    let mut archive = Archive::new(reader);
    let archive_entries = match archive.entries() {
        Ok(entries) => entries,
        Err(e) => {
            warn!(
                "Failed to read {} sdist archive {:?}: {}",
                archive_type, path, e
            );
            return default_package_data(path);
        }
    };

    // Running sum of the sizes reported by accepted entries.
    let mut total_extracted = 0u64;
    // (normalized entry path, UTF-8 content) pairs for the relevant entries.
    let mut entries = Vec::new();

    for entry_result in archive_entries {
        let mut entry = match entry_result {
            Ok(entry) => entry,
            Err(e) => {
                warn!(
                    "Failed to read {} sdist entry from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        // Oversized entries are skipped and do not count toward the total.
        let entry_size = entry.size();
        if entry_size > MAX_FILE_SIZE {
            warn!(
                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
                archive_type, path, entry_size, MAX_FILE_SIZE
            );
            continue;
        }

        // Size accounting happens for every remaining entry, relevant or not,
        // and before the entry path is even inspected.
        total_extracted += entry_size;
        if total_extracted > MAX_ARCHIVE_SIZE {
            warn!(
                "Total extracted size exceeds limit for {} sdist {:?}",
                archive_type, path
            );
            return default_package_data(path);
        }

        // The ratio is cumulative: total size seen so far versus the size of
        // the whole compressed archive.
        if compressed_size > 0 {
            let ratio = total_extracted as f64 / compressed_size as f64;
            if ratio > MAX_COMPRESSION_RATIO {
                warn!(
                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
                    archive_type, path, ratio
                );
                return default_package_data(path);
            }
        }

        // Backslashes are converted to forward slashes before normalization.
        let entry_path = match entry.path() {
            Ok(path) => path.to_string_lossy().replace('\\', "/"),
            Err(e) => {
                warn!(
                    "Failed to get {} sdist entry path from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
            continue;
        };

        if !is_relevant_sdist_text_entry(&entry_path) {
            continue;
        }

        // Entries that fail to read are dropped without further handling here.
        if let Ok(content) = read_limited_utf8(
            &mut entry,
            MAX_FILE_SIZE,
            &format!("{} entry {}", archive_type, entry_path),
        ) {
            entries.push((entry_path, content));
        }
    }

    build_sdist_package_data(path, entries)
}
840
841fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
842    let file = match File::open(path) {
843        Ok(file) => file,
844        Err(e) => {
845            warn!("Failed to open zip sdist archive {:?}: {}", path, e);
846            return default_package_data(path);
847        }
848    };
849
850    let mut archive = match ZipArchive::new(file) {
851        Ok(archive) => archive,
852        Err(e) => {
853            warn!("Failed to read zip sdist archive {:?}: {}", path, e);
854            return default_package_data(path);
855        }
856    };
857
858    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
859        Ok(entries) => entries,
860        Err(_) => return default_package_data(path),
861    };
862
863    let mut entries = Vec::new();
864    for entry in validated_entries.iter() {
865        if !is_relevant_sdist_text_entry(&entry.name) {
866            continue;
867        }
868
869        if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
870            entries.push((entry.name.clone(), content));
871        }
872    }
873
874    build_sdist_package_data(path, entries)
875}
876
/// Returns `true` for archive entries named `PKG-INFO`, `requires.txt` or
/// `SOURCES.txt` that sit inside at least one directory.
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"]
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
882
883fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
884    let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
885        warn!("No PKG-INFO file found in sdist archive {:?}", path);
886        return default_package_data(path);
887    };
888
889    let mut package_data =
890        python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
891    merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
892    merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
893    apply_sdist_name_version_fallback(path, &mut package_data);
894    package_data.datasource_id = Some(DatasourceId::PypiSdist);
895    package_data
896}
897
898fn select_sdist_pkginfo_entry(
899    archive_path: &Path,
900    entries: &[(String, String)],
901) -> Option<(String, String)> {
902    let expected_name = archive_path
903        .file_name()
904        .and_then(|name| name.to_str())
905        .and_then(strip_python_archive_extension)
906        .and_then(|stem| {
907            stem.rsplit_once('-')
908                .map(|(name, _)| normalize_python_package_name(name))
909        });
910
911    entries
912        .iter()
913        .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
914        .min_by_key(|(entry_path, content)| {
915            let components: Vec<_> = entry_path
916                .split('/')
917                .filter(|part| !part.is_empty())
918                .collect();
919            let metadata = super::rfc822::parse_rfc822_content(content);
920            let candidate_name = super::rfc822::get_header_first(&metadata.headers, "name")
921                .map(|name| normalize_python_package_name(&name));
922            let name_rank = if candidate_name == expected_name {
923                0
924            } else {
925                1
926            };
927            let kind_rank = if components.len() == 3
928                && components[1].ends_with(".egg-info")
929                && components[2] == "PKG-INFO"
930            {
931                0
932            } else if components.len() == 2 && components[1] == "PKG-INFO" {
933                1
934            } else if entry_path.ends_with(".egg-info/PKG-INFO") {
935                2
936            } else {
937                3
938            };
939
940            (name_rank, kind_rank, components.len(), entry_path.clone())
941        })
942        .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
943}
944
945fn merge_sdist_archive_dependencies(
946    entries: &[(String, String)],
947    metadata_path: &str,
948    package_data: &mut PackageData,
949) {
950    let metadata_dir = metadata_path
951        .rsplit_once('/')
952        .map(|(dir, _)| dir)
953        .unwrap_or("");
954    let archive_root = metadata_path.split('/').next().unwrap_or("");
955    let matched_egg_info_dir =
956        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
957    let mut extra_dependencies = Vec::new();
958
959    for (entry_path, content) in entries {
960        let is_direct_requires =
961            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
962        let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
963            entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
964        });
965
966        if is_direct_requires || is_egg_info_requires {
967            extra_dependencies.extend(parse_requires_txt(content));
968        }
969    }
970
971    for dependency in extra_dependencies {
972        if !package_data.dependencies.iter().any(|existing| {
973            existing.purl == dependency.purl
974                && existing.scope == dependency.scope
975                && existing.extracted_requirement == dependency.extracted_requirement
976                && existing.extra_data == dependency.extra_data
977        }) {
978            package_data.dependencies.push(dependency);
979        }
980    }
981}
982
983fn merge_sdist_archive_file_references(
984    entries: &[(String, String)],
985    metadata_path: &str,
986    package_data: &mut PackageData,
987) {
988    let metadata_dir = metadata_path
989        .rsplit_once('/')
990        .map(|(dir, _)| dir)
991        .unwrap_or("");
992    let archive_root = metadata_path.split('/').next().unwrap_or("");
993    let matched_egg_info_dir =
994        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
995    let mut extra_refs = Vec::new();
996
997    for (entry_path, content) in entries {
998        let is_direct_sources =
999            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1000        let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1001            entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1002        });
1003
1004        if is_direct_sources || is_egg_info_sources {
1005            extra_refs.extend(parse_sources_txt(content));
1006        }
1007    }
1008
1009    for file_ref in extra_refs {
1010        if !package_data
1011            .file_references
1012            .iter()
1013            .any(|existing| existing.path == file_ref.path)
1014        {
1015            package_data.file_references.push(file_ref);
1016        }
1017    }
1018}
1019
1020fn select_matching_sdist_egg_info_dir(
1021    entries: &[(String, String)],
1022    archive_root: &str,
1023    package_name: Option<&str>,
1024) -> Option<String> {
1025    let normalized_package_name = package_name.map(normalize_python_package_name);
1026
1027    entries
1028        .iter()
1029        .filter_map(|(entry_path, _)| {
1030            let components: Vec<_> = entry_path
1031                .split('/')
1032                .filter(|part| !part.is_empty())
1033                .collect();
1034            if components.len() == 3
1035                && components[0] == archive_root
1036                && components[1].ends_with(".egg-info")
1037            {
1038                Some(components[1].to_string())
1039            } else {
1040                None
1041            }
1042        })
1043        .min_by_key(|egg_info_dir| {
1044            let normalized_dir_name =
1045                normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1046            let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1047                0
1048            } else {
1049                1
1050            };
1051
1052            (name_rank, egg_info_dir.clone())
1053        })
1054}
1055
/// Normalizes a Python project name for comparison, following PEP 503:
/// ASCII letters are lowercased and every run of `-`, `_` and `.` collapses
/// into a single `-`.
///
/// This makes spellings such as `zope.interface`, `zope_interface` and
/// `Zope-Interface` compare equal, which matters when an archive file name and
/// the `Name` header of its metadata use different separators.
fn normalize_python_package_name(name: &str) -> String {
    let mut normalized = String::with_capacity(name.len());
    let mut in_separator_run = false;

    for ch in name.chars() {
        if matches!(ch, '-' | '_' | '.') {
            // Emit one '-' for the whole run of separators.
            if !in_separator_run {
                normalized.push('-');
            }
            in_separator_run = true;
        } else {
            normalized.push(ch.to_ascii_lowercase());
            in_separator_run = false;
        }
    }

    normalized
}
1059
1060fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1061    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1062        return;
1063    };
1064
1065    let Some(stem) = strip_python_archive_extension(file_name) else {
1066        return;
1067    };
1068
1069    let Some((name, version)) = stem.rsplit_once('-') else {
1070        return;
1071    };
1072
1073    if package_data.name.is_none() {
1074        package_data.name = Some(name.replace('_', "-"));
1075    }
1076    if package_data.version.is_none() {
1077        package_data.version = Some(version.to_string());
1078    }
1079
1080    if package_data.purl.is_none()
1081        || package_data.repository_homepage_url.is_none()
1082        || package_data.repository_download_url.is_none()
1083        || package_data.api_data_url.is_none()
1084    {
1085        let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1086            build_pypi_urls(
1087                package_data.name.as_deref(),
1088                package_data.version.as_deref(),
1089            );
1090
1091        if package_data.repository_homepage_url.is_none() {
1092            package_data.repository_homepage_url = repository_homepage_url;
1093        }
1094        if package_data.repository_download_url.is_none() {
1095            package_data.repository_download_url = repository_download_url;
1096        }
1097        if package_data.api_data_url.is_none() {
1098            package_data.api_data_url = api_data_url;
1099        }
1100        if package_data.purl.is_none() {
1101            package_data.purl = purl;
1102        }
1103    }
1104}
1105
1106fn extract_from_wheel_archive(path: &Path) -> PackageData {
1107    let metadata = match std::fs::metadata(path) {
1108        Ok(m) => m,
1109        Err(e) => {
1110            warn!(
1111                "Failed to read metadata for wheel archive {:?}: {}",
1112                path, e
1113            );
1114            return default_package_data(path);
1115        }
1116    };
1117
1118    if metadata.len() > MAX_ARCHIVE_SIZE {
1119        warn!(
1120            "Wheel archive too large: {} bytes (limit: {} bytes)",
1121            metadata.len(),
1122            MAX_ARCHIVE_SIZE
1123        );
1124        return default_package_data(path);
1125    }
1126
1127    let file = match File::open(path) {
1128        Ok(f) => f,
1129        Err(e) => {
1130            warn!("Failed to open wheel archive {:?}: {}", path, e);
1131            return default_package_data(path);
1132        }
1133    };
1134
1135    let mut archive = match ZipArchive::new(file) {
1136        Ok(a) => a,
1137        Err(e) => {
1138            warn!("Failed to read wheel archive {:?}: {}", path, e);
1139            return default_package_data(path);
1140        }
1141    };
1142
1143    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1144        Ok(entries) => entries,
1145        Err(_) => return default_package_data(path),
1146    };
1147
1148    let metadata_entry =
1149        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1150            Some(entry) => entry,
1151            None => {
1152                warn!("No METADATA file found in wheel archive {:?}", path);
1153                return default_package_data(path);
1154            }
1155        };
1156
1157    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1158        Ok(c) => c,
1159        Err(e) => {
1160            warn!("Failed to read METADATA from {:?}: {}", path, e);
1161            return default_package_data(path);
1162        }
1163    };
1164
1165    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1166
1167    let (size, sha256) = calculate_file_checksums(path);
1168    package_data.size = size;
1169    package_data.sha256 = sha256;
1170
1171    if let Some(record_entry) =
1172        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1173        && let Ok(record_content) =
1174            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1175    {
1176        package_data.file_references = parse_record_csv(&record_content);
1177    }
1178
1179    if let Some(wheel_info) = parse_wheel_filename(path) {
1180        if package_data.name.is_none() {
1181            package_data.name = Some(wheel_info.name.clone());
1182        }
1183        if package_data.version.is_none() {
1184            package_data.version = Some(wheel_info.version.clone());
1185        }
1186
1187        package_data.qualifiers = Some(std::collections::HashMap::from([(
1188            "extension".to_string(),
1189            format!(
1190                "{}-{}-{}",
1191                wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1192            ),
1193        )]));
1194
1195        package_data.purl = build_wheel_purl(
1196            package_data.name.as_deref(),
1197            package_data.version.as_deref(),
1198            &wheel_info,
1199        );
1200
1201        let mut extra_data = package_data.extra_data.unwrap_or_default();
1202        extra_data.insert(
1203            "python_requires".to_string(),
1204            serde_json::Value::String(wheel_info.python_tag.clone()),
1205        );
1206        extra_data.insert(
1207            "abi_tag".to_string(),
1208            serde_json::Value::String(wheel_info.abi_tag.clone()),
1209        );
1210        extra_data.insert(
1211            "platform_tag".to_string(),
1212            serde_json::Value::String(wheel_info.platform_tag.clone()),
1213        );
1214        package_data.extra_data = Some(extra_data);
1215    }
1216
1217    package_data
1218}
1219
1220fn extract_from_egg_archive(path: &Path) -> PackageData {
1221    let metadata = match std::fs::metadata(path) {
1222        Ok(m) => m,
1223        Err(e) => {
1224            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1225            return default_package_data(path);
1226        }
1227    };
1228
1229    if metadata.len() > MAX_ARCHIVE_SIZE {
1230        warn!(
1231            "Egg archive too large: {} bytes (limit: {} bytes)",
1232            metadata.len(),
1233            MAX_ARCHIVE_SIZE
1234        );
1235        return default_package_data(path);
1236    }
1237
1238    let file = match File::open(path) {
1239        Ok(f) => f,
1240        Err(e) => {
1241            warn!("Failed to open egg archive {:?}: {}", path, e);
1242            return default_package_data(path);
1243        }
1244    };
1245
1246    let mut archive = match ZipArchive::new(file) {
1247        Ok(a) => a,
1248        Err(e) => {
1249            warn!("Failed to read egg archive {:?}: {}", path, e);
1250            return default_package_data(path);
1251        }
1252    };
1253
1254    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1255        Ok(entries) => entries,
1256        Err(_) => return default_package_data(path),
1257    };
1258
1259    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1260        &validated_entries,
1261        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1262    ) {
1263        Some(entry) => entry,
1264        None => {
1265            warn!("No PKG-INFO file found in egg archive {:?}", path);
1266            return default_package_data(path);
1267        }
1268    };
1269
1270    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1271        Ok(c) => c,
1272        Err(e) => {
1273            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1274            return default_package_data(path);
1275        }
1276    };
1277
1278    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1279
1280    let (size, sha256) = calculate_file_checksums(path);
1281    package_data.size = size;
1282    package_data.sha256 = sha256;
1283
1284    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1285        &validated_entries,
1286        &[
1287            "EGG-INFO/installed-files.txt",
1288            ".egg-info/installed-files.txt",
1289        ],
1290    ) && let Ok(installed_files_content) =
1291        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1292    {
1293        package_data.file_references = parse_installed_files_txt(&installed_files_content);
1294    }
1295
1296    if let Some(egg_info) = parse_egg_filename(path) {
1297        if package_data.name.is_none() {
1298            package_data.name = Some(egg_info.name.clone());
1299        }
1300        if package_data.version.is_none() {
1301            package_data.version = Some(egg_info.version.clone());
1302        }
1303
1304        if let Some(python_version) = &egg_info.python_version {
1305            let mut extra_data = package_data.extra_data.unwrap_or_default();
1306            extra_data.insert(
1307                "python_version".to_string(),
1308                serde_json::Value::String(python_version.clone()),
1309            );
1310            package_data.extra_data = Some(extra_data);
1311        }
1312    }
1313
1314    package_data.purl = build_egg_purl(
1315        package_data.name.as_deref(),
1316        package_data.version.as_deref(),
1317    );
1318
1319    package_data
1320}
1321
1322fn find_validated_zip_entry_by_suffix<'a>(
1323    entries: &'a [ValidatedZipEntry],
1324    suffix: &str,
1325) -> Option<&'a ValidatedZipEntry> {
1326    entries.iter().find(|entry| entry.name.ends_with(suffix))
1327}
1328
1329fn find_validated_zip_entry_by_any_suffix<'a>(
1330    entries: &'a [ValidatedZipEntry],
1331    suffixes: &[&str],
1332) -> Option<&'a ValidatedZipEntry> {
1333    entries
1334        .iter()
1335        .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1336}
1337
1338fn read_validated_zip_entry<R: Read + std::io::Seek>(
1339    archive: &mut ZipArchive<R>,
1340    entry: &ValidatedZipEntry,
1341    path: &Path,
1342    archive_type: &str,
1343) -> Result<String, String> {
1344    let mut file = archive
1345        .by_index(entry.index)
1346        .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1347
1348    let compressed_size = file.compressed_size();
1349    let uncompressed_size = file.size();
1350
1351    if compressed_size > 0 {
1352        let ratio = uncompressed_size as f64 / compressed_size as f64;
1353        if ratio > MAX_COMPRESSION_RATIO {
1354            return Err(format!(
1355                "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1356                archive_type, path, ratio
1357            ));
1358        }
1359    }
1360
1361    if uncompressed_size > MAX_FILE_SIZE {
1362        return Err(format!(
1363            "Rejected oversized entry in {} {:?}: {} bytes",
1364            archive_type, path, uncompressed_size
1365        ));
1366    }
1367
1368    read_limited_utf8(
1369        &mut file,
1370        MAX_FILE_SIZE,
1371        &format!("{} entry {}", archive_type, entry.name),
1372    )
1373}
1374
/// Reads at most `max_bytes` bytes from `reader` and decodes them as UTF-8.
///
/// `context` names the thing being read and is used only in error messages.
///
/// # Errors
/// Returns a message when the underlying read fails, when the stream holds
/// more than `max_bytes` bytes, or when the bytes are not valid UTF-8.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    // Allow one byte beyond the limit so an over-long stream can be told apart
    // from one that is exactly `max_bytes` long. Saturate instead of adding
    // plainly: with `max_bytes == u64::MAX`, `+ 1` would panic in debug builds
    // and wrap to a zero-byte read in release builds.
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if bytes.len() as u64 > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1395
/// Converts an archive entry path into a relative, `/`-separated form, or
/// rejects it.
///
/// Backslashes are treated as separators. Returns `None` for paths that start
/// with a drive letter followed by `:/`, for paths containing a root, parent
/// (`..`) or prefix component, and for paths with no normal component left.
/// Current-directory components are dropped.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Checked on the raw bytes so the drive-letter form is rejected regardless
    // of how `Path` splits it.
    let has_drive_prefix = matches!(
        unified.as_bytes(),
        [drive, b':', b'/', ..] if drive.is_ascii_alphabetic()
    );
    if has_drive_prefix {
        return None;
    }

    // Inner `None` marks a forbidden component; collecting into `Option` turns
    // the first one into an overall `None`. Outer `None` just skips `.`.
    let segments = Path::new(&unified)
        .components()
        .filter_map(|component| match component {
            Component::CurDir => None,
            Component::Normal(segment) => Some(Some(segment.to_string_lossy().into_owned())),
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => Some(None),
        })
        .collect::<Option<Vec<String>>>()?;

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1417
1418/// Parses RECORD CSV format from wheel archives (PEP 427).
1419/// Format: path,hash,size (3 columns, no header)
1420/// Hash format: sha256=urlsafe_base64_hash or empty
1421/// Size: bytes as u64 or empty
1422pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1423    let mut reader = ReaderBuilder::new()
1424        .has_headers(false)
1425        .from_reader(content.as_bytes());
1426
1427    let mut file_references = Vec::new();
1428
1429    for result in reader.records() {
1430        match result {
1431            Ok(record) => {
1432                if record.len() < 3 {
1433                    continue;
1434                }
1435
1436                let path = record.get(0).unwrap_or("").trim().to_string();
1437                if path.is_empty() {
1438                    continue;
1439                }
1440
1441                let hash_field = record.get(1).unwrap_or("").trim();
1442                let size_field = record.get(2).unwrap_or("").trim();
1443
1444                // Parse hash: format is "algorithm=value"
1445                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1446                    let parts: Vec<&str> = hash_field.split('=').collect();
1447                    if parts.len() == 2 && parts[0] == "sha256" {
1448                        // Decode base64 to hex
1449                        match URL_SAFE_NO_PAD.decode(parts[1]) {
1450                            Ok(decoded) => {
1451                                let hex = decoded
1452                                    .iter()
1453                                    .map(|b| format!("{:02x}", b))
1454                                    .collect::<String>();
1455                                Some(hex)
1456                            }
1457                            Err(_) => None,
1458                        }
1459                    } else {
1460                        None
1461                    }
1462                } else {
1463                    None
1464                };
1465
1466                // Parse size
1467                let size = if !size_field.is_empty() && size_field != "-" {
1468                    size_field.parse::<u64>().ok()
1469                } else {
1470                    None
1471                };
1472
1473                file_references.push(FileReference {
1474                    path,
1475                    size,
1476                    sha1: None,
1477                    md5: None,
1478                    sha256,
1479                    sha512: None,
1480                    extra_data: None,
1481                });
1482            }
1483            Err(e) => {
1484                warn!("Failed to parse RECORD CSV row: {}", e);
1485                continue;
1486            }
1487        }
1488    }
1489
1490    file_references
1491}
1492
1493/// Parses installed-files.txt format from egg archives (PEP 376).
1494/// Format: one file path per line, no headers, no hash, no size
1495pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1496    content
1497        .lines()
1498        .map(|line| line.trim())
1499        .filter(|line| !line.is_empty())
1500        .map(|path| FileReference {
1501            path: path.to_string(),
1502            size: None,
1503            sha1: None,
1504            md5: None,
1505            sha256: None,
1506            sha512: None,
1507            extra_data: None,
1508        })
1509        .collect()
1510}
1511
1512pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1513    content
1514        .lines()
1515        .map(str::trim)
1516        .filter(|line| !line.is_empty())
1517        .map(|path| FileReference {
1518            path: path.to_string(),
1519            size: None,
1520            sha1: None,
1521            md5: None,
1522            sha256: None,
1523            sha512: None,
1524            extra_data: None,
1525        })
1526        .collect()
1527}
1528
/// Components decoded from a wheel file name.
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component, taken verbatim from the file name.
    version: String,
    /// Python tag, e.g. `py3` or `cp39`.
    python_tag: String,
    /// ABI tag, e.g. `none` or `cp39`.
    abi_tag: String,
    /// Platform tag; any further `-`-separated parts are joined back into it.
    platform_tag: String,
}

/// Splits a wheel file name into its components.
///
/// The expected layout is
/// `{name}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl`.
/// The optional build tag is recognized by its leading digit and skipped, so
/// the three compatibility tags are not shifted by it. Returns `None` when the
/// file stem has fewer than five `-`-separated parts.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    // A build tag must start with a digit, while Python tags start with a
    // letter; that is what distinguishes `name-ver-1-py3-none-any` from a
    // five-part name.
    let has_build_tag =
        parts.len() >= 6 && parts[2].starts_with(|c: char| c.is_ascii_digit());
    let tags_start = if has_build_tag { 3 } else { 2 };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: parts[tags_start].to_string(),
        abi_tag: parts[tags_start + 1].to_string(),
        platform_tag: parts[tags_start + 2..].join("-"),
    })
}
1553
/// Components decoded from an egg file name.
struct EggInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component, taken verbatim from the file name.
    version: String,
    /// Third `-`-separated part of the file stem, when there is one.
    python_version: Option<String>,
}

/// Splits an egg file name into name, version and optional Python version.
///
/// The file stem is split on `-`: the first part is the name, the second the
/// version, and the third (if any) the Python version. Returns `None` when
/// the path has no file stem or the stem contains no `-`.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut parts = stem.split('-');

    let name = parts.next()?;
    let version = parts.next()?;
    let python_version = parts.next().map(str::to_string);

    Some(EggInfo {
        name: name.replace('_', "-"),
        version: version.to_string(),
        python_version,
    })
}
1574
1575fn build_wheel_purl(
1576    name: Option<&str>,
1577    version: Option<&str>,
1578    wheel_info: &WheelInfo,
1579) -> Option<String> {
1580    let name = name?;
1581    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1582
1583    if let Some(ver) = version {
1584        package_url.with_version(ver).ok()?;
1585    }
1586
1587    let extension = format!(
1588        "{}-{}-{}",
1589        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1590    );
1591    package_url.add_qualifier("extension", extension).ok()?;
1592
1593    Some(package_url.to_string())
1594}
1595
1596fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1597    let name = name?;
1598    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1599
1600    if let Some(ver) = version {
1601        package_url.with_version(ver).ok()?;
1602    }
1603
1604    package_url.add_qualifier("type", "egg").ok()?;
1605
1606    Some(package_url.to_string())
1607}
1608
1609fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1610    let metadata = super::rfc822::parse_rfc822_content(content);
1611    build_package_data_from_rfc822(&metadata, datasource_id)
1612}
1613
1614/// Builds PackageData from parsed RFC822 metadata.
1615///
1616/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
1617/// and `python_parse_rfc822_content` (content-based) functions.
1618fn build_package_data_from_rfc822(
1619    metadata: &super::rfc822::Rfc822Metadata,
1620    datasource_id: DatasourceId,
1621) -> PackageData {
1622    use super::rfc822::{get_header_all, get_header_first};
1623
1624    let name = get_header_first(&metadata.headers, "name");
1625    let version = get_header_first(&metadata.headers, "version");
1626    let summary = get_header_first(&metadata.headers, "summary");
1627    let mut homepage_url = get_header_first(&metadata.headers, "home-page");
1628    let author = get_header_first(&metadata.headers, "author");
1629    let author_email = get_header_first(&metadata.headers, "author-email");
1630    let license = get_header_first(&metadata.headers, "license");
1631    let license_expression = get_header_first(&metadata.headers, "license-expression");
1632    let download_url = get_header_first(&metadata.headers, "download-url");
1633    let platform = get_header_first(&metadata.headers, "platform");
1634    let requires_python = get_header_first(&metadata.headers, "requires-python");
1635    let classifiers = get_header_all(&metadata.headers, "classifier");
1636    let license_files = get_header_all(&metadata.headers, "license-file");
1637
1638    let description_body = if metadata.body.is_empty() {
1639        get_header_first(&metadata.headers, "description").unwrap_or_default()
1640    } else {
1641        metadata.body.clone()
1642    };
1643
1644    let description = build_description(summary.as_deref(), &description_body);
1645
1646    let mut parties = Vec::new();
1647    if author.is_some() || author_email.is_some() {
1648        parties.push(Party {
1649            r#type: Some("person".to_string()),
1650            role: Some("author".to_string()),
1651            name: author,
1652            email: author_email,
1653            url: None,
1654            organization: None,
1655            organization_url: None,
1656            timezone: None,
1657        });
1658    }
1659
1660    let (keywords, license_classifiers) = split_classifiers(&classifiers);
1661    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1662    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1663        license_expression
1664            .as_deref()
1665            .and_then(normalize_spdx_expression)
1666            .map(|normalized| {
1667                build_declared_license_data(
1668                    normalized,
1669                    DeclaredLicenseMatchMetadata::single_line(
1670                        license_expression.as_deref().unwrap_or_default(),
1671                    )
1672                    .with_referenced_filenames(&referenced_license_files),
1673                )
1674            })
1675            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1676
1677    let extracted_license_statement = license_expression
1678        .clone()
1679        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1680
1681    let mut extra_data = HashMap::new();
1682    if let Some(platform_value) = platform
1683        && !platform_value.eq_ignore_ascii_case("unknown")
1684        && !platform_value.is_empty()
1685    {
1686        extra_data.insert(
1687            "platform".to_string(),
1688            serde_json::Value::String(platform_value),
1689        );
1690    }
1691
1692    if let Some(requires_python_value) = requires_python
1693        && !requires_python_value.is_empty()
1694    {
1695        extra_data.insert(
1696            "requires_python".to_string(),
1697            serde_json::Value::String(requires_python_value),
1698        );
1699    }
1700
1701    if !license_files.is_empty() {
1702        extra_data.insert(
1703            "license_files".to_string(),
1704            serde_json::Value::Array(
1705                license_files
1706                    .iter()
1707                    .cloned()
1708                    .map(serde_json::Value::String)
1709                    .collect(),
1710            ),
1711        );
1712    }
1713
1714    let file_references = license_files
1715        .iter()
1716        .map(|path| FileReference {
1717            path: path.clone(),
1718            size: None,
1719            sha1: None,
1720            md5: None,
1721            sha256: None,
1722            sha512: None,
1723            extra_data: None,
1724        })
1725        .collect();
1726
1727    let project_urls = get_header_all(&metadata.headers, "project-url");
1728    let dependencies = extract_rfc822_dependencies(&metadata.headers);
1729    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1730
1731    if !project_urls.is_empty() {
1732        let parsed_urls = parse_project_urls(&project_urls);
1733
1734        for (label, url) in &parsed_urls {
1735            let label_lower = label.to_lowercase();
1736
1737            if bug_tracking_url.is_none()
1738                && matches!(
1739                    label_lower.as_str(),
1740                    "tracker"
1741                        | "bug reports"
1742                        | "bug tracker"
1743                        | "issues"
1744                        | "issue tracker"
1745                        | "github: issues"
1746                )
1747            {
1748                bug_tracking_url = Some(url.clone());
1749            } else if code_view_url.is_none()
1750                && matches!(label_lower.as_str(), "source" | "source code" | "code")
1751            {
1752                code_view_url = Some(url.clone());
1753            } else if vcs_url.is_none()
1754                && matches!(
1755                    label_lower.as_str(),
1756                    "github" | "gitlab" | "github: repo" | "repository"
1757                )
1758            {
1759                vcs_url = Some(url.clone());
1760            } else if homepage_url.is_none()
1761                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1762            {
1763                homepage_url = Some(url.clone());
1764            } else if label_lower == "changelog" {
1765                extra_data.insert(
1766                    "changelog_url".to_string(),
1767                    serde_json::Value::String(url.clone()),
1768                );
1769            }
1770        }
1771
1772        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
1773            .iter()
1774            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
1775            .collect();
1776
1777        if !project_urls_json.is_empty() {
1778            extra_data.insert(
1779                "project_urls".to_string(),
1780                serde_json::Value::Object(project_urls_json),
1781            );
1782        }
1783    }
1784
1785    let extra_data = if extra_data.is_empty() {
1786        None
1787    } else {
1788        Some(extra_data)
1789    };
1790
1791    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1792        build_pypi_urls(name.as_deref(), version.as_deref());
1793
1794    PackageData {
1795        package_type: Some(PythonParser::PACKAGE_TYPE),
1796        namespace: None,
1797        name,
1798        version,
1799        qualifiers: None,
1800        subpath: None,
1801        primary_language: Some("Python".to_string()),
1802        description,
1803        release_date: None,
1804        parties,
1805        keywords,
1806        homepage_url,
1807        download_url,
1808        size: None,
1809        sha1: None,
1810        md5: None,
1811        sha256: None,
1812        sha512: None,
1813        bug_tracking_url,
1814        code_view_url,
1815        vcs_url,
1816        copyright: None,
1817        holder: None,
1818        declared_license_expression,
1819        declared_license_expression_spdx,
1820        license_detections,
1821        other_license_expression: None,
1822        other_license_expression_spdx: None,
1823        other_license_detections: Vec::new(),
1824        extracted_license_statement,
1825        notice_text: None,
1826        source_packages: Vec::new(),
1827        file_references,
1828        is_private: false,
1829        is_virtual: false,
1830        extra_data,
1831        dependencies,
1832        repository_homepage_url,
1833        repository_download_url,
1834        api_data_url,
1835        datasource_id: Some(datasource_id),
1836        purl,
1837    }
1838}
1839
/// Parses `Project-URL` metadata values into `(label, url)` pairs.
///
/// Each entry is expected to look like `Label, https://example.com`. The label
/// and the URL are separated at the first comma; whitespace around either side
/// is optional and trimmed away, so `Label,https://example.com` is accepted as
/// well. Entries without a comma, or with an empty label or URL, are skipped.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|entry| {
            // Split at the first comma only: the URL itself may contain commas.
            let (label, url) = entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());
            if label.is_empty() || url.is_empty() {
                return None;
            }
            Some((label.to_string(), url.to_string()))
        })
        .collect()
}
1855
/// Combines an optional summary and a body into a single description.
///
/// Both pieces are trimmed; blank pieces are dropped. The remaining pieces are
/// joined with a newline (summary first). Returns `None` when nothing is left.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let sections: Vec<&str> = summary
        .into_iter()
        .chain(Some(body))
        .map(|section| section.trim())
        .filter(|section| !section.is_empty())
        .collect();

    match sections.is_empty() {
        true => None,
        false => Some(sections.join("\n")),
    }
}
1874
/// Separates trove classifiers into `(keywords, license_classifiers)`.
///
/// Classifiers starting with `License ::` go to the second vector; everything
/// else goes to the first. Relative order is preserved within each vector.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|entry| entry.starts_with("License ::"));

    (keywords, license_classifiers)
}
1889
/// Renders the declared license and license classifiers as a small text block.
///
/// The output has a `license: <value>` line when `license` is non-blank,
/// followed by a `classifiers:` header and one quoted list item per license
/// classifier. The text always ends with a newline. Returns `None` when there
/// is neither a license value nor any classifier.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let license_line = license
        .map(|value| value.trim())
        .filter(|value| !value.is_empty())
        .map(|value| format!("license: {}", value));

    // The header is only emitted when at least one classifier follows it.
    let classifiers_header = if license_classifiers.is_empty() {
        None
    } else {
        Some("classifiers:".to_string())
    };

    let classifier_lines = license_classifiers
        .iter()
        .map(|classifier| format!("  - '{}'", classifier));

    let lines: Vec<String> = license_line
        .into_iter()
        .chain(classifiers_header)
        .chain(classifier_lines)
        .collect();

    if lines.is_empty() {
        return None;
    }
    Some(format!("{}\n", lines.join("\n")))
}
1915
1916pub(crate) fn build_pypi_urls(
1917    name: Option<&str>,
1918    version: Option<&str>,
1919) -> (
1920    Option<String>,
1921    Option<String>,
1922    Option<String>,
1923    Option<String>,
1924) {
1925    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
1926
1927    let repository_download_url = name.and_then(|value| {
1928        version.map(|ver| {
1929            format!(
1930                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
1931                &value[..1.min(value.len())],
1932                value,
1933                value,
1934                ver
1935            )
1936        })
1937    });
1938
1939    let api_data_url = name.map(|value| {
1940        if let Some(ver) = version {
1941            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
1942        } else {
1943            format!("https://pypi.org/pypi/{}/json", value)
1944        }
1945    });
1946
1947    let purl = name.and_then(|value| {
1948        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
1949        if let Some(ver) = version {
1950            package_url.with_version(ver).ok()?;
1951        }
1952        Some(package_url.to_string())
1953    });
1954
1955    (
1956        repository_homepage_url,
1957        repository_download_url,
1958        api_data_url,
1959        purl,
1960    )
1961}
1962
1963fn build_pypi_purl_with_extension(
1964    name: &str,
1965    version: Option<&str>,
1966    extension: &str,
1967) -> Option<String> {
1968    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1969    if let Some(ver) = version {
1970        package_url.with_version(ver).ok()?;
1971    }
1972    package_url.add_qualifier("extension", extension).ok()?;
1973    Some(package_url.to_string())
1974}
1975
1976fn extract_from_pyproject_toml(path: &Path) -> PackageData {
1977    let toml_content = match read_toml_file(path) {
1978        Ok(content) => content,
1979        Err(e) => {
1980            warn!(
1981                "Failed to read or parse pyproject.toml at {:?}: {}",
1982                path, e
1983            );
1984            return default_package_data(path);
1985        }
1986    };
1987
1988    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
1989    let is_poetry_pyproject = tool_table
1990        .and_then(|tool| tool.get("poetry"))
1991        .and_then(|value| value.as_table())
1992        .is_some();
1993
1994    // Handle both PEP 621 (project table) and poetry formats
1995    let project_table =
1996        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
1997            // Standard PEP 621 format with [project] table
1998            project.clone()
1999        } else if let Some(tool) = tool_table {
2000            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2001                // Poetry format with [tool.poetry] table
2002                poetry.clone()
2003            } else {
2004                warn!(
2005                    "No project or tool.poetry data found in pyproject.toml at {:?}",
2006                    path
2007                );
2008                return default_package_data(path);
2009            }
2010        } else if toml_content.get(FIELD_NAME).is_some() {
2011            // Other format with top-level fields
2012            match toml_content.as_table() {
2013                Some(table) => table.clone(),
2014                None => {
2015                    warn!("Failed to convert TOML content to table in {:?}", path);
2016                    return default_package_data(path);
2017                }
2018            }
2019        } else {
2020            warn!("No project data found in pyproject.toml at {:?}", path);
2021            return default_package_data(path);
2022        };
2023
2024    let name = project_table
2025        .get(FIELD_NAME)
2026        .and_then(|v| v.as_str())
2027        .map(String::from);
2028
2029    let version = project_table
2030        .get(FIELD_VERSION)
2031        .and_then(|v| v.as_str())
2032        .map(String::from);
2033    let classifiers = project_table
2034        .get("classifiers")
2035        .and_then(|value| value.as_array())
2036        .map(|values| {
2037            values
2038                .iter()
2039                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2040                .collect::<Vec<_>>()
2041        })
2042        .unwrap_or_default();
2043
2044    let extracted_license_statement = extract_raw_license_string(&project_table);
2045    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2046        normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2047
2048    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
2049    let (homepage_url, repository_url) = extract_urls(&project_table);
2050
2051    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2052    let extra_data = extract_pyproject_extra_data(&toml_content);
2053
2054    // Create package URL
2055    let purl = name.as_ref().and_then(|n| {
2056        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2057            Ok(p) => p,
2058            Err(e) => {
2059                warn!(
2060                    "Failed to create PackageUrl for Python package '{}': {}",
2061                    n, e
2062                );
2063                return None;
2064            }
2065        };
2066
2067        if let Some(v) = &version
2068            && let Err(e) = package_url.with_version(v)
2069        {
2070            warn!(
2071                "Failed to set version '{}' for Python package '{}': {}",
2072                v, n, e
2073            );
2074            return None;
2075        }
2076
2077        Some(package_url.to_string())
2078    });
2079
2080    let api_data_url = name.as_ref().map(|n| {
2081        if let Some(v) = &version {
2082            format!("https://pypi.org/pypi/{}/{}/json", n, v)
2083        } else {
2084            format!("https://pypi.org/pypi/{}/json", n)
2085        }
2086    });
2087
2088    let pypi_homepage_url = name
2089        .as_ref()
2090        .map(|n| format!("https://pypi.org/project/{}", n));
2091
2092    let pypi_download_url = name.as_ref().and_then(|n| {
2093        version.as_ref().map(|v| {
2094            format!(
2095                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2096                &n[..1.min(n.len())],
2097                n,
2098                n,
2099                v
2100            )
2101        })
2102    });
2103
2104    PackageData {
2105        package_type: Some(PythonParser::PACKAGE_TYPE),
2106        namespace: None,
2107        name,
2108        version,
2109        qualifiers: None,
2110        subpath: None,
2111        primary_language: None,
2112        description: None,
2113        release_date: None,
2114        parties: extract_parties(&project_table),
2115        keywords: Vec::new(),
2116        homepage_url: homepage_url.or(pypi_homepage_url),
2117        download_url: repository_url.clone().or(pypi_download_url),
2118        size: None,
2119        sha1: None,
2120        md5: None,
2121        sha256: None,
2122        sha512: None,
2123        bug_tracking_url: None,
2124        code_view_url: None,
2125        vcs_url: repository_url,
2126        copyright: None,
2127        holder: None,
2128        declared_license_expression,
2129        declared_license_expression_spdx,
2130        license_detections,
2131        other_license_expression: None,
2132        other_license_expression_spdx: None,
2133        other_license_detections: Vec::new(),
2134        extracted_license_statement,
2135        notice_text: None,
2136        source_packages: Vec::new(),
2137        file_references: Vec::new(),
2138        is_private: has_private_classifier(&classifiers),
2139        is_virtual: false,
2140        extra_data,
2141        dependencies: [dependencies, optional_dependencies].concat(),
2142        repository_homepage_url: None,
2143        repository_download_url: None,
2144        api_data_url,
2145        datasource_id: Some(if is_poetry_pyproject {
2146            DatasourceId::PypiPoetryPyprojectToml
2147        } else {
2148            DatasourceId::PypiPyprojectToml
2149        }),
2150        purl,
2151    }
2152}
2153
2154fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2155    let path_str = path.to_string_lossy().replace('\\', "/");
2156    if path_str.contains("/EGG-INFO/PKG-INFO") {
2157        DatasourceId::PypiEggPkginfo
2158    } else if path_str.ends_with(".egg-info/PKG-INFO") {
2159        DatasourceId::PypiEditableEggPkginfo
2160    } else {
2161        DatasourceId::PypiSdistPkginfo
2162    }
2163}
2164
2165fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2166    project
2167        .get(FIELD_LICENSE)
2168        .and_then(|license_value| match license_value {
2169            TomlValue::String(license_str) => Some(license_str.clone()),
2170            TomlValue::Table(license_table) => license_table
2171                .get("text")
2172                .and_then(|v| v.as_str())
2173                .map(|s| s.to_string())
2174                .or_else(|| {
2175                    license_table
2176                        .get("expression")
2177                        .and_then(|v| v.as_str())
2178                        .map(|expr| expr.to_string())
2179                }),
2180            _ => None,
2181        })
2182}
2183
2184fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2185    match project.get(FIELD_LICENSE) {
2186        Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2187        Some(TomlValue::Table(license_table)) => license_table
2188            .get("expression")
2189            .and_then(|value| value.as_str()),
2190        _ => None,
2191    }
2192}
2193
2194fn extract_urls(project: &TomlMap<String, TomlValue>) -> (Option<String>, Option<String>) {
2195    let mut homepage_url = None;
2196    let mut repository_url = None;
2197
2198    // Check for URLs table
2199    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2200        homepage_url = urls
2201            .get(FIELD_HOMEPAGE)
2202            .and_then(|v| v.as_str())
2203            .map(String::from);
2204        repository_url = urls
2205            .get(FIELD_REPOSITORY)
2206            .and_then(|v| v.as_str())
2207            .map(String::from);
2208    }
2209
2210    // If not found in URLs table, check for top-level keys
2211    if homepage_url.is_none() {
2212        homepage_url = project
2213            .get(FIELD_HOMEPAGE)
2214            .and_then(|v| v.as_str())
2215            .map(String::from);
2216    }
2217
2218    if repository_url.is_none() {
2219        repository_url = project
2220            .get(FIELD_REPOSITORY)
2221            .and_then(|v| v.as_str())
2222            .map(String::from);
2223    }
2224
2225    (homepage_url, repository_url)
2226}
2227
2228fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2229    let mut parties = Vec::new();
2230
2231    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2232        for author in authors {
2233            if let Some(author_str) = author.as_str() {
2234                let (name, email) = split_name_email(author_str);
2235                parties.push(Party {
2236                    r#type: None,
2237                    role: Some("author".to_string()),
2238                    name,
2239                    email,
2240                    url: None,
2241                    organization: None,
2242                    organization_url: None,
2243                    timezone: None,
2244                });
2245            }
2246        }
2247    }
2248
2249    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2250        for maintainer in maintainers {
2251            if let Some(maintainer_str) = maintainer.as_str() {
2252                let (name, email) = split_name_email(maintainer_str);
2253                parties.push(Party {
2254                    r#type: None,
2255                    role: Some("maintainer".to_string()),
2256                    name,
2257                    email,
2258                    url: None,
2259                    organization: None,
2260                    organization_url: None,
2261                    timezone: None,
2262                });
2263            }
2264        }
2265    }
2266
2267    parties
2268}
2269
2270fn extract_dependencies(
2271    project: &TomlMap<String, TomlValue>,
2272    toml_content: &TomlValue,
2273) -> (Vec<Dependency>, Vec<Dependency>) {
2274    let mut dependencies = Vec::new();
2275    let mut optional_dependencies = Vec::new();
2276
2277    // Handle dependencies - can be array or table format
2278    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2279        match deps_value {
2280            TomlValue::Array(arr) => {
2281                dependencies = parse_dependency_array(arr, false, None);
2282            }
2283            TomlValue::Table(table) => {
2284                dependencies = parse_dependency_table(table, false, None);
2285            }
2286            _ => {}
2287        }
2288    }
2289
2290    // Handle PEP 621 optional-dependencies with scope
2291    if let Some(opt_deps_table) = project
2292        .get(FIELD_OPTIONAL_DEPENDENCIES)
2293        .and_then(|v| v.as_table())
2294    {
2295        for (extra_name, deps) in opt_deps_table {
2296            match deps {
2297                TomlValue::Array(arr) => {
2298                    optional_dependencies.extend(parse_dependency_array(
2299                        arr,
2300                        true,
2301                        Some(extra_name),
2302                    ));
2303                }
2304                TomlValue::Table(table) => {
2305                    optional_dependencies.extend(parse_dependency_table(
2306                        table,
2307                        true,
2308                        Some(extra_name),
2309                    ));
2310                }
2311                _ => {}
2312            }
2313        }
2314    }
2315
2316    // Handle Poetry dev-dependencies
2317    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2318        match dev_deps_value {
2319            TomlValue::Array(arr) => {
2320                optional_dependencies.extend(parse_dependency_array(
2321                    arr,
2322                    true,
2323                    Some(FIELD_DEV_DEPENDENCIES),
2324                ));
2325            }
2326            TomlValue::Table(table) => {
2327                optional_dependencies.extend(parse_dependency_table(
2328                    table,
2329                    true,
2330                    Some(FIELD_DEV_DEPENDENCIES),
2331                ));
2332            }
2333            _ => {}
2334        }
2335    }
2336
2337    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
2338    if let Some(groups_table) = project.get("group").and_then(|v| v.as_table()) {
2339        for (group_name, group_data) in groups_table {
2340            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2341                match group_deps {
2342                    TomlValue::Array(arr) => {
2343                        optional_dependencies.extend(parse_dependency_array(
2344                            arr,
2345                            true,
2346                            Some(group_name),
2347                        ));
2348                    }
2349                    TomlValue::Table(table) => {
2350                        optional_dependencies.extend(parse_dependency_table(
2351                            table,
2352                            true,
2353                            Some(group_name),
2354                        ));
2355                    }
2356                    _ => {}
2357                }
2358            }
2359        }
2360    }
2361
2362    if let Some(groups_table) = toml_content
2363        .get(FIELD_DEPENDENCY_GROUPS)
2364        .and_then(|value| value.as_table())
2365    {
2366        for (group_name, deps) in groups_table {
2367            match deps {
2368                TomlValue::Array(arr) => {
2369                    optional_dependencies.extend(parse_dependency_array(
2370                        arr,
2371                        true,
2372                        Some(group_name),
2373                    ));
2374                }
2375                TomlValue::Table(table) => {
2376                    optional_dependencies.extend(parse_dependency_table(
2377                        table,
2378                        true,
2379                        Some(group_name),
2380                    ));
2381                }
2382                _ => {}
2383            }
2384        }
2385    }
2386
2387    if let Some(dev_deps_value) = toml_content
2388        .get("tool")
2389        .and_then(|value| value.as_table())
2390        .and_then(|tool| tool.get("uv"))
2391        .and_then(|value| value.as_table())
2392        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2393    {
2394        match dev_deps_value {
2395            TomlValue::Array(arr) => {
2396                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2397            }
2398            TomlValue::Table(table) => {
2399                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2400            }
2401            _ => {}
2402        }
2403    }
2404
2405    (dependencies, optional_dependencies)
2406}
2407
2408fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2409    let mut extra_data = HashMap::new();
2410
2411    if let Some(tool_uv) = toml_content
2412        .get("tool")
2413        .and_then(|value| value.as_table())
2414        .and_then(|tool| tool.get("uv"))
2415    {
2416        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2417    }
2418
2419    if extra_data.is_empty() {
2420        None
2421    } else {
2422        Some(extra_data)
2423    }
2424}
2425
2426fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2427    match value {
2428        TomlValue::String(value) => JsonValue::String(value.clone()),
2429        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2430        TomlValue::Float(value) => JsonValue::String(value.to_string()),
2431        TomlValue::Boolean(value) => JsonValue::Bool(*value),
2432        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2433        TomlValue::Array(values) => {
2434            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2435        }
2436        TomlValue::Table(values) => JsonValue::Object(
2437            values
2438                .iter()
2439                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2440                .collect::<JsonMap<String, JsonValue>>(),
2441        ),
2442    }
2443}
2444
2445fn parse_dependency_table(
2446    table: &TomlMap<String, TomlValue>,
2447    is_optional: bool,
2448    scope: Option<&str>,
2449) -> Vec<Dependency> {
2450    table
2451        .iter()
2452        .filter_map(|(name, version)| {
2453            let version_str = version.as_str().map(|s| s.to_string());
2454            let mut package_url =
2455                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2456
2457            if let Some(v) = &version_str {
2458                package_url.with_version(v).ok()?;
2459            }
2460
2461            Some(Dependency {
2462                purl: Some(package_url.to_string()),
2463                extracted_requirement: None,
2464                scope: scope.map(|s| s.to_string()),
2465                is_runtime: Some(!is_optional),
2466                is_optional: Some(is_optional),
2467                is_pinned: None,
2468                is_direct: Some(true),
2469                resolved_package: None,
2470                extra_data: None,
2471            })
2472        })
2473        .collect()
2474}
2475
2476fn parse_dependency_array(
2477    array: &[TomlValue],
2478    is_optional: bool,
2479    scope: Option<&str>,
2480) -> Vec<Dependency> {
2481    array
2482        .iter()
2483        .filter_map(|dep| {
2484            let dep_str = dep.as_str()?;
2485
2486            let mut parts = dep_str.split(['>', '=', '<', '~']);
2487            let name = parts.next()?.trim().to_string();
2488
2489            let version = parts.next().map(|v| v.trim().to_string());
2490
2491            let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name)
2492            {
2493                Ok(purl) => purl,
2494                Err(_) => return None,
2495            };
2496
2497            if let Some(ref v) = version {
2498                package_url.with_version(v).ok()?;
2499            }
2500
2501            Some(Dependency {
2502                purl: Some(package_url.to_string()),
2503                extracted_requirement: None,
2504                scope: scope.map(|s| s.to_string()),
2505                is_runtime: Some(!is_optional),
2506                is_optional: Some(is_optional),
2507                is_pinned: None,
2508                is_direct: Some(true),
2509                resolved_package: None,
2510                extra_data: None,
2511            })
2512        })
2513        .collect()
2514}
2515
/// A Python literal value obtained by statically evaluating an AST expression.
///
/// Produced by `LiteralEvaluator`; no Python code is executed to obtain it.
#[derive(Debug, Clone)]
enum Value {
    /// A string literal.
    String(String),
    /// An integer or float literal; integers are converted to `f64`.
    Number(f64),
    /// `True` or `False`.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list literal whose elements all evaluated successfully.
    List(Vec<Value>),
    /// A tuple literal whose elements all evaluated successfully.
    Tuple(Vec<Value>),
    /// A dict literal, a keyword-only `dict(...)` call, or an `OrderedDict(...)`
    /// call. Keys are stored as strings and insertion order is not preserved.
    Dict(HashMap<String, Value>),
}
2526
/// Evaluates a restricted subset of Python expressions to `Value`s, with
/// bounds on recursion depth and on the total number of nodes visited.
struct LiteralEvaluator {
    /// Known names; a `Name` expression evaluates to the value stored here.
    constants: HashMap<String, Value>,
    /// Evaluation fails once the expression depth reaches this limit.
    max_depth: usize,
    /// Evaluation fails once this many nodes have been visited.
    max_nodes: usize,
    /// Number of nodes visited so far over the evaluator's lifetime.
    nodes_visited: usize,
}
2533
2534impl LiteralEvaluator {
2535    fn new(constants: HashMap<String, Value>) -> Self {
2536        Self {
2537            constants,
2538            max_depth: MAX_SETUP_PY_AST_DEPTH,
2539            max_nodes: MAX_SETUP_PY_AST_NODES,
2540            nodes_visited: 0,
2541        }
2542    }
2543
2544    fn insert_constant(&mut self, name: String, value: Value) {
2545        self.constants.insert(name, value);
2546    }
2547
2548    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
2549        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
2550            return None;
2551        }
2552        self.nodes_visited += 1;
2553
2554        match expr {
2555            ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
2556                Some(Value::String(value.to_str().to_string()))
2557            }
2558            ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
2559                Some(Value::Bool(*value))
2560            }
2561            ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
2562                self.evaluate_number(value)
2563            }
2564            ast::Expr::NoneLiteral(_) => Some(Value::None),
2565            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
2566            ast::Expr::List(ast::ExprList { elts, .. }) => {
2567                let mut values = Vec::new();
2568                for elt in elts {
2569                    values.push(self.evaluate_expr(elt, depth + 1)?);
2570                }
2571                Some(Value::List(values))
2572            }
2573            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
2574                let mut values = Vec::new();
2575                for elt in elts {
2576                    values.push(self.evaluate_expr(elt, depth + 1)?);
2577                }
2578                Some(Value::Tuple(values))
2579            }
2580            ast::Expr::Dict(ast::ExprDict { items, .. }) => {
2581                let mut dict = HashMap::new();
2582                for item in items {
2583                    let key_expr = item.key.as_ref()?;
2584                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
2585                    let key = value_to_string(&key_value)?;
2586                    let value = self.evaluate_expr(&item.value, depth + 1)?;
2587                    dict.insert(key, value);
2588                }
2589                Some(Value::Dict(dict))
2590            }
2591            ast::Expr::Call(ast::ExprCall {
2592                func, arguments, ..
2593            }) => {
2594                let args = arguments.args.as_ref();
2595                let keywords = arguments.keywords.as_ref();
2596                if keywords.is_empty()
2597                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
2598                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
2599                {
2600                    return self.evaluate_ordered_dict(args, depth + 1);
2601                }
2602
2603                if !args.is_empty() {
2604                    return None;
2605                }
2606
2607                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
2608                    && id == "dict"
2609                {
2610                    let mut dict = HashMap::new();
2611                    for keyword in keywords {
2612                        let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
2613                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
2614                        dict.insert(key.to_string(), value);
2615                    }
2616                    return Some(Value::Dict(dict));
2617                }
2618
2619                None
2620            }
2621            _ => None,
2622        }
2623    }
2624
2625    fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
2626        match number {
2627            ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
2628            ast::Number::Float(value) => Some(Value::Number(*value)),
2629            ast::Number::Complex { .. } => None,
2630        }
2631    }
2632
2633    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
2634        if args.len() != 1 {
2635            return None;
2636        }
2637
2638        let items = match self.evaluate_expr(&args[0], depth)? {
2639            Value::List(items) | Value::Tuple(items) => items,
2640            _ => return None,
2641        };
2642
2643        let mut dict = HashMap::new();
2644        for item in items {
2645            let Value::Tuple(values) = item else {
2646                return None;
2647            };
2648            if values.len() != 2 {
2649                return None;
2650            }
2651            let key = value_to_string(&values[0])?;
2652            dict.insert(key, values[1].clone());
2653        }
2654
2655        Some(Value::Dict(dict))
2656    }
2657}
2658
/// Alias bookkeeping for `setup.py` analysis.
///
/// NOTE(review): the code that fills and reads this struct is outside this
/// chunk; the field descriptions below are inferred from names only — confirm.
#[derive(Default)]
struct SetupAliases {
    /// Presumably the local names under which the `setup` function is
    /// reachable (e.g. after `from setuptools import setup as s`) — TODO confirm.
    setup_names: HashSet<String>,
    /// Presumably maps a local alias to the module it was imported from
    /// (e.g. after `import setuptools as st`) — TODO confirm.
    module_aliases: HashMap<String, String>,
}
2664
2665fn extract_from_setup_py(path: &Path) -> PackageData {
2666    let content = match read_file_to_string(path) {
2667        Ok(content) => content,
2668        Err(e) => {
2669            warn!("Failed to read setup.py at {:?}: {}", path, e);
2670            return default_package_data(path);
2671        }
2672    };
2673
2674    if content.len() > MAX_SETUP_PY_BYTES {
2675        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
2676        return extract_from_setup_py_regex(&content);
2677    }
2678
2679    let mut package_data = match extract_from_setup_py_ast(&content) {
2680        Ok(Some(data)) => data,
2681        Ok(None) => extract_from_setup_py_regex(&content),
2682        Err(e) => {
2683            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
2684            extract_from_setup_py_regex(&content)
2685        }
2686    };
2687
2688    if package_data.name.is_none() {
2689        package_data.name = extract_setup_value(&content, "name");
2690    }
2691
2692    if package_data.version.is_none() {
2693        package_data.version = extract_setup_value(&content, "version");
2694    }
2695
2696    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
2697
2698    if package_data.purl.is_none() {
2699        package_data.purl = build_setup_py_purl(
2700            package_data.name.as_deref(),
2701            package_data.version.as_deref(),
2702        );
2703    }
2704
2705    package_data
2706}
2707
2708fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
2709    if package_data.version.is_some()
2710        && package_data.extracted_license_statement.is_some()
2711        && package_data
2712            .parties
2713            .iter()
2714            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
2715    {
2716        return;
2717    }
2718
2719    let Some(root) = path.parent() else {
2720        return;
2721    };
2722
2723    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
2724
2725    if package_data.version.is_none() {
2726        package_data.version = dunder_metadata.version;
2727    }
2728
2729    if package_data.extracted_license_statement.is_none() {
2730        package_data.extracted_license_statement = dunder_metadata.license;
2731    }
2732
2733    let has_author = package_data
2734        .parties
2735        .iter()
2736        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
2737
2738    if !has_author && let Some(author) = dunder_metadata.author {
2739        package_data.parties.push(Party {
2740            r#type: Some("person".to_string()),
2741            role: Some("author".to_string()),
2742            name: Some(author),
2743            email: None,
2744            url: None,
2745            organization: None,
2746            organization_url: None,
2747            timezone: None,
2748        });
2749    }
2750}
2751
/// String values of module-level dunder assignments, as collected by
/// `collect_sibling_dunder_metadata`. Each field is `None` until found.
#[derive(Default)]
struct DunderMetadata {
    /// Value of a `__version__ = "..."` assignment.
    version: Option<String>,
    /// Value of an `__author__ = "..."` assignment.
    author: Option<String>,
    /// Value of a `__license__ = "..."` assignment.
    license: Option<String>,
}
2758
2759fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
2760    let statements = match parse_module(content) {
2761        Ok(parsed) => parsed.into_suite(),
2762        Err(_) => return DunderMetadata::default(),
2763    };
2764
2765    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2766    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2767    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2768    let mut metadata = DunderMetadata::default();
2769
2770    for module in imported_dunder_modules(&statements) {
2771        let Some(path) = resolve_imported_module_path(root, &module) else {
2772            continue;
2773        };
2774        let Ok(module_content) = read_file_to_string(&path) else {
2775            continue;
2776        };
2777
2778        if metadata.version.is_none() {
2779            metadata.version = version_re
2780                .as_ref()
2781                .and_then(|regex| regex.captures(&module_content))
2782                .and_then(|captures| captures.get(1))
2783                .map(|match_| match_.as_str().to_string());
2784        }
2785
2786        if metadata.author.is_none() {
2787            metadata.author = author_re
2788                .as_ref()
2789                .and_then(|regex| regex.captures(&module_content))
2790                .and_then(|captures| captures.get(1))
2791                .map(|match_| match_.as_str().to_string());
2792        }
2793
2794        if metadata.license.is_none() {
2795            metadata.license = license_re
2796                .as_ref()
2797                .and_then(|regex| regex.captures(&module_content))
2798                .and_then(|captures| captures.get(1))
2799                .map(|match_| match_.as_str().to_string());
2800        }
2801
2802        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
2803            return metadata;
2804        }
2805    }
2806
2807    metadata
2808}
2809
2810fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
2811    let mut modules = Vec::new();
2812
2813    for statement in statements {
2814        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
2815            continue;
2816        };
2817        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
2818            continue;
2819        };
2820        let imports_dunder = names.iter().any(|alias| {
2821            matches!(
2822                alias.name.as_str(),
2823                "__version__" | "__author__" | "__license__"
2824            )
2825        });
2826        if imports_dunder {
2827            modules.push(module.to_string());
2828        }
2829    }
2830
2831    modules
2832}
2833
/// Maps a dotted Python module name to an existing file below `root`.
///
/// For each base directory — `root` first, then `root/src` — the module is
/// looked up as `<module path>.py` and then as `<module path>/__init__.py`.
/// The first candidate that exists on disk is returned; `None` if none does.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let module_path: PathBuf = module.split('.').collect();

    for base in [root.to_path_buf(), root.join("src")] {
        let as_file = base.join(module_path.with_extension("py"));
        if as_file.exists() {
            return Some(as_file);
        }

        let as_package = base.join(&module_path).join("__init__.py");
        if as_package.exists() {
            return Some(as_package);
        }
    }

    None
}
2845
2846/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
2847///
2848/// # Security Model
2849///
2850/// This function parses setup.py as a Python AST and evaluates only literal values
2851/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
2852/// arbitrary code execution during scanning.
2853///
2854/// # DoS Prevention
2855///
2856/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
2857/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
2858/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
2859///
2860/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
2861fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
2862    let statements = parse_module(content)
2863        .map(|parsed| parsed.into_suite())
2864        .map_err(|e| e.to_string())?;
2865    let aliases = collect_setup_aliases(&statements);
2866    let mut evaluator = LiteralEvaluator::new(HashMap::new());
2867    build_setup_py_constants(&statements, &mut evaluator);
2868
2869    let setup_call = find_setup_call(&statements, &aliases);
2870    let Some(call_expr) = setup_call else {
2871        return Ok(None);
2872    };
2873
2874    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
2875    Ok(Some(build_setup_py_package_data(&setup_values)))
2876}
2877
2878fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
2879    for stmt in statements {
2880        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
2881            if targets.len() != 1 {
2882                continue;
2883            }
2884
2885            let Some(name) = extract_assign_name(&targets[0]) else {
2886                continue;
2887            };
2888
2889            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
2890                evaluator.insert_constant(name, value);
2891            }
2892        }
2893    }
2894}
2895
2896fn extract_assign_name(target: &ast::Expr) -> Option<String> {
2897    match target {
2898        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
2899        _ => None,
2900    }
2901}
2902
2903fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
2904    let mut aliases = SetupAliases::default();
2905    aliases.setup_names.insert("setup".to_string());
2906
2907    for stmt in statements {
2908        match stmt {
2909            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
2910                for alias in names {
2911                    let module_name = alias.name.as_str();
2912                    if !is_setup_module(module_name) {
2913                        continue;
2914                    }
2915                    let alias_name = alias
2916                        .asname
2917                        .as_ref()
2918                        .map(|name| name.as_str())
2919                        .unwrap_or(module_name);
2920                    aliases
2921                        .module_aliases
2922                        .insert(alias_name.to_string(), module_name.to_string());
2923                }
2924            }
2925            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
2926                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
2927                    continue;
2928                };
2929                if !is_setup_module(module_name) {
2930                    continue;
2931                }
2932                for alias in names {
2933                    if alias.name.as_str() != "setup" {
2934                        continue;
2935                    }
2936                    let alias_name = alias
2937                        .asname
2938                        .as_ref()
2939                        .map(|name| name.as_str())
2940                        .unwrap_or("setup");
2941                    aliases.setup_names.insert(alias_name.to_string());
2942                }
2943            }
2944            _ => {}
2945        }
2946    }
2947
2948    aliases
2949}
2950
/// True when `module_name` is exactly one of the modules recognized as
/// providing `setup`: `setuptools`, `distutils` or `distutils.core`.
/// The comparison is case-sensitive and does not accept submodules.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
2954
2955fn find_setup_call<'a>(
2956    statements: &'a [ast::Stmt],
2957    aliases: &'a SetupAliases,
2958) -> Option<&'a ast::Expr> {
2959    let mut finder = SetupCallFinder {
2960        aliases,
2961        nodes_visited: 0,
2962    };
2963    finder.find_in_statements(statements)
2964}
2965
2966struct SetupCallFinder<'a> {
2967    aliases: &'a SetupAliases,
2968    nodes_visited: usize,
2969}
2970
2971impl<'a> SetupCallFinder<'a> {
2972    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
2973        for stmt in statements {
2974            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
2975                return None;
2976            }
2977            self.nodes_visited += 1;
2978
2979            let found = match stmt {
2980                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
2981                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
2982                ast::Stmt::If(ast::StmtIf {
2983                    body,
2984                    elif_else_clauses,
2985                    ..
2986                }) => self.find_in_statements(body).or_else(|| {
2987                    for clause in elif_else_clauses {
2988                        if let Some(found) = self.find_in_statements(&clause.body) {
2989                            return Some(found);
2990                        }
2991                    }
2992                    None
2993                }),
2994                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
2995                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
2996                    .find_in_statements(body)
2997                    .or_else(|| self.find_in_statements(orelse)),
2998                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
2999                ast::Stmt::Try(ast::StmtTry {
3000                    body,
3001                    orelse,
3002                    finalbody,
3003                    handlers,
3004                    ..
3005                }) => self
3006                    .find_in_statements(body)
3007                    .or_else(|| self.find_in_statements(orelse))
3008                    .or_else(|| self.find_in_statements(finalbody))
3009                    .or_else(|| {
3010                        for handler in handlers {
3011                            let ast::ExceptHandler::ExceptHandler(
3012                                ast::ExceptHandlerExceptHandler { body, .. },
3013                            ) = handler;
3014                            if let Some(found) = self.find_in_statements(body) {
3015                                return Some(found);
3016                            }
3017                        }
3018                        None
3019                    }),
3020                _ => None,
3021            };
3022
3023            if found.is_some() {
3024                return found;
3025            }
3026        }
3027
3028        None
3029    }
3030
3031    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3032        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3033            return None;
3034        }
3035        self.nodes_visited += 1;
3036
3037        match expr {
3038            ast::Expr::Call(ast::ExprCall { func, .. })
3039                if is_setup_call(func.as_ref(), self.aliases) =>
3040            {
3041                Some(expr)
3042            }
3043            _ => None,
3044        }
3045    }
3046}
3047
3048fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3049    let Some(dotted) = dotted_name(func, 0) else {
3050        return false;
3051    };
3052
3053    if aliases.setup_names.contains(&dotted) {
3054        return true;
3055    }
3056
3057    let Some(module) = dotted.strip_suffix(".setup") else {
3058        return false;
3059    };
3060
3061    let resolved = resolve_module_alias(module, aliases);
3062    is_setup_module(&resolved)
3063}
3064
3065fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3066    if depth >= MAX_SETUP_PY_AST_DEPTH {
3067        return None;
3068    }
3069
3070    match expr {
3071        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3072        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3073            let base = dotted_name(value.as_ref(), depth + 1)?;
3074            Some(format!("{}.{}", base, attr.as_str()))
3075        }
3076        _ => None,
3077    }
3078}
3079
3080fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3081    if let Some(mapped) = aliases.module_aliases.get(module) {
3082        return mapped.clone();
3083    }
3084
3085    let Some((base, rest)) = module.split_once('.') else {
3086        return module.to_string();
3087    };
3088
3089    if let Some(mapped) = aliases.module_aliases.get(base) {
3090        return format!("{}.{}", mapped, rest);
3091    }
3092
3093    module.to_string()
3094}
3095
3096fn extract_setup_keywords(
3097    call_expr: &ast::Expr,
3098    evaluator: &mut LiteralEvaluator,
3099) -> HashMap<String, Value> {
3100    let mut values = HashMap::new();
3101    let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3102        return values;
3103    };
3104
3105    for keyword in arguments.keywords.iter() {
3106        if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3107            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3108                values.insert(arg.to_string(), value);
3109            }
3110        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3111            for (key, value) in dict {
3112                values.insert(key, value);
3113            }
3114        }
3115    }
3116
3117    values
3118}
3119
3120fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3121    let name = get_value_string(values, "name");
3122    let version = get_value_string(values, "version");
3123    let description =
3124        get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
3125    let homepage_url =
3126        get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
3127    let author = get_value_string(values, "author");
3128    let author_email = get_value_string(values, "author_email");
3129    let maintainer = get_value_string(values, "maintainer");
3130    let maintainer_email = get_value_string(values, "maintainer_email");
3131    let license = get_value_string(values, "license");
3132    let classifiers = values
3133        .get("classifiers")
3134        .and_then(value_to_string_list)
3135        .unwrap_or_default();
3136
3137    let mut parties = Vec::new();
3138    if author.is_some() || author_email.is_some() {
3139        parties.push(Party {
3140            r#type: Some("person".to_string()),
3141            role: Some("author".to_string()),
3142            name: author,
3143            email: author_email,
3144            url: None,
3145            organization: None,
3146            organization_url: None,
3147            timezone: None,
3148        });
3149    }
3150
3151    if maintainer.is_some() || maintainer_email.is_some() {
3152        parties.push(Party {
3153            r#type: Some("person".to_string()),
3154            role: Some("maintainer".to_string()),
3155            name: maintainer,
3156            email: maintainer_email,
3157            url: None,
3158            organization: None,
3159            organization_url: None,
3160            timezone: None,
3161        });
3162    }
3163
3164    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3165        normalize_spdx_declared_license(license.as_deref());
3166    let extracted_license_statement = license.clone();
3167
3168    let dependencies = build_setup_py_dependencies(values);
3169    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3170    let mut homepage_from_project_urls = None;
3171    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3172    let mut extra_data = HashMap::new();
3173
3174    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3175        apply_project_url_mappings(
3176            &parsed_project_urls,
3177            &mut homepage_from_project_urls,
3178            &mut bug_tracking_url,
3179            &mut code_view_url,
3180            &mut vcs_url,
3181            &mut extra_data,
3182        );
3183    }
3184
3185    let extra_data = if extra_data.is_empty() {
3186        None
3187    } else {
3188        Some(extra_data)
3189    };
3190
3191    PackageData {
3192        package_type: Some(PythonParser::PACKAGE_TYPE),
3193        namespace: None,
3194        name,
3195        version,
3196        qualifiers: None,
3197        subpath: None,
3198        primary_language: Some("Python".to_string()),
3199        description,
3200        release_date: None,
3201        parties,
3202        keywords: Vec::new(),
3203        homepage_url: homepage_url.or(homepage_from_project_urls),
3204        download_url: None,
3205        size: None,
3206        sha1: None,
3207        md5: None,
3208        sha256: None,
3209        sha512: None,
3210        bug_tracking_url,
3211        code_view_url,
3212        vcs_url,
3213        copyright: None,
3214        holder: None,
3215        declared_license_expression,
3216        declared_license_expression_spdx,
3217        license_detections,
3218        other_license_expression: None,
3219        other_license_expression_spdx: None,
3220        other_license_detections: Vec::new(),
3221        extracted_license_statement,
3222        notice_text: None,
3223        source_packages: Vec::new(),
3224        file_references: Vec::new(),
3225        is_private: has_private_classifier(&classifiers),
3226        is_virtual: false,
3227        extra_data,
3228        dependencies,
3229        repository_homepage_url: None,
3230        repository_download_url: None,
3231        api_data_url: None,
3232        datasource_id: Some(DatasourceId::PypiSetupPy),
3233        purl,
3234    }
3235}
3236
3237fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3238    let mut dependencies = Vec::new();
3239
3240    if let Some(reqs) = values
3241        .get("install_requires")
3242        .and_then(value_to_string_list)
3243    {
3244        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3245    }
3246
3247    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3248        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3249    }
3250
3251    if let Some(Value::Dict(extras)) = values.get("extras_require") {
3252        let mut extra_items: Vec<_> = extras.iter().collect();
3253        extra_items.sort_by_key(|(name, _)| *name);
3254        for (extra_name, extra_value) in extra_items {
3255            if let Some(reqs) = value_to_string_list(extra_value) {
3256                dependencies.extend(build_setup_py_dependency_list(
3257                    reqs.as_slice(),
3258                    extra_name,
3259                    true,
3260                ));
3261            }
3262        }
3263    }
3264
3265    dependencies
3266}
3267
3268fn build_setup_py_dependency_list(
3269    reqs: &[String],
3270    scope: &str,
3271    is_optional: bool,
3272) -> Vec<Dependency> {
3273    reqs.iter()
3274        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3275        .collect()
3276}
3277
3278fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3279    values.get(key).and_then(value_to_string)
3280}
3281
3282fn value_to_string(value: &Value) -> Option<String> {
3283    match value {
3284        Value::String(value) => Some(value.clone()),
3285        Value::Number(value) => Some(value.to_string()),
3286        Value::Bool(value) => Some(value.to_string()),
3287        _ => None,
3288    }
3289}
3290
3291fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3292    match value {
3293        Value::String(value) => Some(vec![value.clone()]),
3294        Value::List(values) | Value::Tuple(values) => {
3295            let mut items = Vec::new();
3296            for item in values {
3297                items.push(value_to_string(item)?);
3298            }
3299            Some(items)
3300        }
3301        _ => None,
3302    }
3303}
3304
3305fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3306    let Value::Dict(dict) = value else {
3307        return None;
3308    };
3309
3310    let mut pairs: Vec<(String, String)> = dict
3311        .iter()
3312        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3313        .collect::<Option<Vec<_>>>()?;
3314    pairs.sort_by(|left, right| left.0.cmp(&right.0));
3315    Some(pairs)
3316}
3317
3318fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3319    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3320    extract_requires_dist_dependencies(&requires_dist)
3321}
3322
3323pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3324    requires_dist
3325        .iter()
3326        .filter_map(|entry| build_rfc822_dependency(entry))
3327        .collect()
3328}
3329
3330fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3331    build_python_dependency(entry, "install", false, None)
3332}
3333
3334fn build_python_dependency(
3335    entry: &str,
3336    default_scope: &str,
3337    default_optional: bool,
3338    marker_override: Option<&str>,
3339) -> Option<Dependency> {
3340    let (requirement_part, marker_part) = entry
3341        .split_once(';')
3342        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3343        .unwrap_or((entry.trim(), None));
3344
3345    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3346    let requirement = normalize_rfc822_requirement(requirement_part);
3347    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3348        marker_part.or(marker_override),
3349        default_scope,
3350        default_optional,
3351    );
3352    let mut purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
3353
3354    let is_pinned = requirement
3355        .as_deref()
3356        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3357    if is_pinned
3358        && let Some(version) = requirement
3359            .as_deref()
3360            .map(|req| req.trim_start_matches('='))
3361    {
3362        purl.with_version(version).ok()?;
3363    }
3364
3365    let mut extra_data = HashMap::new();
3366    extra_data.extend(marker_data);
3367    if let Some(marker) = marker {
3368        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3369    }
3370
3371    Some(Dependency {
3372        purl: Some(purl.to_string()),
3373        extracted_requirement: requirement,
3374        scope: Some(scope),
3375        is_runtime: Some(true),
3376        is_optional: Some(is_optional),
3377        is_pinned: Some(is_pinned),
3378        is_direct: Some(true),
3379        resolved_package: None,
3380        extra_data: if extra_data.is_empty() {
3381            None
3382        } else {
3383            Some(extra_data)
3384        },
3385    })
3386}
3387
3388fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3389    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3390    let trimmed = requirement_part.trim();
3391    let mut remainder = trimmed[name.len()..].trim();
3392
3393    if let Some(stripped) = remainder.strip_prefix('[')
3394        && let Some(end_idx) = stripped.find(']')
3395    {
3396        remainder = stripped[end_idx + 1..].trim();
3397    }
3398
3399    let remainder = remainder
3400        .strip_prefix('(')
3401        .and_then(|value| value.strip_suffix(')'))
3402        .unwrap_or(remainder)
3403        .trim();
3404
3405    if remainder.is_empty() {
3406        return None;
3407    }
3408
3409    let mut specifiers: Vec<String> = remainder
3410        .split(',')
3411        .map(|specifier| specifier.trim().replace(' ', ""))
3412        .filter(|specifier| !specifier.is_empty())
3413        .collect();
3414    specifiers.sort();
3415    Some(specifiers.join(","))
3416}
3417
3418fn parse_rfc822_marker(
3419    marker_part: Option<&str>,
3420    default_scope: &str,
3421    default_optional: bool,
3422) -> (
3423    String,
3424    bool,
3425    Option<String>,
3426    HashMap<String, serde_json::Value>,
3427) {
3428    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
3429        return (
3430            default_scope.to_string(),
3431            default_optional,
3432            None,
3433            HashMap::new(),
3434        );
3435    };
3436
3437    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
3438        .expect("extra marker regex should compile");
3439    let mut extra_data = HashMap::new();
3440
3441    if let Some(python_version) = extract_marker_field(marker, "python_version") {
3442        extra_data.insert(
3443            "python_version".to_string(),
3444            serde_json::Value::String(python_version),
3445        );
3446    }
3447    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
3448        extra_data.insert(
3449            "sys_platform".to_string(),
3450            serde_json::Value::String(sys_platform),
3451        );
3452    }
3453
3454    if let Some(captures) = extra_re.captures(marker)
3455        && let Some(scope) = captures.get(1)
3456    {
3457        return (
3458            scope.as_str().to_string(),
3459            true,
3460            Some(marker.trim().to_string()),
3461            extra_data,
3462        );
3463    }
3464
3465    (
3466        default_scope.to_string(),
3467        default_optional,
3468        Some(marker.trim().to_string()),
3469        extra_data,
3470    )
3471}
3472
3473fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
3474    let re = Regex::new(&format!(
3475        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
3476        field
3477    ))
3478    .ok()?;
3479    let captures = re.captures(marker)?;
3480    let operator = captures.get(1)?.as_str();
3481    let value = captures.get(2)?.as_str();
3482    Some(format!("{} {}", operator, value))
3483}
3484
3485fn parse_requires_txt(content: &str) -> Vec<Dependency> {
3486    let mut dependencies = Vec::new();
3487    let mut current_scope = "install".to_string();
3488    let mut current_optional = false;
3489    let mut current_marker: Option<String> = None;
3490
3491    for line in content.lines() {
3492        let trimmed = line.trim();
3493        if trimmed.is_empty() || trimmed.starts_with('#') {
3494            continue;
3495        }
3496
3497        if trimmed.starts_with('[') && trimmed.ends_with(']') {
3498            let inner = &trimmed[1..trimmed.len() - 1];
3499            if let Some(rest) = inner.strip_prefix(':') {
3500                current_scope = "install".to_string();
3501                current_optional = false;
3502                current_marker = Some(rest.trim().to_string());
3503            } else if let Some((scope, marker)) = inner.split_once(':') {
3504                current_scope = scope.trim().to_string();
3505                current_optional = true;
3506                current_marker = Some(marker.trim().to_string());
3507            } else {
3508                current_scope = inner.trim().to_string();
3509                current_optional = true;
3510                current_marker = None;
3511            }
3512            continue;
3513        }
3514
3515        if let Some(dependency) = build_python_dependency(
3516            trimmed,
3517            &current_scope,
3518            current_optional,
3519            current_marker.as_deref(),
3520        ) {
3521            dependencies.push(dependency);
3522        }
3523    }
3524
3525    dependencies
3526}
3527
/// True when any classifier equals `Private :: Do Not Upload`, compared
/// ASCII-case-insensitively and without trimming.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
3533
3534fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
3535    let name = name?;
3536    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
3537    if let Some(version) = version {
3538        package_url.with_version(version).ok()?;
3539    }
3540    Some(package_url.to_string())
3541}
3542
3543fn extract_from_setup_py_regex(content: &str) -> PackageData {
3544    let name = extract_setup_value(content, "name");
3545    let version = extract_setup_value(content, "version");
3546    let license_expression = extract_setup_value(content, "license");
3547
3548    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3549        normalize_spdx_declared_license(license_expression.as_deref());
3550    let extracted_license_statement = license_expression.clone();
3551
3552    let dependencies = extract_setup_py_dependencies(content);
3553    let homepage_url = extract_setup_value(content, "url");
3554    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3555
3556    PackageData {
3557        package_type: Some(PythonParser::PACKAGE_TYPE),
3558        namespace: None,
3559        name,
3560        version,
3561        qualifiers: None,
3562        subpath: None,
3563        primary_language: Some("Python".to_string()),
3564        description: None,
3565        release_date: None,
3566        parties: Vec::new(),
3567        keywords: Vec::new(),
3568        homepage_url,
3569        download_url: None,
3570        size: None,
3571        sha1: None,
3572        md5: None,
3573        sha256: None,
3574        sha512: None,
3575        bug_tracking_url: None,
3576        code_view_url: None,
3577        vcs_url: None,
3578        copyright: None,
3579        holder: None,
3580        declared_license_expression,
3581        declared_license_expression_spdx,
3582        license_detections,
3583        other_license_expression: None,
3584        other_license_expression_spdx: None,
3585        other_license_detections: Vec::new(),
3586        extracted_license_statement,
3587        notice_text: None,
3588        source_packages: Vec::new(),
3589        file_references: Vec::new(),
3590        is_private: false,
3591        is_virtual: false,
3592        extra_data: None,
3593        dependencies,
3594        repository_homepage_url: None,
3595        repository_download_url: None,
3596        api_data_url: None,
3597        datasource_id: Some(DatasourceId::PypiSetupPy),
3598        purl,
3599    }
3600}
3601
3602fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
3603    crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
3604}
3605
3606fn extract_from_pypi_json(path: &Path) -> PackageData {
3607    let default = PackageData {
3608        package_type: Some(PythonParser::PACKAGE_TYPE),
3609        datasource_id: Some(DatasourceId::PypiJson),
3610        ..Default::default()
3611    };
3612
3613    let content = match read_file_to_string(path) {
3614        Ok(content) => content,
3615        Err(error) => {
3616            warn!("Failed to read pypi.json at {:?}: {}", path, error);
3617            return default;
3618        }
3619    };
3620
3621    let root: serde_json::Value = match serde_json::from_str(&content) {
3622        Ok(value) => value,
3623        Err(error) => {
3624            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
3625            return default;
3626        }
3627    };
3628
3629    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
3630        warn!("No info object found in pypi.json at {:?}", path);
3631        return default;
3632    };
3633
3634    let name = info
3635        .get("name")
3636        .and_then(|value| value.as_str())
3637        .map(ToOwned::to_owned);
3638    let version = info
3639        .get("version")
3640        .and_then(|value| value.as_str())
3641        .map(ToOwned::to_owned);
3642    let summary = info
3643        .get("summary")
3644        .and_then(|value| value.as_str())
3645        .map(ToOwned::to_owned);
3646    let description = info
3647        .get("description")
3648        .and_then(|value| value.as_str())
3649        .filter(|value| !value.trim().is_empty())
3650        .map(ToOwned::to_owned)
3651        .or(summary);
3652    let mut homepage_url = info
3653        .get("home_page")
3654        .and_then(|value| value.as_str())
3655        .map(ToOwned::to_owned);
3656    let author = info
3657        .get("author")
3658        .and_then(|value| value.as_str())
3659        .filter(|value| !value.trim().is_empty())
3660        .map(ToOwned::to_owned);
3661    let author_email = info
3662        .get("author_email")
3663        .and_then(|value| value.as_str())
3664        .filter(|value| !value.trim().is_empty())
3665        .map(ToOwned::to_owned);
3666    let license = info
3667        .get("license")
3668        .and_then(|value| value.as_str())
3669        .filter(|value| !value.trim().is_empty())
3670        .map(ToOwned::to_owned);
3671    let keywords = parse_setup_cfg_keywords(
3672        info.get("keywords")
3673            .and_then(|value| value.as_str())
3674            .map(ToOwned::to_owned),
3675    );
3676    let classifiers = info
3677        .get("classifiers")
3678        .and_then(|value| value.as_array())
3679        .map(|values| {
3680            values
3681                .iter()
3682                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
3683                .collect::<Vec<_>>()
3684        })
3685        .unwrap_or_default();
3686
3687    let mut parties = Vec::new();
3688    if author.is_some() || author_email.is_some() {
3689        parties.push(Party {
3690            r#type: Some("person".to_string()),
3691            role: Some("author".to_string()),
3692            name: author,
3693            email: author_email,
3694            url: None,
3695            organization: None,
3696            organization_url: None,
3697            timezone: None,
3698        });
3699    }
3700
3701    let mut bug_tracking_url = None;
3702    let mut code_view_url = None;
3703    let mut vcs_url = None;
3704    let mut extra_data = HashMap::new();
3705
3706    let parsed_project_urls = info
3707        .get("project_urls")
3708        .and_then(|value| value.as_object())
3709        .map(|map| {
3710            let mut pairs: Vec<(String, String)> = map
3711                .iter()
3712                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
3713                .collect();
3714            pairs.sort_by(|left, right| left.0.cmp(&right.0));
3715            pairs
3716        })
3717        .unwrap_or_default();
3718
3719    apply_project_url_mappings(
3720        &parsed_project_urls,
3721        &mut homepage_url,
3722        &mut bug_tracking_url,
3723        &mut code_view_url,
3724        &mut vcs_url,
3725        &mut extra_data,
3726    );
3727
3728    let (download_url, size, sha256) = root
3729        .get("urls")
3730        .and_then(|value| value.as_array())
3731        .map(|urls| select_pypi_json_artifact(urls))
3732        .unwrap_or((None, None, None));
3733
3734    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3735        normalize_spdx_declared_license(license.as_deref());
3736    let dependencies = info
3737        .get("requires_dist")
3738        .and_then(|value| value.as_array())
3739        .map(|entries| {
3740            entries
3741                .iter()
3742                .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
3743                .collect::<Vec<_>>()
3744        })
3745        .map(|entries| extract_requires_dist_dependencies(&entries))
3746        .unwrap_or_default();
3747
3748    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
3749        build_pypi_urls(name.as_deref(), version.as_deref());
3750
3751    PackageData {
3752        package_type: Some(PythonParser::PACKAGE_TYPE),
3753        namespace: None,
3754        name,
3755        version,
3756        qualifiers: None,
3757        subpath: None,
3758        primary_language: None,
3759        description,
3760        release_date: None,
3761        parties,
3762        keywords,
3763        homepage_url: homepage_url.or(repository_homepage_url.clone()),
3764        download_url,
3765        size,
3766        sha1: None,
3767        md5: None,
3768        sha256,
3769        sha512: None,
3770        bug_tracking_url,
3771        code_view_url,
3772        vcs_url,
3773        copyright: None,
3774        holder: None,
3775        declared_license_expression,
3776        declared_license_expression_spdx,
3777        license_detections,
3778        other_license_expression: None,
3779        other_license_expression_spdx: None,
3780        other_license_detections: Vec::new(),
3781        extracted_license_statement: license,
3782        notice_text: None,
3783        source_packages: Vec::new(),
3784        file_references: Vec::new(),
3785        is_private: has_private_classifier(&classifiers),
3786        is_virtual: false,
3787        extra_data: if extra_data.is_empty() {
3788            None
3789        } else {
3790            Some(extra_data)
3791        },
3792        dependencies,
3793        repository_homepage_url,
3794        repository_download_url,
3795        api_data_url,
3796        datasource_id: Some(DatasourceId::PypiJson),
3797        purl,
3798    }
3799}
3800
3801fn select_pypi_json_artifact(
3802    urls: &[serde_json::Value],
3803) -> (Option<String>, Option<u64>, Option<String>) {
3804    let selected = urls
3805        .iter()
3806        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
3807        .or_else(|| urls.first());
3808
3809    let Some(entry) = selected else {
3810        return (None, None, None);
3811    };
3812
3813    let download_url = entry
3814        .get("url")
3815        .and_then(|value| value.as_str())
3816        .map(ToOwned::to_owned);
3817    let size = entry.get("size").and_then(|value| value.as_u64());
3818    let sha256 = entry
3819        .get("digests")
3820        .and_then(|value| value.as_object())
3821        .and_then(|digests| digests.get("sha256"))
3822        .and_then(|value| value.as_str())
3823        .map(ToOwned::to_owned);
3824
3825    (download_url, size, sha256)
3826}
3827
3828fn extract_from_pip_inspect(path: &Path) -> PackageData {
3829    let content = match read_file_to_string(path) {
3830        Ok(content) => content,
3831        Err(e) => {
3832            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
3833            return default_package_data(path);
3834        }
3835    };
3836
3837    let root: serde_json::Value = match serde_json::from_str(&content) {
3838        Ok(value) => value,
3839        Err(e) => {
3840            warn!(
3841                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
3842                path, e
3843            );
3844            return default_package_data(path);
3845        }
3846    };
3847
3848    let installed = match root.get("installed").and_then(|v| v.as_array()) {
3849        Some(arr) => arr,
3850        None => {
3851            warn!(
3852                "No 'installed' array found in pip-inspect.deplock at {:?}",
3853                path
3854            );
3855            return default_package_data(path);
3856        }
3857    };
3858
3859    let pip_version = root
3860        .get("pip_version")
3861        .and_then(|v| v.as_str())
3862        .map(String::from);
3863    let inspect_version = root
3864        .get("version")
3865        .and_then(|v| v.as_str())
3866        .map(String::from);
3867
3868    let mut main_package: Option<PackageData> = None;
3869    let mut dependencies: Vec<Dependency> = Vec::new();
3870
3871    for package_entry in installed {
3872        let metadata = match package_entry.get("metadata") {
3873            Some(m) => m,
3874            None => continue,
3875        };
3876
3877        let is_requested = package_entry
3878            .get("requested")
3879            .and_then(|v| v.as_bool())
3880            .unwrap_or(false);
3881        let has_direct_url = package_entry.get("direct_url").is_some();
3882
3883        let name = metadata
3884            .get("name")
3885            .and_then(|v| v.as_str())
3886            .map(String::from);
3887        let version = metadata
3888            .get("version")
3889            .and_then(|v| v.as_str())
3890            .map(String::from);
3891        let summary = metadata
3892            .get("summary")
3893            .and_then(|v| v.as_str())
3894            .map(String::from);
3895        let home_page = metadata
3896            .get("home_page")
3897            .and_then(|v| v.as_str())
3898            .map(String::from);
3899        let author = metadata
3900            .get("author")
3901            .and_then(|v| v.as_str())
3902            .map(String::from);
3903        let author_email = metadata
3904            .get("author_email")
3905            .and_then(|v| v.as_str())
3906            .map(String::from);
3907        let license = metadata
3908            .get("license")
3909            .and_then(|v| v.as_str())
3910            .map(String::from);
3911        let description = metadata
3912            .get("description")
3913            .and_then(|v| v.as_str())
3914            .map(String::from);
3915        let keywords = metadata
3916            .get("keywords")
3917            .and_then(|v| v.as_array())
3918            .map(|arr| {
3919                arr.iter()
3920                    .filter_map(|k| k.as_str().map(String::from))
3921                    .collect::<Vec<_>>()
3922            })
3923            .unwrap_or_default();
3924
3925        let mut parties = Vec::new();
3926        if author.is_some() || author_email.is_some() {
3927            parties.push(Party {
3928                r#type: Some("person".to_string()),
3929                role: Some("author".to_string()),
3930                name: author,
3931                email: author_email,
3932                url: None,
3933                organization: None,
3934                organization_url: None,
3935                timezone: None,
3936            });
3937        }
3938
3939        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3940            normalize_spdx_declared_license(license.as_deref());
3941        let extracted_license_statement = license.clone();
3942        let requires_dist = metadata
3943            .get("requires_dist")
3944            .and_then(|v| v.as_array())
3945            .map(|entries| {
3946                entries
3947                    .iter()
3948                    .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
3949                    .collect::<Vec<_>>()
3950            })
3951            .unwrap_or_default();
3952        let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
3953
3954        let purl = name.as_ref().and_then(|n| {
3955            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
3956            if let Some(v) = &version {
3957                package_url.with_version(v).ok()?;
3958            }
3959            Some(package_url.to_string())
3960        });
3961
3962        if is_requested && has_direct_url {
3963            let mut extra_data = HashMap::new();
3964            if let Some(pv) = &pip_version {
3965                extra_data.insert(
3966                    "pip_version".to_string(),
3967                    serde_json::Value::String(pv.clone()),
3968                );
3969            }
3970            if let Some(iv) = &inspect_version {
3971                extra_data.insert(
3972                    "inspect_version".to_string(),
3973                    serde_json::Value::String(iv.clone()),
3974                );
3975            }
3976
3977            main_package = Some(PackageData {
3978                package_type: Some(PythonParser::PACKAGE_TYPE),
3979                namespace: None,
3980                name,
3981                version,
3982                qualifiers: None,
3983                subpath: None,
3984                primary_language: Some("Python".to_string()),
3985                description: description.or(summary),
3986                release_date: None,
3987                parties,
3988                keywords,
3989                homepage_url: home_page,
3990                download_url: None,
3991                size: None,
3992                sha1: None,
3993                md5: None,
3994                sha256: None,
3995                sha512: None,
3996                bug_tracking_url: None,
3997                code_view_url: None,
3998                vcs_url: None,
3999                copyright: None,
4000                holder: None,
4001                declared_license_expression,
4002                declared_license_expression_spdx,
4003                license_detections,
4004                other_license_expression: None,
4005                other_license_expression_spdx: None,
4006                other_license_detections: Vec::new(),
4007                extracted_license_statement,
4008                notice_text: None,
4009                source_packages: Vec::new(),
4010                file_references: Vec::new(),
4011                is_private: false,
4012                is_virtual: true,
4013                extra_data: if extra_data.is_empty() {
4014                    None
4015                } else {
4016                    Some(extra_data)
4017                },
4018                dependencies: parsed_dependencies,
4019                repository_homepage_url: None,
4020                repository_download_url: None,
4021                api_data_url: None,
4022                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4023                purl,
4024            });
4025        } else {
4026            let resolved_package = PackageData {
4027                package_type: Some(PythonParser::PACKAGE_TYPE),
4028                namespace: None,
4029                name: name.clone(),
4030                version: version.clone(),
4031                qualifiers: None,
4032                subpath: None,
4033                primary_language: Some("Python".to_string()),
4034                description: description.or(summary),
4035                release_date: None,
4036                parties,
4037                keywords,
4038                homepage_url: home_page,
4039                download_url: None,
4040                size: None,
4041                sha1: None,
4042                md5: None,
4043                sha256: None,
4044                sha512: None,
4045                bug_tracking_url: None,
4046                code_view_url: None,
4047                vcs_url: None,
4048                copyright: None,
4049                holder: None,
4050                declared_license_expression,
4051                declared_license_expression_spdx,
4052                license_detections,
4053                other_license_expression: None,
4054                other_license_expression_spdx: None,
4055                other_license_detections: Vec::new(),
4056                extracted_license_statement,
4057                notice_text: None,
4058                source_packages: Vec::new(),
4059                file_references: Vec::new(),
4060                is_private: false,
4061                is_virtual: true,
4062                extra_data: None,
4063                dependencies: parsed_dependencies,
4064                repository_homepage_url: None,
4065                repository_download_url: None,
4066                api_data_url: None,
4067                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4068                purl: purl.clone(),
4069            };
4070
4071            let resolved = package_data_to_resolved(&resolved_package);
4072            dependencies.push(Dependency {
4073                purl,
4074                extracted_requirement: None,
4075                scope: None,
4076                is_runtime: Some(true),
4077                is_optional: Some(false),
4078                is_pinned: Some(true),
4079                is_direct: Some(is_requested),
4080                resolved_package: Some(Box::new(resolved)),
4081                extra_data: None,
4082            });
4083        }
4084    }
4085
4086    if let Some(mut main_pkg) = main_package {
4087        let direct_requirement_purls: HashSet<String> = main_pkg
4088            .dependencies
4089            .iter()
4090            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4091            .collect();
4092
4093        let resolved_requirement_purls: HashSet<String> = dependencies
4094            .iter()
4095            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4096            .collect();
4097
4098        let unresolved_dependencies = main_pkg
4099            .dependencies
4100            .iter()
4101            .filter(|dep| {
4102                dep.purl.as_ref().is_some_and(|purl| {
4103                    !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4104                })
4105            })
4106            .cloned()
4107            .collect::<Vec<_>>();
4108
4109        for dependency in &mut dependencies {
4110            if dependency
4111                .purl
4112                .as_ref()
4113                .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4114            {
4115                dependency.is_direct = Some(true);
4116            }
4117        }
4118
4119        main_pkg.dependencies = dependencies;
4120        main_pkg.dependencies.extend(unresolved_dependencies);
4121        main_pkg
4122    } else {
4123        default_package_data(path)
4124    }
4125}
4126
/// Returns the part of `purl` before the first `@`, or the whole string when
/// it contains no `@`.
fn base_dependency_purl(purl: &str) -> String {
    let base = match purl.find('@') {
        Some(at) => &purl[..at],
        None => purl,
    };
    base.to_string()
}
4132
/// Parsed `setup.cfg` contents: section name → option name → value lines.
///
/// `parse_setup_cfg` stores section and option names ASCII-lowercased and
/// keeps one `Vec` entry per non-empty value line.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4134
4135fn extract_from_setup_cfg(path: &Path) -> PackageData {
4136    let content = match read_file_to_string(path) {
4137        Ok(content) => content,
4138        Err(e) => {
4139            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4140            return default_package_data(path);
4141        }
4142    };
4143
4144    let sections = parse_setup_cfg(&content);
4145    let name = get_ini_value(&sections, "metadata", "name");
4146    let version = get_ini_value(&sections, "metadata", "version");
4147    let description = get_ini_value(&sections, "metadata", "description");
4148    let author = get_ini_value(&sections, "metadata", "author");
4149    let author_email = get_ini_value(&sections, "metadata", "author_email");
4150    let maintainer = get_ini_value(&sections, "metadata", "maintainer");
4151    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
4152    let license = get_ini_value(&sections, "metadata", "license");
4153    let mut homepage_url = get_ini_value(&sections, "metadata", "url");
4154    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
4155    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
4156    let python_requires = get_ini_value(&sections, "options", "python_requires");
4157    let parsed_project_urls =
4158        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
4159    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4160    let mut extra_data = HashMap::new();
4161
4162    let mut parties = Vec::new();
4163    if author.is_some() || author_email.is_some() {
4164        parties.push(Party {
4165            r#type: Some("person".to_string()),
4166            role: Some("author".to_string()),
4167            name: author,
4168            email: author_email,
4169            url: None,
4170            organization: None,
4171            organization_url: None,
4172            timezone: None,
4173        });
4174    }
4175
4176    if maintainer.is_some() || maintainer_email.is_some() {
4177        parties.push(Party {
4178            r#type: Some("person".to_string()),
4179            role: Some("maintainer".to_string()),
4180            name: maintainer,
4181            email: maintainer_email,
4182            url: None,
4183            organization: None,
4184            organization_url: None,
4185            timezone: None,
4186        });
4187    }
4188
4189    let declared_license_expression = None;
4190    let declared_license_expression_spdx = None;
4191    let license_detections = Vec::new();
4192    let extracted_license_statement = license.clone();
4193
4194    let dependencies = extract_setup_cfg_dependencies(&sections);
4195
4196    if let Some(value) = python_requires {
4197        extra_data.insert(
4198            "python_requires".to_string(),
4199            serde_json::Value::String(value),
4200        );
4201    }
4202
4203    apply_project_url_mappings(
4204        &parsed_project_urls,
4205        &mut homepage_url,
4206        &mut bug_tracking_url,
4207        &mut code_view_url,
4208        &mut vcs_url,
4209        &mut extra_data,
4210    );
4211
4212    let extra_data = if extra_data.is_empty() {
4213        None
4214    } else {
4215        Some(extra_data)
4216    };
4217
4218    let purl = name.as_ref().and_then(|n| {
4219        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4220        if let Some(v) = &version {
4221            package_url.with_version(v).ok()?;
4222        }
4223        Some(package_url.to_string())
4224    });
4225
4226    PackageData {
4227        package_type: Some(PythonParser::PACKAGE_TYPE),
4228        namespace: None,
4229        name,
4230        version,
4231        qualifiers: None,
4232        subpath: None,
4233        primary_language: Some("Python".to_string()),
4234        description,
4235        release_date: None,
4236        parties,
4237        keywords,
4238        homepage_url,
4239        download_url: None,
4240        size: None,
4241        sha1: None,
4242        md5: None,
4243        sha256: None,
4244        sha512: None,
4245        bug_tracking_url,
4246        code_view_url,
4247        vcs_url,
4248        copyright: None,
4249        holder: None,
4250        declared_license_expression,
4251        declared_license_expression_spdx,
4252        license_detections,
4253        other_license_expression: None,
4254        other_license_expression_spdx: None,
4255        other_license_detections: Vec::new(),
4256        extracted_license_statement,
4257        notice_text: None,
4258        source_packages: Vec::new(),
4259        file_references: Vec::new(),
4260        is_private: has_private_classifier(&classifiers),
4261        is_virtual: false,
4262        extra_data,
4263        dependencies,
4264        repository_homepage_url: None,
4265        repository_download_url: None,
4266        api_data_url: None,
4267        datasource_id: Some(DatasourceId::PypiSetupCfg),
4268        purl,
4269    }
4270}
4271
/// Splits a comma-separated keyword string into trimmed, non-empty keywords.
///
/// Returns an empty vector when `value` is `None`.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut parsed = Vec::new();

    if let Some(raw) = value {
        for piece in raw.split(',') {
            let keyword = piece.trim();
            if !keyword.is_empty() {
                parsed.push(keyword.to_owned());
            }
        }
    }

    parsed
}
4284
/// Parses `label = url` lines into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries
/// without `=`, or with an empty label or URL, are skipped. Input order is
/// preserved.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };

        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }

    pairs
}
4300
4301fn apply_project_url_mappings(
4302    parsed_urls: &[(String, String)],
4303    homepage_url: &mut Option<String>,
4304    bug_tracking_url: &mut Option<String>,
4305    code_view_url: &mut Option<String>,
4306    vcs_url: &mut Option<String>,
4307    extra_data: &mut HashMap<String, serde_json::Value>,
4308) {
4309    for (label, url) in parsed_urls {
4310        let label_lower = label.to_lowercase();
4311
4312        if bug_tracking_url.is_none()
4313            && matches!(
4314                label_lower.as_str(),
4315                "tracker"
4316                    | "bug reports"
4317                    | "bug tracker"
4318                    | "issues"
4319                    | "issue tracker"
4320                    | "github: issues"
4321            )
4322        {
4323            *bug_tracking_url = Some(url.clone());
4324        } else if code_view_url.is_none()
4325            && matches!(label_lower.as_str(), "source" | "source code" | "code")
4326        {
4327            *code_view_url = Some(url.clone());
4328        } else if vcs_url.is_none()
4329            && matches!(
4330                label_lower.as_str(),
4331                "github" | "gitlab" | "github: repo" | "repository"
4332            )
4333        {
4334            *vcs_url = Some(url.clone());
4335        } else if homepage_url.is_none()
4336            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4337        {
4338            *homepage_url = Some(url.clone());
4339        } else if label_lower == "changelog" {
4340            extra_data.insert(
4341                "changelog_url".to_string(),
4342                serde_json::Value::String(url.clone()),
4343            );
4344        }
4345    }
4346
4347    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4348        .iter()
4349        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4350        .collect();
4351
4352    if !project_urls_json.is_empty() {
4353        extra_data.insert(
4354            "project_urls".to_string(),
4355            serde_json::Value::Object(project_urls_json),
4356        );
4357    }
4358}
4359
4360fn parse_setup_cfg(content: &str) -> IniSections {
4361    let mut sections: IniSections = HashMap::new();
4362    let mut current_section: Option<String> = None;
4363    let mut current_key: Option<String> = None;
4364
4365    for raw_line in content.lines() {
4366        let line = raw_line.trim_end_matches('\r');
4367        let trimmed = line.trim();
4368        if trimmed.is_empty() {
4369            continue;
4370        }
4371
4372        let stripped = line.trim_start();
4373        if stripped.starts_with('#') || stripped.starts_with(';') {
4374            continue;
4375        }
4376
4377        if stripped.starts_with('[') && stripped.ends_with(']') {
4378            let section_name = stripped
4379                .trim_start_matches('[')
4380                .trim_end_matches(']')
4381                .trim()
4382                .to_ascii_lowercase();
4383            current_section = if section_name.is_empty() {
4384                None
4385            } else {
4386                Some(section_name)
4387            };
4388            current_key = None;
4389            continue;
4390        }
4391
4392        if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
4393            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
4394                let value = stripped.trim();
4395                if !value.is_empty() {
4396                    sections
4397                        .entry(section.clone())
4398                        .or_default()
4399                        .entry(key.clone())
4400                        .or_default()
4401                        .push(value.to_string());
4402                }
4403            }
4404            continue;
4405        }
4406
4407        if let Some((key, value)) = stripped.split_once('=')
4408            && let Some(section) = current_section.as_ref()
4409        {
4410            let key_name = key.trim().to_ascii_lowercase();
4411            let value_trimmed = value.trim();
4412            let entry = sections
4413                .entry(section.clone())
4414                .or_default()
4415                .entry(key_name.clone())
4416                .or_default();
4417            if !value_trimmed.is_empty() {
4418                entry.push(value_trimmed.to_string());
4419            }
4420            current_key = Some(key_name);
4421        }
4422    }
4423
4424    sections
4425}
4426
4427fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
4428    sections
4429        .get(&section.to_ascii_lowercase())
4430        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4431        .and_then(|entries| entries.first())
4432        .map(|value| value.trim().to_string())
4433}
4434
4435fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
4436    sections
4437        .get(&section.to_ascii_lowercase())
4438        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4439        .cloned()
4440        .unwrap_or_default()
4441}
4442
4443fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
4444    let mut dependencies = Vec::new();
4445
4446    for (sub_section, scope) in [
4447        ("install_requires", "install"),
4448        ("tests_require", "test"),
4449        ("setup_requires", "setup"),
4450    ] {
4451        let reqs = get_ini_values(sections, "options", sub_section);
4452        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
4453    }
4454
4455    if let Some(extras) = sections.get("options.extras_require") {
4456        let mut extra_items: Vec<_> = extras.iter().collect();
4457        extra_items.sort_by_key(|(name, _)| *name);
4458        for (extra_name, reqs) in extra_items {
4459            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
4460        }
4461    }
4462
4463    dependencies
4464}
4465
4466fn parse_setup_cfg_requirements(
4467    reqs: &[String],
4468    scope: &str,
4469    is_optional: bool,
4470) -> Vec<Dependency> {
4471    reqs.iter()
4472        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
4473        .collect()
4474}
4475
4476fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
4477    let trimmed = req.trim();
4478    if trimmed.is_empty() || trimmed.starts_with('#') {
4479        return None;
4480    }
4481
4482    let name = extract_setup_cfg_dependency_name(trimmed)?;
4483    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4484
4485    Some(Dependency {
4486        purl: Some(purl.to_string()),
4487        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
4488        scope: Some(scope.to_string()),
4489        is_runtime: Some(true),
4490        is_optional: Some(is_optional),
4491        is_pinned: Some(false),
4492        is_direct: Some(true),
4493        resolved_package: None,
4494        extra_data: None,
4495    })
4496}
4497
/// Extracts the distribution name from a requirement string.
///
/// The name is the leading part of the trimmed requirement, up to the first
/// whitespace character or the first character that starts another part of
/// the specifier:
/// - `<`, `>`, `=`, `!`, `~`: version comparison operators
/// - `(`: parenthesised version specifier, e.g. `requests(>=2.0)`
/// - `[`: extras, e.g. `flask[async]`
/// - `;`: environment marker
/// - `@`: direct URL reference, e.g. `pip@https://example.com/pip.zip`
///
/// Returns `None` when nothing precedes the first such character (including
/// empty or whitespace-only input).
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let name = req
        .trim()
        .split(|c: char| {
            c.is_whitespace()
                || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .next()
        .filter(|name| !name.is_empty())?;

    Some(name.to_string())
}
4514
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    req.split_whitespace().collect::<String>()
}
4518
/// Extracts the quoted string assigned to `key` in `setup.py` source text.
///
/// Looks for `key`, an `=` with at most one space on either side, and an
/// opening quote, then returns the text up to the next occurrence of that
/// same quote character. The other quote character may appear inside the
/// value (`description="It's a demo"`). Double-quoted forms are tried before
/// single-quoted ones, and for each form only its first occurrence in
/// `content` is considered.
///
/// Returns `None` when no form matches or the matching quote is never closed.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    const QUOTES: [char; 2] = ['"', '\''];
    const SEPARATORS: [&str; 4] = ["=", " =", "= ", " = "];

    for &quote in &QUOTES {
        for separator in &SEPARATORS {
            let pattern = format!("{}{}{}", key, separator, quote);
            let Some(start) = content.find(&pattern) else {
                continue;
            };

            let rest = &content[start + pattern.len()..];
            // The value ends at the quote that opened it, not at whichever
            // quote character happens to come first.
            if let Some(end) = rest.find(quote) {
                return Some(rest[..end].to_string());
            }
        }
    }

    None
}
4544
4545fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
4546    let mut dependencies = Vec::new();
4547
4548    if let Some(tests_deps) = extract_tests_require(content) {
4549        dependencies.extend(tests_deps);
4550    }
4551
4552    if let Some(extras_deps) = extract_extras_require(content) {
4553        dependencies.extend(extras_deps);
4554    }
4555
4556    dependencies
4557}
4558
4559fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
4560    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
4561    let re = Regex::new(pattern).ok()?;
4562    let captures = re.captures(content)?;
4563    let deps_str = captures.get(1)?.as_str();
4564
4565    let deps = parse_setup_py_dep_list(deps_str, "test", true);
4566    if deps.is_empty() { None } else { Some(deps) }
4567}
4568
4569fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
4570    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
4571    let re = Regex::new(pattern).ok()?;
4572    let captures = re.captures(content)?;
4573    let dict_content = captures.get(1)?.as_str();
4574
4575    let mut all_deps = Vec::new();
4576
4577    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
4578    let entry_re = Regex::new(entry_pattern).ok()?;
4579
4580    for entry_cap in entry_re.captures_iter(dict_content) {
4581        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
4582            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
4583            all_deps.extend(deps);
4584        }
4585    }
4586
4587    if all_deps.is_empty() {
4588        None
4589    } else {
4590        Some(all_deps)
4591    }
4592}
4593
4594fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
4595    let dep_pattern = r#"['"]([^'"]+)['"]"#;
4596    let re = match Regex::new(dep_pattern) {
4597        Ok(r) => r,
4598        Err(_) => return Vec::new(),
4599    };
4600
4601    re.captures_iter(deps_str)
4602        .filter_map(|cap| {
4603            let dep_str = cap.get(1)?.as_str().trim();
4604            if dep_str.is_empty() {
4605                return None;
4606            }
4607
4608            let name = extract_setup_cfg_dependency_name(dep_str)?;
4609            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4610
4611            Some(Dependency {
4612                purl: Some(purl.to_string()),
4613                extracted_requirement: Some(dep_str.to_string()),
4614                scope: Some(scope.to_string()),
4615                is_runtime: Some(true),
4616                is_optional: Some(is_optional),
4617                is_pinned: Some(false),
4618                is_direct: Some(true),
4619                resolved_package: None,
4620                extra_data: None,
4621            })
4622        })
4623        .collect()
4624}
4625
4626/// Reads and parses a TOML file
4627pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
4628    let content = read_file_to_string(path).map_err(|e| e.to_string())?;
4629    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
4630}
4631
4632/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
4633///
4634/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
4635/// Essential for SBOM compliance and package integrity verification.
4636///
4637/// # Returns
4638///
4639/// - `(Some(size), Some(hash))` on success
4640/// - `(None, None)` if file cannot be opened
4641/// - `(Some(size), None)` if hash calculation fails during read
4642fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<String>) {
4643    let mut file = match File::open(path) {
4644        Ok(f) => f,
4645        Err(_) => return (None, None),
4646    };
4647
4648    let metadata = match file.metadata() {
4649        Ok(m) => m,
4650        Err(_) => return (None, None),
4651    };
4652    let size = metadata.len();
4653
4654    let mut hasher = Sha256::new();
4655    let mut buffer = vec![0; 8192];
4656
4657    loop {
4658        match file.read(&mut buffer) {
4659            Ok(0) => break,
4660            Ok(n) => hasher.update(&buffer[..n]),
4661            Err(_) => return (Some(size), None),
4662        }
4663    }
4664
4665    let hash = hex::encode(hasher.finalize());
4666    (Some(size), Some(hash))
4667}
4668
4669fn default_package_data(path: &Path) -> PackageData {
4670    PackageData {
4671        package_type: Some(PythonParser::PACKAGE_TYPE),
4672        primary_language: Some("Python".to_string()),
4673        datasource_id: infer_python_datasource_id(path),
4674        ..Default::default()
4675    }
4676}
4677
4678fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
4679    let file_name = path.file_name().and_then(|name| name.to_str());
4680
4681    match file_name {
4682        Some("pyproject.toml") => {
4683            if read_toml_file(path)
4684                .ok()
4685                .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
4686                .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
4687                .is_some()
4688            {
4689                Some(DatasourceId::PypiPoetryPyprojectToml)
4690            } else {
4691                Some(DatasourceId::PypiPyprojectToml)
4692            }
4693        }
4694        Some("setup.py") => Some(DatasourceId::PypiSetupPy),
4695        Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
4696        Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
4697        Some("METADATA") => Some(DatasourceId::PypiWheelMetadata),
4698        Some("pypi.json") => Some(DatasourceId::PypiJson),
4699        Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
4700        Some("origin.json") if is_pip_cache_origin_json(path) => {
4701            Some(DatasourceId::PypiPipOriginJson)
4702        }
4703        _ if is_python_sdist_archive_path(path) => Some(DatasourceId::PypiSdist),
4704        _ if path
4705            .extension()
4706            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
4707        {
4708            Some(DatasourceId::PypiWheel)
4709        }
4710        _ if path
4711            .extension()
4712            .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
4713        {
4714            Some(DatasourceId::PypiEgg)
4715        }
4716        _ => None,
4717    }
4718}
4719
// Declares the Python parser's registration metadata through the crate's
// `register_parser!` macro: a human-readable summary of the supported inputs,
// then the glob patterns of the paths this parser covers.
// NOTE(review): the trailing arguments ("pypi", "Python", documentation URL)
// are presumably the package type, primary language and reference link;
// confirm against the macro definition.
// Several globs are deliberately broad (`**/origin.json`, `**/*.zip`,
// `**/*.tar.gz`, ...); `infer_python_datasource_id` narrows those cases with
// `is_pip_cache_origin_json` and `is_python_sdist_archive_path`.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, setup.cfg, pypi.json, PKG-INFO, METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);