Skip to main content

provenant/parsers/
python.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
35use crate::parsers::utils::{read_file_to_string, split_name_email};
36use base64::Engine;
37use base64::engine::general_purpose::URL_SAFE_NO_PAD;
38use bzip2::read::BzDecoder;
39use csv::ReaderBuilder;
40use flate2::read::GzDecoder;
41use liblzma::read::XzDecoder;
42use log::warn;
43use packageurl::PackageUrl;
44use regex::Regex;
45use rustpython_parser::{Parse, ast};
46use serde_json::{Map as JsonMap, Value as JsonValue};
47use sha2::{Digest, Sha256};
48use std::collections::{HashMap, HashSet};
49use std::fs::File;
50use std::io::Read;
51use std::path::{Component, Path, PathBuf};
52use tar::Archive;
53use toml::Value as TomlValue;
54use toml::map::Map as TomlMap;
55use zip::ZipArchive;
56
57use super::PackageParser;
58use super::license_normalization::normalize_spdx_declared_license;
59
// Field constants for pyproject.toml ([project] table keys, PEP 621).
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

// Limits applied when parsing setup.py via its AST (see module docs):
// cap the raw input size, the node count, and the recursion depth so a
// hostile setup.py cannot exhaust memory or the stack.
const MAX_SETUP_PY_BYTES: usize = 1_048_576;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

// Archive safety limits (zip-bomb defenses) shared by the wheel, egg, and
// sdist readers below.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024; // 100MB uncompressed
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB per file
const MAX_COMPRESSION_RATIO: f64 = 100.0; // 100:1 ratio
80
/// Python package parser supporting 11 manifest formats.
///
/// Extracts metadata from Python package files including pyproject.toml, setup.py,
/// setup.cfg, PKG-INFO, METADATA, pip-inspect lockfiles, and .whl/.egg archives.
///
/// Stateless unit struct: all behavior lives in its [`PackageParser`] impl,
/// which dispatches on file name/extension to the format-specific extractors.
///
/// # Security
///
/// setup.py files are parsed using AST analysis rather than code execution to prevent
/// arbitrary code execution during scanning. See `extract_from_setup_py_ast` for details.
pub struct PythonParser;
91
/// Container/compression formats recognized for Python sdist archives,
/// detected purely from the file name suffix.
#[derive(Clone, Copy, Debug)]
enum PythonSdistArchiveFormat {
    /// Gzip-compressed tarball (`.tar.gz`).
    TarGz,
    /// Gzip-compressed tarball with the short `.tgz` suffix.
    Tgz,
    /// Bzip2-compressed tarball (`.tar.bz2`).
    TarBz2,
    /// XZ-compressed tarball (`.tar.xz`).
    TarXz,
    /// Plain zip archive (`.zip`).
    Zip,
}
100
/// A zip entry that passed the safety checks in `collect_validated_zip_entries`
/// (safe path, size, and compression ratio) and can later be read by index.
#[derive(Clone, Debug)]
struct ValidatedZipEntry {
    // Position of the entry within the zip archive, for by-index lookups.
    index: usize,
    // Entry path as returned by `normalize_archive_entry_path`.
    name: String,
}
106
107impl PackageParser for PythonParser {
108    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
109
110    fn extract_packages(path: &Path) -> Vec<PackageData> {
111        vec![
112            if path.file_name().unwrap_or_default() == "pyproject.toml" {
113                extract_from_pyproject_toml(path)
114            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
115                extract_from_setup_cfg(path)
116            } else if path.file_name().unwrap_or_default() == "setup.py" {
117                extract_from_setup_py(path)
118            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
119                extract_from_rfc822_metadata(path, DatasourceId::PypiSdistPkginfo)
120            } else if path.file_name().unwrap_or_default() == "METADATA" {
121                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
122            } else if is_pip_cache_origin_json(path) {
123                extract_from_pip_origin_json(path)
124            } else if path.file_name().unwrap_or_default() == "pypi.json" {
125                extract_from_pypi_json(path)
126            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
127                extract_from_pip_inspect(path)
128            } else if is_python_sdist_archive_path(path) {
129                extract_from_sdist_archive(path)
130            } else if path
131                .extension()
132                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
133            {
134                extract_from_wheel_archive(path)
135            } else if path
136                .extension()
137                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
138            {
139                extract_from_egg_archive(path)
140            } else {
141                default_package_data()
142            },
143        ]
144    }
145
146    fn is_match(path: &Path) -> bool {
147        if let Some(filename) = path.file_name()
148            && (filename == "pyproject.toml"
149                || filename == "setup.cfg"
150                || filename == "setup.py"
151                || filename == "PKG-INFO"
152                || filename == "METADATA"
153                || filename == "pypi.json"
154                || filename == "pip-inspect.deplock"
155                || is_pip_cache_origin_json(path))
156        {
157            return true;
158        }
159
160        if let Some(extension) = path.extension() {
161            let ext = extension.to_string_lossy().to_lowercase();
162            if ext == "whl" || ext == "egg" || is_python_sdist_archive_path(path) {
163                return true;
164            }
165        }
166
167        false
168    }
169}
170
/// Values parsed from a wheel's `WHEEL` metadata file (RFC 822 headers).
#[derive(Debug, Clone)]
struct InstalledWheelMetadata {
    // All `Tag:` header values; never empty (the parser returns None otherwise).
    wheel_tags: Vec<String>,
    // `Wheel-Version:` header, if present.
    wheel_version: Option<String>,
    // `Generator:` header, if present.
    wheel_generator: Option<String>,
    // `Root-Is-Purelib:` parsed strictly as "true"/"false"; None otherwise.
    root_is_purelib: Option<bool>,
    // Tags collapsed into a single `py1.py2-abi-platform` string when all
    // tags share the same ABI and platform; see `compress_wheel_tags`.
    compressed_tag: Option<String>,
}
179
180fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
181    let Some(parent) = path.parent() else {
182        return;
183    };
184
185    if !parent
186        .file_name()
187        .and_then(|name| name.to_str())
188        .is_some_and(|name| name.ends_with(".dist-info"))
189    {
190        return;
191    }
192
193    let wheel_path = parent.join("WHEEL");
194    if !wheel_path.exists() {
195        return;
196    }
197
198    let Ok(content) = read_file_to_string(&wheel_path) else {
199        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
200        return;
201    };
202
203    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
204        return;
205    };
206
207    apply_installed_wheel_metadata(package_data, &wheel_metadata);
208}
209
210fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
211    use super::rfc822::{get_header_all, get_header_first};
212
213    let metadata = super::rfc822::parse_rfc822_content(content);
214    let wheel_tags = get_header_all(&metadata.headers, "tag");
215    if wheel_tags.is_empty() {
216        return None;
217    }
218
219    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
220    let wheel_generator = get_header_first(&metadata.headers, "generator");
221    let root_is_purelib =
222        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
223            match value.to_ascii_lowercase().as_str() {
224                "true" => Some(true),
225                "false" => Some(false),
226                _ => None,
227            }
228        });
229
230    let compressed_tag = compress_wheel_tags(&wheel_tags);
231
232    Some(InstalledWheelMetadata {
233        wheel_tags,
234        wheel_version,
235        wheel_generator,
236        root_is_purelib,
237        compressed_tag,
238    })
239}
240
/// Collapse a list of wheel `Tag:` values into one compressed tag string.
///
/// Several tags compress to `py1.py2-abi-platform` form only when every tag
/// shares the same ABI and platform segments; otherwise `None`. A single tag
/// is passed through unchanged (even if malformed) and an empty list yields
/// `None`.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => None,
        [only] => Some(only.clone()),
        _ => {
            let mut pythons: Vec<&str> = Vec::with_capacity(tags.len());
            let mut shared_abi: Option<&str> = None;
            let mut shared_platform: Option<&str> = None;

            for tag in tags {
                let mut pieces = tag.splitn(3, '-');
                let (python, abi, platform) = (pieces.next()?, pieces.next()?, pieces.next()?);

                // All tags must agree on ABI and platform to be compressible.
                if shared_abi.is_some_and(|seen| seen != abi)
                    || shared_platform.is_some_and(|seen| seen != platform)
                {
                    return None;
                }
                shared_abi = Some(abi);
                shared_platform = Some(platform);
                pythons.push(python);
            }

            Some(format!(
                "{}-{}-{}",
                pythons.join("."),
                shared_abi?,
                shared_platform?
            ))
        }
    }
}
278
279fn apply_installed_wheel_metadata(
280    package_data: &mut PackageData,
281    wheel_metadata: &InstalledWheelMetadata,
282) {
283    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
284    extra_data.insert(
285        "wheel_tags".to_string(),
286        JsonValue::Array(
287            wheel_metadata
288                .wheel_tags
289                .iter()
290                .cloned()
291                .map(JsonValue::String)
292                .collect(),
293        ),
294    );
295
296    if let Some(wheel_version) = &wheel_metadata.wheel_version {
297        extra_data.insert(
298            "wheel_version".to_string(),
299            JsonValue::String(wheel_version.clone()),
300        );
301    }
302
303    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
304        extra_data.insert(
305            "wheel_generator".to_string(),
306            JsonValue::String(wheel_generator.clone()),
307        );
308    }
309
310    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
311        extra_data.insert(
312            "root_is_purelib".to_string(),
313            JsonValue::Bool(root_is_purelib),
314        );
315    }
316
317    if let (Some(name), Some(version), Some(extension)) = (
318        package_data.name.as_deref(),
319        package_data.version.as_deref(),
320        wheel_metadata.compressed_tag.as_deref(),
321    ) {
322        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
323    }
324}
325
/// True when `path` looks like a pip HTTP-cache `origin.json`: the file is
/// named exactly `origin.json` and some ancestor directory is named
/// `wheels` (case-insensitive).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    let is_origin = path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name == "origin.json");
    if !is_origin {
        return false;
    }

    // skip(1) drops the path itself, so only real ancestors are inspected.
    path.ancestors().skip(1).any(|dir| {
        dir.file_name()
            .and_then(|name| name.to_str())
            .is_some_and(|name| name.eq_ignore_ascii_case("wheels"))
    })
}
335
/// Build `PackageData` from a pip HTTP-cache `origin.json` file.
///
/// The file records the original download URL and (usually) a sha256 digest.
/// Name/version are inferred from the URL's file name, falling back to a
/// cached `.whl` file in the same directory. Any read/parse/inference failure
/// logs a warning and returns `default_package_data()`.
fn extract_from_pip_origin_json(path: &Path) -> PackageData {
    let content = match read_file_to_string(path) {
        Ok(content) => content,
        Err(e) => {
            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let root: JsonValue = match serde_json::from_str(&content) {
        Ok(root) => root,
        Err(e) => {
            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
            return default_package_data();
        }
    };

    // The "url" field is mandatory for this datasource; without it there is
    // nothing to infer from.
    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
        warn!("No url found in pip cache origin.json at {:?}", path);
        return default_package_data();
    };

    // Primary source: parse name/version out of the download URL.
    // Fallback: a sibling cached wheel file's name, when one exists.
    let sibling_wheel = find_sibling_cached_wheel(path);
    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
        sibling_wheel
            .as_ref()
            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
    });

    let Some((name, version)) = name_version else {
        warn!(
            "Failed to infer package name/version from pip cache origin.json at {:?}",
            path
        );
        return default_package_data();
    };

    // Prefer a wheel-specific purl (carrying the wheel tag) over the plain
    // pypi purl when a sibling wheel was found.
    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
        build_pypi_urls(Some(&name), Some(&version));
    let purl = sibling_wheel
        .as_ref()
        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
        .or(plain_purl);

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        primary_language: Some("Python".to_string()),
        name: Some(name),
        version: Some(version),
        datasource_id: Some(DatasourceId::PypiPipOriginJson),
        download_url: Some(download_url.to_string()),
        sha256: extract_sha256_from_origin_json(&root),
        repository_homepage_url,
        repository_download_url,
        api_data_url,
        purl,
        ..Default::default()
    }
}
395
396fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
397    let parent = path.parent()?;
398    let entries = parent.read_dir().ok()?;
399
400    for entry in entries.flatten() {
401        let sibling_path = entry.path();
402        if sibling_path
403            .extension()
404            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
405            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
406        {
407            return Some(wheel_info);
408        }
409    }
410
411    None
412}
413
414fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
415    let file_name = url.rsplit('/').next()?;
416
417    if file_name.ends_with(".whl") {
418        return parse_wheel_filename(Path::new(file_name))
419            .map(|wheel_info| (wheel_info.name, wheel_info.version));
420    }
421
422    let stem = strip_python_archive_extension(file_name)?;
423    let (name, version) = stem.rsplit_once('-')?;
424    if name.is_empty() || version.is_empty() {
425        return None;
426    }
427
428    Some((name.replace('_', "-"), version.to_string()))
429}
430
/// Strip a known Python archive suffix from a file name, returning the bare
/// stem; `None` when the name carries none of the recognized suffixes.
/// `.tar.gz` is checked before `.gz`-free variants, so compound suffixes win.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const SUFFIXES: [&str; 6] = [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];
    SUFFIXES
        .into_iter()
        .find_map(|suffix| file_name.strip_suffix(suffix))
}
436
437fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
438    root.pointer("/archive_info/hashes/sha256")
439        .and_then(|value| value.as_str())
440        .map(ToOwned::to_owned)
441        .or_else(|| {
442            root.pointer("/archive_info/hash")
443                .and_then(|value| value.as_str())
444                .and_then(normalize_origin_hash)
445        })
446}
447
/// Normalize a pip `archive_info.hash` value to a bare sha256 hex digest.
///
/// Accepts `sha256=<digest>`, `sha256:<digest>`, or a bare 64-character hex
/// string; anything else (other algorithms, malformed values) yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    for prefix in ["sha256=", "sha256:"] {
        if let Some(digest) = hash.strip_prefix(prefix) {
            return Some(digest.to_string());
        }
    }

    let looks_like_bare_digest = hash.len() == 64 && hash.bytes().all(|b| b.is_ascii_hexdigit());
    looks_like_bare_digest.then(|| hash.to_string())
}
460
461fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
462    let content = match read_file_to_string(path) {
463        Ok(content) => content,
464        Err(e) => {
465            warn!("Failed to read metadata at {:?}: {}", path, e);
466            return default_package_data();
467        }
468    };
469
470    let metadata = super::rfc822::parse_rfc822_content(&content);
471    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
472    merge_sibling_metadata_dependencies(path, &mut package_data);
473    merge_sibling_metadata_file_references(path, &mut package_data);
474    if datasource_id == DatasourceId::PypiWheelMetadata {
475        merge_sibling_wheel_metadata(path, &mut package_data);
476    }
477    package_data
478}
479
/// Merge dependencies from `requires.txt` files that live next to a
/// PKG-INFO/METADATA file: one directly in the same directory, and one inside
/// the first sibling `*.egg-info` directory found. Entries already present in
/// `package_data.dependencies` (same purl, scope, requirement, and extra_data)
/// are not duplicated.
fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
    let mut extra_dependencies = Vec::new();

    if let Some(parent) = path.parent() {
        // Case 1: requires.txt directly beside the metadata file.
        let direct_requires = parent.join("requires.txt");
        if direct_requires.exists()
            && let Ok(content) = read_file_to_string(&direct_requires)
        {
            extra_dependencies.extend(parse_requires_txt(&content));
        }

        // Case 2: requires.txt inside a sibling `*.egg-info` directory.
        // read_dir().ok() -> Option<ReadDir>; the two flattens unwrap the
        // Option and then drop unreadable Result<DirEntry> entries.
        let sibling_egg_info_requires = parent
            .read_dir()
            .ok()
            .into_iter()
            .flatten()
            .flatten()
            .find_map(|entry| {
                let child_path = entry.path();
                if child_path.is_dir()
                    && child_path
                        .file_name()
                        .and_then(|name| name.to_str())
                        .is_some_and(|name| name.ends_with(".egg-info"))
                {
                    let requires = child_path.join("requires.txt");
                    requires.exists().then_some(requires)
                } else {
                    None
                }
            });

        if let Some(requires_path) = sibling_egg_info_requires
            && let Ok(content) = read_file_to_string(&requires_path)
        {
            extra_dependencies.extend(parse_requires_txt(&content));
        }
    }

    // Append only dependencies not already recorded; equality is field-wise
    // on purl, scope, extracted requirement, and extra_data.
    for dependency in extra_dependencies {
        if !package_data.dependencies.iter().any(|existing| {
            existing.purl == dependency.purl
                && existing.scope == dependency.scope
                && existing.extracted_requirement == dependency.extracted_requirement
                && existing.extra_data == dependency.extra_data
        }) {
            package_data.dependencies.push(dependency);
        }
    }
}
530
/// Merge file references from record files that live next to a
/// PKG-INFO/METADATA file: `RECORD` (wheel CSV), `installed-files.txt`
/// (pip legacy), and `SOURCES.txt` (setuptools). References whose path is
/// already present in `package_data.file_references` are skipped.
fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
    let mut extra_refs = Vec::new();

    if let Some(parent) = path.parent() {
        let record_path = parent.join("RECORD");
        if record_path.exists()
            && let Ok(content) = read_file_to_string(&record_path)
        {
            extra_refs.extend(parse_record_csv(&content));
        }

        let installed_files_path = parent.join("installed-files.txt");
        if installed_files_path.exists()
            && let Ok(content) = read_file_to_string(&installed_files_path)
        {
            extra_refs.extend(parse_installed_files_txt(&content));
        }

        let sources_path = parent.join("SOURCES.txt");
        if sources_path.exists()
            && let Ok(content) = read_file_to_string(&sources_path)
        {
            extra_refs.extend(parse_sources_txt(&content));
        }
    }

    // Deduplicate by path only — the first occurrence wins.
    for file_ref in extra_refs {
        if !package_data
            .file_references
            .iter()
            .any(|existing| existing.path == file_ref.path)
        {
            package_data.file_references.push(file_ref);
        }
    }
}
567
/// Scan a zip archive and return only entries that pass the safety checks.
///
/// Per entry: the path must survive `normalize_archive_entry_path`, the
/// compression ratio must not exceed `MAX_COMPRESSION_RATIO`, and the
/// uncompressed size must not exceed `MAX_FILE_SIZE` — offenders are skipped
/// with a warning, as are entries the zip reader fails to open. The running
/// total of accepted uncompressed sizes is capped at `MAX_ARCHIVE_SIZE`;
/// exceeding it aborts the whole scan with `Err` (zip-bomb defense).
///
/// `archive_type` is used only for log messages.
fn collect_validated_zip_entries<R: Read + std::io::Seek>(
    archive: &mut ZipArchive<R>,
    path: &Path,
    archive_type: &str,
) -> Result<Vec<ValidatedZipEntry>, String> {
    let mut total_extracted = 0u64;
    let mut entries = Vec::new();

    for i in 0..archive.len() {
        // by_index_raw avoids decompressing; sizes come from the header.
        if let Ok(file) = archive.by_index_raw(i) {
            let compressed_size = file.compressed_size();
            let uncompressed_size = file.size();
            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
                warn!(
                    "Skipping unsafe path in {} {:?}: {}",
                    archive_type,
                    path,
                    file.name()
                );
                continue;
            };

            // Ratio check only when compressed_size is non-zero (avoids /0
            // for stored or empty entries).
            if compressed_size > 0 {
                let ratio = uncompressed_size as f64 / compressed_size as f64;
                if ratio > MAX_COMPRESSION_RATIO {
                    warn!(
                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
                        archive_type, path, ratio
                    );
                    continue;
                }
            }

            if uncompressed_size > MAX_FILE_SIZE {
                warn!(
                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
                );
                continue;
            }

            // Only entries that passed the per-file checks count toward the
            // archive-wide total.
            total_extracted += uncompressed_size;
            if total_extracted > MAX_ARCHIVE_SIZE {
                let msg = format!(
                    "Total extracted size exceeds limit for {} {:?}",
                    archive_type, path
                );
                warn!("{}", msg);
                return Err(msg);
            }

            entries.push(ValidatedZipEntry {
                index: i,
                name: entry_name,
            });
        }
    }

    Ok(entries)
}
628
/// True when `path` has a recognizable Python sdist archive filename
/// (see `detect_python_sdist_archive_format` for the exact rules).
fn is_python_sdist_archive_path(path: &Path) -> bool {
    detect_python_sdist_archive_format(path).is_some()
}
632
633fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
634    let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
635
636    if !is_likely_python_sdist_filename(&file_name) {
637        return None;
638    }
639
640    if file_name.ends_with(".tar.gz") {
641        Some(PythonSdistArchiveFormat::TarGz)
642    } else if file_name.ends_with(".tgz") {
643        Some(PythonSdistArchiveFormat::Tgz)
644    } else if file_name.ends_with(".tar.bz2") {
645        Some(PythonSdistArchiveFormat::TarBz2)
646    } else if file_name.ends_with(".tar.xz") {
647        Some(PythonSdistArchiveFormat::TarXz)
648    } else if file_name.ends_with(".zip") {
649        Some(PythonSdistArchiveFormat::Zip)
650    } else {
651        None
652    }
653}
654
655fn is_likely_python_sdist_filename(file_name: &str) -> bool {
656    let Some(stem) = strip_python_archive_extension(file_name) else {
657        return false;
658    };
659
660    let Some((name, version)) = stem.rsplit_once('-') else {
661        return false;
662    };
663
664    !name.is_empty()
665        && !version.is_empty()
666        && version.chars().any(|ch| ch.is_ascii_digit())
667        && name
668            .chars()
669            .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
670}
671
672fn extract_from_sdist_archive(path: &Path) -> PackageData {
673    let metadata = match std::fs::metadata(path) {
674        Ok(m) => m,
675        Err(e) => {
676            warn!(
677                "Failed to read metadata for sdist archive {:?}: {}",
678                path, e
679            );
680            return default_package_data();
681        }
682    };
683
684    if metadata.len() > MAX_ARCHIVE_SIZE {
685        warn!(
686            "sdist archive too large: {} bytes (limit: {} bytes)",
687            metadata.len(),
688            MAX_ARCHIVE_SIZE
689        );
690        return default_package_data();
691    }
692
693    let Some(format) = detect_python_sdist_archive_format(path) else {
694        return default_package_data();
695    };
696
697    let mut package_data = match format {
698        PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
699            let file = match File::open(path) {
700                Ok(file) => file,
701                Err(e) => {
702                    warn!("Failed to open sdist archive {:?}: {}", path, e);
703                    return default_package_data();
704                }
705            };
706            let decoder = GzDecoder::new(file);
707            extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
708        }
709        PythonSdistArchiveFormat::TarBz2 => {
710            let file = match File::open(path) {
711                Ok(file) => file,
712                Err(e) => {
713                    warn!("Failed to open sdist archive {:?}: {}", path, e);
714                    return default_package_data();
715                }
716            };
717            let decoder = BzDecoder::new(file);
718            extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
719        }
720        PythonSdistArchiveFormat::TarXz => {
721            let file = match File::open(path) {
722                Ok(file) => file,
723                Err(e) => {
724                    warn!("Failed to open sdist archive {:?}: {}", path, e);
725                    return default_package_data();
726                }
727            };
728            let decoder = XzDecoder::new(file);
729            extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
730        }
731        PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
732    };
733
734    if package_data.package_type.is_some() {
735        let (size, sha256) = calculate_file_checksums(path);
736        package_data.size = size;
737        package_data.sha256 = sha256;
738    }
739
740    package_data
741}
742
/// Stream a (decompressed) tar sdist and collect the relevant metadata text
/// entries, enforcing per-file and whole-archive safety limits.
///
/// `reader` is the already-decompressing stream; `archive_type` is used only
/// in log messages; `compressed_size` is the on-disk archive size, used for
/// the cumulative compression-ratio check. Per-entry violations skip the
/// entry; whole-archive violations abort with `default_package_data()`.
fn extract_from_tar_sdist_archive<R: Read>(
    path: &Path,
    reader: R,
    archive_type: &str,
    compressed_size: u64,
) -> PackageData {
    let mut archive = Archive::new(reader);
    let archive_entries = match archive.entries() {
        Ok(entries) => entries,
        Err(e) => {
            warn!(
                "Failed to read {} sdist archive {:?}: {}",
                archive_type, path, e
            );
            return default_package_data();
        }
    };

    let mut total_extracted = 0u64;
    let mut entries = Vec::new();

    for entry_result in archive_entries {
        let mut entry = match entry_result {
            Ok(entry) => entry,
            Err(e) => {
                warn!(
                    "Failed to read {} sdist entry from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        // Per-file size cap: skip the entry, keep scanning.
        let entry_size = entry.size();
        if entry_size > MAX_FILE_SIZE {
            warn!(
                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
                archive_type, path, entry_size, MAX_FILE_SIZE
            );
            continue;
        }

        // Whole-archive cap: abort entirely (zip-bomb defense). Note the
        // running total counts every size-accepted entry, relevant or not.
        total_extracted += entry_size;
        if total_extracted > MAX_ARCHIVE_SIZE {
            warn!(
                "Total extracted size exceeds limit for {} sdist {:?}",
                archive_type, path
            );
            return default_package_data();
        }

        // Cumulative compression-ratio check against the on-disk size.
        if compressed_size > 0 {
            let ratio = total_extracted as f64 / compressed_size as f64;
            if ratio > MAX_COMPRESSION_RATIO {
                warn!(
                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
                    archive_type, path, ratio
                );
                return default_package_data();
            }
        }

        // Normalize to forward slashes before the safety normalization pass.
        let entry_path = match entry.path() {
            Ok(path) => path.to_string_lossy().replace('\\', "/"),
            Err(e) => {
                warn!(
                    "Failed to get {} sdist entry path from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
            continue;
        };

        // Only PKG-INFO / requires.txt / SOURCES.txt are read into memory.
        if !is_relevant_sdist_text_entry(&entry_path) {
            continue;
        }

        if let Ok(content) = read_limited_utf8(
            &mut entry,
            MAX_FILE_SIZE,
            &format!("{} entry {}", archive_type, entry_path),
        ) {
            entries.push((entry_path, content));
        }
    }

    build_sdist_package_data(path, entries)
}
836
837fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
838    let file = match File::open(path) {
839        Ok(file) => file,
840        Err(e) => {
841            warn!("Failed to open zip sdist archive {:?}: {}", path, e);
842            return default_package_data();
843        }
844    };
845
846    let mut archive = match ZipArchive::new(file) {
847        Ok(archive) => archive,
848        Err(e) => {
849            warn!("Failed to read zip sdist archive {:?}: {}", path, e);
850            return default_package_data();
851        }
852    };
853
854    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
855        Ok(entries) => entries,
856        Err(_) => return default_package_data(),
857    };
858
859    let mut entries = Vec::new();
860    for entry in validated_entries.iter() {
861        if !is_relevant_sdist_text_entry(&entry.name) {
862            continue;
863        }
864
865        if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
866            entries.push((entry.name.clone(), content));
867        }
868    }
869
870    build_sdist_package_data(path, entries)
871}
872
/// Whether an archive entry is one of the metadata text files parsed from an
/// sdist: PKG-INFO, requires.txt, or SOURCES.txt. The leading `/` in each
/// suffix means root-level files (no directory component) never match.
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"]
        .iter()
        .any(|suffix| entry_path.ends_with(suffix))
}
878
879fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
880    let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
881        warn!("No PKG-INFO file found in sdist archive {:?}", path);
882        return default_package_data();
883    };
884
885    let mut package_data =
886        python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
887    merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
888    merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
889    apply_sdist_name_version_fallback(path, &mut package_data);
890    package_data
891}
892
/// Pick the best PKG-INFO entry from an sdist's collected text entries.
///
/// Candidates are ranked by a tuple (lower wins):
/// 1. name match — 0 when the PKG-INFO `Name:` header, normalized, equals the
///    package name inferred from the archive's file name;
/// 2. location kind — `root/x.egg-info/PKG-INFO` (0), `root/PKG-INFO` (1),
///    any deeper `.egg-info/PKG-INFO` (2), everything else (3);
/// 3. path depth, then the path itself (deterministic tie-break).
///
/// Returns the winning `(entry_path, content)` pair, or `None` when the
/// archive holds no PKG-INFO entry at all.
fn select_sdist_pkginfo_entry(
    archive_path: &Path,
    entries: &[(String, String)],
) -> Option<(String, String)> {
    // Expected package name, normalized, derived from `name-version.<ext>`.
    let expected_name = archive_path
        .file_name()
        .and_then(|name| name.to_str())
        .and_then(strip_python_archive_extension)
        .and_then(|stem| {
            stem.rsplit_once('-')
                .map(|(name, _)| normalize_python_package_name(name))
        });

    entries
        .iter()
        .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
        .min_by_key(|(entry_path, content)| {
            let components: Vec<_> = entry_path
                .split('/')
                .filter(|part| !part.is_empty())
                .collect();
            // Re-parse the candidate to compare its Name: header against the
            // archive-derived name.
            let metadata = super::rfc822::parse_rfc822_content(content);
            let candidate_name = super::rfc822::get_header_first(&metadata.headers, "name")
                .map(|name| normalize_python_package_name(&name));
            let name_rank = if candidate_name == expected_name {
                0
            } else {
                1
            };
            let kind_rank = if components.len() == 3
                && components[1].ends_with(".egg-info")
                && components[2] == "PKG-INFO"
            {
                0
            } else if components.len() == 2 && components[1] == "PKG-INFO" {
                1
            } else if entry_path.ends_with(".egg-info/PKG-INFO") {
                2
            } else {
                3
            };

            (name_rank, kind_rank, components.len(), entry_path.clone())
        })
        .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
}
939
940fn merge_sdist_archive_dependencies(
941    entries: &[(String, String)],
942    metadata_path: &str,
943    package_data: &mut PackageData,
944) {
945    let metadata_dir = metadata_path
946        .rsplit_once('/')
947        .map(|(dir, _)| dir)
948        .unwrap_or("");
949    let archive_root = metadata_path.split('/').next().unwrap_or("");
950    let matched_egg_info_dir =
951        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
952    let mut extra_dependencies = Vec::new();
953
954    for (entry_path, content) in entries {
955        let is_direct_requires =
956            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
957        let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
958            entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
959        });
960
961        if is_direct_requires || is_egg_info_requires {
962            extra_dependencies.extend(parse_requires_txt(content));
963        }
964    }
965
966    for dependency in extra_dependencies {
967        if !package_data.dependencies.iter().any(|existing| {
968            existing.purl == dependency.purl
969                && existing.scope == dependency.scope
970                && existing.extracted_requirement == dependency.extracted_requirement
971                && existing.extra_data == dependency.extra_data
972        }) {
973            package_data.dependencies.push(dependency);
974        }
975    }
976}
977
978fn merge_sdist_archive_file_references(
979    entries: &[(String, String)],
980    metadata_path: &str,
981    package_data: &mut PackageData,
982) {
983    let metadata_dir = metadata_path
984        .rsplit_once('/')
985        .map(|(dir, _)| dir)
986        .unwrap_or("");
987    let archive_root = metadata_path.split('/').next().unwrap_or("");
988    let matched_egg_info_dir =
989        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
990    let mut extra_refs = Vec::new();
991
992    for (entry_path, content) in entries {
993        let is_direct_sources =
994            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
995        let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
996            entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
997        });
998
999        if is_direct_sources || is_egg_info_sources {
1000            extra_refs.extend(parse_sources_txt(content));
1001        }
1002    }
1003
1004    for file_ref in extra_refs {
1005        if !package_data
1006            .file_references
1007            .iter()
1008            .any(|existing| existing.path == file_ref.path)
1009        {
1010            package_data.file_references.push(file_ref);
1011        }
1012    }
1013}
1014
1015fn select_matching_sdist_egg_info_dir(
1016    entries: &[(String, String)],
1017    archive_root: &str,
1018    package_name: Option<&str>,
1019) -> Option<String> {
1020    let normalized_package_name = package_name.map(normalize_python_package_name);
1021
1022    entries
1023        .iter()
1024        .filter_map(|(entry_path, _)| {
1025            let components: Vec<_> = entry_path
1026                .split('/')
1027                .filter(|part| !part.is_empty())
1028                .collect();
1029            if components.len() == 3
1030                && components[0] == archive_root
1031                && components[1].ends_with(".egg-info")
1032            {
1033                Some(components[1].to_string())
1034            } else {
1035                None
1036            }
1037        })
1038        .min_by_key(|egg_info_dir| {
1039            let normalized_dir_name =
1040                normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1041            let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1042                0
1043            } else {
1044                1
1045            };
1046
1047            (name_rank, egg_info_dir.clone())
1048        })
1049}
1050
/// Normalizes a Python distribution name for comparison: ASCII-lowercase
/// every character and map underscores to hyphens.
fn normalize_python_package_name(name: &str) -> String {
    name.chars()
        .map(|c| match c {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
1054
1055fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1056    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1057        return;
1058    };
1059
1060    let Some(stem) = strip_python_archive_extension(file_name) else {
1061        return;
1062    };
1063
1064    let Some((name, version)) = stem.rsplit_once('-') else {
1065        return;
1066    };
1067
1068    if package_data.name.is_none() {
1069        package_data.name = Some(name.replace('_', "-"));
1070    }
1071    if package_data.version.is_none() {
1072        package_data.version = Some(version.to_string());
1073    }
1074
1075    if package_data.purl.is_none()
1076        || package_data.repository_homepage_url.is_none()
1077        || package_data.repository_download_url.is_none()
1078        || package_data.api_data_url.is_none()
1079    {
1080        let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1081            build_pypi_urls(
1082                package_data.name.as_deref(),
1083                package_data.version.as_deref(),
1084            );
1085
1086        if package_data.repository_homepage_url.is_none() {
1087            package_data.repository_homepage_url = repository_homepage_url;
1088        }
1089        if package_data.repository_download_url.is_none() {
1090            package_data.repository_download_url = repository_download_url;
1091        }
1092        if package_data.api_data_url.is_none() {
1093            package_data.api_data_url = api_data_url;
1094        }
1095        if package_data.purl.is_none() {
1096            package_data.purl = purl;
1097        }
1098    }
1099}
1100
/// Extracts package metadata from a wheel (`.whl`) zip archive.
///
/// Pipeline: size-gate the archive, validate every zip entry once, parse the
/// `.dist-info/METADATA` entry as RFC 822 metadata, then enrich the result
/// with the archive's own checksum, `.dist-info/RECORD` file references, and
/// name/version/tag data parsed from the wheel filename.
///
/// Any failure is logged via `warn!` and yields `default_package_data()`
/// instead of propagating an error.
fn extract_from_wheel_archive(path: &Path) -> PackageData {
    let metadata = match std::fs::metadata(path) {
        Ok(m) => m,
        Err(e) => {
            warn!(
                "Failed to read metadata for wheel archive {:?}: {}",
                path, e
            );
            return default_package_data();
        }
    };

    // Reject oversized archives before any decompression work.
    if metadata.len() > MAX_ARCHIVE_SIZE {
        warn!(
            "Wheel archive too large: {} bytes (limit: {} bytes)",
            metadata.len(),
            MAX_ARCHIVE_SIZE
        );
        return default_package_data();
    }

    let file = match File::open(path) {
        Ok(f) => f,
        Err(e) => {
            warn!("Failed to open wheel archive {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let mut archive = match ZipArchive::new(file) {
        Ok(a) => a,
        Err(e) => {
            warn!("Failed to read wheel archive {:?}: {}", path, e);
            return default_package_data();
        }
    };

    // Validate all entries up front; later reads only touch entries that
    // passed validation.
    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
        Ok(entries) => entries,
        Err(_) => return default_package_data(),
    };

    // METADATA is mandatory — without it there is nothing to parse.
    let metadata_entry =
        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
            Some(entry) => entry,
            None => {
                warn!("No METADATA file found in wheel archive {:?}", path);
                return default_package_data();
            }
        };

    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
        Ok(c) => c,
        Err(e) => {
            warn!("Failed to read METADATA from {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);

    // Checksums describe the archive file itself, not an inner entry.
    let (size, sha256) = calculate_file_checksums(path);
    package_data.size = size;
    package_data.sha256 = sha256;

    // RECORD (path,hash,size per file) is optional; ignore it on any error.
    if let Some(record_entry) =
        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
        && let Ok(record_content) =
            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
    {
        package_data.file_references = parse_record_csv(&record_content);
    }

    // Filename-derived info: fills missing name/version, sets the purl, and
    // records the wheel's compatibility tags.
    if let Some(wheel_info) = parse_wheel_filename(path) {
        if package_data.name.is_none() {
            package_data.name = Some(wheel_info.name.clone());
        }
        if package_data.version.is_none() {
            package_data.version = Some(wheel_info.version.clone());
        }

        package_data.purl = build_wheel_purl(
            package_data.name.as_deref(),
            package_data.version.as_deref(),
            &wheel_info,
        );

        let mut extra_data = package_data.extra_data.unwrap_or_default();
        // NOTE(review): this stores the wheel's *python tag* (e.g. "py3")
        // under the key "python_requires"; the key name looks off compared
        // to "abi_tag"/"platform_tag" below — confirm intended.
        extra_data.insert(
            "python_requires".to_string(),
            serde_json::Value::String(wheel_info.python_tag.clone()),
        );
        extra_data.insert(
            "abi_tag".to_string(),
            serde_json::Value::String(wheel_info.abi_tag.clone()),
        );
        extra_data.insert(
            "platform_tag".to_string(),
            serde_json::Value::String(wheel_info.platform_tag.clone()),
        );
        package_data.extra_data = Some(extra_data);
    }

    package_data
}
1206
/// Extracts package metadata from a legacy `.egg` zip archive.
///
/// Mirrors `extract_from_wheel_archive`: size gate, zip-entry validation,
/// RFC 822 parsing of `EGG-INFO/PKG-INFO` (or `*.egg-info/PKG-INFO`), then
/// enrichment with the archive checksum, `installed-files.txt` references,
/// and name/version parsed from the filename. No code inside the egg is
/// executed.
///
/// Any failure is logged via `warn!` and yields `default_package_data()`.
fn extract_from_egg_archive(path: &Path) -> PackageData {
    let metadata = match std::fs::metadata(path) {
        Ok(m) => m,
        Err(e) => {
            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
            return default_package_data();
        }
    };

    // Reject oversized archives before any decompression work.
    if metadata.len() > MAX_ARCHIVE_SIZE {
        warn!(
            "Egg archive too large: {} bytes (limit: {} bytes)",
            metadata.len(),
            MAX_ARCHIVE_SIZE
        );
        return default_package_data();
    }

    let file = match File::open(path) {
        Ok(f) => f,
        Err(e) => {
            warn!("Failed to open egg archive {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let mut archive = match ZipArchive::new(file) {
        Ok(a) => a,
        Err(e) => {
            warn!("Failed to read egg archive {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
        Ok(entries) => entries,
        Err(_) => return default_package_data(),
    };

    // PKG-INFO is mandatory; both directory layouts seen in eggs are tried.
    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
        &validated_entries,
        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
    ) {
        Some(entry) => entry,
        None => {
            warn!("No PKG-INFO file found in egg archive {:?}", path);
            return default_package_data();
        }
    };

    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
        Ok(c) => c,
        Err(e) => {
            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);

    // Checksums describe the archive file itself, not an inner entry.
    let (size, sha256) = calculate_file_checksums(path);
    package_data.size = size;
    package_data.sha256 = sha256;

    // installed-files.txt (one path per line) is optional; ignore on error.
    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
        &validated_entries,
        &[
            "EGG-INFO/installed-files.txt",
            ".egg-info/installed-files.txt",
        ],
    ) && let Ok(installed_files_content) =
        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
    {
        package_data.file_references = parse_installed_files_txt(&installed_files_content);
    }

    // Filename fallback for name/version; the optional third filename part
    // is recorded as "python_version" in extra_data.
    if let Some(egg_info) = parse_egg_filename(path) {
        if package_data.name.is_none() {
            package_data.name = Some(egg_info.name.clone());
        }
        if package_data.version.is_none() {
            package_data.version = Some(egg_info.version.clone());
        }

        if let Some(python_version) = &egg_info.python_version {
            let mut extra_data = package_data.extra_data.unwrap_or_default();
            extra_data.insert(
                "python_version".to_string(),
                serde_json::Value::String(python_version.clone()),
            );
            package_data.extra_data = Some(extra_data);
        }
    }

    // Unlike the wheel path, the purl is always rebuilt here from whatever
    // name/version ended up set (possibly overwriting one from PKG-INFO).
    package_data.purl = build_egg_purl(
        package_data.name.as_deref(),
        package_data.version.as_deref(),
    );

    package_data
}
1308
1309fn find_validated_zip_entry_by_suffix<'a>(
1310    entries: &'a [ValidatedZipEntry],
1311    suffix: &str,
1312) -> Option<&'a ValidatedZipEntry> {
1313    entries.iter().find(|entry| entry.name.ends_with(suffix))
1314}
1315
1316fn find_validated_zip_entry_by_any_suffix<'a>(
1317    entries: &'a [ValidatedZipEntry],
1318    suffixes: &[&str],
1319) -> Option<&'a ValidatedZipEntry> {
1320    entries
1321        .iter()
1322        .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1323}
1324
1325fn read_validated_zip_entry<R: Read + std::io::Seek>(
1326    archive: &mut ZipArchive<R>,
1327    entry: &ValidatedZipEntry,
1328    path: &Path,
1329    archive_type: &str,
1330) -> Result<String, String> {
1331    let mut file = archive
1332        .by_index(entry.index)
1333        .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1334
1335    let compressed_size = file.compressed_size();
1336    let uncompressed_size = file.size();
1337
1338    if compressed_size > 0 {
1339        let ratio = uncompressed_size as f64 / compressed_size as f64;
1340        if ratio > MAX_COMPRESSION_RATIO {
1341            return Err(format!(
1342                "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1343                archive_type, path, ratio
1344            ));
1345        }
1346    }
1347
1348    if uncompressed_size > MAX_FILE_SIZE {
1349        return Err(format!(
1350            "Rejected oversized entry in {} {:?}: {} bytes",
1351            archive_type, path, uncompressed_size
1352        ));
1353    }
1354
1355    read_limited_utf8(
1356        &mut file,
1357        MAX_FILE_SIZE,
1358        &format!("{} entry {}", archive_type, entry.name),
1359    )
1360}
1361
/// Reads at most `max_bytes` bytes from `reader` and decodes them as UTF-8.
///
/// One extra byte beyond the limit is requested so that "exactly at the
/// limit" and "over the limit" are distinguishable; `saturating_add` keeps
/// `max_bytes == u64::MAX` from overflowing (the previous `max_bytes + 1`
/// would panic in debug builds / wrap in release).
///
/// # Errors
/// Returns a formatted error string when the read fails, the source exceeds
/// `max_bytes`, or the bytes are not valid UTF-8. `context` labels errors.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if bytes.len() as u64 > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1382
/// Normalizes an archive entry path to a safe, relative, '/'-separated form.
///
/// Backslashes become forward slashes; Windows drive-letter paths ("C:/…"),
/// absolute paths, parent-dir traversal ("..") and prefix components are all
/// rejected with `None`. "." segments and duplicate separators are dropped.
/// Returns `None` when nothing remains after normalization.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Explicitly reject drive-letter paths: on Unix, Path::components would
    // otherwise treat "C:" as an ordinary leading segment.
    let raw = unified.as_bytes();
    if raw.len() >= 3 && raw[0].is_ascii_alphabetic() && raw[1] == b':' && raw[2] == b'/' {
        return None;
    }

    let mut segments: Vec<String> = Vec::new();
    for component in Path::new(&unified).components() {
        match component {
            Component::Normal(segment) => segments.push(segment.to_string_lossy().into_owned()),
            Component::CurDir => continue,
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1404
1405/// Parses RECORD CSV format from wheel archives (PEP 427).
1406/// Format: path,hash,size (3 columns, no header)
1407/// Hash format: sha256=urlsafe_base64_hash or empty
1408/// Size: bytes as u64 or empty
1409pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1410    let mut reader = ReaderBuilder::new()
1411        .has_headers(false)
1412        .from_reader(content.as_bytes());
1413
1414    let mut file_references = Vec::new();
1415
1416    for result in reader.records() {
1417        match result {
1418            Ok(record) => {
1419                if record.len() < 3 {
1420                    continue;
1421                }
1422
1423                let path = record.get(0).unwrap_or("").trim().to_string();
1424                if path.is_empty() {
1425                    continue;
1426                }
1427
1428                let hash_field = record.get(1).unwrap_or("").trim();
1429                let size_field = record.get(2).unwrap_or("").trim();
1430
1431                // Parse hash: format is "algorithm=value"
1432                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1433                    let parts: Vec<&str> = hash_field.split('=').collect();
1434                    if parts.len() == 2 && parts[0] == "sha256" {
1435                        // Decode base64 to hex
1436                        match URL_SAFE_NO_PAD.decode(parts[1]) {
1437                            Ok(decoded) => {
1438                                let hex = decoded
1439                                    .iter()
1440                                    .map(|b| format!("{:02x}", b))
1441                                    .collect::<String>();
1442                                Some(hex)
1443                            }
1444                            Err(_) => None,
1445                        }
1446                    } else {
1447                        None
1448                    }
1449                } else {
1450                    None
1451                };
1452
1453                // Parse size
1454                let size = if !size_field.is_empty() && size_field != "-" {
1455                    size_field.parse::<u64>().ok()
1456                } else {
1457                    None
1458                };
1459
1460                file_references.push(FileReference {
1461                    path,
1462                    size,
1463                    sha1: None,
1464                    md5: None,
1465                    sha256,
1466                    sha512: None,
1467                    extra_data: None,
1468                });
1469            }
1470            Err(e) => {
1471                warn!("Failed to parse RECORD CSV row: {}", e);
1472                continue;
1473            }
1474        }
1475    }
1476
1477    file_references
1478}
1479
1480/// Parses installed-files.txt format from egg archives (PEP 376).
1481/// Format: one file path per line, no headers, no hash, no size
1482pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1483    content
1484        .lines()
1485        .map(|line| line.trim())
1486        .filter(|line| !line.is_empty())
1487        .map(|path| FileReference {
1488            path: path.to_string(),
1489            size: None,
1490            sha1: None,
1491            md5: None,
1492            sha256: None,
1493            sha512: None,
1494            extra_data: None,
1495        })
1496        .collect()
1497}
1498
1499pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1500    content
1501        .lines()
1502        .map(str::trim)
1503        .filter(|line| !line.is_empty())
1504        .map(|path| FileReference {
1505            path: path.to_string(),
1506            size: None,
1507            sha1: None,
1508            md5: None,
1509            sha256: None,
1510            sha512: None,
1511            extra_data: None,
1512        })
1513        .collect()
1514}
1515
/// Components recovered from a wheel filename by `parse_wheel_filename`
/// (`{name}-{version}-{python}-{abi}-{platform}.whl`, PEP 427 layout).
struct WheelInfo {
    /// Distribution name with `_` normalized to `-`.
    name: String,
    /// Version string, taken verbatim from the filename.
    version: String,
    /// Python compatibility tag (third dash-separated filename part).
    python_tag: String,
    /// ABI tag (fourth part).
    abi_tag: String,
    /// Platform tag; remaining parts joined with '-', so it may itself
    /// contain dashes.
    platform_tag: String,
}
1523
1524fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
1525    let stem = path.file_stem()?.to_string_lossy();
1526    let parts: Vec<&str> = stem.split('-').collect();
1527
1528    if parts.len() >= 5 {
1529        Some(WheelInfo {
1530            name: parts[0].replace('_', "-"),
1531            version: parts[1].to_string(),
1532            python_tag: parts[2].to_string(),
1533            abi_tag: parts[3].to_string(),
1534            platform_tag: parts[4..].join("-"),
1535        })
1536    } else {
1537        None
1538    }
1539}
1540
/// Components recovered from an egg filename by `parse_egg_filename`
/// (`{name}-{version}[-{python_version}].egg`).
struct EggInfo {
    /// Distribution name with `_` normalized to `-`.
    name: String,
    /// Version string, taken verbatim from the filename.
    version: String,
    /// Optional third dash-separated filename part (e.g. a Python version
    /// marker); `None` when the filename has only two parts.
    python_version: Option<String>,
}
1546
1547fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
1548    let stem = path.file_stem()?.to_string_lossy();
1549    let parts: Vec<&str> = stem.split('-').collect();
1550
1551    if parts.len() >= 2 {
1552        Some(EggInfo {
1553            name: parts[0].replace('_', "-"),
1554            version: parts[1].to_string(),
1555            python_version: parts.get(2).map(|s| s.to_string()),
1556        })
1557    } else {
1558        None
1559    }
1560}
1561
1562fn build_wheel_purl(
1563    name: Option<&str>,
1564    version: Option<&str>,
1565    wheel_info: &WheelInfo,
1566) -> Option<String> {
1567    let name = name?;
1568    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1569
1570    if let Some(ver) = version {
1571        package_url.with_version(ver).ok()?;
1572    }
1573
1574    let extension = format!(
1575        "{}-{}-{}",
1576        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1577    );
1578    package_url.add_qualifier("extension", extension).ok()?;
1579
1580    Some(package_url.to_string())
1581}
1582
1583fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1584    let name = name?;
1585    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1586
1587    if let Some(ver) = version {
1588        package_url.with_version(ver).ok()?;
1589    }
1590
1591    package_url.add_qualifier("type", "egg").ok()?;
1592
1593    Some(package_url.to_string())
1594}
1595
1596fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1597    let metadata = super::rfc822::parse_rfc822_content(content);
1598    build_package_data_from_rfc822(&metadata, datasource_id)
1599}
1600
/// Builds PackageData from parsed RFC822 metadata.
///
/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
/// and `python_parse_rfc822_content` (content-based) functions.
///
/// Core-metadata headers map onto PackageData fields; `Project-URL` entries
/// are routed by label into bug-tracking / code-view / vcs / homepage URL
/// slots on a first-match-wins basis; license files become file references;
/// and leftover details (platform, requires_python, license_files,
/// changelog_url, project_urls) are collected into `extra_data`.
fn build_package_data_from_rfc822(
    metadata: &super::rfc822::Rfc822Metadata,
    datasource_id: DatasourceId,
) -> PackageData {
    use super::rfc822::{get_header_all, get_header_first};

    // Single-valued headers. `homepage_url` is mutable because a
    // Project-URL entry may fill it in later when Home-page is absent.
    let name = get_header_first(&metadata.headers, "name");
    let version = get_header_first(&metadata.headers, "version");
    let summary = get_header_first(&metadata.headers, "summary");
    let mut homepage_url = get_header_first(&metadata.headers, "home-page");
    let author = get_header_first(&metadata.headers, "author");
    let author_email = get_header_first(&metadata.headers, "author-email");
    let license = get_header_first(&metadata.headers, "license");
    let license_expression = get_header_first(&metadata.headers, "license-expression");
    let download_url = get_header_first(&metadata.headers, "download-url");
    let platform = get_header_first(&metadata.headers, "platform");
    let requires_python = get_header_first(&metadata.headers, "requires-python");
    // Multi-valued headers.
    let classifiers = get_header_all(&metadata.headers, "classifier");
    let license_files = get_header_all(&metadata.headers, "license-file");

    // The long description may live in the message body or in a Description
    // header; a non-empty body wins.
    let description_body = if metadata.body.is_empty() {
        get_header_first(&metadata.headers, "description").unwrap_or_default()
    } else {
        metadata.body.clone()
    };

    let description = build_description(summary.as_deref(), &description_body);

    // Author name/email (either may be missing) become one "author" party.
    let mut parties = Vec::new();
    if author.is_some() || author_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: author,
            email: author_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    // Classifiers are split into plain keywords and license classifiers.
    let (keywords, license_classifiers) = split_classifiers(&classifiers);
    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
        normalize_spdx_declared_license(license_expression.as_deref());

    // License-Expression takes precedence; otherwise fall back to the
    // License header combined with license classifiers.
    let extracted_license_statement = license_expression
        .clone()
        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));

    let mut extra_data = HashMap::new();
    // Skip empty or placeholder "UNKNOWN" platform values.
    if let Some(platform_value) = platform
        && !platform_value.eq_ignore_ascii_case("unknown")
        && !platform_value.is_empty()
    {
        extra_data.insert(
            "platform".to_string(),
            serde_json::Value::String(platform_value),
        );
    }

    if let Some(requires_python_value) = requires_python
        && !requires_python_value.is_empty()
    {
        extra_data.insert(
            "requires_python".to_string(),
            serde_json::Value::String(requires_python_value),
        );
    }

    if !license_files.is_empty() {
        extra_data.insert(
            "license_files".to_string(),
            serde_json::Value::Array(
                license_files
                    .iter()
                    .cloned()
                    .map(serde_json::Value::String)
                    .collect(),
            ),
        );
    }

    // License files are also surfaced as (path-only) file references.
    let file_references = license_files
        .iter()
        .map(|path| FileReference {
            path: path.clone(),
            size: None,
            sha1: None,
            md5: None,
            sha256: None,
            sha512: None,
            extra_data: None,
        })
        .collect();

    let project_urls = get_header_all(&metadata.headers, "project-url");
    let dependencies = extract_rfc822_dependencies(&metadata.headers);
    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);

    if !project_urls.is_empty() {
        let parsed_urls = parse_project_urls(&project_urls);

        // Each entry fills at most one slot (else-if chain), and a slot is
        // only written while still None — so earlier entries win.
        for (label, url) in &parsed_urls {
            let label_lower = label.to_lowercase();

            if bug_tracking_url.is_none()
                && matches!(
                    label_lower.as_str(),
                    "tracker"
                        | "bug reports"
                        | "bug tracker"
                        | "issues"
                        | "issue tracker"
                        | "github: issues"
                )
            {
                bug_tracking_url = Some(url.clone());
            } else if code_view_url.is_none()
                && matches!(label_lower.as_str(), "source" | "source code" | "code")
            {
                code_view_url = Some(url.clone());
            } else if vcs_url.is_none()
                && matches!(
                    label_lower.as_str(),
                    "github" | "gitlab" | "github: repo" | "repository"
                )
            {
                vcs_url = Some(url.clone());
            } else if homepage_url.is_none()
                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
            {
                homepage_url = Some(url.clone());
            } else if label_lower == "changelog" {
                extra_data.insert(
                    "changelog_url".to_string(),
                    serde_json::Value::String(url.clone()),
                );
            }
        }

        // All labeled URLs are preserved verbatim under extra_data.
        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
            .iter()
            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
            .collect();

        if !project_urls_json.is_empty() {
            extra_data.insert(
                "project_urls".to_string(),
                serde_json::Value::Object(project_urls_json),
            );
        }
    }

    let extra_data = if extra_data.is_empty() {
        None
    } else {
        Some(extra_data)
    };

    // Registry URLs and the purl derive from name/version alone.
    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
        build_pypi_urls(name.as_deref(), version.as_deref());

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: Some("Python".to_string()),
        description,
        release_date: None,
        parties,
        keywords,
        homepage_url,
        download_url,
        // Checksums are left unset here; archive-based callers fill them in.
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url,
        code_view_url,
        vcs_url,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references,
        is_private: false,
        is_virtual: false,
        extra_data,
        dependencies,
        repository_homepage_url,
        repository_download_url,
        api_data_url,
        datasource_id: Some(datasource_id),
        purl,
    }
}
1813
/// Parses `Project-URL` metadata entries of the form `"<label>, <url>"`
/// into `(label, url)` pairs.
///
/// Per the core-metadata spec the label and URL are separated by a comma;
/// the space after the comma is conventional but not guaranteed, so we
/// split on the first `,` and trim both sides. Entries with an empty label
/// or URL, or with no comma at all, are silently dropped.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|url_entry| {
            let (label, url) = url_entry.split_once(',')?;
            let label = label.trim();
            let url = url.trim();
            if label.is_empty() || url.is_empty() {
                return None;
            }
            Some((label.to_string(), url.to_string()))
        })
        .collect()
}
1829
/// Combines an optional one-line summary with a longer description body.
///
/// Both pieces are trimmed and empty pieces are skipped. When both are
/// present they are joined with a single newline, summary first. Returns
/// `None` when neither contributes any text.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let summary = summary.map(str::trim).filter(|text| !text.is_empty());
    let body = Some(body.trim()).filter(|text| !text.is_empty());

    let pieces: Vec<&str> = summary.into_iter().chain(body).collect();
    if pieces.is_empty() {
        None
    } else {
        Some(pieces.join("\n"))
    }
}
1848
/// Splits trove classifiers into `(keywords, license_classifiers)`.
///
/// Classifiers under the `License ::` trove prefix carry licensing
/// information and are separated out; everything else is treated as a
/// keyword. Input order is preserved within each bucket.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    classifiers
        .iter()
        .cloned()
        .partition(|classifier| !classifier.starts_with("License ::"))
}
1863
/// Renders the declared license and license classifiers as a small
/// YAML-like statement, e.g.:
///
/// ```text
/// license: MIT
/// classifiers:
///   - 'License :: OSI Approved :: MIT License'
/// ```
///
/// Returns `None` when there is nothing to report; otherwise the result
/// always ends with a trailing newline.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let mut statement = String::new();

    match license.map(str::trim) {
        Some(value) if !value.is_empty() => {
            statement.push_str("license: ");
            statement.push_str(value);
            statement.push('\n');
        }
        _ => {}
    }

    if !license_classifiers.is_empty() {
        statement.push_str("classifiers:\n");
        for classifier in license_classifiers {
            statement.push_str(&format!("  - '{}'\n", classifier));
        }
    }

    if statement.is_empty() {
        None
    } else {
        Some(statement)
    }
}
1889
1890pub(crate) fn build_pypi_urls(
1891    name: Option<&str>,
1892    version: Option<&str>,
1893) -> (
1894    Option<String>,
1895    Option<String>,
1896    Option<String>,
1897    Option<String>,
1898) {
1899    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
1900
1901    let repository_download_url = name.and_then(|value| {
1902        version.map(|ver| {
1903            format!(
1904                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
1905                &value[..1.min(value.len())],
1906                value,
1907                value,
1908                ver
1909            )
1910        })
1911    });
1912
1913    let api_data_url = name.map(|value| {
1914        if let Some(ver) = version {
1915            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
1916        } else {
1917            format!("https://pypi.org/pypi/{}/json", value)
1918        }
1919    });
1920
1921    let purl = name.and_then(|value| {
1922        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
1923        if let Some(ver) = version {
1924            package_url.with_version(ver).ok()?;
1925        }
1926        Some(package_url.to_string())
1927    });
1928
1929    (
1930        repository_homepage_url,
1931        repository_download_url,
1932        api_data_url,
1933        purl,
1934    )
1935}
1936
1937fn build_pypi_purl_with_extension(
1938    name: &str,
1939    version: Option<&str>,
1940    extension: &str,
1941) -> Option<String> {
1942    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1943    if let Some(ver) = version {
1944        package_url.with_version(ver).ok()?;
1945    }
1946    package_url.add_qualifier("extension", extension).ok()?;
1947    Some(package_url.to_string())
1948}
1949
/// Extracts `PackageData` from a pyproject.toml file.
///
/// Supports PEP 621 `[project]` tables, poetry's `[tool.poetry]` table, and
/// files that place metadata fields at the top level. On read/parse failure
/// or when no recognizable project data is found, logs a warning and
/// returns `default_package_data()`.
fn extract_from_pyproject_toml(path: &Path) -> PackageData {
    let toml_content = match read_toml_file(path) {
        Ok(content) => content,
        Err(e) => {
            warn!(
                "Failed to read or parse pyproject.toml at {:?}: {}",
                path, e
            );
            return default_package_data();
        }
    };

    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());

    // Handle both PEP 621 (project table) and poetry formats
    let project_table =
        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
            // Standard PEP 621 format with [project] table
            project.clone()
        } else if let Some(tool) = tool_table {
            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
                // Poetry format with [tool.poetry] table
                poetry.clone()
            } else {
                warn!(
                    "No project or tool.poetry data found in pyproject.toml at {:?}",
                    path
                );
                return default_package_data();
            }
        } else if toml_content.get(FIELD_NAME).is_some() {
            // Other format with top-level fields
            match toml_content.as_table() {
                Some(table) => table.clone(),
                None => {
                    warn!("Failed to convert TOML content to table in {:?}", path);
                    return default_package_data();
                }
            }
        } else {
            warn!("No project data found in pyproject.toml at {:?}", path);
            return default_package_data();
        };

    let name = project_table
        .get(FIELD_NAME)
        .and_then(|v| v.as_str())
        .map(String::from);

    let version = project_table
        .get(FIELD_VERSION)
        .and_then(|v| v.as_str())
        .map(String::from);
    // Trove classifiers; used below only for the private-package check.
    let classifiers = project_table
        .get("classifiers")
        .and_then(|value| value.as_array())
        .map(|values| {
            values
                .iter()
                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();

    let extracted_license_statement = extract_raw_license_string(&project_table);
    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
        normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));

    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
    let (homepage_url, repository_url) = extract_urls(&project_table);

    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
    let extra_data = extract_pyproject_extra_data(&toml_content);

    // Create package URL
    let purl = name.as_ref().and_then(|n| {
        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
            Ok(p) => p,
            Err(e) => {
                warn!(
                    "Failed to create PackageUrl for Python package '{}': {}",
                    n, e
                );
                return None;
            }
        };

        if let Some(v) = &version
            && let Err(e) = package_url.with_version(v)
        {
            warn!(
                "Failed to set version '{}' for Python package '{}': {}",
                v, n, e
            );
            return None;
        }

        Some(package_url.to_string())
    });

    let api_data_url = name.as_ref().map(|n| {
        if let Some(v) = &version {
            format!("https://pypi.org/pypi/{}/{}/json", n, v)
        } else {
            format!("https://pypi.org/pypi/{}/json", n)
        }
    });

    let pypi_homepage_url = name
        .as_ref()
        .map(|n| format!("https://pypi.org/project/{}", n));

    // NOTE(review): byte-slicing the first character (`&n[..1]`) panics if
    // the name starts with a multi-byte UTF-8 char — confirm names are
    // ASCII-only here, or switch to a char-boundary-safe prefix as done in
    // build_pypi_urls.
    let pypi_download_url = name.as_ref().and_then(|n| {
        version.as_ref().map(|v| {
            format!(
                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
                &n[..1.min(n.len())],
                n,
                n,
                v
            )
        })
    });

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: None,
        description: None,
        release_date: None,
        parties: extract_parties(&project_table),
        keywords: Vec::new(),
        // Manifest-declared URLs win; PyPI-derived URLs are fallbacks.
        homepage_url: homepage_url.or(pypi_homepage_url),
        download_url: repository_url.clone().or(pypi_download_url),
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url: None,
        code_view_url: None,
        vcs_url: repository_url,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references: Vec::new(),
        is_private: has_private_classifier(&classifiers),
        is_virtual: false,
        extra_data,
        // Required deps first, then all optional/extra/group deps.
        dependencies: [dependencies, optional_dependencies].concat(),
        repository_homepage_url: None,
        repository_download_url: None,
        api_data_url,
        datasource_id: Some(DatasourceId::PypiPyprojectToml),
        purl,
    }
}
2119
2120fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2121    project
2122        .get(FIELD_LICENSE)
2123        .and_then(|license_value| match license_value {
2124            TomlValue::String(license_str) => Some(license_str.clone()),
2125            TomlValue::Table(license_table) => license_table
2126                .get("text")
2127                .and_then(|v| v.as_str())
2128                .map(|s| s.to_string())
2129                .or_else(|| {
2130                    license_table
2131                        .get("expression")
2132                        .and_then(|v| v.as_str())
2133                        .map(|expr| expr.to_string())
2134                }),
2135            _ => None,
2136        })
2137}
2138
2139fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2140    match project.get(FIELD_LICENSE) {
2141        Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2142        Some(TomlValue::Table(license_table)) => license_table
2143            .get("expression")
2144            .and_then(|value| value.as_str()),
2145        _ => None,
2146    }
2147}
2148
2149fn extract_urls(project: &TomlMap<String, TomlValue>) -> (Option<String>, Option<String>) {
2150    let mut homepage_url = None;
2151    let mut repository_url = None;
2152
2153    // Check for URLs table
2154    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2155        homepage_url = urls
2156            .get(FIELD_HOMEPAGE)
2157            .and_then(|v| v.as_str())
2158            .map(String::from);
2159        repository_url = urls
2160            .get(FIELD_REPOSITORY)
2161            .and_then(|v| v.as_str())
2162            .map(String::from);
2163    }
2164
2165    // If not found in URLs table, check for top-level keys
2166    if homepage_url.is_none() {
2167        homepage_url = project
2168            .get(FIELD_HOMEPAGE)
2169            .and_then(|v| v.as_str())
2170            .map(String::from);
2171    }
2172
2173    if repository_url.is_none() {
2174        repository_url = project
2175            .get(FIELD_REPOSITORY)
2176            .and_then(|v| v.as_str())
2177            .map(String::from);
2178    }
2179
2180    (homepage_url, repository_url)
2181}
2182
2183fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2184    let mut parties = Vec::new();
2185
2186    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2187        for author in authors {
2188            if let Some(author_str) = author.as_str() {
2189                let (name, email) = split_name_email(author_str);
2190                parties.push(Party {
2191                    r#type: None,
2192                    role: Some("author".to_string()),
2193                    name,
2194                    email,
2195                    url: None,
2196                    organization: None,
2197                    organization_url: None,
2198                    timezone: None,
2199                });
2200            }
2201        }
2202    }
2203
2204    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2205        for maintainer in maintainers {
2206            if let Some(maintainer_str) = maintainer.as_str() {
2207                let (name, email) = split_name_email(maintainer_str);
2208                parties.push(Party {
2209                    r#type: None,
2210                    role: Some("maintainer".to_string()),
2211                    name,
2212                    email,
2213                    url: None,
2214                    organization: None,
2215                    organization_url: None,
2216                    timezone: None,
2217                });
2218            }
2219        }
2220    }
2221
2222    parties
2223}
2224
2225fn extract_dependencies(
2226    project: &TomlMap<String, TomlValue>,
2227    toml_content: &TomlValue,
2228) -> (Vec<Dependency>, Vec<Dependency>) {
2229    let mut dependencies = Vec::new();
2230    let mut optional_dependencies = Vec::new();
2231
2232    // Handle dependencies - can be array or table format
2233    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2234        match deps_value {
2235            TomlValue::Array(arr) => {
2236                dependencies = parse_dependency_array(arr, false, None);
2237            }
2238            TomlValue::Table(table) => {
2239                dependencies = parse_dependency_table(table, false, None);
2240            }
2241            _ => {}
2242        }
2243    }
2244
2245    // Handle PEP 621 optional-dependencies with scope
2246    if let Some(opt_deps_table) = project
2247        .get(FIELD_OPTIONAL_DEPENDENCIES)
2248        .and_then(|v| v.as_table())
2249    {
2250        for (extra_name, deps) in opt_deps_table {
2251            match deps {
2252                TomlValue::Array(arr) => {
2253                    optional_dependencies.extend(parse_dependency_array(
2254                        arr,
2255                        true,
2256                        Some(extra_name),
2257                    ));
2258                }
2259                TomlValue::Table(table) => {
2260                    optional_dependencies.extend(parse_dependency_table(
2261                        table,
2262                        true,
2263                        Some(extra_name),
2264                    ));
2265                }
2266                _ => {}
2267            }
2268        }
2269    }
2270
2271    // Handle Poetry dev-dependencies
2272    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2273        match dev_deps_value {
2274            TomlValue::Array(arr) => {
2275                optional_dependencies.extend(parse_dependency_array(
2276                    arr,
2277                    true,
2278                    Some(FIELD_DEV_DEPENDENCIES),
2279                ));
2280            }
2281            TomlValue::Table(table) => {
2282                optional_dependencies.extend(parse_dependency_table(
2283                    table,
2284                    true,
2285                    Some(FIELD_DEV_DEPENDENCIES),
2286                ));
2287            }
2288            _ => {}
2289        }
2290    }
2291
2292    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
2293    if let Some(groups_table) = project.get("group").and_then(|v| v.as_table()) {
2294        for (group_name, group_data) in groups_table {
2295            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2296                match group_deps {
2297                    TomlValue::Array(arr) => {
2298                        optional_dependencies.extend(parse_dependency_array(
2299                            arr,
2300                            true,
2301                            Some(group_name),
2302                        ));
2303                    }
2304                    TomlValue::Table(table) => {
2305                        optional_dependencies.extend(parse_dependency_table(
2306                            table,
2307                            true,
2308                            Some(group_name),
2309                        ));
2310                    }
2311                    _ => {}
2312                }
2313            }
2314        }
2315    }
2316
2317    if let Some(groups_table) = toml_content
2318        .get(FIELD_DEPENDENCY_GROUPS)
2319        .and_then(|value| value.as_table())
2320    {
2321        for (group_name, deps) in groups_table {
2322            match deps {
2323                TomlValue::Array(arr) => {
2324                    optional_dependencies.extend(parse_dependency_array(
2325                        arr,
2326                        true,
2327                        Some(group_name),
2328                    ));
2329                }
2330                TomlValue::Table(table) => {
2331                    optional_dependencies.extend(parse_dependency_table(
2332                        table,
2333                        true,
2334                        Some(group_name),
2335                    ));
2336                }
2337                _ => {}
2338            }
2339        }
2340    }
2341
2342    if let Some(dev_deps_value) = toml_content
2343        .get("tool")
2344        .and_then(|value| value.as_table())
2345        .and_then(|tool| tool.get("uv"))
2346        .and_then(|value| value.as_table())
2347        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2348    {
2349        match dev_deps_value {
2350            TomlValue::Array(arr) => {
2351                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2352            }
2353            TomlValue::Table(table) => {
2354                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2355            }
2356            _ => {}
2357        }
2358    }
2359
2360    (dependencies, optional_dependencies)
2361}
2362
2363fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2364    let mut extra_data = HashMap::new();
2365
2366    if let Some(tool_uv) = toml_content
2367        .get("tool")
2368        .and_then(|value| value.as_table())
2369        .and_then(|tool| tool.get("uv"))
2370    {
2371        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2372    }
2373
2374    if extra_data.is_empty() {
2375        None
2376    } else {
2377        Some(extra_data)
2378    }
2379}
2380
2381fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2382    match value {
2383        TomlValue::String(value) => JsonValue::String(value.clone()),
2384        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2385        TomlValue::Float(value) => JsonValue::String(value.to_string()),
2386        TomlValue::Boolean(value) => JsonValue::Bool(*value),
2387        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2388        TomlValue::Array(values) => {
2389            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2390        }
2391        TomlValue::Table(values) => JsonValue::Object(
2392            values
2393                .iter()
2394                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2395                .collect::<JsonMap<String, JsonValue>>(),
2396        ),
2397    }
2398}
2399
2400fn parse_dependency_table(
2401    table: &TomlMap<String, TomlValue>,
2402    is_optional: bool,
2403    scope: Option<&str>,
2404) -> Vec<Dependency> {
2405    table
2406        .iter()
2407        .filter_map(|(name, version)| {
2408            let version_str = version.as_str().map(|s| s.to_string());
2409            let mut package_url =
2410                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2411
2412            if let Some(v) = &version_str {
2413                package_url.with_version(v).ok()?;
2414            }
2415
2416            Some(Dependency {
2417                purl: Some(package_url.to_string()),
2418                extracted_requirement: None,
2419                scope: scope.map(|s| s.to_string()),
2420                is_runtime: Some(!is_optional),
2421                is_optional: Some(is_optional),
2422                is_pinned: None,
2423                is_direct: Some(true),
2424                resolved_package: None,
2425                extra_data: None,
2426            })
2427        })
2428        .collect()
2429}
2430
2431fn parse_dependency_array(
2432    array: &[TomlValue],
2433    is_optional: bool,
2434    scope: Option<&str>,
2435) -> Vec<Dependency> {
2436    array
2437        .iter()
2438        .filter_map(|dep| {
2439            let dep_str = dep.as_str()?;
2440
2441            let mut parts = dep_str.split(['>', '=', '<', '~']);
2442            let name = parts.next()?.trim().to_string();
2443
2444            let version = parts.next().map(|v| v.trim().to_string());
2445
2446            let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name)
2447            {
2448                Ok(purl) => purl,
2449                Err(_) => return None,
2450            };
2451
2452            if let Some(ref v) = version {
2453                package_url.with_version(v).ok()?;
2454            }
2455
2456            Some(Dependency {
2457                purl: Some(package_url.to_string()),
2458                extracted_requirement: None,
2459                scope: scope.map(|s| s.to_string()),
2460                is_runtime: Some(!is_optional),
2461                is_optional: Some(is_optional),
2462                is_pinned: None,
2463                is_direct: Some(true),
2464                resolved_package: None,
2465                extra_data: None,
2466            })
2467        })
2468        .collect()
2469}
2470
/// A Python literal value recovered from setup.py source by static AST
/// evaluation (no code execution). Only the literal shapes the evaluator
/// understands are representable.
#[derive(Debug, Clone)]
enum Value {
    String(String),
    /// All numeric literals are widened to f64 (ints go through their
    /// decimal string form).
    Number(f64),
    Bool(bool),
    /// Python's `None`.
    None,
    List(Vec<Value>),
    Tuple(Vec<Value>),
    /// A dict whose keys were convertible to strings.
    Dict(HashMap<String, Value>),
}
2481
/// Statically evaluates a restricted subset of Python literal expressions
/// from a parsed setup.py AST, with depth and node-count budgets to bound
/// work on adversarial or pathological input.
struct LiteralEvaluator {
    /// Known `name = literal` bindings, consulted when an expression
    /// references a bare name.
    constants: HashMap<String, Value>,
    /// Maximum recursion depth before evaluation gives up.
    max_depth: usize,
    /// Maximum number of AST nodes visited before evaluation gives up.
    max_nodes: usize,
    /// Running count of visited nodes, compared against `max_nodes`.
    nodes_visited: usize,
}
2488
impl LiteralEvaluator {
    /// Creates an evaluator seeded with known constant bindings and the
    /// module-level depth/node budgets.
    fn new(constants: HashMap<String, Value>) -> Self {
        Self {
            constants,
            max_depth: MAX_SETUP_PY_AST_DEPTH,
            max_nodes: MAX_SETUP_PY_AST_NODES,
            nodes_visited: 0,
        }
    }

    /// Records a `name = literal` binding for later `Name` lookups.
    fn insert_constant(&mut self, name: String, value: Value) {
        self.constants.insert(name, value);
    }

    /// Statically evaluates an expression to a literal `Value`.
    ///
    /// Supported forms: constants, previously bound names, list/tuple/dict
    /// displays, `dict(key=value, ...)` calls, and `OrderedDict([...])` /
    /// `collections.OrderedDict([...])` calls. Returns `None` for anything
    /// else or when the depth/node budget is exhausted.
    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
        // Budget check: refuse to recurse unboundedly into deeply nested
        // or adversarial ASTs.
        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
            return None;
        }
        self.nodes_visited += 1;

        match expr {
            ast::Expr::Constant(ast::ExprConstant { value, .. }) => self.evaluate_constant(value),
            // Bare names resolve through previously recorded constants.
            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
            ast::Expr::List(ast::ExprList { elts, .. }) => {
                let mut values = Vec::new();
                for elt in elts {
                    values.push(self.evaluate_expr(elt, depth + 1)?);
                }
                Some(Value::List(values))
            }
            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
                let mut values = Vec::new();
                for elt in elts {
                    values.push(self.evaluate_expr(elt, depth + 1)?);
                }
                Some(Value::Tuple(values))
            }
            ast::Expr::Dict(ast::ExprDict { keys, values, .. }) => {
                let mut dict = HashMap::new();
                for (key_expr, value_expr) in keys.iter().zip(values.iter()) {
                    // A `None` key means a `**mapping` splat, which we
                    // cannot evaluate — abort the whole dict.
                    let key_expr = key_expr.as_ref()?;
                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
                    let key = value_to_string(&key_value)?;
                    let value = self.evaluate_expr(value_expr, depth + 1)?;
                    dict.insert(key, value);
                }
                Some(Value::Dict(dict))
            }
            ast::Expr::Call(ast::ExprCall {
                func,
                args,
                keywords,
                ..
            }) => {
                // OrderedDict([...]) with a single positional argument and
                // no keyword arguments is treated like a dict literal.
                if keywords.is_empty()
                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
                {
                    return self.evaluate_ordered_dict(args, depth + 1);
                }

                // Beyond OrderedDict, only keyword-only calls are eligible.
                if !args.is_empty() {
                    return None;
                }

                // dict(key=value, ...): keyword arguments become entries.
                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
                    && id == "dict"
                {
                    let mut dict = HashMap::new();
                    for keyword in keywords {
                        // A missing arg name means a `**mapping` splat —
                        // abort the whole dict.
                        let key = keyword.arg.as_ref().map(|name| name.as_str())?;
                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
                        dict.insert(key.to_string(), value);
                    }
                    return Some(Value::Dict(dict));
                }

                None
            }
            _ => None,
        }
    }

    /// Converts a Python AST constant into a `Value`. Ints are parsed from
    /// their decimal string form into f64; unsupported constant kinds
    /// (bytes, ellipsis, complex) yield `None`.
    fn evaluate_constant(&self, constant: &ast::Constant) -> Option<Value> {
        match constant {
            ast::Constant::Str(value) => Some(Value::String(value.clone())),
            ast::Constant::Bool(value) => Some(Value::Bool(*value)),
            ast::Constant::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
            ast::Constant::Float(value) => Some(Value::Number(*value)),
            ast::Constant::None => Some(Value::None),
            _ => None,
        }
    }

    /// Evaluates `OrderedDict([...])`: expects exactly one argument that
    /// evaluates to a list or tuple of 2-tuples, each becoming one dict
    /// entry (keys converted to strings). Any deviation yields `None`.
    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
        if args.len() != 1 {
            return None;
        }

        let items = match self.evaluate_expr(&args[0], depth)? {
            Value::List(items) | Value::Tuple(items) => items,
            _ => return None,
        };

        let mut dict = HashMap::new();
        for item in items {
            let Value::Tuple(values) = item else {
                return None;
            };
            if values.len() != 2 {
                return None;
            }
            let key = value_to_string(&values[0])?;
            dict.insert(key, values[1].clone());
        }

        Some(Value::Dict(dict))
    }
}
2608
/// Tracks how `setup()` may be referenced within a setup.py module.
/// Presumably populated while scanning import statements — confirm against
/// the code that fills it (not visible in this chunk).
#[derive(Default)]
struct SetupAliases {
    /// Local names believed to resolve to the setup() callable itself.
    setup_names: HashSet<String>,
    /// Local alias → module name (e.g. `import setuptools as st`).
    module_aliases: HashMap<String, String>,
}
2614
2615fn extract_from_setup_py(path: &Path) -> PackageData {
2616    let content = match read_file_to_string(path) {
2617        Ok(content) => content,
2618        Err(e) => {
2619            warn!("Failed to read setup.py at {:?}: {}", path, e);
2620            return default_package_data();
2621        }
2622    };
2623
2624    if content.len() > MAX_SETUP_PY_BYTES {
2625        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
2626        return extract_from_setup_py_regex(&content);
2627    }
2628
2629    let mut package_data = match extract_from_setup_py_ast(&content) {
2630        Ok(Some(data)) => data,
2631        Ok(None) => extract_from_setup_py_regex(&content),
2632        Err(e) => {
2633            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
2634            extract_from_setup_py_regex(&content)
2635        }
2636    };
2637
2638    if package_data.name.is_none() {
2639        package_data.name = extract_setup_value(&content, "name");
2640    }
2641
2642    if package_data.version.is_none() {
2643        package_data.version = extract_setup_value(&content, "version");
2644    }
2645
2646    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
2647
2648    if package_data.purl.is_none() {
2649        package_data.purl = build_setup_py_purl(
2650            package_data.name.as_deref(),
2651            package_data.version.as_deref(),
2652        );
2653    }
2654
2655    package_data
2656}
2657
2658fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
2659    if package_data.version.is_some()
2660        && package_data.extracted_license_statement.is_some()
2661        && package_data
2662            .parties
2663            .iter()
2664            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
2665    {
2666        return;
2667    }
2668
2669    let Some(root) = path.parent() else {
2670        return;
2671    };
2672
2673    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
2674
2675    if package_data.version.is_none() {
2676        package_data.version = dunder_metadata.version;
2677    }
2678
2679    if package_data.extracted_license_statement.is_none() {
2680        package_data.extracted_license_statement = dunder_metadata.license;
2681    }
2682
2683    let has_author = package_data
2684        .parties
2685        .iter()
2686        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
2687
2688    if !has_author && let Some(author) = dunder_metadata.author {
2689        package_data.parties.push(Party {
2690            r#type: Some("person".to_string()),
2691            role: Some("author".to_string()),
2692            name: Some(author),
2693            email: None,
2694            url: None,
2695            organization: None,
2696            organization_url: None,
2697            timezone: None,
2698        });
2699    }
2700}
2701
/// Metadata values harvested from module-level dunder assignments
/// (`__version__`, `__author__`, `__license__`) in sibling Python files.
#[derive(Default)]
struct DunderMetadata {
    // Value of `__version__`, if found.
    version: Option<String>,
    // Value of `__author__`, if found.
    author: Option<String>,
    // Value of `__license__`, if found.
    license: Option<String>,
}
2708
2709fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
2710    let statements = match ast::Suite::parse(content, "<setup.py>") {
2711        Ok(statements) => statements,
2712        Err(_) => return DunderMetadata::default(),
2713    };
2714
2715    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2716    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2717    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2718    let mut metadata = DunderMetadata::default();
2719
2720    for module in imported_dunder_modules(&statements) {
2721        let Some(path) = resolve_imported_module_path(root, &module) else {
2722            continue;
2723        };
2724        let Ok(module_content) = read_file_to_string(&path) else {
2725            continue;
2726        };
2727
2728        if metadata.version.is_none() {
2729            metadata.version = version_re
2730                .as_ref()
2731                .and_then(|regex| regex.captures(&module_content))
2732                .and_then(|captures| captures.get(1))
2733                .map(|match_| match_.as_str().to_string());
2734        }
2735
2736        if metadata.author.is_none() {
2737            metadata.author = author_re
2738                .as_ref()
2739                .and_then(|regex| regex.captures(&module_content))
2740                .and_then(|captures| captures.get(1))
2741                .map(|match_| match_.as_str().to_string());
2742        }
2743
2744        if metadata.license.is_none() {
2745            metadata.license = license_re
2746                .as_ref()
2747                .and_then(|regex| regex.captures(&module_content))
2748                .and_then(|captures| captures.get(1))
2749                .map(|match_| match_.as_str().to_string());
2750        }
2751
2752        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
2753            return metadata;
2754        }
2755    }
2756
2757    metadata
2758}
2759
2760fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
2761    let mut modules = Vec::new();
2762
2763    for statement in statements {
2764        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
2765            continue;
2766        };
2767        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
2768            continue;
2769        };
2770        let imports_dunder = names.iter().any(|alias| {
2771            matches!(
2772                alias.name.as_str(),
2773                "__version__" | "__author__" | "__license__"
2774            )
2775        });
2776        if imports_dunder {
2777            modules.push(module.to_string());
2778        }
2779    }
2780
2781    modules
2782}
2783
/// Resolves a dotted Python module name to an on-disk file under `root`.
///
/// Checks both the flat layout and the `src/` layout, as a plain module file
/// (`foo/bar.py`) or a package (`foo/bar/__init__.py`), and returns the first
/// candidate that exists.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let relative: PathBuf = module.split('.').collect();
    let module_file = relative.with_extension("py");
    let package_init = relative.join("__init__.py");

    [
        root.join(&module_file),
        root.join(&package_init),
        root.join("src").join(&module_file),
        root.join("src").join(&package_init),
    ]
    .into_iter()
    .find(|candidate| candidate.exists())
}
2795
2796/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
2797///
2798/// # Security Model
2799///
2800/// This function parses setup.py as a Python AST and evaluates only literal values
2801/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
2802/// arbitrary code execution during scanning.
2803///
2804/// # DoS Prevention
2805///
2806/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
2807/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
2808/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
2809///
2810/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
2811fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
2812    let statements = ast::Suite::parse(content, "<setup.py>").map_err(|e| format!("{}", e))?;
2813    let aliases = collect_setup_aliases(&statements);
2814    let mut evaluator = LiteralEvaluator::new(HashMap::new());
2815    build_setup_py_constants(&statements, &mut evaluator);
2816
2817    let setup_call = find_setup_call(&statements, &aliases);
2818    let Some(call_expr) = setup_call else {
2819        return Ok(None);
2820    };
2821
2822    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
2823    Ok(Some(build_setup_py_package_data(&setup_values)))
2824}
2825
2826fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
2827    for stmt in statements {
2828        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
2829            if targets.len() != 1 {
2830                continue;
2831            }
2832
2833            let Some(name) = extract_assign_name(&targets[0]) else {
2834                continue;
2835            };
2836
2837            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
2838                evaluator.insert_constant(name, value);
2839            }
2840        }
2841    }
2842}
2843
2844fn extract_assign_name(target: &ast::Expr) -> Option<String> {
2845    match target {
2846        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
2847        _ => None,
2848    }
2849}
2850
2851fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
2852    let mut aliases = SetupAliases::default();
2853    aliases.setup_names.insert("setup".to_string());
2854
2855    for stmt in statements {
2856        match stmt {
2857            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
2858                for alias in names {
2859                    let module_name = alias.name.as_str();
2860                    if !is_setup_module(module_name) {
2861                        continue;
2862                    }
2863                    let alias_name = alias
2864                        .asname
2865                        .as_ref()
2866                        .map(|name| name.as_str())
2867                        .unwrap_or(module_name);
2868                    aliases
2869                        .module_aliases
2870                        .insert(alias_name.to_string(), module_name.to_string());
2871                }
2872            }
2873            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
2874                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
2875                    continue;
2876                };
2877                if !is_setup_module(module_name) {
2878                    continue;
2879                }
2880                for alias in names {
2881                    if alias.name.as_str() != "setup" {
2882                        continue;
2883                    }
2884                    let alias_name = alias
2885                        .asname
2886                        .as_ref()
2887                        .map(|name| name.as_str())
2888                        .unwrap_or("setup");
2889                    aliases.setup_names.insert(alias_name.to_string());
2890                }
2891            }
2892            _ => {}
2893        }
2894    }
2895
2896    aliases
2897}
2898
/// True for the module names whose `setup` function drives packaging:
/// `setuptools`, `distutils`, or `distutils.core`.
fn is_setup_module(module_name: &str) -> bool {
    ["setuptools", "distutils", "distutils.core"]
        .iter()
        .any(|known| *known == module_name)
}
2902
2903fn find_setup_call<'a>(
2904    statements: &'a [ast::Stmt],
2905    aliases: &'a SetupAliases,
2906) -> Option<&'a ast::Expr> {
2907    let mut finder = SetupCallFinder {
2908        aliases,
2909        nodes_visited: 0,
2910    };
2911    finder.find_in_statements(statements)
2912}
2913
/// AST walker that locates the first `setup(...)` call while enforcing a
/// global budget on visited nodes (DoS guard for pathological inputs).
struct SetupCallFinder<'a> {
    // Names/aliases under which `setup` may be called, plus module aliases.
    aliases: &'a SetupAliases,
    // Running count of statements/expressions visited; traversal stops once
    // this reaches MAX_SETUP_PY_AST_NODES.
    nodes_visited: usize,
}
2918
impl<'a> SetupCallFinder<'a> {
    /// Depth-first search over a statement list for a `setup(...)` call.
    ///
    /// Recurses into the bodies of `if`/`for`/`while`/`with`/`try` blocks,
    /// and inspects expression statements and assignment right-hand sides.
    /// Returns None once the node budget is exhausted, even if a call exists
    /// deeper in the tree.
    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
        for stmt in statements {
            // Budget check before each statement keeps the count global
            // across all recursive calls (self is shared state).
            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
                return None;
            }
            self.nodes_visited += 1;

            let found = match stmt {
                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
                ast::Stmt::If(ast::StmtIf { body, orelse, .. }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse)),
                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse)),
                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
                // try/try* blocks: search body, else, finally, then each
                // except handler's body, in that order.
                ast::Stmt::Try(ast::StmtTry {
                    body,
                    orelse,
                    finalbody,
                    handlers,
                    ..
                })
                | ast::Stmt::TryStar(ast::StmtTryStar {
                    body,
                    orelse,
                    finalbody,
                    handlers,
                    ..
                }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse))
                    .or_else(|| self.find_in_statements(finalbody))
                    .or_else(|| {
                        for handler in handlers {
                            let ast::ExceptHandler::ExceptHandler(
                                ast::ExceptHandlerExceptHandler { body, .. },
                            ) = handler;
                            if let Some(found) = self.find_in_statements(body) {
                                return Some(found);
                            }
                        }
                        None
                    }),
                // Function/class bodies are intentionally not searched.
                _ => None,
            };

            if found.is_some() {
                return found;
            }
        }

        None
    }

    /// Returns `expr` itself when it is a direct call to `setup` under a
    /// known alias; does not descend into nested expressions.
    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
            return None;
        }
        self.nodes_visited += 1;

        match expr {
            ast::Expr::Call(ast::ExprCall { func, .. })
                if is_setup_call(func.as_ref(), self.aliases) =>
            {
                Some(expr)
            }
            _ => None,
        }
    }
}
2993
2994fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
2995    let Some(dotted) = dotted_name(func, 0) else {
2996        return false;
2997    };
2998
2999    if aliases.setup_names.contains(&dotted) {
3000        return true;
3001    }
3002
3003    let Some(module) = dotted.strip_suffix(".setup") else {
3004        return false;
3005    };
3006
3007    let resolved = resolve_module_alias(module, aliases);
3008    is_setup_module(&resolved)
3009}
3010
3011fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3012    if depth >= MAX_SETUP_PY_AST_DEPTH {
3013        return None;
3014    }
3015
3016    match expr {
3017        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3018        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3019            let base = dotted_name(value.as_ref(), depth + 1)?;
3020            Some(format!("{}.{}", base, attr.as_str()))
3021        }
3022        _ => None,
3023    }
3024}
3025
3026fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3027    if let Some(mapped) = aliases.module_aliases.get(module) {
3028        return mapped.clone();
3029    }
3030
3031    let Some((base, rest)) = module.split_once('.') else {
3032        return module.to_string();
3033    };
3034
3035    if let Some(mapped) = aliases.module_aliases.get(base) {
3036        return format!("{}.{}", mapped, rest);
3037    }
3038
3039    module.to_string()
3040}
3041
3042fn extract_setup_keywords(
3043    call_expr: &ast::Expr,
3044    evaluator: &mut LiteralEvaluator,
3045) -> HashMap<String, Value> {
3046    let mut values = HashMap::new();
3047    let ast::Expr::Call(ast::ExprCall { keywords, .. }) = call_expr else {
3048        return values;
3049    };
3050
3051    for keyword in keywords {
3052        if let Some(arg) = keyword.arg.as_ref().map(|name| name.as_str()) {
3053            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3054                values.insert(arg.to_string(), value);
3055            }
3056        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3057            for (key, value) in dict {
3058                values.insert(key, value);
3059            }
3060        }
3061    }
3062
3063    values
3064}
3065
/// Assembles a [`PackageData`] from the literal keyword values of a
/// `setup(...)` call.
///
/// Maps setup() arguments onto package fields: name/version/description,
/// homepage (`url`/`home_page`), author and maintainer parties, license,
/// dependencies (`install_requires`/`tests_require`/`extras_require`),
/// classifiers (for the private marker), and `project_urls`.
fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
    let name = get_value_string(values, "name");
    let version = get_value_string(values, "version");
    // `summary` is accepted as a fallback spelling for `description`.
    let description =
        get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
    let homepage_url =
        get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
    let author = get_value_string(values, "author");
    let author_email = get_value_string(values, "author_email");
    let maintainer = get_value_string(values, "maintainer");
    let maintainer_email = get_value_string(values, "maintainer_email");
    let license = get_value_string(values, "license");
    let classifiers = values
        .get("classifiers")
        .and_then(value_to_string_list)
        .unwrap_or_default();

    // A party is recorded only when at least one of name/email is present.
    let mut parties = Vec::new();
    if author.is_some() || author_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: author,
            email: author_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    if maintainer.is_some() || maintainer_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("maintainer".to_string()),
            name: maintainer,
            email: maintainer_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    // The raw license text is kept verbatim in extracted_license_statement;
    // normalization derives the SPDX expressions and detections from it.
    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
        normalize_spdx_declared_license(license.as_deref());
    let extracted_license_statement = license.clone();

    let dependencies = build_setup_py_dependencies(values);
    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
    let mut homepage_from_project_urls = None;
    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
    let mut extra_data = HashMap::new();

    // project_urls entries are routed into the dedicated URL fields;
    // presumably unrecognized labels land in extra_data — confirm against
    // apply_project_url_mappings.
    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
        apply_project_url_mappings(
            &parsed_project_urls,
            &mut homepage_from_project_urls,
            &mut bug_tracking_url,
            &mut code_view_url,
            &mut vcs_url,
            &mut extra_data,
        );
    }

    let extra_data = if extra_data.is_empty() {
        None
    } else {
        Some(extra_data)
    };

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: Some("Python".to_string()),
        description,
        release_date: None,
        parties,
        keywords: Vec::new(),
        // An explicit `url`/`home_page` wins over a project_urls homepage.
        homepage_url: homepage_url.or(homepage_from_project_urls),
        download_url: None,
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url,
        code_view_url,
        vcs_url,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references: Vec::new(),
        is_private: has_private_classifier(&classifiers),
        is_virtual: false,
        extra_data,
        dependencies,
        repository_homepage_url: None,
        repository_download_url: None,
        api_data_url: None,
        datasource_id: Some(DatasourceId::PypiSetupPy),
        purl,
    }
}
3182
3183fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3184    let mut dependencies = Vec::new();
3185
3186    if let Some(reqs) = values
3187        .get("install_requires")
3188        .and_then(value_to_string_list)
3189    {
3190        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3191    }
3192
3193    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3194        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3195    }
3196
3197    if let Some(Value::Dict(extras)) = values.get("extras_require") {
3198        let mut extra_items: Vec<_> = extras.iter().collect();
3199        extra_items.sort_by_key(|(name, _)| *name);
3200        for (extra_name, extra_value) in extra_items {
3201            if let Some(reqs) = value_to_string_list(extra_value) {
3202                dependencies.extend(build_setup_py_dependency_list(
3203                    reqs.as_slice(),
3204                    extra_name,
3205                    true,
3206                ));
3207            }
3208        }
3209    }
3210
3211    dependencies
3212}
3213
3214fn build_setup_py_dependency_list(
3215    reqs: &[String],
3216    scope: &str,
3217    is_optional: bool,
3218) -> Vec<Dependency> {
3219    reqs.iter()
3220        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3221        .collect()
3222}
3223
3224fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3225    values.get(key).and_then(value_to_string)
3226}
3227
3228fn value_to_string(value: &Value) -> Option<String> {
3229    match value {
3230        Value::String(value) => Some(value.clone()),
3231        Value::Number(value) => Some(value.to_string()),
3232        Value::Bool(value) => Some(value.to_string()),
3233        _ => None,
3234    }
3235}
3236
3237fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3238    match value {
3239        Value::String(value) => Some(vec![value.clone()]),
3240        Value::List(values) | Value::Tuple(values) => {
3241            let mut items = Vec::new();
3242            for item in values {
3243                items.push(value_to_string(item)?);
3244            }
3245            Some(items)
3246        }
3247        _ => None,
3248    }
3249}
3250
3251fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3252    let Value::Dict(dict) = value else {
3253        return None;
3254    };
3255
3256    let mut pairs: Vec<(String, String)> = dict
3257        .iter()
3258        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3259        .collect::<Option<Vec<_>>>()?;
3260    pairs.sort_by(|left, right| left.0.cmp(&right.0));
3261    Some(pairs)
3262}
3263
3264fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3265    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3266    requires_dist
3267        .iter()
3268        .filter_map(|entry| build_rfc822_dependency(entry))
3269        .collect()
3270}
3271
/// Builds a dependency from a single `Requires-Dist` entry using the RFC 822
/// defaults: "install" scope, not optional, no section-level marker override.
fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
    build_python_dependency(entry, "install", false, None)
}
3275
/// Builds a [`Dependency`] from a single PEP 508 requirement string such as
/// `requests[security] (>=2.0) ; python_version >= "3.8"`.
///
/// `default_scope`/`default_optional` apply unless an `extra == "..."` marker
/// overrides them; `marker_override` supplies a marker when the requirement
/// line itself has none (used for requires.txt section headers). Returns None
/// when no package name can be extracted or purl construction fails.
fn build_python_dependency(
    entry: &str,
    default_scope: &str,
    default_optional: bool,
    marker_override: Option<&str>,
) -> Option<Dependency> {
    // PEP 508: an environment marker follows a semicolon.
    let (requirement_part, marker_part) = entry
        .split_once(';')
        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
        .unwrap_or((entry.trim(), None));

    let name = extract_setup_cfg_dependency_name(requirement_part)?;
    let requirement = normalize_rfc822_requirement(requirement_part);
    // A marker on the line itself wins over the section-level override.
    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
        marker_part.or(marker_override),
        default_scope,
        default_optional,
    );
    let mut purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;

    // Pinned means the normalized (sorted) specifier string starts with ==
    // or ===. trim_start_matches strips every leading '=', covering both.
    // NOTE(review): with multiple specifiers (e.g. "==2.0,>=1.0" after
    // sorting) the version below would carry the trailing specifiers too —
    // verify pinned inputs are single-specifier.
    let is_pinned = requirement
        .as_deref()
        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
    if is_pinned
        && let Some(version) = requirement
            .as_deref()
            .map(|req| req.trim_start_matches('='))
    {
        purl.with_version(version).ok()?;
    }

    // Surface marker details (python_version, sys_platform, raw marker text)
    // in the dependency's extra data.
    let mut extra_data = HashMap::new();
    extra_data.extend(marker_data);
    if let Some(marker) = marker {
        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
    }

    Some(Dependency {
        purl: Some(purl.to_string()),
        extracted_requirement: requirement,
        scope: Some(scope),
        is_runtime: Some(true),
        is_optional: Some(is_optional),
        is_pinned: Some(is_pinned),
        is_direct: Some(true),
        resolved_package: None,
        extra_data: if extra_data.is_empty() {
            None
        } else {
            Some(extra_data)
        },
    })
}
3329
3330fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3331    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3332    let trimmed = requirement_part.trim();
3333    let mut remainder = trimmed[name.len()..].trim();
3334
3335    if let Some(stripped) = remainder.strip_prefix('[')
3336        && let Some(end_idx) = stripped.find(']')
3337    {
3338        remainder = stripped[end_idx + 1..].trim();
3339    }
3340
3341    let remainder = remainder
3342        .strip_prefix('(')
3343        .and_then(|value| value.strip_suffix(')'))
3344        .unwrap_or(remainder)
3345        .trim();
3346
3347    if remainder.is_empty() {
3348        return None;
3349    }
3350
3351    let mut specifiers: Vec<String> = remainder
3352        .split(',')
3353        .map(|specifier| specifier.trim().replace(' ', ""))
3354        .filter(|specifier| !specifier.is_empty())
3355        .collect();
3356    specifiers.sort();
3357    Some(specifiers.join(","))
3358}
3359
3360fn parse_rfc822_marker(
3361    marker_part: Option<&str>,
3362    default_scope: &str,
3363    default_optional: bool,
3364) -> (
3365    String,
3366    bool,
3367    Option<String>,
3368    HashMap<String, serde_json::Value>,
3369) {
3370    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
3371        return (
3372            default_scope.to_string(),
3373            default_optional,
3374            None,
3375            HashMap::new(),
3376        );
3377    };
3378
3379    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
3380        .expect("extra marker regex should compile");
3381    let mut extra_data = HashMap::new();
3382
3383    if let Some(python_version) = extract_marker_field(marker, "python_version") {
3384        extra_data.insert(
3385            "python_version".to_string(),
3386            serde_json::Value::String(python_version),
3387        );
3388    }
3389    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
3390        extra_data.insert(
3391            "sys_platform".to_string(),
3392            serde_json::Value::String(sys_platform),
3393        );
3394    }
3395
3396    if let Some(captures) = extra_re.captures(marker)
3397        && let Some(scope) = captures.get(1)
3398    {
3399        return (
3400            scope.as_str().to_string(),
3401            true,
3402            Some(marker.trim().to_string()),
3403            extra_data,
3404        );
3405    }
3406
3407    (
3408        default_scope.to_string(),
3409        default_optional,
3410        Some(marker.trim().to_string()),
3411        extra_data,
3412    )
3413}
3414
3415fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
3416    let re = Regex::new(&format!(
3417        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
3418        field
3419    ))
3420    .ok()?;
3421    let captures = re.captures(marker)?;
3422    let operator = captures.get(1)?.as_str();
3423    let value = captures.get(2)?.as_str();
3424    Some(format!("{} {}", operator, value))
3425}
3426
3427fn parse_requires_txt(content: &str) -> Vec<Dependency> {
3428    let mut dependencies = Vec::new();
3429    let mut current_scope = "install".to_string();
3430    let mut current_optional = false;
3431    let mut current_marker: Option<String> = None;
3432
3433    for line in content.lines() {
3434        let trimmed = line.trim();
3435        if trimmed.is_empty() || trimmed.starts_with('#') {
3436            continue;
3437        }
3438
3439        if trimmed.starts_with('[') && trimmed.ends_with(']') {
3440            let inner = &trimmed[1..trimmed.len() - 1];
3441            if let Some(rest) = inner.strip_prefix(':') {
3442                current_scope = "install".to_string();
3443                current_optional = false;
3444                current_marker = Some(rest.trim().to_string());
3445            } else if let Some((scope, marker)) = inner.split_once(':') {
3446                current_scope = scope.trim().to_string();
3447                current_optional = true;
3448                current_marker = Some(marker.trim().to_string());
3449            } else {
3450                current_scope = inner.trim().to_string();
3451                current_optional = true;
3452                current_marker = None;
3453            }
3454            continue;
3455        }
3456
3457        if let Some(dependency) = build_python_dependency(
3458            trimmed,
3459            &current_scope,
3460            current_optional,
3461            current_marker.as_deref(),
3462        ) {
3463            dependencies.push(dependency);
3464        }
3465    }
3466
3467    dependencies
3468}
3469
/// True when the trove classifiers contain the conventional
/// "Private :: Do Not Upload" marker (matched case-insensitively).
fn has_private_classifier(classifiers: &[String]) -> bool {
    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case("Private :: Do Not Upload") {
            return true;
        }
    }
    false
}
3475
3476fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
3477    let name = name?;
3478    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
3479    if let Some(version) = version {
3480        package_url.with_version(version).ok()?;
3481    }
3482    Some(package_url.to_string())
3483}
3484
/// Fallback extraction of setup.py metadata using regular expressions.
///
/// Used when the file is too large for AST parsing or the AST parse fails;
/// recovers only name, version, license, homepage url, and dependencies.
/// Everything else is left empty/None.
fn extract_from_setup_py_regex(content: &str) -> PackageData {
    let name = extract_setup_value(content, "name");
    let version = extract_setup_value(content, "version");
    let license_expression = extract_setup_value(content, "license");

    // Raw license text is kept verbatim; normalization derives the SPDX
    // expressions and detections.
    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
        normalize_spdx_declared_license(license_expression.as_deref());
    let extracted_license_statement = license_expression.clone();

    let dependencies = extract_setup_py_dependencies(content);
    let homepage_url = extract_setup_value(content, "url");
    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: Some("Python".to_string()),
        description: None,
        release_date: None,
        parties: Vec::new(),
        keywords: Vec::new(),
        homepage_url,
        download_url: None,
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url: None,
        code_view_url: None,
        vcs_url: None,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references: Vec::new(),
        is_private: false,
        is_virtual: false,
        extra_data: None,
        dependencies,
        repository_homepage_url: None,
        repository_download_url: None,
        api_data_url: None,
        datasource_id: Some(DatasourceId::PypiSetupPy),
        purl,
    }
}
3543
/// Converts a full [`PackageData`] into the slimmer ResolvedPackage used for
/// resolved-dependency records.
///
/// Missing identity fields default to empty strings and the Pypi package
/// type; extra_data is intentionally not carried over.
fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
    crate::models::ResolvedPackage {
        package_type: pkg.package_type.unwrap_or(PackageType::Pypi),
        namespace: pkg.namespace.clone().unwrap_or_default(),
        name: pkg.name.clone().unwrap_or_default(),
        version: pkg.version.clone().unwrap_or_default(),
        primary_language: pkg.primary_language.clone(),
        download_url: pkg.download_url.clone(),
        sha1: pkg.sha1.clone(),
        sha256: pkg.sha256.clone(),
        sha512: pkg.sha512.clone(),
        md5: pkg.md5.clone(),
        is_virtual: pkg.is_virtual,
        extra_data: None,
        dependencies: pkg.dependencies.clone(),
        repository_homepage_url: pkg.repository_homepage_url.clone(),
        repository_download_url: pkg.repository_download_url.clone(),
        api_data_url: pkg.api_data_url.clone(),
        datasource_id: pkg.datasource_id,
        purl: pkg.purl.clone(),
    }
}
3566
/// Extracts package metadata from a PyPI API JSON document
/// (the `https://pypi.org/pypi/<name>/json` response shape).
///
/// Reads the `info` object for core metadata (name, version, description,
/// author, license, keywords, classifiers, project URLs) and the top-level
/// `urls` array for one downloadable artifact (sdist preferred).
///
/// On read/parse failure, or when `info` is missing, logs a warning and
/// returns a minimal `PackageData` carrying only the package type and
/// datasource id.
fn extract_from_pypi_json(path: &Path) -> PackageData {
    // Fallback value returned on every error path below.
    let default = PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        datasource_id: Some(DatasourceId::PypiJson),
        ..Default::default()
    };

    let content = match read_file_to_string(path) {
        Ok(content) => content,
        Err(error) => {
            warn!("Failed to read pypi.json at {:?}: {}", path, error);
            return default;
        }
    };

    let root: serde_json::Value = match serde_json::from_str(&content) {
        Ok(value) => value,
        Err(error) => {
            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
            return default;
        }
    };

    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
        warn!("No info object found in pypi.json at {:?}", path);
        return default;
    };

    let name = info
        .get("name")
        .and_then(|value| value.as_str())
        .map(ToOwned::to_owned);
    let version = info
        .get("version")
        .and_then(|value| value.as_str())
        .map(ToOwned::to_owned);
    let summary = info
        .get("summary")
        .and_then(|value| value.as_str())
        .map(ToOwned::to_owned);
    // Prefer the long description; fall back to the one-line summary.
    let description = info
        .get("description")
        .and_then(|value| value.as_str())
        .filter(|value| !value.trim().is_empty())
        .map(ToOwned::to_owned)
        .or(summary);
    let mut homepage_url = info
        .get("home_page")
        .and_then(|value| value.as_str())
        .map(ToOwned::to_owned);
    let author = info
        .get("author")
        .and_then(|value| value.as_str())
        .filter(|value| !value.trim().is_empty())
        .map(ToOwned::to_owned);
    let author_email = info
        .get("author_email")
        .and_then(|value| value.as_str())
        .filter(|value| !value.trim().is_empty())
        .map(ToOwned::to_owned);
    let license = info
        .get("license")
        .and_then(|value| value.as_str())
        .filter(|value| !value.trim().is_empty())
        .map(ToOwned::to_owned);
    // PyPI serializes keywords as one comma-separated string; reuse the
    // setup.cfg splitter.
    let keywords = parse_setup_cfg_keywords(
        info.get("keywords")
            .and_then(|value| value.as_str())
            .map(ToOwned::to_owned),
    );
    let classifiers = info
        .get("classifiers")
        .and_then(|value| value.as_array())
        .map(|values| {
            values
                .iter()
                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();

    let mut parties = Vec::new();
    if author.is_some() || author_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: author,
            email: author_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    let mut bug_tracking_url = None;
    let mut code_view_url = None;
    let mut vcs_url = None;
    let mut extra_data = HashMap::new();

    // Project URLs are sorted by label so the field mapping below is
    // deterministic regardless of JSON object iteration order.
    let parsed_project_urls = info
        .get("project_urls")
        .and_then(|value| value.as_object())
        .map(|map| {
            let mut pairs: Vec<(String, String)> = map
                .iter()
                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
                .collect();
            pairs.sort_by(|left, right| left.0.cmp(&right.0));
            pairs
        })
        .unwrap_or_default();

    // Route labelled project URLs into dedicated fields; the full map also
    // lands in extra_data.
    apply_project_url_mappings(
        &parsed_project_urls,
        &mut homepage_url,
        &mut bug_tracking_url,
        &mut code_view_url,
        &mut vcs_url,
        &mut extra_data,
    );

    // Pick one downloadable artifact from the "urls" array (sdist preferred).
    let (download_url, size, sha256) = root
        .get("urls")
        .and_then(|value| value.as_array())
        .map(|urls| select_pypi_json_artifact(urls))
        .unwrap_or((None, None, None));

    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
        build_pypi_urls(name.as_deref(), version.as_deref());

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: None,
        description,
        release_date: None,
        parties,
        keywords,
        // Fall back to the computed PyPI project page when no homepage was
        // declared in the metadata or project URLs.
        homepage_url: homepage_url.or(repository_homepage_url.clone()),
        download_url,
        size,
        sha1: None,
        md5: None,
        sha256,
        sha512: None,
        bug_tracking_url,
        code_view_url,
        vcs_url,
        copyright: None,
        holder: None,
        declared_license_expression: None,
        declared_license_expression_spdx: None,
        license_detections: Vec::new(),
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        // License text is kept verbatim; expression detection happens in a
        // separate engine.
        extracted_license_statement: license,
        notice_text: None,
        source_packages: Vec::new(),
        file_references: Vec::new(),
        is_private: has_private_classifier(&classifiers),
        is_virtual: false,
        extra_data: if extra_data.is_empty() {
            None
        } else {
            Some(extra_data)
        },
        dependencies: Vec::new(),
        repository_homepage_url,
        repository_download_url,
        api_data_url,
        datasource_id: Some(DatasourceId::PypiJson),
        purl,
    }
}
3747
3748fn select_pypi_json_artifact(
3749    urls: &[serde_json::Value],
3750) -> (Option<String>, Option<u64>, Option<String>) {
3751    let selected = urls
3752        .iter()
3753        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
3754        .or_else(|| urls.first());
3755
3756    let Some(entry) = selected else {
3757        return (None, None, None);
3758    };
3759
3760    let download_url = entry
3761        .get("url")
3762        .and_then(|value| value.as_str())
3763        .map(ToOwned::to_owned);
3764    let size = entry.get("size").and_then(|value| value.as_u64());
3765    let sha256 = entry
3766        .get("digests")
3767        .and_then(|value| value.as_object())
3768        .and_then(|digests| digests.get("sha256"))
3769        .and_then(|value| value.as_str())
3770        .map(ToOwned::to_owned);
3771
3772    (download_url, size, sha256)
3773}
3774
/// Extracts package data from a `pip inspect` JSON report
/// (`pip-inspect.deplock`).
///
/// Iterates the `installed` array: the entry that is both `requested` and has
/// a `direct_url` becomes the main package (with `pip_version` and the report
/// `version` stashed in extra_data); every other entry is recorded as a
/// pinned dependency carrying an embedded resolved package.
///
/// Returns `default_package_data()` if the file cannot be read or parsed,
/// lacks an `installed` array, or no main package entry is found.
fn extract_from_pip_inspect(path: &Path) -> PackageData {
    let content = match read_file_to_string(path) {
        Ok(content) => content,
        Err(e) => {
            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let root: serde_json::Value = match serde_json::from_str(&content) {
        Ok(value) => value,
        Err(e) => {
            warn!(
                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
                path, e
            );
            return default_package_data();
        }
    };

    let installed = match root.get("installed").and_then(|v| v.as_array()) {
        Some(arr) => arr,
        None => {
            warn!(
                "No 'installed' array found in pip-inspect.deplock at {:?}",
                path
            );
            return default_package_data();
        }
    };

    // Report-level metadata, attached to the main package's extra_data below.
    let pip_version = root
        .get("pip_version")
        .and_then(|v| v.as_str())
        .map(String::from);
    let inspect_version = root
        .get("version")
        .and_then(|v| v.as_str())
        .map(String::from);

    let mut main_package: Option<PackageData> = None;
    let mut dependencies: Vec<Dependency> = Vec::new();

    for package_entry in installed {
        // Entries without metadata cannot be described; skip them.
        let metadata = match package_entry.get("metadata") {
            Some(m) => m,
            None => continue,
        };

        // "requested" + "direct_url" together identify the project under
        // inspection (installed from a local path / VCS URL).
        let is_requested = package_entry
            .get("requested")
            .and_then(|v| v.as_bool())
            .unwrap_or(false);
        let has_direct_url = package_entry.get("direct_url").is_some();

        let name = metadata
            .get("name")
            .and_then(|v| v.as_str())
            .map(String::from);
        let version = metadata
            .get("version")
            .and_then(|v| v.as_str())
            .map(String::from);
        let summary = metadata
            .get("summary")
            .and_then(|v| v.as_str())
            .map(String::from);
        let home_page = metadata
            .get("home_page")
            .and_then(|v| v.as_str())
            .map(String::from);
        let author = metadata
            .get("author")
            .and_then(|v| v.as_str())
            .map(String::from);
        let author_email = metadata
            .get("author_email")
            .and_then(|v| v.as_str())
            .map(String::from);
        let license = metadata
            .get("license")
            .and_then(|v| v.as_str())
            .map(String::from);
        let description = metadata
            .get("description")
            .and_then(|v| v.as_str())
            .map(String::from);
        let keywords = metadata
            .get("keywords")
            .and_then(|v| v.as_array())
            .map(|arr| {
                arr.iter()
                    .filter_map(|k| k.as_str().map(String::from))
                    .collect::<Vec<_>>()
            })
            .unwrap_or_default();

        let mut parties = Vec::new();
        if author.is_some() || author_email.is_some() {
            parties.push(Party {
                r#type: Some("person".to_string()),
                role: Some("author".to_string()),
                name: author,
                email: author_email,
                url: None,
                organization: None,
                organization_url: None,
                timezone: None,
            });
        }

        // Extract license statement only - detection happens in separate engine
        let license_detections = Vec::new();
        let declared_license_expression = None;
        let declared_license_expression_spdx = None;
        let extracted_license_statement = license.clone();

        let purl = name.as_ref().and_then(|n| {
            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
            if let Some(v) = &version {
                package_url.with_version(v).ok()?;
            }
            Some(package_url.to_string())
        });

        if is_requested && has_direct_url {
            // Main package: the requested project installed from a direct URL.
            let mut extra_data = HashMap::new();
            if let Some(pv) = &pip_version {
                extra_data.insert(
                    "pip_version".to_string(),
                    serde_json::Value::String(pv.clone()),
                );
            }
            if let Some(iv) = &inspect_version {
                extra_data.insert(
                    "inspect_version".to_string(),
                    serde_json::Value::String(iv.clone()),
                );
            }

            main_package = Some(PackageData {
                package_type: Some(PythonParser::PACKAGE_TYPE),
                namespace: None,
                name,
                version,
                qualifiers: None,
                subpath: None,
                primary_language: Some("Python".to_string()),
                description: description.or(summary),
                release_date: None,
                parties,
                keywords,
                homepage_url: home_page,
                download_url: None,
                size: None,
                sha1: None,
                md5: None,
                sha256: None,
                sha512: None,
                bug_tracking_url: None,
                code_view_url: None,
                vcs_url: None,
                copyright: None,
                holder: None,
                declared_license_expression,
                declared_license_expression_spdx,
                license_detections,
                other_license_expression: None,
                other_license_expression_spdx: None,
                other_license_detections: Vec::new(),
                extracted_license_statement,
                notice_text: None,
                source_packages: Vec::new(),
                file_references: Vec::new(),
                is_private: false,
                is_virtual: true,
                extra_data: if extra_data.is_empty() {
                    None
                } else {
                    Some(extra_data)
                },
                dependencies: Vec::new(),
                repository_homepage_url: None,
                repository_download_url: None,
                api_data_url: None,
                datasource_id: Some(DatasourceId::PypiInspectDeplock),
                purl,
            });
        } else {
            // Every other installed entry becomes a pinned dependency with an
            // embedded resolved package.
            // NOTE(review): is_direct mirrors "requested" below, so a
            // requested package *without* a direct_url is recorded here as a
            // direct dependency — confirm that is the intended semantics.
            let resolved_package = PackageData {
                package_type: Some(PythonParser::PACKAGE_TYPE),
                namespace: None,
                name: name.clone(),
                version: version.clone(),
                qualifiers: None,
                subpath: None,
                primary_language: Some("Python".to_string()),
                description: description.or(summary),
                release_date: None,
                parties,
                keywords,
                homepage_url: home_page,
                download_url: None,
                size: None,
                sha1: None,
                md5: None,
                sha256: None,
                sha512: None,
                bug_tracking_url: None,
                code_view_url: None,
                vcs_url: None,
                copyright: None,
                holder: None,
                declared_license_expression,
                declared_license_expression_spdx,
                license_detections,
                other_license_expression: None,
                other_license_expression_spdx: None,
                other_license_detections: Vec::new(),
                extracted_license_statement,
                notice_text: None,
                source_packages: Vec::new(),
                file_references: Vec::new(),
                is_private: false,
                is_virtual: true,
                extra_data: None,
                dependencies: Vec::new(),
                repository_homepage_url: None,
                repository_download_url: None,
                api_data_url: None,
                datasource_id: Some(DatasourceId::PypiInspectDeplock),
                purl: purl.clone(),
            };

            let resolved = package_data_to_resolved(&resolved_package);
            dependencies.push(Dependency {
                purl,
                extracted_requirement: None,
                scope: None,
                is_runtime: Some(true),
                is_optional: Some(false),
                is_pinned: Some(true),
                is_direct: Some(is_requested),
                resolved_package: Some(Box::new(resolved)),
                extra_data: None,
            });
        }
    }

    // Attach the collected dependencies to the main package, if one was found.
    if let Some(mut main_pkg) = main_package {
        main_pkg.dependencies = dependencies;
        main_pkg
    } else {
        default_package_data()
    }
}
4031
4032type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4033
/// Extracts package metadata from a `setup.cfg` file.
///
/// Parses the INI content with `parse_setup_cfg`, then reads the standard
/// `[metadata]` keys (name, version, description, author/maintainer info,
/// license, url, classifiers, keywords, project_urls) plus `[options]`
/// (python_requires and dependency lists, including extras).
///
/// License text is kept only as `extracted_license_statement`; expression
/// detection happens in a separate engine. Returns `default_package_data()`
/// when the file cannot be read.
fn extract_from_setup_cfg(path: &Path) -> PackageData {
    let content = match read_file_to_string(path) {
        Ok(content) => content,
        Err(e) => {
            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let sections = parse_setup_cfg(&content);
    let name = get_ini_value(&sections, "metadata", "name");
    let version = get_ini_value(&sections, "metadata", "version");
    let description = get_ini_value(&sections, "metadata", "description");
    let author = get_ini_value(&sections, "metadata", "author");
    let author_email = get_ini_value(&sections, "metadata", "author_email");
    let maintainer = get_ini_value(&sections, "metadata", "maintainer");
    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
    let license = get_ini_value(&sections, "metadata", "license");
    let mut homepage_url = get_ini_value(&sections, "metadata", "url");
    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
    let python_requires = get_ini_value(&sections, "options", "python_requires");
    let parsed_project_urls =
        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
    let mut extra_data = HashMap::new();

    // Author and maintainer each become a distinct Party when any of their
    // fields is present.
    let mut parties = Vec::new();
    if author.is_some() || author_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: author,
            email: author_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    if maintainer.is_some() || maintainer_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("maintainer".to_string()),
            name: maintainer,
            email: maintainer_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    let declared_license_expression = None;
    let declared_license_expression_spdx = None;
    let license_detections = Vec::new();
    let extracted_license_statement = license.clone();

    let dependencies = extract_setup_cfg_dependencies(&sections);

    if let Some(value) = python_requires {
        extra_data.insert(
            "python_requires".to_string(),
            serde_json::Value::String(value),
        );
    }

    // Route labelled project URLs into dedicated fields; the full map also
    // lands in extra_data.
    apply_project_url_mappings(
        &parsed_project_urls,
        &mut homepage_url,
        &mut bug_tracking_url,
        &mut code_view_url,
        &mut vcs_url,
        &mut extra_data,
    );

    let extra_data = if extra_data.is_empty() {
        None
    } else {
        Some(extra_data)
    };

    // A purl requires at least a name; the version is added when known.
    let purl = name.as_ref().and_then(|n| {
        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
        if let Some(v) = &version {
            package_url.with_version(v).ok()?;
        }
        Some(package_url.to_string())
    });

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: Some("Python".to_string()),
        description,
        release_date: None,
        parties,
        keywords,
        homepage_url,
        download_url: None,
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url,
        code_view_url,
        vcs_url,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references: Vec::new(),
        is_private: has_private_classifier(&classifiers),
        is_virtual: false,
        extra_data,
        dependencies,
        repository_homepage_url: None,
        repository_download_url: None,
        api_data_url: None,
        datasource_id: Some(DatasourceId::PypiSetupCfg),
        purl,
    }
}
4170
/// Splits a comma-separated keywords string into trimmed, non-empty entries.
///
/// `None` (keyword field absent) yields an empty list.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    match value {
        None => Vec::new(),
        Some(raw) => raw
            .split(',')
            .filter_map(|piece| {
                let keyword = piece.trim();
                if keyword.is_empty() {
                    None
                } else {
                    Some(keyword.to_string())
                }
            })
            .collect(),
    }
}
4183
/// Parses `project_urls` entries of the form `Label = https://...` into
/// `(label, url)` pairs, skipping malformed lines and empty labels or URLs.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::with_capacity(entries.len());
    for entry in entries {
        // Split on the first '=' only; URLs may themselves contain '='.
        if let Some((raw_label, raw_url)) = entry.split_once('=') {
            let label = raw_label.trim();
            let url = raw_url.trim();
            if !label.is_empty() && !url.is_empty() {
                pairs.push((label.to_string(), url.to_string()));
            }
        }
    }
    pairs
}
4199
4200fn apply_project_url_mappings(
4201    parsed_urls: &[(String, String)],
4202    homepage_url: &mut Option<String>,
4203    bug_tracking_url: &mut Option<String>,
4204    code_view_url: &mut Option<String>,
4205    vcs_url: &mut Option<String>,
4206    extra_data: &mut HashMap<String, serde_json::Value>,
4207) {
4208    for (label, url) in parsed_urls {
4209        let label_lower = label.to_lowercase();
4210
4211        if bug_tracking_url.is_none()
4212            && matches!(
4213                label_lower.as_str(),
4214                "tracker"
4215                    | "bug reports"
4216                    | "bug tracker"
4217                    | "issues"
4218                    | "issue tracker"
4219                    | "github: issues"
4220            )
4221        {
4222            *bug_tracking_url = Some(url.clone());
4223        } else if code_view_url.is_none()
4224            && matches!(label_lower.as_str(), "source" | "source code" | "code")
4225        {
4226            *code_view_url = Some(url.clone());
4227        } else if vcs_url.is_none()
4228            && matches!(
4229                label_lower.as_str(),
4230                "github" | "gitlab" | "github: repo" | "repository"
4231            )
4232        {
4233            *vcs_url = Some(url.clone());
4234        } else if homepage_url.is_none()
4235            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4236        {
4237            *homepage_url = Some(url.clone());
4238        } else if label_lower == "changelog" {
4239            extra_data.insert(
4240                "changelog_url".to_string(),
4241                serde_json::Value::String(url.clone()),
4242            );
4243        }
4244    }
4245
4246    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4247        .iter()
4248        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4249        .collect();
4250
4251    if !project_urls_json.is_empty() {
4252        extra_data.insert(
4253            "project_urls".to_string(),
4254            serde_json::Value::Object(project_urls_json),
4255        );
4256    }
4257}
4258
/// Parses INI-style `setup.cfg` content into nested section -> key -> values
/// maps.
///
/// Mirrors Python's `configparser` closely enough for metadata extraction:
/// - Section headers (`[name]`) and keys are lowercased.
/// - Both `=` and `:` are accepted as key/value delimiters (configparser's
///   defaults); previously `key: value` lines were silently dropped.
/// - Indented lines continue the value list of the most recent key
///   (multi-line values, e.g. `classifiers`).
/// - Lines starting with `#` or `;` are comments; blank lines are skipped.
fn parse_setup_cfg(content: &str) -> IniSections {
    let mut sections: IniSections = HashMap::new();
    let mut current_section: Option<String> = None;
    // Tracks the last key seen so indented continuation lines can append to it.
    let mut current_key: Option<String> = None;

    for raw_line in content.lines() {
        let line = raw_line.trim_end_matches('\r');
        if line.trim().is_empty() {
            continue;
        }

        let stripped = line.trim_start();
        if stripped.starts_with('#') || stripped.starts_with(';') {
            continue;
        }

        // Section header: `[section]`.
        if stripped.starts_with('[') && stripped.ends_with(']') {
            let section_name = stripped
                .trim_start_matches('[')
                .trim_end_matches(']')
                .trim()
                .to_ascii_lowercase();
            current_section = if section_name.is_empty() {
                None
            } else {
                Some(section_name)
            };
            current_key = None;
            continue;
        }

        // Indented line: continuation of the previous key's value list.
        if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
                let value = stripped.trim();
                if !value.is_empty() {
                    sections
                        .entry(section.clone())
                        .or_default()
                        .entry(key.clone())
                        .or_default()
                        .push(value.to_string());
                }
            }
            continue;
        }

        // Key line: configparser accepts both `=` and `:` as delimiters, so
        // split on whichever occurs first in the line.
        if let Some(pos) = stripped.find(|c: char| c == '=' || c == ':') {
            if let Some(section) = current_section.as_ref() {
                let key_name = stripped[..pos].trim().to_ascii_lowercase();
                let value_trimmed = stripped[pos + 1..].trim();
                let entry = sections
                    .entry(section.clone())
                    .or_default()
                    .entry(key_name.clone())
                    .or_default();
                // An empty value still registers the key so continuation
                // lines can attach to it.
                if !value_trimmed.is_empty() {
                    entry.push(value_trimmed.to_string());
                }
                current_key = Some(key_name);
            }
        }
    }

    sections
}
4325
/// Returns the first (trimmed) value for `section`/`key`, looked up
/// case-insensitively; `None` when the section, key, or value is absent.
fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
    let section_map = sections.get(&section.to_ascii_lowercase())?;
    let entries = section_map.get(&key.to_ascii_lowercase())?;
    let first = entries.first()?;
    Some(first.trim().to_string())
}
4333
/// Returns every value recorded for `section`/`key` (case-insensitive lookup),
/// or an empty list when the section or key is absent.
fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
    let section_map = match sections.get(&section.to_ascii_lowercase()) {
        Some(map) => map,
        None => return Vec::new(),
    };
    match section_map.get(&key.to_ascii_lowercase()) {
        Some(entries) => entries.clone(),
        None => Vec::new(),
    }
}
4341
4342fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
4343    let mut dependencies = Vec::new();
4344
4345    for (sub_section, scope) in [
4346        ("install_requires", "install"),
4347        ("tests_require", "test"),
4348        ("setup_requires", "setup"),
4349    ] {
4350        let reqs = get_ini_values(sections, "options", sub_section);
4351        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
4352    }
4353
4354    if let Some(extras) = sections.get("options.extras_require") {
4355        let mut extra_items: Vec<_> = extras.iter().collect();
4356        extra_items.sort_by_key(|(name, _)| *name);
4357        for (extra_name, reqs) in extra_items {
4358            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
4359        }
4360    }
4361
4362    dependencies
4363}
4364
4365fn parse_setup_cfg_requirements(
4366    reqs: &[String],
4367    scope: &str,
4368    is_optional: bool,
4369) -> Vec<Dependency> {
4370    reqs.iter()
4371        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
4372        .collect()
4373}
4374
4375fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
4376    let trimmed = req.trim();
4377    if trimmed.is_empty() || trimmed.starts_with('#') {
4378        return None;
4379    }
4380
4381    let name = extract_setup_cfg_dependency_name(trimmed)?;
4382    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4383
4384    Some(Dependency {
4385        purl: Some(purl.to_string()),
4386        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
4387        scope: Some(scope.to_string()),
4388        is_runtime: Some(true),
4389        is_optional: Some(is_optional),
4390        is_pinned: Some(false),
4391        is_direct: Some(true),
4392        resolved_package: None,
4393        extra_data: None,
4394    })
4395}
4396
/// Extracts the bare package name from a requirement string, stopping at the
/// first whitespace or PEP 508-style terminator (version operator, extras
/// bracket, or marker separator). Returns `None` when no name is present.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    // Characters that end the bare name portion of a requirement.
    const TERMINATORS: [char; 7] = ['<', '>', '=', '!', '~', ';', '['];

    let name: String = req
        .trim()
        .chars()
        .take_while(|c| !c.is_whitespace() && !TERMINATORS.contains(c))
        .collect();

    if name.is_empty() { None } else { Some(name) }
}
4413
/// Normalizes a requirement string by removing all (Unicode) whitespace,
/// e.g. `"pkg >= 1.0"` becomes `"pkg>=1.0"`.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    req.split_whitespace().collect()
}
4417
/// Extracts a quoted string value for `key` from setup.py-style source text,
/// e.g. `name="mypkg"` or `version = '1.0'`, without executing any code.
///
/// Improvements over the previous fixed-pattern matcher:
/// - Accepts any amount of whitespace around `=` (not just zero or one space).
/// - Requires the match to start at an identifier boundary, so searching for
///   `name` no longer matches inside `package_name`.
/// - The value ends at the *matching* quote character, so `"it's"` is no
///   longer truncated at the apostrophe.
///
/// Returns the first well-formed occurrence in source order, or `None`.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    if key.is_empty() {
        return None;
    }

    let mut search_start = 0;
    while let Some(found) = content[search_start..].find(key) {
        let key_start = search_start + found;
        let key_end = key_start + key.len();
        // Always advance past this occurrence so the scan terminates.
        search_start = key_end;

        // Reject matches embedded in a longer identifier (e.g. `package_name`).
        let at_boundary = content[..key_start]
            .chars()
            .next_back()
            .map(|prev| !prev.is_alphanumeric() && prev != '_')
            .unwrap_or(true);
        if !at_boundary {
            continue;
        }

        // Expect `=` after the key, with arbitrary whitespace around it.
        let after_key = content[key_end..].trim_start();
        let Some(after_eq) = after_key.strip_prefix('=') else {
            continue;
        };
        let value_part = after_eq.trim_start();

        // The value must open with a quote; capture up to the matching close.
        let mut value_chars = value_part.chars();
        if let Some(quote) = value_chars.next() {
            if quote == '"' || quote == '\'' {
                let value = value_chars.as_str();
                if let Some(end) = value.find(quote) {
                    return Some(value[..end].to_string());
                }
            }
        }
    }

    None
}
4443
4444fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
4445    let mut dependencies = Vec::new();
4446
4447    if let Some(tests_deps) = extract_tests_require(content) {
4448        dependencies.extend(tests_deps);
4449    }
4450
4451    if let Some(extras_deps) = extract_extras_require(content) {
4452        dependencies.extend(extras_deps);
4453    }
4454
4455    dependencies
4456}
4457
4458fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
4459    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
4460    let re = Regex::new(pattern).ok()?;
4461    let captures = re.captures(content)?;
4462    let deps_str = captures.get(1)?.as_str();
4463
4464    let deps = parse_setup_py_dep_list(deps_str, "test", true);
4465    if deps.is_empty() { None } else { Some(deps) }
4466}
4467
4468fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
4469    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
4470    let re = Regex::new(pattern).ok()?;
4471    let captures = re.captures(content)?;
4472    let dict_content = captures.get(1)?.as_str();
4473
4474    let mut all_deps = Vec::new();
4475
4476    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
4477    let entry_re = Regex::new(entry_pattern).ok()?;
4478
4479    for entry_cap in entry_re.captures_iter(dict_content) {
4480        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
4481            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
4482            all_deps.extend(deps);
4483        }
4484    }
4485
4486    if all_deps.is_empty() {
4487        None
4488    } else {
4489        Some(all_deps)
4490    }
4491}
4492
4493fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
4494    let dep_pattern = r#"['"]([^'"]+)['"]"#;
4495    let re = match Regex::new(dep_pattern) {
4496        Ok(r) => r,
4497        Err(_) => return Vec::new(),
4498    };
4499
4500    re.captures_iter(deps_str)
4501        .filter_map(|cap| {
4502            let dep_str = cap.get(1)?.as_str().trim();
4503            if dep_str.is_empty() {
4504                return None;
4505            }
4506
4507            let name = extract_setup_cfg_dependency_name(dep_str)?;
4508            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4509
4510            Some(Dependency {
4511                purl: Some(purl.to_string()),
4512                extracted_requirement: Some(dep_str.to_string()),
4513                scope: Some(scope.to_string()),
4514                is_runtime: Some(true),
4515                is_optional: Some(is_optional),
4516                is_pinned: Some(false),
4517                is_direct: Some(true),
4518                resolved_package: None,
4519                extra_data: None,
4520            })
4521        })
4522        .collect()
4523}
4524
4525/// Reads and parses a TOML file
4526pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
4527    let content = read_file_to_string(path).map_err(|e| e.to_string())?;
4528    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
4529}
4530
4531/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
4532///
4533/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
4534/// Essential for SBOM compliance and package integrity verification.
4535///
4536/// # Returns
4537///
4538/// - `(Some(size), Some(hash))` on success
4539/// - `(None, None)` if file cannot be opened
4540/// - `(Some(size), None)` if hash calculation fails during read
4541fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<String>) {
4542    let mut file = match File::open(path) {
4543        Ok(f) => f,
4544        Err(_) => return (None, None),
4545    };
4546
4547    let metadata = match file.metadata() {
4548        Ok(m) => m,
4549        Err(_) => return (None, None),
4550    };
4551    let size = metadata.len();
4552
4553    let mut hasher = Sha256::new();
4554    let mut buffer = vec![0; 8192];
4555
4556    loop {
4557        match file.read(&mut buffer) {
4558            Ok(0) => break,
4559            Ok(n) => hasher.update(&buffer[..n]),
4560            Err(_) => return (Some(size), None),
4561        }
4562    }
4563
4564    let hash = format!("{:x}", hasher.finalize());
4565    (Some(size), Some(hash))
4566}
4567
4568fn default_package_data() -> PackageData {
4569    PackageData::default()
4570}
4571
// Registers this parser via the crate-wide `register_parser!` macro.
// Arguments, in order: a human-readable description, the glob patterns
// that route files to this parser, the package type string ("pypi"), the
// primary language, and an ecosystem documentation URL.
// NOTE(review): argument semantics inferred from the values here — confirm
// against the macro definition elsewhere in the crate.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, setup.cfg, pypi.json, PKG-INFO, METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);