// provenant/parsers/python.rs
1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
35use crate::parsers::utils::{read_file_to_string, split_name_email};
36use base64::Engine;
37use base64::engine::general_purpose::URL_SAFE_NO_PAD;
38use bzip2::read::BzDecoder;
39use csv::ReaderBuilder;
40use flate2::read::GzDecoder;
41use liblzma::read::XzDecoder;
42use log::warn;
43use packageurl::PackageUrl;
44use regex::Regex;
45use rustpython_parser::{Parse, ast};
46use serde_json::{Map as JsonMap, Value as JsonValue};
47use sha2::{Digest, Sha256};
48use std::collections::{HashMap, HashSet};
49use std::fs::File;
50use std::io::Read;
51use std::path::{Component, Path, PathBuf};
52use tar::Archive;
53use toml::Value as TomlValue;
54use toml::map::Map as TomlMap;
55use zip::ZipArchive;
56
57use super::PackageParser;
58use super::license_normalization::normalize_spdx_declared_license;
59
// Key names looked up in pyproject.toml tables (the [project] table per PEP 621).
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";
// Limits applied when parsing setup.py via the AST (no code execution).
const MAX_SETUP_PY_BYTES: usize = 1_048_576; // 1 MiB source-size cap
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;
// Archive-safety limits guarding against zip/tar decompression bombs.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024; // 100MB uncompressed
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB per file
const MAX_COMPRESSION_RATIO: f64 = 100.0; // 100:1 ratio
80
/// Python package parser supporting 11 manifest formats.
///
/// Extracts metadata from Python package files including pyproject.toml, setup.py,
/// setup.cfg, PKG-INFO, METADATA, pip-inspect lockfiles, pip wheel-cache
/// origin.json, pypi.json, sdist archives (.tar.gz/.tgz/.tar.bz2/.tar.xz/.zip),
/// and .whl/.egg archives. See `extract_packages` for the dispatch.
///
/// # Security
///
/// setup.py files are parsed using AST analysis rather than code execution to prevent
/// arbitrary code execution during scanning. See `extract_from_setup_py_ast` for details.
/// Archives are read under size and compression-ratio limits (`MAX_ARCHIVE_SIZE`,
/// `MAX_FILE_SIZE`, `MAX_COMPRESSION_RATIO`) to resist decompression bombs.
pub struct PythonParser;
91
/// Compressed-archive formats recognized as Python source distributions (sdists).
#[derive(Clone, Copy, Debug)]
enum PythonSdistArchiveFormat {
    /// Gzip-compressed tarball (`.tar.gz`).
    TarGz,
    /// Gzip-compressed tarball with the short `.tgz` extension.
    Tgz,
    /// Bzip2-compressed tarball (`.tar.bz2`).
    TarBz2,
    /// XZ-compressed tarball (`.tar.xz`).
    TarXz,
    /// Plain zip archive (`.zip`).
    Zip,
}
100
/// A zip entry that passed the archive-safety checks in
/// `collect_validated_zip_entries` (safe path, per-file size, compression ratio).
#[derive(Clone, Debug)]
struct ValidatedZipEntry {
    // Index of the entry inside the ZipArchive, usable for later lookup by index.
    index: usize,
    // Normalized forward-slash entry path (no traversal components).
    name: String,
}
106
107impl PackageParser for PythonParser {
108    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
109
110    fn extract_packages(path: &Path) -> Vec<PackageData> {
111        vec![
112            if path.file_name().unwrap_or_default() == "pyproject.toml" {
113                extract_from_pyproject_toml(path)
114            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
115                extract_from_setup_cfg(path)
116            } else if path.file_name().unwrap_or_default() == "setup.py" {
117                extract_from_setup_py(path)
118            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
119                extract_from_rfc822_metadata(path, DatasourceId::PypiSdistPkginfo)
120            } else if path.file_name().unwrap_or_default() == "METADATA" {
121                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
122            } else if is_pip_cache_origin_json(path) {
123                extract_from_pip_origin_json(path)
124            } else if path.file_name().unwrap_or_default() == "pypi.json" {
125                extract_from_pypi_json(path)
126            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
127                extract_from_pip_inspect(path)
128            } else if is_python_sdist_archive_path(path) {
129                extract_from_sdist_archive(path)
130            } else if path
131                .extension()
132                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
133            {
134                extract_from_wheel_archive(path)
135            } else if path
136                .extension()
137                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
138            {
139                extract_from_egg_archive(path)
140            } else {
141                default_package_data()
142            },
143        ]
144    }
145
146    fn is_match(path: &Path) -> bool {
147        if let Some(filename) = path.file_name()
148            && (filename == "pyproject.toml"
149                || filename == "setup.cfg"
150                || filename == "setup.py"
151                || filename == "PKG-INFO"
152                || filename == "METADATA"
153                || filename == "pypi.json"
154                || filename == "pip-inspect.deplock"
155                || is_pip_cache_origin_json(path))
156        {
157            return true;
158        }
159
160        if let Some(extension) = path.extension() {
161            let ext = extension.to_string_lossy().to_lowercase();
162            if ext == "whl" || ext == "egg" || is_python_sdist_archive_path(path) {
163                return true;
164            }
165        }
166
167        false
168    }
169}
170
/// Metadata parsed from a `WHEEL` file inside an installed `.dist-info` directory.
#[derive(Debug, Clone)]
struct InstalledWheelMetadata {
    // All "Tag:" header values (e.g. "cp39-abi3-manylinux1_x86_64").
    wheel_tags: Vec<String>,
    // "Wheel-Version:" header, when present.
    wheel_version: Option<String>,
    // "Generator:" header (the tool that built the wheel), when present.
    wheel_generator: Option<String>,
    // "Root-Is-Purelib:" header parsed as a boolean, when present and valid.
    root_is_purelib: Option<bool>,
    // All tags collapsed into a single compound tag; see `compress_wheel_tags`.
    compressed_tag: Option<String>,
}
179
180fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
181    let Some(parent) = path.parent() else {
182        return;
183    };
184
185    if !parent
186        .file_name()
187        .and_then(|name| name.to_str())
188        .is_some_and(|name| name.ends_with(".dist-info"))
189    {
190        return;
191    }
192
193    let wheel_path = parent.join("WHEEL");
194    if !wheel_path.exists() {
195        return;
196    }
197
198    let Ok(content) = read_file_to_string(&wheel_path) else {
199        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
200        return;
201    };
202
203    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
204        return;
205    };
206
207    apply_installed_wheel_metadata(package_data, &wheel_metadata);
208}
209
210fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
211    use super::rfc822::{get_header_all, get_header_first};
212
213    let metadata = super::rfc822::parse_rfc822_content(content);
214    let wheel_tags = get_header_all(&metadata.headers, "tag");
215    if wheel_tags.is_empty() {
216        return None;
217    }
218
219    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
220    let wheel_generator = get_header_first(&metadata.headers, "generator");
221    let root_is_purelib =
222        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
223            match value.to_ascii_lowercase().as_str() {
224                "true" => Some(true),
225                "false" => Some(false),
226                _ => None,
227            }
228        });
229
230    let compressed_tag = compress_wheel_tags(&wheel_tags);
231
232    Some(InstalledWheelMetadata {
233        wheel_tags,
234        wheel_version,
235        wheel_generator,
236        root_is_purelib,
237        compressed_tag,
238    })
239}
240
/// Collapses a list of wheel tags into one compound tag.
///
/// A single tag is returned verbatim. Multiple tags merge only when they all
/// share the same ABI and platform parts, yielding the dotted wheel-filename
/// form `py1.py2-abi-platform`. Returns `None` for an empty list, a malformed
/// tag in a multi-tag list, or mismatched ABI/platform parts.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => None,
        [only] => Some(only.clone()),
        _ => {
            let mut pythons: Vec<&str> = Vec::with_capacity(tags.len());
            let mut shared: Option<(&str, &str)> = None;

            for tag in tags {
                // Split into python/abi/platform; the platform part keeps any
                // further dashes because of splitn(3, ..).
                let mut pieces = tag.splitn(3, '-');
                let (python, abi, platform) = (pieces.next()?, pieces.next()?, pieces.next()?);

                match shared {
                    None => shared = Some((abi, platform)),
                    // Tags with differing ABI or platform cannot be compressed.
                    Some(existing) if existing != (abi, platform) => return None,
                    Some(_) => {}
                }
                pythons.push(python);
            }

            let (abi, platform) = shared?;
            Some(format!("{}-{}-{}", pythons.join("."), abi, platform))
        }
    }
}
278
279fn apply_installed_wheel_metadata(
280    package_data: &mut PackageData,
281    wheel_metadata: &InstalledWheelMetadata,
282) {
283    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
284    extra_data.insert(
285        "wheel_tags".to_string(),
286        JsonValue::Array(
287            wheel_metadata
288                .wheel_tags
289                .iter()
290                .cloned()
291                .map(JsonValue::String)
292                .collect(),
293        ),
294    );
295
296    if let Some(wheel_version) = &wheel_metadata.wheel_version {
297        extra_data.insert(
298            "wheel_version".to_string(),
299            JsonValue::String(wheel_version.clone()),
300        );
301    }
302
303    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
304        extra_data.insert(
305            "wheel_generator".to_string(),
306            JsonValue::String(wheel_generator.clone()),
307        );
308    }
309
310    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
311        extra_data.insert(
312            "root_is_purelib".to_string(),
313            JsonValue::Bool(root_is_purelib),
314        );
315    }
316
317    if let (Some(name), Some(version), Some(extension)) = (
318        package_data.name.as_deref(),
319        package_data.version.as_deref(),
320        wheel_metadata.compressed_tag.as_deref(),
321    ) {
322        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
323    }
324}
325
/// Detects pip's wheel-cache `origin.json`: the file must be named
/// `origin.json` and live at any depth below a directory named `wheels`
/// (matched case-insensitively).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    if path.file_name().and_then(|name| name.to_str()) != Some("origin.json") {
        return false;
    }

    path.ancestors()
        .skip(1) // skip the file itself; inspect only parent directories
        .filter_map(|ancestor| ancestor.file_name())
        .filter_map(|name| name.to_str())
        .any(|name| name.eq_ignore_ascii_case("wheels"))
}
335
336fn extract_from_pip_origin_json(path: &Path) -> PackageData {
337    let content = match read_file_to_string(path) {
338        Ok(content) => content,
339        Err(e) => {
340            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
341            return default_package_data();
342        }
343    };
344
345    let root: JsonValue = match serde_json::from_str(&content) {
346        Ok(root) => root,
347        Err(e) => {
348            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
349            return default_package_data();
350        }
351    };
352
353    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
354        warn!("No url found in pip cache origin.json at {:?}", path);
355        return default_package_data();
356    };
357
358    let sibling_wheel = find_sibling_cached_wheel(path);
359    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
360        sibling_wheel
361            .as_ref()
362            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
363    });
364
365    let Some((name, version)) = name_version else {
366        warn!(
367            "Failed to infer package name/version from pip cache origin.json at {:?}",
368            path
369        );
370        return default_package_data();
371    };
372
373    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
374        build_pypi_urls(Some(&name), Some(&version));
375    let purl = sibling_wheel
376        .as_ref()
377        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
378        .or(plain_purl);
379
380    PackageData {
381        package_type: Some(PythonParser::PACKAGE_TYPE),
382        primary_language: Some("Python".to_string()),
383        name: Some(name),
384        version: Some(version),
385        datasource_id: Some(DatasourceId::PypiPipOriginJson),
386        download_url: Some(download_url.to_string()),
387        sha256: extract_sha256_from_origin_json(&root),
388        repository_homepage_url,
389        repository_download_url,
390        api_data_url,
391        purl,
392        ..Default::default()
393    }
394}
395
396fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
397    let parent = path.parent()?;
398    let entries = parent.read_dir().ok()?;
399
400    for entry in entries.flatten() {
401        let sibling_path = entry.path();
402        if sibling_path
403            .extension()
404            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
405            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
406        {
407            return Some(wheel_info);
408        }
409    }
410
411    None
412}
413
414fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
415    let file_name = url.rsplit('/').next()?;
416
417    if file_name.ends_with(".whl") {
418        return parse_wheel_filename(Path::new(file_name))
419            .map(|wheel_info| (wheel_info.name, wheel_info.version));
420    }
421
422    let stem = strip_python_archive_extension(file_name)?;
423    let (name, version) = stem.rsplit_once('-')?;
424    if name.is_empty() || version.is_empty() {
425        return None;
426    }
427
428    Some((name.replace('_', "-"), version.to_string()))
429}
430
/// Strips a known Python archive extension from `file_name`, returning the
/// stem. Matching is case-sensitive, and compound `.tar.*` suffixes are tried
/// before the shorter ones. Returns `None` for unrecognized extensions.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    for suffix in [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"] {
        if let Some(stem) = file_name.strip_suffix(suffix) {
            return Some(stem);
        }
    }
    None
}
436
437fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
438    root.pointer("/archive_info/hashes/sha256")
439        .and_then(|value| value.as_str())
440        .map(ToOwned::to_owned)
441        .or_else(|| {
442            root.pointer("/archive_info/hash")
443                .and_then(|value| value.as_str())
444                .and_then(normalize_origin_hash)
445        })
446}
447
/// Normalizes a legacy origin.json `hash` value to a bare sha256 digest.
///
/// Accepts `sha256=<digest>`, `sha256:<digest>`, or a bare 64-character hex
/// string; anything else yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    for prefix in ["sha256=", "sha256:"] {
        if let Some(value) = hash.strip_prefix(prefix) {
            return Some(value.to_string());
        }
    }

    let is_bare_digest = hash.len() == 64 && hash.bytes().all(|b| b.is_ascii_hexdigit());
    is_bare_digest.then(|| hash.to_string())
}
460
461fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
462    let content = match read_file_to_string(path) {
463        Ok(content) => content,
464        Err(e) => {
465            warn!("Failed to read metadata at {:?}: {}", path, e);
466            return default_package_data();
467        }
468    };
469
470    let metadata = super::rfc822::parse_rfc822_content(&content);
471    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
472    merge_sibling_metadata_dependencies(path, &mut package_data);
473    merge_sibling_metadata_file_references(path, &mut package_data);
474    if datasource_id == DatasourceId::PypiWheelMetadata {
475        merge_sibling_wheel_metadata(path, &mut package_data);
476    }
477    package_data
478}
479
/// Merges dependencies declared in `requires.txt` files that live next to the
/// metadata file being parsed.
///
/// Two locations are checked: a `requires.txt` directly alongside `path`
/// (egg-info layout), and a `requires.txt` inside the first sibling
/// `*.egg-info` directory (sdist layout). Parsed dependencies are appended to
/// `package_data.dependencies`, skipping entries that duplicate an existing
/// dependency on purl, scope, requirement string, and extra data.
fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
    let mut extra_dependencies = Vec::new();

    if let Some(parent) = path.parent() {
        // Case 1: requires.txt directly next to the metadata file.
        let direct_requires = parent.join("requires.txt");
        if direct_requires.exists()
            && let Ok(content) = read_file_to_string(&direct_requires)
        {
            extra_dependencies.extend(parse_requires_txt(&content));
        }

        // Case 2: requires.txt inside the first sibling *.egg-info directory.
        // The Option from read_dir().ok() and the per-entry Results are both
        // flattened away, so I/O errors simply read as "no entries".
        let sibling_egg_info_requires = parent
            .read_dir()
            .ok()
            .into_iter()
            .flatten()
            .flatten()
            .find_map(|entry| {
                let child_path = entry.path();
                if child_path.is_dir()
                    && child_path
                        .file_name()
                        .and_then(|name| name.to_str())
                        .is_some_and(|name| name.ends_with(".egg-info"))
                {
                    let requires = child_path.join("requires.txt");
                    requires.exists().then_some(requires)
                } else {
                    None
                }
            });

        if let Some(requires_path) = sibling_egg_info_requires
            && let Ok(content) = read_file_to_string(&requires_path)
        {
            extra_dependencies.extend(parse_requires_txt(&content));
        }
    }

    // Deduplicate: append only dependencies not already present with the same
    // purl, scope, requirement string, and extra data.
    for dependency in extra_dependencies {
        if !package_data.dependencies.iter().any(|existing| {
            existing.purl == dependency.purl
                && existing.scope == dependency.scope
                && existing.extracted_requirement == dependency.extracted_requirement
                && existing.extra_data == dependency.extra_data
        }) {
            package_data.dependencies.push(dependency);
        }
    }
}
530
531fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
532    let mut extra_refs = Vec::new();
533
534    if let Some(parent) = path.parent() {
535        let record_path = parent.join("RECORD");
536        if record_path.exists()
537            && let Ok(content) = read_file_to_string(&record_path)
538        {
539            extra_refs.extend(parse_record_csv(&content));
540        }
541
542        let installed_files_path = parent.join("installed-files.txt");
543        if installed_files_path.exists()
544            && let Ok(content) = read_file_to_string(&installed_files_path)
545        {
546            extra_refs.extend(parse_installed_files_txt(&content));
547        }
548
549        let sources_path = parent.join("SOURCES.txt");
550        if sources_path.exists()
551            && let Ok(content) = read_file_to_string(&sources_path)
552        {
553            extra_refs.extend(parse_sources_txt(&content));
554        }
555    }
556
557    for file_ref in extra_refs {
558        if !package_data
559            .file_references
560            .iter()
561            .any(|existing| existing.path == file_ref.path)
562        {
563            package_data.file_references.push(file_ref);
564        }
565    }
566}
567
/// Scans all entries of a zip archive and returns the `(index, name)` pairs
/// that pass the archive-safety checks.
///
/// Per-entry checks (failing entries are skipped with a warning):
/// - the path must normalize safely (see `normalize_archive_entry_path`),
/// - the per-entry compression ratio must not exceed `MAX_COMPRESSION_RATIO`,
/// - the uncompressed size must not exceed `MAX_FILE_SIZE`.
///
/// The running total of uncompressed bytes is also tracked; crossing
/// `MAX_ARCHIVE_SIZE` aborts the whole archive with `Err`, since it then looks
/// like a decompression bomb. `archive_type` is used only in log messages.
fn collect_validated_zip_entries<R: Read + std::io::Seek>(
    archive: &mut ZipArchive<R>,
    path: &Path,
    archive_type: &str,
) -> Result<Vec<ValidatedZipEntry>, String> {
    let mut total_extracted = 0u64;
    let mut entries = Vec::new();

    for i in 0..archive.len() {
        // by_index_raw exposes entry metadata without decompressing its data.
        if let Ok(file) = archive.by_index_raw(i) {
            let compressed_size = file.compressed_size();
            let uncompressed_size = file.size();
            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
                warn!(
                    "Skipping unsafe path in {} {:?}: {}",
                    archive_type,
                    path,
                    file.name()
                );
                continue;
            };

            // Per-entry ratio check: zip bombs inflate single entries wildly.
            if compressed_size > 0 {
                let ratio = uncompressed_size as f64 / compressed_size as f64;
                if ratio > MAX_COMPRESSION_RATIO {
                    warn!(
                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
                        archive_type, path, ratio
                    );
                    continue;
                }
            }

            if uncompressed_size > MAX_FILE_SIZE {
                warn!(
                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
                );
                continue;
            }

            // Aggregate limit: abort entirely rather than skip, because the
            // archive as a whole is oversized, not just one entry.
            total_extracted += uncompressed_size;
            if total_extracted > MAX_ARCHIVE_SIZE {
                let msg = format!(
                    "Total extracted size exceeds limit for {} {:?}",
                    archive_type, path
                );
                warn!("{}", msg);
                return Err(msg);
            }

            entries.push(ValidatedZipEntry {
                index: i,
                name: entry_name,
            });
        }
    }

    Ok(entries)
}
628
629fn is_python_sdist_archive_path(path: &Path) -> bool {
630    detect_python_sdist_archive_format(path).is_some()
631}
632
633fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
634    let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
635
636    if !is_likely_python_sdist_filename(&file_name) {
637        return None;
638    }
639
640    if file_name.ends_with(".tar.gz") {
641        Some(PythonSdistArchiveFormat::TarGz)
642    } else if file_name.ends_with(".tgz") {
643        Some(PythonSdistArchiveFormat::Tgz)
644    } else if file_name.ends_with(".tar.bz2") {
645        Some(PythonSdistArchiveFormat::TarBz2)
646    } else if file_name.ends_with(".tar.xz") {
647        Some(PythonSdistArchiveFormat::TarXz)
648    } else if file_name.ends_with(".zip") {
649        Some(PythonSdistArchiveFormat::Zip)
650    } else {
651        None
652    }
653}
654
655fn is_likely_python_sdist_filename(file_name: &str) -> bool {
656    let Some(stem) = strip_python_archive_extension(file_name) else {
657        return false;
658    };
659
660    let Some((name, version)) = stem.rsplit_once('-') else {
661        return false;
662    };
663
664    !name.is_empty()
665        && !version.is_empty()
666        && version.chars().any(|ch| ch.is_ascii_digit())
667        && name
668            .chars()
669            .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
670}
671
672fn extract_from_sdist_archive(path: &Path) -> PackageData {
673    let metadata = match std::fs::metadata(path) {
674        Ok(m) => m,
675        Err(e) => {
676            warn!(
677                "Failed to read metadata for sdist archive {:?}: {}",
678                path, e
679            );
680            return default_package_data();
681        }
682    };
683
684    if metadata.len() > MAX_ARCHIVE_SIZE {
685        warn!(
686            "sdist archive too large: {} bytes (limit: {} bytes)",
687            metadata.len(),
688            MAX_ARCHIVE_SIZE
689        );
690        return default_package_data();
691    }
692
693    let Some(format) = detect_python_sdist_archive_format(path) else {
694        return default_package_data();
695    };
696
697    let mut package_data = match format {
698        PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
699            let file = match File::open(path) {
700                Ok(file) => file,
701                Err(e) => {
702                    warn!("Failed to open sdist archive {:?}: {}", path, e);
703                    return default_package_data();
704                }
705            };
706            let decoder = GzDecoder::new(file);
707            extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
708        }
709        PythonSdistArchiveFormat::TarBz2 => {
710            let file = match File::open(path) {
711                Ok(file) => file,
712                Err(e) => {
713                    warn!("Failed to open sdist archive {:?}: {}", path, e);
714                    return default_package_data();
715                }
716            };
717            let decoder = BzDecoder::new(file);
718            extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
719        }
720        PythonSdistArchiveFormat::TarXz => {
721            let file = match File::open(path) {
722                Ok(file) => file,
723                Err(e) => {
724                    warn!("Failed to open sdist archive {:?}: {}", path, e);
725                    return default_package_data();
726                }
727            };
728            let decoder = XzDecoder::new(file);
729            extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
730        }
731        PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
732    };
733
734    if package_data.package_type.is_some() {
735        let (size, sha256) = calculate_file_checksums(path);
736        package_data.size = size;
737        package_data.sha256 = sha256;
738    }
739
740    package_data
741}
742
/// Streams a compressed tarball sdist and collects the relevant metadata text
/// entries (PKG-INFO, requires.txt, SOURCES.txt), enforcing archive-safety
/// limits while reading.
///
/// Per-entry checks, in order:
/// 1. per-file size vs `MAX_FILE_SIZE` (oversized entries are skipped),
/// 2. running total vs `MAX_ARCHIVE_SIZE` (aborts the whole archive),
/// 3. aggregate compression ratio (total extracted / `compressed_size`)
///    vs `MAX_COMPRESSION_RATIO` (aborts the whole archive),
/// 4. path normalization (unsafe paths are skipped).
///
/// `compressed_size` is the on-disk size of the whole archive; `archive_type`
/// is a label ("tar.gz", "tar.bz2", ...) used only in log messages.
fn extract_from_tar_sdist_archive<R: Read>(
    path: &Path,
    reader: R,
    archive_type: &str,
    compressed_size: u64,
) -> PackageData {
    let mut archive = Archive::new(reader);
    let archive_entries = match archive.entries() {
        Ok(entries) => entries,
        Err(e) => {
            warn!(
                "Failed to read {} sdist archive {:?}: {}",
                archive_type, path, e
            );
            return default_package_data();
        }
    };

    let mut total_extracted = 0u64;
    let mut entries = Vec::new();

    for entry_result in archive_entries {
        let mut entry = match entry_result {
            Ok(entry) => entry,
            Err(e) => {
                warn!(
                    "Failed to read {} sdist entry from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        // Check 1: skip individual entries that exceed the per-file limit.
        let entry_size = entry.size();
        if entry_size > MAX_FILE_SIZE {
            warn!(
                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
                archive_type, path, entry_size, MAX_FILE_SIZE
            );
            continue;
        }

        // Check 2: abort the whole archive once the running total is exceeded.
        total_extracted += entry_size;
        if total_extracted > MAX_ARCHIVE_SIZE {
            warn!(
                "Total extracted size exceeds limit for {} sdist {:?}",
                archive_type, path
            );
            return default_package_data();
        }

        // Check 3: aggregate ratio of extracted bytes to the archive's on-disk
        // size; a huge ratio indicates a decompression bomb.
        if compressed_size > 0 {
            let ratio = total_extracted as f64 / compressed_size as f64;
            if ratio > MAX_COMPRESSION_RATIO {
                warn!(
                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
                    archive_type, path, ratio
                );
                return default_package_data();
            }
        }

        // Normalize separators so the path-safety check sees forward slashes.
        let entry_path = match entry.path() {
            Ok(path) => path.to_string_lossy().replace('\\', "/"),
            Err(e) => {
                warn!(
                    "Failed to get {} sdist entry path from {:?}: {}",
                    archive_type, path, e
                );
                continue;
            }
        };

        // Check 4: reject traversal / otherwise unsafe paths.
        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
            continue;
        };

        // Only the metadata text files are read; everything else is skipped.
        if !is_relevant_sdist_text_entry(&entry_path) {
            continue;
        }

        if let Ok(content) = read_limited_utf8(
            &mut entry,
            MAX_FILE_SIZE,
            &format!("{} entry {}", archive_type, entry_path),
        ) {
            entries.push((entry_path, content));
        }
    }

    build_sdist_package_data(path, entries)
}
836
837fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
838    let file = match File::open(path) {
839        Ok(file) => file,
840        Err(e) => {
841            warn!("Failed to open zip sdist archive {:?}: {}", path, e);
842            return default_package_data();
843        }
844    };
845
846    let mut archive = match ZipArchive::new(file) {
847        Ok(archive) => archive,
848        Err(e) => {
849            warn!("Failed to read zip sdist archive {:?}: {}", path, e);
850            return default_package_data();
851        }
852    };
853
854    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
855        Ok(entries) => entries,
856        Err(_) => return default_package_data(),
857    };
858
859    let mut entries = Vec::new();
860    for entry in validated_entries.iter() {
861        if !is_relevant_sdist_text_entry(&entry.name) {
862            continue;
863        }
864
865        if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
866            entries.push((entry.name.clone(), content));
867        }
868    }
869
870    build_sdist_package_data(path, entries)
871}
872
/// True for archive entries whose contents we parse: a nested `PKG-INFO`,
/// `requires.txt`, or `SOURCES.txt`. The leading `/` in each suffix requires
/// the file to be inside a directory, not at the archive root.
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"]
        .iter()
        .any(|suffix| entry_path.ends_with(suffix))
}
878
879fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
880    let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
881        warn!("No PKG-INFO file found in sdist archive {:?}", path);
882        return default_package_data();
883    };
884
885    let mut package_data =
886        python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
887    merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
888    merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
889    apply_sdist_name_version_fallback(path, &mut package_data);
890    package_data
891}
892
/// Chooses the best `PKG-INFO` entry among an sdist archive's contents.
///
/// The expected package name is derived from the archive filename
/// (`<name>-<version>.<ext>`). Candidates are ranked by, in order:
/// 1. whether the PKG-INFO's `Name:` header matches the expected name,
/// 2. layout kind: `<root>/<pkg>.egg-info/PKG-INFO` beats `<root>/PKG-INFO`,
///    which beats any other `.egg-info/PKG-INFO`, which beats everything else,
/// 3. path depth (shallower wins),
/// 4. the path string itself, as a deterministic tie-break.
fn select_sdist_pkginfo_entry(
    archive_path: &Path,
    entries: &[(String, String)],
) -> Option<(String, String)> {
    // Normalized name from the "<name>-<version>.<ext>" archive filename.
    let expected_name = archive_path
        .file_name()
        .and_then(|name| name.to_str())
        .and_then(strip_python_archive_extension)
        .and_then(|stem| {
            stem.rsplit_once('-')
                .map(|(name, _)| normalize_python_package_name(name))
        });

    entries
        .iter()
        .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
        .min_by_key(|(entry_path, content)| {
            let components: Vec<_> = entry_path
                .split('/')
                .filter(|part| !part.is_empty())
                .collect();
            // Rank 0 when the metadata's Name header matches the archive name.
            let metadata = super::rfc822::parse_rfc822_content(content);
            let candidate_name = super::rfc822::get_header_first(&metadata.headers, "name")
                .map(|name| normalize_python_package_name(&name));
            let name_rank = if candidate_name == expected_name {
                0
            } else {
                1
            };
            // Rank the layout: canonical egg-info, then sdist root, then any
            // other egg-info, then everything else.
            let kind_rank = if components.len() == 3
                && components[1].ends_with(".egg-info")
                && components[2] == "PKG-INFO"
            {
                0
            } else if components.len() == 2 && components[1] == "PKG-INFO" {
                1
            } else if entry_path.ends_with(".egg-info/PKG-INFO") {
                2
            } else {
                3
            };

            (name_rank, kind_rank, components.len(), entry_path.clone())
        })
        .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
}
939
940fn merge_sdist_archive_dependencies(
941    entries: &[(String, String)],
942    metadata_path: &str,
943    package_data: &mut PackageData,
944) {
945    let metadata_dir = metadata_path
946        .rsplit_once('/')
947        .map(|(dir, _)| dir)
948        .unwrap_or("");
949    let archive_root = metadata_path.split('/').next().unwrap_or("");
950    let matched_egg_info_dir =
951        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
952    let mut extra_dependencies = Vec::new();
953
954    for (entry_path, content) in entries {
955        let is_direct_requires =
956            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
957        let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
958            entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
959        });
960
961        if is_direct_requires || is_egg_info_requires {
962            extra_dependencies.extend(parse_requires_txt(content));
963        }
964    }
965
966    for dependency in extra_dependencies {
967        if !package_data.dependencies.iter().any(|existing| {
968            existing.purl == dependency.purl
969                && existing.scope == dependency.scope
970                && existing.extracted_requirement == dependency.extracted_requirement
971                && existing.extra_data == dependency.extra_data
972        }) {
973            package_data.dependencies.push(dependency);
974        }
975    }
976}
977
978fn merge_sdist_archive_file_references(
979    entries: &[(String, String)],
980    metadata_path: &str,
981    package_data: &mut PackageData,
982) {
983    let metadata_dir = metadata_path
984        .rsplit_once('/')
985        .map(|(dir, _)| dir)
986        .unwrap_or("");
987    let archive_root = metadata_path.split('/').next().unwrap_or("");
988    let matched_egg_info_dir =
989        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
990    let mut extra_refs = Vec::new();
991
992    for (entry_path, content) in entries {
993        let is_direct_sources =
994            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
995        let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
996            entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
997        });
998
999        if is_direct_sources || is_egg_info_sources {
1000            extra_refs.extend(parse_sources_txt(content));
1001        }
1002    }
1003
1004    for file_ref in extra_refs {
1005        if !package_data
1006            .file_references
1007            .iter()
1008            .any(|existing| existing.path == file_ref.path)
1009        {
1010            package_data.file_references.push(file_ref);
1011        }
1012    }
1013}
1014
/// Finds the `.egg-info` directory directly under the archive root,
/// preferring one whose normalized name matches the package name.
///
/// Only `<archive_root>/<dir>.egg-info/<file>` entries (exactly three path
/// components) are considered; ties are broken by directory name so the
/// result is deterministic.
fn select_matching_sdist_egg_info_dir(
    entries: &[(String, String)],
    archive_root: &str,
    package_name: Option<&str>,
) -> Option<String> {
    let normalized_package_name = package_name.map(normalize_python_package_name);

    entries
        .iter()
        .filter_map(|(entry_path, _)| {
            let components: Vec<_> = entry_path
                .split('/')
                .filter(|part| !part.is_empty())
                .collect();
            (components.len() == 3
                && components[0] == archive_root
                && components[1].ends_with(".egg-info"))
            .then(|| components[1].to_string())
        })
        .min_by_key(|egg_info_dir| {
            let normalized_dir_name =
                normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
            // Rank name matches first; compare via as_deref to avoid cloning
            // the candidate name just for the comparison.
            let name_rank = if normalized_package_name.as_deref()
                == Some(normalized_dir_name.as_str())
            {
                0
            } else {
                1
            };

            (name_rank, egg_info_dir.clone())
        })
}

/// Normalizes a Python package name per PEP 503: lowercase ASCII with every
/// run of `-`, `_` and `.` collapsed to a single hyphen. (The previous
/// implementation only replaced underscores, so dotted names such as
/// `zope.interface` never compared equal to `zope-interface`.)
fn normalize_python_package_name(name: &str) -> String {
    name.to_ascii_lowercase()
        .split(['-', '_', '.'])
        .filter(|part| !part.is_empty())
        .collect::<Vec<_>>()
        .join("-")
}
1054
1055fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1056    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1057        return;
1058    };
1059
1060    let Some(stem) = strip_python_archive_extension(file_name) else {
1061        return;
1062    };
1063
1064    let Some((name, version)) = stem.rsplit_once('-') else {
1065        return;
1066    };
1067
1068    if package_data.name.is_none() {
1069        package_data.name = Some(name.replace('_', "-"));
1070    }
1071    if package_data.version.is_none() {
1072        package_data.version = Some(version.to_string());
1073    }
1074
1075    if package_data.purl.is_none()
1076        || package_data.repository_homepage_url.is_none()
1077        || package_data.repository_download_url.is_none()
1078        || package_data.api_data_url.is_none()
1079    {
1080        let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1081            build_pypi_urls(
1082                package_data.name.as_deref(),
1083                package_data.version.as_deref(),
1084            );
1085
1086        if package_data.repository_homepage_url.is_none() {
1087            package_data.repository_homepage_url = repository_homepage_url;
1088        }
1089        if package_data.repository_download_url.is_none() {
1090            package_data.repository_download_url = repository_download_url;
1091        }
1092        if package_data.api_data_url.is_none() {
1093            package_data.api_data_url = api_data_url;
1094        }
1095        if package_data.purl.is_none() {
1096            package_data.purl = purl;
1097        }
1098    }
1099}
1100
/// Extracts package metadata from a wheel (`.whl`) archive (PEP 427).
///
/// Reads `.dist-info/METADATA` (RFC 822) for core metadata and the optional
/// `.dist-info/RECORD` (CSV) for file references, then fills in name,
/// version, tag qualifiers and the purl from the wheel filename.
///
/// On any failure (I/O error, size limit, missing METADATA) this logs a
/// warning and returns a default, empty `PackageData` instead of an error.
fn extract_from_wheel_archive(path: &Path) -> PackageData {
    let metadata = match std::fs::metadata(path) {
        Ok(m) => m,
        Err(e) => {
            warn!(
                "Failed to read metadata for wheel archive {:?}: {}",
                path, e
            );
            return default_package_data();
        }
    };

    // Reject oversized archives before opening them (zip-bomb defense).
    if metadata.len() > MAX_ARCHIVE_SIZE {
        warn!(
            "Wheel archive too large: {} bytes (limit: {} bytes)",
            metadata.len(),
            MAX_ARCHIVE_SIZE
        );
        return default_package_data();
    }

    let file = match File::open(path) {
        Ok(f) => f,
        Err(e) => {
            warn!("Failed to open wheel archive {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let mut archive = match ZipArchive::new(file) {
        Ok(a) => a,
        Err(e) => {
            warn!("Failed to read wheel archive {:?}: {}", path, e);
            return default_package_data();
        }
    };

    // Validate all entries (paths, sizes, compression ratios) up front.
    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
        Ok(entries) => entries,
        Err(_) => return default_package_data(),
    };

    let metadata_entry =
        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
            Some(entry) => entry,
            None => {
                warn!("No METADATA file found in wheel archive {:?}", path);
                return default_package_data();
            }
        };

    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
        Ok(c) => c,
        Err(e) => {
            warn!("Failed to read METADATA from {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);

    // Checksums are computed over the archive file itself, not its contents.
    let (size, sha256) = calculate_file_checksums(path);
    package_data.size = size;
    package_data.sha256 = sha256;

    // RECORD is optional; file references are only populated when present
    // and readable.
    if let Some(record_entry) =
        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
        && let Ok(record_content) =
            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
    {
        package_data.file_references = parse_record_csv(&record_content);
    }

    // The wheel filename supplies name, version and the compatibility triple
    // (python tag, ABI tag, platform tag) per PEP 427.
    if let Some(wheel_info) = parse_wheel_filename(path) {
        if package_data.name.is_none() {
            package_data.name = Some(wheel_info.name.clone());
        }
        if package_data.version.is_none() {
            package_data.version = Some(wheel_info.version.clone());
        }

        package_data.qualifiers = Some(std::collections::HashMap::from([(
            "extension".to_string(),
            format!(
                "{}-{}-{}",
                wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
            ),
        )]));

        package_data.purl = build_wheel_purl(
            package_data.name.as_deref(),
            package_data.version.as_deref(),
            &wheel_info,
        );

        let mut extra_data = package_data.extra_data.unwrap_or_default();
        // NOTE(review): this stores the filename's *python tag* (e.g. "cp39")
        // under "python_requires", whereas the RFC 822 path stores the
        // Requires-Python specifier under "requires_python" — confirm
        // downstream consumers expect the tag under this key.
        extra_data.insert(
            "python_requires".to_string(),
            serde_json::Value::String(wheel_info.python_tag.clone()),
        );
        extra_data.insert(
            "abi_tag".to_string(),
            serde_json::Value::String(wheel_info.abi_tag.clone()),
        );
        extra_data.insert(
            "platform_tag".to_string(),
            serde_json::Value::String(wheel_info.platform_tag.clone()),
        );
        package_data.extra_data = Some(extra_data);
    }

    package_data
}
1214
/// Extracts package metadata from a legacy `.egg` archive (a zip file).
///
/// Reads `EGG-INFO/PKG-INFO` (or `<name>.egg-info/PKG-INFO`) for core
/// metadata and the optional `installed-files.txt` for file references,
/// then fills in name/version/python-version from the egg filename and
/// builds a pypi purl with a `type=egg` qualifier.
///
/// On any failure (I/O error, size limit, missing PKG-INFO) this logs a
/// warning and returns a default, empty `PackageData` instead of an error.
fn extract_from_egg_archive(path: &Path) -> PackageData {
    let metadata = match std::fs::metadata(path) {
        Ok(m) => m,
        Err(e) => {
            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
            return default_package_data();
        }
    };

    // Reject oversized archives before opening them (zip-bomb defense).
    if metadata.len() > MAX_ARCHIVE_SIZE {
        warn!(
            "Egg archive too large: {} bytes (limit: {} bytes)",
            metadata.len(),
            MAX_ARCHIVE_SIZE
        );
        return default_package_data();
    }

    let file = match File::open(path) {
        Ok(f) => f,
        Err(e) => {
            warn!("Failed to open egg archive {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let mut archive = match ZipArchive::new(file) {
        Ok(a) => a,
        Err(e) => {
            warn!("Failed to read egg archive {:?}: {}", path, e);
            return default_package_data();
        }
    };

    // Validate all entries (paths, sizes, compression ratios) up front.
    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
        Ok(entries) => entries,
        Err(_) => return default_package_data(),
    };

    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
        &validated_entries,
        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
    ) {
        Some(entry) => entry,
        None => {
            warn!("No PKG-INFO file found in egg archive {:?}", path);
            return default_package_data();
        }
    };

    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
        Ok(c) => c,
        Err(e) => {
            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);

    // Checksums are computed over the archive file itself, not its contents.
    let (size, sha256) = calculate_file_checksums(path);
    package_data.size = size;
    package_data.sha256 = sha256;

    // installed-files.txt is optional (PEP 376); file references are only
    // populated when it is present and readable.
    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
        &validated_entries,
        &[
            "EGG-INFO/installed-files.txt",
            ".egg-info/installed-files.txt",
        ],
    ) && let Ok(installed_files_content) =
        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
    {
        package_data.file_references = parse_installed_files_txt(&installed_files_content);
    }

    // Fall back to the filename for name/version/python-version
    // (format: "{name}-{version}(-{python version})?.egg").
    if let Some(egg_info) = parse_egg_filename(path) {
        if package_data.name.is_none() {
            package_data.name = Some(egg_info.name.clone());
        }
        if package_data.version.is_none() {
            package_data.version = Some(egg_info.version.clone());
        }

        if let Some(python_version) = &egg_info.python_version {
            let mut extra_data = package_data.extra_data.unwrap_or_default();
            extra_data.insert(
                "python_version".to_string(),
                serde_json::Value::String(python_version.clone()),
            );
            package_data.extra_data = Some(extra_data);
        }
    }

    // Unlike the wheel path, the purl is (re)built unconditionally here.
    package_data.purl = build_egg_purl(
        package_data.name.as_deref(),
        package_data.version.as_deref(),
    );

    package_data
}
1316
1317fn find_validated_zip_entry_by_suffix<'a>(
1318    entries: &'a [ValidatedZipEntry],
1319    suffix: &str,
1320) -> Option<&'a ValidatedZipEntry> {
1321    entries.iter().find(|entry| entry.name.ends_with(suffix))
1322}
1323
1324fn find_validated_zip_entry_by_any_suffix<'a>(
1325    entries: &'a [ValidatedZipEntry],
1326    suffixes: &[&str],
1327) -> Option<&'a ValidatedZipEntry> {
1328    entries
1329        .iter()
1330        .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1331}
1332
1333fn read_validated_zip_entry<R: Read + std::io::Seek>(
1334    archive: &mut ZipArchive<R>,
1335    entry: &ValidatedZipEntry,
1336    path: &Path,
1337    archive_type: &str,
1338) -> Result<String, String> {
1339    let mut file = archive
1340        .by_index(entry.index)
1341        .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1342
1343    let compressed_size = file.compressed_size();
1344    let uncompressed_size = file.size();
1345
1346    if compressed_size > 0 {
1347        let ratio = uncompressed_size as f64 / compressed_size as f64;
1348        if ratio > MAX_COMPRESSION_RATIO {
1349            return Err(format!(
1350                "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1351                archive_type, path, ratio
1352            ));
1353        }
1354    }
1355
1356    if uncompressed_size > MAX_FILE_SIZE {
1357        return Err(format!(
1358            "Rejected oversized entry in {} {:?}: {} bytes",
1359            archive_type, path, uncompressed_size
1360        ));
1361    }
1362
1363    read_limited_utf8(
1364        &mut file,
1365        MAX_FILE_SIZE,
1366        &format!("{} entry {}", archive_type, entry.name),
1367    )
1368}
1369
/// Reads at most `max_bytes` from `reader` and decodes the bytes as UTF-8.
///
/// Returns an error if the stream holds more than `max_bytes`, the read
/// fails, or the bytes are not valid UTF-8; `context` labels the error.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    // Read one byte past the limit so overruns are detectable.
    let mut buffer = Vec::new();
    reader
        .take(max_bytes + 1)
        .read_to_end(&mut buffer)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if (buffer.len() as u64) > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(buffer).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1390
/// Normalizes an archive entry path to a safe, relative, slash-separated
/// form, or returns `None` for unsafe paths (absolute paths, `..` traversal,
/// Windows drive-letter prefixes, or paths with no real components).
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Reject Windows drive-letter prefixes ("C:/...") explicitly, since on
    // non-Windows hosts Path::components would not flag them as absolute.
    let raw = unified.as_bytes();
    if raw.len() >= 3 && raw[0].is_ascii_alphabetic() && raw[1] == b':' && raw[2] == b'/' {
        return None;
    }

    let mut segments: Vec<String> = Vec::new();
    for component in Path::new(&unified).components() {
        match component {
            Component::Normal(segment) => segments.push(segment.to_string_lossy().into_owned()),
            // "." segments are harmless and simply dropped.
            Component::CurDir => {}
            // Absolute roots, parent traversal and prefixes are unsafe
            // inside an archive.
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1412
1413/// Parses RECORD CSV format from wheel archives (PEP 427).
1414/// Format: path,hash,size (3 columns, no header)
1415/// Hash format: sha256=urlsafe_base64_hash or empty
1416/// Size: bytes as u64 or empty
1417pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1418    let mut reader = ReaderBuilder::new()
1419        .has_headers(false)
1420        .from_reader(content.as_bytes());
1421
1422    let mut file_references = Vec::new();
1423
1424    for result in reader.records() {
1425        match result {
1426            Ok(record) => {
1427                if record.len() < 3 {
1428                    continue;
1429                }
1430
1431                let path = record.get(0).unwrap_or("").trim().to_string();
1432                if path.is_empty() {
1433                    continue;
1434                }
1435
1436                let hash_field = record.get(1).unwrap_or("").trim();
1437                let size_field = record.get(2).unwrap_or("").trim();
1438
1439                // Parse hash: format is "algorithm=value"
1440                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1441                    let parts: Vec<&str> = hash_field.split('=').collect();
1442                    if parts.len() == 2 && parts[0] == "sha256" {
1443                        // Decode base64 to hex
1444                        match URL_SAFE_NO_PAD.decode(parts[1]) {
1445                            Ok(decoded) => {
1446                                let hex = decoded
1447                                    .iter()
1448                                    .map(|b| format!("{:02x}", b))
1449                                    .collect::<String>();
1450                                Some(hex)
1451                            }
1452                            Err(_) => None,
1453                        }
1454                    } else {
1455                        None
1456                    }
1457                } else {
1458                    None
1459                };
1460
1461                // Parse size
1462                let size = if !size_field.is_empty() && size_field != "-" {
1463                    size_field.parse::<u64>().ok()
1464                } else {
1465                    None
1466                };
1467
1468                file_references.push(FileReference {
1469                    path,
1470                    size,
1471                    sha1: None,
1472                    md5: None,
1473                    sha256,
1474                    sha512: None,
1475                    extra_data: None,
1476                });
1477            }
1478            Err(e) => {
1479                warn!("Failed to parse RECORD CSV row: {}", e);
1480                continue;
1481            }
1482        }
1483    }
1484
1485    file_references
1486}
1487
1488/// Parses installed-files.txt format from egg archives (PEP 376).
1489/// Format: one file path per line, no headers, no hash, no size
1490pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1491    content
1492        .lines()
1493        .map(|line| line.trim())
1494        .filter(|line| !line.is_empty())
1495        .map(|path| FileReference {
1496            path: path.to_string(),
1497            size: None,
1498            sha1: None,
1499            md5: None,
1500            sha256: None,
1501            sha512: None,
1502            extra_data: None,
1503        })
1504        .collect()
1505}
1506
1507pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1508    content
1509        .lines()
1510        .map(str::trim)
1511        .filter(|line| !line.is_empty())
1512        .map(|path| FileReference {
1513            path: path.to_string(),
1514            size: None,
1515            sha1: None,
1516            md5: None,
1517            sha256: None,
1518            sha512: None,
1519            extra_data: None,
1520        })
1521        .collect()
1522}
1523
/// Components of a wheel filename per PEP 427:
/// `{name}-{version}(-{build tag})?-{python}-{abi}-{platform}.whl`.
struct WheelInfo {
    name: String,
    version: String,
    python_tag: String,
    abi_tag: String,
    platform_tag: String,
}

/// Parses a wheel filename into its PEP 427 components.
///
/// Handles the optional build tag (which must start with a digit), which the
/// previous implementation misread as the python tag. Underscores in the
/// name field are unescaped back to hyphens. Returns `None` when the
/// filename has too few dash-separated fields to be a wheel.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    // PEP 427: an optional build tag sits between version and python tag
    // and always starts with a digit; skip it when present.
    let tag_start = if parts.len() >= 6
        && parts[2].chars().next().is_some_and(|c| c.is_ascii_digit())
    {
        3
    } else {
        2
    };

    if parts.len() < tag_start + 3 {
        return None;
    }

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: parts[tag_start].to_string(),
        abi_tag: parts[tag_start + 1].to_string(),
        // Platform tags never contain '-' per spec, but keep the lenient
        // join for malformed names (matches the previous behavior).
        platform_tag: parts[tag_start + 2..].join("-"),
    })
}
1548
/// Components of an egg filename: `{name}-{version}(-{python version})?.egg`.
struct EggInfo {
    name: String,
    version: String,
    python_version: Option<String>,
}

/// Splits an egg filename stem on '-' into name, version and an optional
/// python version tag; returns `None` when fewer than two fields exist.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut fields = stem.split('-');

    let name = fields.next()?;
    let version = fields.next()?;
    let python_version = fields.next().map(str::to_string);

    Some(EggInfo {
        // Egg names escape '-' as '_'; undo that here.
        name: name.replace('_', "-"),
        version: version.to_string(),
        python_version,
    })
}
1569
1570fn build_wheel_purl(
1571    name: Option<&str>,
1572    version: Option<&str>,
1573    wheel_info: &WheelInfo,
1574) -> Option<String> {
1575    let name = name?;
1576    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1577
1578    if let Some(ver) = version {
1579        package_url.with_version(ver).ok()?;
1580    }
1581
1582    let extension = format!(
1583        "{}-{}-{}",
1584        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1585    );
1586    package_url.add_qualifier("extension", extension).ok()?;
1587
1588    Some(package_url.to_string())
1589}
1590
1591fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1592    let name = name?;
1593    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1594
1595    if let Some(ver) = version {
1596        package_url.with_version(ver).ok()?;
1597    }
1598
1599    package_url.add_qualifier("type", "egg").ok()?;
1600
1601    Some(package_url.to_string())
1602}
1603
1604fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1605    let metadata = super::rfc822::parse_rfc822_content(content);
1606    build_package_data_from_rfc822(&metadata, datasource_id)
1607}
1608
/// Builds PackageData from parsed RFC822 metadata.
///
/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
/// and `python_parse_rfc822_content` (content-based) functions.
///
/// Maps the core-metadata headers onto `PackageData`:
/// - Author/Author-email become a "person"/"author" `Party`
/// - Classifiers are split into keywords and license classifiers
/// - `Project-URL` labels are matched (first match wins) to bug-tracking,
///   code-view, VCS and homepage URLs; a "changelog" label and the full
///   URL map go into extra_data
/// - `License-File` headers feed both extra_data and `file_references`
/// - Platform / Requires-Python are recorded in extra_data when meaningful
fn build_package_data_from_rfc822(
    metadata: &super::rfc822::Rfc822Metadata,
    datasource_id: DatasourceId,
) -> PackageData {
    use super::rfc822::{get_header_all, get_header_first};

    // Single-valued core metadata headers (case-insensitive lookup in the
    // rfc822 helpers).
    let name = get_header_first(&metadata.headers, "name");
    let version = get_header_first(&metadata.headers, "version");
    let summary = get_header_first(&metadata.headers, "summary");
    let mut homepage_url = get_header_first(&metadata.headers, "home-page");
    let author = get_header_first(&metadata.headers, "author");
    let author_email = get_header_first(&metadata.headers, "author-email");
    let license = get_header_first(&metadata.headers, "license");
    let license_expression = get_header_first(&metadata.headers, "license-expression");
    let download_url = get_header_first(&metadata.headers, "download-url");
    let platform = get_header_first(&metadata.headers, "platform");
    let requires_python = get_header_first(&metadata.headers, "requires-python");
    // Multi-valued headers.
    let classifiers = get_header_all(&metadata.headers, "classifier");
    let license_files = get_header_all(&metadata.headers, "license-file");

    // The long description lives in the message body in modern metadata;
    // older metadata put it in a "Description" header instead.
    let description_body = if metadata.body.is_empty() {
        get_header_first(&metadata.headers, "description").unwrap_or_default()
    } else {
        metadata.body.clone()
    };

    let description = build_description(summary.as_deref(), &description_body);

    // An author party is recorded if either the name or the email is present.
    let mut parties = Vec::new();
    if author.is_some() || author_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: author,
            email: author_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    // Classifiers split into topic keywords and "License ::" classifiers.
    let (keywords, license_classifiers) = split_classifiers(&classifiers);
    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
        normalize_spdx_declared_license(license_expression.as_deref());

    // Prefer the explicit License-Expression header; otherwise fall back to
    // a statement built from the License header and license classifiers.
    let extracted_license_statement = license_expression
        .clone()
        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));

    let mut extra_data = HashMap::new();
    // "UNKNOWN" is the setuptools placeholder for an unset platform; skip it.
    if let Some(platform_value) = platform
        && !platform_value.eq_ignore_ascii_case("unknown")
        && !platform_value.is_empty()
    {
        extra_data.insert(
            "platform".to_string(),
            serde_json::Value::String(platform_value),
        );
    }

    if let Some(requires_python_value) = requires_python
        && !requires_python_value.is_empty()
    {
        extra_data.insert(
            "requires_python".to_string(),
            serde_json::Value::String(requires_python_value),
        );
    }

    if !license_files.is_empty() {
        extra_data.insert(
            "license_files".to_string(),
            serde_json::Value::Array(
                license_files
                    .iter()
                    .cloned()
                    .map(serde_json::Value::String)
                    .collect(),
            ),
        );
    }

    // License files are also surfaced as (checksum-less) file references.
    let file_references = license_files
        .iter()
        .map(|path| FileReference {
            path: path.clone(),
            size: None,
            sha1: None,
            md5: None,
            sha256: None,
            sha512: None,
            extra_data: None,
        })
        .collect();

    let project_urls = get_header_all(&metadata.headers, "project-url");
    let dependencies = extract_rfc822_dependencies(&metadata.headers);
    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);

    if !project_urls.is_empty() {
        let parsed_urls = parse_project_urls(&project_urls);

        // First match wins per category: each URL slot is only filled once,
        // and the else-if chain means a URL is assigned to at most one slot.
        for (label, url) in &parsed_urls {
            let label_lower = label.to_lowercase();

            if bug_tracking_url.is_none()
                && matches!(
                    label_lower.as_str(),
                    "tracker"
                        | "bug reports"
                        | "bug tracker"
                        | "issues"
                        | "issue tracker"
                        | "github: issues"
                )
            {
                bug_tracking_url = Some(url.clone());
            } else if code_view_url.is_none()
                && matches!(label_lower.as_str(), "source" | "source code" | "code")
            {
                code_view_url = Some(url.clone());
            } else if vcs_url.is_none()
                && matches!(
                    label_lower.as_str(),
                    "github" | "gitlab" | "github: repo" | "repository"
                )
            {
                vcs_url = Some(url.clone());
            } else if homepage_url.is_none()
                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
            {
                homepage_url = Some(url.clone());
            } else if label_lower == "changelog" {
                extra_data.insert(
                    "changelog_url".to_string(),
                    serde_json::Value::String(url.clone()),
                );
            }
        }

        // The full label -> URL map is preserved in extra_data regardless of
        // which labels were matched above.
        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
            .iter()
            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
            .collect();

        if !project_urls_json.is_empty() {
            extra_data.insert(
                "project_urls".to_string(),
                serde_json::Value::Object(project_urls_json),
            );
        }
    }

    let extra_data = if extra_data.is_empty() {
        None
    } else {
        Some(extra_data)
    };

    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
        build_pypi_urls(name.as_deref(), version.as_deref());

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: Some("Python".to_string()),
        description,
        release_date: None,
        parties,
        keywords,
        homepage_url,
        download_url,
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url,
        code_view_url,
        vcs_url,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references,
        is_private: false,
        is_virtual: false,
        extra_data,
        dependencies,
        repository_homepage_url,
        repository_download_url,
        api_data_url,
        datasource_id: Some(datasource_id),
        purl,
    }
}
1821
/// Parses core-metadata `Project-URL` entries of the form `"<label>, <url>"`
/// into `(label, url)` pairs.
///
/// Entries are split on the first comma. The spec's separator is `", "`, but
/// a missing space after the comma is tolerated (the previous implementation
/// required the exact `", "` sequence and silently dropped such entries).
/// Entries without a comma, or with an empty label or URL after trimming,
/// are skipped.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|url_entry| {
            // Split on the first comma only: labels must not contain commas,
            // while trailing whitespace around either part is insignificant.
            let (label, url) = url_entry.split_once(',')?;
            let label = label.trim();
            let url = url.trim();
            if label.is_empty() || url.is_empty() {
                None
            } else {
                Some((label.to_string(), url.to_string()))
            }
        })
        .collect()
}
1837
/// Combines a one-line summary and a longer description body into a single
/// description string.
///
/// Both pieces are trimmed and empty pieces are dropped. Returns `None` when
/// neither contributes text; otherwise the remaining pieces joined with a
/// newline, summary first.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let summary_part = summary.map(str::trim).filter(|text| !text.is_empty());
    let body_part = Some(body.trim()).filter(|text| !text.is_empty());

    let pieces: Vec<&str> = [summary_part, body_part].into_iter().flatten().collect();
    if pieces.is_empty() {
        None
    } else {
        Some(pieces.join("\n"))
    }
}
1856
/// Splits trove classifiers into `(keywords, license_classifiers)`.
///
/// A classifier counts as license-related when it begins with the
/// `"License ::"` prefix; everything else is treated as a keyword. Input
/// order is preserved within each output list.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    classifiers
        .iter()
        .cloned()
        .partition(|classifier| !classifier.starts_with("License ::"))
}
1871
/// Builds a YAML-like raw license statement from a `License` metadata value
/// and any `License ::` trove classifiers.
///
/// Emits a `license: <value>` line when the license string is non-empty,
/// then a `classifiers:` list when classifiers exist. Returns `None` when
/// there is nothing to report; the returned string always ends with a
/// newline.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let mut statement = String::new();

    if let Some(value) = license.map(str::trim).filter(|v| !v.is_empty()) {
        statement.push_str("license: ");
        statement.push_str(value);
        statement.push('\n');
    }

    if !license_classifiers.is_empty() {
        statement.push_str("classifiers:\n");
        for classifier in license_classifiers {
            statement.push_str("  - '");
            statement.push_str(classifier);
            statement.push_str("'\n");
        }
    }

    if statement.is_empty() {
        None
    } else {
        Some(statement)
    }
}
1897
1898pub(crate) fn build_pypi_urls(
1899    name: Option<&str>,
1900    version: Option<&str>,
1901) -> (
1902    Option<String>,
1903    Option<String>,
1904    Option<String>,
1905    Option<String>,
1906) {
1907    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
1908
1909    let repository_download_url = name.and_then(|value| {
1910        version.map(|ver| {
1911            format!(
1912                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
1913                &value[..1.min(value.len())],
1914                value,
1915                value,
1916                ver
1917            )
1918        })
1919    });
1920
1921    let api_data_url = name.map(|value| {
1922        if let Some(ver) = version {
1923            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
1924        } else {
1925            format!("https://pypi.org/pypi/{}/json", value)
1926        }
1927    });
1928
1929    let purl = name.and_then(|value| {
1930        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
1931        if let Some(ver) = version {
1932            package_url.with_version(ver).ok()?;
1933        }
1934        Some(package_url.to_string())
1935    });
1936
1937    (
1938        repository_homepage_url,
1939        repository_download_url,
1940        api_data_url,
1941        purl,
1942    )
1943}
1944
1945fn build_pypi_purl_with_extension(
1946    name: &str,
1947    version: Option<&str>,
1948    extension: &str,
1949) -> Option<String> {
1950    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1951    if let Some(ver) = version {
1952        package_url.with_version(ver).ok()?;
1953    }
1954    package_url.add_qualifier("extension", extension).ok()?;
1955    Some(package_url.to_string())
1956}
1957
/// Extracts `PackageData` from a pyproject.toml file.
///
/// Recognizes three layouts, in priority order:
/// 1. PEP 621 `[project]` table,
/// 2. poetry's `[tool.poetry]` table,
/// 3. top-level metadata fields (when a top-level `name` key exists).
///
/// On read/parse failure or when no recognizable project data is found,
/// logs a warning and returns `default_package_data()`.
fn extract_from_pyproject_toml(path: &Path) -> PackageData {
    let toml_content = match read_toml_file(path) {
        Ok(content) => content,
        Err(e) => {
            warn!(
                "Failed to read or parse pyproject.toml at {:?}: {}",
                path, e
            );
            return default_package_data();
        }
    };

    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());

    // Handle both PEP 621 (project table) and poetry formats
    let project_table =
        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
            // Standard PEP 621 format with [project] table
            project.clone()
        } else if let Some(tool) = tool_table {
            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
                // Poetry format with [tool.poetry] table
                poetry.clone()
            } else {
                warn!(
                    "No project or tool.poetry data found in pyproject.toml at {:?}",
                    path
                );
                return default_package_data();
            }
        } else if toml_content.get(FIELD_NAME).is_some() {
            // Other format with top-level fields
            match toml_content.as_table() {
                Some(table) => table.clone(),
                None => {
                    warn!("Failed to convert TOML content to table in {:?}", path);
                    return default_package_data();
                }
            }
        } else {
            warn!("No project data found in pyproject.toml at {:?}", path);
            return default_package_data();
        };

    let name = project_table
        .get(FIELD_NAME)
        .and_then(|v| v.as_str())
        .map(String::from);

    let version = project_table
        .get(FIELD_VERSION)
        .and_then(|v| v.as_str())
        .map(String::from);
    // Trove classifiers; used below only for the private-package check.
    let classifiers = project_table
        .get("classifiers")
        .and_then(|value| value.as_array())
        .map(|values| {
            values
                .iter()
                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();

    let extracted_license_statement = extract_raw_license_string(&project_table);
    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
        normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));

    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
    let (homepage_url, repository_url) = extract_urls(&project_table);

    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
    let extra_data = extract_pyproject_extra_data(&toml_content);

    // Create package URL
    let purl = name.as_ref().and_then(|n| {
        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
            Ok(p) => p,
            Err(e) => {
                warn!(
                    "Failed to create PackageUrl for Python package '{}': {}",
                    n, e
                );
                return None;
            }
        };

        if let Some(v) = &version
            && let Err(e) = package_url.with_version(v)
        {
            warn!(
                "Failed to set version '{}' for Python package '{}': {}",
                v, n, e
            );
            return None;
        }

        Some(package_url.to_string())
    });

    let api_data_url = name.as_ref().map(|n| {
        if let Some(v) = &version {
            format!("https://pypi.org/pypi/{}/{}/json", n, v)
        } else {
            format!("https://pypi.org/pypi/{}/json", n)
        }
    });

    let pypi_homepage_url = name
        .as_ref()
        .map(|n| format!("https://pypi.org/project/{}", n));

    // PyPI source-download fallback URL; only buildable when a version exists.
    let pypi_download_url = name.as_ref().and_then(|n| {
        version.as_ref().map(|v| {
            format!(
                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
                &n[..1.min(n.len())],
                n,
                n,
                v
            )
        })
    });

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: None,
        description: None,
        release_date: None,
        parties: extract_parties(&project_table),
        keywords: Vec::new(),
        // Manifest-declared URLs win; the PyPI-derived ones are fallbacks.
        homepage_url: homepage_url.or(pypi_homepage_url),
        download_url: repository_url.clone().or(pypi_download_url),
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url: None,
        code_view_url: None,
        vcs_url: repository_url,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references: Vec::new(),
        is_private: has_private_classifier(&classifiers),
        is_virtual: false,
        extra_data,
        // Required and optional dependencies are flattened into one list.
        dependencies: [dependencies, optional_dependencies].concat(),
        repository_homepage_url: None,
        repository_download_url: None,
        api_data_url,
        datasource_id: Some(DatasourceId::PypiPyprojectToml),
        purl,
    }
}
2127
2128fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2129    project
2130        .get(FIELD_LICENSE)
2131        .and_then(|license_value| match license_value {
2132            TomlValue::String(license_str) => Some(license_str.clone()),
2133            TomlValue::Table(license_table) => license_table
2134                .get("text")
2135                .and_then(|v| v.as_str())
2136                .map(|s| s.to_string())
2137                .or_else(|| {
2138                    license_table
2139                        .get("expression")
2140                        .and_then(|v| v.as_str())
2141                        .map(|expr| expr.to_string())
2142                }),
2143            _ => None,
2144        })
2145}
2146
2147fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2148    match project.get(FIELD_LICENSE) {
2149        Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2150        Some(TomlValue::Table(license_table)) => license_table
2151            .get("expression")
2152            .and_then(|value| value.as_str()),
2153        _ => None,
2154    }
2155}
2156
2157fn extract_urls(project: &TomlMap<String, TomlValue>) -> (Option<String>, Option<String>) {
2158    let mut homepage_url = None;
2159    let mut repository_url = None;
2160
2161    // Check for URLs table
2162    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2163        homepage_url = urls
2164            .get(FIELD_HOMEPAGE)
2165            .and_then(|v| v.as_str())
2166            .map(String::from);
2167        repository_url = urls
2168            .get(FIELD_REPOSITORY)
2169            .and_then(|v| v.as_str())
2170            .map(String::from);
2171    }
2172
2173    // If not found in URLs table, check for top-level keys
2174    if homepage_url.is_none() {
2175        homepage_url = project
2176            .get(FIELD_HOMEPAGE)
2177            .and_then(|v| v.as_str())
2178            .map(String::from);
2179    }
2180
2181    if repository_url.is_none() {
2182        repository_url = project
2183            .get(FIELD_REPOSITORY)
2184            .and_then(|v| v.as_str())
2185            .map(String::from);
2186    }
2187
2188    (homepage_url, repository_url)
2189}
2190
2191fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2192    let mut parties = Vec::new();
2193
2194    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2195        for author in authors {
2196            if let Some(author_str) = author.as_str() {
2197                let (name, email) = split_name_email(author_str);
2198                parties.push(Party {
2199                    r#type: None,
2200                    role: Some("author".to_string()),
2201                    name,
2202                    email,
2203                    url: None,
2204                    organization: None,
2205                    organization_url: None,
2206                    timezone: None,
2207                });
2208            }
2209        }
2210    }
2211
2212    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2213        for maintainer in maintainers {
2214            if let Some(maintainer_str) = maintainer.as_str() {
2215                let (name, email) = split_name_email(maintainer_str);
2216                parties.push(Party {
2217                    r#type: None,
2218                    role: Some("maintainer".to_string()),
2219                    name,
2220                    email,
2221                    url: None,
2222                    organization: None,
2223                    organization_url: None,
2224                    timezone: None,
2225                });
2226            }
2227        }
2228    }
2229
2230    parties
2231}
2232
/// Collects dependencies from a pyproject project table (plus top-level
/// tables of the full TOML document).
///
/// Returns `(dependencies, optional_dependencies)`: the first list holds the
/// required dependencies; the second aggregates, in order:
/// - PEP 621 `optional-dependencies` groups (scope = extra name),
/// - poetry `dev-dependencies` (scope = the field name itself),
/// - poetry `[tool.poetry.group.<name>].dependencies` (scope = group name),
/// - the top-level dependency-groups table (scope = group name),
/// - `[tool.uv]` dev-dependencies (scope = "dev").
///
/// Each section accepts either an array of requirement strings or a
/// name-to-version table; any other value shape is ignored.
fn extract_dependencies(
    project: &TomlMap<String, TomlValue>,
    toml_content: &TomlValue,
) -> (Vec<Dependency>, Vec<Dependency>) {
    let mut dependencies = Vec::new();
    let mut optional_dependencies = Vec::new();

    // Handle dependencies - can be array or table format
    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
        match deps_value {
            TomlValue::Array(arr) => {
                dependencies = parse_dependency_array(arr, false, None);
            }
            TomlValue::Table(table) => {
                dependencies = parse_dependency_table(table, false, None);
            }
            _ => {}
        }
    }

    // Handle PEP 621 optional-dependencies with scope
    if let Some(opt_deps_table) = project
        .get(FIELD_OPTIONAL_DEPENDENCIES)
        .and_then(|v| v.as_table())
    {
        for (extra_name, deps) in opt_deps_table {
            match deps {
                TomlValue::Array(arr) => {
                    optional_dependencies.extend(parse_dependency_array(
                        arr,
                        true,
                        Some(extra_name),
                    ));
                }
                TomlValue::Table(table) => {
                    optional_dependencies.extend(parse_dependency_table(
                        table,
                        true,
                        Some(extra_name),
                    ));
                }
                _ => {}
            }
        }
    }

    // Handle Poetry dev-dependencies
    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
        match dev_deps_value {
            TomlValue::Array(arr) => {
                optional_dependencies.extend(parse_dependency_array(
                    arr,
                    true,
                    Some(FIELD_DEV_DEPENDENCIES),
                ));
            }
            TomlValue::Table(table) => {
                optional_dependencies.extend(parse_dependency_table(
                    table,
                    true,
                    Some(FIELD_DEV_DEPENDENCIES),
                ));
            }
            _ => {}
        }
    }

    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
    if let Some(groups_table) = project.get("group").and_then(|v| v.as_table()) {
        for (group_name, group_data) in groups_table {
            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
                match group_deps {
                    TomlValue::Array(arr) => {
                        optional_dependencies.extend(parse_dependency_array(
                            arr,
                            true,
                            Some(group_name),
                        ));
                    }
                    TomlValue::Table(table) => {
                        optional_dependencies.extend(parse_dependency_table(
                            table,
                            true,
                            Some(group_name),
                        ));
                    }
                    _ => {}
                }
            }
        }
    }

    // Top-level dependency groups table, scoped by group name.
    if let Some(groups_table) = toml_content
        .get(FIELD_DEPENDENCY_GROUPS)
        .and_then(|value| value.as_table())
    {
        for (group_name, deps) in groups_table {
            match deps {
                TomlValue::Array(arr) => {
                    optional_dependencies.extend(parse_dependency_array(
                        arr,
                        true,
                        Some(group_name),
                    ));
                }
                TomlValue::Table(table) => {
                    optional_dependencies.extend(parse_dependency_table(
                        table,
                        true,
                        Some(group_name),
                    ));
                }
                _ => {}
            }
        }
    }

    // [tool.uv] dev-dependencies, always scoped as "dev".
    if let Some(dev_deps_value) = toml_content
        .get("tool")
        .and_then(|value| value.as_table())
        .and_then(|tool| tool.get("uv"))
        .and_then(|value| value.as_table())
        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
    {
        match dev_deps_value {
            TomlValue::Array(arr) => {
                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
            }
            TomlValue::Table(table) => {
                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
            }
            _ => {}
        }
    }

    (dependencies, optional_dependencies)
}
2370
2371fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2372    let mut extra_data = HashMap::new();
2373
2374    if let Some(tool_uv) = toml_content
2375        .get("tool")
2376        .and_then(|value| value.as_table())
2377        .and_then(|tool| tool.get("uv"))
2378    {
2379        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2380    }
2381
2382    if extra_data.is_empty() {
2383        None
2384    } else {
2385        Some(extra_data)
2386    }
2387}
2388
2389fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2390    match value {
2391        TomlValue::String(value) => JsonValue::String(value.clone()),
2392        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2393        TomlValue::Float(value) => JsonValue::String(value.to_string()),
2394        TomlValue::Boolean(value) => JsonValue::Bool(*value),
2395        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2396        TomlValue::Array(values) => {
2397            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2398        }
2399        TomlValue::Table(values) => JsonValue::Object(
2400            values
2401                .iter()
2402                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2403                .collect::<JsonMap<String, JsonValue>>(),
2404        ),
2405    }
2406}
2407
2408fn parse_dependency_table(
2409    table: &TomlMap<String, TomlValue>,
2410    is_optional: bool,
2411    scope: Option<&str>,
2412) -> Vec<Dependency> {
2413    table
2414        .iter()
2415        .filter_map(|(name, version)| {
2416            let version_str = version.as_str().map(|s| s.to_string());
2417            let mut package_url =
2418                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2419
2420            if let Some(v) = &version_str {
2421                package_url.with_version(v).ok()?;
2422            }
2423
2424            Some(Dependency {
2425                purl: Some(package_url.to_string()),
2426                extracted_requirement: None,
2427                scope: scope.map(|s| s.to_string()),
2428                is_runtime: Some(!is_optional),
2429                is_optional: Some(is_optional),
2430                is_pinned: None,
2431                is_direct: Some(true),
2432                resolved_package: None,
2433                extra_data: None,
2434            })
2435        })
2436        .collect()
2437}
2438
2439fn parse_dependency_array(
2440    array: &[TomlValue],
2441    is_optional: bool,
2442    scope: Option<&str>,
2443) -> Vec<Dependency> {
2444    array
2445        .iter()
2446        .filter_map(|dep| {
2447            let dep_str = dep.as_str()?;
2448
2449            let mut parts = dep_str.split(['>', '=', '<', '~']);
2450            let name = parts.next()?.trim().to_string();
2451
2452            let version = parts.next().map(|v| v.trim().to_string());
2453
2454            let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name)
2455            {
2456                Ok(purl) => purl,
2457                Err(_) => return None,
2458            };
2459
2460            if let Some(ref v) = version {
2461                package_url.with_version(v).ok()?;
2462            }
2463
2464            Some(Dependency {
2465                purl: Some(package_url.to_string()),
2466                extracted_requirement: None,
2467                scope: scope.map(|s| s.to_string()),
2468                is_runtime: Some(!is_optional),
2469                is_optional: Some(is_optional),
2470                is_pinned: None,
2471                is_direct: Some(true),
2472                resolved_package: None,
2473                extra_data: None,
2474            })
2475        })
2476        .collect()
2477}
2478
/// A restricted Python literal value produced by `LiteralEvaluator` when
/// statically evaluating setup.py expressions (no code is ever executed).
#[derive(Debug, Clone)]
enum Value {
    // A string literal.
    String(String),
    // Any numeric literal; integers are widened to f64 by the evaluator.
    Number(f64),
    // A boolean literal.
    Bool(bool),
    // Python `None`.
    None,
    // A list literal.
    List(Vec<Value>),
    // A tuple literal.
    Tuple(Vec<Value>),
    // A dict (or dict-like call) whose keys were stringified.
    Dict(HashMap<String, Value>),
}
2489
/// Evaluates a restricted subset of Python literal expressions from a
/// setup.py AST, with depth and node-count budgets to bound work on
/// adversarial or pathological inputs.
struct LiteralEvaluator {
    // Named constants (e.g. module-level assignments) for `Name` lookups.
    constants: HashMap<String, Value>,
    // Maximum recursion depth before evaluation bails out with `None`.
    max_depth: usize,
    // Maximum total AST nodes visited before evaluation bails out.
    max_nodes: usize,
    // Running count of nodes visited across all evaluate calls.
    nodes_visited: usize,
}
2496
impl LiteralEvaluator {
    /// Creates an evaluator seeded with `constants` and the compile-time
    /// depth/node budgets.
    fn new(constants: HashMap<String, Value>) -> Self {
        Self {
            constants,
            max_depth: MAX_SETUP_PY_AST_DEPTH,
            max_nodes: MAX_SETUP_PY_AST_NODES,
            nodes_visited: 0,
        }
    }

    /// Registers a named constant for later `Name` lookups.
    fn insert_constant(&mut self, name: String, value: Value) {
        self.constants.insert(name, value);
    }

    /// Statically evaluates `expr` to a `Value`, or `None` when the
    /// expression is unsupported or a resource budget is exceeded.
    ///
    /// Supported forms: literal constants, names bound in `constants`,
    /// list/tuple/dict literals, keyword-only `dict(...)` calls, and
    /// `OrderedDict([...])` / `collections.OrderedDict([...])` calls.
    /// No user code is ever executed.
    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
        // Bail out once either the depth or total-node budget is exhausted.
        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
            return None;
        }
        self.nodes_visited += 1;

        match expr {
            ast::Expr::Constant(ast::ExprConstant { value, .. }) => self.evaluate_constant(value),
            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
            ast::Expr::List(ast::ExprList { elts, .. }) => {
                let mut values = Vec::new();
                for elt in elts {
                    values.push(self.evaluate_expr(elt, depth + 1)?);
                }
                Some(Value::List(values))
            }
            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
                let mut values = Vec::new();
                for elt in elts {
                    values.push(self.evaluate_expr(elt, depth + 1)?);
                }
                Some(Value::Tuple(values))
            }
            ast::Expr::Dict(ast::ExprDict { keys, values, .. }) => {
                let mut dict = HashMap::new();
                for (key_expr, value_expr) in keys.iter().zip(values.iter()) {
                    // A `None` key corresponds to a `**spread` entry, which
                    // is unsupported and aborts evaluation.
                    let key_expr = key_expr.as_ref()?;
                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
                    let key = value_to_string(&key_value)?;
                    let value = self.evaluate_expr(value_expr, depth + 1)?;
                    dict.insert(key, value);
                }
                Some(Value::Dict(dict))
            }
            ast::Expr::Call(ast::ExprCall {
                func,
                args,
                keywords,
                ..
            }) => {
                // `OrderedDict(iterable)` with positional args only.
                if keywords.is_empty()
                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
                {
                    return self.evaluate_ordered_dict(args, depth + 1);
                }

                // Beyond OrderedDict, only keyword-only calls are considered.
                if !args.is_empty() {
                    return None;
                }

                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
                    && id == "dict"
                {
                    let mut dict = HashMap::new();
                    for keyword in keywords {
                        // A missing keyword name corresponds to `**spread`,
                        // which is unsupported and aborts evaluation.
                        let key = keyword.arg.as_ref().map(|name| name.as_str())?;
                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
                        dict.insert(key.to_string(), value);
                    }
                    return Some(Value::Dict(dict));
                }

                None
            }
            _ => None,
        }
    }

    /// Maps a Python AST constant to a `Value`; integers are widened to
    /// `f64` via their decimal string form.
    fn evaluate_constant(&self, constant: &ast::Constant) -> Option<Value> {
        match constant {
            ast::Constant::Str(value) => Some(Value::String(value.clone())),
            ast::Constant::Bool(value) => Some(Value::Bool(*value)),
            ast::Constant::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
            ast::Constant::Float(value) => Some(Value::Number(*value)),
            ast::Constant::None => Some(Value::None),
            _ => None,
        }
    }

    /// Evaluates `OrderedDict(iterable)` where `iterable` is a list or tuple
    /// of two-element tuples, producing a `Value::Dict`. Note the backing
    /// `HashMap` does not preserve insertion order.
    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
        if args.len() != 1 {
            return None;
        }

        let items = match self.evaluate_expr(&args[0], depth)? {
            Value::List(items) | Value::Tuple(items) => items,
            _ => return None,
        };

        let mut dict = HashMap::new();
        for item in items {
            let Value::Tuple(values) = item else {
                return None;
            };
            if values.len() != 2 {
                return None;
            }
            let key = value_to_string(&values[0])?;
            dict.insert(key, values[1].clone());
        }

        Some(Value::Dict(dict))
    }
}
2616
/// Aliases under which a setup.py may reference the `setup()` entry point.
///
/// NOTE(review): this struct is populated and consumed outside this chunk;
/// the field names suggest `setup_names` holds identifiers bound to the
/// setup callable and `module_aliases` maps import aliases to module names —
/// confirm against the rest of the file.
#[derive(Default)]
struct SetupAliases {
    // Names that refer directly to the setup callable.
    setup_names: HashSet<String>,
    // Alias -> module-name mappings for imported modules.
    module_aliases: HashMap<String, String>,
}
2622
2623fn extract_from_setup_py(path: &Path) -> PackageData {
2624    let content = match read_file_to_string(path) {
2625        Ok(content) => content,
2626        Err(e) => {
2627            warn!("Failed to read setup.py at {:?}: {}", path, e);
2628            return default_package_data();
2629        }
2630    };
2631
2632    if content.len() > MAX_SETUP_PY_BYTES {
2633        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
2634        return extract_from_setup_py_regex(&content);
2635    }
2636
2637    let mut package_data = match extract_from_setup_py_ast(&content) {
2638        Ok(Some(data)) => data,
2639        Ok(None) => extract_from_setup_py_regex(&content),
2640        Err(e) => {
2641            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
2642            extract_from_setup_py_regex(&content)
2643        }
2644    };
2645
2646    if package_data.name.is_none() {
2647        package_data.name = extract_setup_value(&content, "name");
2648    }
2649
2650    if package_data.version.is_none() {
2651        package_data.version = extract_setup_value(&content, "version");
2652    }
2653
2654    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
2655
2656    if package_data.purl.is_none() {
2657        package_data.purl = build_setup_py_purl(
2658            package_data.name.as_deref(),
2659            package_data.version.as_deref(),
2660        );
2661    }
2662
2663    package_data
2664}
2665
2666fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
2667    if package_data.version.is_some()
2668        && package_data.extracted_license_statement.is_some()
2669        && package_data
2670            .parties
2671            .iter()
2672            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
2673    {
2674        return;
2675    }
2676
2677    let Some(root) = path.parent() else {
2678        return;
2679    };
2680
2681    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
2682
2683    if package_data.version.is_none() {
2684        package_data.version = dunder_metadata.version;
2685    }
2686
2687    if package_data.extracted_license_statement.is_none() {
2688        package_data.extracted_license_statement = dunder_metadata.license;
2689    }
2690
2691    let has_author = package_data
2692        .parties
2693        .iter()
2694        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
2695
2696    if !has_author && let Some(author) = dunder_metadata.author {
2697        package_data.parties.push(Party {
2698            r#type: Some("person".to_string()),
2699            role: Some("author".to_string()),
2700            name: Some(author),
2701            email: None,
2702            url: None,
2703            organization: None,
2704            organization_url: None,
2705            timezone: None,
2706        });
2707    }
2708}
2709
// Metadata scraped from dunder assignments in sibling Python modules.
#[derive(Default)]
struct DunderMetadata {
    // Value of a `__version__ = "..."` assignment, if found.
    version: Option<String>,
    // Value of an `__author__ = "..."` assignment, if found.
    author: Option<String>,
    // Value of a `__license__ = "..."` assignment, if found.
    license: Option<String>,
}
2716
2717fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
2718    let statements = match ast::Suite::parse(content, "<setup.py>") {
2719        Ok(statements) => statements,
2720        Err(_) => return DunderMetadata::default(),
2721    };
2722
2723    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2724    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2725    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2726    let mut metadata = DunderMetadata::default();
2727
2728    for module in imported_dunder_modules(&statements) {
2729        let Some(path) = resolve_imported_module_path(root, &module) else {
2730            continue;
2731        };
2732        let Ok(module_content) = read_file_to_string(&path) else {
2733            continue;
2734        };
2735
2736        if metadata.version.is_none() {
2737            metadata.version = version_re
2738                .as_ref()
2739                .and_then(|regex| regex.captures(&module_content))
2740                .and_then(|captures| captures.get(1))
2741                .map(|match_| match_.as_str().to_string());
2742        }
2743
2744        if metadata.author.is_none() {
2745            metadata.author = author_re
2746                .as_ref()
2747                .and_then(|regex| regex.captures(&module_content))
2748                .and_then(|captures| captures.get(1))
2749                .map(|match_| match_.as_str().to_string());
2750        }
2751
2752        if metadata.license.is_none() {
2753            metadata.license = license_re
2754                .as_ref()
2755                .and_then(|regex| regex.captures(&module_content))
2756                .and_then(|captures| captures.get(1))
2757                .map(|match_| match_.as_str().to_string());
2758        }
2759
2760        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
2761            return metadata;
2762        }
2763    }
2764
2765    metadata
2766}
2767
2768fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
2769    let mut modules = Vec::new();
2770
2771    for statement in statements {
2772        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
2773            continue;
2774        };
2775        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
2776            continue;
2777        };
2778        let imports_dunder = names.iter().any(|alias| {
2779            matches!(
2780                alias.name.as_str(),
2781                "__version__" | "__author__" | "__license__"
2782            )
2783        });
2784        if imports_dunder {
2785            modules.push(module.to_string());
2786        }
2787    }
2788
2789    modules
2790}
2791
/// Resolves a dotted module name to an on-disk file under `root`.
///
/// Tries, in order: `<module>.py`, `<module>/__init__.py`, then the same
/// two under a `src/` layout. Returns the first candidate that exists.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let relative: PathBuf = module.split('.').collect();
    let module_file = relative.with_extension("py");
    let package_init = relative.join("__init__.py");

    [
        root.join(&module_file),
        root.join(&package_init),
        root.join("src").join(&module_file),
        root.join("src").join(&package_init),
    ]
    .into_iter()
    .find(|candidate| candidate.exists())
}
2803
2804/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
2805///
2806/// # Security Model
2807///
2808/// This function parses setup.py as a Python AST and evaluates only literal values
2809/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
2810/// arbitrary code execution during scanning.
2811///
2812/// # DoS Prevention
2813///
2814/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
2815/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
2816/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
2817///
2818/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
2819fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
2820    let statements = ast::Suite::parse(content, "<setup.py>").map_err(|e| format!("{}", e))?;
2821    let aliases = collect_setup_aliases(&statements);
2822    let mut evaluator = LiteralEvaluator::new(HashMap::new());
2823    build_setup_py_constants(&statements, &mut evaluator);
2824
2825    let setup_call = find_setup_call(&statements, &aliases);
2826    let Some(call_expr) = setup_call else {
2827        return Ok(None);
2828    };
2829
2830    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
2831    Ok(Some(build_setup_py_package_data(&setup_values)))
2832}
2833
2834fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
2835    for stmt in statements {
2836        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
2837            if targets.len() != 1 {
2838                continue;
2839            }
2840
2841            let Some(name) = extract_assign_name(&targets[0]) else {
2842                continue;
2843            };
2844
2845            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
2846                evaluator.insert_constant(name, value);
2847            }
2848        }
2849    }
2850}
2851
2852fn extract_assign_name(target: &ast::Expr) -> Option<String> {
2853    match target {
2854        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
2855        _ => None,
2856    }
2857}
2858
2859fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
2860    let mut aliases = SetupAliases::default();
2861    aliases.setup_names.insert("setup".to_string());
2862
2863    for stmt in statements {
2864        match stmt {
2865            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
2866                for alias in names {
2867                    let module_name = alias.name.as_str();
2868                    if !is_setup_module(module_name) {
2869                        continue;
2870                    }
2871                    let alias_name = alias
2872                        .asname
2873                        .as_ref()
2874                        .map(|name| name.as_str())
2875                        .unwrap_or(module_name);
2876                    aliases
2877                        .module_aliases
2878                        .insert(alias_name.to_string(), module_name.to_string());
2879                }
2880            }
2881            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
2882                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
2883                    continue;
2884                };
2885                if !is_setup_module(module_name) {
2886                    continue;
2887                }
2888                for alias in names {
2889                    if alias.name.as_str() != "setup" {
2890                        continue;
2891                    }
2892                    let alias_name = alias
2893                        .asname
2894                        .as_ref()
2895                        .map(|name| name.as_str())
2896                        .unwrap_or("setup");
2897                    aliases.setup_names.insert(alias_name.to_string());
2898                }
2899            }
2900            _ => {}
2901        }
2902    }
2903
2904    aliases
2905}
2906
/// True for modules whose `setup` function we recognize.
fn is_setup_module(module_name: &str) -> bool {
    ["setuptools", "distutils", "distutils.core"].contains(&module_name)
}
2910
2911fn find_setup_call<'a>(
2912    statements: &'a [ast::Stmt],
2913    aliases: &'a SetupAliases,
2914) -> Option<&'a ast::Expr> {
2915    let mut finder = SetupCallFinder {
2916        aliases,
2917        nodes_visited: 0,
2918    };
2919    finder.find_in_statements(statements)
2920}
2921
// AST walker that searches statements for a `setup()` call (under any of the
// collected aliases) while enforcing a node-visit budget as a DoS guard.
struct SetupCallFinder<'a> {
    // Known `setup` callable names and module aliases from the imports.
    aliases: &'a SetupAliases,
    // Running count of visited nodes; traversal aborts once this reaches
    // MAX_SETUP_PY_AST_NODES.
    nodes_visited: usize,
}
2926
impl<'a> SetupCallFinder<'a> {
    /// Walks `statements` depth-first and returns the first `setup(...)`
    /// call expression found, or `None`.
    ///
    /// Only statement kinds that commonly wrap a `setup()` call are
    /// descended into (expression statements, assignments, and the bodies
    /// of `if`/`for`/`while`/`with`/`try`). Calls nested deeper inside
    /// arbitrary sub-expressions are not searched.
    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
        for stmt in statements {
            // Budget guard: give up entirely once the node limit is hit.
            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
                return None;
            }
            self.nodes_visited += 1;

            let found = match stmt {
                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
                // `orelse` covers `else` (and chained `elif`) branches.
                ast::Stmt::If(ast::StmtIf { body, orelse, .. }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse)),
                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse)),
                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
                // try / try*: search body, else, finally, then each handler.
                ast::Stmt::Try(ast::StmtTry {
                    body,
                    orelse,
                    finalbody,
                    handlers,
                    ..
                })
                | ast::Stmt::TryStar(ast::StmtTryStar {
                    body,
                    orelse,
                    finalbody,
                    handlers,
                    ..
                }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse))
                    .or_else(|| self.find_in_statements(finalbody))
                    .or_else(|| {
                        for handler in handlers {
                            let ast::ExceptHandler::ExceptHandler(
                                ast::ExceptHandlerExceptHandler { body, .. },
                            ) = handler;
                            if let Some(found) = self.find_in_statements(body) {
                                return Some(found);
                            }
                        }
                        None
                    }),
                _ => None,
            };

            if found.is_some() {
                return found;
            }
        }

        None
    }

    /// Returns `expr` itself when it is a direct `setup(...)` call; does
    /// not recurse into sub-expressions.
    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
            return None;
        }
        self.nodes_visited += 1;

        match expr {
            ast::Expr::Call(ast::ExprCall { func, .. })
                if is_setup_call(func.as_ref(), self.aliases) =>
            {
                Some(expr)
            }
            _ => None,
        }
    }
}
3001
3002fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3003    let Some(dotted) = dotted_name(func, 0) else {
3004        return false;
3005    };
3006
3007    if aliases.setup_names.contains(&dotted) {
3008        return true;
3009    }
3010
3011    let Some(module) = dotted.strip_suffix(".setup") else {
3012        return false;
3013    };
3014
3015    let resolved = resolve_module_alias(module, aliases);
3016    is_setup_module(&resolved)
3017}
3018
3019fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3020    if depth >= MAX_SETUP_PY_AST_DEPTH {
3021        return None;
3022    }
3023
3024    match expr {
3025        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3026        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3027            let base = dotted_name(value.as_ref(), depth + 1)?;
3028            Some(format!("{}.{}", base, attr.as_str()))
3029        }
3030        _ => None,
3031    }
3032}
3033
3034fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3035    if let Some(mapped) = aliases.module_aliases.get(module) {
3036        return mapped.clone();
3037    }
3038
3039    let Some((base, rest)) = module.split_once('.') else {
3040        return module.to_string();
3041    };
3042
3043    if let Some(mapped) = aliases.module_aliases.get(base) {
3044        return format!("{}.{}", mapped, rest);
3045    }
3046
3047    module.to_string()
3048}
3049
3050fn extract_setup_keywords(
3051    call_expr: &ast::Expr,
3052    evaluator: &mut LiteralEvaluator,
3053) -> HashMap<String, Value> {
3054    let mut values = HashMap::new();
3055    let ast::Expr::Call(ast::ExprCall { keywords, .. }) = call_expr else {
3056        return values;
3057    };
3058
3059    for keyword in keywords {
3060        if let Some(arg) = keyword.arg.as_ref().map(|name| name.as_str()) {
3061            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3062                values.insert(arg.to_string(), value);
3063            }
3064        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3065            for (key, value) in dict {
3066                values.insert(key, value);
3067            }
3068        }
3069    }
3070
3071    values
3072}
3073
/// Assembles a [`PackageData`] from the literal keyword values captured
/// from a `setup(...)` call.
///
/// Recognized keys: name, version, description/summary, url/home_page,
/// author(_email), maintainer(_email), license, classifiers, project_urls,
/// and the dependency keys handled by `build_setup_py_dependencies`.
fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
    // Prefer the canonical key, falling back to the legacy spelling.
    let name = get_value_string(values, "name");
    let version = get_value_string(values, "version");
    let description =
        get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
    let homepage_url =
        get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
    let author = get_value_string(values, "author");
    let author_email = get_value_string(values, "author_email");
    let maintainer = get_value_string(values, "maintainer");
    let maintainer_email = get_value_string(values, "maintainer_email");
    let license = get_value_string(values, "license");
    let classifiers = values
        .get("classifiers")
        .and_then(value_to_string_list)
        .unwrap_or_default();

    // One party per role, only when at least a name or an email is present.
    let mut parties = Vec::new();
    if author.is_some() || author_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: author,
            email: author_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    if maintainer.is_some() || maintainer_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("maintainer".to_string()),
            name: maintainer,
            email: maintainer_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    // Keep the raw license text and derive normalized SPDX expressions.
    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
        normalize_spdx_declared_license(license.as_deref());
    let extracted_license_statement = license.clone();

    let dependencies = build_setup_py_dependencies(values);
    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
    let mut homepage_from_project_urls = None;
    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
    let mut extra_data = HashMap::new();

    // Map well-known project_urls entries onto dedicated URL fields; any
    // remaining entries land in extra_data.
    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
        apply_project_url_mappings(
            &parsed_project_urls,
            &mut homepage_from_project_urls,
            &mut bug_tracking_url,
            &mut code_view_url,
            &mut vcs_url,
            &mut extra_data,
        );
    }

    let extra_data = if extra_data.is_empty() {
        None
    } else {
        Some(extra_data)
    };

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: Some("Python".to_string()),
        description,
        release_date: None,
        parties,
        keywords: Vec::new(),
        // The explicit `url` keyword wins over a project_urls homepage.
        homepage_url: homepage_url.or(homepage_from_project_urls),
        download_url: None,
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url,
        code_view_url,
        vcs_url,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references: Vec::new(),
        // The "Private :: Do Not Upload" classifier marks a private package.
        is_private: has_private_classifier(&classifiers),
        is_virtual: false,
        extra_data,
        dependencies,
        repository_homepage_url: None,
        repository_download_url: None,
        api_data_url: None,
        datasource_id: Some(DatasourceId::PypiSetupPy),
        purl,
    }
}
3190
3191fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3192    let mut dependencies = Vec::new();
3193
3194    if let Some(reqs) = values
3195        .get("install_requires")
3196        .and_then(value_to_string_list)
3197    {
3198        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3199    }
3200
3201    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3202        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3203    }
3204
3205    if let Some(Value::Dict(extras)) = values.get("extras_require") {
3206        let mut extra_items: Vec<_> = extras.iter().collect();
3207        extra_items.sort_by_key(|(name, _)| *name);
3208        for (extra_name, extra_value) in extra_items {
3209            if let Some(reqs) = value_to_string_list(extra_value) {
3210                dependencies.extend(build_setup_py_dependency_list(
3211                    reqs.as_slice(),
3212                    extra_name,
3213                    true,
3214                ));
3215            }
3216        }
3217    }
3218
3219    dependencies
3220}
3221
3222fn build_setup_py_dependency_list(
3223    reqs: &[String],
3224    scope: &str,
3225    is_optional: bool,
3226) -> Vec<Dependency> {
3227    reqs.iter()
3228        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3229        .collect()
3230}
3231
3232fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3233    values.get(key).and_then(value_to_string)
3234}
3235
3236fn value_to_string(value: &Value) -> Option<String> {
3237    match value {
3238        Value::String(value) => Some(value.clone()),
3239        Value::Number(value) => Some(value.to_string()),
3240        Value::Bool(value) => Some(value.to_string()),
3241        _ => None,
3242    }
3243}
3244
3245fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3246    match value {
3247        Value::String(value) => Some(vec![value.clone()]),
3248        Value::List(values) | Value::Tuple(values) => {
3249            let mut items = Vec::new();
3250            for item in values {
3251                items.push(value_to_string(item)?);
3252            }
3253            Some(items)
3254        }
3255        _ => None,
3256    }
3257}
3258
3259fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3260    let Value::Dict(dict) = value else {
3261        return None;
3262    };
3263
3264    let mut pairs: Vec<(String, String)> = dict
3265        .iter()
3266        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3267        .collect::<Option<Vec<_>>>()?;
3268    pairs.sort_by(|left, right| left.0.cmp(&right.0));
3269    Some(pairs)
3270}
3271
3272fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3273    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3274    requires_dist
3275        .iter()
3276        .filter_map(|entry| build_rfc822_dependency(entry))
3277        .collect()
3278}
3279
/// Builds a dependency from a single `Requires-Dist` value, defaulting to
/// the "install" scope and non-optional.
fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
    build_python_dependency(entry, "install", false, None)
}
3283
3284fn build_python_dependency(
3285    entry: &str,
3286    default_scope: &str,
3287    default_optional: bool,
3288    marker_override: Option<&str>,
3289) -> Option<Dependency> {
3290    let (requirement_part, marker_part) = entry
3291        .split_once(';')
3292        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3293        .unwrap_or((entry.trim(), None));
3294
3295    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3296    let requirement = normalize_rfc822_requirement(requirement_part);
3297    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3298        marker_part.or(marker_override),
3299        default_scope,
3300        default_optional,
3301    );
3302    let mut purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
3303
3304    let is_pinned = requirement
3305        .as_deref()
3306        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3307    if is_pinned
3308        && let Some(version) = requirement
3309            .as_deref()
3310            .map(|req| req.trim_start_matches('='))
3311    {
3312        purl.with_version(version).ok()?;
3313    }
3314
3315    let mut extra_data = HashMap::new();
3316    extra_data.extend(marker_data);
3317    if let Some(marker) = marker {
3318        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3319    }
3320
3321    Some(Dependency {
3322        purl: Some(purl.to_string()),
3323        extracted_requirement: requirement,
3324        scope: Some(scope),
3325        is_runtime: Some(true),
3326        is_optional: Some(is_optional),
3327        is_pinned: Some(is_pinned),
3328        is_direct: Some(true),
3329        resolved_package: None,
3330        extra_data: if extra_data.is_empty() {
3331            None
3332        } else {
3333            Some(extra_data)
3334        },
3335    })
3336}
3337
/// Normalizes the version-specifier tail of a requirement string.
///
/// Strips the package name, any `[extras]` group, and an optional fully
/// parenthesized specifier list (`Requires-Dist` style), then returns the
/// remaining comma-separated specifiers with inner spaces removed, sorted
/// lexicographically and re-joined with `,`. Returns `None` when no
/// specifiers remain or no name could be extracted.
fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
    let name = extract_setup_cfg_dependency_name(requirement_part)?;
    let trimmed = requirement_part.trim();
    // NOTE(review): assumes the extracted name is a verbatim prefix of the
    // trimmed input (same byte length, no normalization) — confirm against
    // extract_setup_cfg_dependency_name, otherwise this slice could split
    // mid-token.
    let mut remainder = trimmed[name.len()..].trim();

    // Skip an extras group such as `pkg[extra1,extra2]`.
    if let Some(stripped) = remainder.strip_prefix('[')
        && let Some(end_idx) = stripped.find(']')
    {
        remainder = stripped[end_idx + 1..].trim();
    }

    // Unwrap a parenthesized specifier list: `(>=1.0, <2.0)` -> `>=1.0, <2.0`.
    let remainder = remainder
        .strip_prefix('(')
        .and_then(|value| value.strip_suffix(')'))
        .unwrap_or(remainder)
        .trim();

    if remainder.is_empty() {
        return None;
    }

    // Canonicalize: strip spaces inside specifiers, drop empties, sort.
    let mut specifiers: Vec<String> = remainder
        .split(',')
        .map(|specifier| specifier.trim().replace(' ', ""))
        .filter(|specifier| !specifier.is_empty())
        .collect();
    specifiers.sort();
    Some(specifiers.join(","))
}
3367
3368fn parse_rfc822_marker(
3369    marker_part: Option<&str>,
3370    default_scope: &str,
3371    default_optional: bool,
3372) -> (
3373    String,
3374    bool,
3375    Option<String>,
3376    HashMap<String, serde_json::Value>,
3377) {
3378    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
3379        return (
3380            default_scope.to_string(),
3381            default_optional,
3382            None,
3383            HashMap::new(),
3384        );
3385    };
3386
3387    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
3388        .expect("extra marker regex should compile");
3389    let mut extra_data = HashMap::new();
3390
3391    if let Some(python_version) = extract_marker_field(marker, "python_version") {
3392        extra_data.insert(
3393            "python_version".to_string(),
3394            serde_json::Value::String(python_version),
3395        );
3396    }
3397    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
3398        extra_data.insert(
3399            "sys_platform".to_string(),
3400            serde_json::Value::String(sys_platform),
3401        );
3402    }
3403
3404    if let Some(captures) = extra_re.captures(marker)
3405        && let Some(scope) = captures.get(1)
3406    {
3407        return (
3408            scope.as_str().to_string(),
3409            true,
3410            Some(marker.trim().to_string()),
3411            extra_data,
3412        );
3413    }
3414
3415    (
3416        default_scope.to_string(),
3417        default_optional,
3418        Some(marker.trim().to_string()),
3419        extra_data,
3420    )
3421}
3422
3423fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
3424    let re = Regex::new(&format!(
3425        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
3426        field
3427    ))
3428    .ok()?;
3429    let captures = re.captures(marker)?;
3430    let operator = captures.get(1)?.as_str();
3431    let value = captures.get(2)?.as_str();
3432    Some(format!("{} {}", operator, value))
3433}
3434
3435fn parse_requires_txt(content: &str) -> Vec<Dependency> {
3436    let mut dependencies = Vec::new();
3437    let mut current_scope = "install".to_string();
3438    let mut current_optional = false;
3439    let mut current_marker: Option<String> = None;
3440
3441    for line in content.lines() {
3442        let trimmed = line.trim();
3443        if trimmed.is_empty() || trimmed.starts_with('#') {
3444            continue;
3445        }
3446
3447        if trimmed.starts_with('[') && trimmed.ends_with(']') {
3448            let inner = &trimmed[1..trimmed.len() - 1];
3449            if let Some(rest) = inner.strip_prefix(':') {
3450                current_scope = "install".to_string();
3451                current_optional = false;
3452                current_marker = Some(rest.trim().to_string());
3453            } else if let Some((scope, marker)) = inner.split_once(':') {
3454                current_scope = scope.trim().to_string();
3455                current_optional = true;
3456                current_marker = Some(marker.trim().to_string());
3457            } else {
3458                current_scope = inner.trim().to_string();
3459                current_optional = true;
3460                current_marker = None;
3461            }
3462            continue;
3463        }
3464
3465        if let Some(dependency) = build_python_dependency(
3466            trimmed,
3467            &current_scope,
3468            current_optional,
3469            current_marker.as_deref(),
3470        ) {
3471            dependencies.push(dependency);
3472        }
3473    }
3474
3475    dependencies
3476}
3477
/// True when the classifiers contain the conventional private-package
/// marker "Private :: Do Not Upload" (case-insensitive).
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";
    classifiers
        .iter()
        .any(|classifier| classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER))
}
3483
3484fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
3485    let name = name?;
3486    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
3487    if let Some(version) = version {
3488        package_url.with_version(version).ok()?;
3489    }
3490    Some(package_url.to_string())
3491}
3492
/// Regex-based fallback extraction for setup.py files that are too large or
/// fail AST parsing.
///
/// Recovers only name, version, license, url and dependencies via
/// `extract_setup_value` / `extract_setup_py_dependencies`; all other
/// fields are left empty.
fn extract_from_setup_py_regex(content: &str) -> PackageData {
    let name = extract_setup_value(content, "name");
    let version = extract_setup_value(content, "version");
    let license_expression = extract_setup_value(content, "license");

    // Keep the raw license text and derive normalized SPDX expressions.
    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
        normalize_spdx_declared_license(license_expression.as_deref());
    let extracted_license_statement = license_expression.clone();

    let dependencies = extract_setup_py_dependencies(content);
    let homepage_url = extract_setup_value(content, "url");
    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: Some("Python".to_string()),
        description: None,
        release_date: None,
        parties: Vec::new(),
        keywords: Vec::new(),
        homepage_url,
        download_url: None,
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url: None,
        code_view_url: None,
        vcs_url: None,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references: Vec::new(),
        is_private: false,
        is_virtual: false,
        extra_data: None,
        dependencies,
        repository_homepage_url: None,
        repository_download_url: None,
        api_data_url: None,
        datasource_id: Some(DatasourceId::PypiSetupPy),
        purl,
    }
}
3551
/// Converts extracted `PackageData` into a `ResolvedPackage`, defaulting
/// missing namespace/name/version to empty strings and the package type to
/// PyPI. Parties, license data and other descriptive fields are not part
/// of the resolved representation and are dropped.
fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
    crate::models::ResolvedPackage {
        package_type: pkg.package_type.unwrap_or(PackageType::Pypi),
        namespace: pkg.namespace.clone().unwrap_or_default(),
        name: pkg.name.clone().unwrap_or_default(),
        version: pkg.version.clone().unwrap_or_default(),
        primary_language: pkg.primary_language.clone(),
        download_url: pkg.download_url.clone(),
        sha1: pkg.sha1.clone(),
        sha256: pkg.sha256.clone(),
        sha512: pkg.sha512.clone(),
        md5: pkg.md5.clone(),
        is_virtual: pkg.is_virtual,
        // extra_data is intentionally not carried over.
        extra_data: None,
        dependencies: pkg.dependencies.clone(),
        repository_homepage_url: pkg.repository_homepage_url.clone(),
        repository_download_url: pkg.repository_download_url.clone(),
        api_data_url: pkg.api_data_url.clone(),
        datasource_id: pkg.datasource_id,
        purl: pkg.purl.clone(),
    }
}
3574
/// Parses a PyPI registry JSON API response (`pypi.json`) into [`PackageData`].
///
/// Reads core metadata from the `info` object (name, version, description,
/// author, license, keywords, classifiers, `project_urls`) and picks one
/// downloadable artifact from the top-level `urls` array. On read/parse
/// failure, or when `info` is missing, logs a warning and returns a minimal
/// `PackageData` carrying only the package type and datasource id.
fn extract_from_pypi_json(path: &Path) -> PackageData {
    // Minimal fallback used by every failure path below.
    let default = PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        datasource_id: Some(DatasourceId::PypiJson),
        ..Default::default()
    };

    let content = match read_file_to_string(path) {
        Ok(content) => content,
        Err(error) => {
            warn!("Failed to read pypi.json at {:?}: {}", path, error);
            return default;
        }
    };

    let root: serde_json::Value = match serde_json::from_str(&content) {
        Ok(value) => value,
        Err(error) => {
            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
            return default;
        }
    };

    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
        warn!("No info object found in pypi.json at {:?}", path);
        return default;
    };

    let name = info
        .get("name")
        .and_then(|value| value.as_str())
        .map(ToOwned::to_owned);
    let version = info
        .get("version")
        .and_then(|value| value.as_str())
        .map(ToOwned::to_owned);
    let summary = info
        .get("summary")
        .and_then(|value| value.as_str())
        .map(ToOwned::to_owned);
    // Prefer the long description; fall back to the one-line summary when it
    // is absent or whitespace-only.
    let description = info
        .get("description")
        .and_then(|value| value.as_str())
        .filter(|value| !value.trim().is_empty())
        .map(ToOwned::to_owned)
        .or(summary);
    // Mutable: may be overridden by a project_urls "homepage"-style entry or,
    // at the end, by the computed repository homepage.
    let mut homepage_url = info
        .get("home_page")
        .and_then(|value| value.as_str())
        .map(ToOwned::to_owned);
    let author = info
        .get("author")
        .and_then(|value| value.as_str())
        .filter(|value| !value.trim().is_empty())
        .map(ToOwned::to_owned);
    let author_email = info
        .get("author_email")
        .and_then(|value| value.as_str())
        .filter(|value| !value.trim().is_empty())
        .map(ToOwned::to_owned);
    let license = info
        .get("license")
        .and_then(|value| value.as_str())
        .filter(|value| !value.trim().is_empty())
        .map(ToOwned::to_owned);
    // PyPI serves keywords as a comma-separated string; reuse the setup.cfg
    // splitter.
    let keywords = parse_setup_cfg_keywords(
        info.get("keywords")
            .and_then(|value| value.as_str())
            .map(ToOwned::to_owned),
    );
    let classifiers = info
        .get("classifiers")
        .and_then(|value| value.as_array())
        .map(|values| {
            values
                .iter()
                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();

    // PyPI JSON exposes a single author name/email pair.
    let mut parties = Vec::new();
    if author.is_some() || author_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: author,
            email: author_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    let mut bug_tracking_url = None;
    let mut code_view_url = None;
    let mut vcs_url = None;
    let mut extra_data = HashMap::new();

    // Sort the label/url pairs so the mapping below (and the recorded
    // extra_data) is deterministic regardless of JSON object order.
    let parsed_project_urls = info
        .get("project_urls")
        .and_then(|value| value.as_object())
        .map(|map| {
            let mut pairs: Vec<(String, String)> = map
                .iter()
                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
                .collect();
            pairs.sort_by(|left, right| left.0.cmp(&right.0));
            pairs
        })
        .unwrap_or_default();

    apply_project_url_mappings(
        &parsed_project_urls,
        &mut homepage_url,
        &mut bug_tracking_url,
        &mut code_view_url,
        &mut vcs_url,
        &mut extra_data,
    );

    // Pick one artifact (sdist preferred) for download metadata.
    let (download_url, size, sha256) = root
        .get("urls")
        .and_then(|value| value.as_array())
        .map(|urls| select_pypi_json_artifact(urls))
        .unwrap_or((None, None, None));

    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
        build_pypi_urls(name.as_deref(), version.as_deref());

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: None,
        description,
        release_date: None,
        parties,
        keywords,
        // Fall back to the computed PyPI project page when no homepage was
        // declared in the metadata.
        homepage_url: homepage_url.or(repository_homepage_url.clone()),
        download_url,
        size,
        sha1: None,
        md5: None,
        sha256,
        sha512: None,
        bug_tracking_url,
        code_view_url,
        vcs_url,
        copyright: None,
        holder: None,
        // License detection happens in a separate engine; only the raw
        // statement is captured here.
        declared_license_expression: None,
        declared_license_expression_spdx: None,
        license_detections: Vec::new(),
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement: license,
        notice_text: None,
        source_packages: Vec::new(),
        file_references: Vec::new(),
        is_private: has_private_classifier(&classifiers),
        is_virtual: false,
        extra_data: if extra_data.is_empty() {
            None
        } else {
            Some(extra_data)
        },
        dependencies: Vec::new(),
        repository_homepage_url,
        repository_download_url,
        api_data_url,
        datasource_id: Some(DatasourceId::PypiJson),
        purl,
    }
}
3755
3756fn select_pypi_json_artifact(
3757    urls: &[serde_json::Value],
3758) -> (Option<String>, Option<u64>, Option<String>) {
3759    let selected = urls
3760        .iter()
3761        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
3762        .or_else(|| urls.first());
3763
3764    let Some(entry) = selected else {
3765        return (None, None, None);
3766    };
3767
3768    let download_url = entry
3769        .get("url")
3770        .and_then(|value| value.as_str())
3771        .map(ToOwned::to_owned);
3772    let size = entry.get("size").and_then(|value| value.as_u64());
3773    let sha256 = entry
3774        .get("digests")
3775        .and_then(|value| value.as_object())
3776        .and_then(|digests| digests.get("sha256"))
3777        .and_then(|value| value.as_str())
3778        .map(ToOwned::to_owned);
3779
3780    (download_url, size, sha256)
3781}
3782
/// Parses `pip inspect` output (`pip-inspect.deplock`) into [`PackageData`].
///
/// Walks the top-level `installed` array: the entry that is both marked
/// `requested` and carries a `direct_url` becomes the main package; every
/// other entry becomes a pinned dependency with an embedded resolved package.
/// Read/parse failures and a missing `installed` array log a warning and
/// return a bare default package.
///
/// NOTE(review): if no entry is both requested and has a `direct_url`, the
/// dependencies collected here are discarded and a bare default is returned
/// — confirm that is intended.
fn extract_from_pip_inspect(path: &Path) -> PackageData {
    let content = match read_file_to_string(path) {
        Ok(content) => content,
        Err(e) => {
            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let root: serde_json::Value = match serde_json::from_str(&content) {
        Ok(value) => value,
        Err(e) => {
            warn!(
                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
                path, e
            );
            return default_package_data();
        }
    };

    let installed = match root.get("installed").and_then(|v| v.as_array()) {
        Some(arr) => arr,
        None => {
            warn!(
                "No 'installed' array found in pip-inspect.deplock at {:?}",
                path
            );
            return default_package_data();
        }
    };

    // Tool/format versions are recorded on the main package's extra_data.
    let pip_version = root
        .get("pip_version")
        .and_then(|v| v.as_str())
        .map(String::from);
    let inspect_version = root
        .get("version")
        .and_then(|v| v.as_str())
        .map(String::from);

    let mut main_package: Option<PackageData> = None;
    let mut dependencies: Vec<Dependency> = Vec::new();

    for package_entry in installed {
        // Entries without a metadata object cannot be represented; skip them.
        let metadata = match package_entry.get("metadata") {
            Some(m) => m,
            None => continue,
        };

        // `requested` + `direct_url` together identify the package the user
        // installed directly (vs. packages pulled in as dependencies).
        let is_requested = package_entry
            .get("requested")
            .and_then(|v| v.as_bool())
            .unwrap_or(false);
        let has_direct_url = package_entry.get("direct_url").is_some();

        let name = metadata
            .get("name")
            .and_then(|v| v.as_str())
            .map(String::from);
        let version = metadata
            .get("version")
            .and_then(|v| v.as_str())
            .map(String::from);
        let summary = metadata
            .get("summary")
            .and_then(|v| v.as_str())
            .map(String::from);
        let home_page = metadata
            .get("home_page")
            .and_then(|v| v.as_str())
            .map(String::from);
        let author = metadata
            .get("author")
            .and_then(|v| v.as_str())
            .map(String::from);
        let author_email = metadata
            .get("author_email")
            .and_then(|v| v.as_str())
            .map(String::from);
        let license = metadata
            .get("license")
            .and_then(|v| v.as_str())
            .map(String::from);
        let description = metadata
            .get("description")
            .and_then(|v| v.as_str())
            .map(String::from);
        let keywords = metadata
            .get("keywords")
            .and_then(|v| v.as_array())
            .map(|arr| {
                arr.iter()
                    .filter_map(|k| k.as_str().map(String::from))
                    .collect::<Vec<_>>()
            })
            .unwrap_or_default();

        let mut parties = Vec::new();
        if author.is_some() || author_email.is_some() {
            parties.push(Party {
                r#type: Some("person".to_string()),
                role: Some("author".to_string()),
                name: author,
                email: author_email,
                url: None,
                organization: None,
                organization_url: None,
                timezone: None,
            });
        }

        // Extract license statement only - detection happens in separate engine
        let license_detections = Vec::new();
        let declared_license_expression = None;
        let declared_license_expression_spdx = None;
        let extracted_license_statement = license.clone();

        // pkg:pypi/<name>@<version>; None when the name is missing or invalid.
        let purl = name.as_ref().and_then(|n| {
            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
            if let Some(v) = &version {
                package_url.with_version(v).ok()?;
            }
            Some(package_url.to_string())
        });

        if is_requested && has_direct_url {
            // Main package: attach the pip/inspect version info.
            let mut extra_data = HashMap::new();
            if let Some(pv) = &pip_version {
                extra_data.insert(
                    "pip_version".to_string(),
                    serde_json::Value::String(pv.clone()),
                );
            }
            if let Some(iv) = &inspect_version {
                extra_data.insert(
                    "inspect_version".to_string(),
                    serde_json::Value::String(iv.clone()),
                );
            }

            main_package = Some(PackageData {
                package_type: Some(PythonParser::PACKAGE_TYPE),
                namespace: None,
                name,
                version,
                qualifiers: None,
                subpath: None,
                primary_language: Some("Python".to_string()),
                description: description.or(summary),
                release_date: None,
                parties,
                keywords,
                homepage_url: home_page,
                download_url: None,
                size: None,
                sha1: None,
                md5: None,
                sha256: None,
                sha512: None,
                bug_tracking_url: None,
                code_view_url: None,
                vcs_url: None,
                copyright: None,
                holder: None,
                declared_license_expression,
                declared_license_expression_spdx,
                license_detections,
                other_license_expression: None,
                other_license_expression_spdx: None,
                other_license_detections: Vec::new(),
                extracted_license_statement,
                notice_text: None,
                source_packages: Vec::new(),
                file_references: Vec::new(),
                is_private: false,
                // Installed-environment packages have no on-disk manifest here.
                is_virtual: true,
                extra_data: if extra_data.is_empty() {
                    None
                } else {
                    Some(extra_data)
                },
                dependencies: Vec::new(),
                repository_homepage_url: None,
                repository_download_url: None,
                api_data_url: None,
                datasource_id: Some(DatasourceId::PypiInspectDeplock),
                purl,
            });
        } else {
            // Every other installed entry becomes a pinned dependency that
            // embeds its fully-resolved package data.
            let resolved_package = PackageData {
                package_type: Some(PythonParser::PACKAGE_TYPE),
                namespace: None,
                name: name.clone(),
                version: version.clone(),
                qualifiers: None,
                subpath: None,
                primary_language: Some("Python".to_string()),
                description: description.or(summary),
                release_date: None,
                parties,
                keywords,
                homepage_url: home_page,
                download_url: None,
                size: None,
                sha1: None,
                md5: None,
                sha256: None,
                sha512: None,
                bug_tracking_url: None,
                code_view_url: None,
                vcs_url: None,
                copyright: None,
                holder: None,
                declared_license_expression,
                declared_license_expression_spdx,
                license_detections,
                other_license_expression: None,
                other_license_expression_spdx: None,
                other_license_detections: Vec::new(),
                extracted_license_statement,
                notice_text: None,
                source_packages: Vec::new(),
                file_references: Vec::new(),
                is_private: false,
                is_virtual: true,
                extra_data: None,
                dependencies: Vec::new(),
                repository_homepage_url: None,
                repository_download_url: None,
                api_data_url: None,
                datasource_id: Some(DatasourceId::PypiInspectDeplock),
                purl: purl.clone(),
            };

            let resolved = package_data_to_resolved(&resolved_package);
            dependencies.push(Dependency {
                purl,
                extracted_requirement: None,
                scope: None,
                is_runtime: Some(true),
                is_optional: Some(false),
                // pip inspect reports exact installed versions.
                is_pinned: Some(true),
                // A requested entry without a direct_url still counts as a
                // direct dependency.
                is_direct: Some(is_requested),
                resolved_package: Some(Box::new(resolved)),
                extra_data: None,
            });
        }
    }

    if let Some(mut main_pkg) = main_package {
        main_pkg.dependencies = dependencies;
        main_pkg
    } else {
        default_package_data()
    }
}
4039
/// Parsed setup.cfg content: lowercased section name -> lowercased key ->
/// list of values (multi-line values contribute one entry per line).
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4041
/// Parses a `setup.cfg` file into [`PackageData`].
///
/// Reads the `[metadata]` section for identity, parties, license, keywords,
/// classifiers and `project_urls`; `[options]` for `python_requires` and
/// dependency lists; and `[options.extras_require]` for optional extras.
/// Read failures log a warning and return a default package.
fn extract_from_setup_cfg(path: &Path) -> PackageData {
    let content = match read_file_to_string(path) {
        Ok(content) => content,
        Err(e) => {
            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
            return default_package_data();
        }
    };

    let sections = parse_setup_cfg(&content);
    let name = get_ini_value(&sections, "metadata", "name");
    let version = get_ini_value(&sections, "metadata", "version");
    let description = get_ini_value(&sections, "metadata", "description");
    let author = get_ini_value(&sections, "metadata", "author");
    let author_email = get_ini_value(&sections, "metadata", "author_email");
    let maintainer = get_ini_value(&sections, "metadata", "maintainer");
    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
    let license = get_ini_value(&sections, "metadata", "license");
    // Mutable: a project_urls "homepage"-style entry may fill this in.
    let mut homepage_url = get_ini_value(&sections, "metadata", "url");
    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
    let python_requires = get_ini_value(&sections, "options", "python_requires");
    let parsed_project_urls =
        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
    let mut extra_data = HashMap::new();

    // Author and maintainer each become one Party when any field is present.
    let mut parties = Vec::new();
    if author.is_some() || author_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: author,
            email: author_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    if maintainer.is_some() || maintainer_email.is_some() {
        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("maintainer".to_string()),
            name: maintainer,
            email: maintainer_email,
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    }

    // License detection happens in a separate engine; only the raw statement
    // is captured here.
    let declared_license_expression = None;
    let declared_license_expression_spdx = None;
    let license_detections = Vec::new();
    let extracted_license_statement = license.clone();

    let dependencies = extract_setup_cfg_dependencies(&sections);

    if let Some(value) = python_requires {
        extra_data.insert(
            "python_requires".to_string(),
            serde_json::Value::String(value),
        );
    }

    apply_project_url_mappings(
        &parsed_project_urls,
        &mut homepage_url,
        &mut bug_tracking_url,
        &mut code_view_url,
        &mut vcs_url,
        &mut extra_data,
    );

    let extra_data = if extra_data.is_empty() {
        None
    } else {
        Some(extra_data)
    };

    // pkg:pypi/<name>@<version>; None when the name is missing or invalid.
    let purl = name.as_ref().and_then(|n| {
        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
        if let Some(v) = &version {
            package_url.with_version(v).ok()?;
        }
        Some(package_url.to_string())
    });

    PackageData {
        package_type: Some(PythonParser::PACKAGE_TYPE),
        namespace: None,
        name,
        version,
        qualifiers: None,
        subpath: None,
        primary_language: Some("Python".to_string()),
        description,
        release_date: None,
        parties,
        keywords,
        homepage_url,
        download_url: None,
        size: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha512: None,
        bug_tracking_url,
        code_view_url,
        vcs_url,
        copyright: None,
        holder: None,
        declared_license_expression,
        declared_license_expression_spdx,
        license_detections,
        other_license_expression: None,
        other_license_expression_spdx: None,
        other_license_detections: Vec::new(),
        extracted_license_statement,
        notice_text: None,
        source_packages: Vec::new(),
        file_references: Vec::new(),
        is_private: has_private_classifier(&classifiers),
        is_virtual: false,
        extra_data,
        dependencies,
        repository_homepage_url: None,
        repository_download_url: None,
        api_data_url: None,
        datasource_id: Some(DatasourceId::PypiSetupCfg),
        purl,
    }
}
4178
/// Splits a comma-separated keywords string into trimmed, non-empty entries.
/// A `None` input yields an empty list.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    match value {
        None => Vec::new(),
        Some(raw) => raw
            .split(',')
            .filter_map(|piece| {
                let piece = piece.trim();
                (!piece.is_empty()).then(|| piece.to_string())
            })
            .collect(),
    }
}
4191
/// Parses `label = url` lines from a setup.cfg `project_urls` value into
/// `(label, url)` pairs, dropping malformed entries (no `=`, or an empty
/// label or url after trimming).
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::with_capacity(entries.len());
    for entry in entries {
        let Some((label, url)) = entry.split_once('=') else {
            continue;
        };
        let (label, url) = (label.trim(), url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }
    pairs
}
4207
4208fn apply_project_url_mappings(
4209    parsed_urls: &[(String, String)],
4210    homepage_url: &mut Option<String>,
4211    bug_tracking_url: &mut Option<String>,
4212    code_view_url: &mut Option<String>,
4213    vcs_url: &mut Option<String>,
4214    extra_data: &mut HashMap<String, serde_json::Value>,
4215) {
4216    for (label, url) in parsed_urls {
4217        let label_lower = label.to_lowercase();
4218
4219        if bug_tracking_url.is_none()
4220            && matches!(
4221                label_lower.as_str(),
4222                "tracker"
4223                    | "bug reports"
4224                    | "bug tracker"
4225                    | "issues"
4226                    | "issue tracker"
4227                    | "github: issues"
4228            )
4229        {
4230            *bug_tracking_url = Some(url.clone());
4231        } else if code_view_url.is_none()
4232            && matches!(label_lower.as_str(), "source" | "source code" | "code")
4233        {
4234            *code_view_url = Some(url.clone());
4235        } else if vcs_url.is_none()
4236            && matches!(
4237                label_lower.as_str(),
4238                "github" | "gitlab" | "github: repo" | "repository"
4239            )
4240        {
4241            *vcs_url = Some(url.clone());
4242        } else if homepage_url.is_none()
4243            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4244        {
4245            *homepage_url = Some(url.clone());
4246        } else if label_lower == "changelog" {
4247            extra_data.insert(
4248                "changelog_url".to_string(),
4249                serde_json::Value::String(url.clone()),
4250            );
4251        }
4252    }
4253
4254    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4255        .iter()
4256        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4257        .collect();
4258
4259    if !project_urls_json.is_empty() {
4260        extra_data.insert(
4261            "project_urls".to_string(),
4262            serde_json::Value::Object(project_urls_json),
4263        );
4264    }
4265}
4266
4267fn parse_setup_cfg(content: &str) -> IniSections {
4268    let mut sections: IniSections = HashMap::new();
4269    let mut current_section: Option<String> = None;
4270    let mut current_key: Option<String> = None;
4271
4272    for raw_line in content.lines() {
4273        let line = raw_line.trim_end_matches('\r');
4274        let trimmed = line.trim();
4275        if trimmed.is_empty() {
4276            continue;
4277        }
4278
4279        let stripped = line.trim_start();
4280        if stripped.starts_with('#') || stripped.starts_with(';') {
4281            continue;
4282        }
4283
4284        if stripped.starts_with('[') && stripped.ends_with(']') {
4285            let section_name = stripped
4286                .trim_start_matches('[')
4287                .trim_end_matches(']')
4288                .trim()
4289                .to_ascii_lowercase();
4290            current_section = if section_name.is_empty() {
4291                None
4292            } else {
4293                Some(section_name)
4294            };
4295            current_key = None;
4296            continue;
4297        }
4298
4299        if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
4300            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
4301                let value = stripped.trim();
4302                if !value.is_empty() {
4303                    sections
4304                        .entry(section.clone())
4305                        .or_default()
4306                        .entry(key.clone())
4307                        .or_default()
4308                        .push(value.to_string());
4309                }
4310            }
4311            continue;
4312        }
4313
4314        if let Some((key, value)) = stripped.split_once('=')
4315            && let Some(section) = current_section.as_ref()
4316        {
4317            let key_name = key.trim().to_ascii_lowercase();
4318            let value_trimmed = value.trim();
4319            let entry = sections
4320                .entry(section.clone())
4321                .or_default()
4322                .entry(key_name.clone())
4323                .or_default();
4324            if !value_trimmed.is_empty() {
4325                entry.push(value_trimmed.to_string());
4326            }
4327            current_key = Some(key_name);
4328        }
4329    }
4330
4331    sections
4332}
4333
4334fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
4335    sections
4336        .get(&section.to_ascii_lowercase())
4337        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4338        .and_then(|entries| entries.first())
4339        .map(|value| value.trim().to_string())
4340}
4341
4342fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
4343    sections
4344        .get(&section.to_ascii_lowercase())
4345        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4346        .cloned()
4347        .unwrap_or_default()
4348}
4349
4350fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
4351    let mut dependencies = Vec::new();
4352
4353    for (sub_section, scope) in [
4354        ("install_requires", "install"),
4355        ("tests_require", "test"),
4356        ("setup_requires", "setup"),
4357    ] {
4358        let reqs = get_ini_values(sections, "options", sub_section);
4359        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
4360    }
4361
4362    if let Some(extras) = sections.get("options.extras_require") {
4363        let mut extra_items: Vec<_> = extras.iter().collect();
4364        extra_items.sort_by_key(|(name, _)| *name);
4365        for (extra_name, reqs) in extra_items {
4366            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
4367        }
4368    }
4369
4370    dependencies
4371}
4372
4373fn parse_setup_cfg_requirements(
4374    reqs: &[String],
4375    scope: &str,
4376    is_optional: bool,
4377) -> Vec<Dependency> {
4378    reqs.iter()
4379        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
4380        .collect()
4381}
4382
4383fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
4384    let trimmed = req.trim();
4385    if trimmed.is_empty() || trimmed.starts_with('#') {
4386        return None;
4387    }
4388
4389    let name = extract_setup_cfg_dependency_name(trimmed)?;
4390    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4391
4392    Some(Dependency {
4393        purl: Some(purl.to_string()),
4394        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
4395        scope: Some(scope.to_string()),
4396        is_runtime: Some(true),
4397        is_optional: Some(is_optional),
4398        is_pinned: Some(false),
4399        is_direct: Some(true),
4400        resolved_package: None,
4401        extra_data: None,
4402    })
4403}
4404
/// Extracts the bare package name from a requirement spec by taking
/// characters up to the first whitespace, version operator (`< > = ! ~`),
/// marker separator (`;`), or extras bracket (`[`).
///
/// Returns `None` when nothing remains (blank input or a spec that starts
/// with a delimiter).
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let name: String = req
        .trim()
        .chars()
        .take_while(|c| {
            !c.is_whitespace() && !matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[')
        })
        .collect();

    if name.is_empty() { None } else { Some(name) }
}
4421
/// Strips every whitespace character from a requirement spec, e.g.
/// `"foo >= 1.0"` becomes `"foo>=1.0"`.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    let mut normalized = String::with_capacity(req.len());
    for c in req.chars() {
        if !c.is_whitespace() {
            normalized.push(c);
        }
    }
    normalized
}
4425
/// Extracts a quoted keyword-argument value (e.g. `name="pkg"`) from
/// setup.py source via lightweight pattern matching — no code execution.
///
/// Handles single or double quotes with the common spacing variants around
/// `=`. A match must not be preceded by an identifier character, so a key of
/// `"url"` no longer matches inside `download_url="..."` (the previous
/// substring search did, returning the wrong value).
///
/// Returns `None` when the key is absent or the value has no closing quote.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    // Fixed-size array: no heap allocation for the candidate patterns.
    let patterns = [
        format!("{}=\"", key),   // name="value"
        format!("{} =\"", key),  // name ="value"
        format!("{}= \"", key),  // name= "value"
        format!("{} = \"", key), // name = "value"
        format!("{}='", key),    // name='value'
        format!("{} ='", key),   // name ='value'
        format!("{}= '", key),   // name= 'value'
        format!("{} = '", key),  // name = 'value'
    ];

    for pattern in &patterns {
        let mut offset = 0;
        while let Some(found) = content[offset..].find(pattern.as_str()) {
            let start_idx = offset + found;

            // Reject matches embedded in a longer identifier, e.g. `url`
            // inside `download_url`.
            let embedded = content[..start_idx]
                .chars()
                .next_back()
                .is_some_and(|c| c.is_alphanumeric() || c == '_');
            if embedded {
                // Keys are ASCII identifiers, so +1 stays on a char boundary.
                offset = start_idx + 1;
                continue;
            }

            let value_start = start_idx + pattern.len();
            let remaining = &content[value_start..];
            if let Some(end_idx) = remaining.find(['"', '\'']) {
                return Some(remaining[..end_idx].to_string());
            }
            // Unterminated value: fall through to the next spacing/quote
            // variant, matching the original behavior.
            break;
        }
    }

    None
}
4451
4452fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
4453    let mut dependencies = Vec::new();
4454
4455    if let Some(tests_deps) = extract_tests_require(content) {
4456        dependencies.extend(tests_deps);
4457    }
4458
4459    if let Some(extras_deps) = extract_extras_require(content) {
4460        dependencies.extend(extras_deps);
4461    }
4462
4463    dependencies
4464}
4465
4466fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
4467    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
4468    let re = Regex::new(pattern).ok()?;
4469    let captures = re.captures(content)?;
4470    let deps_str = captures.get(1)?.as_str();
4471
4472    let deps = parse_setup_py_dep_list(deps_str, "test", true);
4473    if deps.is_empty() { None } else { Some(deps) }
4474}
4475
4476fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
4477    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
4478    let re = Regex::new(pattern).ok()?;
4479    let captures = re.captures(content)?;
4480    let dict_content = captures.get(1)?.as_str();
4481
4482    let mut all_deps = Vec::new();
4483
4484    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
4485    let entry_re = Regex::new(entry_pattern).ok()?;
4486
4487    for entry_cap in entry_re.captures_iter(dict_content) {
4488        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
4489            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
4490            all_deps.extend(deps);
4491        }
4492    }
4493
4494    if all_deps.is_empty() {
4495        None
4496    } else {
4497        Some(all_deps)
4498    }
4499}
4500
4501fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
4502    let dep_pattern = r#"['"]([^'"]+)['"]"#;
4503    let re = match Regex::new(dep_pattern) {
4504        Ok(r) => r,
4505        Err(_) => return Vec::new(),
4506    };
4507
4508    re.captures_iter(deps_str)
4509        .filter_map(|cap| {
4510            let dep_str = cap.get(1)?.as_str().trim();
4511            if dep_str.is_empty() {
4512                return None;
4513            }
4514
4515            let name = extract_setup_cfg_dependency_name(dep_str)?;
4516            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4517
4518            Some(Dependency {
4519                purl: Some(purl.to_string()),
4520                extracted_requirement: Some(dep_str.to_string()),
4521                scope: Some(scope.to_string()),
4522                is_runtime: Some(true),
4523                is_optional: Some(is_optional),
4524                is_pinned: Some(false),
4525                is_direct: Some(true),
4526                resolved_package: None,
4527                extra_data: None,
4528            })
4529        })
4530        .collect()
4531}
4532
4533/// Reads and parses a TOML file
4534pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
4535    let content = read_file_to_string(path).map_err(|e| e.to_string())?;
4536    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
4537}
4538
4539/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
4540///
4541/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
4542/// Essential for SBOM compliance and package integrity verification.
4543///
4544/// # Returns
4545///
4546/// - `(Some(size), Some(hash))` on success
4547/// - `(None, None)` if file cannot be opened
4548/// - `(Some(size), None)` if hash calculation fails during read
4549fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<String>) {
4550    let mut file = match File::open(path) {
4551        Ok(f) => f,
4552        Err(_) => return (None, None),
4553    };
4554
4555    let metadata = match file.metadata() {
4556        Ok(m) => m,
4557        Err(_) => return (None, None),
4558    };
4559    let size = metadata.len();
4560
4561    let mut hasher = Sha256::new();
4562    let mut buffer = vec![0; 8192];
4563
4564    loop {
4565        match file.read(&mut buffer) {
4566            Ok(0) => break,
4567            Ok(n) => hasher.update(&buffer[..n]),
4568            Err(_) => return (Some(size), None),
4569        }
4570    }
4571
4572    let hash = format!("{:x}", hasher.finalize());
4573    (Some(size), Some(hash))
4574}
4575
4576fn default_package_data() -> PackageData {
4577    PackageData::default()
4578}
4579
// Declares this parser's registration via the crate's `register_parser!`
// macro. Positional arguments appear to be: human-readable description, the
// glob patterns this parser claims (manifests, metadata files, and sdist/
// wheel/egg archive extensions), the package type id ("pypi"), the ecosystem
// display name, and an optional documentation URL — NOTE(review): argument
// semantics inferred from the values; confirm against the macro definition.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, setup.cfg, pypi.json, PKG-INFO, METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);