Skip to main content

provenant/parsers/
python.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
35use crate::parser_warn as warn;
36use crate::parsers::utils::{read_file_to_string, split_name_email};
37use base64::Engine;
38use base64::engine::general_purpose::URL_SAFE_NO_PAD;
39use bzip2::read::BzDecoder;
40use csv::ReaderBuilder;
41use flate2::read::GzDecoder;
42use liblzma::read::XzDecoder;
43use packageurl::PackageUrl;
44use regex::Regex;
45use ruff_python_ast as ast;
46use ruff_python_parser::parse_module;
47use serde_json::{Map as JsonMap, Value as JsonValue};
48use sha2::{Digest, Sha256};
49use std::collections::{HashMap, HashSet};
50use std::fs::File;
51use std::io::Read;
52use std::path::{Component, Path, PathBuf};
53use tar::Archive;
54use toml::Value as TomlValue;
55use toml::map::Map as TomlMap;
56use zip::ZipArchive;
57
58use super::PackageParser;
59use super::license_normalization::{
60    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
61    normalize_spdx_expression,
62};
63
// Field constants for pyproject.toml
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";
// Limits for setup.py parsing (1 MiB of source, AST node count, AST depth).
// NOTE(review): presumably enforced by the AST-based setup.py extractor, which
// is defined outside this part of the file — confirm there.
const MAX_SETUP_PY_BYTES: usize = 1_048_576;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;
// Archive safety limits.
// MAX_ARCHIVE_SIZE caps both the on-disk size of an sdist archive and the
// running total of declared entry sizes while archive contents are scanned.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024; // 100MB uncompressed
// Entries whose declared size exceeds this are skipped; it is also the read
// limit passed when an entry's text is loaded.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB per file
// Maximum accepted uncompressed:compressed size ratio.
const MAX_COMPRESSION_RATIO: f64 = 100.0; // 100:1 ratio
84
/// Python package parser supporting 11 manifest formats.
///
/// Extracts metadata from Python package files including pyproject.toml, setup.py,
/// setup.cfg, PKG-INFO, METADATA, pip-inspect lockfiles, and .whl/.egg archives.
///
/// This is a stateless unit type: path matching and dispatch to the individual
/// extractors live in its `PackageParser` implementation.
///
/// # Security
///
/// setup.py files are parsed using AST analysis rather than code execution to prevent
/// arbitrary code execution during scanning. See `extract_from_setup_py_ast` for details.
pub struct PythonParser;
95
/// Container format of a Python source distribution archive, as detected
/// from the (lower-cased) file name suffix.
#[derive(Clone, Copy, Debug)]
enum PythonSdistArchiveFormat {
    /// `.tar.gz` archive.
    TarGz,
    /// `.tgz` archive; only reported after the archive is checked for a PKG-INFO entry.
    Tgz,
    /// `.tar.bz2` archive.
    TarBz2,
    /// `.tar.xz` archive.
    TarXz,
    /// `.zip` archive; only reported after the archive is checked for a PKG-INFO entry.
    Zip,
}
104
/// A zip archive entry that passed the path, compression-ratio and size checks.
#[derive(Clone, Debug)]
struct ValidatedZipEntry {
    /// Position of the entry inside the archive (the index it was looked up by).
    index: usize,
    /// Entry path after normalization by `normalize_archive_entry_path`.
    name: String,
}
110
111impl PackageParser for PythonParser {
112    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
113
114    fn extract_packages(path: &Path) -> Vec<PackageData> {
115        vec![
116            if path.file_name().unwrap_or_default() == "pyproject.toml" {
117                extract_from_pyproject_toml(path)
118            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
119                extract_from_setup_cfg(path)
120            } else if is_setup_py_like_path(path) {
121                return extract_setup_py_packages(path);
122            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
123                extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
124            } else if is_installed_wheel_metadata_path(path) {
125                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
126            } else if is_pip_cache_origin_json(path) {
127                extract_from_pip_origin_json(path)
128            } else if path.file_name().unwrap_or_default() == "pypi.json" {
129                extract_from_pypi_json(path)
130            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
131                extract_from_pip_inspect(path)
132            } else if is_python_sdist_archive_path(path) {
133                extract_from_sdist_archive(path)
134            } else if path
135                .extension()
136                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
137            {
138                extract_from_wheel_archive(path)
139            } else if path
140                .extension()
141                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
142            {
143                extract_from_egg_archive(path)
144            } else {
145                default_package_data(path)
146            },
147        ]
148    }
149
150    fn is_match(path: &Path) -> bool {
151        if let Some(filename) = path.file_name()
152            && (filename == "pyproject.toml"
153                || filename == "setup.cfg"
154                || is_setup_py_like_path(path)
155                || filename == "PKG-INFO"
156                || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
157                || filename == "pypi.json"
158                || filename == "pip-inspect.deplock"
159                || is_pip_cache_origin_json(path))
160        {
161            return true;
162        }
163
164        if let Some(extension) = path.extension() {
165            let ext = extension.to_string_lossy().to_lowercase();
166            if ext == "whl" || ext == "egg" || is_python_sdist_archive_path(path) {
167                return true;
168            }
169        }
170
171        false
172    }
173}
174
/// Returns `true` when the file name is `setup.py` or ends with `_setup.py`.
///
/// Paths without a file name, or whose name is not valid UTF-8, never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    match path.file_name().and_then(|raw| raw.to_str()) {
        Some(file_name) => file_name == "setup.py" || file_name.ends_with("_setup.py"),
        None => false,
    }
}
180
/// Returns `true` for a file named `METADATA` whose parent directory name
/// ends with `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    let file_name = path.file_name().and_then(|raw| raw.to_str());
    if file_name != Some("METADATA") {
        return false;
    }

    let parent_dir_name = path
        .parent()
        .and_then(|dir| dir.file_name())
        .and_then(|raw| raw.to_str());
    matches!(parent_dir_name, Some(dir_name) if dir_name.ends_with(".dist-info"))
}
189
/// Values parsed from the `WHEEL` file that sits next to an installed
/// wheel's `METADATA` file.
#[derive(Debug, Clone)]
struct InstalledWheelMetadata {
    /// Every value of the `tag` header, in file order.
    wheel_tags: Vec<String>,
    /// First value of the `wheel-version` header, if present.
    wheel_version: Option<String>,
    /// First value of the `generator` header, if present.
    wheel_generator: Option<String>,
    /// `root-is-purelib` header parsed as a boolean; `None` when the header is
    /// missing or is neither `true` nor `false` (compared case-insensitively).
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`; `None` when the tags
    /// cannot be combined.
    compressed_tag: Option<String>,
}
198
199fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
200    let Some(parent) = path.parent() else {
201        return;
202    };
203
204    if !parent
205        .file_name()
206        .and_then(|name| name.to_str())
207        .is_some_and(|name| name.ends_with(".dist-info"))
208    {
209        return;
210    }
211
212    let wheel_path = parent.join("WHEEL");
213    if !wheel_path.exists() {
214        return;
215    }
216
217    let Ok(content) = read_file_to_string(&wheel_path) else {
218        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
219        return;
220    };
221
222    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
223        return;
224    };
225
226    apply_installed_wheel_metadata(package_data, &wheel_metadata);
227}
228
229fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
230    use super::rfc822::{get_header_all, get_header_first};
231
232    let metadata = super::rfc822::parse_rfc822_content(content);
233    let wheel_tags = get_header_all(&metadata.headers, "tag");
234    if wheel_tags.is_empty() {
235        return None;
236    }
237
238    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
239    let wheel_generator = get_header_first(&metadata.headers, "generator");
240    let root_is_purelib =
241        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
242            match value.to_ascii_lowercase().as_str() {
243                "true" => Some(true),
244                "false" => Some(false),
245                _ => None,
246            }
247        });
248
249    let compressed_tag = compress_wheel_tags(&wheel_tags);
250
251    Some(InstalledWheelMetadata {
252        wheel_tags,
253        wheel_version,
254        wheel_generator,
255        root_is_purelib,
256        compressed_tag,
257    })
258}
259
/// Collapses expanded wheel tags (`python-abi-platform`) into a single
/// compressed tag set as used in wheel file names (PEP 425).
///
/// * An empty slice yields `None`.
/// * A single tag is returned unchanged, without validation.
/// * For several tags, the distinct python, ABI and platform components are
///   joined with `.` in first-seen order, e.g. `py2-none-any` + `py3-none-any`
///   becomes `py2.py3-none-any`, and
///   `cp311-cp311-manylinux_2_17_x86_64` + `cp311-cp311-manylinux2014_x86_64`
///   becomes `cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64`.
///
/// Returns `None` when any tag lacks one of the three components, or when
/// the tags are not the full cartesian product of their components (such a
/// list cannot be represented by one compressed tag). Duplicate tags are
/// counted once.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    /// Appends `value` unless it is already present, preserving order.
    fn push_unique<'a>(values: &mut Vec<&'a str>, value: &'a str) {
        if !values.contains(&value) {
            values.push(value);
        }
    }

    match tags {
        [] => return None,
        [only] => return Some(only.clone()),
        _ => {}
    }

    let mut distinct_tags: Vec<(&str, &str, &str)> = Vec::with_capacity(tags.len());
    let mut python_tags: Vec<&str> = Vec::new();
    let mut abi_tags: Vec<&str> = Vec::new();
    let mut platform_tags: Vec<&str> = Vec::new();

    for tag in tags {
        let mut parts = tag.splitn(3, '-');
        let triple = (parts.next()?, parts.next()?, parts.next()?);
        if distinct_tags.contains(&triple) {
            continue;
        }
        distinct_tags.push(triple);
        push_unique(&mut python_tags, triple.0);
        push_unique(&mut abi_tags, triple.1);
        push_unique(&mut platform_tags, triple.2);
    }

    // Every distinct tag is a member of python x abi x platform, so equal
    // counts mean the list is exactly that product and the compressed form
    // expands back to the same tags.
    let expanded_count = python_tags
        .len()
        .checked_mul(abi_tags.len())
        .and_then(|count| count.checked_mul(platform_tags.len()))?;
    if expanded_count != distinct_tags.len() {
        return None;
    }

    Some(format!(
        "{}-{}-{}",
        python_tags.join("."),
        abi_tags.join("."),
        platform_tags.join(".")
    ))
}
297
298fn apply_installed_wheel_metadata(
299    package_data: &mut PackageData,
300    wheel_metadata: &InstalledWheelMetadata,
301) {
302    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
303    extra_data.insert(
304        "wheel_tags".to_string(),
305        JsonValue::Array(
306            wheel_metadata
307                .wheel_tags
308                .iter()
309                .cloned()
310                .map(JsonValue::String)
311                .collect(),
312        ),
313    );
314
315    if let Some(wheel_version) = &wheel_metadata.wheel_version {
316        extra_data.insert(
317            "wheel_version".to_string(),
318            JsonValue::String(wheel_version.clone()),
319        );
320    }
321
322    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
323        extra_data.insert(
324            "wheel_generator".to_string(),
325            JsonValue::String(wheel_generator.clone()),
326        );
327    }
328
329    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
330        extra_data.insert(
331            "root_is_purelib".to_string(),
332            JsonValue::Bool(root_is_purelib),
333        );
334    }
335
336    if let (Some(name), Some(version), Some(extension)) = (
337        package_data.name.as_deref(),
338        package_data.version.as_deref(),
339        wheel_metadata.compressed_tag.as_deref(),
340    ) {
341        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
342    }
343}
344
/// Returns `true` for a file named `origin.json` that has a directory named
/// `wheels` (ASCII case-insensitive) somewhere among its ancestors.
fn is_pip_cache_origin_json(path: &Path) -> bool {
    let is_origin_file = path.file_name().and_then(|raw| raw.to_str()) == Some("origin.json");
    if !is_origin_file {
        return false;
    }

    // Skip the path itself; only enclosing directories are inspected.
    for ancestor in path.ancestors().skip(1) {
        let dir_name = ancestor.file_name().and_then(|raw| raw.to_str());
        if matches!(dir_name, Some(dir) if dir.eq_ignore_ascii_case("wheels")) {
            return true;
        }
    }

    false
}
354
355fn extract_from_pip_origin_json(path: &Path) -> PackageData {
356    let content = match read_file_to_string(path) {
357        Ok(content) => content,
358        Err(e) => {
359            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
360            return default_package_data(path);
361        }
362    };
363
364    let root: JsonValue = match serde_json::from_str(&content) {
365        Ok(root) => root,
366        Err(e) => {
367            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
368            return default_package_data(path);
369        }
370    };
371
372    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
373        warn!("No url found in pip cache origin.json at {:?}", path);
374        return default_package_data(path);
375    };
376
377    let sibling_wheel = find_sibling_cached_wheel(path);
378    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
379        sibling_wheel
380            .as_ref()
381            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
382    });
383
384    let Some((name, version)) = name_version else {
385        warn!(
386            "Failed to infer package name/version from pip cache origin.json at {:?}",
387            path
388        );
389        return default_package_data(path);
390    };
391
392    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
393        build_pypi_urls(Some(&name), Some(&version));
394    let purl = sibling_wheel
395        .as_ref()
396        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
397        .or(plain_purl);
398
399    PackageData {
400        package_type: Some(PythonParser::PACKAGE_TYPE),
401        primary_language: Some("Python".to_string()),
402        name: Some(name),
403        version: Some(version),
404        datasource_id: Some(DatasourceId::PypiPipOriginJson),
405        download_url: Some(download_url.to_string()),
406        sha256: extract_sha256_from_origin_json(&root),
407        repository_homepage_url,
408        repository_download_url,
409        api_data_url,
410        purl,
411        ..Default::default()
412    }
413}
414
415fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
416    let parent = path.parent()?;
417    let entries = parent.read_dir().ok()?;
418
419    for entry in entries.flatten() {
420        let sibling_path = entry.path();
421        if sibling_path
422            .extension()
423            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
424            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
425        {
426            return Some(wheel_info);
427        }
428    }
429
430    None
431}
432
433fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
434    let file_name = url.rsplit('/').next()?;
435
436    if file_name.ends_with(".whl") {
437        return parse_wheel_filename(Path::new(file_name))
438            .map(|wheel_info| (wheel_info.name, wheel_info.version));
439    }
440
441    let stem = strip_python_archive_extension(file_name)?;
442    let (name, version) = stem.rsplit_once('-')?;
443    if name.is_empty() || version.is_empty() {
444        return None;
445    }
446
447    Some((name.replace('_', "-"), version.to_string()))
448}
449
/// Removes a known Python archive extension from `file_name`.
///
/// The suffixes are tried in a fixed order and matched case-sensitively;
/// `None` is returned when none of them applies.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] =
        [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES {
        if let Some(stem) = file_name.strip_suffix(suffix) {
            return Some(stem);
        }
    }

    None
}
455
456fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
457    root.pointer("/archive_info/hashes/sha256")
458        .and_then(|value| value.as_str())
459        .map(ToOwned::to_owned)
460        .or_else(|| {
461            root.pointer("/archive_info/hash")
462                .and_then(|value| value.as_str())
463                .and_then(normalize_origin_hash)
464        })
465}
466
/// Extracts a SHA-256 digest from a hash string.
///
/// Accepts `sha256=<value>` and `sha256:<value>` (the value is returned as
/// is) or a bare 64-character hexadecimal string. Anything else is `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = hash
        .strip_prefix("sha256=")
        .or_else(|| hash.strip_prefix("sha256:"));
    if let Some(digest) = prefixed {
        return Some(digest.to_string());
    }

    let is_bare_digest = hash.len() == 64 && hash.chars().all(|ch| ch.is_ascii_hexdigit());
    if is_bare_digest {
        Some(hash.to_string())
    } else {
        None
    }
}
479
480fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
481    let content = match read_file_to_string(path) {
482        Ok(content) => content,
483        Err(e) => {
484            warn!("Failed to read metadata at {:?}: {}", path, e);
485            return default_package_data(path);
486        }
487    };
488
489    let metadata = super::rfc822::parse_rfc822_content(&content);
490    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
491    merge_sibling_metadata_dependencies(path, &mut package_data);
492    merge_sibling_metadata_file_references(path, &mut package_data);
493    if datasource_id == DatasourceId::PypiWheelMetadata {
494        merge_sibling_wheel_metadata(path, &mut package_data);
495    }
496    package_data
497}
498
499fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
500    let mut extra_dependencies = Vec::new();
501
502    if let Some(parent) = path.parent() {
503        let direct_requires = parent.join("requires.txt");
504        if direct_requires.exists()
505            && let Ok(content) = read_file_to_string(&direct_requires)
506        {
507            extra_dependencies.extend(parse_requires_txt(&content));
508        }
509
510        let sibling_egg_info_requires = parent
511            .read_dir()
512            .ok()
513            .into_iter()
514            .flatten()
515            .flatten()
516            .find_map(|entry| {
517                let child_path = entry.path();
518                if child_path.is_dir()
519                    && child_path
520                        .file_name()
521                        .and_then(|name| name.to_str())
522                        .is_some_and(|name| name.ends_with(".egg-info"))
523                {
524                    let requires = child_path.join("requires.txt");
525                    requires.exists().then_some(requires)
526                } else {
527                    None
528                }
529            });
530
531        if let Some(requires_path) = sibling_egg_info_requires
532            && let Ok(content) = read_file_to_string(&requires_path)
533        {
534            extra_dependencies.extend(parse_requires_txt(&content));
535        }
536    }
537
538    for dependency in extra_dependencies {
539        if !package_data.dependencies.iter().any(|existing| {
540            existing.purl == dependency.purl
541                && existing.scope == dependency.scope
542                && existing.extracted_requirement == dependency.extracted_requirement
543                && existing.extra_data == dependency.extra_data
544        }) {
545            package_data.dependencies.push(dependency);
546        }
547    }
548}
549
550fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
551    let mut extra_refs = Vec::new();
552
553    if let Some(parent) = path.parent() {
554        let record_path = parent.join("RECORD");
555        if record_path.exists()
556            && let Ok(content) = read_file_to_string(&record_path)
557        {
558            extra_refs.extend(parse_record_csv(&content));
559        }
560
561        let installed_files_path = parent.join("installed-files.txt");
562        if installed_files_path.exists()
563            && let Ok(content) = read_file_to_string(&installed_files_path)
564        {
565            extra_refs.extend(parse_installed_files_txt(&content));
566        }
567
568        let sources_path = parent.join("SOURCES.txt");
569        if sources_path.exists()
570            && let Ok(content) = read_file_to_string(&sources_path)
571        {
572            extra_refs.extend(parse_sources_txt(&content));
573        }
574    }
575
576    for file_ref in extra_refs {
577        if !package_data
578            .file_references
579            .iter()
580            .any(|existing| existing.path == file_ref.path)
581        {
582            package_data.file_references.push(file_ref);
583        }
584    }
585}
586
587fn collect_validated_zip_entries<R: Read + std::io::Seek>(
588    archive: &mut ZipArchive<R>,
589    path: &Path,
590    archive_type: &str,
591) -> Result<Vec<ValidatedZipEntry>, String> {
592    let mut total_extracted = 0u64;
593    let mut entries = Vec::new();
594
595    for i in 0..archive.len() {
596        if let Ok(file) = archive.by_index_raw(i) {
597            let compressed_size = file.compressed_size();
598            let uncompressed_size = file.size();
599            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
600                warn!(
601                    "Skipping unsafe path in {} {:?}: {}",
602                    archive_type,
603                    path,
604                    file.name()
605                );
606                continue;
607            };
608
609            if compressed_size > 0 {
610                let ratio = uncompressed_size as f64 / compressed_size as f64;
611                if ratio > MAX_COMPRESSION_RATIO {
612                    warn!(
613                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
614                        archive_type, path, ratio
615                    );
616                    continue;
617                }
618            }
619
620            if uncompressed_size > MAX_FILE_SIZE {
621                warn!(
622                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
623                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
624                );
625                continue;
626            }
627
628            total_extracted += uncompressed_size;
629            if total_extracted > MAX_ARCHIVE_SIZE {
630                let msg = format!(
631                    "Total extracted size exceeds limit for {} {:?}",
632                    archive_type, path
633                );
634                warn!("{}", msg);
635                return Err(msg);
636            }
637
638            entries.push(ValidatedZipEntry {
639                index: i,
640                name: entry_name,
641            });
642        }
643    }
644
645    Ok(entries)
646}
647
648fn is_python_sdist_archive_path(path: &Path) -> bool {
649    detect_python_sdist_archive_format(path).is_some()
650}
651
652fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
653    let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
654
655    if !is_likely_python_sdist_filename(&file_name) {
656        return None;
657    }
658
659    if file_name.ends_with(".tar.gz") {
660        Some(PythonSdistArchiveFormat::TarGz)
661    } else if file_name.ends_with(".tgz") {
662        tgz_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Tgz)
663    } else if file_name.ends_with(".tar.bz2") {
664        Some(PythonSdistArchiveFormat::TarBz2)
665    } else if file_name.ends_with(".tar.xz") {
666        Some(PythonSdistArchiveFormat::TarXz)
667    } else if file_name.ends_with(".zip") {
668        zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
669    } else {
670        None
671    }
672}
673
674fn tgz_sdist_contains_pkg_info(path: &Path) -> bool {
675    if !path.is_file() {
676        return true;
677    }
678
679    let compressed_size = match std::fs::metadata(path) {
680        Ok(metadata) => metadata.len(),
681        Err(_) => return false,
682    };
683    let file = match File::open(path) {
684        Ok(file) => file,
685        Err(_) => return false,
686    };
687    let decoder = GzDecoder::new(file);
688    let Some(entries) = collect_tar_sdist_entries(path, decoder, "tgz", compressed_size) else {
689        return false;
690    };
691
692    select_sdist_pkginfo_entry(path, &entries).is_some()
693}
694
695fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
696    if !path.is_file() {
697        return true;
698    }
699
700    let file = match File::open(path) {
701        Ok(file) => file,
702        Err(_) => return false,
703    };
704    let mut archive = match ZipArchive::new(file) {
705        Ok(archive) => archive,
706        Err(_) => return false,
707    };
708
709    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
710        Ok(entries) => entries,
711        Err(_) => return false,
712    };
713    let metadata_entries: Vec<_> = validated_entries
714        .iter()
715        .filter(|entry| entry.name.ends_with("/PKG-INFO"))
716        .filter_map(|entry| {
717            read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
718                .ok()
719                .map(|content| (entry.name.clone(), content))
720        })
721        .collect();
722
723    has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
724}
725
726fn is_likely_python_sdist_filename(file_name: &str) -> bool {
727    let Some(stem) = strip_python_archive_extension(file_name) else {
728        return false;
729    };
730
731    let Some((name, version)) = stem.rsplit_once('-') else {
732        return false;
733    };
734
735    !name.is_empty()
736        && !version.is_empty()
737        && version.chars().any(|ch| ch.is_ascii_digit())
738        && name
739            .chars()
740            .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
741}
742
743fn extract_from_sdist_archive(path: &Path) -> PackageData {
744    let metadata = match std::fs::metadata(path) {
745        Ok(m) => m,
746        Err(e) => {
747            warn!(
748                "Failed to read metadata for sdist archive {:?}: {}",
749                path, e
750            );
751            return default_package_data(path);
752        }
753    };
754
755    if metadata.len() > MAX_ARCHIVE_SIZE {
756        warn!(
757            "sdist archive too large: {} bytes (limit: {} bytes)",
758            metadata.len(),
759            MAX_ARCHIVE_SIZE
760        );
761        return default_package_data(path);
762    }
763
764    let Some(format) = detect_python_sdist_archive_format(path) else {
765        return default_package_data(path);
766    };
767
768    let mut package_data = match format {
769        PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
770            let file = match File::open(path) {
771                Ok(file) => file,
772                Err(e) => {
773                    warn!("Failed to open sdist archive {:?}: {}", path, e);
774                    return default_package_data(path);
775                }
776            };
777            let decoder = GzDecoder::new(file);
778            extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
779        }
780        PythonSdistArchiveFormat::TarBz2 => {
781            let file = match File::open(path) {
782                Ok(file) => file,
783                Err(e) => {
784                    warn!("Failed to open sdist archive {:?}: {}", path, e);
785                    return default_package_data(path);
786                }
787            };
788            let decoder = BzDecoder::new(file);
789            extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
790        }
791        PythonSdistArchiveFormat::TarXz => {
792            let file = match File::open(path) {
793                Ok(file) => file,
794                Err(e) => {
795                    warn!("Failed to open sdist archive {:?}: {}", path, e);
796                    return default_package_data(path);
797                }
798            };
799            let decoder = XzDecoder::new(file);
800            extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
801        }
802        PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
803    };
804
805    if package_data.package_type.is_some() {
806        let (size, sha256) = calculate_file_checksums(path);
807        package_data.size = size;
808        package_data.sha256 = sha256;
809    }
810
811    package_data
812}
813
814fn extract_from_tar_sdist_archive<R: Read>(
815    path: &Path,
816    reader: R,
817    archive_type: &str,
818    compressed_size: u64,
819) -> PackageData {
820    let Some(entries) = collect_tar_sdist_entries(path, reader, archive_type, compressed_size)
821    else {
822        return default_package_data(path);
823    };
824
825    build_sdist_package_data(path, entries)
826}
827
828fn collect_tar_sdist_entries<R: Read>(
829    path: &Path,
830    reader: R,
831    archive_type: &str,
832    compressed_size: u64,
833) -> Option<Vec<(String, String)>> {
834    let mut archive = Archive::new(reader);
835    let archive_entries = match archive.entries() {
836        Ok(entries) => entries,
837        Err(e) => {
838            warn!(
839                "Failed to read {} sdist archive {:?}: {}",
840                archive_type, path, e
841            );
842            return None;
843        }
844    };
845
846    let mut total_extracted = 0u64;
847    let mut entries = Vec::new();
848
849    for entry_result in archive_entries {
850        let mut entry = match entry_result {
851            Ok(entry) => entry,
852            Err(e) => {
853                warn!(
854                    "Failed to read {} sdist entry from {:?}: {}",
855                    archive_type, path, e
856                );
857                continue;
858            }
859        };
860
861        let entry_size = entry.size();
862        if entry_size > MAX_FILE_SIZE {
863            warn!(
864                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
865                archive_type, path, entry_size, MAX_FILE_SIZE
866            );
867            continue;
868        }
869
870        total_extracted += entry_size;
871        if total_extracted > MAX_ARCHIVE_SIZE {
872            warn!(
873                "Total extracted size exceeds limit for {} sdist {:?}",
874                archive_type, path
875            );
876            return None;
877        }
878
879        if compressed_size > 0 {
880            let ratio = total_extracted as f64 / compressed_size as f64;
881            if ratio > MAX_COMPRESSION_RATIO {
882                warn!(
883                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
884                    archive_type, path, ratio
885                );
886                return None;
887            }
888        }
889
890        let entry_path = match entry.path() {
891            Ok(path) => path.to_string_lossy().replace('\\', "/"),
892            Err(e) => {
893                warn!(
894                    "Failed to get {} sdist entry path from {:?}: {}",
895                    archive_type, path, e
896                );
897                continue;
898            }
899        };
900
901        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
902            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
903            continue;
904        };
905
906        if !is_relevant_sdist_text_entry(&entry_path) {
907            continue;
908        }
909
910        if let Ok(content) = read_limited_utf8(
911            &mut entry,
912            MAX_FILE_SIZE,
913            &format!("{} entry {}", archive_type, entry_path),
914        ) {
915            entries.push((entry_path, content));
916        }
917    }
918
919    Some(entries)
920}
921
922fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
923    let file = match File::open(path) {
924        Ok(file) => file,
925        Err(e) => {
926            warn!("Failed to open zip sdist archive {:?}: {}", path, e);
927            return default_package_data(path);
928        }
929    };
930
931    let mut archive = match ZipArchive::new(file) {
932        Ok(archive) => archive,
933        Err(e) => {
934            warn!("Failed to read zip sdist archive {:?}: {}", path, e);
935            return default_package_data(path);
936        }
937    };
938
939    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
940        Ok(entries) => entries,
941        Err(_) => return default_package_data(path),
942    };
943
944    let mut entries = Vec::new();
945    for entry in validated_entries.iter() {
946        if !is_relevant_sdist_text_entry(&entry.name) {
947            continue;
948        }
949
950        if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
951            entries.push((entry.name.clone(), content));
952        }
953    }
954
955    build_sdist_package_data(path, entries)
956}
957
/// Returns `true` when `entry_path` names one of the sdist text files this
/// parser reads: `PKG-INFO`, `requires.txt` or `SOURCES.txt`.
///
/// The file name must be preceded by a `/`, so a bare name without a parent
/// directory does not match.
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
963
964fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
965    let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
966        warn!("No PKG-INFO file found in sdist archive {:?}", path);
967        return default_package_data(path);
968    };
969
970    let mut package_data =
971        python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
972    merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
973    merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
974    apply_sdist_name_version_fallback(path, &mut package_data);
975    package_data.datasource_id = Some(DatasourceId::PypiSdist);
976    package_data
977}
978
979fn select_sdist_pkginfo_entry(
980    archive_path: &Path,
981    entries: &[(String, String)],
982) -> Option<(String, String)> {
983    let expected_name = sdist_archive_expected_name(archive_path);
984
985    entries
986        .iter()
987        .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
988        .min_by_key(|(entry_path, content)| {
989            let components: Vec<_> = entry_path
990                .split('/')
991                .filter(|part| !part.is_empty())
992                .collect();
993            let candidate_name = sdist_pkginfo_candidate_name(content);
994            let name_rank = if candidate_name == expected_name {
995                0
996            } else {
997                1
998            };
999            let kind_rank = sdist_pkginfo_kind_rank(entry_path);
1000
1001            (name_rank, kind_rank, components.len(), entry_path.clone())
1002        })
1003        .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
1004}
1005
1006fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
1007    let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
1008        return false;
1009    };
1010
1011    entries.iter().any(|(entry_path, content)| {
1012        sdist_pkginfo_kind_rank(entry_path) < 3
1013            && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
1014    })
1015}
1016
1017fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
1018    archive_path
1019        .file_name()
1020        .and_then(|name| name.to_str())
1021        .and_then(strip_python_archive_extension)
1022        .and_then(|stem| {
1023            stem.rsplit_once('-')
1024                .map(|(name, _)| normalize_python_package_name(name))
1025        })
1026}
1027
1028fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
1029    let metadata = super::rfc822::parse_rfc822_content(content);
1030    super::rfc822::get_header_first(&metadata.headers, "name")
1031        .map(|name| normalize_python_package_name(&name))
1032}
1033
/// Ranks a PKG-INFO entry path by where it sits in the archive; lower is
/// preferred.
///
/// * `0` — `<root>/<something>.egg-info/PKG-INFO`
/// * `1` — `<root>/PKG-INFO`
/// * `2` — any other path ending in `.egg-info/PKG-INFO`
/// * `3` — everything else
///
/// Empty path components (leading, trailing or doubled `/`) are ignored when
/// counting components.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let components: Vec<&str> = entry_path
        .split('/')
        .filter(|part| !part.is_empty())
        .collect();

    match components.as_slice() {
        [_, egg_info_dir, "PKG-INFO"] if egg_info_dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1051
1052fn merge_sdist_archive_dependencies(
1053    entries: &[(String, String)],
1054    metadata_path: &str,
1055    package_data: &mut PackageData,
1056) {
1057    let metadata_dir = metadata_path
1058        .rsplit_once('/')
1059        .map(|(dir, _)| dir)
1060        .unwrap_or("");
1061    let archive_root = metadata_path.split('/').next().unwrap_or("");
1062    let matched_egg_info_dir =
1063        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1064    let mut extra_dependencies = Vec::new();
1065
1066    for (entry_path, content) in entries {
1067        let is_direct_requires =
1068            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1069        let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1070            entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1071        });
1072
1073        if is_direct_requires || is_egg_info_requires {
1074            extra_dependencies.extend(parse_requires_txt(content));
1075        }
1076    }
1077
1078    for dependency in extra_dependencies {
1079        if !package_data.dependencies.iter().any(|existing| {
1080            existing.purl == dependency.purl
1081                && existing.scope == dependency.scope
1082                && existing.extracted_requirement == dependency.extracted_requirement
1083                && existing.extra_data == dependency.extra_data
1084        }) {
1085            package_data.dependencies.push(dependency);
1086        }
1087    }
1088}
1089
1090fn merge_sdist_archive_file_references(
1091    entries: &[(String, String)],
1092    metadata_path: &str,
1093    package_data: &mut PackageData,
1094) {
1095    let metadata_dir = metadata_path
1096        .rsplit_once('/')
1097        .map(|(dir, _)| dir)
1098        .unwrap_or("");
1099    let archive_root = metadata_path.split('/').next().unwrap_or("");
1100    let matched_egg_info_dir =
1101        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1102    let mut extra_refs = Vec::new();
1103
1104    for (entry_path, content) in entries {
1105        let is_direct_sources =
1106            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1107        let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1108            entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1109        });
1110
1111        if is_direct_sources || is_egg_info_sources {
1112            extra_refs.extend(parse_sources_txt(content));
1113        }
1114    }
1115
1116    for file_ref in extra_refs {
1117        if !package_data
1118            .file_references
1119            .iter()
1120            .any(|existing| existing.path == file_ref.path)
1121        {
1122            package_data.file_references.push(file_ref);
1123        }
1124    }
1125}
1126
1127fn select_matching_sdist_egg_info_dir(
1128    entries: &[(String, String)],
1129    archive_root: &str,
1130    package_name: Option<&str>,
1131) -> Option<String> {
1132    let normalized_package_name = package_name.map(normalize_python_package_name);
1133
1134    entries
1135        .iter()
1136        .filter_map(|(entry_path, _)| {
1137            let components: Vec<_> = entry_path
1138                .split('/')
1139                .filter(|part| !part.is_empty())
1140                .collect();
1141            if components.len() == 3
1142                && components[0] == archive_root
1143                && components[1].ends_with(".egg-info")
1144            {
1145                Some(components[1].to_string())
1146            } else {
1147                None
1148            }
1149        })
1150        .min_by_key(|egg_info_dir| {
1151            let normalized_dir_name =
1152                normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1153            let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1154                0
1155            } else {
1156                1
1157            };
1158
1159            (name_rank, egg_info_dir.clone())
1160        })
1161}
1162
/// Normalizes a package name for comparison: ASCII letters are lowercased and
/// every `_` becomes `-`. All other characters are left untouched.
fn normalize_python_package_name(name: &str) -> String {
    name.chars()
        .map(|ch| match ch {
            '_' => '-',
            other => other.to_ascii_lowercase(),
        })
        .collect()
}
1166
1167fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1168    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1169        return;
1170    };
1171
1172    let Some(stem) = strip_python_archive_extension(file_name) else {
1173        return;
1174    };
1175
1176    let Some((name, version)) = stem.rsplit_once('-') else {
1177        return;
1178    };
1179
1180    if package_data.name.is_none() {
1181        package_data.name = Some(name.replace('_', "-"));
1182    }
1183    if package_data.version.is_none() {
1184        package_data.version = Some(version.to_string());
1185    }
1186
1187    if package_data.purl.is_none()
1188        || package_data.repository_homepage_url.is_none()
1189        || package_data.repository_download_url.is_none()
1190        || package_data.api_data_url.is_none()
1191    {
1192        let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1193            build_pypi_urls(
1194                package_data.name.as_deref(),
1195                package_data.version.as_deref(),
1196            );
1197
1198        if package_data.repository_homepage_url.is_none() {
1199            package_data.repository_homepage_url = repository_homepage_url;
1200        }
1201        if package_data.repository_download_url.is_none() {
1202            package_data.repository_download_url = repository_download_url;
1203        }
1204        if package_data.api_data_url.is_none() {
1205            package_data.api_data_url = api_data_url;
1206        }
1207        if package_data.purl.is_none() {
1208            package_data.purl = purl;
1209        }
1210    }
1211}
1212
1213fn extract_from_wheel_archive(path: &Path) -> PackageData {
1214    let metadata = match std::fs::metadata(path) {
1215        Ok(m) => m,
1216        Err(e) => {
1217            warn!(
1218                "Failed to read metadata for wheel archive {:?}: {}",
1219                path, e
1220            );
1221            return default_package_data(path);
1222        }
1223    };
1224
1225    if metadata.len() > MAX_ARCHIVE_SIZE {
1226        warn!(
1227            "Wheel archive too large: {} bytes (limit: {} bytes)",
1228            metadata.len(),
1229            MAX_ARCHIVE_SIZE
1230        );
1231        return default_package_data(path);
1232    }
1233
1234    let file = match File::open(path) {
1235        Ok(f) => f,
1236        Err(e) => {
1237            warn!("Failed to open wheel archive {:?}: {}", path, e);
1238            return default_package_data(path);
1239        }
1240    };
1241
1242    let mut archive = match ZipArchive::new(file) {
1243        Ok(a) => a,
1244        Err(e) => {
1245            warn!("Failed to read wheel archive {:?}: {}", path, e);
1246            return default_package_data(path);
1247        }
1248    };
1249
1250    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1251        Ok(entries) => entries,
1252        Err(_) => return default_package_data(path),
1253    };
1254
1255    let metadata_entry =
1256        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1257            Some(entry) => entry,
1258            None => {
1259                warn!("No METADATA file found in wheel archive {:?}", path);
1260                return default_package_data(path);
1261            }
1262        };
1263
1264    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1265        Ok(c) => c,
1266        Err(e) => {
1267            warn!("Failed to read METADATA from {:?}: {}", path, e);
1268            return default_package_data(path);
1269        }
1270    };
1271
1272    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1273
1274    let (size, sha256) = calculate_file_checksums(path);
1275    package_data.size = size;
1276    package_data.sha256 = sha256;
1277
1278    if let Some(record_entry) =
1279        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1280        && let Ok(record_content) =
1281            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1282    {
1283        package_data.file_references = parse_record_csv(&record_content);
1284    }
1285
1286    if let Some(wheel_info) = parse_wheel_filename(path) {
1287        if package_data.name.is_none() {
1288            package_data.name = Some(wheel_info.name.clone());
1289        }
1290        if package_data.version.is_none() {
1291            package_data.version = Some(wheel_info.version.clone());
1292        }
1293
1294        package_data.qualifiers = Some(std::collections::HashMap::from([(
1295            "extension".to_string(),
1296            format!(
1297                "{}-{}-{}",
1298                wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1299            ),
1300        )]));
1301
1302        package_data.purl = build_wheel_purl(
1303            package_data.name.as_deref(),
1304            package_data.version.as_deref(),
1305            &wheel_info,
1306        );
1307
1308        let mut extra_data = package_data.extra_data.unwrap_or_default();
1309        extra_data.insert(
1310            "python_requires".to_string(),
1311            serde_json::Value::String(wheel_info.python_tag.clone()),
1312        );
1313        extra_data.insert(
1314            "abi_tag".to_string(),
1315            serde_json::Value::String(wheel_info.abi_tag.clone()),
1316        );
1317        extra_data.insert(
1318            "platform_tag".to_string(),
1319            serde_json::Value::String(wheel_info.platform_tag.clone()),
1320        );
1321        package_data.extra_data = Some(extra_data);
1322    }
1323
1324    package_data
1325}
1326
1327fn extract_from_egg_archive(path: &Path) -> PackageData {
1328    let metadata = match std::fs::metadata(path) {
1329        Ok(m) => m,
1330        Err(e) => {
1331            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1332            return default_package_data(path);
1333        }
1334    };
1335
1336    if metadata.len() > MAX_ARCHIVE_SIZE {
1337        warn!(
1338            "Egg archive too large: {} bytes (limit: {} bytes)",
1339            metadata.len(),
1340            MAX_ARCHIVE_SIZE
1341        );
1342        return default_package_data(path);
1343    }
1344
1345    let file = match File::open(path) {
1346        Ok(f) => f,
1347        Err(e) => {
1348            warn!("Failed to open egg archive {:?}: {}", path, e);
1349            return default_package_data(path);
1350        }
1351    };
1352
1353    let mut archive = match ZipArchive::new(file) {
1354        Ok(a) => a,
1355        Err(e) => {
1356            warn!("Failed to read egg archive {:?}: {}", path, e);
1357            return default_package_data(path);
1358        }
1359    };
1360
1361    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1362        Ok(entries) => entries,
1363        Err(_) => return default_package_data(path),
1364    };
1365
1366    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1367        &validated_entries,
1368        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1369    ) {
1370        Some(entry) => entry,
1371        None => {
1372            warn!("No PKG-INFO file found in egg archive {:?}", path);
1373            return default_package_data(path);
1374        }
1375    };
1376
1377    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1378        Ok(c) => c,
1379        Err(e) => {
1380            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1381            return default_package_data(path);
1382        }
1383    };
1384
1385    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1386
1387    let (size, sha256) = calculate_file_checksums(path);
1388    package_data.size = size;
1389    package_data.sha256 = sha256;
1390
1391    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1392        &validated_entries,
1393        &[
1394            "EGG-INFO/installed-files.txt",
1395            ".egg-info/installed-files.txt",
1396        ],
1397    ) && let Ok(installed_files_content) =
1398        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1399    {
1400        package_data.file_references = parse_installed_files_txt(&installed_files_content);
1401    }
1402
1403    if let Some(egg_info) = parse_egg_filename(path) {
1404        if package_data.name.is_none() {
1405            package_data.name = Some(egg_info.name.clone());
1406        }
1407        if package_data.version.is_none() {
1408            package_data.version = Some(egg_info.version.clone());
1409        }
1410
1411        if let Some(python_version) = &egg_info.python_version {
1412            let mut extra_data = package_data.extra_data.unwrap_or_default();
1413            extra_data.insert(
1414                "python_version".to_string(),
1415                serde_json::Value::String(python_version.clone()),
1416            );
1417            package_data.extra_data = Some(extra_data);
1418        }
1419    }
1420
1421    package_data.purl = build_egg_purl(
1422        package_data.name.as_deref(),
1423        package_data.version.as_deref(),
1424    );
1425
1426    package_data
1427}
1428
1429fn find_validated_zip_entry_by_suffix<'a>(
1430    entries: &'a [ValidatedZipEntry],
1431    suffix: &str,
1432) -> Option<&'a ValidatedZipEntry> {
1433    entries.iter().find(|entry| entry.name.ends_with(suffix))
1434}
1435
1436fn find_validated_zip_entry_by_any_suffix<'a>(
1437    entries: &'a [ValidatedZipEntry],
1438    suffixes: &[&str],
1439) -> Option<&'a ValidatedZipEntry> {
1440    entries
1441        .iter()
1442        .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1443}
1444
1445fn read_validated_zip_entry<R: Read + std::io::Seek>(
1446    archive: &mut ZipArchive<R>,
1447    entry: &ValidatedZipEntry,
1448    path: &Path,
1449    archive_type: &str,
1450) -> Result<String, String> {
1451    let mut file = archive
1452        .by_index(entry.index)
1453        .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1454
1455    let compressed_size = file.compressed_size();
1456    let uncompressed_size = file.size();
1457
1458    if compressed_size > 0 {
1459        let ratio = uncompressed_size as f64 / compressed_size as f64;
1460        if ratio > MAX_COMPRESSION_RATIO {
1461            return Err(format!(
1462                "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1463                archive_type, path, ratio
1464            ));
1465        }
1466    }
1467
1468    if uncompressed_size > MAX_FILE_SIZE {
1469        return Err(format!(
1470            "Rejected oversized entry in {} {:?}: {} bytes",
1471            archive_type, path, uncompressed_size
1472        ));
1473    }
1474
1475    read_limited_utf8(
1476        &mut file,
1477        MAX_FILE_SIZE,
1478        &format!("{} entry {}", archive_type, entry.name),
1479    )
1480}
1481
/// Reads at most `max_bytes` bytes from `reader` and decodes them as UTF-8.
///
/// `context` is a short description of what is being read; it is only used
/// to build error messages.
///
/// # Errors
/// Returns a message when the underlying read fails, when the stream holds
/// more than `max_bytes` bytes, or when the bytes are not valid UTF-8.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    // Allow one byte past the limit so an over-long stream can be told apart
    // from one that is exactly `max_bytes` long. `saturating_add` keeps
    // `u64::MAX` from overflowing.
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if bytes.len() as u64 > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1502
/// Converts an archive entry path into a safe, `/`-separated relative path.
///
/// Backslashes are treated as separators and `.` components are dropped.
/// Returns `None` for anything that could escape the extraction root or is
/// not relative: a drive-letter prefix such as `C:/`, a leading `/`, any
/// `..` component, or a path with no normal components at all.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Detect `X:/` by hand so drive-letter paths are refused even when `Path`
    // does not parse them as a prefix.
    if let [drive, b':', b'/', ..] = unified.as_bytes() {
        if drive.is_ascii_alphabetic() {
            return None;
        }
    }

    let mut segments: Vec<String> = Vec::new();
    for component in Path::new(&unified).components() {
        match component {
            Component::CurDir => continue,
            Component::Normal(segment) => segments.push(segment.to_string_lossy().into_owned()),
            Component::RootDir | Component::ParentDir | Component::Prefix(_) => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1524
1525/// Parses RECORD CSV format from wheel archives (PEP 427).
1526/// Format: path,hash,size (3 columns, no header)
1527/// Hash format: sha256=urlsafe_base64_hash or empty
1528/// Size: bytes as u64 or empty
1529pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1530    let mut reader = ReaderBuilder::new()
1531        .has_headers(false)
1532        .from_reader(content.as_bytes());
1533
1534    let mut file_references = Vec::new();
1535
1536    for result in reader.records() {
1537        match result {
1538            Ok(record) => {
1539                if record.len() < 3 {
1540                    continue;
1541                }
1542
1543                let path = record.get(0).unwrap_or("").trim().to_string();
1544                if path.is_empty() {
1545                    continue;
1546                }
1547
1548                let hash_field = record.get(1).unwrap_or("").trim();
1549                let size_field = record.get(2).unwrap_or("").trim();
1550
1551                // Parse hash: format is "algorithm=value"
1552                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1553                    let parts: Vec<&str> = hash_field.split('=').collect();
1554                    if parts.len() == 2 && parts[0] == "sha256" {
1555                        // Decode base64 to hex
1556                        match URL_SAFE_NO_PAD.decode(parts[1]) {
1557                            Ok(decoded) => {
1558                                let hex = decoded
1559                                    .iter()
1560                                    .map(|b| format!("{:02x}", b))
1561                                    .collect::<String>();
1562                                Some(hex)
1563                            }
1564                            Err(_) => None,
1565                        }
1566                    } else {
1567                        None
1568                    }
1569                } else {
1570                    None
1571                };
1572
1573                // Parse size
1574                let size = if !size_field.is_empty() && size_field != "-" {
1575                    size_field.parse::<u64>().ok()
1576                } else {
1577                    None
1578                };
1579
1580                file_references.push(FileReference {
1581                    path,
1582                    size,
1583                    sha1: None,
1584                    md5: None,
1585                    sha256,
1586                    sha512: None,
1587                    extra_data: None,
1588                });
1589            }
1590            Err(e) => {
1591                warn!("Failed to parse RECORD CSV row: {}", e);
1592                continue;
1593            }
1594        }
1595    }
1596
1597    file_references
1598}
1599
1600/// Parses installed-files.txt format from egg archives (PEP 376).
1601/// Format: one file path per line, no headers, no hash, no size
1602pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1603    content
1604        .lines()
1605        .map(|line| line.trim())
1606        .filter(|line| !line.is_empty())
1607        .map(|path| FileReference {
1608            path: path.to_string(),
1609            size: None,
1610            sha1: None,
1611            md5: None,
1612            sha256: None,
1613            sha512: None,
1614            extra_data: None,
1615        })
1616        .collect()
1617}
1618
1619pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1620    content
1621        .lines()
1622        .map(str::trim)
1623        .filter(|line| !line.is_empty())
1624        .map(|path| FileReference {
1625            path: path.to_string(),
1626            size: None,
1627            sha1: None,
1628            md5: None,
1629            sha256: None,
1630            sha512: None,
1631            extra_data: None,
1632        })
1633        .collect()
1634}
1635
/// Components parsed from a wheel file name.
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component, as written in the file name.
    version: String,
    /// Python tag, e.g. `py3` or `cp39`.
    python_tag: String,
    /// ABI tag, e.g. `none` or `cp39`.
    abi_tag: String,
    /// Platform tag, e.g. `any` or `manylinux1_x86_64`.
    platform_tag: String,
}

/// Parses a wheel file name of the form
/// `{name}-{version}(-{build})?-{python}-{abi}-{platform}.whl`.
///
/// The optional build tag is recognised when the stem has exactly six
/// `-`-separated parts and the third one starts with an ASCII digit (python
/// tags start with a letter). It is skipped so it cannot be mistaken for the
/// python tag. In every other case the third and fourth parts are the python
/// and ABI tags and all remaining parts, joined with `-`, form the platform
/// tag.
///
/// Returns `None` when the path has no file stem or the stem has fewer than
/// five parts.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    let has_build_tag =
        parts.len() == 6 && parts[2].starts_with(|c: char| c.is_ascii_digit());
    let tags = if has_build_tag {
        &parts[3..]
    } else {
        &parts[2..]
    };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: tags[0].to_string(),
        abi_tag: tags[1].to_string(),
        platform_tag: tags[2..].join("-"),
    })
}
1660
/// Components parsed from an egg file name.
struct EggInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version component, as written in the file name.
    version: String,
    /// Third `-`-separated component of the stem, when present.
    python_version: Option<String>,
}

/// Parses an egg file name by splitting its stem on `-`.
///
/// The first part is the name, the second the version, and an optional third
/// part is kept as the Python version. Further parts are ignored.
///
/// Returns `None` when the path has no file stem or the stem has fewer than
/// two parts.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut parts = stem.split('-');

    let raw_name = parts.next()?;
    let raw_version = parts.next()?;
    let python_version = parts.next().map(str::to_string);

    Some(EggInfo {
        name: raw_name.replace('_', "-"),
        version: raw_version.to_string(),
        python_version,
    })
}
1681
1682fn build_wheel_purl(
1683    name: Option<&str>,
1684    version: Option<&str>,
1685    wheel_info: &WheelInfo,
1686) -> Option<String> {
1687    let name = name?;
1688    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1689
1690    if let Some(ver) = version {
1691        package_url.with_version(ver).ok()?;
1692    }
1693
1694    let extension = format!(
1695        "{}-{}-{}",
1696        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1697    );
1698    package_url.add_qualifier("extension", extension).ok()?;
1699
1700    Some(package_url.to_string())
1701}
1702
1703fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1704    let name = name?;
1705    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1706
1707    if let Some(ver) = version {
1708        package_url.with_version(ver).ok()?;
1709    }
1710
1711    package_url.add_qualifier("type", "egg").ok()?;
1712
1713    Some(package_url.to_string())
1714}
1715
1716fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1717    let metadata = super::rfc822::parse_rfc822_content(content);
1718    build_package_data_from_rfc822(&metadata, datasource_id)
1719}
1720
1721/// Builds PackageData from parsed RFC822 metadata.
1722///
1723/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
1724/// and `python_parse_rfc822_content` (content-based) functions.
1725fn build_package_data_from_rfc822(
1726    metadata: &super::rfc822::Rfc822Metadata,
1727    datasource_id: DatasourceId,
1728) -> PackageData {
1729    use super::rfc822::{get_header_all, get_header_first};
1730
1731    let name = get_header_first(&metadata.headers, "name");
1732    let version = get_header_first(&metadata.headers, "version");
1733    let summary = get_header_first(&metadata.headers, "summary");
1734    let mut homepage_url = get_header_first(&metadata.headers, "home-page");
1735    let author = get_header_first(&metadata.headers, "author");
1736    let author_email = get_header_first(&metadata.headers, "author-email");
1737    let license = get_header_first(&metadata.headers, "license");
1738    let license_expression = get_header_first(&metadata.headers, "license-expression");
1739    let download_url = get_header_first(&metadata.headers, "download-url");
1740    let platform = get_header_first(&metadata.headers, "platform");
1741    let requires_python = get_header_first(&metadata.headers, "requires-python");
1742    let classifiers = get_header_all(&metadata.headers, "classifier");
1743    let license_files = get_header_all(&metadata.headers, "license-file");
1744
1745    let description_body = if metadata.body.is_empty() {
1746        get_header_first(&metadata.headers, "description").unwrap_or_default()
1747    } else {
1748        metadata.body.clone()
1749    };
1750
1751    let description = build_description(summary.as_deref(), &description_body);
1752
1753    let mut parties = Vec::new();
1754    if author.is_some() || author_email.is_some() {
1755        parties.push(Party {
1756            r#type: Some("person".to_string()),
1757            role: Some("author".to_string()),
1758            name: author,
1759            email: author_email,
1760            url: None,
1761            organization: None,
1762            organization_url: None,
1763            timezone: None,
1764        });
1765    }
1766
1767    let (keywords, license_classifiers) = split_classifiers(&classifiers);
1768    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1769    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1770        license_expression
1771            .as_deref()
1772            .and_then(normalize_spdx_expression)
1773            .map(|normalized| {
1774                build_declared_license_data(
1775                    normalized,
1776                    DeclaredLicenseMatchMetadata::single_line(
1777                        license_expression.as_deref().unwrap_or_default(),
1778                    )
1779                    .with_referenced_filenames(&referenced_license_files),
1780                )
1781            })
1782            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1783
1784    let extracted_license_statement = license_expression
1785        .clone()
1786        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1787
1788    let mut extra_data = HashMap::new();
1789    if let Some(platform_value) = platform
1790        && !platform_value.eq_ignore_ascii_case("unknown")
1791        && !platform_value.is_empty()
1792    {
1793        extra_data.insert(
1794            "platform".to_string(),
1795            serde_json::Value::String(platform_value),
1796        );
1797    }
1798
1799    if let Some(requires_python_value) = requires_python
1800        && !requires_python_value.is_empty()
1801    {
1802        extra_data.insert(
1803            "requires_python".to_string(),
1804            serde_json::Value::String(requires_python_value),
1805        );
1806    }
1807
1808    if !license_files.is_empty() {
1809        extra_data.insert(
1810            "license_files".to_string(),
1811            serde_json::Value::Array(
1812                license_files
1813                    .iter()
1814                    .cloned()
1815                    .map(serde_json::Value::String)
1816                    .collect(),
1817            ),
1818        );
1819    }
1820
1821    let file_references = license_files
1822        .iter()
1823        .map(|path| FileReference {
1824            path: path.clone(),
1825            size: None,
1826            sha1: None,
1827            md5: None,
1828            sha256: None,
1829            sha512: None,
1830            extra_data: None,
1831        })
1832        .collect();
1833
1834    let project_urls = get_header_all(&metadata.headers, "project-url");
1835    let dependencies = extract_rfc822_dependencies(&metadata.headers);
1836    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1837
1838    if !project_urls.is_empty() {
1839        let parsed_urls = parse_project_urls(&project_urls);
1840
1841        for (label, url) in &parsed_urls {
1842            let label_lower = label.to_lowercase();
1843
1844            if bug_tracking_url.is_none()
1845                && matches!(
1846                    label_lower.as_str(),
1847                    "tracker"
1848                        | "bug reports"
1849                        | "bug tracker"
1850                        | "issues"
1851                        | "issue tracker"
1852                        | "github: issues"
1853                )
1854            {
1855                bug_tracking_url = Some(url.clone());
1856            } else if code_view_url.is_none()
1857                && matches!(label_lower.as_str(), "source" | "source code" | "code")
1858            {
1859                code_view_url = Some(url.clone());
1860            } else if vcs_url.is_none()
1861                && matches!(
1862                    label_lower.as_str(),
1863                    "github" | "gitlab" | "github: repo" | "repository"
1864                )
1865            {
1866                vcs_url = Some(url.clone());
1867            } else if homepage_url.is_none()
1868                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1869            {
1870                homepage_url = Some(url.clone());
1871            } else if label_lower == "changelog" {
1872                extra_data.insert(
1873                    "changelog_url".to_string(),
1874                    serde_json::Value::String(url.clone()),
1875                );
1876            }
1877        }
1878
1879        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
1880            .iter()
1881            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
1882            .collect();
1883
1884        if !project_urls_json.is_empty() {
1885            extra_data.insert(
1886                "project_urls".to_string(),
1887                serde_json::Value::Object(project_urls_json),
1888            );
1889        }
1890    }
1891
1892    let extra_data = if extra_data.is_empty() {
1893        None
1894    } else {
1895        Some(extra_data)
1896    };
1897
1898    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1899        build_pypi_urls(name.as_deref(), version.as_deref());
1900
1901    PackageData {
1902        package_type: Some(PythonParser::PACKAGE_TYPE),
1903        namespace: None,
1904        name,
1905        version,
1906        qualifiers: None,
1907        subpath: None,
1908        primary_language: Some("Python".to_string()),
1909        description,
1910        release_date: None,
1911        parties,
1912        keywords,
1913        homepage_url,
1914        download_url,
1915        size: None,
1916        sha1: None,
1917        md5: None,
1918        sha256: None,
1919        sha512: None,
1920        bug_tracking_url,
1921        code_view_url,
1922        vcs_url,
1923        copyright: None,
1924        holder: None,
1925        declared_license_expression,
1926        declared_license_expression_spdx,
1927        license_detections,
1928        other_license_expression: None,
1929        other_license_expression_spdx: None,
1930        other_license_detections: Vec::new(),
1931        extracted_license_statement,
1932        notice_text: None,
1933        source_packages: Vec::new(),
1934        file_references,
1935        is_private: false,
1936        is_virtual: false,
1937        extra_data,
1938        dependencies,
1939        repository_homepage_url,
1940        repository_download_url,
1941        api_data_url,
1942        datasource_id: Some(datasource_id),
1943        purl,
1944    }
1945}
1946
/// Splits `Project-URL` metadata values into `(label, url)` pairs.
///
/// Each entry is expected to look like `Label, https://example.org`. The
/// `", "` separator is tried first so that entries which already parsed keep
/// producing exactly the same pair. Entries written without a space after the
/// comma (`Label,https://example.org`) fall back to splitting on the first bare
/// comma instead of being silently dropped.
///
/// Entries without any comma, or whose label or URL is empty after trimming,
/// are skipped. The order of the input entries is preserved.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|entry| {
            let (label, url) = entry
                .split_once(", ")
                .or_else(|| entry.split_once(','))?;
            let (label, url) = (label.trim(), url.trim());

            (!label.is_empty() && !url.is_empty()).then(|| (label.to_string(), url.to_string()))
        })
        .collect()
}
1962
/// Combines the short summary and the long description body into one text.
///
/// Both inputs are trimmed; blank inputs are ignored. When both are present
/// they are joined with a single newline, summary first. Returns `None` when
/// neither contributes any text.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let headline = summary.map(str::trim).filter(|text| !text.is_empty());
    let details = Some(body.trim()).filter(|text| !text.is_empty());

    match (headline, details) {
        (Some(first), Some(second)) => Some(format!("{}\n{}", first, second)),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
1981
/// Separates trove classifiers into `(keywords, license_classifiers)`.
///
/// A classifier that starts with `License ::` goes to the second list; every
/// other classifier goes to the first. Relative order is kept in both lists.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    classifiers
        .iter()
        .cloned()
        .partition(|classifier| !classifier.starts_with("License ::"))
}
1996
/// Renders the declared license text and license classifiers as one statement.
///
/// The output is a small YAML-like block: an optional `license: <text>` line
/// (text trimmed, omitted when blank) followed by an optional `classifiers:`
/// list with one quoted entry per classifier. Every line, including the last,
/// ends with a newline. Returns `None` when there is nothing to render.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let license_line = license
        .map(str::trim)
        .filter(|text| !text.is_empty())
        .map(|text| format!("license: {}", text));
    let classifiers_header =
        (!license_classifiers.is_empty()).then(|| "classifiers:".to_string());
    let classifier_lines = license_classifiers
        .iter()
        .map(|classifier| format!("  - '{}'", classifier));

    let rendered: Vec<String> = license_line
        .into_iter()
        .chain(classifiers_header)
        .chain(classifier_lines)
        .collect();

    if rendered.is_empty() {
        return None;
    }
    Some(format!("{}\n", rendered.join("\n")))
}
2022
2023pub(crate) fn build_pypi_urls(
2024    name: Option<&str>,
2025    version: Option<&str>,
2026) -> (
2027    Option<String>,
2028    Option<String>,
2029    Option<String>,
2030    Option<String>,
2031) {
2032    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
2033
2034    let repository_download_url = name.and_then(|value| {
2035        version.map(|ver| {
2036            format!(
2037                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2038                &value[..1.min(value.len())],
2039                value,
2040                value,
2041                ver
2042            )
2043        })
2044    });
2045
2046    let api_data_url = name.map(|value| {
2047        if let Some(ver) = version {
2048            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2049        } else {
2050            format!("https://pypi.org/pypi/{}/json", value)
2051        }
2052    });
2053
2054    let purl = name.and_then(|value| {
2055        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2056        if let Some(ver) = version {
2057            package_url.with_version(ver).ok()?;
2058        }
2059        Some(package_url.to_string())
2060    });
2061
2062    (
2063        repository_homepage_url,
2064        repository_download_url,
2065        api_data_url,
2066        purl,
2067    )
2068}
2069
2070fn build_pypi_purl_with_extension(
2071    name: &str,
2072    version: Option<&str>,
2073    extension: &str,
2074) -> Option<String> {
2075    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2076    if let Some(ver) = version {
2077        package_url.with_version(ver).ok()?;
2078    }
2079    package_url.add_qualifier("extension", extension).ok()?;
2080    Some(package_url.to_string())
2081}
2082
2083fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2084    let toml_content = match read_toml_file(path) {
2085        Ok(content) => content,
2086        Err(e) => {
2087            warn!(
2088                "Failed to read or parse pyproject.toml at {:?}: {}",
2089                path, e
2090            );
2091            return default_package_data(path);
2092        }
2093    };
2094
2095    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2096    let is_poetry_pyproject = tool_table
2097        .and_then(|tool| tool.get("poetry"))
2098        .and_then(|value| value.as_table())
2099        .is_some();
2100
2101    // Handle both PEP 621 (project table) and poetry formats
2102    let project_table =
2103        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2104            // Standard PEP 621 format with [project] table
2105            project.clone()
2106        } else if let Some(tool) = tool_table {
2107            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2108                // Poetry format with [tool.poetry] table
2109                poetry.clone()
2110            } else {
2111                return default_package_data(path);
2112            }
2113        } else if toml_content.get(FIELD_NAME).is_some() {
2114            // Other format with top-level fields
2115            match toml_content.as_table() {
2116                Some(table) => table.clone(),
2117                None => {
2118                    warn!("Failed to convert TOML content to table in {:?}", path);
2119                    return default_package_data(path);
2120                }
2121            }
2122        } else {
2123            return default_package_data(path);
2124        };
2125
2126    let name = project_table
2127        .get(FIELD_NAME)
2128        .and_then(|v| v.as_str())
2129        .map(String::from);
2130
2131    let version = project_table
2132        .get(FIELD_VERSION)
2133        .and_then(|v| v.as_str())
2134        .map(String::from);
2135    let classifiers = project_table
2136        .get("classifiers")
2137        .and_then(|value| value.as_array())
2138        .map(|values| {
2139            values
2140                .iter()
2141                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2142                .collect::<Vec<_>>()
2143        })
2144        .unwrap_or_default();
2145
2146    let extracted_license_statement = extract_raw_license_string(&project_table);
2147    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2148        normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2149
2150    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
2151    let (homepage_url, repository_url) = extract_urls(&project_table);
2152
2153    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2154    let extra_data = extract_pyproject_extra_data(&toml_content);
2155
2156    // Create package URL
2157    let purl = name.as_ref().and_then(|n| {
2158        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2159            Ok(p) => p,
2160            Err(e) => {
2161                warn!(
2162                    "Failed to create PackageUrl for Python package '{}': {}",
2163                    n, e
2164                );
2165                return None;
2166            }
2167        };
2168
2169        if let Some(v) = &version
2170            && let Err(e) = package_url.with_version(v)
2171        {
2172            warn!(
2173                "Failed to set version '{}' for Python package '{}': {}",
2174                v, n, e
2175            );
2176            return None;
2177        }
2178
2179        Some(package_url.to_string())
2180    });
2181
2182    let api_data_url = name.as_ref().map(|n| {
2183        if let Some(v) = &version {
2184            format!("https://pypi.org/pypi/{}/{}/json", n, v)
2185        } else {
2186            format!("https://pypi.org/pypi/{}/json", n)
2187        }
2188    });
2189
2190    let pypi_homepage_url = name
2191        .as_ref()
2192        .map(|n| format!("https://pypi.org/project/{}", n));
2193
2194    let pypi_download_url = name.as_ref().and_then(|n| {
2195        version.as_ref().map(|v| {
2196            format!(
2197                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2198                &n[..1.min(n.len())],
2199                n,
2200                n,
2201                v
2202            )
2203        })
2204    });
2205
2206    PackageData {
2207        package_type: Some(PythonParser::PACKAGE_TYPE),
2208        namespace: None,
2209        name,
2210        version,
2211        qualifiers: None,
2212        subpath: None,
2213        primary_language: None,
2214        description: None,
2215        release_date: None,
2216        parties: extract_parties(&project_table),
2217        keywords: Vec::new(),
2218        homepage_url: homepage_url.or(pypi_homepage_url),
2219        download_url: repository_url.clone().or(pypi_download_url),
2220        size: None,
2221        sha1: None,
2222        md5: None,
2223        sha256: None,
2224        sha512: None,
2225        bug_tracking_url: None,
2226        code_view_url: None,
2227        vcs_url: repository_url,
2228        copyright: None,
2229        holder: None,
2230        declared_license_expression,
2231        declared_license_expression_spdx,
2232        license_detections,
2233        other_license_expression: None,
2234        other_license_expression_spdx: None,
2235        other_license_detections: Vec::new(),
2236        extracted_license_statement,
2237        notice_text: None,
2238        source_packages: Vec::new(),
2239        file_references: Vec::new(),
2240        is_private: has_private_classifier(&classifiers),
2241        is_virtual: false,
2242        extra_data,
2243        dependencies: [dependencies, optional_dependencies].concat(),
2244        repository_homepage_url: None,
2245        repository_download_url: None,
2246        api_data_url,
2247        datasource_id: Some(if is_poetry_pyproject {
2248            DatasourceId::PypiPoetryPyprojectToml
2249        } else {
2250            DatasourceId::PypiPyprojectToml
2251        }),
2252        purl,
2253    }
2254}
2255
2256fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2257    let path_str = path.to_string_lossy().replace('\\', "/");
2258    if path_str.contains("/EGG-INFO/PKG-INFO") {
2259        DatasourceId::PypiEggPkginfo
2260    } else if path_str.ends_with(".egg-info/PKG-INFO") {
2261        DatasourceId::PypiEditableEggPkginfo
2262    } else {
2263        DatasourceId::PypiSdistPkginfo
2264    }
2265}
2266
2267fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2268    project
2269        .get(FIELD_LICENSE)
2270        .and_then(|license_value| match license_value {
2271            TomlValue::String(license_str) => Some(license_str.clone()),
2272            TomlValue::Table(license_table) => license_table
2273                .get("text")
2274                .and_then(|v| v.as_str())
2275                .map(|s| s.to_string())
2276                .or_else(|| {
2277                    license_table
2278                        .get("expression")
2279                        .and_then(|v| v.as_str())
2280                        .map(|expr| expr.to_string())
2281                }),
2282            _ => None,
2283        })
2284}
2285
2286fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2287    match project.get(FIELD_LICENSE) {
2288        Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2289        Some(TomlValue::Table(license_table)) => license_table
2290            .get("expression")
2291            .and_then(|value| value.as_str()),
2292        _ => None,
2293    }
2294}
2295
2296fn extract_urls(project: &TomlMap<String, TomlValue>) -> (Option<String>, Option<String>) {
2297    let mut homepage_url = None;
2298    let mut repository_url = None;
2299
2300    // Check for URLs table
2301    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2302        homepage_url = urls
2303            .get(FIELD_HOMEPAGE)
2304            .and_then(|v| v.as_str())
2305            .map(String::from);
2306        repository_url = urls
2307            .get(FIELD_REPOSITORY)
2308            .and_then(|v| v.as_str())
2309            .map(String::from);
2310    }
2311
2312    // If not found in URLs table, check for top-level keys
2313    if homepage_url.is_none() {
2314        homepage_url = project
2315            .get(FIELD_HOMEPAGE)
2316            .and_then(|v| v.as_str())
2317            .map(String::from);
2318    }
2319
2320    if repository_url.is_none() {
2321        repository_url = project
2322            .get(FIELD_REPOSITORY)
2323            .and_then(|v| v.as_str())
2324            .map(String::from);
2325    }
2326
2327    (homepage_url, repository_url)
2328}
2329
2330fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2331    let mut parties = Vec::new();
2332
2333    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2334        for author in authors {
2335            if let Some(author_str) = author.as_str() {
2336                let (name, email) = split_name_email(author_str);
2337                parties.push(Party {
2338                    r#type: None,
2339                    role: Some("author".to_string()),
2340                    name,
2341                    email,
2342                    url: None,
2343                    organization: None,
2344                    organization_url: None,
2345                    timezone: None,
2346                });
2347            }
2348        }
2349    }
2350
2351    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2352        for maintainer in maintainers {
2353            if let Some(maintainer_str) = maintainer.as_str() {
2354                let (name, email) = split_name_email(maintainer_str);
2355                parties.push(Party {
2356                    r#type: None,
2357                    role: Some("maintainer".to_string()),
2358                    name,
2359                    email,
2360                    url: None,
2361                    organization: None,
2362                    organization_url: None,
2363                    timezone: None,
2364                });
2365            }
2366        }
2367    }
2368
2369    parties
2370}
2371
2372fn extract_dependencies(
2373    project: &TomlMap<String, TomlValue>,
2374    toml_content: &TomlValue,
2375) -> (Vec<Dependency>, Vec<Dependency>) {
2376    let mut dependencies = Vec::new();
2377    let mut optional_dependencies = Vec::new();
2378
2379    // Handle dependencies - can be array or table format
2380    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2381        match deps_value {
2382            TomlValue::Array(arr) => {
2383                dependencies = parse_dependency_array(arr, false, None);
2384            }
2385            TomlValue::Table(table) => {
2386                dependencies = parse_dependency_table(table, false, None);
2387            }
2388            _ => {}
2389        }
2390    }
2391
2392    // Handle PEP 621 optional-dependencies with scope
2393    if let Some(opt_deps_table) = project
2394        .get(FIELD_OPTIONAL_DEPENDENCIES)
2395        .and_then(|v| v.as_table())
2396    {
2397        for (extra_name, deps) in opt_deps_table {
2398            match deps {
2399                TomlValue::Array(arr) => {
2400                    optional_dependencies.extend(parse_dependency_array(
2401                        arr,
2402                        true,
2403                        Some(extra_name),
2404                    ));
2405                }
2406                TomlValue::Table(table) => {
2407                    optional_dependencies.extend(parse_dependency_table(
2408                        table,
2409                        true,
2410                        Some(extra_name),
2411                    ));
2412                }
2413                _ => {}
2414            }
2415        }
2416    }
2417
2418    // Handle Poetry dev-dependencies
2419    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2420        match dev_deps_value {
2421            TomlValue::Array(arr) => {
2422                optional_dependencies.extend(parse_dependency_array(
2423                    arr,
2424                    true,
2425                    Some(FIELD_DEV_DEPENDENCIES),
2426                ));
2427            }
2428            TomlValue::Table(table) => {
2429                optional_dependencies.extend(parse_dependency_table(
2430                    table,
2431                    true,
2432                    Some(FIELD_DEV_DEPENDENCIES),
2433                ));
2434            }
2435            _ => {}
2436        }
2437    }
2438
2439    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
2440    if let Some(groups_table) = project.get("group").and_then(|v| v.as_table()) {
2441        for (group_name, group_data) in groups_table {
2442            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2443                match group_deps {
2444                    TomlValue::Array(arr) => {
2445                        optional_dependencies.extend(parse_dependency_array(
2446                            arr,
2447                            true,
2448                            Some(group_name),
2449                        ));
2450                    }
2451                    TomlValue::Table(table) => {
2452                        optional_dependencies.extend(parse_dependency_table(
2453                            table,
2454                            true,
2455                            Some(group_name),
2456                        ));
2457                    }
2458                    _ => {}
2459                }
2460            }
2461        }
2462    }
2463
2464    if let Some(groups_table) = toml_content
2465        .get(FIELD_DEPENDENCY_GROUPS)
2466        .and_then(|value| value.as_table())
2467    {
2468        for (group_name, deps) in groups_table {
2469            match deps {
2470                TomlValue::Array(arr) => {
2471                    optional_dependencies.extend(parse_dependency_array(
2472                        arr,
2473                        true,
2474                        Some(group_name),
2475                    ));
2476                }
2477                TomlValue::Table(table) => {
2478                    optional_dependencies.extend(parse_dependency_table(
2479                        table,
2480                        true,
2481                        Some(group_name),
2482                    ));
2483                }
2484                _ => {}
2485            }
2486        }
2487    }
2488
2489    if let Some(dev_deps_value) = toml_content
2490        .get("tool")
2491        .and_then(|value| value.as_table())
2492        .and_then(|tool| tool.get("uv"))
2493        .and_then(|value| value.as_table())
2494        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2495    {
2496        match dev_deps_value {
2497            TomlValue::Array(arr) => {
2498                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2499            }
2500            TomlValue::Table(table) => {
2501                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2502            }
2503            _ => {}
2504        }
2505    }
2506
2507    (dependencies, optional_dependencies)
2508}
2509
2510fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2511    let mut extra_data = HashMap::new();
2512
2513    if let Some(tool_uv) = toml_content
2514        .get("tool")
2515        .and_then(|value| value.as_table())
2516        .and_then(|tool| tool.get("uv"))
2517    {
2518        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2519    }
2520
2521    if extra_data.is_empty() {
2522        None
2523    } else {
2524        Some(extra_data)
2525    }
2526}
2527
2528fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2529    match value {
2530        TomlValue::String(value) => JsonValue::String(value.clone()),
2531        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2532        TomlValue::Float(value) => JsonValue::String(value.to_string()),
2533        TomlValue::Boolean(value) => JsonValue::Bool(*value),
2534        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2535        TomlValue::Array(values) => {
2536            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2537        }
2538        TomlValue::Table(values) => JsonValue::Object(
2539            values
2540                .iter()
2541                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2542                .collect::<JsonMap<String, JsonValue>>(),
2543        ),
2544    }
2545}
2546
2547fn parse_dependency_table(
2548    table: &TomlMap<String, TomlValue>,
2549    is_optional: bool,
2550    scope: Option<&str>,
2551) -> Vec<Dependency> {
2552    table
2553        .iter()
2554        .filter_map(|(name, version)| {
2555            let version_str = version.as_str().map(|s| s.to_string());
2556            let mut package_url =
2557                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2558
2559            if let Some(v) = &version_str {
2560                package_url.with_version(v).ok()?;
2561            }
2562
2563            Some(Dependency {
2564                purl: Some(package_url.to_string()),
2565                extracted_requirement: None,
2566                scope: scope.map(|s| s.to_string()),
2567                is_runtime: Some(!is_optional),
2568                is_optional: Some(is_optional),
2569                is_pinned: None,
2570                is_direct: Some(true),
2571                resolved_package: None,
2572                extra_data: None,
2573            })
2574        })
2575        .collect()
2576}
2577
2578fn parse_dependency_array(
2579    array: &[TomlValue],
2580    is_optional: bool,
2581    scope: Option<&str>,
2582) -> Vec<Dependency> {
2583    array
2584        .iter()
2585        .filter_map(|dep| {
2586            let dep_str = dep.as_str()?;
2587
2588            let mut parts = dep_str.split(['>', '=', '<', '~']);
2589            let name = parts.next()?.trim().to_string();
2590
2591            let version = parts.next().map(|v| v.trim().to_string());
2592
2593            let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name)
2594            {
2595                Ok(purl) => purl,
2596                Err(_) => return None,
2597            };
2598
2599            if let Some(ref v) = version {
2600                package_url.with_version(v).ok()?;
2601            }
2602
2603            Some(Dependency {
2604                purl: Some(package_url.to_string()),
2605                extracted_requirement: None,
2606                scope: scope.map(|s| s.to_string()),
2607                is_runtime: Some(!is_optional),
2608                is_optional: Some(is_optional),
2609                is_pinned: None,
2610                is_direct: Some(true),
2611                resolved_package: None,
2612                extra_data: None,
2613            })
2614        })
2615        .collect()
2616}
2617
/// A Python literal value recovered from a `setup.py` syntax tree.
///
/// `PartialEq` is derived so evaluated values can be compared directly.
#[derive(Debug, Clone, PartialEq)]
enum Value {
    /// A string literal.
    String(String),
    /// A numeric literal, held as a 64-bit float.
    Number(f64),
    /// `True` or `False`.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list whose elements all evaluated successfully.
    List(Vec<Value>),
    /// A tuple whose elements all evaluated successfully.
    Tuple(Vec<Value>),
    /// A dictionary keyed by the string form of each key.
    Dict(HashMap<String, Value>),
}
2628
2629struct LiteralEvaluator {
2630    constants: HashMap<String, Value>,
2631    max_depth: usize,
2632    max_nodes: usize,
2633    nodes_visited: usize,
2634}
2635
2636impl LiteralEvaluator {
2637    fn new(constants: HashMap<String, Value>) -> Self {
2638        Self {
2639            constants,
2640            max_depth: MAX_SETUP_PY_AST_DEPTH,
2641            max_nodes: MAX_SETUP_PY_AST_NODES,
2642            nodes_visited: 0,
2643        }
2644    }
2645
2646    fn insert_constant(&mut self, name: String, value: Value) {
2647        self.constants.insert(name, value);
2648    }
2649
2650    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
2651        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
2652            return None;
2653        }
2654        self.nodes_visited += 1;
2655
2656        match expr {
2657            ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
2658                Some(Value::String(value.to_str().to_string()))
2659            }
2660            ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
2661                Some(Value::Bool(*value))
2662            }
2663            ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
2664                self.evaluate_number(value)
2665            }
2666            ast::Expr::NoneLiteral(_) => Some(Value::None),
2667            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
2668            ast::Expr::List(ast::ExprList { elts, .. }) => {
2669                let mut values = Vec::new();
2670                for elt in elts {
2671                    values.push(self.evaluate_expr(elt, depth + 1)?);
2672                }
2673                Some(Value::List(values))
2674            }
2675            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
2676                let mut values = Vec::new();
2677                for elt in elts {
2678                    values.push(self.evaluate_expr(elt, depth + 1)?);
2679                }
2680                Some(Value::Tuple(values))
2681            }
2682            ast::Expr::Dict(ast::ExprDict { items, .. }) => {
2683                let mut dict = HashMap::new();
2684                for item in items {
2685                    let key_expr = item.key.as_ref()?;
2686                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
2687                    let key = value_to_string(&key_value)?;
2688                    let value = self.evaluate_expr(&item.value, depth + 1)?;
2689                    dict.insert(key, value);
2690                }
2691                Some(Value::Dict(dict))
2692            }
2693            ast::Expr::Call(ast::ExprCall {
2694                func, arguments, ..
2695            }) => {
2696                let args = arguments.args.as_ref();
2697                let keywords = arguments.keywords.as_ref();
2698                if keywords.is_empty()
2699                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
2700                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
2701                {
2702                    return self.evaluate_ordered_dict(args, depth + 1);
2703                }
2704
2705                if !args.is_empty() {
2706                    return None;
2707                }
2708
2709                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
2710                    && id == "dict"
2711                {
2712                    let mut dict = HashMap::new();
2713                    for keyword in keywords {
2714                        let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
2715                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
2716                        dict.insert(key.to_string(), value);
2717                    }
2718                    return Some(Value::Dict(dict));
2719                }
2720
2721                None
2722            }
2723            _ => None,
2724        }
2725    }
2726
2727    fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
2728        match number {
2729            ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
2730            ast::Number::Float(value) => Some(Value::Number(*value)),
2731            ast::Number::Complex { .. } => None,
2732        }
2733    }
2734
2735    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
2736        if args.len() != 1 {
2737            return None;
2738        }
2739
2740        let items = match self.evaluate_expr(&args[0], depth)? {
2741            Value::List(items) | Value::Tuple(items) => items,
2742            _ => return None,
2743        };
2744
2745        let mut dict = HashMap::new();
2746        for item in items {
2747            let Value::Tuple(values) = item else {
2748                return None;
2749            };
2750            if values.len() != 2 {
2751                return None;
2752            }
2753            let key = value_to_string(&values[0])?;
2754            dict.insert(key, values[1].clone());
2755        }
2756
2757        Some(Value::Dict(dict))
2758    }
2759}
2760
/// Import-derived names used to recognise `setup(...)` calls in a setup.py module.
///
/// Populated by `collect_setup_aliases` and consulted by `is_setup_call` /
/// `resolve_module_alias`.
#[derive(Default)]
struct SetupAliases {
    /// Bare names that refer to the `setup` function: always `"setup"`, plus any
    /// name bound by `from <setup module> import setup [as alias]`.
    setup_names: HashSet<String>,
    /// Maps the locally bound module name (the `as` alias, or the module name
    /// itself when there is no alias) to the imported setup module name.
    module_aliases: HashMap<String, String>,
}
2766
2767fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
2768    extract_from_setup_py(path).into_iter().collect()
2769}
2770
2771fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
2772    let content = match read_file_to_string(path) {
2773        Ok(content) => content,
2774        Err(e) => {
2775            warn!("Failed to read setup.py at {:?}: {}", path, e);
2776            return Some(default_package_data(path));
2777        }
2778    };
2779
2780    if content.len() > MAX_SETUP_PY_BYTES {
2781        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
2782        let package_data = extract_from_setup_py_regex(&content);
2783        return should_emit_setup_py_package(&package_data).then_some(package_data);
2784    }
2785
2786    let mut package_data = match extract_from_setup_py_ast(&content) {
2787        Ok(Some(data)) => data,
2788        Ok(None) => return Some(default_package_data(path)),
2789        Err(e) => {
2790            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
2791            extract_from_setup_py_regex(&content)
2792        }
2793    };
2794
2795    if package_data.name.is_none() {
2796        package_data.name = extract_setup_value(&content, "name");
2797    }
2798
2799    if package_data.version.is_none() {
2800        package_data.version = extract_setup_value(&content, "version");
2801    }
2802
2803    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
2804
2805    if package_data.purl.is_none() {
2806        package_data.purl = build_setup_py_purl(
2807            package_data.name.as_deref(),
2808            package_data.version.as_deref(),
2809        );
2810    }
2811
2812    if should_emit_setup_py_package(&package_data) {
2813        Some(package_data)
2814    } else {
2815        Some(default_package_data(path))
2816    }
2817}
2818
2819fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
2820    package_data.name.is_some()
2821        || package_data.version.is_some()
2822        || package_data.purl.is_some()
2823        || !package_data.dependencies.is_empty()
2824        || package_data.extracted_license_statement.is_some()
2825        || !package_data.license_detections.is_empty()
2826        || !package_data.parties.is_empty()
2827        || package_data.description.is_some()
2828        || package_data.homepage_url.is_some()
2829        || package_data.bug_tracking_url.is_some()
2830        || package_data.code_view_url.is_some()
2831        || package_data.vcs_url.is_some()
2832}
2833
2834fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
2835    if package_data.version.is_some()
2836        && package_data.extracted_license_statement.is_some()
2837        && package_data
2838            .parties
2839            .iter()
2840            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
2841    {
2842        return;
2843    }
2844
2845    let Some(root) = path.parent() else {
2846        return;
2847    };
2848
2849    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
2850
2851    if package_data.version.is_none() {
2852        package_data.version = dunder_metadata.version;
2853    }
2854
2855    if package_data.extracted_license_statement.is_none() {
2856        package_data.extracted_license_statement = dunder_metadata.license;
2857    }
2858
2859    let has_author = package_data
2860        .parties
2861        .iter()
2862        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
2863
2864    if !has_author && let Some(author) = dunder_metadata.author {
2865        package_data.parties.push(Party {
2866            r#type: Some("person".to_string()),
2867            role: Some("author".to_string()),
2868            name: Some(author),
2869            email: None,
2870            url: None,
2871            organization: None,
2872            organization_url: None,
2873            timezone: None,
2874        });
2875    }
2876}
2877
/// Values of module-level dunder assignments collected from modules imported
/// by a setup.py; each field stays `None` when no matching assignment was found.
#[derive(Default)]
struct DunderMetadata {
    /// Captured string value of `__version__`.
    version: Option<String>,
    /// Captured string value of `__author__`.
    author: Option<String>,
    /// Captured string value of `__license__`.
    license: Option<String>,
}
2884
2885fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
2886    let statements = match parse_module(content) {
2887        Ok(parsed) => parsed.into_suite(),
2888        Err(_) => return DunderMetadata::default(),
2889    };
2890
2891    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2892    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2893    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2894    let mut metadata = DunderMetadata::default();
2895
2896    for module in imported_dunder_modules(&statements) {
2897        let Some(path) = resolve_imported_module_path(root, &module) else {
2898            continue;
2899        };
2900        let Ok(module_content) = read_file_to_string(&path) else {
2901            continue;
2902        };
2903
2904        if metadata.version.is_none() {
2905            metadata.version = version_re
2906                .as_ref()
2907                .and_then(|regex| regex.captures(&module_content))
2908                .and_then(|captures| captures.get(1))
2909                .map(|match_| match_.as_str().to_string());
2910        }
2911
2912        if metadata.author.is_none() {
2913            metadata.author = author_re
2914                .as_ref()
2915                .and_then(|regex| regex.captures(&module_content))
2916                .and_then(|captures| captures.get(1))
2917                .map(|match_| match_.as_str().to_string());
2918        }
2919
2920        if metadata.license.is_none() {
2921            metadata.license = license_re
2922                .as_ref()
2923                .and_then(|regex| regex.captures(&module_content))
2924                .and_then(|captures| captures.get(1))
2925                .map(|match_| match_.as_str().to_string());
2926        }
2927
2928        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
2929            return metadata;
2930        }
2931    }
2932
2933    metadata
2934}
2935
2936fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
2937    let mut modules = Vec::new();
2938
2939    for statement in statements {
2940        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
2941            continue;
2942        };
2943        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
2944            continue;
2945        };
2946        let imports_dunder = names.iter().any(|alias| {
2947            matches!(
2948                alias.name.as_str(),
2949                "__version__" | "__author__" | "__license__"
2950            )
2951        });
2952        if imports_dunder {
2953            modules.push(module.to_string());
2954        }
2955    }
2956
2957    modules
2958}
2959
/// Resolves a dotted Python module name to a source file below `root`.
///
/// Candidates are probed in this order and the first regular file wins:
/// 1. `<root>/<module path>.py`
/// 2. `<root>/<module path>/__init__.py`
/// 3. `<root>/src/<module path>.py`
/// 4. `<root>/src/<module path>/__init__.py`
///
/// Only regular files are accepted. A directory that happens to be named like
/// a module file (e.g. `pkg.py/`) is skipped, so it cannot shadow a later,
/// readable candidate. Returns `None` when no candidate is a file.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let relative: PathBuf = module.split('.').collect();
    let module_file = relative.with_extension("py");
    let package_init = relative.join("__init__.py");

    // Flat layout first, then the conventional `src/` layout.
    for base in [root.to_path_buf(), root.join("src")] {
        for candidate in [base.join(&module_file), base.join(&package_init)] {
            if candidate.is_file() {
                return Some(candidate);
            }
        }
    }

    None
}
2971
2972/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
2973///
2974/// # Security Model
2975///
2976/// This function parses setup.py as a Python AST and evaluates only literal values
2977/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
2978/// arbitrary code execution during scanning.
2979///
2980/// # DoS Prevention
2981///
2982/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
2983/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
2984/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
2985///
2986/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
2987fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
2988    let statements = parse_module(content)
2989        .map(|parsed| parsed.into_suite())
2990        .map_err(|e| e.to_string())?;
2991    let aliases = collect_setup_aliases(&statements);
2992    let mut evaluator = LiteralEvaluator::new(HashMap::new());
2993    build_setup_py_constants(&statements, &mut evaluator);
2994
2995    let setup_call = find_setup_call(&statements, &aliases);
2996    let Some(call_expr) = setup_call else {
2997        return Ok(None);
2998    };
2999
3000    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
3001    Ok(Some(build_setup_py_package_data(&setup_values)))
3002}
3003
3004fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
3005    for stmt in statements {
3006        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
3007            if targets.len() != 1 {
3008                continue;
3009            }
3010
3011            let Some(name) = extract_assign_name(&targets[0]) else {
3012                continue;
3013            };
3014
3015            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
3016                evaluator.insert_constant(name, value);
3017            }
3018        }
3019    }
3020}
3021
3022fn extract_assign_name(target: &ast::Expr) -> Option<String> {
3023    match target {
3024        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3025        _ => None,
3026    }
3027}
3028
3029fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
3030    let mut aliases = SetupAliases::default();
3031    aliases.setup_names.insert("setup".to_string());
3032
3033    for stmt in statements {
3034        match stmt {
3035            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3036                for alias in names {
3037                    let module_name = alias.name.as_str();
3038                    if !is_setup_module(module_name) {
3039                        continue;
3040                    }
3041                    let alias_name = alias
3042                        .asname
3043                        .as_ref()
3044                        .map(|name| name.as_str())
3045                        .unwrap_or(module_name);
3046                    aliases
3047                        .module_aliases
3048                        .insert(alias_name.to_string(), module_name.to_string());
3049                }
3050            }
3051            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3052                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3053                    continue;
3054                };
3055                if !is_setup_module(module_name) {
3056                    continue;
3057                }
3058                for alias in names {
3059                    if alias.name.as_str() != "setup" {
3060                        continue;
3061                    }
3062                    let alias_name = alias
3063                        .asname
3064                        .as_ref()
3065                        .map(|name| name.as_str())
3066                        .unwrap_or("setup");
3067                    aliases.setup_names.insert(alias_name.to_string());
3068                }
3069            }
3070            _ => {}
3071        }
3072    }
3073
3074    aliases
3075}
3076
/// Returns `true` when `module_name` is exactly one of the modules recognised
/// as providing `setup`: `setuptools`, `distutils` or `distutils.core`.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
3080
3081fn find_setup_call<'a>(
3082    statements: &'a [ast::Stmt],
3083    aliases: &'a SetupAliases,
3084) -> Option<&'a ast::Expr> {
3085    let mut finder = SetupCallFinder {
3086        aliases,
3087        called_function_names: collect_top_level_called_function_names(statements),
3088        nodes_visited: 0,
3089    };
3090    finder.find_in_statements(statements)
3091}
3092
3093fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3094    let mut called = HashSet::new();
3095    collect_called_function_names_in_statements(statements, &mut called);
3096    called
3097}
3098
3099fn collect_called_function_names_in_statements(
3100    statements: &[ast::Stmt],
3101    called: &mut HashSet<String>,
3102) {
3103    for stmt in statements {
3104        match stmt {
3105            ast::Stmt::Expr(ast::StmtExpr { value, .. })
3106            | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3107                collect_called_function_names_in_expr(value.as_ref(), called);
3108            }
3109            ast::Stmt::If(ast::StmtIf {
3110                body,
3111                elif_else_clauses,
3112                ..
3113            }) => {
3114                collect_called_function_names_in_statements(body, called);
3115                for clause in elif_else_clauses {
3116                    collect_called_function_names_in_statements(&clause.body, called);
3117                }
3118            }
3119            ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3120            | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3121                collect_called_function_names_in_statements(body, called);
3122                collect_called_function_names_in_statements(orelse, called);
3123            }
3124            ast::Stmt::With(ast::StmtWith { body, .. }) => {
3125                collect_called_function_names_in_statements(body, called);
3126            }
3127            ast::Stmt::Try(ast::StmtTry {
3128                body,
3129                orelse,
3130                finalbody,
3131                handlers,
3132                ..
3133            }) => {
3134                collect_called_function_names_in_statements(body, called);
3135                collect_called_function_names_in_statements(orelse, called);
3136                collect_called_function_names_in_statements(finalbody, called);
3137                for handler in handlers {
3138                    let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3139                        body,
3140                        ..
3141                    }) = handler;
3142                    collect_called_function_names_in_statements(body, called);
3143                }
3144            }
3145            _ => {}
3146        }
3147    }
3148}
3149
3150fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3151    if let ast::Expr::Call(ast::ExprCall {
3152        func, arguments, ..
3153    }) = expr
3154    {
3155        if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3156            called.insert(id.as_str().to_string());
3157        }
3158
3159        for arg in arguments.args.iter() {
3160            collect_called_function_names_in_expr(arg, called);
3161        }
3162        for keyword in arguments.keywords.iter() {
3163            collect_called_function_names_in_expr(&keyword.value, called);
3164        }
3165    }
3166}
3167
/// Traversal state used to locate a `setup(...)` call within a statement tree.
struct SetupCallFinder<'a> {
    /// Names and module aliases under which `setup` may be called.
    aliases: &'a SetupAliases,
    /// Names of functions whose bodies are searched; a function definition is
    /// entered only when its name is in this set.
    called_function_names: HashSet<String>,
    /// Count of statements and expressions visited so far; the search stops
    /// once it reaches `MAX_SETUP_PY_AST_NODES`.
    nodes_visited: usize,
}
3173
3174impl<'a> SetupCallFinder<'a> {
3175    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3176        for stmt in statements {
3177            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3178                return None;
3179            }
3180            self.nodes_visited += 1;
3181
3182            let found = match stmt {
3183                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3184                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3185                ast::Stmt::If(ast::StmtIf {
3186                    body,
3187                    elif_else_clauses,
3188                    ..
3189                }) => self.find_in_statements(body).or_else(|| {
3190                    for clause in elif_else_clauses {
3191                        if let Some(found) = self.find_in_statements(&clause.body) {
3192                            return Some(found);
3193                        }
3194                    }
3195                    None
3196                }),
3197                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3198                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3199                    .find_in_statements(body)
3200                    .or_else(|| self.find_in_statements(orelse)),
3201                ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3202                    .called_function_names
3203                    .contains(name.as_str())
3204                    .then(|| self.find_in_statements(body))
3205                    .flatten(),
3206                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3207                ast::Stmt::Try(ast::StmtTry {
3208                    body,
3209                    orelse,
3210                    finalbody,
3211                    handlers,
3212                    ..
3213                }) => self
3214                    .find_in_statements(body)
3215                    .or_else(|| self.find_in_statements(orelse))
3216                    .or_else(|| self.find_in_statements(finalbody))
3217                    .or_else(|| {
3218                        for handler in handlers {
3219                            let ast::ExceptHandler::ExceptHandler(
3220                                ast::ExceptHandlerExceptHandler { body, .. },
3221                            ) = handler;
3222                            if let Some(found) = self.find_in_statements(body) {
3223                                return Some(found);
3224                            }
3225                        }
3226                        None
3227                    }),
3228                _ => None,
3229            };
3230
3231            if found.is_some() {
3232                return found;
3233            }
3234        }
3235
3236        None
3237    }
3238
3239    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3240        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3241            return None;
3242        }
3243        self.nodes_visited += 1;
3244
3245        match expr {
3246            ast::Expr::Call(ast::ExprCall { func, .. })
3247                if is_setup_call(func.as_ref(), self.aliases) =>
3248            {
3249                Some(expr)
3250            }
3251            _ => None,
3252        }
3253    }
3254}
3255
3256fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3257    let Some(dotted) = dotted_name(func, 0) else {
3258        return false;
3259    };
3260
3261    if aliases.setup_names.contains(&dotted) {
3262        return true;
3263    }
3264
3265    let Some(module) = dotted.strip_suffix(".setup") else {
3266        return false;
3267    };
3268
3269    let resolved = resolve_module_alias(module, aliases);
3270    is_setup_module(&resolved)
3271}
3272
3273fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3274    if depth >= MAX_SETUP_PY_AST_DEPTH {
3275        return None;
3276    }
3277
3278    match expr {
3279        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3280        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3281            let base = dotted_name(value.as_ref(), depth + 1)?;
3282            Some(format!("{}.{}", base, attr.as_str()))
3283        }
3284        _ => None,
3285    }
3286}
3287
3288fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3289    if let Some(mapped) = aliases.module_aliases.get(module) {
3290        return mapped.clone();
3291    }
3292
3293    let Some((base, rest)) = module.split_once('.') else {
3294        return module.to_string();
3295    };
3296
3297    if let Some(mapped) = aliases.module_aliases.get(base) {
3298        return format!("{}.{}", mapped, rest);
3299    }
3300
3301    module.to_string()
3302}
3303
3304fn extract_setup_keywords(
3305    call_expr: &ast::Expr,
3306    evaluator: &mut LiteralEvaluator,
3307) -> HashMap<String, Value> {
3308    let mut values = HashMap::new();
3309    let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3310        return values;
3311    };
3312
3313    for keyword in arguments.keywords.iter() {
3314        if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3315            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3316                values.insert(arg.to_string(), value);
3317            }
3318        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3319            for (key, value) in dict {
3320                values.insert(key, value);
3321            }
3322        }
3323    }
3324
3325    values
3326}
3327
3328fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3329    let name = get_value_string(values, "name");
3330    let version = get_value_string(values, "version");
3331    let description =
3332        get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
3333    let homepage_url =
3334        get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
3335    let author = get_value_string(values, "author");
3336    let author_email = get_value_string(values, "author_email");
3337    let maintainer = get_value_string(values, "maintainer");
3338    let maintainer_email = get_value_string(values, "maintainer_email");
3339    let license = get_value_string(values, "license");
3340    let classifiers = values
3341        .get("classifiers")
3342        .and_then(value_to_string_list)
3343        .unwrap_or_default();
3344
3345    let mut parties = Vec::new();
3346    if author.is_some() || author_email.is_some() {
3347        parties.push(Party {
3348            r#type: Some("person".to_string()),
3349            role: Some("author".to_string()),
3350            name: author,
3351            email: author_email,
3352            url: None,
3353            organization: None,
3354            organization_url: None,
3355            timezone: None,
3356        });
3357    }
3358
3359    if maintainer.is_some() || maintainer_email.is_some() {
3360        parties.push(Party {
3361            r#type: Some("person".to_string()),
3362            role: Some("maintainer".to_string()),
3363            name: maintainer,
3364            email: maintainer_email,
3365            url: None,
3366            organization: None,
3367            organization_url: None,
3368            timezone: None,
3369        });
3370    }
3371
3372    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3373        normalize_spdx_declared_license(license.as_deref());
3374    let extracted_license_statement = license.clone();
3375
3376    let dependencies = build_setup_py_dependencies(values);
3377    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3378    let mut homepage_from_project_urls = None;
3379    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3380    let mut extra_data = HashMap::new();
3381
3382    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3383        apply_project_url_mappings(
3384            &parsed_project_urls,
3385            &mut homepage_from_project_urls,
3386            &mut bug_tracking_url,
3387            &mut code_view_url,
3388            &mut vcs_url,
3389            &mut extra_data,
3390        );
3391    }
3392
3393    let extra_data = if extra_data.is_empty() {
3394        None
3395    } else {
3396        Some(extra_data)
3397    };
3398
3399    PackageData {
3400        package_type: Some(PythonParser::PACKAGE_TYPE),
3401        namespace: None,
3402        name,
3403        version,
3404        qualifiers: None,
3405        subpath: None,
3406        primary_language: Some("Python".to_string()),
3407        description,
3408        release_date: None,
3409        parties,
3410        keywords: Vec::new(),
3411        homepage_url: homepage_url.or(homepage_from_project_urls),
3412        download_url: None,
3413        size: None,
3414        sha1: None,
3415        md5: None,
3416        sha256: None,
3417        sha512: None,
3418        bug_tracking_url,
3419        code_view_url,
3420        vcs_url,
3421        copyright: None,
3422        holder: None,
3423        declared_license_expression,
3424        declared_license_expression_spdx,
3425        license_detections,
3426        other_license_expression: None,
3427        other_license_expression_spdx: None,
3428        other_license_detections: Vec::new(),
3429        extracted_license_statement,
3430        notice_text: None,
3431        source_packages: Vec::new(),
3432        file_references: Vec::new(),
3433        is_private: has_private_classifier(&classifiers),
3434        is_virtual: false,
3435        extra_data,
3436        dependencies,
3437        repository_homepage_url: None,
3438        repository_download_url: None,
3439        api_data_url: None,
3440        datasource_id: Some(DatasourceId::PypiSetupPy),
3441        purl,
3442    }
3443}
3444
3445fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3446    let mut dependencies = Vec::new();
3447
3448    if let Some(reqs) = values
3449        .get("install_requires")
3450        .and_then(value_to_string_list)
3451    {
3452        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3453    }
3454
3455    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3456        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3457    }
3458
3459    if let Some(Value::Dict(extras)) = values.get("extras_require") {
3460        let mut extra_items: Vec<_> = extras.iter().collect();
3461        extra_items.sort_by_key(|(name, _)| *name);
3462        for (extra_name, extra_value) in extra_items {
3463            if let Some(reqs) = value_to_string_list(extra_value) {
3464                dependencies.extend(build_setup_py_dependency_list(
3465                    reqs.as_slice(),
3466                    extra_name,
3467                    true,
3468                ));
3469            }
3470        }
3471    }
3472
3473    dependencies
3474}
3475
3476fn build_setup_py_dependency_list(
3477    reqs: &[String],
3478    scope: &str,
3479    is_optional: bool,
3480) -> Vec<Dependency> {
3481    reqs.iter()
3482        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3483        .collect()
3484}
3485
3486fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3487    values.get(key).and_then(value_to_string)
3488}
3489
3490fn value_to_string(value: &Value) -> Option<String> {
3491    match value {
3492        Value::String(value) => Some(value.clone()),
3493        Value::Number(value) => Some(value.to_string()),
3494        Value::Bool(value) => Some(value.to_string()),
3495        _ => None,
3496    }
3497}
3498
3499fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3500    match value {
3501        Value::String(value) => Some(vec![value.clone()]),
3502        Value::List(values) | Value::Tuple(values) => {
3503            let mut items = Vec::new();
3504            for item in values {
3505                items.push(value_to_string(item)?);
3506            }
3507            Some(items)
3508        }
3509        _ => None,
3510    }
3511}
3512
3513fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3514    let Value::Dict(dict) = value else {
3515        return None;
3516    };
3517
3518    let mut pairs: Vec<(String, String)> = dict
3519        .iter()
3520        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3521        .collect::<Option<Vec<_>>>()?;
3522    pairs.sort_by(|left, right| left.0.cmp(&right.0));
3523    Some(pairs)
3524}
3525
3526fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3527    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3528    extract_requires_dist_dependencies(&requires_dist)
3529}
3530
3531pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3532    requires_dist
3533        .iter()
3534        .filter_map(|entry| build_rfc822_dependency(entry))
3535        .collect()
3536}
3537
3538fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3539    build_python_dependency(entry, "install", false, None)
3540}
3541
3542fn build_python_dependency(
3543    entry: &str,
3544    default_scope: &str,
3545    default_optional: bool,
3546    marker_override: Option<&str>,
3547) -> Option<Dependency> {
3548    let (requirement_part, marker_part) = entry
3549        .split_once(';')
3550        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3551        .unwrap_or((entry.trim(), None));
3552
3553    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3554    let requirement = normalize_rfc822_requirement(requirement_part);
3555    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3556        marker_part.or(marker_override),
3557        default_scope,
3558        default_optional,
3559    );
3560    let mut purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
3561
3562    let is_pinned = requirement
3563        .as_deref()
3564        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3565    if is_pinned
3566        && let Some(version) = requirement
3567            .as_deref()
3568            .map(|req| req.trim_start_matches('='))
3569    {
3570        purl.with_version(version).ok()?;
3571    }
3572
3573    let mut extra_data = HashMap::new();
3574    extra_data.extend(marker_data);
3575    if let Some(marker) = marker {
3576        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3577    }
3578
3579    Some(Dependency {
3580        purl: Some(purl.to_string()),
3581        extracted_requirement: requirement,
3582        scope: Some(scope),
3583        is_runtime: Some(true),
3584        is_optional: Some(is_optional),
3585        is_pinned: Some(is_pinned),
3586        is_direct: Some(true),
3587        resolved_package: None,
3588        extra_data: if extra_data.is_empty() {
3589            None
3590        } else {
3591            Some(extra_data)
3592        },
3593    })
3594}
3595
3596fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3597    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3598    let trimmed = requirement_part.trim();
3599    let mut remainder = trimmed[name.len()..].trim();
3600
3601    if let Some(stripped) = remainder.strip_prefix('[')
3602        && let Some(end_idx) = stripped.find(']')
3603    {
3604        remainder = stripped[end_idx + 1..].trim();
3605    }
3606
3607    let remainder = remainder
3608        .strip_prefix('(')
3609        .and_then(|value| value.strip_suffix(')'))
3610        .unwrap_or(remainder)
3611        .trim();
3612
3613    if remainder.is_empty() {
3614        return None;
3615    }
3616
3617    let mut specifiers: Vec<String> = remainder
3618        .split(',')
3619        .map(|specifier| specifier.trim().replace(' ', ""))
3620        .filter(|specifier| !specifier.is_empty())
3621        .collect();
3622    specifiers.sort();
3623    Some(specifiers.join(","))
3624}
3625
3626fn parse_rfc822_marker(
3627    marker_part: Option<&str>,
3628    default_scope: &str,
3629    default_optional: bool,
3630) -> (
3631    String,
3632    bool,
3633    Option<String>,
3634    HashMap<String, serde_json::Value>,
3635) {
3636    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
3637        return (
3638            default_scope.to_string(),
3639            default_optional,
3640            None,
3641            HashMap::new(),
3642        );
3643    };
3644
3645    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
3646        .expect("extra marker regex should compile");
3647    let mut extra_data = HashMap::new();
3648
3649    if let Some(python_version) = extract_marker_field(marker, "python_version") {
3650        extra_data.insert(
3651            "python_version".to_string(),
3652            serde_json::Value::String(python_version),
3653        );
3654    }
3655    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
3656        extra_data.insert(
3657            "sys_platform".to_string(),
3658            serde_json::Value::String(sys_platform),
3659        );
3660    }
3661
3662    if let Some(captures) = extra_re.captures(marker)
3663        && let Some(scope) = captures.get(1)
3664    {
3665        return (
3666            scope.as_str().to_string(),
3667            true,
3668            Some(marker.trim().to_string()),
3669            extra_data,
3670        );
3671    }
3672
3673    (
3674        default_scope.to_string(),
3675        default_optional,
3676        Some(marker.trim().to_string()),
3677        extra_data,
3678    )
3679}
3680
3681fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
3682    let re = Regex::new(&format!(
3683        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
3684        field
3685    ))
3686    .ok()?;
3687    let captures = re.captures(marker)?;
3688    let operator = captures.get(1)?.as_str();
3689    let value = captures.get(2)?.as_str();
3690    Some(format!("{} {}", operator, value))
3691}
3692
3693fn parse_requires_txt(content: &str) -> Vec<Dependency> {
3694    let mut dependencies = Vec::new();
3695    let mut current_scope = "install".to_string();
3696    let mut current_optional = false;
3697    let mut current_marker: Option<String> = None;
3698
3699    for line in content.lines() {
3700        let trimmed = line.trim();
3701        if trimmed.is_empty() || trimmed.starts_with('#') {
3702            continue;
3703        }
3704
3705        if trimmed.starts_with('[') && trimmed.ends_with(']') {
3706            let inner = &trimmed[1..trimmed.len() - 1];
3707            if let Some(rest) = inner.strip_prefix(':') {
3708                current_scope = "install".to_string();
3709                current_optional = false;
3710                current_marker = Some(rest.trim().to_string());
3711            } else if let Some((scope, marker)) = inner.split_once(':') {
3712                current_scope = scope.trim().to_string();
3713                current_optional = true;
3714                current_marker = Some(marker.trim().to_string());
3715            } else {
3716                current_scope = inner.trim().to_string();
3717                current_optional = true;
3718                current_marker = None;
3719            }
3720            continue;
3721        }
3722
3723        if let Some(dependency) = build_python_dependency(
3724            trimmed,
3725            &current_scope,
3726            current_optional,
3727            current_marker.as_deref(),
3728        ) {
3729            dependencies.push(dependency);
3730        }
3731    }
3732
3733    dependencies
3734}
3735
/// Reports whether any classifier equals `Private :: Do Not Upload`,
/// compared ASCII case-insensitively.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
3741
3742fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
3743    let name = name?;
3744    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
3745    if let Some(version) = version {
3746        package_url.with_version(version).ok()?;
3747    }
3748    Some(package_url.to_string())
3749}
3750
3751fn extract_from_setup_py_regex(content: &str) -> PackageData {
3752    let name = extract_setup_value(content, "name");
3753    let version = extract_setup_value(content, "version");
3754    let license_expression = extract_setup_value(content, "license");
3755
3756    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3757        normalize_spdx_declared_license(license_expression.as_deref());
3758    let extracted_license_statement = license_expression.clone();
3759
3760    let dependencies = extract_setup_py_dependencies(content);
3761    let homepage_url = extract_setup_value(content, "url");
3762    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3763
3764    PackageData {
3765        package_type: Some(PythonParser::PACKAGE_TYPE),
3766        namespace: None,
3767        name,
3768        version,
3769        qualifiers: None,
3770        subpath: None,
3771        primary_language: Some("Python".to_string()),
3772        description: None,
3773        release_date: None,
3774        parties: Vec::new(),
3775        keywords: Vec::new(),
3776        homepage_url,
3777        download_url: None,
3778        size: None,
3779        sha1: None,
3780        md5: None,
3781        sha256: None,
3782        sha512: None,
3783        bug_tracking_url: None,
3784        code_view_url: None,
3785        vcs_url: None,
3786        copyright: None,
3787        holder: None,
3788        declared_license_expression,
3789        declared_license_expression_spdx,
3790        license_detections,
3791        other_license_expression: None,
3792        other_license_expression_spdx: None,
3793        other_license_detections: Vec::new(),
3794        extracted_license_statement,
3795        notice_text: None,
3796        source_packages: Vec::new(),
3797        file_references: Vec::new(),
3798        is_private: false,
3799        is_virtual: false,
3800        extra_data: None,
3801        dependencies,
3802        repository_homepage_url: None,
3803        repository_download_url: None,
3804        api_data_url: None,
3805        datasource_id: Some(DatasourceId::PypiSetupPy),
3806        purl,
3807    }
3808}
3809
3810fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
3811    crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
3812}
3813
3814fn extract_from_pypi_json(path: &Path) -> PackageData {
3815    let default = PackageData {
3816        package_type: Some(PythonParser::PACKAGE_TYPE),
3817        datasource_id: Some(DatasourceId::PypiJson),
3818        ..Default::default()
3819    };
3820
3821    let content = match read_file_to_string(path) {
3822        Ok(content) => content,
3823        Err(error) => {
3824            warn!("Failed to read pypi.json at {:?}: {}", path, error);
3825            return default;
3826        }
3827    };
3828
3829    let root: serde_json::Value = match serde_json::from_str(&content) {
3830        Ok(value) => value,
3831        Err(error) => {
3832            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
3833            return default;
3834        }
3835    };
3836
3837    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
3838        warn!("No info object found in pypi.json at {:?}", path);
3839        return default;
3840    };
3841
3842    let name = info
3843        .get("name")
3844        .and_then(|value| value.as_str())
3845        .map(ToOwned::to_owned);
3846    let version = info
3847        .get("version")
3848        .and_then(|value| value.as_str())
3849        .map(ToOwned::to_owned);
3850    let summary = info
3851        .get("summary")
3852        .and_then(|value| value.as_str())
3853        .map(ToOwned::to_owned);
3854    let description = info
3855        .get("description")
3856        .and_then(|value| value.as_str())
3857        .filter(|value| !value.trim().is_empty())
3858        .map(ToOwned::to_owned)
3859        .or(summary);
3860    let mut homepage_url = info
3861        .get("home_page")
3862        .and_then(|value| value.as_str())
3863        .map(ToOwned::to_owned);
3864    let author = info
3865        .get("author")
3866        .and_then(|value| value.as_str())
3867        .filter(|value| !value.trim().is_empty())
3868        .map(ToOwned::to_owned);
3869    let author_email = info
3870        .get("author_email")
3871        .and_then(|value| value.as_str())
3872        .filter(|value| !value.trim().is_empty())
3873        .map(ToOwned::to_owned);
3874    let license = info
3875        .get("license")
3876        .and_then(|value| value.as_str())
3877        .filter(|value| !value.trim().is_empty())
3878        .map(ToOwned::to_owned);
3879    let keywords = parse_setup_cfg_keywords(
3880        info.get("keywords")
3881            .and_then(|value| value.as_str())
3882            .map(ToOwned::to_owned),
3883    );
3884    let classifiers = info
3885        .get("classifiers")
3886        .and_then(|value| value.as_array())
3887        .map(|values| {
3888            values
3889                .iter()
3890                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
3891                .collect::<Vec<_>>()
3892        })
3893        .unwrap_or_default();
3894
3895    let mut parties = Vec::new();
3896    if author.is_some() || author_email.is_some() {
3897        parties.push(Party {
3898            r#type: Some("person".to_string()),
3899            role: Some("author".to_string()),
3900            name: author,
3901            email: author_email,
3902            url: None,
3903            organization: None,
3904            organization_url: None,
3905            timezone: None,
3906        });
3907    }
3908
3909    let mut bug_tracking_url = None;
3910    let mut code_view_url = None;
3911    let mut vcs_url = None;
3912    let mut extra_data = HashMap::new();
3913
3914    let parsed_project_urls = info
3915        .get("project_urls")
3916        .and_then(|value| value.as_object())
3917        .map(|map| {
3918            let mut pairs: Vec<(String, String)> = map
3919                .iter()
3920                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
3921                .collect();
3922            pairs.sort_by(|left, right| left.0.cmp(&right.0));
3923            pairs
3924        })
3925        .unwrap_or_default();
3926
3927    apply_project_url_mappings(
3928        &parsed_project_urls,
3929        &mut homepage_url,
3930        &mut bug_tracking_url,
3931        &mut code_view_url,
3932        &mut vcs_url,
3933        &mut extra_data,
3934    );
3935
3936    let (download_url, size, sha256) = root
3937        .get("urls")
3938        .and_then(|value| value.as_array())
3939        .map(|urls| select_pypi_json_artifact(urls))
3940        .unwrap_or((None, None, None));
3941
3942    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3943        normalize_spdx_declared_license(license.as_deref());
3944    let dependencies = info
3945        .get("requires_dist")
3946        .and_then(|value| value.as_array())
3947        .map(|entries| {
3948            entries
3949                .iter()
3950                .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
3951                .collect::<Vec<_>>()
3952        })
3953        .map(|entries| extract_requires_dist_dependencies(&entries))
3954        .unwrap_or_default();
3955
3956    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
3957        build_pypi_urls(name.as_deref(), version.as_deref());
3958
3959    PackageData {
3960        package_type: Some(PythonParser::PACKAGE_TYPE),
3961        namespace: None,
3962        name,
3963        version,
3964        qualifiers: None,
3965        subpath: None,
3966        primary_language: None,
3967        description,
3968        release_date: None,
3969        parties,
3970        keywords,
3971        homepage_url: homepage_url.or(repository_homepage_url.clone()),
3972        download_url,
3973        size,
3974        sha1: None,
3975        md5: None,
3976        sha256,
3977        sha512: None,
3978        bug_tracking_url,
3979        code_view_url,
3980        vcs_url,
3981        copyright: None,
3982        holder: None,
3983        declared_license_expression,
3984        declared_license_expression_spdx,
3985        license_detections,
3986        other_license_expression: None,
3987        other_license_expression_spdx: None,
3988        other_license_detections: Vec::new(),
3989        extracted_license_statement: license,
3990        notice_text: None,
3991        source_packages: Vec::new(),
3992        file_references: Vec::new(),
3993        is_private: has_private_classifier(&classifiers),
3994        is_virtual: false,
3995        extra_data: if extra_data.is_empty() {
3996            None
3997        } else {
3998            Some(extra_data)
3999        },
4000        dependencies,
4001        repository_homepage_url,
4002        repository_download_url,
4003        api_data_url,
4004        datasource_id: Some(DatasourceId::PypiJson),
4005        purl,
4006    }
4007}
4008
4009fn select_pypi_json_artifact(
4010    urls: &[serde_json::Value],
4011) -> (Option<String>, Option<u64>, Option<String>) {
4012    let selected = urls
4013        .iter()
4014        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
4015        .or_else(|| urls.first());
4016
4017    let Some(entry) = selected else {
4018        return (None, None, None);
4019    };
4020
4021    let download_url = entry
4022        .get("url")
4023        .and_then(|value| value.as_str())
4024        .map(ToOwned::to_owned);
4025    let size = entry.get("size").and_then(|value| value.as_u64());
4026    let sha256 = entry
4027        .get("digests")
4028        .and_then(|value| value.as_object())
4029        .and_then(|digests| digests.get("sha256"))
4030        .and_then(|value| value.as_str())
4031        .map(ToOwned::to_owned);
4032
4033    (download_url, size, sha256)
4034}
4035
4036fn extract_from_pip_inspect(path: &Path) -> PackageData {
4037    let content = match read_file_to_string(path) {
4038        Ok(content) => content,
4039        Err(e) => {
4040            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4041            return default_package_data(path);
4042        }
4043    };
4044
4045    let root: serde_json::Value = match serde_json::from_str(&content) {
4046        Ok(value) => value,
4047        Err(e) => {
4048            warn!(
4049                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4050                path, e
4051            );
4052            return default_package_data(path);
4053        }
4054    };
4055
4056    let installed = match root.get("installed").and_then(|v| v.as_array()) {
4057        Some(arr) => arr,
4058        None => {
4059            warn!(
4060                "No 'installed' array found in pip-inspect.deplock at {:?}",
4061                path
4062            );
4063            return default_package_data(path);
4064        }
4065    };
4066
4067    let pip_version = root
4068        .get("pip_version")
4069        .and_then(|v| v.as_str())
4070        .map(String::from);
4071    let inspect_version = root
4072        .get("version")
4073        .and_then(|v| v.as_str())
4074        .map(String::from);
4075
4076    let mut main_package: Option<PackageData> = None;
4077    let mut dependencies: Vec<Dependency> = Vec::new();
4078
4079    for package_entry in installed {
4080        let metadata = match package_entry.get("metadata") {
4081            Some(m) => m,
4082            None => continue,
4083        };
4084
4085        let is_requested = package_entry
4086            .get("requested")
4087            .and_then(|v| v.as_bool())
4088            .unwrap_or(false);
4089        let has_direct_url = package_entry.get("direct_url").is_some();
4090
4091        let name = metadata
4092            .get("name")
4093            .and_then(|v| v.as_str())
4094            .map(String::from);
4095        let version = metadata
4096            .get("version")
4097            .and_then(|v| v.as_str())
4098            .map(String::from);
4099        let summary = metadata
4100            .get("summary")
4101            .and_then(|v| v.as_str())
4102            .map(String::from);
4103        let home_page = metadata
4104            .get("home_page")
4105            .and_then(|v| v.as_str())
4106            .map(String::from);
4107        let author = metadata
4108            .get("author")
4109            .and_then(|v| v.as_str())
4110            .map(String::from);
4111        let author_email = metadata
4112            .get("author_email")
4113            .and_then(|v| v.as_str())
4114            .map(String::from);
4115        let license = metadata
4116            .get("license")
4117            .and_then(|v| v.as_str())
4118            .map(String::from);
4119        let description = metadata
4120            .get("description")
4121            .and_then(|v| v.as_str())
4122            .map(String::from);
4123        let keywords = metadata
4124            .get("keywords")
4125            .and_then(|v| v.as_array())
4126            .map(|arr| {
4127                arr.iter()
4128                    .filter_map(|k| k.as_str().map(String::from))
4129                    .collect::<Vec<_>>()
4130            })
4131            .unwrap_or_default();
4132
4133        let mut parties = Vec::new();
4134        if author.is_some() || author_email.is_some() {
4135            parties.push(Party {
4136                r#type: Some("person".to_string()),
4137                role: Some("author".to_string()),
4138                name: author,
4139                email: author_email,
4140                url: None,
4141                organization: None,
4142                organization_url: None,
4143                timezone: None,
4144            });
4145        }
4146
4147        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4148            normalize_spdx_declared_license(license.as_deref());
4149        let extracted_license_statement = license.clone();
4150        let requires_dist = metadata
4151            .get("requires_dist")
4152            .and_then(|v| v.as_array())
4153            .map(|entries| {
4154                entries
4155                    .iter()
4156                    .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4157                    .collect::<Vec<_>>()
4158            })
4159            .unwrap_or_default();
4160        let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4161
4162        let purl = name.as_ref().and_then(|n| {
4163            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4164            if let Some(v) = &version {
4165                package_url.with_version(v).ok()?;
4166            }
4167            Some(package_url.to_string())
4168        });
4169
4170        if is_requested && has_direct_url {
4171            let mut extra_data = HashMap::new();
4172            if let Some(pv) = &pip_version {
4173                extra_data.insert(
4174                    "pip_version".to_string(),
4175                    serde_json::Value::String(pv.clone()),
4176                );
4177            }
4178            if let Some(iv) = &inspect_version {
4179                extra_data.insert(
4180                    "inspect_version".to_string(),
4181                    serde_json::Value::String(iv.clone()),
4182                );
4183            }
4184
4185            main_package = Some(PackageData {
4186                package_type: Some(PythonParser::PACKAGE_TYPE),
4187                namespace: None,
4188                name,
4189                version,
4190                qualifiers: None,
4191                subpath: None,
4192                primary_language: Some("Python".to_string()),
4193                description: description.or(summary),
4194                release_date: None,
4195                parties,
4196                keywords,
4197                homepage_url: home_page,
4198                download_url: None,
4199                size: None,
4200                sha1: None,
4201                md5: None,
4202                sha256: None,
4203                sha512: None,
4204                bug_tracking_url: None,
4205                code_view_url: None,
4206                vcs_url: None,
4207                copyright: None,
4208                holder: None,
4209                declared_license_expression,
4210                declared_license_expression_spdx,
4211                license_detections,
4212                other_license_expression: None,
4213                other_license_expression_spdx: None,
4214                other_license_detections: Vec::new(),
4215                extracted_license_statement,
4216                notice_text: None,
4217                source_packages: Vec::new(),
4218                file_references: Vec::new(),
4219                is_private: false,
4220                is_virtual: true,
4221                extra_data: if extra_data.is_empty() {
4222                    None
4223                } else {
4224                    Some(extra_data)
4225                },
4226                dependencies: parsed_dependencies,
4227                repository_homepage_url: None,
4228                repository_download_url: None,
4229                api_data_url: None,
4230                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4231                purl,
4232            });
4233        } else {
4234            let resolved_package = PackageData {
4235                package_type: Some(PythonParser::PACKAGE_TYPE),
4236                namespace: None,
4237                name: name.clone(),
4238                version: version.clone(),
4239                qualifiers: None,
4240                subpath: None,
4241                primary_language: Some("Python".to_string()),
4242                description: description.or(summary),
4243                release_date: None,
4244                parties,
4245                keywords,
4246                homepage_url: home_page,
4247                download_url: None,
4248                size: None,
4249                sha1: None,
4250                md5: None,
4251                sha256: None,
4252                sha512: None,
4253                bug_tracking_url: None,
4254                code_view_url: None,
4255                vcs_url: None,
4256                copyright: None,
4257                holder: None,
4258                declared_license_expression,
4259                declared_license_expression_spdx,
4260                license_detections,
4261                other_license_expression: None,
4262                other_license_expression_spdx: None,
4263                other_license_detections: Vec::new(),
4264                extracted_license_statement,
4265                notice_text: None,
4266                source_packages: Vec::new(),
4267                file_references: Vec::new(),
4268                is_private: false,
4269                is_virtual: true,
4270                extra_data: None,
4271                dependencies: parsed_dependencies,
4272                repository_homepage_url: None,
4273                repository_download_url: None,
4274                api_data_url: None,
4275                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4276                purl: purl.clone(),
4277            };
4278
4279            let resolved = package_data_to_resolved(&resolved_package);
4280            dependencies.push(Dependency {
4281                purl,
4282                extracted_requirement: None,
4283                scope: None,
4284                is_runtime: Some(true),
4285                is_optional: Some(false),
4286                is_pinned: Some(true),
4287                is_direct: Some(is_requested),
4288                resolved_package: Some(Box::new(resolved)),
4289                extra_data: None,
4290            });
4291        }
4292    }
4293
4294    if let Some(mut main_pkg) = main_package {
4295        let direct_requirement_purls: HashSet<String> = main_pkg
4296            .dependencies
4297            .iter()
4298            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4299            .collect();
4300
4301        let resolved_requirement_purls: HashSet<String> = dependencies
4302            .iter()
4303            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4304            .collect();
4305
4306        let unresolved_dependencies = main_pkg
4307            .dependencies
4308            .iter()
4309            .filter(|dep| {
4310                dep.purl.as_ref().is_some_and(|purl| {
4311                    !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4312                })
4313            })
4314            .cloned()
4315            .collect::<Vec<_>>();
4316
4317        for dependency in &mut dependencies {
4318            if dependency
4319                .purl
4320                .as_ref()
4321                .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4322            {
4323                dependency.is_direct = Some(true);
4324            }
4325        }
4326
4327        main_pkg.dependencies = dependencies;
4328        main_pkg.dependencies.extend(unresolved_dependencies);
4329        main_pkg
4330    } else {
4331        default_package_data(path)
4332    }
4333}
4334
/// Returns `purl` without its version: everything before the first `@`,
/// or the whole string when it contains no `@`.
fn base_dependency_purl(purl: &str) -> String {
    match purl.find('@') {
        Some(at) => purl[..at].to_string(),
        None => purl.to_string(),
    }
}
4340
/// Nested string map: outer key -> inner key -> list of values.
// NOTE(review): presumably section name -> option name -> value lines of a
// parsed setup.cfg; confirm against `parse_setup_cfg`.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4342
4343fn extract_from_setup_cfg(path: &Path) -> PackageData {
4344    let content = match read_file_to_string(path) {
4345        Ok(content) => content,
4346        Err(e) => {
4347            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4348            return default_package_data(path);
4349        }
4350    };
4351
4352    let sections = parse_setup_cfg(&content);
4353    let name = get_ini_value(&sections, "metadata", "name");
4354    let version = get_ini_value(&sections, "metadata", "version");
4355    let description = get_ini_value(&sections, "metadata", "description");
4356    let author = get_ini_value(&sections, "metadata", "author");
4357    let author_email = get_ini_value(&sections, "metadata", "author_email");
4358    let maintainer = get_ini_value(&sections, "metadata", "maintainer");
4359    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
4360    let license = get_ini_value(&sections, "metadata", "license");
4361    let mut homepage_url = get_ini_value(&sections, "metadata", "url");
4362    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
4363    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
4364    let python_requires = get_ini_value(&sections, "options", "python_requires");
4365    let parsed_project_urls =
4366        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
4367    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4368    let mut extra_data = HashMap::new();
4369
4370    let mut parties = Vec::new();
4371    if author.is_some() || author_email.is_some() {
4372        parties.push(Party {
4373            r#type: Some("person".to_string()),
4374            role: Some("author".to_string()),
4375            name: author,
4376            email: author_email,
4377            url: None,
4378            organization: None,
4379            organization_url: None,
4380            timezone: None,
4381        });
4382    }
4383
4384    if maintainer.is_some() || maintainer_email.is_some() {
4385        parties.push(Party {
4386            r#type: Some("person".to_string()),
4387            role: Some("maintainer".to_string()),
4388            name: maintainer,
4389            email: maintainer_email,
4390            url: None,
4391            organization: None,
4392            organization_url: None,
4393            timezone: None,
4394        });
4395    }
4396
4397    let declared_license_expression = None;
4398    let declared_license_expression_spdx = None;
4399    let license_detections = Vec::new();
4400    let extracted_license_statement = license.clone();
4401
4402    let dependencies = extract_setup_cfg_dependencies(&sections);
4403
4404    if let Some(value) = python_requires {
4405        extra_data.insert(
4406            "python_requires".to_string(),
4407            serde_json::Value::String(value),
4408        );
4409    }
4410
4411    apply_project_url_mappings(
4412        &parsed_project_urls,
4413        &mut homepage_url,
4414        &mut bug_tracking_url,
4415        &mut code_view_url,
4416        &mut vcs_url,
4417        &mut extra_data,
4418    );
4419
4420    let extra_data = if extra_data.is_empty() {
4421        None
4422    } else {
4423        Some(extra_data)
4424    };
4425
4426    let purl = name.as_ref().and_then(|n| {
4427        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4428        if let Some(v) = &version {
4429            package_url.with_version(v).ok()?;
4430        }
4431        Some(package_url.to_string())
4432    });
4433
4434    PackageData {
4435        package_type: Some(PythonParser::PACKAGE_TYPE),
4436        namespace: None,
4437        name,
4438        version,
4439        qualifiers: None,
4440        subpath: None,
4441        primary_language: Some("Python".to_string()),
4442        description,
4443        release_date: None,
4444        parties,
4445        keywords,
4446        homepage_url,
4447        download_url: None,
4448        size: None,
4449        sha1: None,
4450        md5: None,
4451        sha256: None,
4452        sha512: None,
4453        bug_tracking_url,
4454        code_view_url,
4455        vcs_url,
4456        copyright: None,
4457        holder: None,
4458        declared_license_expression,
4459        declared_license_expression_spdx,
4460        license_detections,
4461        other_license_expression: None,
4462        other_license_expression_spdx: None,
4463        other_license_detections: Vec::new(),
4464        extracted_license_statement,
4465        notice_text: None,
4466        source_packages: Vec::new(),
4467        file_references: Vec::new(),
4468        is_private: has_private_classifier(&classifiers),
4469        is_virtual: false,
4470        extra_data,
4471        dependencies,
4472        repository_homepage_url: None,
4473        repository_download_url: None,
4474        api_data_url: None,
4475        datasource_id: Some(DatasourceId::PypiSetupCfg),
4476        purl,
4477    }
4478}
4479
/// Splits a comma-separated `keywords` value into individual keywords.
///
/// Every piece is trimmed, and pieces that are empty after trimming are dropped.
/// A missing value (`None`) yields an empty vector.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut result = Vec::new();

    if let Some(raw) = value {
        for piece in raw.split(',') {
            let piece = piece.trim();
            if !piece.is_empty() {
                result.push(piece.to_string());
            }
        }
    }

    result
}
4492
/// Turns `Label = URL` entries into `(label, url)` pairs.
///
/// Each entry is split at its first `=` and both halves are trimmed. Entries that contain
/// no `=`, or whose label or URL is empty after trimming, are skipped.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };

        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_owned(), url.to_owned()));
        }
    }

    pairs
}
4508
4509fn apply_project_url_mappings(
4510    parsed_urls: &[(String, String)],
4511    homepage_url: &mut Option<String>,
4512    bug_tracking_url: &mut Option<String>,
4513    code_view_url: &mut Option<String>,
4514    vcs_url: &mut Option<String>,
4515    extra_data: &mut HashMap<String, serde_json::Value>,
4516) {
4517    for (label, url) in parsed_urls {
4518        let label_lower = label.to_lowercase();
4519
4520        if bug_tracking_url.is_none()
4521            && matches!(
4522                label_lower.as_str(),
4523                "tracker"
4524                    | "bug reports"
4525                    | "bug tracker"
4526                    | "issues"
4527                    | "issue tracker"
4528                    | "github: issues"
4529            )
4530        {
4531            *bug_tracking_url = Some(url.clone());
4532        } else if code_view_url.is_none()
4533            && matches!(label_lower.as_str(), "source" | "source code" | "code")
4534        {
4535            *code_view_url = Some(url.clone());
4536        } else if vcs_url.is_none()
4537            && matches!(
4538                label_lower.as_str(),
4539                "github" | "gitlab" | "github: repo" | "repository"
4540            )
4541        {
4542            *vcs_url = Some(url.clone());
4543        } else if homepage_url.is_none()
4544            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4545        {
4546            *homepage_url = Some(url.clone());
4547        } else if label_lower == "changelog" {
4548            extra_data.insert(
4549                "changelog_url".to_string(),
4550                serde_json::Value::String(url.clone()),
4551            );
4552        }
4553    }
4554
4555    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4556        .iter()
4557        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4558        .collect();
4559
4560    if !project_urls_json.is_empty() {
4561        extra_data.insert(
4562            "project_urls".to_string(),
4563            serde_json::Value::Object(project_urls_json),
4564        );
4565    }
4566}
4567
4568fn parse_setup_cfg(content: &str) -> IniSections {
4569    let mut sections: IniSections = HashMap::new();
4570    let mut current_section: Option<String> = None;
4571    let mut current_key: Option<String> = None;
4572
4573    for raw_line in content.lines() {
4574        let line = raw_line.trim_end_matches('\r');
4575        let trimmed = line.trim();
4576        if trimmed.is_empty() {
4577            continue;
4578        }
4579
4580        let stripped = line.trim_start();
4581        if stripped.starts_with('#') || stripped.starts_with(';') {
4582            continue;
4583        }
4584
4585        if stripped.starts_with('[') && stripped.ends_with(']') {
4586            let section_name = stripped
4587                .trim_start_matches('[')
4588                .trim_end_matches(']')
4589                .trim()
4590                .to_ascii_lowercase();
4591            current_section = if section_name.is_empty() {
4592                None
4593            } else {
4594                Some(section_name)
4595            };
4596            current_key = None;
4597            continue;
4598        }
4599
4600        if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
4601            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
4602                let value = stripped.trim();
4603                if !value.is_empty() {
4604                    sections
4605                        .entry(section.clone())
4606                        .or_default()
4607                        .entry(key.clone())
4608                        .or_default()
4609                        .push(value.to_string());
4610                }
4611            }
4612            continue;
4613        }
4614
4615        if let Some((key, value)) = stripped.split_once('=')
4616            && let Some(section) = current_section.as_ref()
4617        {
4618            let key_name = key.trim().to_ascii_lowercase();
4619            let value_trimmed = value.trim();
4620            let entry = sections
4621                .entry(section.clone())
4622                .or_default()
4623                .entry(key_name.clone())
4624                .or_default();
4625            if !value_trimmed.is_empty() {
4626                entry.push(value_trimmed.to_string());
4627            }
4628            current_key = Some(key_name);
4629        }
4630    }
4631
4632    sections
4633}
4634
4635fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
4636    sections
4637        .get(&section.to_ascii_lowercase())
4638        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4639        .and_then(|entries| entries.first())
4640        .map(|value| value.trim().to_string())
4641}
4642
4643fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
4644    sections
4645        .get(&section.to_ascii_lowercase())
4646        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4647        .cloned()
4648        .unwrap_or_default()
4649}
4650
4651fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
4652    let mut dependencies = Vec::new();
4653
4654    for (sub_section, scope) in [
4655        ("install_requires", "install"),
4656        ("tests_require", "test"),
4657        ("setup_requires", "setup"),
4658    ] {
4659        let reqs = get_ini_values(sections, "options", sub_section);
4660        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
4661    }
4662
4663    if let Some(extras) = sections.get("options.extras_require") {
4664        let mut extra_items: Vec<_> = extras.iter().collect();
4665        extra_items.sort_by_key(|(name, _)| *name);
4666        for (extra_name, reqs) in extra_items {
4667            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
4668        }
4669    }
4670
4671    dependencies
4672}
4673
4674fn parse_setup_cfg_requirements(
4675    reqs: &[String],
4676    scope: &str,
4677    is_optional: bool,
4678) -> Vec<Dependency> {
4679    reqs.iter()
4680        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
4681        .collect()
4682}
4683
4684fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
4685    let trimmed = req.trim();
4686    if trimmed.is_empty() || trimmed.starts_with('#') {
4687        return None;
4688    }
4689
4690    let name = extract_setup_cfg_dependency_name(trimmed)?;
4691    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4692
4693    Some(Dependency {
4694        purl: Some(purl.to_string()),
4695        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
4696        scope: Some(scope.to_string()),
4697        is_runtime: Some(true),
4698        is_optional: Some(is_optional),
4699        is_pinned: Some(false),
4700        is_direct: Some(true),
4701        resolved_package: None,
4702        extra_data: None,
4703    })
4704}
4705
/// Extracts the distribution name from a requirement string.
///
/// The name is the leading run of characters up to the first whitespace or the first
/// character that starts another part of the requirement: a version operator
/// (`<`, `>`, `=`, `!`, `~`), a marker (`;`), extras (`[`), a parenthesised version
/// specifier (`(`) or a direct-reference URL (`@`). The last two may follow the name with
/// no separating whitespace, so they must terminate the name as well.
///
/// Returns `None` when the input is blank or does not start with a name.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();

    let end = trimmed
        .find(|c: char| {
            c.is_whitespace() || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    // `end` is either a char boundary returned by `find` or the string length.
    let name = &trimmed[..end];
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
4722
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    // Joining the whitespace-delimited pieces drops all whitespace between and around them.
    req.split_whitespace().collect()
}
4726
/// Finds a quoted string assigned to `key` in `content` by plain text search.
///
/// Looks for `key="…"` / `key='…'`, allowing a single optional space on either side of
/// the `=`. Double-quoted forms are tried before single-quoted ones, and within each the
/// spacing variants are tried in the order `key=`, `key =`, `key= `, `key = `. For the
/// first variant that occurs, the text up to the next occurrence of the **same** quote
/// character is returned, so a value may contain the other quote character
/// (e.g. `"it's"`). If a variant occurs but has no closing quote, the next variant is tried.
///
/// Returns `None` when no variant yields a value. Escaped quotes are not interpreted.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    const QUOTES: [char; 2] = ['"', '\''];
    const SPACINGS: [(&str, &str); 4] = [("", ""), (" ", ""), ("", " "), (" ", " ")];

    for quote in QUOTES {
        for (before, after) in SPACINGS {
            let pattern = format!("{key}{before}={after}{quote}");
            let Some(start) = content.find(&pattern) else {
                continue;
            };

            let remaining = &content[start + pattern.len()..];
            // Only the quote that opened the literal can close it.
            if let Some(end) = remaining.find(quote) {
                return Some(remaining[..end].to_string());
            }
        }
    }

    None
}
4752
4753fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
4754    let mut dependencies = Vec::new();
4755
4756    if let Some(tests_deps) = extract_tests_require(content) {
4757        dependencies.extend(tests_deps);
4758    }
4759
4760    if let Some(extras_deps) = extract_extras_require(content) {
4761        dependencies.extend(extras_deps);
4762    }
4763
4764    dependencies
4765}
4766
4767fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
4768    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
4769    let re = Regex::new(pattern).ok()?;
4770    let captures = re.captures(content)?;
4771    let deps_str = captures.get(1)?.as_str();
4772
4773    let deps = parse_setup_py_dep_list(deps_str, "test", true);
4774    if deps.is_empty() { None } else { Some(deps) }
4775}
4776
4777fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
4778    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
4779    let re = Regex::new(pattern).ok()?;
4780    let captures = re.captures(content)?;
4781    let dict_content = captures.get(1)?.as_str();
4782
4783    let mut all_deps = Vec::new();
4784
4785    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
4786    let entry_re = Regex::new(entry_pattern).ok()?;
4787
4788    for entry_cap in entry_re.captures_iter(dict_content) {
4789        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
4790            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
4791            all_deps.extend(deps);
4792        }
4793    }
4794
4795    if all_deps.is_empty() {
4796        None
4797    } else {
4798        Some(all_deps)
4799    }
4800}
4801
4802fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
4803    let dep_pattern = r#"['"]([^'"]+)['"]"#;
4804    let re = match Regex::new(dep_pattern) {
4805        Ok(r) => r,
4806        Err(_) => return Vec::new(),
4807    };
4808
4809    re.captures_iter(deps_str)
4810        .filter_map(|cap| {
4811            let dep_str = cap.get(1)?.as_str().trim();
4812            if dep_str.is_empty() {
4813                return None;
4814            }
4815
4816            let name = extract_setup_cfg_dependency_name(dep_str)?;
4817            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4818
4819            Some(Dependency {
4820                purl: Some(purl.to_string()),
4821                extracted_requirement: Some(dep_str.to_string()),
4822                scope: Some(scope.to_string()),
4823                is_runtime: Some(true),
4824                is_optional: Some(is_optional),
4825                is_pinned: Some(false),
4826                is_direct: Some(true),
4827                resolved_package: None,
4828                extra_data: None,
4829            })
4830        })
4831        .collect()
4832}
4833
4834/// Reads and parses a TOML file
4835pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
4836    let content = read_file_to_string(path).map_err(|e| e.to_string())?;
4837    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
4838}
4839
4840/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
4841///
4842/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
4843/// Essential for SBOM compliance and package integrity verification.
4844///
4845/// # Returns
4846///
4847/// - `(Some(size), Some(hash))` on success
4848/// - `(None, None)` if file cannot be opened
4849/// - `(Some(size), None)` if hash calculation fails during read
4850fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<String>) {
4851    let mut file = match File::open(path) {
4852        Ok(f) => f,
4853        Err(_) => return (None, None),
4854    };
4855
4856    let metadata = match file.metadata() {
4857        Ok(m) => m,
4858        Err(_) => return (None, None),
4859    };
4860    let size = metadata.len();
4861
4862    let mut hasher = Sha256::new();
4863    let mut buffer = vec![0; 8192];
4864
4865    loop {
4866        match file.read(&mut buffer) {
4867            Ok(0) => break,
4868            Ok(n) => hasher.update(&buffer[..n]),
4869            Err(_) => return (Some(size), None),
4870        }
4871    }
4872
4873    let hash = hex::encode(hasher.finalize());
4874    (Some(size), Some(hash))
4875}
4876
4877fn default_package_data(path: &Path) -> PackageData {
4878    PackageData {
4879        package_type: Some(PythonParser::PACKAGE_TYPE),
4880        primary_language: Some("Python".to_string()),
4881        datasource_id: infer_python_datasource_id(path),
4882        ..Default::default()
4883    }
4884}
4885
4886fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
4887    let file_name = path.file_name().and_then(|name| name.to_str());
4888
4889    match file_name {
4890        Some("pyproject.toml") => {
4891            if read_toml_file(path)
4892                .ok()
4893                .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
4894                .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
4895                .is_some()
4896            {
4897                Some(DatasourceId::PypiPoetryPyprojectToml)
4898            } else {
4899                Some(DatasourceId::PypiPyprojectToml)
4900            }
4901        }
4902        Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
4903            Some(DatasourceId::PypiSetupPy)
4904        }
4905        Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
4906        Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
4907        Some("METADATA") if is_installed_wheel_metadata_path(path) => {
4908            Some(DatasourceId::PypiWheelMetadata)
4909        }
4910        Some("pypi.json") => Some(DatasourceId::PypiJson),
4911        Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
4912        Some("origin.json") if is_pip_cache_origin_json(path) => {
4913            Some(DatasourceId::PypiPipOriginJson)
4914        }
4915        _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
4916            Some(DatasourceId::PypiSdist)
4917        }
4918        _ if path
4919            .extension()
4920            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
4921        {
4922            Some(DatasourceId::PypiWheel)
4923        }
4924        _ if path
4925            .extension()
4926            .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
4927        {
4928            Some(DatasourceId::PypiEgg)
4929        }
4930        _ => None,
4931    }
4932}
4933
// Invokes the crate's `register_parser!` macro for this module, passing (in order) a
// human-readable description, the glob patterns listed below, the strings "pypi" and
// "Python", and an optional documentation URL.
// NOTE(review): the archive globs (`*.tar.gz`, `*.tgz`, `*.tar.bz2`, `*.tar.xz`, `*.zip`)
// are broad; presumably the parser's own matching narrows them to Python sdists (see
// `is_likely_python_sdist_filename`) — confirm against the macro and the matcher.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);