Skip to main content

provenant/parsers/
python.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
35use crate::parser_warn as warn;
36use crate::parsers::utils::{read_file_to_string, split_name_email};
37use base64::Engine;
38use base64::engine::general_purpose::URL_SAFE_NO_PAD;
39use bzip2::read::BzDecoder;
40use csv::ReaderBuilder;
41use flate2::read::GzDecoder;
42use liblzma::read::XzDecoder;
43use packageurl::PackageUrl;
44use regex::Regex;
45use ruff_python_ast as ast;
46use ruff_python_parser::parse_module;
47use serde_json::{Map as JsonMap, Value as JsonValue};
48use sha2::{Digest, Sha256};
49use std::collections::{HashMap, HashSet};
50use std::fs::File;
51use std::io::Read;
52use std::path::{Component, Path, PathBuf};
53use tar::Archive;
54use toml::Value as TomlValue;
55use toml::map::Map as TomlMap;
56use zip::ZipArchive;
57
58use super::PackageParser;
59use super::license_normalization::{
60    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
61    normalize_spdx_expression,
62};
63
// Table and key names looked up in pyproject.toml.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

// Size/complexity ceilings for setup.py input (bytes, AST node count, AST depth).
const MAX_SETUP_PY_BYTES: usize = 1024 * 1024; // 1 MiB
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

// Archive safety ceilings.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024; // 100MB in total
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB for a single entry
const MAX_COMPRESSION_RATIO: f64 = 100.0; // uncompressed:compressed of 100:1
84
/// Parser for Python package manifests and metadata, covering 11 input formats.
///
/// Its `PackageParser` implementation dispatches on file name and extension to
/// dedicated extractors for: `pyproject.toml`, `setup.cfg`, setup.py-like
/// scripts, `PKG-INFO`, installed-wheel `METADATA`, pip cache `origin.json`,
/// `pypi.json`, `pip-inspect.deplock`, sdist archives, and `.whl` / `.egg`
/// archives.
///
/// # Security
///
/// setup.py files are analysed through their AST instead of being executed, so
/// scanning never runs arbitrary package code. See `extract_from_setup_py_ast`
/// for details.
pub struct PythonParser;
95
/// Container formats recognised for Python source distributions, as detected
/// from the (lower-cased) archive file name.
#[derive(Debug, Clone, Copy)]
enum PythonSdistArchiveFormat {
    /// File name ends with `.tar.gz`.
    TarGz,
    /// File name ends with `.tgz`.
    Tgz,
    /// File name ends with `.tar.bz2`.
    TarBz2,
    /// File name ends with `.tar.xz`.
    TarXz,
    /// File name ends with `.zip`.
    Zip,
}
104
/// A zip entry that passed the checks in `collect_validated_zip_entries`.
#[derive(Debug, Clone)]
struct ValidatedZipEntry {
    /// Position of the entry inside the archive it was collected from.
    index: usize,
    /// Entry path as returned by `normalize_archive_entry_path`.
    name: String,
}
110
111impl PackageParser for PythonParser {
112    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
113
114    fn extract_packages(path: &Path) -> Vec<PackageData> {
115        vec![
116            if path.file_name().unwrap_or_default() == "pyproject.toml" {
117                extract_from_pyproject_toml(path)
118            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
119                extract_from_setup_cfg(path)
120            } else if is_setup_py_like_path(path) {
121                return extract_setup_py_packages(path);
122            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
123                extract_from_rfc822_metadata(path, detect_pkg_info_datasource_id(path))
124            } else if is_installed_wheel_metadata_path(path) {
125                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
126            } else if is_pip_cache_origin_json(path) {
127                extract_from_pip_origin_json(path)
128            } else if path.file_name().unwrap_or_default() == "pypi.json" {
129                extract_from_pypi_json(path)
130            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
131                extract_from_pip_inspect(path)
132            } else if is_python_sdist_archive_path(path) {
133                extract_from_sdist_archive(path)
134            } else if path
135                .extension()
136                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
137            {
138                extract_from_wheel_archive(path)
139            } else if path
140                .extension()
141                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
142            {
143                extract_from_egg_archive(path)
144            } else {
145                default_package_data(path)
146            },
147        ]
148    }
149
150    fn is_match(path: &Path) -> bool {
151        if let Some(filename) = path.file_name()
152            && (filename == "pyproject.toml"
153                || filename == "setup.cfg"
154                || is_setup_py_like_path(path)
155                || filename == "PKG-INFO"
156                || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
157                || filename == "pypi.json"
158                || filename == "pip-inspect.deplock"
159                || is_pip_cache_origin_json(path))
160        {
161            return true;
162        }
163
164        if let Some(extension) = path.extension() {
165            let ext = extension.to_string_lossy().to_lowercase();
166            if ext == "whl" || ext == "egg" || is_python_sdist_archive_path(path) {
167                return true;
168            }
169        }
170
171        false
172    }
173}
174
/// Returns `true` when the last path component is exactly `setup.py` or ends
/// with `_setup.py`. Paths without a UTF-8 file name never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    let Some(file_name) = path.file_name().and_then(|os_name| os_name.to_str()) else {
        return false;
    };

    file_name == "setup.py" || file_name.ends_with("_setup.py")
}
180
/// Returns `true` for a file named `METADATA` whose immediate parent directory
/// name ends with `.dist-info`.
fn is_installed_wheel_metadata_path(path: &Path) -> bool {
    if path.file_name().and_then(|name| name.to_str()) != Some("METADATA") {
        return false;
    }

    let parent_dir_name = path
        .parent()
        .and_then(Path::file_name)
        .and_then(|dir_name| dir_name.to_str());

    match parent_dir_name {
        Some(dir_name) => dir_name.ends_with(".dist-info"),
        None => false,
    }
}
189
/// Values read from the `WHEEL` file that sits next to an installed wheel's
/// `METADATA`.
#[derive(Clone, Debug)]
struct InstalledWheelMetadata {
    /// Every value of the `tag` header, in file order.
    wheel_tags: Vec<String>,
    /// First value of the `wheel-version` header, if present.
    wheel_version: Option<String>,
    /// First value of the `generator` header, if present.
    wheel_generator: Option<String>,
    /// `root-is-purelib` header parsed as a boolean; `None` when the header is
    /// missing or is neither `true` nor `false`.
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`.
    compressed_tag: Option<String>,
}
198
199fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
200    let Some(parent) = path.parent() else {
201        return;
202    };
203
204    if !parent
205        .file_name()
206        .and_then(|name| name.to_str())
207        .is_some_and(|name| name.ends_with(".dist-info"))
208    {
209        return;
210    }
211
212    let wheel_path = parent.join("WHEEL");
213    if !wheel_path.exists() {
214        return;
215    }
216
217    let Ok(content) = read_file_to_string(&wheel_path) else {
218        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
219        return;
220    };
221
222    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
223        return;
224    };
225
226    apply_installed_wheel_metadata(package_data, &wheel_metadata);
227}
228
229fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
230    use super::rfc822::{get_header_all, get_header_first};
231
232    let metadata = super::rfc822::parse_rfc822_content(content);
233    let wheel_tags = get_header_all(&metadata.headers, "tag");
234    if wheel_tags.is_empty() {
235        return None;
236    }
237
238    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
239    let wheel_generator = get_header_first(&metadata.headers, "generator");
240    let root_is_purelib =
241        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
242            match value.to_ascii_lowercase().as_str() {
243                "true" => Some(true),
244                "false" => Some(false),
245                _ => None,
246            }
247        });
248
249    let compressed_tag = compress_wheel_tags(&wheel_tags);
250
251    Some(InstalledWheelMetadata {
252        wheel_tags,
253        wheel_version,
254        wheel_generator,
255        root_is_purelib,
256        compressed_tag,
257    })
258}
259
/// Collapses a list of wheel tags into a single compressed tag.
///
/// * An empty list yields `None`.
/// * A single tag is returned unchanged (it is not validated).
/// * Several tags must each have the form `python-abi-platform` and share the
///   same ABI and platform; the Python parts are then joined with `.` in input
///   order, e.g. `py2-none-any` + `py3-none-any` becomes `py2.py3-none-any`.
///   Any malformed tag, or any ABI/platform mismatch, yields `None`.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    match tags {
        [] => return None,
        [only] => return Some(only.clone()),
        _ => {}
    }

    // Split every tag up front; a tag with fewer than three parts aborts.
    let mut split_tags = Vec::with_capacity(tags.len());
    for tag in tags {
        let mut pieces = tag.splitn(3, '-');
        match (pieces.next(), pieces.next(), pieces.next()) {
            (Some(python), Some(abi), Some(platform)) => split_tags.push((python, abi, platform)),
            _ => return None,
        }
    }

    let &(_, shared_abi, shared_platform) = split_tags.first()?;
    let consistent = split_tags
        .iter()
        .all(|&(_, abi, platform)| abi == shared_abi && platform == shared_platform);
    if !consistent {
        return None;
    }

    let interpreters: Vec<&str> = split_tags.iter().map(|&(python, _, _)| python).collect();
    Some(format!(
        "{}-{}-{}",
        interpreters.join("."),
        shared_abi,
        shared_platform
    ))
}
297
298fn apply_installed_wheel_metadata(
299    package_data: &mut PackageData,
300    wheel_metadata: &InstalledWheelMetadata,
301) {
302    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
303    extra_data.insert(
304        "wheel_tags".to_string(),
305        JsonValue::Array(
306            wheel_metadata
307                .wheel_tags
308                .iter()
309                .cloned()
310                .map(JsonValue::String)
311                .collect(),
312        ),
313    );
314
315    if let Some(wheel_version) = &wheel_metadata.wheel_version {
316        extra_data.insert(
317            "wheel_version".to_string(),
318            JsonValue::String(wheel_version.clone()),
319        );
320    }
321
322    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
323        extra_data.insert(
324            "wheel_generator".to_string(),
325            JsonValue::String(wheel_generator.clone()),
326        );
327    }
328
329    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
330        extra_data.insert(
331            "root_is_purelib".to_string(),
332            JsonValue::Bool(root_is_purelib),
333        );
334    }
335
336    if let (Some(name), Some(version), Some(extension)) = (
337        package_data.name.as_deref(),
338        package_data.version.as_deref(),
339        wheel_metadata.compressed_tag.as_deref(),
340    ) {
341        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
342    }
343}
344
/// Returns `true` for a file named `origin.json` that has at least one
/// ancestor directory called `wheels` (compared case-insensitively).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    if path.file_name().and_then(|name| name.to_str()) != Some("origin.json") {
        return false;
    }

    // `skip(1)` drops the path itself so only its ancestors are inspected.
    for ancestor in path.ancestors().skip(1) {
        let dir_name = ancestor.file_name().and_then(|name| name.to_str());
        if dir_name.is_some_and(|name| name.eq_ignore_ascii_case("wheels")) {
            return true;
        }
    }

    false
}
354
355fn extract_from_pip_origin_json(path: &Path) -> PackageData {
356    let content = match read_file_to_string(path) {
357        Ok(content) => content,
358        Err(e) => {
359            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
360            return default_package_data(path);
361        }
362    };
363
364    let root: JsonValue = match serde_json::from_str(&content) {
365        Ok(root) => root,
366        Err(e) => {
367            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
368            return default_package_data(path);
369        }
370    };
371
372    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
373        warn!("No url found in pip cache origin.json at {:?}", path);
374        return default_package_data(path);
375    };
376
377    let sibling_wheel = find_sibling_cached_wheel(path);
378    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
379        sibling_wheel
380            .as_ref()
381            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
382    });
383
384    let Some((name, version)) = name_version else {
385        warn!(
386            "Failed to infer package name/version from pip cache origin.json at {:?}",
387            path
388        );
389        return default_package_data(path);
390    };
391
392    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
393        build_pypi_urls(Some(&name), Some(&version));
394    let purl = sibling_wheel
395        .as_ref()
396        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
397        .or(plain_purl);
398
399    PackageData {
400        package_type: Some(PythonParser::PACKAGE_TYPE),
401        primary_language: Some("Python".to_string()),
402        name: Some(name),
403        version: Some(version),
404        datasource_id: Some(DatasourceId::PypiPipOriginJson),
405        download_url: Some(download_url.to_string()),
406        sha256: extract_sha256_from_origin_json(&root),
407        repository_homepage_url,
408        repository_download_url,
409        api_data_url,
410        purl,
411        ..Default::default()
412    }
413}
414
415fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
416    let parent = path.parent()?;
417    let entries = parent.read_dir().ok()?;
418
419    for entry in entries.flatten() {
420        let sibling_path = entry.path();
421        if sibling_path
422            .extension()
423            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
424            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
425        {
426            return Some(wheel_info);
427        }
428    }
429
430    None
431}
432
433fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
434    let file_name = url.rsplit('/').next()?;
435
436    if file_name.ends_with(".whl") {
437        return parse_wheel_filename(Path::new(file_name))
438            .map(|wheel_info| (wheel_info.name, wheel_info.version));
439    }
440
441    let stem = strip_python_archive_extension(file_name)?;
442    let (name, version) = stem.rsplit_once('-')?;
443    if name.is_empty() || version.is_empty() {
444        return None;
445    }
446
447    Some((name.replace('_', "-"), version.to_string()))
448}
449
/// Removes a known Python archive extension from the end of `file_name` and
/// returns what precedes it, or `None` when no known extension matches.
///
/// Matching is case-sensitive; the recognised extensions are `.tar.gz`,
/// `.tar.bz2`, `.tar.xz`, `.tgz`, `.zip` and `.whl`.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] = [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES {
        if let Some(stem) = file_name.strip_suffix(suffix) {
            return Some(stem);
        }
    }

    None
}
455
456fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
457    root.pointer("/archive_info/hashes/sha256")
458        .and_then(|value| value.as_str())
459        .map(ToOwned::to_owned)
460        .or_else(|| {
461            root.pointer("/archive_info/hash")
462                .and_then(|value| value.as_str())
463                .and_then(normalize_origin_hash)
464        })
465}
466
/// Extracts a SHA-256 hex digest from an `origin.json` hash string.
///
/// Accepts `sha256=<digest>`, `sha256:<digest>` or a bare digest. In every
/// form the digest must be exactly 64 ASCII hex digits; anything else (an
/// empty or truncated value, another algorithm) yields `None`, so a malformed
/// value is never reported as a SHA-256. The digest is returned without case
/// changes.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let digest = hash
        .strip_prefix("sha256=")
        .or_else(|| hash.strip_prefix("sha256:"))
        .unwrap_or(hash);

    let is_sha256_hex = digest.len() == 64 && digest.bytes().all(|byte| byte.is_ascii_hexdigit());
    is_sha256_hex.then(|| digest.to_string())
}
479
480fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
481    let content = match read_file_to_string(path) {
482        Ok(content) => content,
483        Err(e) => {
484            warn!("Failed to read metadata at {:?}: {}", path, e);
485            return default_package_data(path);
486        }
487    };
488
489    let metadata = super::rfc822::parse_rfc822_content(&content);
490    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
491    merge_sibling_metadata_dependencies(path, &mut package_data);
492    merge_sibling_metadata_file_references(path, &mut package_data);
493    if datasource_id == DatasourceId::PypiWheelMetadata {
494        merge_sibling_wheel_metadata(path, &mut package_data);
495    }
496    package_data
497}
498
499fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
500    let mut extra_dependencies = Vec::new();
501
502    if let Some(parent) = path.parent() {
503        let direct_requires = parent.join("requires.txt");
504        if direct_requires.exists()
505            && let Ok(content) = read_file_to_string(&direct_requires)
506        {
507            extra_dependencies.extend(parse_requires_txt(&content));
508        }
509
510        let sibling_egg_info_requires = parent
511            .read_dir()
512            .ok()
513            .into_iter()
514            .flatten()
515            .flatten()
516            .find_map(|entry| {
517                let child_path = entry.path();
518                if child_path.is_dir()
519                    && child_path
520                        .file_name()
521                        .and_then(|name| name.to_str())
522                        .is_some_and(|name| name.ends_with(".egg-info"))
523                {
524                    let requires = child_path.join("requires.txt");
525                    requires.exists().then_some(requires)
526                } else {
527                    None
528                }
529            });
530
531        if let Some(requires_path) = sibling_egg_info_requires
532            && let Ok(content) = read_file_to_string(&requires_path)
533        {
534            extra_dependencies.extend(parse_requires_txt(&content));
535        }
536    }
537
538    for dependency in extra_dependencies {
539        if !package_data.dependencies.iter().any(|existing| {
540            existing.purl == dependency.purl
541                && existing.scope == dependency.scope
542                && existing.extracted_requirement == dependency.extracted_requirement
543                && existing.extra_data == dependency.extra_data
544        }) {
545            package_data.dependencies.push(dependency);
546        }
547    }
548}
549
550fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
551    let mut extra_refs = Vec::new();
552
553    if let Some(parent) = path.parent() {
554        let record_path = parent.join("RECORD");
555        if record_path.exists()
556            && let Ok(content) = read_file_to_string(&record_path)
557        {
558            extra_refs.extend(parse_record_csv(&content));
559        }
560
561        let installed_files_path = parent.join("installed-files.txt");
562        if installed_files_path.exists()
563            && let Ok(content) = read_file_to_string(&installed_files_path)
564        {
565            extra_refs.extend(parse_installed_files_txt(&content));
566        }
567
568        let sources_path = parent.join("SOURCES.txt");
569        if sources_path.exists()
570            && let Ok(content) = read_file_to_string(&sources_path)
571        {
572            extra_refs.extend(parse_sources_txt(&content));
573        }
574    }
575
576    for file_ref in extra_refs {
577        if !package_data
578            .file_references
579            .iter()
580            .any(|existing| existing.path == file_ref.path)
581        {
582            package_data.file_references.push(file_ref);
583        }
584    }
585}
586
587fn collect_validated_zip_entries<R: Read + std::io::Seek>(
588    archive: &mut ZipArchive<R>,
589    path: &Path,
590    archive_type: &str,
591) -> Result<Vec<ValidatedZipEntry>, String> {
592    let mut total_extracted = 0u64;
593    let mut entries = Vec::new();
594
595    for i in 0..archive.len() {
596        if let Ok(file) = archive.by_index_raw(i) {
597            let compressed_size = file.compressed_size();
598            let uncompressed_size = file.size();
599            let Some(entry_name) = normalize_archive_entry_path(file.name()) else {
600                warn!(
601                    "Skipping unsafe path in {} {:?}: {}",
602                    archive_type,
603                    path,
604                    file.name()
605                );
606                continue;
607            };
608
609            if compressed_size > 0 {
610                let ratio = uncompressed_size as f64 / compressed_size as f64;
611                if ratio > MAX_COMPRESSION_RATIO {
612                    warn!(
613                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
614                        archive_type, path, ratio
615                    );
616                    continue;
617                }
618            }
619
620            if uncompressed_size > MAX_FILE_SIZE {
621                warn!(
622                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
623                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
624                );
625                continue;
626            }
627
628            total_extracted += uncompressed_size;
629            if total_extracted > MAX_ARCHIVE_SIZE {
630                let msg = format!(
631                    "Total extracted size exceeds limit for {} {:?}",
632                    archive_type, path
633                );
634                warn!("{}", msg);
635                return Err(msg);
636            }
637
638            entries.push(ValidatedZipEntry {
639                index: i,
640                name: entry_name,
641            });
642        }
643    }
644
645    Ok(entries)
646}
647
648fn is_python_sdist_archive_path(path: &Path) -> bool {
649    detect_python_sdist_archive_format(path).is_some()
650}
651
652fn detect_python_sdist_archive_format(path: &Path) -> Option<PythonSdistArchiveFormat> {
653    let file_name = path.file_name()?.to_str()?.to_ascii_lowercase();
654
655    if !is_likely_python_sdist_filename(&file_name) {
656        return None;
657    }
658
659    if file_name.ends_with(".tar.gz") {
660        Some(PythonSdistArchiveFormat::TarGz)
661    } else if file_name.ends_with(".tgz") {
662        Some(PythonSdistArchiveFormat::Tgz)
663    } else if file_name.ends_with(".tar.bz2") {
664        Some(PythonSdistArchiveFormat::TarBz2)
665    } else if file_name.ends_with(".tar.xz") {
666        Some(PythonSdistArchiveFormat::TarXz)
667    } else if file_name.ends_with(".zip") {
668        zip_sdist_contains_pkg_info(path).then_some(PythonSdistArchiveFormat::Zip)
669    } else {
670        None
671    }
672}
673
674fn zip_sdist_contains_pkg_info(path: &Path) -> bool {
675    if !path.is_file() {
676        return true;
677    }
678
679    let file = match File::open(path) {
680        Ok(file) => file,
681        Err(_) => return false,
682    };
683    let mut archive = match ZipArchive::new(file) {
684        Ok(archive) => archive,
685        Err(_) => return false,
686    };
687
688    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
689        Ok(entries) => entries,
690        Err(_) => return false,
691    };
692    let metadata_entries: Vec<_> = validated_entries
693        .iter()
694        .filter(|entry| entry.name.ends_with("/PKG-INFO"))
695        .filter_map(|entry| {
696            read_validated_zip_entry(&mut archive, entry, path, "sdist zip")
697                .ok()
698                .map(|content| (entry.name.clone(), content))
699        })
700        .collect();
701
702    has_matching_sdist_pkginfo_candidate(path, &metadata_entries)
703}
704
705fn is_likely_python_sdist_filename(file_name: &str) -> bool {
706    let Some(stem) = strip_python_archive_extension(file_name) else {
707        return false;
708    };
709
710    let Some((name, version)) = stem.rsplit_once('-') else {
711        return false;
712    };
713
714    !name.is_empty()
715        && !version.is_empty()
716        && version.chars().any(|ch| ch.is_ascii_digit())
717        && name
718            .chars()
719            .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.'))
720}
721
722fn extract_from_sdist_archive(path: &Path) -> PackageData {
723    let metadata = match std::fs::metadata(path) {
724        Ok(m) => m,
725        Err(e) => {
726            warn!(
727                "Failed to read metadata for sdist archive {:?}: {}",
728                path, e
729            );
730            return default_package_data(path);
731        }
732    };
733
734    if metadata.len() > MAX_ARCHIVE_SIZE {
735        warn!(
736            "sdist archive too large: {} bytes (limit: {} bytes)",
737            metadata.len(),
738            MAX_ARCHIVE_SIZE
739        );
740        return default_package_data(path);
741    }
742
743    let Some(format) = detect_python_sdist_archive_format(path) else {
744        return default_package_data(path);
745    };
746
747    let mut package_data = match format {
748        PythonSdistArchiveFormat::TarGz | PythonSdistArchiveFormat::Tgz => {
749            let file = match File::open(path) {
750                Ok(file) => file,
751                Err(e) => {
752                    warn!("Failed to open sdist archive {:?}: {}", path, e);
753                    return default_package_data(path);
754                }
755            };
756            let decoder = GzDecoder::new(file);
757            extract_from_tar_sdist_archive(path, decoder, "tar.gz", metadata.len())
758        }
759        PythonSdistArchiveFormat::TarBz2 => {
760            let file = match File::open(path) {
761                Ok(file) => file,
762                Err(e) => {
763                    warn!("Failed to open sdist archive {:?}: {}", path, e);
764                    return default_package_data(path);
765                }
766            };
767            let decoder = BzDecoder::new(file);
768            extract_from_tar_sdist_archive(path, decoder, "tar.bz2", metadata.len())
769        }
770        PythonSdistArchiveFormat::TarXz => {
771            let file = match File::open(path) {
772                Ok(file) => file,
773                Err(e) => {
774                    warn!("Failed to open sdist archive {:?}: {}", path, e);
775                    return default_package_data(path);
776                }
777            };
778            let decoder = XzDecoder::new(file);
779            extract_from_tar_sdist_archive(path, decoder, "tar.xz", metadata.len())
780        }
781        PythonSdistArchiveFormat::Zip => extract_from_zip_sdist_archive(path),
782    };
783
784    if package_data.package_type.is_some() {
785        let (size, sha256) = calculate_file_checksums(path);
786        package_data.size = size;
787        package_data.sha256 = sha256;
788    }
789
790    package_data
791}
792
793fn extract_from_tar_sdist_archive<R: Read>(
794    path: &Path,
795    reader: R,
796    archive_type: &str,
797    compressed_size: u64,
798) -> PackageData {
799    let mut archive = Archive::new(reader);
800    let archive_entries = match archive.entries() {
801        Ok(entries) => entries,
802        Err(e) => {
803            warn!(
804                "Failed to read {} sdist archive {:?}: {}",
805                archive_type, path, e
806            );
807            return default_package_data(path);
808        }
809    };
810
811    let mut total_extracted = 0u64;
812    let mut entries = Vec::new();
813
814    for entry_result in archive_entries {
815        let mut entry = match entry_result {
816            Ok(entry) => entry,
817            Err(e) => {
818                warn!(
819                    "Failed to read {} sdist entry from {:?}: {}",
820                    archive_type, path, e
821                );
822                continue;
823            }
824        };
825
826        let entry_size = entry.size();
827        if entry_size > MAX_FILE_SIZE {
828            warn!(
829                "File too large in {} sdist {:?}: {} bytes (limit: {} bytes)",
830                archive_type, path, entry_size, MAX_FILE_SIZE
831            );
832            continue;
833        }
834
835        total_extracted += entry_size;
836        if total_extracted > MAX_ARCHIVE_SIZE {
837            warn!(
838                "Total extracted size exceeds limit for {} sdist {:?}",
839                archive_type, path
840            );
841            return default_package_data(path);
842        }
843
844        if compressed_size > 0 {
845            let ratio = total_extracted as f64 / compressed_size as f64;
846            if ratio > MAX_COMPRESSION_RATIO {
847                warn!(
848                    "Suspicious compression ratio in {} sdist {:?}: {:.2}:1",
849                    archive_type, path, ratio
850                );
851                return default_package_data(path);
852            }
853        }
854
855        let entry_path = match entry.path() {
856            Ok(path) => path.to_string_lossy().replace('\\', "/"),
857            Err(e) => {
858                warn!(
859                    "Failed to get {} sdist entry path from {:?}: {}",
860                    archive_type, path, e
861                );
862                continue;
863            }
864        };
865
866        let Some(entry_path) = normalize_archive_entry_path(&entry_path) else {
867            warn!("Skipping unsafe {} sdist path in {:?}", archive_type, path);
868            continue;
869        };
870
871        if !is_relevant_sdist_text_entry(&entry_path) {
872            continue;
873        }
874
875        if let Ok(content) = read_limited_utf8(
876            &mut entry,
877            MAX_FILE_SIZE,
878            &format!("{} entry {}", archive_type, entry_path),
879        ) {
880            entries.push((entry_path, content));
881        }
882    }
883
884    build_sdist_package_data(path, entries)
885}
886
887fn extract_from_zip_sdist_archive(path: &Path) -> PackageData {
888    let file = match File::open(path) {
889        Ok(file) => file,
890        Err(e) => {
891            warn!("Failed to open zip sdist archive {:?}: {}", path, e);
892            return default_package_data(path);
893        }
894    };
895
896    let mut archive = match ZipArchive::new(file) {
897        Ok(archive) => archive,
898        Err(e) => {
899            warn!("Failed to read zip sdist archive {:?}: {}", path, e);
900            return default_package_data(path);
901        }
902    };
903
904    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "sdist zip") {
905        Ok(entries) => entries,
906        Err(_) => return default_package_data(path),
907    };
908
909    let mut entries = Vec::new();
910    for entry in validated_entries.iter() {
911        if !is_relevant_sdist_text_entry(&entry.name) {
912            continue;
913        }
914
915        if let Ok(content) = read_validated_zip_entry(&mut archive, entry, path, "sdist zip") {
916            entries.push((entry.name.clone(), content));
917        }
918    }
919
920    build_sdist_package_data(path, entries)
921}
922
/// Returns `true` when `entry_path` names one of the sdist metadata text files
/// this parser reads: a `PKG-INFO`, `requires.txt`, or `SOURCES.txt` located
/// inside some directory (the path must end with `/<file name>`).
fn is_relevant_sdist_text_entry(entry_path: &str) -> bool {
    const RELEVANT_SUFFIXES: [&str; 3] = ["/PKG-INFO", "/requires.txt", "/SOURCES.txt"];

    RELEVANT_SUFFIXES
        .iter()
        .any(|suffix| entry_path.ends_with(*suffix))
}
928
929fn build_sdist_package_data(path: &Path, entries: Vec<(String, String)>) -> PackageData {
930    let Some((metadata_path, metadata_content)) = select_sdist_pkginfo_entry(path, &entries) else {
931        warn!("No PKG-INFO file found in sdist archive {:?}", path);
932        return default_package_data(path);
933    };
934
935    let mut package_data =
936        python_parse_rfc822_content(&metadata_content, DatasourceId::PypiSdistPkginfo);
937    merge_sdist_archive_dependencies(&entries, &metadata_path, &mut package_data);
938    merge_sdist_archive_file_references(&entries, &metadata_path, &mut package_data);
939    apply_sdist_name_version_fallback(path, &mut package_data);
940    package_data.datasource_id = Some(DatasourceId::PypiSdist);
941    package_data
942}
943
944fn select_sdist_pkginfo_entry(
945    archive_path: &Path,
946    entries: &[(String, String)],
947) -> Option<(String, String)> {
948    let expected_name = sdist_archive_expected_name(archive_path);
949
950    entries
951        .iter()
952        .filter(|(entry_path, _)| entry_path.ends_with("/PKG-INFO"))
953        .min_by_key(|(entry_path, content)| {
954            let components: Vec<_> = entry_path
955                .split('/')
956                .filter(|part| !part.is_empty())
957                .collect();
958            let candidate_name = sdist_pkginfo_candidate_name(content);
959            let name_rank = if candidate_name == expected_name {
960                0
961            } else {
962                1
963            };
964            let kind_rank = sdist_pkginfo_kind_rank(entry_path);
965
966            (name_rank, kind_rank, components.len(), entry_path.clone())
967        })
968        .map(|(entry_path, content)| (entry_path.clone(), content.clone()))
969}
970
971fn has_matching_sdist_pkginfo_candidate(archive_path: &Path, entries: &[(String, String)]) -> bool {
972    let Some(expected_name) = sdist_archive_expected_name(archive_path) else {
973        return false;
974    };
975
976    entries.iter().any(|(entry_path, content)| {
977        sdist_pkginfo_kind_rank(entry_path) < 3
978            && sdist_pkginfo_candidate_name(content).as_deref() == Some(expected_name.as_str())
979    })
980}
981
982fn sdist_archive_expected_name(archive_path: &Path) -> Option<String> {
983    archive_path
984        .file_name()
985        .and_then(|name| name.to_str())
986        .and_then(strip_python_archive_extension)
987        .and_then(|stem| {
988            stem.rsplit_once('-')
989                .map(|(name, _)| normalize_python_package_name(name))
990        })
991}
992
993fn sdist_pkginfo_candidate_name(content: &str) -> Option<String> {
994    let metadata = super::rfc822::parse_rfc822_content(content);
995    super::rfc822::get_header_first(&metadata.headers, "name")
996        .map(|name| normalize_python_package_name(&name))
997}
998
/// Ranks a PKG-INFO entry path by where it sits in the archive (lower is
/// preferred):
///
/// * `0` — `<root>/<name>.egg-info/PKG-INFO`
/// * `1` — `<root>/PKG-INFO`
/// * `2` — any other path ending in `.egg-info/PKG-INFO`
/// * `3` — everything else
///
/// Empty path components (e.g. from doubled slashes) are ignored when counting
/// depth.
fn sdist_pkginfo_kind_rank(entry_path: &str) -> usize {
    let segments: Vec<&str> = entry_path
        .split('/')
        .filter(|segment| !segment.is_empty())
        .collect();

    match segments.as_slice() {
        [_, dir, "PKG-INFO"] if dir.ends_with(".egg-info") => 0,
        [_, "PKG-INFO"] => 1,
        _ if entry_path.ends_with(".egg-info/PKG-INFO") => 2,
        _ => 3,
    }
}
1016
1017fn merge_sdist_archive_dependencies(
1018    entries: &[(String, String)],
1019    metadata_path: &str,
1020    package_data: &mut PackageData,
1021) {
1022    let metadata_dir = metadata_path
1023        .rsplit_once('/')
1024        .map(|(dir, _)| dir)
1025        .unwrap_or("");
1026    let archive_root = metadata_path.split('/').next().unwrap_or("");
1027    let matched_egg_info_dir =
1028        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1029    let mut extra_dependencies = Vec::new();
1030
1031    for (entry_path, content) in entries {
1032        let is_direct_requires =
1033            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/requires.txt");
1034        let is_egg_info_requires = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1035            entry_path == &format!("{archive_root}/{egg_info_dir}/requires.txt")
1036        });
1037
1038        if is_direct_requires || is_egg_info_requires {
1039            extra_dependencies.extend(parse_requires_txt(content));
1040        }
1041    }
1042
1043    for dependency in extra_dependencies {
1044        if !package_data.dependencies.iter().any(|existing| {
1045            existing.purl == dependency.purl
1046                && existing.scope == dependency.scope
1047                && existing.extracted_requirement == dependency.extracted_requirement
1048                && existing.extra_data == dependency.extra_data
1049        }) {
1050            package_data.dependencies.push(dependency);
1051        }
1052    }
1053}
1054
1055fn merge_sdist_archive_file_references(
1056    entries: &[(String, String)],
1057    metadata_path: &str,
1058    package_data: &mut PackageData,
1059) {
1060    let metadata_dir = metadata_path
1061        .rsplit_once('/')
1062        .map(|(dir, _)| dir)
1063        .unwrap_or("");
1064    let archive_root = metadata_path.split('/').next().unwrap_or("");
1065    let matched_egg_info_dir =
1066        select_matching_sdist_egg_info_dir(entries, archive_root, package_data.name.as_deref());
1067    let mut extra_refs = Vec::new();
1068
1069    for (entry_path, content) in entries {
1070        let is_direct_sources =
1071            !metadata_dir.is_empty() && entry_path == &format!("{metadata_dir}/SOURCES.txt");
1072        let is_egg_info_sources = matched_egg_info_dir.as_ref().is_some_and(|egg_info_dir| {
1073            entry_path == &format!("{archive_root}/{egg_info_dir}/SOURCES.txt")
1074        });
1075
1076        if is_direct_sources || is_egg_info_sources {
1077            extra_refs.extend(parse_sources_txt(content));
1078        }
1079    }
1080
1081    for file_ref in extra_refs {
1082        if !package_data
1083            .file_references
1084            .iter()
1085            .any(|existing| existing.path == file_ref.path)
1086        {
1087            package_data.file_references.push(file_ref);
1088        }
1089    }
1090}
1091
1092fn select_matching_sdist_egg_info_dir(
1093    entries: &[(String, String)],
1094    archive_root: &str,
1095    package_name: Option<&str>,
1096) -> Option<String> {
1097    let normalized_package_name = package_name.map(normalize_python_package_name);
1098
1099    entries
1100        .iter()
1101        .filter_map(|(entry_path, _)| {
1102            let components: Vec<_> = entry_path
1103                .split('/')
1104                .filter(|part| !part.is_empty())
1105                .collect();
1106            if components.len() == 3
1107                && components[0] == archive_root
1108                && components[1].ends_with(".egg-info")
1109            {
1110                Some(components[1].to_string())
1111            } else {
1112                None
1113            }
1114        })
1115        .min_by_key(|egg_info_dir| {
1116            let normalized_dir_name =
1117                normalize_python_package_name(egg_info_dir.trim_end_matches(".egg-info"));
1118            let name_rank = if Some(normalized_dir_name.clone()) == normalized_package_name {
1119                0
1120            } else {
1121                1
1122            };
1123
1124            (name_rank, egg_info_dir.clone())
1125        })
1126}
1127
/// Normalizes a Python project name for comparison, following PEP 503:
/// ASCII letters are lowercased and every run of `-`, `_`, or `.` characters
/// collapses into a single `-`.
///
/// This makes spellings such as `Zope.Interface`, `zope_interface`, and
/// `zope-interface` compare equal, which matters because archive file names,
/// `.egg-info` directory names, and the `Name` metadata header routinely use
/// different separators for the same project.
fn normalize_python_package_name(name: &str) -> String {
    let mut normalized = String::with_capacity(name.len());
    let mut in_separator_run = false;

    for ch in name.chars() {
        if matches!(ch, '-' | '_' | '.') {
            // Emit one '-' for the whole run of separators.
            if !in_separator_run {
                normalized.push('-');
            }
            in_separator_run = true;
        } else {
            normalized.push(ch.to_ascii_lowercase());
            in_separator_run = false;
        }
    }

    normalized
}
1131
1132fn apply_sdist_name_version_fallback(path: &Path, package_data: &mut PackageData) {
1133    let Some(file_name) = path.file_name().and_then(|name| name.to_str()) else {
1134        return;
1135    };
1136
1137    let Some(stem) = strip_python_archive_extension(file_name) else {
1138        return;
1139    };
1140
1141    let Some((name, version)) = stem.rsplit_once('-') else {
1142        return;
1143    };
1144
1145    if package_data.name.is_none() {
1146        package_data.name = Some(name.replace('_', "-"));
1147    }
1148    if package_data.version.is_none() {
1149        package_data.version = Some(version.to_string());
1150    }
1151
1152    if package_data.purl.is_none()
1153        || package_data.repository_homepage_url.is_none()
1154        || package_data.repository_download_url.is_none()
1155        || package_data.api_data_url.is_none()
1156    {
1157        let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1158            build_pypi_urls(
1159                package_data.name.as_deref(),
1160                package_data.version.as_deref(),
1161            );
1162
1163        if package_data.repository_homepage_url.is_none() {
1164            package_data.repository_homepage_url = repository_homepage_url;
1165        }
1166        if package_data.repository_download_url.is_none() {
1167            package_data.repository_download_url = repository_download_url;
1168        }
1169        if package_data.api_data_url.is_none() {
1170            package_data.api_data_url = api_data_url;
1171        }
1172        if package_data.purl.is_none() {
1173            package_data.purl = purl;
1174        }
1175    }
1176}
1177
1178fn extract_from_wheel_archive(path: &Path) -> PackageData {
1179    let metadata = match std::fs::metadata(path) {
1180        Ok(m) => m,
1181        Err(e) => {
1182            warn!(
1183                "Failed to read metadata for wheel archive {:?}: {}",
1184                path, e
1185            );
1186            return default_package_data(path);
1187        }
1188    };
1189
1190    if metadata.len() > MAX_ARCHIVE_SIZE {
1191        warn!(
1192            "Wheel archive too large: {} bytes (limit: {} bytes)",
1193            metadata.len(),
1194            MAX_ARCHIVE_SIZE
1195        );
1196        return default_package_data(path);
1197    }
1198
1199    let file = match File::open(path) {
1200        Ok(f) => f,
1201        Err(e) => {
1202            warn!("Failed to open wheel archive {:?}: {}", path, e);
1203            return default_package_data(path);
1204        }
1205    };
1206
1207    let mut archive = match ZipArchive::new(file) {
1208        Ok(a) => a,
1209        Err(e) => {
1210            warn!("Failed to read wheel archive {:?}: {}", path, e);
1211            return default_package_data(path);
1212        }
1213    };
1214
1215    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "wheel") {
1216        Ok(entries) => entries,
1217        Err(_) => return default_package_data(path),
1218    };
1219
1220    let metadata_entry =
1221        match find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/METADATA") {
1222            Some(entry) => entry,
1223            None => {
1224                warn!("No METADATA file found in wheel archive {:?}", path);
1225                return default_package_data(path);
1226            }
1227        };
1228
1229    let content = match read_validated_zip_entry(&mut archive, metadata_entry, path, "wheel") {
1230        Ok(c) => c,
1231        Err(e) => {
1232            warn!("Failed to read METADATA from {:?}: {}", path, e);
1233            return default_package_data(path);
1234        }
1235    };
1236
1237    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
1238
1239    let (size, sha256) = calculate_file_checksums(path);
1240    package_data.size = size;
1241    package_data.sha256 = sha256;
1242
1243    if let Some(record_entry) =
1244        find_validated_zip_entry_by_suffix(&validated_entries, ".dist-info/RECORD")
1245        && let Ok(record_content) =
1246            read_validated_zip_entry(&mut archive, record_entry, path, "wheel")
1247    {
1248        package_data.file_references = parse_record_csv(&record_content);
1249    }
1250
1251    if let Some(wheel_info) = parse_wheel_filename(path) {
1252        if package_data.name.is_none() {
1253            package_data.name = Some(wheel_info.name.clone());
1254        }
1255        if package_data.version.is_none() {
1256            package_data.version = Some(wheel_info.version.clone());
1257        }
1258
1259        package_data.qualifiers = Some(std::collections::HashMap::from([(
1260            "extension".to_string(),
1261            format!(
1262                "{}-{}-{}",
1263                wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1264            ),
1265        )]));
1266
1267        package_data.purl = build_wheel_purl(
1268            package_data.name.as_deref(),
1269            package_data.version.as_deref(),
1270            &wheel_info,
1271        );
1272
1273        let mut extra_data = package_data.extra_data.unwrap_or_default();
1274        extra_data.insert(
1275            "python_requires".to_string(),
1276            serde_json::Value::String(wheel_info.python_tag.clone()),
1277        );
1278        extra_data.insert(
1279            "abi_tag".to_string(),
1280            serde_json::Value::String(wheel_info.abi_tag.clone()),
1281        );
1282        extra_data.insert(
1283            "platform_tag".to_string(),
1284            serde_json::Value::String(wheel_info.platform_tag.clone()),
1285        );
1286        package_data.extra_data = Some(extra_data);
1287    }
1288
1289    package_data
1290}
1291
1292fn extract_from_egg_archive(path: &Path) -> PackageData {
1293    let metadata = match std::fs::metadata(path) {
1294        Ok(m) => m,
1295        Err(e) => {
1296            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
1297            return default_package_data(path);
1298        }
1299    };
1300
1301    if metadata.len() > MAX_ARCHIVE_SIZE {
1302        warn!(
1303            "Egg archive too large: {} bytes (limit: {} bytes)",
1304            metadata.len(),
1305            MAX_ARCHIVE_SIZE
1306        );
1307        return default_package_data(path);
1308    }
1309
1310    let file = match File::open(path) {
1311        Ok(f) => f,
1312        Err(e) => {
1313            warn!("Failed to open egg archive {:?}: {}", path, e);
1314            return default_package_data(path);
1315        }
1316    };
1317
1318    let mut archive = match ZipArchive::new(file) {
1319        Ok(a) => a,
1320        Err(e) => {
1321            warn!("Failed to read egg archive {:?}: {}", path, e);
1322            return default_package_data(path);
1323        }
1324    };
1325
1326    let validated_entries = match collect_validated_zip_entries(&mut archive, path, "egg") {
1327        Ok(entries) => entries,
1328        Err(_) => return default_package_data(path),
1329    };
1330
1331    let pkginfo_entry = match find_validated_zip_entry_by_any_suffix(
1332        &validated_entries,
1333        &["EGG-INFO/PKG-INFO", ".egg-info/PKG-INFO"],
1334    ) {
1335        Some(entry) => entry,
1336        None => {
1337            warn!("No PKG-INFO file found in egg archive {:?}", path);
1338            return default_package_data(path);
1339        }
1340    };
1341
1342    let content = match read_validated_zip_entry(&mut archive, pkginfo_entry, path, "egg") {
1343        Ok(c) => c,
1344        Err(e) => {
1345            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
1346            return default_package_data(path);
1347        }
1348    };
1349
1350    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
1351
1352    let (size, sha256) = calculate_file_checksums(path);
1353    package_data.size = size;
1354    package_data.sha256 = sha256;
1355
1356    if let Some(installed_files_entry) = find_validated_zip_entry_by_any_suffix(
1357        &validated_entries,
1358        &[
1359            "EGG-INFO/installed-files.txt",
1360            ".egg-info/installed-files.txt",
1361        ],
1362    ) && let Ok(installed_files_content) =
1363        read_validated_zip_entry(&mut archive, installed_files_entry, path, "egg")
1364    {
1365        package_data.file_references = parse_installed_files_txt(&installed_files_content);
1366    }
1367
1368    if let Some(egg_info) = parse_egg_filename(path) {
1369        if package_data.name.is_none() {
1370            package_data.name = Some(egg_info.name.clone());
1371        }
1372        if package_data.version.is_none() {
1373            package_data.version = Some(egg_info.version.clone());
1374        }
1375
1376        if let Some(python_version) = &egg_info.python_version {
1377            let mut extra_data = package_data.extra_data.unwrap_or_default();
1378            extra_data.insert(
1379                "python_version".to_string(),
1380                serde_json::Value::String(python_version.clone()),
1381            );
1382            package_data.extra_data = Some(extra_data);
1383        }
1384    }
1385
1386    package_data.purl = build_egg_purl(
1387        package_data.name.as_deref(),
1388        package_data.version.as_deref(),
1389    );
1390
1391    package_data
1392}
1393
1394fn find_validated_zip_entry_by_suffix<'a>(
1395    entries: &'a [ValidatedZipEntry],
1396    suffix: &str,
1397) -> Option<&'a ValidatedZipEntry> {
1398    entries.iter().find(|entry| entry.name.ends_with(suffix))
1399}
1400
1401fn find_validated_zip_entry_by_any_suffix<'a>(
1402    entries: &'a [ValidatedZipEntry],
1403    suffixes: &[&str],
1404) -> Option<&'a ValidatedZipEntry> {
1405    entries
1406        .iter()
1407        .find(|entry| suffixes.iter().any(|suffix| entry.name.ends_with(suffix)))
1408}
1409
1410fn read_validated_zip_entry<R: Read + std::io::Seek>(
1411    archive: &mut ZipArchive<R>,
1412    entry: &ValidatedZipEntry,
1413    path: &Path,
1414    archive_type: &str,
1415) -> Result<String, String> {
1416    let mut file = archive
1417        .by_index(entry.index)
1418        .map_err(|e| format!("Failed to find entry {}: {}", entry.name, e))?;
1419
1420    let compressed_size = file.compressed_size();
1421    let uncompressed_size = file.size();
1422
1423    if compressed_size > 0 {
1424        let ratio = uncompressed_size as f64 / compressed_size as f64;
1425        if ratio > MAX_COMPRESSION_RATIO {
1426            return Err(format!(
1427                "Rejected suspicious compression ratio in {} {:?}: {:.2}:1",
1428                archive_type, path, ratio
1429            ));
1430        }
1431    }
1432
1433    if uncompressed_size > MAX_FILE_SIZE {
1434        return Err(format!(
1435            "Rejected oversized entry in {} {:?}: {} bytes",
1436            archive_type, path, uncompressed_size
1437        ));
1438    }
1439
1440    read_limited_utf8(
1441        &mut file,
1442        MAX_FILE_SIZE,
1443        &format!("{} entry {}", archive_type, entry.name),
1444    )
1445}
1446
/// Reads `reader` to the end as UTF-8 text, refusing to accept more than
/// `max_bytes` bytes.
///
/// At most `max_bytes + 1` bytes are pulled from the reader: the extra byte is
/// what reveals that the source is longer than allowed without buffering all
/// of it. The addition saturates, so `max_bytes == u64::MAX` means "no limit"
/// rather than overflowing.
///
/// `context` is a human-readable label used in error messages.
///
/// # Errors
/// Returns a message when the underlying read fails, when more than
/// `max_bytes` bytes are available, or when the data is not valid UTF-8.
fn read_limited_utf8<R: Read>(
    reader: &mut R,
    max_bytes: u64,
    context: &str,
) -> Result<String, String> {
    let mut limited = reader.take(max_bytes.saturating_add(1));
    let mut bytes = Vec::new();
    limited
        .read_to_end(&mut bytes)
        .map_err(|e| format!("Failed to read {}: {}", context, e))?;

    if bytes.len() as u64 > max_bytes {
        return Err(format!(
            "{} exceeded {} byte limit while reading",
            context, max_bytes
        ));
    }

    String::from_utf8(bytes).map_err(|e| format!("{} is not valid UTF-8: {}", context, e))
}
1467
/// Normalizes an archive entry path to a safe, `/`-separated relative path.
///
/// Backslashes are treated as separators. The path is rejected (`None`) when
/// it starts with a drive prefix such as `C:/`, is rooted, contains a `..`
/// component, or has no normal components at all. `.` components are dropped.
fn normalize_archive_entry_path(entry_path: &str) -> Option<String> {
    let unified = entry_path.replace('\\', "/");

    // Explicit check so drive-letter paths are rejected on every platform.
    if let [drive, b':', b'/', ..] = unified.as_bytes() {
        if drive.is_ascii_alphabetic() {
            return None;
        }
    }

    let mut segments: Vec<String> = Vec::new();
    for part in Path::new(&unified).components() {
        match part {
            Component::CurDir => continue,
            Component::Normal(segment) => segments.push(segment.to_string_lossy().into_owned()),
            Component::Prefix(_) | Component::RootDir | Component::ParentDir => return None,
        }
    }

    if segments.is_empty() {
        None
    } else {
        Some(segments.join("/"))
    }
}
1489
1490/// Parses RECORD CSV format from wheel archives (PEP 427).
1491/// Format: path,hash,size (3 columns, no header)
1492/// Hash format: sha256=urlsafe_base64_hash or empty
1493/// Size: bytes as u64 or empty
1494pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
1495    let mut reader = ReaderBuilder::new()
1496        .has_headers(false)
1497        .from_reader(content.as_bytes());
1498
1499    let mut file_references = Vec::new();
1500
1501    for result in reader.records() {
1502        match result {
1503            Ok(record) => {
1504                if record.len() < 3 {
1505                    continue;
1506                }
1507
1508                let path = record.get(0).unwrap_or("").trim().to_string();
1509                if path.is_empty() {
1510                    continue;
1511                }
1512
1513                let hash_field = record.get(1).unwrap_or("").trim();
1514                let size_field = record.get(2).unwrap_or("").trim();
1515
1516                // Parse hash: format is "algorithm=value"
1517                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
1518                    let parts: Vec<&str> = hash_field.split('=').collect();
1519                    if parts.len() == 2 && parts[0] == "sha256" {
1520                        // Decode base64 to hex
1521                        match URL_SAFE_NO_PAD.decode(parts[1]) {
1522                            Ok(decoded) => {
1523                                let hex = decoded
1524                                    .iter()
1525                                    .map(|b| format!("{:02x}", b))
1526                                    .collect::<String>();
1527                                Some(hex)
1528                            }
1529                            Err(_) => None,
1530                        }
1531                    } else {
1532                        None
1533                    }
1534                } else {
1535                    None
1536                };
1537
1538                // Parse size
1539                let size = if !size_field.is_empty() && size_field != "-" {
1540                    size_field.parse::<u64>().ok()
1541                } else {
1542                    None
1543                };
1544
1545                file_references.push(FileReference {
1546                    path,
1547                    size,
1548                    sha1: None,
1549                    md5: None,
1550                    sha256,
1551                    sha512: None,
1552                    extra_data: None,
1553                });
1554            }
1555            Err(e) => {
1556                warn!("Failed to parse RECORD CSV row: {}", e);
1557                continue;
1558            }
1559        }
1560    }
1561
1562    file_references
1563}
1564
1565/// Parses installed-files.txt format from egg archives (PEP 376).
1566/// Format: one file path per line, no headers, no hash, no size
1567pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
1568    content
1569        .lines()
1570        .map(|line| line.trim())
1571        .filter(|line| !line.is_empty())
1572        .map(|path| FileReference {
1573            path: path.to_string(),
1574            size: None,
1575            sha1: None,
1576            md5: None,
1577            sha256: None,
1578            sha512: None,
1579            extra_data: None,
1580        })
1581        .collect()
1582}
1583
1584pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
1585    content
1586        .lines()
1587        .map(str::trim)
1588        .filter(|line| !line.is_empty())
1589        .map(|path| FileReference {
1590            path: path.to_string(),
1591            size: None,
1592            sha1: None,
1593            md5: None,
1594            sha256: None,
1595            sha512: None,
1596            extra_data: None,
1597        })
1598        .collect()
1599}
1600
/// Components of a wheel file name
/// (`{name}-{version}(-{build})?-{python}-{abi}-{platform}.whl`).
struct WheelInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version exactly as written in the file name.
    version: String,
    /// Python tag, e.g. `py3` or `cp311`.
    python_tag: String,
    /// ABI tag, e.g. `none` or `abi3`.
    abi_tag: String,
    /// Platform tag, e.g. `any`.
    platform_tag: String,
}

/// Splits a wheel file name into its components.
///
/// The file stem is split on `-`. At least five parts are required; otherwise
/// `None` is returned. The wheel naming convention allows an optional build
/// tag between the version and the python tag, and requires it to start with a
/// digit. When six or more parts are present and the third part starts with a
/// digit, it is treated as the build tag and skipped so that the python, ABI,
/// and platform tags are read from the correct positions. Any parts after the
/// ABI tag are joined with `-` to form the platform tag.
fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let parts: Vec<&str> = stem.split('-').collect();

    if parts.len() < 5 {
        return None;
    }

    // Python tags start with a letter, so a leading digit identifies a build tag.
    let has_build_tag =
        parts.len() >= 6 && parts[2].starts_with(|first: char| first.is_ascii_digit());
    let tags_start = if has_build_tag { 3 } else { 2 };

    Some(WheelInfo {
        name: parts[0].replace('_', "-"),
        version: parts[1].to_string(),
        python_tag: parts[tags_start].to_string(),
        abi_tag: parts[tags_start + 1].to_string(),
        platform_tag: parts[tags_start + 2..].join("-"),
    })
}
1625
/// Components of an egg file name (`{name}-{version}[-{python version}...]`).
struct EggInfo {
    /// Distribution name with `_` replaced by `-`.
    name: String,
    /// Version exactly as written in the file name.
    version: String,
    /// Third `-`-separated part of the file stem, when present.
    python_version: Option<String>,
}

/// Splits an egg file name into its components.
///
/// The file stem is split on `-`: the first part is the name, the second the
/// version, and the third (if any) the python version. Returns `None` when the
/// path has no file stem or the stem has fewer than two parts.
fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
    let stem = path.file_stem()?.to_string_lossy();
    let mut fields = stem.split('-');

    let raw_name = fields.next()?;
    let raw_version = fields.next()?;
    let python_version = fields.next().map(str::to_string);

    Some(EggInfo {
        name: raw_name.replace('_', "-"),
        version: raw_version.to_string(),
        python_version,
    })
}
1646
1647fn build_wheel_purl(
1648    name: Option<&str>,
1649    version: Option<&str>,
1650    wheel_info: &WheelInfo,
1651) -> Option<String> {
1652    let name = name?;
1653    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1654
1655    if let Some(ver) = version {
1656        package_url.with_version(ver).ok()?;
1657    }
1658
1659    let extension = format!(
1660        "{}-{}-{}",
1661        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1662    );
1663    package_url.add_qualifier("extension", extension).ok()?;
1664
1665    Some(package_url.to_string())
1666}
1667
1668fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1669    let name = name?;
1670    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1671
1672    if let Some(ver) = version {
1673        package_url.with_version(ver).ok()?;
1674    }
1675
1676    package_url.add_qualifier("type", "egg").ok()?;
1677
1678    Some(package_url.to_string())
1679}
1680
1681fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1682    let metadata = super::rfc822::parse_rfc822_content(content);
1683    build_package_data_from_rfc822(&metadata, datasource_id)
1684}
1685
1686/// Builds PackageData from parsed RFC822 metadata.
1687///
1688/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
1689/// and `python_parse_rfc822_content` (content-based) functions.
1690fn build_package_data_from_rfc822(
1691    metadata: &super::rfc822::Rfc822Metadata,
1692    datasource_id: DatasourceId,
1693) -> PackageData {
1694    use super::rfc822::{get_header_all, get_header_first};
1695
1696    let name = get_header_first(&metadata.headers, "name");
1697    let version = get_header_first(&metadata.headers, "version");
1698    let summary = get_header_first(&metadata.headers, "summary");
1699    let mut homepage_url = get_header_first(&metadata.headers, "home-page");
1700    let author = get_header_first(&metadata.headers, "author");
1701    let author_email = get_header_first(&metadata.headers, "author-email");
1702    let license = get_header_first(&metadata.headers, "license");
1703    let license_expression = get_header_first(&metadata.headers, "license-expression");
1704    let download_url = get_header_first(&metadata.headers, "download-url");
1705    let platform = get_header_first(&metadata.headers, "platform");
1706    let requires_python = get_header_first(&metadata.headers, "requires-python");
1707    let classifiers = get_header_all(&metadata.headers, "classifier");
1708    let license_files = get_header_all(&metadata.headers, "license-file");
1709
1710    let description_body = if metadata.body.is_empty() {
1711        get_header_first(&metadata.headers, "description").unwrap_or_default()
1712    } else {
1713        metadata.body.clone()
1714    };
1715
1716    let description = build_description(summary.as_deref(), &description_body);
1717
1718    let mut parties = Vec::new();
1719    if author.is_some() || author_email.is_some() {
1720        parties.push(Party {
1721            r#type: Some("person".to_string()),
1722            role: Some("author".to_string()),
1723            name: author,
1724            email: author_email,
1725            url: None,
1726            organization: None,
1727            organization_url: None,
1728            timezone: None,
1729        });
1730    }
1731
1732    let (keywords, license_classifiers) = split_classifiers(&classifiers);
1733    let referenced_license_files: Vec<&str> = license_files.iter().map(String::as_str).collect();
1734    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1735        license_expression
1736            .as_deref()
1737            .and_then(normalize_spdx_expression)
1738            .map(|normalized| {
1739                build_declared_license_data(
1740                    normalized,
1741                    DeclaredLicenseMatchMetadata::single_line(
1742                        license_expression.as_deref().unwrap_or_default(),
1743                    )
1744                    .with_referenced_filenames(&referenced_license_files),
1745                )
1746            })
1747            .unwrap_or_else(|| normalize_spdx_declared_license(license_expression.as_deref()));
1748
1749    let extracted_license_statement = license_expression
1750        .clone()
1751        .or_else(|| build_extracted_license_statement(license.as_deref(), &license_classifiers));
1752
1753    let mut extra_data = HashMap::new();
1754    if let Some(platform_value) = platform
1755        && !platform_value.eq_ignore_ascii_case("unknown")
1756        && !platform_value.is_empty()
1757    {
1758        extra_data.insert(
1759            "platform".to_string(),
1760            serde_json::Value::String(platform_value),
1761        );
1762    }
1763
1764    if let Some(requires_python_value) = requires_python
1765        && !requires_python_value.is_empty()
1766    {
1767        extra_data.insert(
1768            "requires_python".to_string(),
1769            serde_json::Value::String(requires_python_value),
1770        );
1771    }
1772
1773    if !license_files.is_empty() {
1774        extra_data.insert(
1775            "license_files".to_string(),
1776            serde_json::Value::Array(
1777                license_files
1778                    .iter()
1779                    .cloned()
1780                    .map(serde_json::Value::String)
1781                    .collect(),
1782            ),
1783        );
1784    }
1785
1786    let file_references = license_files
1787        .iter()
1788        .map(|path| FileReference {
1789            path: path.clone(),
1790            size: None,
1791            sha1: None,
1792            md5: None,
1793            sha256: None,
1794            sha512: None,
1795            extra_data: None,
1796        })
1797        .collect();
1798
1799    let project_urls = get_header_all(&metadata.headers, "project-url");
1800    let dependencies = extract_rfc822_dependencies(&metadata.headers);
1801    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1802
1803    if !project_urls.is_empty() {
1804        let parsed_urls = parse_project_urls(&project_urls);
1805
1806        for (label, url) in &parsed_urls {
1807            let label_lower = label.to_lowercase();
1808
1809            if bug_tracking_url.is_none()
1810                && matches!(
1811                    label_lower.as_str(),
1812                    "tracker"
1813                        | "bug reports"
1814                        | "bug tracker"
1815                        | "issues"
1816                        | "issue tracker"
1817                        | "github: issues"
1818                )
1819            {
1820                bug_tracking_url = Some(url.clone());
1821            } else if code_view_url.is_none()
1822                && matches!(label_lower.as_str(), "source" | "source code" | "code")
1823            {
1824                code_view_url = Some(url.clone());
1825            } else if vcs_url.is_none()
1826                && matches!(
1827                    label_lower.as_str(),
1828                    "github" | "gitlab" | "github: repo" | "repository"
1829                )
1830            {
1831                vcs_url = Some(url.clone());
1832            } else if homepage_url.is_none()
1833                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1834            {
1835                homepage_url = Some(url.clone());
1836            } else if label_lower == "changelog" {
1837                extra_data.insert(
1838                    "changelog_url".to_string(),
1839                    serde_json::Value::String(url.clone()),
1840                );
1841            }
1842        }
1843
1844        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
1845            .iter()
1846            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
1847            .collect();
1848
1849        if !project_urls_json.is_empty() {
1850            extra_data.insert(
1851                "project_urls".to_string(),
1852                serde_json::Value::Object(project_urls_json),
1853            );
1854        }
1855    }
1856
1857    let extra_data = if extra_data.is_empty() {
1858        None
1859    } else {
1860        Some(extra_data)
1861    };
1862
1863    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1864        build_pypi_urls(name.as_deref(), version.as_deref());
1865
1866    PackageData {
1867        package_type: Some(PythonParser::PACKAGE_TYPE),
1868        namespace: None,
1869        name,
1870        version,
1871        qualifiers: None,
1872        subpath: None,
1873        primary_language: Some("Python".to_string()),
1874        description,
1875        release_date: None,
1876        parties,
1877        keywords,
1878        homepage_url,
1879        download_url,
1880        size: None,
1881        sha1: None,
1882        md5: None,
1883        sha256: None,
1884        sha512: None,
1885        bug_tracking_url,
1886        code_view_url,
1887        vcs_url,
1888        copyright: None,
1889        holder: None,
1890        declared_license_expression,
1891        declared_license_expression_spdx,
1892        license_detections,
1893        other_license_expression: None,
1894        other_license_expression_spdx: None,
1895        other_license_detections: Vec::new(),
1896        extracted_license_statement,
1897        notice_text: None,
1898        source_packages: Vec::new(),
1899        file_references,
1900        is_private: false,
1901        is_virtual: false,
1902        extra_data,
1903        dependencies,
1904        repository_homepage_url,
1905        repository_download_url,
1906        api_data_url,
1907        datasource_id: Some(datasource_id),
1908        purl,
1909    }
1910}
1911
/// Splits raw `Project-URL` header values into `(label, url)` pairs.
///
/// Each entry is expected to look like `Label, https://example.org`. The
/// entry is split at the first comma and both halves are trimmed, so the
/// whitespace after the comma is optional (`Label,https://example.org` is
/// accepted as well). Entries without a comma, or whose label or URL is
/// empty after trimming, are skipped. Input order is preserved.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|entry| {
            // Split on the bare comma rather than ", " so that entries
            // written without a space after the separator are not dropped.
            let (label, url) = entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());
            if label.is_empty() || url.is_empty() {
                return None;
            }
            Some((label.to_string(), url.to_string()))
        })
        .collect()
}
1927
/// Combines the optional summary and the metadata body into one description.
///
/// Both pieces are trimmed; blank pieces are left out. The remaining pieces
/// are joined with a single newline, summary first. Returns `None` when both
/// pieces are blank or absent.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    // A missing summary behaves exactly like a blank one.
    let sections: Vec<&str> = [summary.unwrap_or(""), body]
        .iter()
        .map(|section| section.trim())
        .filter(|section| !section.is_empty())
        .collect();

    match sections.len() {
        0 => None,
        _ => Some(sections.join("\n")),
    }
}
1946
/// Separates trove classifiers into `(keywords, license_classifiers)`.
///
/// A classifier starting with `License ::` goes into the second vector;
/// every other classifier goes into the first. Relative order within each
/// vector follows the input order.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|classifier| classifier.starts_with("License ::"));

    (keywords, license_classifiers)
}
1961
/// Renders the declared license and license classifiers as a small text block.
///
/// The output has a `license: <value>` line when `license` is non-blank,
/// followed by a `classifiers:` header and one `  - '<classifier>'` line per
/// license classifier when any are present. Every line ends with a newline.
/// Returns `None` when there is nothing to render.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let mut statement = String::new();

    let declared = license.map(str::trim).filter(|text| !text.is_empty());
    if let Some(text) = declared {
        statement.push_str(&format!("license: {}\n", text));
    }

    if !license_classifiers.is_empty() {
        statement.push_str("classifiers:\n");
        for classifier in license_classifiers {
            statement.push_str(&format!("  - '{}'\n", classifier));
        }
    }

    if statement.is_empty() {
        None
    } else {
        Some(statement)
    }
}
1987
1988pub(crate) fn build_pypi_urls(
1989    name: Option<&str>,
1990    version: Option<&str>,
1991) -> (
1992    Option<String>,
1993    Option<String>,
1994    Option<String>,
1995    Option<String>,
1996) {
1997    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
1998
1999    let repository_download_url = name.and_then(|value| {
2000        version.map(|ver| {
2001            format!(
2002                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2003                &value[..1.min(value.len())],
2004                value,
2005                value,
2006                ver
2007            )
2008        })
2009    });
2010
2011    let api_data_url = name.map(|value| {
2012        if let Some(ver) = version {
2013            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
2014        } else {
2015            format!("https://pypi.org/pypi/{}/json", value)
2016        }
2017    });
2018
2019    let purl = name.and_then(|value| {
2020        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
2021        if let Some(ver) = version {
2022            package_url.with_version(ver).ok()?;
2023        }
2024        Some(package_url.to_string())
2025    });
2026
2027    (
2028        repository_homepage_url,
2029        repository_download_url,
2030        api_data_url,
2031        purl,
2032    )
2033}
2034
2035fn build_pypi_purl_with_extension(
2036    name: &str,
2037    version: Option<&str>,
2038    extension: &str,
2039) -> Option<String> {
2040    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2041    if let Some(ver) = version {
2042        package_url.with_version(ver).ok()?;
2043    }
2044    package_url.add_qualifier("extension", extension).ok()?;
2045    Some(package_url.to_string())
2046}
2047
2048fn extract_from_pyproject_toml(path: &Path) -> PackageData {
2049    let toml_content = match read_toml_file(path) {
2050        Ok(content) => content,
2051        Err(e) => {
2052            warn!(
2053                "Failed to read or parse pyproject.toml at {:?}: {}",
2054                path, e
2055            );
2056            return default_package_data(path);
2057        }
2058    };
2059
2060    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
2061    let is_poetry_pyproject = tool_table
2062        .and_then(|tool| tool.get("poetry"))
2063        .and_then(|value| value.as_table())
2064        .is_some();
2065
2066    // Handle both PEP 621 (project table) and poetry formats
2067    let project_table =
2068        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
2069            // Standard PEP 621 format with [project] table
2070            project.clone()
2071        } else if let Some(tool) = tool_table {
2072            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
2073                // Poetry format with [tool.poetry] table
2074                poetry.clone()
2075            } else {
2076                return default_package_data(path);
2077            }
2078        } else if toml_content.get(FIELD_NAME).is_some() {
2079            // Other format with top-level fields
2080            match toml_content.as_table() {
2081                Some(table) => table.clone(),
2082                None => {
2083                    warn!("Failed to convert TOML content to table in {:?}", path);
2084                    return default_package_data(path);
2085                }
2086            }
2087        } else {
2088            return default_package_data(path);
2089        };
2090
2091    let name = project_table
2092        .get(FIELD_NAME)
2093        .and_then(|v| v.as_str())
2094        .map(String::from);
2095
2096    let version = project_table
2097        .get(FIELD_VERSION)
2098        .and_then(|v| v.as_str())
2099        .map(String::from);
2100    let classifiers = project_table
2101        .get("classifiers")
2102        .and_then(|value| value.as_array())
2103        .map(|values| {
2104            values
2105                .iter()
2106                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
2107                .collect::<Vec<_>>()
2108        })
2109        .unwrap_or_default();
2110
2111    let extracted_license_statement = extract_raw_license_string(&project_table);
2112    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
2113        normalize_spdx_declared_license(extract_license_expression_candidate(&project_table));
2114
2115    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
2116    let (homepage_url, repository_url) = extract_urls(&project_table);
2117
2118    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
2119    let extra_data = extract_pyproject_extra_data(&toml_content);
2120
2121    // Create package URL
2122    let purl = name.as_ref().and_then(|n| {
2123        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
2124            Ok(p) => p,
2125            Err(e) => {
2126                warn!(
2127                    "Failed to create PackageUrl for Python package '{}': {}",
2128                    n, e
2129                );
2130                return None;
2131            }
2132        };
2133
2134        if let Some(v) = &version
2135            && let Err(e) = package_url.with_version(v)
2136        {
2137            warn!(
2138                "Failed to set version '{}' for Python package '{}': {}",
2139                v, n, e
2140            );
2141            return None;
2142        }
2143
2144        Some(package_url.to_string())
2145    });
2146
2147    let api_data_url = name.as_ref().map(|n| {
2148        if let Some(v) = &version {
2149            format!("https://pypi.org/pypi/{}/{}/json", n, v)
2150        } else {
2151            format!("https://pypi.org/pypi/{}/json", n)
2152        }
2153    });
2154
2155    let pypi_homepage_url = name
2156        .as_ref()
2157        .map(|n| format!("https://pypi.org/project/{}", n));
2158
2159    let pypi_download_url = name.as_ref().and_then(|n| {
2160        version.as_ref().map(|v| {
2161            format!(
2162                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
2163                &n[..1.min(n.len())],
2164                n,
2165                n,
2166                v
2167            )
2168        })
2169    });
2170
2171    PackageData {
2172        package_type: Some(PythonParser::PACKAGE_TYPE),
2173        namespace: None,
2174        name,
2175        version,
2176        qualifiers: None,
2177        subpath: None,
2178        primary_language: None,
2179        description: None,
2180        release_date: None,
2181        parties: extract_parties(&project_table),
2182        keywords: Vec::new(),
2183        homepage_url: homepage_url.or(pypi_homepage_url),
2184        download_url: repository_url.clone().or(pypi_download_url),
2185        size: None,
2186        sha1: None,
2187        md5: None,
2188        sha256: None,
2189        sha512: None,
2190        bug_tracking_url: None,
2191        code_view_url: None,
2192        vcs_url: repository_url,
2193        copyright: None,
2194        holder: None,
2195        declared_license_expression,
2196        declared_license_expression_spdx,
2197        license_detections,
2198        other_license_expression: None,
2199        other_license_expression_spdx: None,
2200        other_license_detections: Vec::new(),
2201        extracted_license_statement,
2202        notice_text: None,
2203        source_packages: Vec::new(),
2204        file_references: Vec::new(),
2205        is_private: has_private_classifier(&classifiers),
2206        is_virtual: false,
2207        extra_data,
2208        dependencies: [dependencies, optional_dependencies].concat(),
2209        repository_homepage_url: None,
2210        repository_download_url: None,
2211        api_data_url,
2212        datasource_id: Some(if is_poetry_pyproject {
2213            DatasourceId::PypiPoetryPyprojectToml
2214        } else {
2215            DatasourceId::PypiPyprojectToml
2216        }),
2217        purl,
2218    }
2219}
2220
2221fn detect_pkg_info_datasource_id(path: &Path) -> DatasourceId {
2222    let path_str = path.to_string_lossy().replace('\\', "/");
2223    if path_str.contains("/EGG-INFO/PKG-INFO") {
2224        DatasourceId::PypiEggPkginfo
2225    } else if path_str.ends_with(".egg-info/PKG-INFO") {
2226        DatasourceId::PypiEditableEggPkginfo
2227    } else {
2228        DatasourceId::PypiSdistPkginfo
2229    }
2230}
2231
2232fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
2233    project
2234        .get(FIELD_LICENSE)
2235        .and_then(|license_value| match license_value {
2236            TomlValue::String(license_str) => Some(license_str.clone()),
2237            TomlValue::Table(license_table) => license_table
2238                .get("text")
2239                .and_then(|v| v.as_str())
2240                .map(|s| s.to_string())
2241                .or_else(|| {
2242                    license_table
2243                        .get("expression")
2244                        .and_then(|v| v.as_str())
2245                        .map(|expr| expr.to_string())
2246                }),
2247            _ => None,
2248        })
2249}
2250
2251fn extract_license_expression_candidate(project: &TomlMap<String, TomlValue>) -> Option<&str> {
2252    match project.get(FIELD_LICENSE) {
2253        Some(TomlValue::String(license_str)) => Some(license_str.as_str()),
2254        Some(TomlValue::Table(license_table)) => license_table
2255            .get("expression")
2256            .and_then(|value| value.as_str()),
2257        _ => None,
2258    }
2259}
2260
2261fn extract_urls(project: &TomlMap<String, TomlValue>) -> (Option<String>, Option<String>) {
2262    let mut homepage_url = None;
2263    let mut repository_url = None;
2264
2265    // Check for URLs table
2266    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
2267        homepage_url = urls
2268            .get(FIELD_HOMEPAGE)
2269            .and_then(|v| v.as_str())
2270            .map(String::from);
2271        repository_url = urls
2272            .get(FIELD_REPOSITORY)
2273            .and_then(|v| v.as_str())
2274            .map(String::from);
2275    }
2276
2277    // If not found in URLs table, check for top-level keys
2278    if homepage_url.is_none() {
2279        homepage_url = project
2280            .get(FIELD_HOMEPAGE)
2281            .and_then(|v| v.as_str())
2282            .map(String::from);
2283    }
2284
2285    if repository_url.is_none() {
2286        repository_url = project
2287            .get(FIELD_REPOSITORY)
2288            .and_then(|v| v.as_str())
2289            .map(String::from);
2290    }
2291
2292    (homepage_url, repository_url)
2293}
2294
2295fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
2296    let mut parties = Vec::new();
2297
2298    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
2299        for author in authors {
2300            if let Some(author_str) = author.as_str() {
2301                let (name, email) = split_name_email(author_str);
2302                parties.push(Party {
2303                    r#type: None,
2304                    role: Some("author".to_string()),
2305                    name,
2306                    email,
2307                    url: None,
2308                    organization: None,
2309                    organization_url: None,
2310                    timezone: None,
2311                });
2312            }
2313        }
2314    }
2315
2316    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
2317        for maintainer in maintainers {
2318            if let Some(maintainer_str) = maintainer.as_str() {
2319                let (name, email) = split_name_email(maintainer_str);
2320                parties.push(Party {
2321                    r#type: None,
2322                    role: Some("maintainer".to_string()),
2323                    name,
2324                    email,
2325                    url: None,
2326                    organization: None,
2327                    organization_url: None,
2328                    timezone: None,
2329                });
2330            }
2331        }
2332    }
2333
2334    parties
2335}
2336
2337fn extract_dependencies(
2338    project: &TomlMap<String, TomlValue>,
2339    toml_content: &TomlValue,
2340) -> (Vec<Dependency>, Vec<Dependency>) {
2341    let mut dependencies = Vec::new();
2342    let mut optional_dependencies = Vec::new();
2343
2344    // Handle dependencies - can be array or table format
2345    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
2346        match deps_value {
2347            TomlValue::Array(arr) => {
2348                dependencies = parse_dependency_array(arr, false, None);
2349            }
2350            TomlValue::Table(table) => {
2351                dependencies = parse_dependency_table(table, false, None);
2352            }
2353            _ => {}
2354        }
2355    }
2356
2357    // Handle PEP 621 optional-dependencies with scope
2358    if let Some(opt_deps_table) = project
2359        .get(FIELD_OPTIONAL_DEPENDENCIES)
2360        .and_then(|v| v.as_table())
2361    {
2362        for (extra_name, deps) in opt_deps_table {
2363            match deps {
2364                TomlValue::Array(arr) => {
2365                    optional_dependencies.extend(parse_dependency_array(
2366                        arr,
2367                        true,
2368                        Some(extra_name),
2369                    ));
2370                }
2371                TomlValue::Table(table) => {
2372                    optional_dependencies.extend(parse_dependency_table(
2373                        table,
2374                        true,
2375                        Some(extra_name),
2376                    ));
2377                }
2378                _ => {}
2379            }
2380        }
2381    }
2382
2383    // Handle Poetry dev-dependencies
2384    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
2385        match dev_deps_value {
2386            TomlValue::Array(arr) => {
2387                optional_dependencies.extend(parse_dependency_array(
2388                    arr,
2389                    true,
2390                    Some(FIELD_DEV_DEPENDENCIES),
2391                ));
2392            }
2393            TomlValue::Table(table) => {
2394                optional_dependencies.extend(parse_dependency_table(
2395                    table,
2396                    true,
2397                    Some(FIELD_DEV_DEPENDENCIES),
2398                ));
2399            }
2400            _ => {}
2401        }
2402    }
2403
2404    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
2405    if let Some(groups_table) = project.get("group").and_then(|v| v.as_table()) {
2406        for (group_name, group_data) in groups_table {
2407            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
2408                match group_deps {
2409                    TomlValue::Array(arr) => {
2410                        optional_dependencies.extend(parse_dependency_array(
2411                            arr,
2412                            true,
2413                            Some(group_name),
2414                        ));
2415                    }
2416                    TomlValue::Table(table) => {
2417                        optional_dependencies.extend(parse_dependency_table(
2418                            table,
2419                            true,
2420                            Some(group_name),
2421                        ));
2422                    }
2423                    _ => {}
2424                }
2425            }
2426        }
2427    }
2428
2429    if let Some(groups_table) = toml_content
2430        .get(FIELD_DEPENDENCY_GROUPS)
2431        .and_then(|value| value.as_table())
2432    {
2433        for (group_name, deps) in groups_table {
2434            match deps {
2435                TomlValue::Array(arr) => {
2436                    optional_dependencies.extend(parse_dependency_array(
2437                        arr,
2438                        true,
2439                        Some(group_name),
2440                    ));
2441                }
2442                TomlValue::Table(table) => {
2443                    optional_dependencies.extend(parse_dependency_table(
2444                        table,
2445                        true,
2446                        Some(group_name),
2447                    ));
2448                }
2449                _ => {}
2450            }
2451        }
2452    }
2453
2454    if let Some(dev_deps_value) = toml_content
2455        .get("tool")
2456        .and_then(|value| value.as_table())
2457        .and_then(|tool| tool.get("uv"))
2458        .and_then(|value| value.as_table())
2459        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
2460    {
2461        match dev_deps_value {
2462            TomlValue::Array(arr) => {
2463                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
2464            }
2465            TomlValue::Table(table) => {
2466                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
2467            }
2468            _ => {}
2469        }
2470    }
2471
2472    (dependencies, optional_dependencies)
2473}
2474
2475fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
2476    let mut extra_data = HashMap::new();
2477
2478    if let Some(tool_uv) = toml_content
2479        .get("tool")
2480        .and_then(|value| value.as_table())
2481        .and_then(|tool| tool.get("uv"))
2482    {
2483        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
2484    }
2485
2486    if extra_data.is_empty() {
2487        None
2488    } else {
2489        Some(extra_data)
2490    }
2491}
2492
2493fn toml_value_to_json(value: &TomlValue) -> JsonValue {
2494    match value {
2495        TomlValue::String(value) => JsonValue::String(value.clone()),
2496        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
2497        TomlValue::Float(value) => JsonValue::String(value.to_string()),
2498        TomlValue::Boolean(value) => JsonValue::Bool(*value),
2499        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
2500        TomlValue::Array(values) => {
2501            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
2502        }
2503        TomlValue::Table(values) => JsonValue::Object(
2504            values
2505                .iter()
2506                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
2507                .collect::<JsonMap<String, JsonValue>>(),
2508        ),
2509    }
2510}
2511
2512fn parse_dependency_table(
2513    table: &TomlMap<String, TomlValue>,
2514    is_optional: bool,
2515    scope: Option<&str>,
2516) -> Vec<Dependency> {
2517    table
2518        .iter()
2519        .filter_map(|(name, version)| {
2520            let version_str = version.as_str().map(|s| s.to_string());
2521            let mut package_url =
2522                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2523
2524            if let Some(v) = &version_str {
2525                package_url.with_version(v).ok()?;
2526            }
2527
2528            Some(Dependency {
2529                purl: Some(package_url.to_string()),
2530                extracted_requirement: None,
2531                scope: scope.map(|s| s.to_string()),
2532                is_runtime: Some(!is_optional),
2533                is_optional: Some(is_optional),
2534                is_pinned: None,
2535                is_direct: Some(true),
2536                resolved_package: None,
2537                extra_data: None,
2538            })
2539        })
2540        .collect()
2541}
2542
2543fn parse_dependency_array(
2544    array: &[TomlValue],
2545    is_optional: bool,
2546    scope: Option<&str>,
2547) -> Vec<Dependency> {
2548    array
2549        .iter()
2550        .filter_map(|dep| {
2551            let dep_str = dep.as_str()?;
2552
2553            let mut parts = dep_str.split(['>', '=', '<', '~']);
2554            let name = parts.next()?.trim().to_string();
2555
2556            let version = parts.next().map(|v| v.trim().to_string());
2557
2558            let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name)
2559            {
2560                Ok(purl) => purl,
2561                Err(_) => return None,
2562            };
2563
2564            if let Some(ref v) = version {
2565                package_url.with_version(v).ok()?;
2566            }
2567
2568            Some(Dependency {
2569                purl: Some(package_url.to_string()),
2570                extracted_requirement: None,
2571                scope: scope.map(|s| s.to_string()),
2572                is_runtime: Some(!is_optional),
2573                is_optional: Some(is_optional),
2574                is_pinned: None,
2575                is_direct: Some(true),
2576                resolved_package: None,
2577                extra_data: None,
2578            })
2579        })
2580        .collect()
2581}
2582
/// A literal value produced by [`LiteralEvaluator`].
#[derive(Clone, Debug)]
enum Value {
    /// A string literal.
    String(String),
    /// A numeric literal; integers and floats are both stored as `f64`.
    Number(f64),
    /// A boolean literal.
    Bool(bool),
    /// The `None` literal.
    None,
    /// A list whose elements were all evaluated.
    List(Vec<Value>),
    /// A tuple whose elements were all evaluated.
    Tuple(Vec<Value>),
    /// A dictionary keyed by the string form of its evaluated keys.
    Dict(HashMap<String, Value>),
}
2593
2594struct LiteralEvaluator {
2595    constants: HashMap<String, Value>,
2596    max_depth: usize,
2597    max_nodes: usize,
2598    nodes_visited: usize,
2599}
2600
2601impl LiteralEvaluator {
2602    fn new(constants: HashMap<String, Value>) -> Self {
2603        Self {
2604            constants,
2605            max_depth: MAX_SETUP_PY_AST_DEPTH,
2606            max_nodes: MAX_SETUP_PY_AST_NODES,
2607            nodes_visited: 0,
2608        }
2609    }
2610
2611    fn insert_constant(&mut self, name: String, value: Value) {
2612        self.constants.insert(name, value);
2613    }
2614
2615    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
2616        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
2617            return None;
2618        }
2619        self.nodes_visited += 1;
2620
2621        match expr {
2622            ast::Expr::StringLiteral(ast::ExprStringLiteral { value, .. }) => {
2623                Some(Value::String(value.to_str().to_string()))
2624            }
2625            ast::Expr::BooleanLiteral(ast::ExprBooleanLiteral { value, .. }) => {
2626                Some(Value::Bool(*value))
2627            }
2628            ast::Expr::NumberLiteral(ast::ExprNumberLiteral { value, .. }) => {
2629                self.evaluate_number(value)
2630            }
2631            ast::Expr::NoneLiteral(_) => Some(Value::None),
2632            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
2633            ast::Expr::List(ast::ExprList { elts, .. }) => {
2634                let mut values = Vec::new();
2635                for elt in elts {
2636                    values.push(self.evaluate_expr(elt, depth + 1)?);
2637                }
2638                Some(Value::List(values))
2639            }
2640            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
2641                let mut values = Vec::new();
2642                for elt in elts {
2643                    values.push(self.evaluate_expr(elt, depth + 1)?);
2644                }
2645                Some(Value::Tuple(values))
2646            }
2647            ast::Expr::Dict(ast::ExprDict { items, .. }) => {
2648                let mut dict = HashMap::new();
2649                for item in items {
2650                    let key_expr = item.key.as_ref()?;
2651                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
2652                    let key = value_to_string(&key_value)?;
2653                    let value = self.evaluate_expr(&item.value, depth + 1)?;
2654                    dict.insert(key, value);
2655                }
2656                Some(Value::Dict(dict))
2657            }
2658            ast::Expr::Call(ast::ExprCall {
2659                func, arguments, ..
2660            }) => {
2661                let args = arguments.args.as_ref();
2662                let keywords = arguments.keywords.as_ref();
2663                if keywords.is_empty()
2664                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
2665                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
2666                {
2667                    return self.evaluate_ordered_dict(args, depth + 1);
2668                }
2669
2670                if !args.is_empty() {
2671                    return None;
2672                }
2673
2674                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
2675                    && id == "dict"
2676                {
2677                    let mut dict = HashMap::new();
2678                    for keyword in keywords {
2679                        let key = keyword.arg.as_ref().map(ast::Identifier::as_str)?;
2680                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
2681                        dict.insert(key.to_string(), value);
2682                    }
2683                    return Some(Value::Dict(dict));
2684                }
2685
2686                None
2687            }
2688            _ => None,
2689        }
2690    }
2691
2692    fn evaluate_number(&self, number: &ast::Number) -> Option<Value> {
2693        match number {
2694            ast::Number::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
2695            ast::Number::Float(value) => Some(Value::Number(*value)),
2696            ast::Number::Complex { .. } => None,
2697        }
2698    }
2699
2700    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
2701        if args.len() != 1 {
2702            return None;
2703        }
2704
2705        let items = match self.evaluate_expr(&args[0], depth)? {
2706            Value::List(items) | Value::Tuple(items) => items,
2707            _ => return None,
2708        };
2709
2710        let mut dict = HashMap::new();
2711        for item in items {
2712            let Value::Tuple(values) = item else {
2713                return None;
2714            };
2715            if values.len() != 2 {
2716                return None;
2717            }
2718            let key = value_to_string(&values[0])?;
2719            dict.insert(key, values[1].clone());
2720        }
2721
2722        Some(Value::Dict(dict))
2723    }
2724}
2725
/// Names under which `setup` and its providing modules are visible in a setup.py.
#[derive(Default)]
struct SetupAliases {
    /// Local module alias mapped to the real module name
    /// (e.g. `import setuptools as st` records `st -> setuptools`).
    module_aliases: HashMap<String, String>,
    /// Bare names that refer to the `setup` function itself.
    setup_names: HashSet<String>,
}
2731
2732fn extract_setup_py_packages(path: &Path) -> Vec<PackageData> {
2733    extract_from_setup_py(path).into_iter().collect()
2734}
2735
2736fn extract_from_setup_py(path: &Path) -> Option<PackageData> {
2737    let content = match read_file_to_string(path) {
2738        Ok(content) => content,
2739        Err(e) => {
2740            warn!("Failed to read setup.py at {:?}: {}", path, e);
2741            return Some(default_package_data(path));
2742        }
2743    };
2744
2745    if content.len() > MAX_SETUP_PY_BYTES {
2746        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
2747        let package_data = extract_from_setup_py_regex(&content);
2748        return should_emit_setup_py_package(&package_data).then_some(package_data);
2749    }
2750
2751    let mut package_data = match extract_from_setup_py_ast(&content) {
2752        Ok(Some(data)) => data,
2753        Ok(None) => return Some(default_package_data(path)),
2754        Err(e) => {
2755            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
2756            extract_from_setup_py_regex(&content)
2757        }
2758    };
2759
2760    if package_data.name.is_none() {
2761        package_data.name = extract_setup_value(&content, "name");
2762    }
2763
2764    if package_data.version.is_none() {
2765        package_data.version = extract_setup_value(&content, "version");
2766    }
2767
2768    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
2769
2770    if package_data.purl.is_none() {
2771        package_data.purl = build_setup_py_purl(
2772            package_data.name.as_deref(),
2773            package_data.version.as_deref(),
2774        );
2775    }
2776
2777    if should_emit_setup_py_package(&package_data) {
2778        Some(package_data)
2779    } else {
2780        Some(default_package_data(path))
2781    }
2782}
2783
2784fn should_emit_setup_py_package(package_data: &PackageData) -> bool {
2785    package_data.name.is_some()
2786        || package_data.version.is_some()
2787        || package_data.purl.is_some()
2788        || !package_data.dependencies.is_empty()
2789        || package_data.extracted_license_statement.is_some()
2790        || !package_data.license_detections.is_empty()
2791        || !package_data.parties.is_empty()
2792        || package_data.description.is_some()
2793        || package_data.homepage_url.is_some()
2794        || package_data.bug_tracking_url.is_some()
2795        || package_data.code_view_url.is_some()
2796        || package_data.vcs_url.is_some()
2797}
2798
2799fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
2800    if package_data.version.is_some()
2801        && package_data.extracted_license_statement.is_some()
2802        && package_data
2803            .parties
2804            .iter()
2805            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
2806    {
2807        return;
2808    }
2809
2810    let Some(root) = path.parent() else {
2811        return;
2812    };
2813
2814    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
2815
2816    if package_data.version.is_none() {
2817        package_data.version = dunder_metadata.version;
2818    }
2819
2820    if package_data.extracted_license_statement.is_none() {
2821        package_data.extracted_license_statement = dunder_metadata.license;
2822    }
2823
2824    let has_author = package_data
2825        .parties
2826        .iter()
2827        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
2828
2829    if !has_author && let Some(author) = dunder_metadata.author {
2830        package_data.parties.push(Party {
2831            r#type: Some("person".to_string()),
2832            role: Some("author".to_string()),
2833            name: Some(author),
2834            email: None,
2835            url: None,
2836            organization: None,
2837            organization_url: None,
2838            timezone: None,
2839        });
2840    }
2841}
2842
/// Values of module-level dunder assignments found in modules imported by a setup.py.
#[derive(Default)]
struct DunderMetadata {
    /// Value of `__author__`, if found.
    author: Option<String>,
    /// Value of `__license__`, if found.
    license: Option<String>,
    /// Value of `__version__`, if found.
    version: Option<String>,
}
2849
2850fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
2851    let statements = match parse_module(content) {
2852        Ok(parsed) => parsed.into_suite(),
2853        Err(_) => return DunderMetadata::default(),
2854    };
2855
2856    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2857    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2858    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2859    let mut metadata = DunderMetadata::default();
2860
2861    for module in imported_dunder_modules(&statements) {
2862        let Some(path) = resolve_imported_module_path(root, &module) else {
2863            continue;
2864        };
2865        let Ok(module_content) = read_file_to_string(&path) else {
2866            continue;
2867        };
2868
2869        if metadata.version.is_none() {
2870            metadata.version = version_re
2871                .as_ref()
2872                .and_then(|regex| regex.captures(&module_content))
2873                .and_then(|captures| captures.get(1))
2874                .map(|match_| match_.as_str().to_string());
2875        }
2876
2877        if metadata.author.is_none() {
2878            metadata.author = author_re
2879                .as_ref()
2880                .and_then(|regex| regex.captures(&module_content))
2881                .and_then(|captures| captures.get(1))
2882                .map(|match_| match_.as_str().to_string());
2883        }
2884
2885        if metadata.license.is_none() {
2886            metadata.license = license_re
2887                .as_ref()
2888                .and_then(|regex| regex.captures(&module_content))
2889                .and_then(|captures| captures.get(1))
2890                .map(|match_| match_.as_str().to_string());
2891        }
2892
2893        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
2894            return metadata;
2895        }
2896    }
2897
2898    metadata
2899}
2900
2901fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
2902    let mut modules = Vec::new();
2903
2904    for statement in statements {
2905        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
2906            continue;
2907        };
2908        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
2909            continue;
2910        };
2911        let imports_dunder = names.iter().any(|alias| {
2912            matches!(
2913                alias.name.as_str(),
2914                "__version__" | "__author__" | "__license__"
2915            )
2916        });
2917        if imports_dunder {
2918            modules.push(module.to_string());
2919        }
2920    }
2921
2922    modules
2923}
2924
/// Resolves a dotted Python module name to a source file below `root`.
///
/// Candidates are tried in this order, and the first one that is a regular
/// file wins:
/// 1. `<root>/<module path>.py`
/// 2. `<root>/<module path>/__init__.py`
/// 3. `<root>/src/<module path>.py`
/// 4. `<root>/src/<module path>/__init__.py`
///
/// Directories are never returned, so a directory that happens to be named
/// like a module file (e.g. `pkg.py/`) does not shadow a later valid candidate.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let relative: PathBuf = module.split('.').collect();
    let src_root = root.join("src");
    let candidates = [
        root.join(relative.with_extension("py")),
        root.join(&relative).join("__init__.py"),
        src_root.join(relative.with_extension("py")),
        src_root.join(&relative).join("__init__.py"),
    ];

    for candidate in candidates {
        // `is_file` rather than `exists`: the caller reads the result as text.
        if candidate.is_file() {
            return Some(candidate);
        }
    }

    None
}
2936
2937/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
2938///
2939/// # Security Model
2940///
2941/// This function parses setup.py as a Python AST and evaluates only literal values
2942/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
2943/// arbitrary code execution during scanning.
2944///
2945/// # DoS Prevention
2946///
2947/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
2948/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
2949/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
2950///
2951/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
2952fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
2953    let statements = parse_module(content)
2954        .map(|parsed| parsed.into_suite())
2955        .map_err(|e| e.to_string())?;
2956    let aliases = collect_setup_aliases(&statements);
2957    let mut evaluator = LiteralEvaluator::new(HashMap::new());
2958    build_setup_py_constants(&statements, &mut evaluator);
2959
2960    let setup_call = find_setup_call(&statements, &aliases);
2961    let Some(call_expr) = setup_call else {
2962        return Ok(None);
2963    };
2964
2965    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
2966    Ok(Some(build_setup_py_package_data(&setup_values)))
2967}
2968
2969fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
2970    for stmt in statements {
2971        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
2972            if targets.len() != 1 {
2973                continue;
2974            }
2975
2976            let Some(name) = extract_assign_name(&targets[0]) else {
2977                continue;
2978            };
2979
2980            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
2981                evaluator.insert_constant(name, value);
2982            }
2983        }
2984    }
2985}
2986
2987fn extract_assign_name(target: &ast::Expr) -> Option<String> {
2988    match target {
2989        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
2990        _ => None,
2991    }
2992}
2993
2994fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
2995    let mut aliases = SetupAliases::default();
2996    aliases.setup_names.insert("setup".to_string());
2997
2998    for stmt in statements {
2999        match stmt {
3000            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
3001                for alias in names {
3002                    let module_name = alias.name.as_str();
3003                    if !is_setup_module(module_name) {
3004                        continue;
3005                    }
3006                    let alias_name = alias
3007                        .asname
3008                        .as_ref()
3009                        .map(|name| name.as_str())
3010                        .unwrap_or(module_name);
3011                    aliases
3012                        .module_aliases
3013                        .insert(alias_name.to_string(), module_name.to_string());
3014                }
3015            }
3016            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
3017                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
3018                    continue;
3019                };
3020                if !is_setup_module(module_name) {
3021                    continue;
3022                }
3023                for alias in names {
3024                    if alias.name.as_str() != "setup" {
3025                        continue;
3026                    }
3027                    let alias_name = alias
3028                        .asname
3029                        .as_ref()
3030                        .map(|name| name.as_str())
3031                        .unwrap_or("setup");
3032                    aliases.setup_names.insert(alias_name.to_string());
3033                }
3034            }
3035            _ => {}
3036        }
3037    }
3038
3039    aliases
3040}
3041
/// Reports whether `module_name` is one of the modules that provide `setup()`.
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.iter().any(|known| *known == module_name)
}
3045
3046fn find_setup_call<'a>(
3047    statements: &'a [ast::Stmt],
3048    aliases: &'a SetupAliases,
3049) -> Option<&'a ast::Expr> {
3050    let mut finder = SetupCallFinder {
3051        aliases,
3052        called_function_names: collect_top_level_called_function_names(statements),
3053        nodes_visited: 0,
3054    };
3055    finder.find_in_statements(statements)
3056}
3057
3058fn collect_top_level_called_function_names(statements: &[ast::Stmt]) -> HashSet<String> {
3059    let mut called = HashSet::new();
3060    collect_called_function_names_in_statements(statements, &mut called);
3061    called
3062}
3063
3064fn collect_called_function_names_in_statements(
3065    statements: &[ast::Stmt],
3066    called: &mut HashSet<String>,
3067) {
3068    for stmt in statements {
3069        match stmt {
3070            ast::Stmt::Expr(ast::StmtExpr { value, .. })
3071            | ast::Stmt::Assign(ast::StmtAssign { value, .. }) => {
3072                collect_called_function_names_in_expr(value.as_ref(), called);
3073            }
3074            ast::Stmt::If(ast::StmtIf {
3075                body,
3076                elif_else_clauses,
3077                ..
3078            }) => {
3079                collect_called_function_names_in_statements(body, called);
3080                for clause in elif_else_clauses {
3081                    collect_called_function_names_in_statements(&clause.body, called);
3082                }
3083            }
3084            ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3085            | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => {
3086                collect_called_function_names_in_statements(body, called);
3087                collect_called_function_names_in_statements(orelse, called);
3088            }
3089            ast::Stmt::With(ast::StmtWith { body, .. }) => {
3090                collect_called_function_names_in_statements(body, called);
3091            }
3092            ast::Stmt::Try(ast::StmtTry {
3093                body,
3094                orelse,
3095                finalbody,
3096                handlers,
3097                ..
3098            }) => {
3099                collect_called_function_names_in_statements(body, called);
3100                collect_called_function_names_in_statements(orelse, called);
3101                collect_called_function_names_in_statements(finalbody, called);
3102                for handler in handlers {
3103                    let ast::ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler {
3104                        body,
3105                        ..
3106                    }) = handler;
3107                    collect_called_function_names_in_statements(body, called);
3108                }
3109            }
3110            _ => {}
3111        }
3112    }
3113}
3114
3115fn collect_called_function_names_in_expr(expr: &ast::Expr, called: &mut HashSet<String>) {
3116    if let ast::Expr::Call(ast::ExprCall {
3117        func, arguments, ..
3118    }) = expr
3119    {
3120        if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref() {
3121            called.insert(id.as_str().to_string());
3122        }
3123
3124        for arg in arguments.args.iter() {
3125            collect_called_function_names_in_expr(arg, called);
3126        }
3127        for keyword in arguments.keywords.iter() {
3128            collect_called_function_names_in_expr(&keyword.value, called);
3129        }
3130    }
3131}
3132
3133struct SetupCallFinder<'a> {
3134    aliases: &'a SetupAliases,
3135    called_function_names: HashSet<String>,
3136    nodes_visited: usize,
3137}
3138
3139impl<'a> SetupCallFinder<'a> {
3140    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
3141        for stmt in statements {
3142            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3143                return None;
3144            }
3145            self.nodes_visited += 1;
3146
3147            let found = match stmt {
3148                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
3149                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
3150                ast::Stmt::If(ast::StmtIf {
3151                    body,
3152                    elif_else_clauses,
3153                    ..
3154                }) => self.find_in_statements(body).or_else(|| {
3155                    for clause in elif_else_clauses {
3156                        if let Some(found) = self.find_in_statements(&clause.body) {
3157                            return Some(found);
3158                        }
3159                    }
3160                    None
3161                }),
3162                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
3163                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
3164                    .find_in_statements(body)
3165                    .or_else(|| self.find_in_statements(orelse)),
3166                ast::Stmt::FunctionDef(ast::StmtFunctionDef { name, body, .. }) => self
3167                    .called_function_names
3168                    .contains(name.as_str())
3169                    .then(|| self.find_in_statements(body))
3170                    .flatten(),
3171                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
3172                ast::Stmt::Try(ast::StmtTry {
3173                    body,
3174                    orelse,
3175                    finalbody,
3176                    handlers,
3177                    ..
3178                }) => self
3179                    .find_in_statements(body)
3180                    .or_else(|| self.find_in_statements(orelse))
3181                    .or_else(|| self.find_in_statements(finalbody))
3182                    .or_else(|| {
3183                        for handler in handlers {
3184                            let ast::ExceptHandler::ExceptHandler(
3185                                ast::ExceptHandlerExceptHandler { body, .. },
3186                            ) = handler;
3187                            if let Some(found) = self.find_in_statements(body) {
3188                                return Some(found);
3189                            }
3190                        }
3191                        None
3192                    }),
3193                _ => None,
3194            };
3195
3196            if found.is_some() {
3197                return found;
3198            }
3199        }
3200
3201        None
3202    }
3203
3204    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
3205        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
3206            return None;
3207        }
3208        self.nodes_visited += 1;
3209
3210        match expr {
3211            ast::Expr::Call(ast::ExprCall { func, .. })
3212                if is_setup_call(func.as_ref(), self.aliases) =>
3213            {
3214                Some(expr)
3215            }
3216            _ => None,
3217        }
3218    }
3219}
3220
3221fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
3222    let Some(dotted) = dotted_name(func, 0) else {
3223        return false;
3224    };
3225
3226    if aliases.setup_names.contains(&dotted) {
3227        return true;
3228    }
3229
3230    let Some(module) = dotted.strip_suffix(".setup") else {
3231        return false;
3232    };
3233
3234    let resolved = resolve_module_alias(module, aliases);
3235    is_setup_module(&resolved)
3236}
3237
3238fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
3239    if depth >= MAX_SETUP_PY_AST_DEPTH {
3240        return None;
3241    }
3242
3243    match expr {
3244        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
3245        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
3246            let base = dotted_name(value.as_ref(), depth + 1)?;
3247            Some(format!("{}.{}", base, attr.as_str()))
3248        }
3249        _ => None,
3250    }
3251}
3252
3253fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
3254    if let Some(mapped) = aliases.module_aliases.get(module) {
3255        return mapped.clone();
3256    }
3257
3258    let Some((base, rest)) = module.split_once('.') else {
3259        return module.to_string();
3260    };
3261
3262    if let Some(mapped) = aliases.module_aliases.get(base) {
3263        return format!("{}.{}", mapped, rest);
3264    }
3265
3266    module.to_string()
3267}
3268
3269fn extract_setup_keywords(
3270    call_expr: &ast::Expr,
3271    evaluator: &mut LiteralEvaluator,
3272) -> HashMap<String, Value> {
3273    let mut values = HashMap::new();
3274    let ast::Expr::Call(ast::ExprCall { arguments, .. }) = call_expr else {
3275        return values;
3276    };
3277
3278    for keyword in arguments.keywords.iter() {
3279        if let Some(arg) = keyword.arg.as_ref().map(ast::Identifier::as_str) {
3280            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
3281                values.insert(arg.to_string(), value);
3282            }
3283        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
3284            for (key, value) in dict {
3285                values.insert(key, value);
3286            }
3287        }
3288    }
3289
3290    values
3291}
3292
3293fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
3294    let name = get_value_string(values, "name");
3295    let version = get_value_string(values, "version");
3296    let description =
3297        get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
3298    let homepage_url =
3299        get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
3300    let author = get_value_string(values, "author");
3301    let author_email = get_value_string(values, "author_email");
3302    let maintainer = get_value_string(values, "maintainer");
3303    let maintainer_email = get_value_string(values, "maintainer_email");
3304    let license = get_value_string(values, "license");
3305    let classifiers = values
3306        .get("classifiers")
3307        .and_then(value_to_string_list)
3308        .unwrap_or_default();
3309
3310    let mut parties = Vec::new();
3311    if author.is_some() || author_email.is_some() {
3312        parties.push(Party {
3313            r#type: Some("person".to_string()),
3314            role: Some("author".to_string()),
3315            name: author,
3316            email: author_email,
3317            url: None,
3318            organization: None,
3319            organization_url: None,
3320            timezone: None,
3321        });
3322    }
3323
3324    if maintainer.is_some() || maintainer_email.is_some() {
3325        parties.push(Party {
3326            r#type: Some("person".to_string()),
3327            role: Some("maintainer".to_string()),
3328            name: maintainer,
3329            email: maintainer_email,
3330            url: None,
3331            organization: None,
3332            organization_url: None,
3333            timezone: None,
3334        });
3335    }
3336
3337    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3338        normalize_spdx_declared_license(license.as_deref());
3339    let extracted_license_statement = license.clone();
3340
3341    let dependencies = build_setup_py_dependencies(values);
3342    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3343    let mut homepage_from_project_urls = None;
3344    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3345    let mut extra_data = HashMap::new();
3346
3347    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
3348        apply_project_url_mappings(
3349            &parsed_project_urls,
3350            &mut homepage_from_project_urls,
3351            &mut bug_tracking_url,
3352            &mut code_view_url,
3353            &mut vcs_url,
3354            &mut extra_data,
3355        );
3356    }
3357
3358    let extra_data = if extra_data.is_empty() {
3359        None
3360    } else {
3361        Some(extra_data)
3362    };
3363
3364    PackageData {
3365        package_type: Some(PythonParser::PACKAGE_TYPE),
3366        namespace: None,
3367        name,
3368        version,
3369        qualifiers: None,
3370        subpath: None,
3371        primary_language: Some("Python".to_string()),
3372        description,
3373        release_date: None,
3374        parties,
3375        keywords: Vec::new(),
3376        homepage_url: homepage_url.or(homepage_from_project_urls),
3377        download_url: None,
3378        size: None,
3379        sha1: None,
3380        md5: None,
3381        sha256: None,
3382        sha512: None,
3383        bug_tracking_url,
3384        code_view_url,
3385        vcs_url,
3386        copyright: None,
3387        holder: None,
3388        declared_license_expression,
3389        declared_license_expression_spdx,
3390        license_detections,
3391        other_license_expression: None,
3392        other_license_expression_spdx: None,
3393        other_license_detections: Vec::new(),
3394        extracted_license_statement,
3395        notice_text: None,
3396        source_packages: Vec::new(),
3397        file_references: Vec::new(),
3398        is_private: has_private_classifier(&classifiers),
3399        is_virtual: false,
3400        extra_data,
3401        dependencies,
3402        repository_homepage_url: None,
3403        repository_download_url: None,
3404        api_data_url: None,
3405        datasource_id: Some(DatasourceId::PypiSetupPy),
3406        purl,
3407    }
3408}
3409
3410fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
3411    let mut dependencies = Vec::new();
3412
3413    if let Some(reqs) = values
3414        .get("install_requires")
3415        .and_then(value_to_string_list)
3416    {
3417        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
3418    }
3419
3420    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
3421        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
3422    }
3423
3424    if let Some(Value::Dict(extras)) = values.get("extras_require") {
3425        let mut extra_items: Vec<_> = extras.iter().collect();
3426        extra_items.sort_by_key(|(name, _)| *name);
3427        for (extra_name, extra_value) in extra_items {
3428            if let Some(reqs) = value_to_string_list(extra_value) {
3429                dependencies.extend(build_setup_py_dependency_list(
3430                    reqs.as_slice(),
3431                    extra_name,
3432                    true,
3433                ));
3434            }
3435        }
3436    }
3437
3438    dependencies
3439}
3440
3441fn build_setup_py_dependency_list(
3442    reqs: &[String],
3443    scope: &str,
3444    is_optional: bool,
3445) -> Vec<Dependency> {
3446    reqs.iter()
3447        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3448        .collect()
3449}
3450
3451fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
3452    values.get(key).and_then(value_to_string)
3453}
3454
3455fn value_to_string(value: &Value) -> Option<String> {
3456    match value {
3457        Value::String(value) => Some(value.clone()),
3458        Value::Number(value) => Some(value.to_string()),
3459        Value::Bool(value) => Some(value.to_string()),
3460        _ => None,
3461    }
3462}
3463
3464fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
3465    match value {
3466        Value::String(value) => Some(vec![value.clone()]),
3467        Value::List(values) | Value::Tuple(values) => {
3468            let mut items = Vec::new();
3469            for item in values {
3470                items.push(value_to_string(item)?);
3471            }
3472            Some(items)
3473        }
3474        _ => None,
3475    }
3476}
3477
3478fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
3479    let Value::Dict(dict) = value else {
3480        return None;
3481    };
3482
3483    let mut pairs: Vec<(String, String)> = dict
3484        .iter()
3485        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
3486        .collect::<Option<Vec<_>>>()?;
3487    pairs.sort_by(|left, right| left.0.cmp(&right.0));
3488    Some(pairs)
3489}
3490
3491fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
3492    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
3493    extract_requires_dist_dependencies(&requires_dist)
3494}
3495
3496pub(crate) fn extract_requires_dist_dependencies(requires_dist: &[String]) -> Vec<Dependency> {
3497    requires_dist
3498        .iter()
3499        .filter_map(|entry| build_rfc822_dependency(entry))
3500        .collect()
3501}
3502
3503fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
3504    build_python_dependency(entry, "install", false, None)
3505}
3506
3507fn build_python_dependency(
3508    entry: &str,
3509    default_scope: &str,
3510    default_optional: bool,
3511    marker_override: Option<&str>,
3512) -> Option<Dependency> {
3513    let (requirement_part, marker_part) = entry
3514        .split_once(';')
3515        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
3516        .unwrap_or((entry.trim(), None));
3517
3518    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3519    let requirement = normalize_rfc822_requirement(requirement_part);
3520    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
3521        marker_part.or(marker_override),
3522        default_scope,
3523        default_optional,
3524    );
3525    let mut purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
3526
3527    let is_pinned = requirement
3528        .as_deref()
3529        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
3530    if is_pinned
3531        && let Some(version) = requirement
3532            .as_deref()
3533            .map(|req| req.trim_start_matches('='))
3534    {
3535        purl.with_version(version).ok()?;
3536    }
3537
3538    let mut extra_data = HashMap::new();
3539    extra_data.extend(marker_data);
3540    if let Some(marker) = marker {
3541        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
3542    }
3543
3544    Some(Dependency {
3545        purl: Some(purl.to_string()),
3546        extracted_requirement: requirement,
3547        scope: Some(scope),
3548        is_runtime: Some(true),
3549        is_optional: Some(is_optional),
3550        is_pinned: Some(is_pinned),
3551        is_direct: Some(true),
3552        resolved_package: None,
3553        extra_data: if extra_data.is_empty() {
3554            None
3555        } else {
3556            Some(extra_data)
3557        },
3558    })
3559}
3560
3561fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
3562    let name = extract_setup_cfg_dependency_name(requirement_part)?;
3563    let trimmed = requirement_part.trim();
3564    let mut remainder = trimmed[name.len()..].trim();
3565
3566    if let Some(stripped) = remainder.strip_prefix('[')
3567        && let Some(end_idx) = stripped.find(']')
3568    {
3569        remainder = stripped[end_idx + 1..].trim();
3570    }
3571
3572    let remainder = remainder
3573        .strip_prefix('(')
3574        .and_then(|value| value.strip_suffix(')'))
3575        .unwrap_or(remainder)
3576        .trim();
3577
3578    if remainder.is_empty() {
3579        return None;
3580    }
3581
3582    let mut specifiers: Vec<String> = remainder
3583        .split(',')
3584        .map(|specifier| specifier.trim().replace(' ', ""))
3585        .filter(|specifier| !specifier.is_empty())
3586        .collect();
3587    specifiers.sort();
3588    Some(specifiers.join(","))
3589}
3590
3591fn parse_rfc822_marker(
3592    marker_part: Option<&str>,
3593    default_scope: &str,
3594    default_optional: bool,
3595) -> (
3596    String,
3597    bool,
3598    Option<String>,
3599    HashMap<String, serde_json::Value>,
3600) {
3601    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
3602        return (
3603            default_scope.to_string(),
3604            default_optional,
3605            None,
3606            HashMap::new(),
3607        );
3608    };
3609
3610    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
3611        .expect("extra marker regex should compile");
3612    let mut extra_data = HashMap::new();
3613
3614    if let Some(python_version) = extract_marker_field(marker, "python_version") {
3615        extra_data.insert(
3616            "python_version".to_string(),
3617            serde_json::Value::String(python_version),
3618        );
3619    }
3620    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
3621        extra_data.insert(
3622            "sys_platform".to_string(),
3623            serde_json::Value::String(sys_platform),
3624        );
3625    }
3626
3627    if let Some(captures) = extra_re.captures(marker)
3628        && let Some(scope) = captures.get(1)
3629    {
3630        return (
3631            scope.as_str().to_string(),
3632            true,
3633            Some(marker.trim().to_string()),
3634            extra_data,
3635        );
3636    }
3637
3638    (
3639        default_scope.to_string(),
3640        default_optional,
3641        Some(marker.trim().to_string()),
3642        extra_data,
3643    )
3644}
3645
3646fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
3647    let re = Regex::new(&format!(
3648        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
3649        field
3650    ))
3651    .ok()?;
3652    let captures = re.captures(marker)?;
3653    let operator = captures.get(1)?.as_str();
3654    let value = captures.get(2)?.as_str();
3655    Some(format!("{} {}", operator, value))
3656}
3657
3658fn parse_requires_txt(content: &str) -> Vec<Dependency> {
3659    let mut dependencies = Vec::new();
3660    let mut current_scope = "install".to_string();
3661    let mut current_optional = false;
3662    let mut current_marker: Option<String> = None;
3663
3664    for line in content.lines() {
3665        let trimmed = line.trim();
3666        if trimmed.is_empty() || trimmed.starts_with('#') {
3667            continue;
3668        }
3669
3670        if trimmed.starts_with('[') && trimmed.ends_with(']') {
3671            let inner = &trimmed[1..trimmed.len() - 1];
3672            if let Some(rest) = inner.strip_prefix(':') {
3673                current_scope = "install".to_string();
3674                current_optional = false;
3675                current_marker = Some(rest.trim().to_string());
3676            } else if let Some((scope, marker)) = inner.split_once(':') {
3677                current_scope = scope.trim().to_string();
3678                current_optional = true;
3679                current_marker = Some(marker.trim().to_string());
3680            } else {
3681                current_scope = inner.trim().to_string();
3682                current_optional = true;
3683                current_marker = None;
3684            }
3685            continue;
3686        }
3687
3688        if let Some(dependency) = build_python_dependency(
3689            trimmed,
3690            &current_scope,
3691            current_optional,
3692            current_marker.as_deref(),
3693        ) {
3694            dependencies.push(dependency);
3695        }
3696    }
3697
3698    dependencies
3699}
3700
/// Reports whether any classifier equals `Private :: Do Not Upload`,
/// compared ASCII case-insensitively.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_CLASSIFIER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_CLASSIFIER) {
            return true;
        }
    }
    false
}
3706
3707fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
3708    let name = name?;
3709    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
3710    if let Some(version) = version {
3711        package_url.with_version(version).ok()?;
3712    }
3713    Some(package_url.to_string())
3714}
3715
3716fn extract_from_setup_py_regex(content: &str) -> PackageData {
3717    let name = extract_setup_value(content, "name");
3718    let version = extract_setup_value(content, "version");
3719    let license_expression = extract_setup_value(content, "license");
3720
3721    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3722        normalize_spdx_declared_license(license_expression.as_deref());
3723    let extracted_license_statement = license_expression.clone();
3724
3725    let dependencies = extract_setup_py_dependencies(content);
3726    let homepage_url = extract_setup_value(content, "url");
3727    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
3728
3729    PackageData {
3730        package_type: Some(PythonParser::PACKAGE_TYPE),
3731        namespace: None,
3732        name,
3733        version,
3734        qualifiers: None,
3735        subpath: None,
3736        primary_language: Some("Python".to_string()),
3737        description: None,
3738        release_date: None,
3739        parties: Vec::new(),
3740        keywords: Vec::new(),
3741        homepage_url,
3742        download_url: None,
3743        size: None,
3744        sha1: None,
3745        md5: None,
3746        sha256: None,
3747        sha512: None,
3748        bug_tracking_url: None,
3749        code_view_url: None,
3750        vcs_url: None,
3751        copyright: None,
3752        holder: None,
3753        declared_license_expression,
3754        declared_license_expression_spdx,
3755        license_detections,
3756        other_license_expression: None,
3757        other_license_expression_spdx: None,
3758        other_license_detections: Vec::new(),
3759        extracted_license_statement,
3760        notice_text: None,
3761        source_packages: Vec::new(),
3762        file_references: Vec::new(),
3763        is_private: false,
3764        is_virtual: false,
3765        extra_data: None,
3766        dependencies,
3767        repository_homepage_url: None,
3768        repository_download_url: None,
3769        api_data_url: None,
3770        datasource_id: Some(DatasourceId::PypiSetupPy),
3771        purl,
3772    }
3773}
3774
/// Converts package data into a `ResolvedPackage` by calling
/// `ResolvedPackage::from_package_data` with `PackageType::Pypi`.
fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
    crate::models::ResolvedPackage::from_package_data(pkg, PackageType::Pypi)
}
3778
3779fn extract_from_pypi_json(path: &Path) -> PackageData {
3780    let default = PackageData {
3781        package_type: Some(PythonParser::PACKAGE_TYPE),
3782        datasource_id: Some(DatasourceId::PypiJson),
3783        ..Default::default()
3784    };
3785
3786    let content = match read_file_to_string(path) {
3787        Ok(content) => content,
3788        Err(error) => {
3789            warn!("Failed to read pypi.json at {:?}: {}", path, error);
3790            return default;
3791        }
3792    };
3793
3794    let root: serde_json::Value = match serde_json::from_str(&content) {
3795        Ok(value) => value,
3796        Err(error) => {
3797            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
3798            return default;
3799        }
3800    };
3801
3802    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
3803        warn!("No info object found in pypi.json at {:?}", path);
3804        return default;
3805    };
3806
3807    let name = info
3808        .get("name")
3809        .and_then(|value| value.as_str())
3810        .map(ToOwned::to_owned);
3811    let version = info
3812        .get("version")
3813        .and_then(|value| value.as_str())
3814        .map(ToOwned::to_owned);
3815    let summary = info
3816        .get("summary")
3817        .and_then(|value| value.as_str())
3818        .map(ToOwned::to_owned);
3819    let description = info
3820        .get("description")
3821        .and_then(|value| value.as_str())
3822        .filter(|value| !value.trim().is_empty())
3823        .map(ToOwned::to_owned)
3824        .or(summary);
3825    let mut homepage_url = info
3826        .get("home_page")
3827        .and_then(|value| value.as_str())
3828        .map(ToOwned::to_owned);
3829    let author = info
3830        .get("author")
3831        .and_then(|value| value.as_str())
3832        .filter(|value| !value.trim().is_empty())
3833        .map(ToOwned::to_owned);
3834    let author_email = info
3835        .get("author_email")
3836        .and_then(|value| value.as_str())
3837        .filter(|value| !value.trim().is_empty())
3838        .map(ToOwned::to_owned);
3839    let license = info
3840        .get("license")
3841        .and_then(|value| value.as_str())
3842        .filter(|value| !value.trim().is_empty())
3843        .map(ToOwned::to_owned);
3844    let keywords = parse_setup_cfg_keywords(
3845        info.get("keywords")
3846            .and_then(|value| value.as_str())
3847            .map(ToOwned::to_owned),
3848    );
3849    let classifiers = info
3850        .get("classifiers")
3851        .and_then(|value| value.as_array())
3852        .map(|values| {
3853            values
3854                .iter()
3855                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
3856                .collect::<Vec<_>>()
3857        })
3858        .unwrap_or_default();
3859
3860    let mut parties = Vec::new();
3861    if author.is_some() || author_email.is_some() {
3862        parties.push(Party {
3863            r#type: Some("person".to_string()),
3864            role: Some("author".to_string()),
3865            name: author,
3866            email: author_email,
3867            url: None,
3868            organization: None,
3869            organization_url: None,
3870            timezone: None,
3871        });
3872    }
3873
3874    let mut bug_tracking_url = None;
3875    let mut code_view_url = None;
3876    let mut vcs_url = None;
3877    let mut extra_data = HashMap::new();
3878
3879    let parsed_project_urls = info
3880        .get("project_urls")
3881        .and_then(|value| value.as_object())
3882        .map(|map| {
3883            let mut pairs: Vec<(String, String)> = map
3884                .iter()
3885                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
3886                .collect();
3887            pairs.sort_by(|left, right| left.0.cmp(&right.0));
3888            pairs
3889        })
3890        .unwrap_or_default();
3891
3892    apply_project_url_mappings(
3893        &parsed_project_urls,
3894        &mut homepage_url,
3895        &mut bug_tracking_url,
3896        &mut code_view_url,
3897        &mut vcs_url,
3898        &mut extra_data,
3899    );
3900
3901    let (download_url, size, sha256) = root
3902        .get("urls")
3903        .and_then(|value| value.as_array())
3904        .map(|urls| select_pypi_json_artifact(urls))
3905        .unwrap_or((None, None, None));
3906
3907    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
3908        normalize_spdx_declared_license(license.as_deref());
3909    let dependencies = info
3910        .get("requires_dist")
3911        .and_then(|value| value.as_array())
3912        .map(|entries| {
3913            entries
3914                .iter()
3915                .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
3916                .collect::<Vec<_>>()
3917        })
3918        .map(|entries| extract_requires_dist_dependencies(&entries))
3919        .unwrap_or_default();
3920
3921    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
3922        build_pypi_urls(name.as_deref(), version.as_deref());
3923
3924    PackageData {
3925        package_type: Some(PythonParser::PACKAGE_TYPE),
3926        namespace: None,
3927        name,
3928        version,
3929        qualifiers: None,
3930        subpath: None,
3931        primary_language: None,
3932        description,
3933        release_date: None,
3934        parties,
3935        keywords,
3936        homepage_url: homepage_url.or(repository_homepage_url.clone()),
3937        download_url,
3938        size,
3939        sha1: None,
3940        md5: None,
3941        sha256,
3942        sha512: None,
3943        bug_tracking_url,
3944        code_view_url,
3945        vcs_url,
3946        copyright: None,
3947        holder: None,
3948        declared_license_expression,
3949        declared_license_expression_spdx,
3950        license_detections,
3951        other_license_expression: None,
3952        other_license_expression_spdx: None,
3953        other_license_detections: Vec::new(),
3954        extracted_license_statement: license,
3955        notice_text: None,
3956        source_packages: Vec::new(),
3957        file_references: Vec::new(),
3958        is_private: has_private_classifier(&classifiers),
3959        is_virtual: false,
3960        extra_data: if extra_data.is_empty() {
3961            None
3962        } else {
3963            Some(extra_data)
3964        },
3965        dependencies,
3966        repository_homepage_url,
3967        repository_download_url,
3968        api_data_url,
3969        datasource_id: Some(DatasourceId::PypiJson),
3970        purl,
3971    }
3972}
3973
3974fn select_pypi_json_artifact(
3975    urls: &[serde_json::Value],
3976) -> (Option<String>, Option<u64>, Option<String>) {
3977    let selected = urls
3978        .iter()
3979        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
3980        .or_else(|| urls.first());
3981
3982    let Some(entry) = selected else {
3983        return (None, None, None);
3984    };
3985
3986    let download_url = entry
3987        .get("url")
3988        .and_then(|value| value.as_str())
3989        .map(ToOwned::to_owned);
3990    let size = entry.get("size").and_then(|value| value.as_u64());
3991    let sha256 = entry
3992        .get("digests")
3993        .and_then(|value| value.as_object())
3994        .and_then(|digests| digests.get("sha256"))
3995        .and_then(|value| value.as_str())
3996        .map(ToOwned::to_owned);
3997
3998    (download_url, size, sha256)
3999}
4000
4001fn extract_from_pip_inspect(path: &Path) -> PackageData {
4002    let content = match read_file_to_string(path) {
4003        Ok(content) => content,
4004        Err(e) => {
4005            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
4006            return default_package_data(path);
4007        }
4008    };
4009
4010    let root: serde_json::Value = match serde_json::from_str(&content) {
4011        Ok(value) => value,
4012        Err(e) => {
4013            warn!(
4014                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
4015                path, e
4016            );
4017            return default_package_data(path);
4018        }
4019    };
4020
4021    let installed = match root.get("installed").and_then(|v| v.as_array()) {
4022        Some(arr) => arr,
4023        None => {
4024            warn!(
4025                "No 'installed' array found in pip-inspect.deplock at {:?}",
4026                path
4027            );
4028            return default_package_data(path);
4029        }
4030    };
4031
4032    let pip_version = root
4033        .get("pip_version")
4034        .and_then(|v| v.as_str())
4035        .map(String::from);
4036    let inspect_version = root
4037        .get("version")
4038        .and_then(|v| v.as_str())
4039        .map(String::from);
4040
4041    let mut main_package: Option<PackageData> = None;
4042    let mut dependencies: Vec<Dependency> = Vec::new();
4043
4044    for package_entry in installed {
4045        let metadata = match package_entry.get("metadata") {
4046            Some(m) => m,
4047            None => continue,
4048        };
4049
4050        let is_requested = package_entry
4051            .get("requested")
4052            .and_then(|v| v.as_bool())
4053            .unwrap_or(false);
4054        let has_direct_url = package_entry.get("direct_url").is_some();
4055
4056        let name = metadata
4057            .get("name")
4058            .and_then(|v| v.as_str())
4059            .map(String::from);
4060        let version = metadata
4061            .get("version")
4062            .and_then(|v| v.as_str())
4063            .map(String::from);
4064        let summary = metadata
4065            .get("summary")
4066            .and_then(|v| v.as_str())
4067            .map(String::from);
4068        let home_page = metadata
4069            .get("home_page")
4070            .and_then(|v| v.as_str())
4071            .map(String::from);
4072        let author = metadata
4073            .get("author")
4074            .and_then(|v| v.as_str())
4075            .map(String::from);
4076        let author_email = metadata
4077            .get("author_email")
4078            .and_then(|v| v.as_str())
4079            .map(String::from);
4080        let license = metadata
4081            .get("license")
4082            .and_then(|v| v.as_str())
4083            .map(String::from);
4084        let description = metadata
4085            .get("description")
4086            .and_then(|v| v.as_str())
4087            .map(String::from);
4088        let keywords = metadata
4089            .get("keywords")
4090            .and_then(|v| v.as_array())
4091            .map(|arr| {
4092                arr.iter()
4093                    .filter_map(|k| k.as_str().map(String::from))
4094                    .collect::<Vec<_>>()
4095            })
4096            .unwrap_or_default();
4097
4098        let mut parties = Vec::new();
4099        if author.is_some() || author_email.is_some() {
4100            parties.push(Party {
4101                r#type: Some("person".to_string()),
4102                role: Some("author".to_string()),
4103                name: author,
4104                email: author_email,
4105                url: None,
4106                organization: None,
4107                organization_url: None,
4108                timezone: None,
4109            });
4110        }
4111
4112        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
4113            normalize_spdx_declared_license(license.as_deref());
4114        let extracted_license_statement = license.clone();
4115        let requires_dist = metadata
4116            .get("requires_dist")
4117            .and_then(|v| v.as_array())
4118            .map(|entries| {
4119                entries
4120                    .iter()
4121                    .filter_map(|entry| entry.as_str().map(ToOwned::to_owned))
4122                    .collect::<Vec<_>>()
4123            })
4124            .unwrap_or_default();
4125        let parsed_dependencies = extract_requires_dist_dependencies(&requires_dist);
4126
4127        let purl = name.as_ref().and_then(|n| {
4128            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4129            if let Some(v) = &version {
4130                package_url.with_version(v).ok()?;
4131            }
4132            Some(package_url.to_string())
4133        });
4134
4135        if is_requested && has_direct_url {
4136            let mut extra_data = HashMap::new();
4137            if let Some(pv) = &pip_version {
4138                extra_data.insert(
4139                    "pip_version".to_string(),
4140                    serde_json::Value::String(pv.clone()),
4141                );
4142            }
4143            if let Some(iv) = &inspect_version {
4144                extra_data.insert(
4145                    "inspect_version".to_string(),
4146                    serde_json::Value::String(iv.clone()),
4147                );
4148            }
4149
4150            main_package = Some(PackageData {
4151                package_type: Some(PythonParser::PACKAGE_TYPE),
4152                namespace: None,
4153                name,
4154                version,
4155                qualifiers: None,
4156                subpath: None,
4157                primary_language: Some("Python".to_string()),
4158                description: description.or(summary),
4159                release_date: None,
4160                parties,
4161                keywords,
4162                homepage_url: home_page,
4163                download_url: None,
4164                size: None,
4165                sha1: None,
4166                md5: None,
4167                sha256: None,
4168                sha512: None,
4169                bug_tracking_url: None,
4170                code_view_url: None,
4171                vcs_url: None,
4172                copyright: None,
4173                holder: None,
4174                declared_license_expression,
4175                declared_license_expression_spdx,
4176                license_detections,
4177                other_license_expression: None,
4178                other_license_expression_spdx: None,
4179                other_license_detections: Vec::new(),
4180                extracted_license_statement,
4181                notice_text: None,
4182                source_packages: Vec::new(),
4183                file_references: Vec::new(),
4184                is_private: false,
4185                is_virtual: true,
4186                extra_data: if extra_data.is_empty() {
4187                    None
4188                } else {
4189                    Some(extra_data)
4190                },
4191                dependencies: parsed_dependencies,
4192                repository_homepage_url: None,
4193                repository_download_url: None,
4194                api_data_url: None,
4195                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4196                purl,
4197            });
4198        } else {
4199            let resolved_package = PackageData {
4200                package_type: Some(PythonParser::PACKAGE_TYPE),
4201                namespace: None,
4202                name: name.clone(),
4203                version: version.clone(),
4204                qualifiers: None,
4205                subpath: None,
4206                primary_language: Some("Python".to_string()),
4207                description: description.or(summary),
4208                release_date: None,
4209                parties,
4210                keywords,
4211                homepage_url: home_page,
4212                download_url: None,
4213                size: None,
4214                sha1: None,
4215                md5: None,
4216                sha256: None,
4217                sha512: None,
4218                bug_tracking_url: None,
4219                code_view_url: None,
4220                vcs_url: None,
4221                copyright: None,
4222                holder: None,
4223                declared_license_expression,
4224                declared_license_expression_spdx,
4225                license_detections,
4226                other_license_expression: None,
4227                other_license_expression_spdx: None,
4228                other_license_detections: Vec::new(),
4229                extracted_license_statement,
4230                notice_text: None,
4231                source_packages: Vec::new(),
4232                file_references: Vec::new(),
4233                is_private: false,
4234                is_virtual: true,
4235                extra_data: None,
4236                dependencies: parsed_dependencies,
4237                repository_homepage_url: None,
4238                repository_download_url: None,
4239                api_data_url: None,
4240                datasource_id: Some(DatasourceId::PypiInspectDeplock),
4241                purl: purl.clone(),
4242            };
4243
4244            let resolved = package_data_to_resolved(&resolved_package);
4245            dependencies.push(Dependency {
4246                purl,
4247                extracted_requirement: None,
4248                scope: None,
4249                is_runtime: Some(true),
4250                is_optional: Some(false),
4251                is_pinned: Some(true),
4252                is_direct: Some(is_requested),
4253                resolved_package: Some(Box::new(resolved)),
4254                extra_data: None,
4255            });
4256        }
4257    }
4258
4259    if let Some(mut main_pkg) = main_package {
4260        let direct_requirement_purls: HashSet<String> = main_pkg
4261            .dependencies
4262            .iter()
4263            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4264            .collect();
4265
4266        let resolved_requirement_purls: HashSet<String> = dependencies
4267            .iter()
4268            .filter_map(|dep| dep.purl.as_deref().map(base_dependency_purl))
4269            .collect();
4270
4271        let unresolved_dependencies = main_pkg
4272            .dependencies
4273            .iter()
4274            .filter(|dep| {
4275                dep.purl.as_ref().is_some_and(|purl| {
4276                    !resolved_requirement_purls.contains(&base_dependency_purl(purl))
4277                })
4278            })
4279            .cloned()
4280            .collect::<Vec<_>>();
4281
4282        for dependency in &mut dependencies {
4283            if dependency
4284                .purl
4285                .as_ref()
4286                .is_some_and(|purl| direct_requirement_purls.contains(&base_dependency_purl(purl)))
4287            {
4288                dependency.is_direct = Some(true);
4289            }
4290        }
4291
4292        main_pkg.dependencies = dependencies;
4293        main_pkg.dependencies.extend(unresolved_dependencies);
4294        main_pkg
4295    } else {
4296        default_package_data(path)
4297    }
4298}
4299
/// Returns the part of `purl` before the first `@`, or the whole string when
/// it contains no `@`.
fn base_dependency_purl(purl: &str) -> String {
    let base = match purl.find('@') {
        Some(at) => &purl[..at],
        None => purl,
    };
    base.to_string()
}
4305
/// Nested string maps whose innermost values are lists of strings.
// NOTE(review): presumably section name -> key -> values, as produced by
// `parse_setup_cfg` — confirm against that function.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
4307
4308fn extract_from_setup_cfg(path: &Path) -> PackageData {
4309    let content = match read_file_to_string(path) {
4310        Ok(content) => content,
4311        Err(e) => {
4312            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
4313            return default_package_data(path);
4314        }
4315    };
4316
4317    let sections = parse_setup_cfg(&content);
4318    let name = get_ini_value(&sections, "metadata", "name");
4319    let version = get_ini_value(&sections, "metadata", "version");
4320    let description = get_ini_value(&sections, "metadata", "description");
4321    let author = get_ini_value(&sections, "metadata", "author");
4322    let author_email = get_ini_value(&sections, "metadata", "author_email");
4323    let maintainer = get_ini_value(&sections, "metadata", "maintainer");
4324    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
4325    let license = get_ini_value(&sections, "metadata", "license");
4326    let mut homepage_url = get_ini_value(&sections, "metadata", "url");
4327    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
4328    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
4329    let python_requires = get_ini_value(&sections, "options", "python_requires");
4330    let parsed_project_urls =
4331        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
4332    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
4333    let mut extra_data = HashMap::new();
4334
4335    let mut parties = Vec::new();
4336    if author.is_some() || author_email.is_some() {
4337        parties.push(Party {
4338            r#type: Some("person".to_string()),
4339            role: Some("author".to_string()),
4340            name: author,
4341            email: author_email,
4342            url: None,
4343            organization: None,
4344            organization_url: None,
4345            timezone: None,
4346        });
4347    }
4348
4349    if maintainer.is_some() || maintainer_email.is_some() {
4350        parties.push(Party {
4351            r#type: Some("person".to_string()),
4352            role: Some("maintainer".to_string()),
4353            name: maintainer,
4354            email: maintainer_email,
4355            url: None,
4356            organization: None,
4357            organization_url: None,
4358            timezone: None,
4359        });
4360    }
4361
4362    let declared_license_expression = None;
4363    let declared_license_expression_spdx = None;
4364    let license_detections = Vec::new();
4365    let extracted_license_statement = license.clone();
4366
4367    let dependencies = extract_setup_cfg_dependencies(&sections);
4368
4369    if let Some(value) = python_requires {
4370        extra_data.insert(
4371            "python_requires".to_string(),
4372            serde_json::Value::String(value),
4373        );
4374    }
4375
4376    apply_project_url_mappings(
4377        &parsed_project_urls,
4378        &mut homepage_url,
4379        &mut bug_tracking_url,
4380        &mut code_view_url,
4381        &mut vcs_url,
4382        &mut extra_data,
4383    );
4384
4385    let extra_data = if extra_data.is_empty() {
4386        None
4387    } else {
4388        Some(extra_data)
4389    };
4390
4391    let purl = name.as_ref().and_then(|n| {
4392        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
4393        if let Some(v) = &version {
4394            package_url.with_version(v).ok()?;
4395        }
4396        Some(package_url.to_string())
4397    });
4398
4399    PackageData {
4400        package_type: Some(PythonParser::PACKAGE_TYPE),
4401        namespace: None,
4402        name,
4403        version,
4404        qualifiers: None,
4405        subpath: None,
4406        primary_language: Some("Python".to_string()),
4407        description,
4408        release_date: None,
4409        parties,
4410        keywords,
4411        homepage_url,
4412        download_url: None,
4413        size: None,
4414        sha1: None,
4415        md5: None,
4416        sha256: None,
4417        sha512: None,
4418        bug_tracking_url,
4419        code_view_url,
4420        vcs_url,
4421        copyright: None,
4422        holder: None,
4423        declared_license_expression,
4424        declared_license_expression_spdx,
4425        license_detections,
4426        other_license_expression: None,
4427        other_license_expression_spdx: None,
4428        other_license_detections: Vec::new(),
4429        extracted_license_statement,
4430        notice_text: None,
4431        source_packages: Vec::new(),
4432        file_references: Vec::new(),
4433        is_private: has_private_classifier(&classifiers),
4434        is_virtual: false,
4435        extra_data,
4436        dependencies,
4437        repository_homepage_url: None,
4438        repository_download_url: None,
4439        api_data_url: None,
4440        datasource_id: Some(DatasourceId::PypiSetupCfg),
4441        purl,
4442    }
4443}
4444
/// Splits a comma-separated `keywords` value into individual keywords.
///
/// Every piece is trimmed and pieces that end up empty are dropped. A missing
/// value produces an empty list.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut parsed = Vec::new();

    if let Some(raw) = value {
        for piece in raw.split(',') {
            let piece = piece.trim();
            if !piece.is_empty() {
                parsed.push(piece.to_string());
            }
        }
    }

    parsed
}
4457
/// Turns `label = url` entries into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries
/// without an `=`, or with an empty label or URL, are ignored.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };

        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_string(), url.to_string()));
        }
    }

    pairs
}
4473
4474fn apply_project_url_mappings(
4475    parsed_urls: &[(String, String)],
4476    homepage_url: &mut Option<String>,
4477    bug_tracking_url: &mut Option<String>,
4478    code_view_url: &mut Option<String>,
4479    vcs_url: &mut Option<String>,
4480    extra_data: &mut HashMap<String, serde_json::Value>,
4481) {
4482    for (label, url) in parsed_urls {
4483        let label_lower = label.to_lowercase();
4484
4485        if bug_tracking_url.is_none()
4486            && matches!(
4487                label_lower.as_str(),
4488                "tracker"
4489                    | "bug reports"
4490                    | "bug tracker"
4491                    | "issues"
4492                    | "issue tracker"
4493                    | "github: issues"
4494            )
4495        {
4496            *bug_tracking_url = Some(url.clone());
4497        } else if code_view_url.is_none()
4498            && matches!(label_lower.as_str(), "source" | "source code" | "code")
4499        {
4500            *code_view_url = Some(url.clone());
4501        } else if vcs_url.is_none()
4502            && matches!(
4503                label_lower.as_str(),
4504                "github" | "gitlab" | "github: repo" | "repository"
4505            )
4506        {
4507            *vcs_url = Some(url.clone());
4508        } else if homepage_url.is_none()
4509            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
4510        {
4511            *homepage_url = Some(url.clone());
4512        } else if label_lower == "changelog" {
4513            extra_data.insert(
4514                "changelog_url".to_string(),
4515                serde_json::Value::String(url.clone()),
4516            );
4517        }
4518    }
4519
4520    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
4521        .iter()
4522        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
4523        .collect();
4524
4525    if !project_urls_json.is_empty() {
4526        extra_data.insert(
4527            "project_urls".to_string(),
4528            serde_json::Value::Object(project_urls_json),
4529        );
4530    }
4531}
4532
4533fn parse_setup_cfg(content: &str) -> IniSections {
4534    let mut sections: IniSections = HashMap::new();
4535    let mut current_section: Option<String> = None;
4536    let mut current_key: Option<String> = None;
4537
4538    for raw_line in content.lines() {
4539        let line = raw_line.trim_end_matches('\r');
4540        let trimmed = line.trim();
4541        if trimmed.is_empty() {
4542            continue;
4543        }
4544
4545        let stripped = line.trim_start();
4546        if stripped.starts_with('#') || stripped.starts_with(';') {
4547            continue;
4548        }
4549
4550        if stripped.starts_with('[') && stripped.ends_with(']') {
4551            let section_name = stripped
4552                .trim_start_matches('[')
4553                .trim_end_matches(']')
4554                .trim()
4555                .to_ascii_lowercase();
4556            current_section = if section_name.is_empty() {
4557                None
4558            } else {
4559                Some(section_name)
4560            };
4561            current_key = None;
4562            continue;
4563        }
4564
4565        if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
4566            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
4567                let value = stripped.trim();
4568                if !value.is_empty() {
4569                    sections
4570                        .entry(section.clone())
4571                        .or_default()
4572                        .entry(key.clone())
4573                        .or_default()
4574                        .push(value.to_string());
4575                }
4576            }
4577            continue;
4578        }
4579
4580        if let Some((key, value)) = stripped.split_once('=')
4581            && let Some(section) = current_section.as_ref()
4582        {
4583            let key_name = key.trim().to_ascii_lowercase();
4584            let value_trimmed = value.trim();
4585            let entry = sections
4586                .entry(section.clone())
4587                .or_default()
4588                .entry(key_name.clone())
4589                .or_default();
4590            if !value_trimmed.is_empty() {
4591                entry.push(value_trimmed.to_string());
4592            }
4593            current_key = Some(key_name);
4594        }
4595    }
4596
4597    sections
4598}
4599
4600fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
4601    sections
4602        .get(&section.to_ascii_lowercase())
4603        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4604        .and_then(|entries| entries.first())
4605        .map(|value| value.trim().to_string())
4606}
4607
4608fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
4609    sections
4610        .get(&section.to_ascii_lowercase())
4611        .and_then(|values| values.get(&key.to_ascii_lowercase()))
4612        .cloned()
4613        .unwrap_or_default()
4614}
4615
4616fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
4617    let mut dependencies = Vec::new();
4618
4619    for (sub_section, scope) in [
4620        ("install_requires", "install"),
4621        ("tests_require", "test"),
4622        ("setup_requires", "setup"),
4623    ] {
4624        let reqs = get_ini_values(sections, "options", sub_section);
4625        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
4626    }
4627
4628    if let Some(extras) = sections.get("options.extras_require") {
4629        let mut extra_items: Vec<_> = extras.iter().collect();
4630        extra_items.sort_by_key(|(name, _)| *name);
4631        for (extra_name, reqs) in extra_items {
4632            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
4633        }
4634    }
4635
4636    dependencies
4637}
4638
4639fn parse_setup_cfg_requirements(
4640    reqs: &[String],
4641    scope: &str,
4642    is_optional: bool,
4643) -> Vec<Dependency> {
4644    reqs.iter()
4645        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
4646        .collect()
4647}
4648
4649fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
4650    let trimmed = req.trim();
4651    if trimmed.is_empty() || trimmed.starts_with('#') {
4652        return None;
4653    }
4654
4655    let name = extract_setup_cfg_dependency_name(trimmed)?;
4656    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4657
4658    Some(Dependency {
4659        purl: Some(purl.to_string()),
4660        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
4661        scope: Some(scope.to_string()),
4662        is_runtime: Some(true),
4663        is_optional: Some(is_optional),
4664        is_pinned: Some(false),
4665        is_direct: Some(true),
4666        resolved_package: None,
4667        extra_data: None,
4668    })
4669}
4670
/// Extracts the package name from a requirement string.
///
/// The name is the leading part of the trimmed requirement, up to the first
/// whitespace character or the first character that starts another part of a
/// requirement: a version operator (`<`, `>`, `=`, `!`, `~`), a parenthesised
/// version list (`(`), extras (`[`), a direct-reference URL (`@`) or an
/// environment marker (`;`).
///
/// Returns `None` when the requirement is blank or does not start with a name.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let requirement = req.trim();

    // `(` and `@` are included so that `foo(>=1.0)` and `pkg@https://...`
    // (both valid without surrounding spaces) still yield a clean name.
    let is_name_terminator = |c: char| {
        c.is_whitespace() || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
    };

    let end = requirement
        .find(is_name_terminator)
        .unwrap_or(requirement.len());
    let name = &requirement[..end];

    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
4687
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    // Joining the whitespace-separated pieces drops all whitespace, including
    // any that sits inside the requirement.
    req.split_whitespace().collect()
}
4691
/// Extracts the quoted string assigned to `key` in `setup.py` source text.
///
/// The search looks for `key`, an `=` with at most one space on either side,
/// and an opening quote. Double-quoted forms are tried before single-quoted
/// ones, and for each form only its first occurrence in `content` is
/// considered. The returned value runs up to the next occurrence of the same
/// quote character that opened it, so a value such as `"it's"` is returned
/// whole.
///
/// Returns `None` when no form matches or no matching closing quote follows.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    // Spacing variants around `=`, in the order they are tried.
    const ASSIGNMENTS: [&str; 4] = ["=", " =", "= ", " = "];

    for quote in ['"', '\''] {
        for assignment in ASSIGNMENTS {
            let pattern = format!("{}{}{}", key, assignment, quote);

            let Some(start) = content.find(&pattern) else {
                continue;
            };
            let remaining = &content[start + pattern.len()..];

            // Close on the quote that opened the literal; the other quote
            // character may legitimately appear inside the value.
            if let Some(end) = remaining.find(quote) {
                return Some(remaining[..end].to_string());
            }
        }
    }

    None
}
4717
4718fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
4719    let mut dependencies = Vec::new();
4720
4721    if let Some(tests_deps) = extract_tests_require(content) {
4722        dependencies.extend(tests_deps);
4723    }
4724
4725    if let Some(extras_deps) = extract_extras_require(content) {
4726        dependencies.extend(extras_deps);
4727    }
4728
4729    dependencies
4730}
4731
4732fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
4733    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
4734    let re = Regex::new(pattern).ok()?;
4735    let captures = re.captures(content)?;
4736    let deps_str = captures.get(1)?.as_str();
4737
4738    let deps = parse_setup_py_dep_list(deps_str, "test", true);
4739    if deps.is_empty() { None } else { Some(deps) }
4740}
4741
4742fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
4743    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
4744    let re = Regex::new(pattern).ok()?;
4745    let captures = re.captures(content)?;
4746    let dict_content = captures.get(1)?.as_str();
4747
4748    let mut all_deps = Vec::new();
4749
4750    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
4751    let entry_re = Regex::new(entry_pattern).ok()?;
4752
4753    for entry_cap in entry_re.captures_iter(dict_content) {
4754        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
4755            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
4756            all_deps.extend(deps);
4757        }
4758    }
4759
4760    if all_deps.is_empty() {
4761        None
4762    } else {
4763        Some(all_deps)
4764    }
4765}
4766
4767fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
4768    let dep_pattern = r#"['"]([^'"]+)['"]"#;
4769    let re = match Regex::new(dep_pattern) {
4770        Ok(r) => r,
4771        Err(_) => return Vec::new(),
4772    };
4773
4774    re.captures_iter(deps_str)
4775        .filter_map(|cap| {
4776            let dep_str = cap.get(1)?.as_str().trim();
4777            if dep_str.is_empty() {
4778                return None;
4779            }
4780
4781            let name = extract_setup_cfg_dependency_name(dep_str)?;
4782            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
4783
4784            Some(Dependency {
4785                purl: Some(purl.to_string()),
4786                extracted_requirement: Some(dep_str.to_string()),
4787                scope: Some(scope.to_string()),
4788                is_runtime: Some(true),
4789                is_optional: Some(is_optional),
4790                is_pinned: Some(false),
4791                is_direct: Some(true),
4792                resolved_package: None,
4793                extra_data: None,
4794            })
4795        })
4796        .collect()
4797}
4798
4799/// Reads and parses a TOML file
4800pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
4801    let content = read_file_to_string(path).map_err(|e| e.to_string())?;
4802    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
4803}
4804
4805/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
4806///
4807/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
4808/// Essential for SBOM compliance and package integrity verification.
4809///
4810/// # Returns
4811///
4812/// - `(Some(size), Some(hash))` on success
4813/// - `(None, None)` if file cannot be opened
4814/// - `(Some(size), None)` if hash calculation fails during read
4815fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<String>) {
4816    let mut file = match File::open(path) {
4817        Ok(f) => f,
4818        Err(_) => return (None, None),
4819    };
4820
4821    let metadata = match file.metadata() {
4822        Ok(m) => m,
4823        Err(_) => return (None, None),
4824    };
4825    let size = metadata.len();
4826
4827    let mut hasher = Sha256::new();
4828    let mut buffer = vec![0; 8192];
4829
4830    loop {
4831        match file.read(&mut buffer) {
4832            Ok(0) => break,
4833            Ok(n) => hasher.update(&buffer[..n]),
4834            Err(_) => return (Some(size), None),
4835        }
4836    }
4837
4838    let hash = hex::encode(hasher.finalize());
4839    (Some(size), Some(hash))
4840}
4841
4842fn default_package_data(path: &Path) -> PackageData {
4843    PackageData {
4844        package_type: Some(PythonParser::PACKAGE_TYPE),
4845        primary_language: Some("Python".to_string()),
4846        datasource_id: infer_python_datasource_id(path),
4847        ..Default::default()
4848    }
4849}
4850
4851fn infer_python_datasource_id(path: &Path) -> Option<DatasourceId> {
4852    let file_name = path.file_name().and_then(|name| name.to_str());
4853
4854    match file_name {
4855        Some("pyproject.toml") => {
4856            if read_toml_file(path)
4857                .ok()
4858                .and_then(|content| content.get("tool").and_then(|v| v.as_table()).cloned())
4859                .and_then(|tool| tool.get("poetry").and_then(|v| v.as_table()).cloned())
4860                .is_some()
4861            {
4862                Some(DatasourceId::PypiPoetryPyprojectToml)
4863            } else {
4864                Some(DatasourceId::PypiPyprojectToml)
4865            }
4866        }
4867        Some(name) if name == "setup.py" || name.ends_with("_setup.py") => {
4868            Some(DatasourceId::PypiSetupPy)
4869        }
4870        Some("setup.cfg") => Some(DatasourceId::PypiSetupCfg),
4871        Some("PKG-INFO") => Some(detect_pkg_info_datasource_id(path)),
4872        Some("METADATA") if is_installed_wheel_metadata_path(path) => {
4873            Some(DatasourceId::PypiWheelMetadata)
4874        }
4875        Some("pypi.json") => Some(DatasourceId::PypiJson),
4876        Some("pip-inspect.deplock") => Some(DatasourceId::PypiInspectDeplock),
4877        Some("origin.json") if is_pip_cache_origin_json(path) => {
4878            Some(DatasourceId::PypiPipOriginJson)
4879        }
4880        _ if file_name.is_some_and(is_likely_python_sdist_filename) => {
4881            Some(DatasourceId::PypiSdist)
4882        }
4883        _ if path
4884            .extension()
4885            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) =>
4886        {
4887            Some(DatasourceId::PypiWheel)
4888        }
4889        _ if path
4890            .extension()
4891            .is_some_and(|ext| ext.eq_ignore_ascii_case("egg")) =>
4892        {
4893            Some(DatasourceId::PypiEgg)
4894        }
4895        _ => None,
4896    }
4897}
4898
// Registration of this parser via the crate's `register_parser!` macro.
// Arguments as passed here, in order: a human-readable description of the
// supported inputs, the glob patterns of files to consider, then "pypi",
// "Python" and a documentation URL.
// NOTE(review): the last three presumably denote package type, primary
// language and reference docs; confirm against the macro definition.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, *_setup.py, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);