Skip to main content

provenant/parsers/
python.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
6//! # Supported Formats
7//! - pyproject.toml (PEP 621)
8//! - setup.py (AST parsing, no code execution)
9//! - setup.cfg (INI format)
10//! - PKG-INFO / METADATA (RFC 822 format)
11//! - .whl archives (wheel format)
12//! - .egg archives (legacy egg format)
13//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
35use crate::parsers::utils::{read_file_to_string, split_name_email};
36use base64::Engine;
37use base64::engine::general_purpose::URL_SAFE_NO_PAD;
38use csv::ReaderBuilder;
39use log::warn;
40use packageurl::PackageUrl;
41use regex::Regex;
42use rustpython_parser::{Parse, ast};
43use serde_json::{Map as JsonMap, Value as JsonValue};
44use sha2::{Digest, Sha256};
45use std::collections::{HashMap, HashSet};
46use std::fs::File;
47use std::io::Read;
48use std::path::{Path, PathBuf};
49use toml::Value as TomlValue;
50use toml::map::Map as TomlMap;
51use zip::ZipArchive;
52
53use super::PackageParser;
54
// Key names looked up in pyproject.toml.
const FIELD_PROJECT: &str = "project";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_LICENSE: &str = "license";
const FIELD_AUTHORS: &str = "authors";
const FIELD_MAINTAINERS: &str = "maintainers";
const FIELD_URLS: &str = "urls";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
const FIELD_DEPENDENCY_GROUPS: &str = "dependency-groups";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";

// Limits for setup.py analysis (1 MiB of source, node count, nesting depth).
const MAX_SETUP_PY_BYTES: usize = 1024 * 1024;
const MAX_SETUP_PY_AST_NODES: usize = 10_000;
const MAX_SETUP_PY_AST_DEPTH: usize = 50;

// Limits applied to .whl / .egg archives.
/// 100MB: cap on both the on-disk archive size and the summed uncompressed size.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024;
/// 50MB: cap on the uncompressed size of a single archive entry.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
/// 100:1: highest uncompressed/compressed ratio that is not treated as suspicious.
const MAX_COMPRESSION_RATIO: f64 = 100.0;
75
76/// Python package parser supporting 11 manifest formats.
77///
78/// Extracts metadata from Python package files including pyproject.toml, setup.py,
79/// setup.cfg, PKG-INFO, METADATA, pip-inspect lockfiles, and .whl/.egg archives.
80///
81/// # Security
82///
83/// setup.py files are parsed using AST analysis rather than code execution to prevent
84/// arbitrary code execution during scanning. See `extract_from_setup_py_ast` for details.
85pub struct PythonParser;
86
87impl PackageParser for PythonParser {
88    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
89
90    fn extract_packages(path: &Path) -> Vec<PackageData> {
91        vec![
92            if path.file_name().unwrap_or_default() == "pyproject.toml" {
93                extract_from_pyproject_toml(path)
94            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
95                extract_from_setup_cfg(path)
96            } else if path.file_name().unwrap_or_default() == "setup.py" {
97                extract_from_setup_py(path)
98            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
99                extract_from_rfc822_metadata(path, DatasourceId::PypiSdistPkginfo)
100            } else if path.file_name().unwrap_or_default() == "METADATA" {
101                extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
102            } else if is_pip_cache_origin_json(path) {
103                extract_from_pip_origin_json(path)
104            } else if path.file_name().unwrap_or_default() == "pypi.json" {
105                extract_from_pypi_json(path)
106            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
107                extract_from_pip_inspect(path)
108            } else if path
109                .extension()
110                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
111            {
112                extract_from_wheel_archive(path)
113            } else if path
114                .extension()
115                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
116            {
117                extract_from_egg_archive(path)
118            } else {
119                default_package_data()
120            },
121        ]
122    }
123
124    fn is_match(path: &Path) -> bool {
125        if let Some(filename) = path.file_name()
126            && (filename == "pyproject.toml"
127                || filename == "setup.cfg"
128                || filename == "setup.py"
129                || filename == "PKG-INFO"
130                || filename == "METADATA"
131                || filename == "pypi.json"
132                || filename == "pip-inspect.deplock"
133                || is_pip_cache_origin_json(path))
134        {
135            return true;
136        }
137
138        if let Some(extension) = path.extension() {
139            let ext = extension.to_string_lossy().to_lowercase();
140            if ext == "whl" || ext == "egg" {
141                return true;
142            }
143        }
144
145        false
146    }
147}
148
/// Values taken from a `WHEEL` file by `parse_installed_wheel_metadata`.
#[derive(Clone, Debug)]
struct InstalledWheelMetadata {
    /// All values of the `tag` header; never empty for a parsed file.
    wheel_tags: Vec<String>,
    /// First value of the `wheel-version` header, if present.
    wheel_version: Option<String>,
    /// First value of the `generator` header, if present.
    wheel_generator: Option<String>,
    /// `root-is-purelib` header parsed as a boolean; `None` when absent or unrecognised.
    root_is_purelib: Option<bool>,
    /// Result of `compress_wheel_tags` over `wheel_tags`.
    compressed_tag: Option<String>,
}
157
158fn merge_sibling_wheel_metadata(path: &Path, package_data: &mut PackageData) {
159    let Some(parent) = path.parent() else {
160        return;
161    };
162
163    if !parent
164        .file_name()
165        .and_then(|name| name.to_str())
166        .is_some_and(|name| name.ends_with(".dist-info"))
167    {
168        return;
169    }
170
171    let wheel_path = parent.join("WHEEL");
172    if !wheel_path.exists() {
173        return;
174    }
175
176    let Ok(content) = read_file_to_string(&wheel_path) else {
177        warn!("Failed to read sibling WHEEL file at {:?}", wheel_path);
178        return;
179    };
180
181    let Some(wheel_metadata) = parse_installed_wheel_metadata(&content) else {
182        return;
183    };
184
185    apply_installed_wheel_metadata(package_data, &wheel_metadata);
186}
187
188fn parse_installed_wheel_metadata(content: &str) -> Option<InstalledWheelMetadata> {
189    use super::rfc822::{get_header_all, get_header_first};
190
191    let metadata = super::rfc822::parse_rfc822_content(content);
192    let wheel_tags = get_header_all(&metadata.headers, "tag");
193    if wheel_tags.is_empty() {
194        return None;
195    }
196
197    let wheel_version = get_header_first(&metadata.headers, "wheel-version");
198    let wheel_generator = get_header_first(&metadata.headers, "generator");
199    let root_is_purelib =
200        get_header_first(&metadata.headers, "root-is-purelib").and_then(|value| {
201            match value.to_ascii_lowercase().as_str() {
202                "true" => Some(true),
203                "false" => Some(false),
204                _ => None,
205            }
206        });
207
208    let compressed_tag = compress_wheel_tags(&wheel_tags);
209
210    Some(InstalledWheelMetadata {
211        wheel_tags,
212        wheel_version,
213        wheel_generator,
214        root_is_purelib,
215        compressed_tag,
216    })
217}
218
/// Collapses several `python-abi-platform` tags into one compressed tag.
///
/// - No tags: `None`.
/// - One tag: returned unchanged, without validating its shape.
/// - Several tags: every tag must have three `-` separated parts and all must share
///   the same ABI and platform parts; the python parts are joined with `.`.
///   Anything else yields `None`.
fn compress_wheel_tags(tags: &[String]) -> Option<String> {
    /// Splits a tag into (python, abi, platform); the platform keeps any further `-`.
    fn split_tag(tag: &str) -> Option<(&str, &str, &str)> {
        let mut pieces = tag.splitn(3, '-');
        Some((pieces.next()?, pieces.next()?, pieces.next()?))
    }

    match tags {
        [] => None,
        [only] => Some(only.clone()),
        [first, rest @ ..] => {
            let (first_python, shared_abi, shared_platform) = split_tag(first)?;
            let mut pythons = vec![first_python];

            for tag in rest {
                let (python, abi, platform) = split_tag(tag)?;
                if abi != shared_abi || platform != shared_platform {
                    return None;
                }
                pythons.push(python);
            }

            Some(format!(
                "{}-{}-{}",
                pythons.join("."),
                shared_abi,
                shared_platform
            ))
        }
    }
}
256
257fn apply_installed_wheel_metadata(
258    package_data: &mut PackageData,
259    wheel_metadata: &InstalledWheelMetadata,
260) {
261    let extra_data = package_data.extra_data.get_or_insert_with(HashMap::new);
262    extra_data.insert(
263        "wheel_tags".to_string(),
264        JsonValue::Array(
265            wheel_metadata
266                .wheel_tags
267                .iter()
268                .cloned()
269                .map(JsonValue::String)
270                .collect(),
271        ),
272    );
273
274    if let Some(wheel_version) = &wheel_metadata.wheel_version {
275        extra_data.insert(
276            "wheel_version".to_string(),
277            JsonValue::String(wheel_version.clone()),
278        );
279    }
280
281    if let Some(wheel_generator) = &wheel_metadata.wheel_generator {
282        extra_data.insert(
283            "wheel_generator".to_string(),
284            JsonValue::String(wheel_generator.clone()),
285        );
286    }
287
288    if let Some(root_is_purelib) = wheel_metadata.root_is_purelib {
289        extra_data.insert(
290            "root_is_purelib".to_string(),
291            JsonValue::Bool(root_is_purelib),
292        );
293    }
294
295    if let (Some(name), Some(version), Some(extension)) = (
296        package_data.name.as_deref(),
297        package_data.version.as_deref(),
298        wheel_metadata.compressed_tag.as_deref(),
299    ) {
300        package_data.purl = build_pypi_purl_with_extension(name, Some(version), extension);
301    }
302}
303
/// Returns `true` for a file named `origin.json` that has an ancestor directory
/// named `wheels` (compared ASCII case-insensitively).
fn is_pip_cache_origin_json(path: &Path) -> bool {
    let named_origin_json = path.file_name().and_then(|name| name.to_str()) == Some("origin.json");
    if !named_origin_json {
        return false;
    }

    // `skip(1)` drops the path itself so only enclosing directories are inspected.
    for directory in path.ancestors().skip(1) {
        if let Some(dir_name) = directory.file_name().and_then(|name| name.to_str()) {
            if dir_name.eq_ignore_ascii_case("wheels") {
                return true;
            }
        }
    }

    false
}
313
314fn extract_from_pip_origin_json(path: &Path) -> PackageData {
315    let content = match read_file_to_string(path) {
316        Ok(content) => content,
317        Err(e) => {
318            warn!("Failed to read pip cache origin.json at {:?}: {}", path, e);
319            return default_package_data();
320        }
321    };
322
323    let root: JsonValue = match serde_json::from_str(&content) {
324        Ok(root) => root,
325        Err(e) => {
326            warn!("Failed to parse pip cache origin.json at {:?}: {}", path, e);
327            return default_package_data();
328        }
329    };
330
331    let Some(download_url) = root.get("url").and_then(|value| value.as_str()) else {
332        warn!("No url found in pip cache origin.json at {:?}", path);
333        return default_package_data();
334    };
335
336    let sibling_wheel = find_sibling_cached_wheel(path);
337    let name_version = parse_name_version_from_origin_url(download_url).or_else(|| {
338        sibling_wheel
339            .as_ref()
340            .map(|wheel_info| (wheel_info.name.clone(), wheel_info.version.clone()))
341    });
342
343    let Some((name, version)) = name_version else {
344        warn!(
345            "Failed to infer package name/version from pip cache origin.json at {:?}",
346            path
347        );
348        return default_package_data();
349    };
350
351    let (repository_homepage_url, repository_download_url, api_data_url, plain_purl) =
352        build_pypi_urls(Some(&name), Some(&version));
353    let purl = sibling_wheel
354        .as_ref()
355        .and_then(|wheel_info| build_wheel_purl(Some(&name), Some(&version), wheel_info))
356        .or(plain_purl);
357
358    PackageData {
359        package_type: Some(PythonParser::PACKAGE_TYPE),
360        primary_language: Some("Python".to_string()),
361        name: Some(name),
362        version: Some(version),
363        datasource_id: Some(DatasourceId::PypiPipOriginJson),
364        download_url: Some(download_url.to_string()),
365        sha256: extract_sha256_from_origin_json(&root),
366        repository_homepage_url,
367        repository_download_url,
368        api_data_url,
369        purl,
370        ..Default::default()
371    }
372}
373
374fn find_sibling_cached_wheel(path: &Path) -> Option<WheelInfo> {
375    let parent = path.parent()?;
376    let entries = parent.read_dir().ok()?;
377
378    for entry in entries.flatten() {
379        let sibling_path = entry.path();
380        if sibling_path
381            .extension()
382            .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
383            && let Some(wheel_info) = parse_wheel_filename(&sibling_path)
384        {
385            return Some(wheel_info);
386        }
387    }
388
389    None
390}
391
392fn parse_name_version_from_origin_url(url: &str) -> Option<(String, String)> {
393    let file_name = url.rsplit('/').next()?;
394
395    if file_name.ends_with(".whl") {
396        return parse_wheel_filename(Path::new(file_name))
397            .map(|wheel_info| (wheel_info.name, wheel_info.version));
398    }
399
400    let stem = strip_python_archive_extension(file_name)?;
401    let (name, version) = stem.rsplit_once('-')?;
402    if name.is_empty() || version.is_empty() {
403        return None;
404    }
405
406    Some((name.replace('_', "-"), version.to_string()))
407}
408
/// Removes a known Python archive extension from `file_name`.
///
/// Matching is case-sensitive; returns `None` when no known extension is present.
fn strip_python_archive_extension(file_name: &str) -> Option<&str> {
    const ARCHIVE_SUFFIXES: [&str; 6] =
        [".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".whl"];

    for suffix in ARCHIVE_SUFFIXES {
        if let Some(stem) = file_name.strip_suffix(suffix) {
            return Some(stem);
        }
    }

    None
}
414
415fn extract_sha256_from_origin_json(root: &JsonValue) -> Option<String> {
416    root.pointer("/archive_info/hashes/sha256")
417        .and_then(|value| value.as_str())
418        .map(ToOwned::to_owned)
419        .or_else(|| {
420            root.pointer("/archive_info/hash")
421                .and_then(|value| value.as_str())
422                .and_then(normalize_origin_hash)
423        })
424}
425
/// Extracts a SHA-256 value from a single hash string.
///
/// Accepts `sha256=<value>` and `sha256:<value>` (the value is not validated) as
/// well as a bare 64-character hexadecimal string. Anything else yields `None`.
fn normalize_origin_hash(hash: &str) -> Option<String> {
    let prefixed = ["sha256=", "sha256:"]
        .iter()
        .find_map(|prefix| hash.strip_prefix(*prefix));
    if let Some(digest) = prefixed {
        return Some(digest.to_string());
    }

    let is_bare_sha256_hex = hash.len() == 64 && hash.bytes().all(|byte| byte.is_ascii_hexdigit());
    is_bare_sha256_hex.then(|| hash.to_string())
}
438
439fn extract_from_rfc822_metadata(path: &Path, datasource_id: DatasourceId) -> PackageData {
440    let content = match read_file_to_string(path) {
441        Ok(content) => content,
442        Err(e) => {
443            warn!("Failed to read metadata at {:?}: {}", path, e);
444            return default_package_data();
445        }
446    };
447
448    let metadata = super::rfc822::parse_rfc822_content(&content);
449    let mut package_data = build_package_data_from_rfc822(&metadata, datasource_id);
450    merge_sibling_metadata_dependencies(path, &mut package_data);
451    merge_sibling_metadata_file_references(path, &mut package_data);
452    if datasource_id == DatasourceId::PypiWheelMetadata {
453        merge_sibling_wheel_metadata(path, &mut package_data);
454    }
455    package_data
456}
457
458fn merge_sibling_metadata_dependencies(path: &Path, package_data: &mut PackageData) {
459    let mut extra_dependencies = Vec::new();
460
461    if let Some(parent) = path.parent() {
462        let direct_requires = parent.join("requires.txt");
463        if direct_requires.exists()
464            && let Ok(content) = read_file_to_string(&direct_requires)
465        {
466            extra_dependencies.extend(parse_requires_txt(&content));
467        }
468
469        let sibling_egg_info_requires = parent
470            .read_dir()
471            .ok()
472            .into_iter()
473            .flatten()
474            .flatten()
475            .find_map(|entry| {
476                let child_path = entry.path();
477                if child_path.is_dir()
478                    && child_path
479                        .file_name()
480                        .and_then(|name| name.to_str())
481                        .is_some_and(|name| name.ends_with(".egg-info"))
482                {
483                    let requires = child_path.join("requires.txt");
484                    requires.exists().then_some(requires)
485                } else {
486                    None
487                }
488            });
489
490        if let Some(requires_path) = sibling_egg_info_requires
491            && let Ok(content) = read_file_to_string(&requires_path)
492        {
493            extra_dependencies.extend(parse_requires_txt(&content));
494        }
495    }
496
497    for dependency in extra_dependencies {
498        if !package_data.dependencies.iter().any(|existing| {
499            existing.purl == dependency.purl
500                && existing.scope == dependency.scope
501                && existing.extracted_requirement == dependency.extracted_requirement
502                && existing.extra_data == dependency.extra_data
503        }) {
504            package_data.dependencies.push(dependency);
505        }
506    }
507}
508
509fn merge_sibling_metadata_file_references(path: &Path, package_data: &mut PackageData) {
510    let mut extra_refs = Vec::new();
511
512    if let Some(parent) = path.parent() {
513        let record_path = parent.join("RECORD");
514        if record_path.exists()
515            && let Ok(content) = read_file_to_string(&record_path)
516        {
517            extra_refs.extend(parse_record_csv(&content));
518        }
519
520        let installed_files_path = parent.join("installed-files.txt");
521        if installed_files_path.exists()
522            && let Ok(content) = read_file_to_string(&installed_files_path)
523        {
524            extra_refs.extend(parse_installed_files_txt(&content));
525        }
526
527        let sources_path = parent.join("SOURCES.txt");
528        if sources_path.exists()
529            && let Ok(content) = read_file_to_string(&sources_path)
530        {
531            extra_refs.extend(parse_sources_txt(&content));
532        }
533    }
534
535    for file_ref in extra_refs {
536        if !package_data
537            .file_references
538            .iter()
539            .any(|existing| existing.path == file_ref.path)
540        {
541            package_data.file_references.push(file_ref);
542        }
543    }
544}
545
546fn validate_zip_archive<R: Read + std::io::Seek>(
547    archive: &mut ZipArchive<R>,
548    path: &Path,
549    archive_type: &str,
550) -> Result<u64, String> {
551    let mut total_extracted = 0u64;
552
553    for i in 0..archive.len() {
554        if let Ok(file) = archive.by_index(i) {
555            let compressed_size = file.compressed_size();
556            let uncompressed_size = file.size();
557
558            if compressed_size > 0 {
559                let ratio = uncompressed_size as f64 / compressed_size as f64;
560                if ratio > MAX_COMPRESSION_RATIO {
561                    warn!(
562                        "Suspicious compression ratio in {} {:?}: {:.2}:1",
563                        archive_type, path, ratio
564                    );
565                    continue;
566                }
567            }
568
569            if uncompressed_size > MAX_FILE_SIZE {
570                warn!(
571                    "File too large in {} {:?}: {} bytes (limit: {} bytes)",
572                    archive_type, path, uncompressed_size, MAX_FILE_SIZE
573                );
574                continue;
575            }
576
577            total_extracted += uncompressed_size;
578            if total_extracted > MAX_ARCHIVE_SIZE {
579                let msg = format!(
580                    "Total extracted size exceeds limit for {} {:?}",
581                    archive_type, path
582                );
583                warn!("{}", msg);
584                return Err(msg);
585            }
586        }
587    }
588
589    Ok(total_extracted)
590}
591
592fn extract_from_wheel_archive(path: &Path) -> PackageData {
593    let metadata = match std::fs::metadata(path) {
594        Ok(m) => m,
595        Err(e) => {
596            warn!(
597                "Failed to read metadata for wheel archive {:?}: {}",
598                path, e
599            );
600            return default_package_data();
601        }
602    };
603
604    if metadata.len() > MAX_ARCHIVE_SIZE {
605        warn!(
606            "Wheel archive too large: {} bytes (limit: {} bytes)",
607            metadata.len(),
608            MAX_ARCHIVE_SIZE
609        );
610        return default_package_data();
611    }
612
613    let file = match File::open(path) {
614        Ok(f) => f,
615        Err(e) => {
616            warn!("Failed to open wheel archive {:?}: {}", path, e);
617            return default_package_data();
618        }
619    };
620
621    let mut archive = match ZipArchive::new(file) {
622        Ok(a) => a,
623        Err(e) => {
624            warn!("Failed to read wheel archive {:?}: {}", path, e);
625            return default_package_data();
626        }
627    };
628
629    if validate_zip_archive(&mut archive, path, "wheel").is_err() {
630        return default_package_data();
631    }
632
633    let metadata_path = find_wheel_metadata_path(&mut archive);
634    let metadata_path = match metadata_path {
635        Some(p) => p,
636        None => {
637            warn!("No METADATA file found in wheel archive {:?}", path);
638            return default_package_data();
639        }
640    };
641
642    let content = match read_zip_entry(&mut archive, &metadata_path) {
643        Ok(c) => c,
644        Err(e) => {
645            warn!("Failed to read METADATA from {:?}: {}", path, e);
646            return default_package_data();
647        }
648    };
649
650    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiWheel);
651
652    let (size, sha256) = calculate_file_checksums(path);
653    package_data.size = size;
654    package_data.sha256 = sha256;
655
656    if let Some(record_path) = find_wheel_record_path(&mut archive)
657        && let Ok(record_content) = read_zip_entry(&mut archive, &record_path)
658    {
659        package_data.file_references = parse_record_csv(&record_content);
660    }
661
662    if let Some(wheel_info) = parse_wheel_filename(path) {
663        if package_data.name.is_none() {
664            package_data.name = Some(wheel_info.name.clone());
665        }
666        if package_data.version.is_none() {
667            package_data.version = Some(wheel_info.version.clone());
668        }
669
670        package_data.purl = build_wheel_purl(
671            package_data.name.as_deref(),
672            package_data.version.as_deref(),
673            &wheel_info,
674        );
675
676        let mut extra_data = package_data.extra_data.unwrap_or_default();
677        extra_data.insert(
678            "python_requires".to_string(),
679            serde_json::Value::String(wheel_info.python_tag.clone()),
680        );
681        extra_data.insert(
682            "abi_tag".to_string(),
683            serde_json::Value::String(wheel_info.abi_tag.clone()),
684        );
685        extra_data.insert(
686            "platform_tag".to_string(),
687            serde_json::Value::String(wheel_info.platform_tag.clone()),
688        );
689        package_data.extra_data = Some(extra_data);
690    }
691
692    package_data
693}
694
695fn extract_from_egg_archive(path: &Path) -> PackageData {
696    let metadata = match std::fs::metadata(path) {
697        Ok(m) => m,
698        Err(e) => {
699            warn!("Failed to read metadata for egg archive {:?}: {}", path, e);
700            return default_package_data();
701        }
702    };
703
704    if metadata.len() > MAX_ARCHIVE_SIZE {
705        warn!(
706            "Egg archive too large: {} bytes (limit: {} bytes)",
707            metadata.len(),
708            MAX_ARCHIVE_SIZE
709        );
710        return default_package_data();
711    }
712
713    let file = match File::open(path) {
714        Ok(f) => f,
715        Err(e) => {
716            warn!("Failed to open egg archive {:?}: {}", path, e);
717            return default_package_data();
718        }
719    };
720
721    let mut archive = match ZipArchive::new(file) {
722        Ok(a) => a,
723        Err(e) => {
724            warn!("Failed to read egg archive {:?}: {}", path, e);
725            return default_package_data();
726        }
727    };
728
729    if validate_zip_archive(&mut archive, path, "egg").is_err() {
730        return default_package_data();
731    }
732
733    let pkginfo_path = find_egg_pkginfo_path(&mut archive);
734    let pkginfo_path = match pkginfo_path {
735        Some(p) => p,
736        None => {
737            warn!("No PKG-INFO file found in egg archive {:?}", path);
738            return default_package_data();
739        }
740    };
741
742    let content = match read_zip_entry(&mut archive, &pkginfo_path) {
743        Ok(c) => c,
744        Err(e) => {
745            warn!("Failed to read PKG-INFO from {:?}: {}", path, e);
746            return default_package_data();
747        }
748    };
749
750    let mut package_data = python_parse_rfc822_content(&content, DatasourceId::PypiEgg);
751
752    let (size, sha256) = calculate_file_checksums(path);
753    package_data.size = size;
754    package_data.sha256 = sha256;
755
756    if let Some(installed_files_path) = find_egg_installed_files_path(&mut archive)
757        && let Ok(installed_files_content) = read_zip_entry(&mut archive, &installed_files_path)
758    {
759        package_data.file_references = parse_installed_files_txt(&installed_files_content);
760    }
761
762    if let Some(egg_info) = parse_egg_filename(path) {
763        if package_data.name.is_none() {
764            package_data.name = Some(egg_info.name.clone());
765        }
766        if package_data.version.is_none() {
767            package_data.version = Some(egg_info.version.clone());
768        }
769
770        if let Some(python_version) = &egg_info.python_version {
771            let mut extra_data = package_data.extra_data.unwrap_or_default();
772            extra_data.insert(
773                "python_version".to_string(),
774                serde_json::Value::String(python_version.clone()),
775            );
776            package_data.extra_data = Some(extra_data);
777        }
778    }
779
780    package_data.purl = build_egg_purl(
781        package_data.name.as_deref(),
782        package_data.version.as_deref(),
783    );
784
785    package_data
786}
787
788fn find_wheel_metadata_path<R: Read + std::io::Seek>(
789    archive: &mut ZipArchive<R>,
790) -> Option<String> {
791    for i in 0..archive.len() {
792        if let Ok(file) = archive.by_index_raw(i) {
793            let name = file.name();
794            if name.ends_with(".dist-info/METADATA") {
795                return Some(name.to_string());
796            }
797        }
798    }
799    None
800}
801
802fn find_egg_pkginfo_path<R: Read + std::io::Seek>(archive: &mut ZipArchive<R>) -> Option<String> {
803    for i in 0..archive.len() {
804        if let Ok(file) = archive.by_index_raw(i) {
805            let name = file.name();
806            if name.ends_with("EGG-INFO/PKG-INFO") || name.ends_with(".egg-info/PKG-INFO") {
807                return Some(name.to_string());
808            }
809        }
810    }
811    None
812}
813
814fn read_zip_entry<R: Read + std::io::Seek>(
815    archive: &mut ZipArchive<R>,
816    path: &str,
817) -> Result<String, String> {
818    let mut file = archive
819        .by_name(path)
820        .map_err(|e| format!("Failed to find entry {}: {}", path, e))?;
821    let mut content = String::new();
822    file.read_to_string(&mut content)
823        .map_err(|e| format!("Failed to read {}: {}", path, e))?;
824    Ok(content)
825}
826
827fn find_wheel_record_path<R: Read + std::io::Seek>(archive: &mut ZipArchive<R>) -> Option<String> {
828    for i in 0..archive.len() {
829        if let Ok(file) = archive.by_index_raw(i) {
830            let name = file.name();
831            if name.ends_with(".dist-info/RECORD") {
832                return Some(name.to_string());
833            }
834        }
835    }
836    None
837}
838
839fn find_egg_installed_files_path<R: Read + std::io::Seek>(
840    archive: &mut ZipArchive<R>,
841) -> Option<String> {
842    for i in 0..archive.len() {
843        if let Ok(file) = archive.by_index_raw(i) {
844            let name = file.name();
845            if name.ends_with("EGG-INFO/installed-files.txt")
846                || name.ends_with(".egg-info/installed-files.txt")
847            {
848                return Some(name.to_string());
849            }
850        }
851    }
852    None
853}
854
855/// Parses RECORD CSV format from wheel archives (PEP 427).
856/// Format: path,hash,size (3 columns, no header)
857/// Hash format: sha256=urlsafe_base64_hash or empty
858/// Size: bytes as u64 or empty
859pub fn parse_record_csv(content: &str) -> Vec<FileReference> {
860    let mut reader = ReaderBuilder::new()
861        .has_headers(false)
862        .from_reader(content.as_bytes());
863
864    let mut file_references = Vec::new();
865
866    for result in reader.records() {
867        match result {
868            Ok(record) => {
869                if record.len() < 3 {
870                    continue;
871                }
872
873                let path = record.get(0).unwrap_or("").trim().to_string();
874                if path.is_empty() {
875                    continue;
876                }
877
878                let hash_field = record.get(1).unwrap_or("").trim();
879                let size_field = record.get(2).unwrap_or("").trim();
880
881                // Parse hash: format is "algorithm=value"
882                let sha256 = if !hash_field.is_empty() && hash_field.contains('=') {
883                    let parts: Vec<&str> = hash_field.split('=').collect();
884                    if parts.len() == 2 && parts[0] == "sha256" {
885                        // Decode base64 to hex
886                        match URL_SAFE_NO_PAD.decode(parts[1]) {
887                            Ok(decoded) => {
888                                let hex = decoded
889                                    .iter()
890                                    .map(|b| format!("{:02x}", b))
891                                    .collect::<String>();
892                                Some(hex)
893                            }
894                            Err(_) => None,
895                        }
896                    } else {
897                        None
898                    }
899                } else {
900                    None
901                };
902
903                // Parse size
904                let size = if !size_field.is_empty() && size_field != "-" {
905                    size_field.parse::<u64>().ok()
906                } else {
907                    None
908                };
909
910                file_references.push(FileReference {
911                    path,
912                    size,
913                    sha1: None,
914                    md5: None,
915                    sha256,
916                    sha512: None,
917                    extra_data: None,
918                });
919            }
920            Err(e) => {
921                warn!("Failed to parse RECORD CSV row: {}", e);
922                continue;
923            }
924        }
925    }
926
927    file_references
928}
929
930/// Parses installed-files.txt format from egg archives (PEP 376).
931/// Format: one file path per line, no headers, no hash, no size
932pub fn parse_installed_files_txt(content: &str) -> Vec<FileReference> {
933    content
934        .lines()
935        .map(|line| line.trim())
936        .filter(|line| !line.is_empty())
937        .map(|path| FileReference {
938            path: path.to_string(),
939            size: None,
940            sha1: None,
941            md5: None,
942            sha256: None,
943            sha512: None,
944            extra_data: None,
945        })
946        .collect()
947}
948
949pub fn parse_sources_txt(content: &str) -> Vec<FileReference> {
950    content
951        .lines()
952        .map(str::trim)
953        .filter(|line| !line.is_empty())
954        .map(|path| FileReference {
955            path: path.to_string(),
956            size: None,
957            sha1: None,
958            md5: None,
959            sha256: None,
960            sha512: None,
961            extra_data: None,
962        })
963        .collect()
964}
965
/// Components extracted from a wheel file name by `parse_wheel_filename`.
struct WheelInfo {
    /// Distribution name, with `_` replaced by `-`.
    name: String,
    /// Version component of the file name.
    version: String,
    /// Python tag component of the file name.
    python_tag: String,
    /// ABI tag component of the file name.
    abi_tag: String,
    /// Platform tag: the remaining components, re-joined with `-`.
    platform_tag: String,
}
973
974fn parse_wheel_filename(path: &Path) -> Option<WheelInfo> {
975    let stem = path.file_stem()?.to_string_lossy();
976    let parts: Vec<&str> = stem.split('-').collect();
977
978    if parts.len() >= 5 {
979        Some(WheelInfo {
980            name: parts[0].replace('_', "-"),
981            version: parts[1].to_string(),
982            python_tag: parts[2].to_string(),
983            abi_tag: parts[3].to_string(),
984            platform_tag: parts[4..].join("-"),
985        })
986    } else {
987        None
988    }
989}
990
/// Components extracted from an egg file name by `parse_egg_filename`.
struct EggInfo {
    /// Distribution name, with `_` replaced by `-`.
    name: String,
    /// Version component of the file name.
    version: String,
    /// Third `-`-separated component of the file name, when present.
    python_version: Option<String>,
}
996
997fn parse_egg_filename(path: &Path) -> Option<EggInfo> {
998    let stem = path.file_stem()?.to_string_lossy();
999    let parts: Vec<&str> = stem.split('-').collect();
1000
1001    if parts.len() >= 2 {
1002        Some(EggInfo {
1003            name: parts[0].replace('_', "-"),
1004            version: parts[1].to_string(),
1005            python_version: parts.get(2).map(|s| s.to_string()),
1006        })
1007    } else {
1008        None
1009    }
1010}
1011
1012fn build_wheel_purl(
1013    name: Option<&str>,
1014    version: Option<&str>,
1015    wheel_info: &WheelInfo,
1016) -> Option<String> {
1017    let name = name?;
1018    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1019
1020    if let Some(ver) = version {
1021        package_url.with_version(ver).ok()?;
1022    }
1023
1024    let extension = format!(
1025        "{}-{}-{}",
1026        wheel_info.python_tag, wheel_info.abi_tag, wheel_info.platform_tag
1027    );
1028    package_url.add_qualifier("extension", extension).ok()?;
1029
1030    Some(package_url.to_string())
1031}
1032
1033fn build_egg_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
1034    let name = name?;
1035    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1036
1037    if let Some(ver) = version {
1038        package_url.with_version(ver).ok()?;
1039    }
1040
1041    package_url.add_qualifier("type", "egg").ok()?;
1042
1043    Some(package_url.to_string())
1044}
1045
1046fn python_parse_rfc822_content(content: &str, datasource_id: DatasourceId) -> PackageData {
1047    let metadata = super::rfc822::parse_rfc822_content(content);
1048    build_package_data_from_rfc822(&metadata, datasource_id)
1049}
1050
1051/// Builds PackageData from parsed RFC822 metadata.
1052///
1053/// This is the shared implementation for both `extract_from_rfc822_metadata` (file-based)
1054/// and `python_parse_rfc822_content` (content-based) functions.
1055fn build_package_data_from_rfc822(
1056    metadata: &super::rfc822::Rfc822Metadata,
1057    datasource_id: DatasourceId,
1058) -> PackageData {
1059    use super::rfc822::{get_header_all, get_header_first};
1060
1061    let name = get_header_first(&metadata.headers, "name");
1062    let version = get_header_first(&metadata.headers, "version");
1063    let summary = get_header_first(&metadata.headers, "summary");
1064    let mut homepage_url = get_header_first(&metadata.headers, "home-page");
1065    let author = get_header_first(&metadata.headers, "author");
1066    let author_email = get_header_first(&metadata.headers, "author-email");
1067    let license = get_header_first(&metadata.headers, "license");
1068    let download_url = get_header_first(&metadata.headers, "download-url");
1069    let platform = get_header_first(&metadata.headers, "platform");
1070    let requires_python = get_header_first(&metadata.headers, "requires-python");
1071    let classifiers = get_header_all(&metadata.headers, "classifier");
1072    let license_files = get_header_all(&metadata.headers, "license-file");
1073
1074    let description_body = if metadata.body.is_empty() {
1075        get_header_first(&metadata.headers, "description").unwrap_or_default()
1076    } else {
1077        metadata.body.clone()
1078    };
1079
1080    let description = build_description(summary.as_deref(), &description_body);
1081
1082    let mut parties = Vec::new();
1083    if author.is_some() || author_email.is_some() {
1084        parties.push(Party {
1085            r#type: Some("person".to_string()),
1086            role: Some("author".to_string()),
1087            name: author,
1088            email: author_email,
1089            url: None,
1090            organization: None,
1091            organization_url: None,
1092            timezone: None,
1093        });
1094    }
1095
1096    let (keywords, license_classifiers) = split_classifiers(&classifiers);
1097    // Extract license statement only - detection happens in separate engine
1098    let license_detections = Vec::new();
1099    let declared_license_expression = None;
1100    let declared_license_expression_spdx = None;
1101
1102    let extracted_license_statement =
1103        build_extracted_license_statement(license.as_deref(), &license_classifiers);
1104
1105    let mut extra_data = HashMap::new();
1106    if let Some(platform_value) = platform
1107        && !platform_value.eq_ignore_ascii_case("unknown")
1108        && !platform_value.is_empty()
1109    {
1110        extra_data.insert(
1111            "platform".to_string(),
1112            serde_json::Value::String(platform_value),
1113        );
1114    }
1115
1116    if let Some(requires_python_value) = requires_python
1117        && !requires_python_value.is_empty()
1118    {
1119        extra_data.insert(
1120            "requires_python".to_string(),
1121            serde_json::Value::String(requires_python_value),
1122        );
1123    }
1124
1125    if !license_files.is_empty() {
1126        extra_data.insert(
1127            "license_files".to_string(),
1128            serde_json::Value::Array(
1129                license_files
1130                    .iter()
1131                    .cloned()
1132                    .map(serde_json::Value::String)
1133                    .collect(),
1134            ),
1135        );
1136    }
1137
1138    let file_references = license_files
1139        .iter()
1140        .map(|path| FileReference {
1141            path: path.clone(),
1142            size: None,
1143            sha1: None,
1144            md5: None,
1145            sha256: None,
1146            sha512: None,
1147            extra_data: None,
1148        })
1149        .collect();
1150
1151    let project_urls = get_header_all(&metadata.headers, "project-url");
1152    let dependencies = extract_rfc822_dependencies(&metadata.headers);
1153    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
1154
1155    if !project_urls.is_empty() {
1156        let parsed_urls = parse_project_urls(&project_urls);
1157
1158        for (label, url) in &parsed_urls {
1159            let label_lower = label.to_lowercase();
1160
1161            if bug_tracking_url.is_none()
1162                && matches!(
1163                    label_lower.as_str(),
1164                    "tracker"
1165                        | "bug reports"
1166                        | "bug tracker"
1167                        | "issues"
1168                        | "issue tracker"
1169                        | "github: issues"
1170                )
1171            {
1172                bug_tracking_url = Some(url.clone());
1173            } else if code_view_url.is_none()
1174                && matches!(label_lower.as_str(), "source" | "source code" | "code")
1175            {
1176                code_view_url = Some(url.clone());
1177            } else if vcs_url.is_none()
1178                && matches!(
1179                    label_lower.as_str(),
1180                    "github" | "gitlab" | "github: repo" | "repository"
1181                )
1182            {
1183                vcs_url = Some(url.clone());
1184            } else if homepage_url.is_none()
1185                && matches!(label_lower.as_str(), "website" | "homepage" | "home")
1186            {
1187                homepage_url = Some(url.clone());
1188            } else if label_lower == "changelog" {
1189                extra_data.insert(
1190                    "changelog_url".to_string(),
1191                    serde_json::Value::String(url.clone()),
1192                );
1193            }
1194        }
1195
1196        let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
1197            .iter()
1198            .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
1199            .collect();
1200
1201        if !project_urls_json.is_empty() {
1202            extra_data.insert(
1203                "project_urls".to_string(),
1204                serde_json::Value::Object(project_urls_json),
1205            );
1206        }
1207    }
1208
1209    let extra_data = if extra_data.is_empty() {
1210        None
1211    } else {
1212        Some(extra_data)
1213    };
1214
1215    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
1216        build_pypi_urls(name.as_deref(), version.as_deref());
1217
1218    PackageData {
1219        package_type: Some(PythonParser::PACKAGE_TYPE),
1220        namespace: None,
1221        name,
1222        version,
1223        qualifiers: None,
1224        subpath: None,
1225        primary_language: Some("Python".to_string()),
1226        description,
1227        release_date: None,
1228        parties,
1229        keywords,
1230        homepage_url,
1231        download_url,
1232        size: None,
1233        sha1: None,
1234        md5: None,
1235        sha256: None,
1236        sha512: None,
1237        bug_tracking_url,
1238        code_view_url,
1239        vcs_url,
1240        copyright: None,
1241        holder: None,
1242        declared_license_expression,
1243        declared_license_expression_spdx,
1244        license_detections,
1245        other_license_expression: None,
1246        other_license_expression_spdx: None,
1247        other_license_detections: Vec::new(),
1248        extracted_license_statement,
1249        notice_text: None,
1250        source_packages: Vec::new(),
1251        file_references,
1252        is_private: false,
1253        is_virtual: false,
1254        extra_data,
1255        dependencies,
1256        repository_homepage_url,
1257        repository_download_url,
1258        api_data_url,
1259        datasource_id: Some(datasource_id),
1260        purl,
1261    }
1262}
1263
/// Splits `Project-URL` header values into `(label, url)` pairs.
///
/// Each value is a label and a URL separated by a comma. The split happens at the
/// first comma, and whitespace around either half is trimmed, so both
/// `Label, https://...` and `Label,https://...` are accepted. Entries without a
/// comma, or with an empty label or URL, are dropped.
fn parse_project_urls(project_urls: &[String]) -> Vec<(String, String)> {
    project_urls
        .iter()
        .filter_map(|entry| {
            let (label, url) = entry.split_once(',')?;
            let (label, url) = (label.trim(), url.trim());
            (!label.is_empty() && !url.is_empty()).then(|| (label.to_string(), url.to_string()))
        })
        .collect()
}
1279
/// Combines an optional summary and a body into one description.
///
/// Both parts are trimmed; blank parts are ignored. When both are present they are
/// joined with a newline, summary first. Returns `None` when neither has content.
fn build_description(summary: Option<&str>, body: &str) -> Option<String> {
    let headline = summary.map(str::trim).filter(|text| !text.is_empty());
    let details = Some(body.trim()).filter(|text| !text.is_empty());

    match (headline, details) {
        (Some(headline), Some(details)) => Some(format!("{headline}\n{details}")),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
1298
/// Separates trove classifiers into `(keywords, license_classifiers)`.
///
/// Classifiers starting with `License ::` go to the second list; everything else
/// goes to the first. Relative order within each list is preserved.
fn split_classifiers(classifiers: &[String]) -> (Vec<String>, Vec<String>) {
    let (license_classifiers, keywords): (Vec<String>, Vec<String>) = classifiers
        .iter()
        .cloned()
        .partition(|classifier| classifier.starts_with("License ::"));

    (keywords, license_classifiers)
}
1313
/// Renders the declared license and license classifiers as a YAML-like statement.
///
/// Output shape (every line, including the last, ends with a newline):
/// ```text
/// license: <trimmed license>
/// classifiers:
///   - '<classifier>'
/// ```
/// The `license:` line is omitted when the license is absent or blank; the
/// `classifiers:` section is omitted when there are none. Returns `None` when both
/// are omitted.
fn build_extracted_license_statement(
    license: Option<&str>,
    license_classifiers: &[String],
) -> Option<String> {
    let mut statement = String::new();

    if let Some(text) = license.map(str::trim).filter(|text| !text.is_empty()) {
        statement.push_str("license: ");
        statement.push_str(text);
        statement.push('\n');
    }

    if !license_classifiers.is_empty() {
        statement.push_str("classifiers:\n");
        for classifier in license_classifiers {
            statement.push_str("  - '");
            statement.push_str(classifier);
            statement.push_str("'\n");
        }
    }

    (!statement.is_empty()).then_some(statement)
}
1339
1340pub(crate) fn build_pypi_urls(
1341    name: Option<&str>,
1342    version: Option<&str>,
1343) -> (
1344    Option<String>,
1345    Option<String>,
1346    Option<String>,
1347    Option<String>,
1348) {
1349    let repository_homepage_url = name.map(|value| format!("https://pypi.org/project/{}", value));
1350
1351    let repository_download_url = name.and_then(|value| {
1352        version.map(|ver| {
1353            format!(
1354                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
1355                &value[..1.min(value.len())],
1356                value,
1357                value,
1358                ver
1359            )
1360        })
1361    });
1362
1363    let api_data_url = name.map(|value| {
1364        if let Some(ver) = version {
1365            format!("https://pypi.org/pypi/{}/{}/json", value, ver)
1366        } else {
1367            format!("https://pypi.org/pypi/{}/json", value)
1368        }
1369    });
1370
1371    let purl = name.and_then(|value| {
1372        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), value).ok()?;
1373        if let Some(ver) = version {
1374            package_url.with_version(ver).ok()?;
1375        }
1376        Some(package_url.to_string())
1377    });
1378
1379    (
1380        repository_homepage_url,
1381        repository_download_url,
1382        api_data_url,
1383        purl,
1384    )
1385}
1386
1387fn build_pypi_purl_with_extension(
1388    name: &str,
1389    version: Option<&str>,
1390    extension: &str,
1391) -> Option<String> {
1392    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1393    if let Some(ver) = version {
1394        package_url.with_version(ver).ok()?;
1395    }
1396    package_url.add_qualifier("extension", extension).ok()?;
1397    Some(package_url.to_string())
1398}
1399
1400fn extract_from_pyproject_toml(path: &Path) -> PackageData {
1401    let toml_content = match read_toml_file(path) {
1402        Ok(content) => content,
1403        Err(e) => {
1404            warn!(
1405                "Failed to read or parse pyproject.toml at {:?}: {}",
1406                path, e
1407            );
1408            return default_package_data();
1409        }
1410    };
1411
1412    let tool_table = toml_content.get("tool").and_then(|v| v.as_table());
1413
1414    // Handle both PEP 621 (project table) and poetry formats
1415    let project_table =
1416        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
1417            // Standard PEP 621 format with [project] table
1418            project.clone()
1419        } else if let Some(tool) = tool_table {
1420            if let Some(poetry) = tool.get("poetry").and_then(|v| v.as_table()) {
1421                // Poetry format with [tool.poetry] table
1422                poetry.clone()
1423            } else {
1424                warn!(
1425                    "No project or tool.poetry data found in pyproject.toml at {:?}",
1426                    path
1427                );
1428                return default_package_data();
1429            }
1430        } else if toml_content.get(FIELD_NAME).is_some() {
1431            // Other format with top-level fields
1432            match toml_content.as_table() {
1433                Some(table) => table.clone(),
1434                None => {
1435                    warn!("Failed to convert TOML content to table in {:?}", path);
1436                    return default_package_data();
1437                }
1438            }
1439        } else {
1440            warn!("No project data found in pyproject.toml at {:?}", path);
1441            return default_package_data();
1442        };
1443
1444    let name = project_table
1445        .get(FIELD_NAME)
1446        .and_then(|v| v.as_str())
1447        .map(String::from);
1448
1449    let version = project_table
1450        .get(FIELD_VERSION)
1451        .and_then(|v| v.as_str())
1452        .map(String::from);
1453    let classifiers = project_table
1454        .get("classifiers")
1455        .and_then(|value| value.as_array())
1456        .map(|values| {
1457            values
1458                .iter()
1459                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
1460                .collect::<Vec<_>>()
1461        })
1462        .unwrap_or_default();
1463
1464    // Extract license statement only - detection happens in separate engine
1465    let license_detections = Vec::new();
1466    let extracted_license_statement = extract_raw_license_string(&project_table);
1467    let declared_license_expression = None;
1468    let declared_license_expression_spdx = None;
1469
1470    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
1471    let (homepage_url, repository_url) = extract_urls(&project_table);
1472
1473    let (dependencies, optional_dependencies) = extract_dependencies(&project_table, &toml_content);
1474    let extra_data = extract_pyproject_extra_data(&toml_content);
1475
1476    // Create package URL
1477    let purl = name.as_ref().and_then(|n| {
1478        let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n) {
1479            Ok(p) => p,
1480            Err(e) => {
1481                warn!(
1482                    "Failed to create PackageUrl for Python package '{}': {}",
1483                    n, e
1484                );
1485                return None;
1486            }
1487        };
1488
1489        if let Some(v) = &version
1490            && let Err(e) = package_url.with_version(v)
1491        {
1492            warn!(
1493                "Failed to set version '{}' for Python package '{}': {}",
1494                v, n, e
1495            );
1496            return None;
1497        }
1498
1499        Some(package_url.to_string())
1500    });
1501
1502    let api_data_url = name.as_ref().map(|n| {
1503        if let Some(v) = &version {
1504            format!("https://pypi.org/pypi/{}/{}/json", n, v)
1505        } else {
1506            format!("https://pypi.org/pypi/{}/json", n)
1507        }
1508    });
1509
1510    let pypi_homepage_url = name
1511        .as_ref()
1512        .map(|n| format!("https://pypi.org/project/{}", n));
1513
1514    let pypi_download_url = name.as_ref().and_then(|n| {
1515        version.as_ref().map(|v| {
1516            format!(
1517                "https://pypi.org/packages/source/{}/{}/{}-{}.tar.gz",
1518                &n[..1.min(n.len())],
1519                n,
1520                n,
1521                v
1522            )
1523        })
1524    });
1525
1526    PackageData {
1527        package_type: Some(PythonParser::PACKAGE_TYPE),
1528        namespace: None,
1529        name,
1530        version,
1531        qualifiers: None,
1532        subpath: None,
1533        primary_language: None,
1534        description: None,
1535        release_date: None,
1536        parties: extract_parties(&project_table),
1537        keywords: Vec::new(),
1538        homepage_url: homepage_url.or(pypi_homepage_url),
1539        download_url: repository_url.clone().or(pypi_download_url),
1540        size: None,
1541        sha1: None,
1542        md5: None,
1543        sha256: None,
1544        sha512: None,
1545        bug_tracking_url: None,
1546        code_view_url: None,
1547        vcs_url: repository_url,
1548        copyright: None,
1549        holder: None,
1550        declared_license_expression,
1551        declared_license_expression_spdx,
1552        license_detections,
1553        other_license_expression: None,
1554        other_license_expression_spdx: None,
1555        other_license_detections: Vec::new(),
1556        extracted_license_statement,
1557        notice_text: None,
1558        source_packages: Vec::new(),
1559        file_references: Vec::new(),
1560        is_private: has_private_classifier(&classifiers),
1561        is_virtual: false,
1562        extra_data,
1563        dependencies: [dependencies, optional_dependencies].concat(),
1564        repository_homepage_url: None,
1565        repository_download_url: None,
1566        api_data_url,
1567        datasource_id: Some(DatasourceId::PypiPyprojectToml),
1568        purl,
1569    }
1570}
1571
1572fn extract_raw_license_string(project: &TomlMap<String, TomlValue>) -> Option<String> {
1573    project
1574        .get(FIELD_LICENSE)
1575        .and_then(|license_value| match license_value {
1576            TomlValue::String(license_str) => Some(license_str.clone()),
1577            TomlValue::Table(license_table) => license_table
1578                .get("text")
1579                .and_then(|v| v.as_str())
1580                .map(|s| s.to_string())
1581                .or_else(|| {
1582                    license_table
1583                        .get("expression")
1584                        .and_then(|v| v.as_str())
1585                        .map(|expr| expr.to_string())
1586                }),
1587            _ => None,
1588        })
1589}
1590
1591fn extract_urls(project: &TomlMap<String, TomlValue>) -> (Option<String>, Option<String>) {
1592    let mut homepage_url = None;
1593    let mut repository_url = None;
1594
1595    // Check for URLs table
1596    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
1597        homepage_url = urls
1598            .get(FIELD_HOMEPAGE)
1599            .and_then(|v| v.as_str())
1600            .map(String::from);
1601        repository_url = urls
1602            .get(FIELD_REPOSITORY)
1603            .and_then(|v| v.as_str())
1604            .map(String::from);
1605    }
1606
1607    // If not found in URLs table, check for top-level keys
1608    if homepage_url.is_none() {
1609        homepage_url = project
1610            .get(FIELD_HOMEPAGE)
1611            .and_then(|v| v.as_str())
1612            .map(String::from);
1613    }
1614
1615    if repository_url.is_none() {
1616        repository_url = project
1617            .get(FIELD_REPOSITORY)
1618            .and_then(|v| v.as_str())
1619            .map(String::from);
1620    }
1621
1622    (homepage_url, repository_url)
1623}
1624
1625fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
1626    let mut parties = Vec::new();
1627
1628    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
1629        for author in authors {
1630            if let Some(author_str) = author.as_str() {
1631                let (name, email) = split_name_email(author_str);
1632                parties.push(Party {
1633                    r#type: None,
1634                    role: Some("author".to_string()),
1635                    name,
1636                    email,
1637                    url: None,
1638                    organization: None,
1639                    organization_url: None,
1640                    timezone: None,
1641                });
1642            }
1643        }
1644    }
1645
1646    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
1647        for maintainer in maintainers {
1648            if let Some(maintainer_str) = maintainer.as_str() {
1649                let (name, email) = split_name_email(maintainer_str);
1650                parties.push(Party {
1651                    r#type: None,
1652                    role: Some("maintainer".to_string()),
1653                    name,
1654                    email,
1655                    url: None,
1656                    organization: None,
1657                    organization_url: None,
1658                    timezone: None,
1659                });
1660            }
1661        }
1662    }
1663
1664    parties
1665}
1666
1667fn extract_dependencies(
1668    project: &TomlMap<String, TomlValue>,
1669    toml_content: &TomlValue,
1670) -> (Vec<Dependency>, Vec<Dependency>) {
1671    let mut dependencies = Vec::new();
1672    let mut optional_dependencies = Vec::new();
1673
1674    // Handle dependencies - can be array or table format
1675    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
1676        match deps_value {
1677            TomlValue::Array(arr) => {
1678                dependencies = parse_dependency_array(arr, false, None);
1679            }
1680            TomlValue::Table(table) => {
1681                dependencies = parse_dependency_table(table, false, None);
1682            }
1683            _ => {}
1684        }
1685    }
1686
1687    // Handle PEP 621 optional-dependencies with scope
1688    if let Some(opt_deps_table) = project
1689        .get(FIELD_OPTIONAL_DEPENDENCIES)
1690        .and_then(|v| v.as_table())
1691    {
1692        for (extra_name, deps) in opt_deps_table {
1693            match deps {
1694                TomlValue::Array(arr) => {
1695                    optional_dependencies.extend(parse_dependency_array(
1696                        arr,
1697                        true,
1698                        Some(extra_name),
1699                    ));
1700                }
1701                TomlValue::Table(table) => {
1702                    optional_dependencies.extend(parse_dependency_table(
1703                        table,
1704                        true,
1705                        Some(extra_name),
1706                    ));
1707                }
1708                _ => {}
1709            }
1710        }
1711    }
1712
1713    // Handle Poetry dev-dependencies
1714    if let Some(dev_deps_value) = project.get(FIELD_DEV_DEPENDENCIES) {
1715        match dev_deps_value {
1716            TomlValue::Array(arr) => {
1717                optional_dependencies.extend(parse_dependency_array(
1718                    arr,
1719                    true,
1720                    Some(FIELD_DEV_DEPENDENCIES),
1721                ));
1722            }
1723            TomlValue::Table(table) => {
1724                optional_dependencies.extend(parse_dependency_table(
1725                    table,
1726                    true,
1727                    Some(FIELD_DEV_DEPENDENCIES),
1728                ));
1729            }
1730            _ => {}
1731        }
1732    }
1733
1734    // Handle Poetry dependency groups: [tool.poetry.group.<name>]
1735    if let Some(groups_table) = project.get("group").and_then(|v| v.as_table()) {
1736        for (group_name, group_data) in groups_table {
1737            if let Some(group_deps) = group_data.as_table().and_then(|t| t.get("dependencies")) {
1738                match group_deps {
1739                    TomlValue::Array(arr) => {
1740                        optional_dependencies.extend(parse_dependency_array(
1741                            arr,
1742                            true,
1743                            Some(group_name),
1744                        ));
1745                    }
1746                    TomlValue::Table(table) => {
1747                        optional_dependencies.extend(parse_dependency_table(
1748                            table,
1749                            true,
1750                            Some(group_name),
1751                        ));
1752                    }
1753                    _ => {}
1754                }
1755            }
1756        }
1757    }
1758
1759    if let Some(groups_table) = toml_content
1760        .get(FIELD_DEPENDENCY_GROUPS)
1761        .and_then(|value| value.as_table())
1762    {
1763        for (group_name, deps) in groups_table {
1764            match deps {
1765                TomlValue::Array(arr) => {
1766                    optional_dependencies.extend(parse_dependency_array(
1767                        arr,
1768                        true,
1769                        Some(group_name),
1770                    ));
1771                }
1772                TomlValue::Table(table) => {
1773                    optional_dependencies.extend(parse_dependency_table(
1774                        table,
1775                        true,
1776                        Some(group_name),
1777                    ));
1778                }
1779                _ => {}
1780            }
1781        }
1782    }
1783
1784    if let Some(dev_deps_value) = toml_content
1785        .get("tool")
1786        .and_then(|value| value.as_table())
1787        .and_then(|tool| tool.get("uv"))
1788        .and_then(|value| value.as_table())
1789        .and_then(|uv| uv.get(FIELD_DEV_DEPENDENCIES))
1790    {
1791        match dev_deps_value {
1792            TomlValue::Array(arr) => {
1793                optional_dependencies.extend(parse_dependency_array(arr, true, Some("dev")));
1794            }
1795            TomlValue::Table(table) => {
1796                optional_dependencies.extend(parse_dependency_table(table, true, Some("dev")));
1797            }
1798            _ => {}
1799        }
1800    }
1801
1802    (dependencies, optional_dependencies)
1803}
1804
1805fn extract_pyproject_extra_data(toml_content: &TomlValue) -> Option<HashMap<String, JsonValue>> {
1806    let mut extra_data = HashMap::new();
1807
1808    if let Some(tool_uv) = toml_content
1809        .get("tool")
1810        .and_then(|value| value.as_table())
1811        .and_then(|tool| tool.get("uv"))
1812    {
1813        extra_data.insert("tool_uv".to_string(), toml_value_to_json(tool_uv));
1814    }
1815
1816    if extra_data.is_empty() {
1817        None
1818    } else {
1819        Some(extra_data)
1820    }
1821}
1822
1823fn toml_value_to_json(value: &TomlValue) -> JsonValue {
1824    match value {
1825        TomlValue::String(value) => JsonValue::String(value.clone()),
1826        TomlValue::Integer(value) => JsonValue::String(value.to_string()),
1827        TomlValue::Float(value) => JsonValue::String(value.to_string()),
1828        TomlValue::Boolean(value) => JsonValue::Bool(*value),
1829        TomlValue::Datetime(value) => JsonValue::String(value.to_string()),
1830        TomlValue::Array(values) => {
1831            JsonValue::Array(values.iter().map(toml_value_to_json).collect())
1832        }
1833        TomlValue::Table(values) => JsonValue::Object(
1834            values
1835                .iter()
1836                .map(|(key, value)| (key.clone(), toml_value_to_json(value)))
1837                .collect::<JsonMap<String, JsonValue>>(),
1838        ),
1839    }
1840}
1841
1842fn parse_dependency_table(
1843    table: &TomlMap<String, TomlValue>,
1844    is_optional: bool,
1845    scope: Option<&str>,
1846) -> Vec<Dependency> {
1847    table
1848        .iter()
1849        .filter_map(|(name, version)| {
1850            let version_str = version.as_str().map(|s| s.to_string());
1851            let mut package_url =
1852                PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
1853
1854            if let Some(v) = &version_str {
1855                package_url.with_version(v).ok()?;
1856            }
1857
1858            Some(Dependency {
1859                purl: Some(package_url.to_string()),
1860                extracted_requirement: None,
1861                scope: scope.map(|s| s.to_string()),
1862                is_runtime: Some(!is_optional),
1863                is_optional: Some(is_optional),
1864                is_pinned: None,
1865                is_direct: Some(true),
1866                resolved_package: None,
1867                extra_data: None,
1868            })
1869        })
1870        .collect()
1871}
1872
1873fn parse_dependency_array(
1874    array: &[TomlValue],
1875    is_optional: bool,
1876    scope: Option<&str>,
1877) -> Vec<Dependency> {
1878    array
1879        .iter()
1880        .filter_map(|dep| {
1881            let dep_str = dep.as_str()?;
1882
1883            let mut parts = dep_str.split(['>', '=', '<', '~']);
1884            let name = parts.next()?.trim().to_string();
1885
1886            let version = parts.next().map(|v| v.trim().to_string());
1887
1888            let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name)
1889            {
1890                Ok(purl) => purl,
1891                Err(_) => return None,
1892            };
1893
1894            if let Some(ref v) = version {
1895                package_url.with_version(v).ok()?;
1896            }
1897
1898            Some(Dependency {
1899                purl: Some(package_url.to_string()),
1900                extracted_requirement: None,
1901                scope: scope.map(|s| s.to_string()),
1902                is_runtime: Some(!is_optional),
1903                is_optional: Some(is_optional),
1904                is_pinned: None,
1905                is_direct: Some(true),
1906                resolved_package: None,
1907                extra_data: None,
1908            })
1909        })
1910        .collect()
1911}
1912
/// Literal value produced by statically evaluating a setup.py expression.
#[derive(Debug, Clone)]
enum Value {
    /// A string constant.
    String(String),
    /// A numeric constant; integer and float constants are both stored as `f64`.
    Number(f64),
    /// A boolean constant.
    Bool(bool),
    /// The Python `None` constant.
    None,
    /// A list literal with every element evaluated.
    List(Vec<Value>),
    /// A tuple literal with every element evaluated.
    Tuple(Vec<Value>),
    /// A dict whose keys have been converted to strings.
    Dict(HashMap<String, Value>),
}
1923
/// Evaluates literal Python expressions without executing code.
struct LiteralEvaluator {
    /// Known name-to-value bindings used to resolve `Name` expressions.
    constants: HashMap<String, Value>,
    /// Recursion depth at which evaluation gives up.
    max_depth: usize,
    /// Total number of expression nodes this evaluator may visit.
    max_nodes: usize,
    /// Running count of expression nodes visited so far (never reset).
    nodes_visited: usize,
}
1930
impl LiteralEvaluator {
    /// Creates an evaluator seeded with `constants`, using the
    /// `MAX_SETUP_PY_AST_DEPTH` / `MAX_SETUP_PY_AST_NODES` limits.
    fn new(constants: HashMap<String, Value>) -> Self {
        Self {
            constants,
            max_depth: MAX_SETUP_PY_AST_DEPTH,
            max_nodes: MAX_SETUP_PY_AST_NODES,
            nodes_visited: 0,
        }
    }

    /// Registers (or overwrites) a named constant for later `Name` lookups.
    fn insert_constant(&mut self, name: String, value: Value) {
        self.constants.insert(name, value);
    }

    /// Evaluates `expr` to a literal [`Value`], or `None` if it is not a
    /// supported literal form or a limit has been reached.
    ///
    /// Supported forms: constants, names present in `constants`, list/tuple/dict
    /// literals, `OrderedDict(...)` / `collections.OrderedDict(...)` calls without
    /// keywords, and `dict(key=value, ...)` calls without positional arguments.
    /// A container evaluates to `None` as soon as any element fails to evaluate.
    fn evaluate_expr(&mut self, expr: &ast::Expr, depth: usize) -> Option<Value> {
        // Budget check happens before the node is counted.
        if depth >= self.max_depth || self.nodes_visited >= self.max_nodes {
            return None;
        }
        self.nodes_visited += 1;

        match expr {
            ast::Expr::Constant(ast::ExprConstant { value, .. }) => self.evaluate_constant(value),
            ast::Expr::Name(ast::ExprName { id, .. }) => self.constants.get(id.as_str()).cloned(),
            ast::Expr::List(ast::ExprList { elts, .. }) => {
                let mut values = Vec::new();
                for elt in elts {
                    values.push(self.evaluate_expr(elt, depth + 1)?);
                }
                Some(Value::List(values))
            }
            ast::Expr::Tuple(ast::ExprTuple { elts, .. }) => {
                let mut values = Vec::new();
                for elt in elts {
                    values.push(self.evaluate_expr(elt, depth + 1)?);
                }
                Some(Value::Tuple(values))
            }
            ast::Expr::Dict(ast::ExprDict { keys, values, .. }) => {
                let mut dict = HashMap::new();
                for (key_expr, value_expr) in keys.iter().zip(values.iter()) {
                    // An entry without a key expression makes the whole dict unevaluable.
                    let key_expr = key_expr.as_ref()?;
                    let key_value = self.evaluate_expr(key_expr, depth + 1)?;
                    let key = value_to_string(&key_value)?;
                    let value = self.evaluate_expr(value_expr, depth + 1)?;
                    dict.insert(key, value);
                }
                Some(Value::Dict(dict))
            }
            ast::Expr::Call(ast::ExprCall {
                func,
                args,
                keywords,
                ..
            }) => {
                // OrderedDict(...) is only handled when called without keywords.
                if keywords.is_empty()
                    && let Some(name) = dotted_name(func.as_ref(), depth + 1)
                    && matches!(name.as_str(), "OrderedDict" | "collections.OrderedDict")
                {
                    return self.evaluate_ordered_dict(args, depth + 1);
                }

                // Any other call with positional arguments is not a supported literal.
                if !args.is_empty() {
                    return None;
                }

                // dict(key=value, ...) with keyword arguments only.
                if let ast::Expr::Name(ast::ExprName { id, .. }) = func.as_ref()
                    && id == "dict"
                {
                    let mut dict = HashMap::new();
                    for keyword in keywords {
                        // A keyword without a name aborts the evaluation.
                        let key = keyword.arg.as_ref().map(|name| name.as_str())?;
                        let value = self.evaluate_expr(&keyword.value, depth + 1)?;
                        dict.insert(key.to_string(), value);
                    }
                    return Some(Value::Dict(dict));
                }

                None
            }
            _ => None,
        }
    }

    /// Converts a constant node to a [`Value`]; integers are converted to `f64`
    /// through their decimal string form. Other constant kinds yield `None`.
    fn evaluate_constant(&self, constant: &ast::Constant) -> Option<Value> {
        match constant {
            ast::Constant::Str(value) => Some(Value::String(value.clone())),
            ast::Constant::Bool(value) => Some(Value::Bool(*value)),
            ast::Constant::Int(value) => value.to_string().parse::<f64>().ok().map(Value::Number),
            ast::Constant::Float(value) => Some(Value::Number(*value)),
            ast::Constant::None => Some(Value::None),
            _ => None,
        }
    }

    /// Evaluates the single positional argument of an `OrderedDict(...)` call.
    ///
    /// The argument must evaluate to a list or tuple of 2-element tuples whose
    /// first element converts to a string; anything else yields `None`.
    fn evaluate_ordered_dict(&mut self, args: &[ast::Expr], depth: usize) -> Option<Value> {
        if args.len() != 1 {
            return None;
        }

        let items = match self.evaluate_expr(&args[0], depth)? {
            Value::List(items) | Value::Tuple(items) => items,
            _ => return None,
        };

        let mut dict = HashMap::new();
        for item in items {
            let Value::Tuple(values) = item else {
                return None;
            };
            if values.len() != 2 {
                return None;
            }
            let key = value_to_string(&values[0])?;
            dict.insert(key, values[1].clone());
        }

        Some(Value::Dict(dict))
    }
}
2050
/// Names under which `setup` and its providing modules are reachable in a setup.py.
#[derive(Default)]
struct SetupAliases {
    /// Bare names that refer to the `setup` function (e.g. `setup` or an `as` alias).
    setup_names: HashSet<String>,
    /// Maps a locally bound module name to the real setup module it was imported from.
    module_aliases: HashMap<String, String>,
}
2056
2057fn extract_from_setup_py(path: &Path) -> PackageData {
2058    let content = match read_file_to_string(path) {
2059        Ok(content) => content,
2060        Err(e) => {
2061            warn!("Failed to read setup.py at {:?}: {}", path, e);
2062            return default_package_data();
2063        }
2064    };
2065
2066    if content.len() > MAX_SETUP_PY_BYTES {
2067        warn!("setup.py too large at {:?}: {} bytes", path, content.len());
2068        return extract_from_setup_py_regex(&content);
2069    }
2070
2071    let mut package_data = match extract_from_setup_py_ast(&content) {
2072        Ok(Some(data)) => data,
2073        Ok(None) => extract_from_setup_py_regex(&content),
2074        Err(e) => {
2075            warn!("Failed to parse setup.py AST at {:?}: {}", path, e);
2076            extract_from_setup_py_regex(&content)
2077        }
2078    };
2079
2080    if package_data.name.is_none() {
2081        package_data.name = extract_setup_value(&content, "name");
2082    }
2083
2084    if package_data.version.is_none() {
2085        package_data.version = extract_setup_value(&content, "version");
2086    }
2087
2088    fill_from_sibling_dunder_metadata(path, &content, &mut package_data);
2089
2090    if package_data.purl.is_none() {
2091        package_data.purl = build_setup_py_purl(
2092            package_data.name.as_deref(),
2093            package_data.version.as_deref(),
2094        );
2095    }
2096
2097    package_data
2098}
2099
2100fn fill_from_sibling_dunder_metadata(path: &Path, content: &str, package_data: &mut PackageData) {
2101    if package_data.version.is_some()
2102        && package_data.extracted_license_statement.is_some()
2103        && package_data
2104            .parties
2105            .iter()
2106            .any(|party| party.role.as_deref() == Some("author") && party.name.is_some())
2107    {
2108        return;
2109    }
2110
2111    let Some(root) = path.parent() else {
2112        return;
2113    };
2114
2115    let dunder_metadata = collect_sibling_dunder_metadata(root, content);
2116
2117    if package_data.version.is_none() {
2118        package_data.version = dunder_metadata.version;
2119    }
2120
2121    if package_data.extracted_license_statement.is_none() {
2122        package_data.extracted_license_statement = dunder_metadata.license;
2123    }
2124
2125    let has_author = package_data
2126        .parties
2127        .iter()
2128        .any(|party| party.role.as_deref() == Some("author") && party.name.is_some());
2129
2130    if !has_author && let Some(author) = dunder_metadata.author {
2131        package_data.parties.push(Party {
2132            r#type: Some("person".to_string()),
2133            role: Some("author".to_string()),
2134            name: Some(author),
2135            email: None,
2136            url: None,
2137            organization: None,
2138            organization_url: None,
2139            timezone: None,
2140        });
2141    }
2142}
2143
/// Metadata read from `__dunder__` string assignments in imported sibling modules.
#[derive(Default)]
struct DunderMetadata {
    /// Value of the first `__version__ = "..."` assignment found.
    version: Option<String>,
    /// Value of the first `__author__ = "..."` assignment found.
    author: Option<String>,
    /// Value of the first `__license__ = "..."` assignment found.
    license: Option<String>,
}
2150
2151fn collect_sibling_dunder_metadata(root: &Path, content: &str) -> DunderMetadata {
2152    let statements = match ast::Suite::parse(content, "<setup.py>") {
2153        Ok(statements) => statements,
2154        Err(_) => return DunderMetadata::default(),
2155    };
2156
2157    let version_re = Regex::new(r#"(?m)^\s*__version__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2158    let author_re = Regex::new(r#"(?m)^\s*__author__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2159    let license_re = Regex::new(r#"(?m)^\s*__license__\s*=\s*['\"]([^'\"]+)['\"]"#).ok();
2160    let mut metadata = DunderMetadata::default();
2161
2162    for module in imported_dunder_modules(&statements) {
2163        let Some(path) = resolve_imported_module_path(root, &module) else {
2164            continue;
2165        };
2166        let Ok(module_content) = read_file_to_string(&path) else {
2167            continue;
2168        };
2169
2170        if metadata.version.is_none() {
2171            metadata.version = version_re
2172                .as_ref()
2173                .and_then(|regex| regex.captures(&module_content))
2174                .and_then(|captures| captures.get(1))
2175                .map(|match_| match_.as_str().to_string());
2176        }
2177
2178        if metadata.author.is_none() {
2179            metadata.author = author_re
2180                .as_ref()
2181                .and_then(|regex| regex.captures(&module_content))
2182                .and_then(|captures| captures.get(1))
2183                .map(|match_| match_.as_str().to_string());
2184        }
2185
2186        if metadata.license.is_none() {
2187            metadata.license = license_re
2188                .as_ref()
2189                .and_then(|regex| regex.captures(&module_content))
2190                .and_then(|captures| captures.get(1))
2191                .map(|match_| match_.as_str().to_string());
2192        }
2193
2194        if metadata.version.is_some() && metadata.author.is_some() && metadata.license.is_some() {
2195            return metadata;
2196        }
2197    }
2198
2199    metadata
2200}
2201
2202fn imported_dunder_modules(statements: &[ast::Stmt]) -> Vec<String> {
2203    let mut modules = Vec::new();
2204
2205    for statement in statements {
2206        let ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) = statement else {
2207            continue;
2208        };
2209        let Some(module) = module.as_ref().map(|name| name.as_str()) else {
2210            continue;
2211        };
2212        let imports_dunder = names.iter().any(|alias| {
2213            matches!(
2214                alias.name.as_str(),
2215                "__version__" | "__author__" | "__license__"
2216            )
2217        });
2218        if imports_dunder {
2219            modules.push(module.to_string());
2220        }
2221    }
2222
2223    modules
2224}
2225
/// Locates the source file for a dotted module name relative to `root`.
///
/// Candidates are checked in this order and the first existing one is returned:
/// `<root>/<mod>.py`, `<root>/<mod>/__init__.py`, `<root>/src/<mod>.py`,
/// `<root>/src/<mod>/__init__.py`, where `<mod>` is the module name with dots
/// turned into path separators. Returns `None` when none exists.
fn resolve_imported_module_path(root: &Path, module: &str) -> Option<PathBuf> {
    let module_rel: PathBuf = module.split('.').collect();
    let module_file = module_rel.with_extension("py");
    let package_init = module_rel.join("__init__.py");
    let src_root = root.join("src");

    [
        root.join(&module_file),
        root.join(&package_init),
        src_root.join(&module_file),
        src_root.join(&package_init),
    ]
    .into_iter()
    .find(|candidate| candidate.exists())
}
2237
2238/// Extracts package metadata from setup.py using AST parsing (NO CODE EXECUTION).
2239///
2240/// # Security Model
2241///
2242/// This function parses setup.py as a Python AST and evaluates only literal values
2243/// (strings, numbers, lists, dicts). It does NOT execute Python code, preventing
2244/// arbitrary code execution during scanning.
2245///
2246/// # DoS Prevention
2247///
2248/// - `MAX_SETUP_PY_BYTES`: Limits file size to 1MB
2249/// - `MAX_SETUP_PY_AST_DEPTH`: Limits recursion depth (50 levels)
2250/// - `MAX_SETUP_PY_AST_NODES`: Limits total nodes visited (10,000)
2251///
2252/// These limits prevent stack overflow and infinite loops on malformed/malicious inputs.
2253fn extract_from_setup_py_ast(content: &str) -> Result<Option<PackageData>, String> {
2254    let statements = ast::Suite::parse(content, "<setup.py>").map_err(|e| format!("{}", e))?;
2255    let aliases = collect_setup_aliases(&statements);
2256    let mut evaluator = LiteralEvaluator::new(HashMap::new());
2257    build_setup_py_constants(&statements, &mut evaluator);
2258
2259    let setup_call = find_setup_call(&statements, &aliases);
2260    let Some(call_expr) = setup_call else {
2261        return Ok(None);
2262    };
2263
2264    let setup_values = extract_setup_keywords(call_expr, &mut evaluator);
2265    Ok(Some(build_setup_py_package_data(&setup_values)))
2266}
2267
2268fn build_setup_py_constants(statements: &[ast::Stmt], evaluator: &mut LiteralEvaluator) {
2269    for stmt in statements {
2270        if let ast::Stmt::Assign(ast::StmtAssign { targets, value, .. }) = stmt {
2271            if targets.len() != 1 {
2272                continue;
2273            }
2274
2275            let Some(name) = extract_assign_name(&targets[0]) else {
2276                continue;
2277            };
2278
2279            if let Some(value) = evaluator.evaluate_expr(value.as_ref(), 0) {
2280                evaluator.insert_constant(name, value);
2281            }
2282        }
2283    }
2284}
2285
2286fn extract_assign_name(target: &ast::Expr) -> Option<String> {
2287    match target {
2288        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
2289        _ => None,
2290    }
2291}
2292
2293fn collect_setup_aliases(statements: &[ast::Stmt]) -> SetupAliases {
2294    let mut aliases = SetupAliases::default();
2295    aliases.setup_names.insert("setup".to_string());
2296
2297    for stmt in statements {
2298        match stmt {
2299            ast::Stmt::Import(ast::StmtImport { names, .. }) => {
2300                for alias in names {
2301                    let module_name = alias.name.as_str();
2302                    if !is_setup_module(module_name) {
2303                        continue;
2304                    }
2305                    let alias_name = alias
2306                        .asname
2307                        .as_ref()
2308                        .map(|name| name.as_str())
2309                        .unwrap_or(module_name);
2310                    aliases
2311                        .module_aliases
2312                        .insert(alias_name.to_string(), module_name.to_string());
2313                }
2314            }
2315            ast::Stmt::ImportFrom(ast::StmtImportFrom { module, names, .. }) => {
2316                let Some(module_name) = module.as_ref().map(|name| name.as_str()) else {
2317                    continue;
2318                };
2319                if !is_setup_module(module_name) {
2320                    continue;
2321                }
2322                for alias in names {
2323                    if alias.name.as_str() != "setup" {
2324                        continue;
2325                    }
2326                    let alias_name = alias
2327                        .asname
2328                        .as_ref()
2329                        .map(|name| name.as_str())
2330                        .unwrap_or("setup");
2331                    aliases.setup_names.insert(alias_name.to_string());
2332                }
2333            }
2334            _ => {}
2335        }
2336    }
2337
2338    aliases
2339}
2340
/// Reports whether `module_name` is one of the modules that provide `setup`:
/// `setuptools`, `distutils` or `distutils.core` (exact match).
fn is_setup_module(module_name: &str) -> bool {
    const SETUP_MODULES: [&str; 3] = ["setuptools", "distutils", "distutils.core"];
    SETUP_MODULES.contains(&module_name)
}
2344
2345fn find_setup_call<'a>(
2346    statements: &'a [ast::Stmt],
2347    aliases: &'a SetupAliases,
2348) -> Option<&'a ast::Expr> {
2349    let mut finder = SetupCallFinder {
2350        aliases,
2351        nodes_visited: 0,
2352    };
2353    finder.find_in_statements(statements)
2354}
2355
/// Walks statements looking for a `setup(...)` call, bounded by a node budget.
struct SetupCallFinder<'a> {
    /// Names and module aliases under which `setup` may be invoked.
    aliases: &'a SetupAliases,
    /// Number of statements and expressions inspected so far.
    nodes_visited: usize,
}
2360
impl<'a> SetupCallFinder<'a> {
    /// Returns the first `setup(...)` call found in `statements`, searching in
    /// source order and descending into `if`, `for`, `while`, `with` and `try`
    /// bodies. Other statement kinds are not searched.
    ///
    /// Returns `None` when nothing is found or once `MAX_SETUP_PY_AST_NODES`
    /// nodes have been visited.
    fn find_in_statements(&mut self, statements: &'a [ast::Stmt]) -> Option<&'a ast::Expr> {
        for stmt in statements {
            if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
                return None;
            }
            self.nodes_visited += 1;

            let found = match stmt {
                // Bare call statement, e.g. `setup(...)`.
                ast::Stmt::Expr(ast::StmtExpr { value, .. }) => self.visit_expr(value.as_ref()),
                // Call whose result is assigned, e.g. `result = setup(...)`.
                ast::Stmt::Assign(ast::StmtAssign { value, .. }) => self.visit_expr(value.as_ref()),
                ast::Stmt::If(ast::StmtIf { body, orelse, .. }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse)),
                ast::Stmt::For(ast::StmtFor { body, orelse, .. })
                | ast::Stmt::While(ast::StmtWhile { body, orelse, .. }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse)),
                ast::Stmt::With(ast::StmtWith { body, .. }) => self.find_in_statements(body),
                // For try statements the order is: body, else, finally, then handlers.
                ast::Stmt::Try(ast::StmtTry {
                    body,
                    orelse,
                    finalbody,
                    handlers,
                    ..
                })
                | ast::Stmt::TryStar(ast::StmtTryStar {
                    body,
                    orelse,
                    finalbody,
                    handlers,
                    ..
                }) => self
                    .find_in_statements(body)
                    .or_else(|| self.find_in_statements(orelse))
                    .or_else(|| self.find_in_statements(finalbody))
                    .or_else(|| {
                        for handler in handlers {
                            let ast::ExceptHandler::ExceptHandler(
                                ast::ExceptHandlerExceptHandler { body, .. },
                            ) = handler;
                            if let Some(found) = self.find_in_statements(body) {
                                return Some(found);
                            }
                        }
                        None
                    }),
                _ => None,
            };

            if found.is_some() {
                return found;
            }
        }

        None
    }

    /// Returns `expr` itself when it is a call to `setup` under the known
    /// aliases. Nested sub-expressions are not inspected. Counts against the
    /// same node budget as statements.
    fn visit_expr(&mut self, expr: &'a ast::Expr) -> Option<&'a ast::Expr> {
        if self.nodes_visited >= MAX_SETUP_PY_AST_NODES {
            return None;
        }
        self.nodes_visited += 1;

        match expr {
            ast::Expr::Call(ast::ExprCall { func, .. })
                if is_setup_call(func.as_ref(), self.aliases) =>
            {
                Some(expr)
            }
            _ => None,
        }
    }
}
2435
2436fn is_setup_call(func: &ast::Expr, aliases: &SetupAliases) -> bool {
2437    let Some(dotted) = dotted_name(func, 0) else {
2438        return false;
2439    };
2440
2441    if aliases.setup_names.contains(&dotted) {
2442        return true;
2443    }
2444
2445    let Some(module) = dotted.strip_suffix(".setup") else {
2446        return false;
2447    };
2448
2449    let resolved = resolve_module_alias(module, aliases);
2450    is_setup_module(&resolved)
2451}
2452
2453fn dotted_name(expr: &ast::Expr, depth: usize) -> Option<String> {
2454    if depth >= MAX_SETUP_PY_AST_DEPTH {
2455        return None;
2456    }
2457
2458    match expr {
2459        ast::Expr::Name(ast::ExprName { id, .. }) => Some(id.as_str().to_string()),
2460        ast::Expr::Attribute(ast::ExprAttribute { value, attr, .. }) => {
2461            let base = dotted_name(value.as_ref(), depth + 1)?;
2462            Some(format!("{}.{}", base, attr.as_str()))
2463        }
2464        _ => None,
2465    }
2466}
2467
2468fn resolve_module_alias(module: &str, aliases: &SetupAliases) -> String {
2469    if let Some(mapped) = aliases.module_aliases.get(module) {
2470        return mapped.clone();
2471    }
2472
2473    let Some((base, rest)) = module.split_once('.') else {
2474        return module.to_string();
2475    };
2476
2477    if let Some(mapped) = aliases.module_aliases.get(base) {
2478        return format!("{}.{}", mapped, rest);
2479    }
2480
2481    module.to_string()
2482}
2483
2484fn extract_setup_keywords(
2485    call_expr: &ast::Expr,
2486    evaluator: &mut LiteralEvaluator,
2487) -> HashMap<String, Value> {
2488    let mut values = HashMap::new();
2489    let ast::Expr::Call(ast::ExprCall { keywords, .. }) = call_expr else {
2490        return values;
2491    };
2492
2493    for keyword in keywords {
2494        if let Some(arg) = keyword.arg.as_ref().map(|name| name.as_str()) {
2495            if let Some(value) = evaluator.evaluate_expr(&keyword.value, 0) {
2496                values.insert(arg.to_string(), value);
2497            }
2498        } else if let Some(Value::Dict(dict)) = evaluator.evaluate_expr(&keyword.value, 0) {
2499            for (key, value) in dict {
2500                values.insert(key, value);
2501            }
2502        }
2503    }
2504
2505    values
2506}
2507
2508fn build_setup_py_package_data(values: &HashMap<String, Value>) -> PackageData {
2509    let name = get_value_string(values, "name");
2510    let version = get_value_string(values, "version");
2511    let description =
2512        get_value_string(values, "description").or_else(|| get_value_string(values, "summary"));
2513    let homepage_url =
2514        get_value_string(values, "url").or_else(|| get_value_string(values, "home_page"));
2515    let author = get_value_string(values, "author");
2516    let author_email = get_value_string(values, "author_email");
2517    let maintainer = get_value_string(values, "maintainer");
2518    let maintainer_email = get_value_string(values, "maintainer_email");
2519    let license = get_value_string(values, "license");
2520    let classifiers = values
2521        .get("classifiers")
2522        .and_then(value_to_string_list)
2523        .unwrap_or_default();
2524
2525    let mut parties = Vec::new();
2526    if author.is_some() || author_email.is_some() {
2527        parties.push(Party {
2528            r#type: Some("person".to_string()),
2529            role: Some("author".to_string()),
2530            name: author,
2531            email: author_email,
2532            url: None,
2533            organization: None,
2534            organization_url: None,
2535            timezone: None,
2536        });
2537    }
2538
2539    if maintainer.is_some() || maintainer_email.is_some() {
2540        parties.push(Party {
2541            r#type: Some("person".to_string()),
2542            role: Some("maintainer".to_string()),
2543            name: maintainer,
2544            email: maintainer_email,
2545            url: None,
2546            organization: None,
2547            organization_url: None,
2548            timezone: None,
2549        });
2550    }
2551
2552    // Extract license statement only - detection happens in separate engine
2553    let declared_license_expression = None;
2554    let declared_license_expression_spdx = None;
2555    let license_detections = Vec::new();
2556    let extracted_license_statement = license.clone();
2557
2558    let dependencies = build_setup_py_dependencies(values);
2559    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
2560    let mut homepage_from_project_urls = None;
2561    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
2562    let mut extra_data = HashMap::new();
2563
2564    if let Some(parsed_project_urls) = values.get("project_urls").and_then(value_to_string_pairs) {
2565        apply_project_url_mappings(
2566            &parsed_project_urls,
2567            &mut homepage_from_project_urls,
2568            &mut bug_tracking_url,
2569            &mut code_view_url,
2570            &mut vcs_url,
2571            &mut extra_data,
2572        );
2573    }
2574
2575    let extra_data = if extra_data.is_empty() {
2576        None
2577    } else {
2578        Some(extra_data)
2579    };
2580
2581    PackageData {
2582        package_type: Some(PythonParser::PACKAGE_TYPE),
2583        namespace: None,
2584        name,
2585        version,
2586        qualifiers: None,
2587        subpath: None,
2588        primary_language: None,
2589        description,
2590        release_date: None,
2591        parties,
2592        keywords: Vec::new(),
2593        homepage_url: homepage_url.or(homepage_from_project_urls),
2594        download_url: None,
2595        size: None,
2596        sha1: None,
2597        md5: None,
2598        sha256: None,
2599        sha512: None,
2600        bug_tracking_url,
2601        code_view_url,
2602        vcs_url,
2603        copyright: None,
2604        holder: None,
2605        declared_license_expression,
2606        declared_license_expression_spdx,
2607        license_detections,
2608        other_license_expression: None,
2609        other_license_expression_spdx: None,
2610        other_license_detections: Vec::new(),
2611        extracted_license_statement,
2612        notice_text: None,
2613        source_packages: Vec::new(),
2614        file_references: Vec::new(),
2615        is_private: has_private_classifier(&classifiers),
2616        is_virtual: false,
2617        extra_data,
2618        dependencies,
2619        repository_homepage_url: None,
2620        repository_download_url: None,
2621        api_data_url: None,
2622        datasource_id: Some(DatasourceId::PypiSetupPy),
2623        purl,
2624    }
2625}
2626
2627fn build_setup_py_dependencies(values: &HashMap<String, Value>) -> Vec<Dependency> {
2628    let mut dependencies = Vec::new();
2629
2630    if let Some(reqs) = values
2631        .get("install_requires")
2632        .and_then(value_to_string_list)
2633    {
2634        dependencies.extend(build_setup_py_dependency_list(&reqs, "install", false));
2635    }
2636
2637    if let Some(reqs) = values.get("tests_require").and_then(value_to_string_list) {
2638        dependencies.extend(build_setup_py_dependency_list(&reqs, "test", true));
2639    }
2640
2641    if let Some(Value::Dict(extras)) = values.get("extras_require") {
2642        let mut extra_items: Vec<_> = extras.iter().collect();
2643        extra_items.sort_by_key(|(name, _)| *name);
2644        for (extra_name, extra_value) in extra_items {
2645            if let Some(reqs) = value_to_string_list(extra_value) {
2646                dependencies.extend(build_setup_py_dependency_list(
2647                    reqs.as_slice(),
2648                    extra_name,
2649                    true,
2650                ));
2651            }
2652        }
2653    }
2654
2655    dependencies
2656}
2657
2658fn build_setup_py_dependency_list(
2659    reqs: &[String],
2660    scope: &str,
2661    is_optional: bool,
2662) -> Vec<Dependency> {
2663    reqs.iter()
2664        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
2665        .collect()
2666}
2667
2668fn get_value_string(values: &HashMap<String, Value>, key: &str) -> Option<String> {
2669    values.get(key).and_then(value_to_string)
2670}
2671
2672fn value_to_string(value: &Value) -> Option<String> {
2673    match value {
2674        Value::String(value) => Some(value.clone()),
2675        Value::Number(value) => Some(value.to_string()),
2676        Value::Bool(value) => Some(value.to_string()),
2677        _ => None,
2678    }
2679}
2680
2681fn value_to_string_list(value: &Value) -> Option<Vec<String>> {
2682    match value {
2683        Value::String(value) => Some(vec![value.clone()]),
2684        Value::List(values) | Value::Tuple(values) => {
2685            let mut items = Vec::new();
2686            for item in values {
2687                items.push(value_to_string(item)?);
2688            }
2689            Some(items)
2690        }
2691        _ => None,
2692    }
2693}
2694
2695fn value_to_string_pairs(value: &Value) -> Option<Vec<(String, String)>> {
2696    let Value::Dict(dict) = value else {
2697        return None;
2698    };
2699
2700    let mut pairs: Vec<(String, String)> = dict
2701        .iter()
2702        .map(|(key, value)| Some((key.clone(), value_to_string(value)?)))
2703        .collect::<Option<Vec<_>>>()?;
2704    pairs.sort_by(|left, right| left.0.cmp(&right.0));
2705    Some(pairs)
2706}
2707
2708fn extract_rfc822_dependencies(headers: &HashMap<String, Vec<String>>) -> Vec<Dependency> {
2709    let requires_dist = super::rfc822::get_header_all(headers, "requires-dist");
2710    requires_dist
2711        .iter()
2712        .filter_map(|entry| build_rfc822_dependency(entry))
2713        .collect()
2714}
2715
2716fn build_rfc822_dependency(entry: &str) -> Option<Dependency> {
2717    build_python_dependency(entry, "install", false, None)
2718}
2719
2720fn build_python_dependency(
2721    entry: &str,
2722    default_scope: &str,
2723    default_optional: bool,
2724    marker_override: Option<&str>,
2725) -> Option<Dependency> {
2726    let (requirement_part, marker_part) = entry
2727        .split_once(';')
2728        .map(|(req, marker)| (req.trim(), Some(marker.trim())))
2729        .unwrap_or((entry.trim(), None));
2730
2731    let name = extract_setup_cfg_dependency_name(requirement_part)?;
2732    let requirement = normalize_rfc822_requirement(requirement_part);
2733    let (scope, is_optional, marker, marker_data) = parse_rfc822_marker(
2734        marker_part.or(marker_override),
2735        default_scope,
2736        default_optional,
2737    );
2738    let mut purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
2739
2740    let is_pinned = requirement
2741        .as_deref()
2742        .is_some_and(|req| req.starts_with("==") || req.starts_with("==="));
2743    if is_pinned
2744        && let Some(version) = requirement
2745            .as_deref()
2746            .map(|req| req.trim_start_matches('='))
2747    {
2748        purl.with_version(version).ok()?;
2749    }
2750
2751    let mut extra_data = HashMap::new();
2752    extra_data.extend(marker_data);
2753    if let Some(marker) = marker {
2754        extra_data.insert("marker".to_string(), serde_json::Value::String(marker));
2755    }
2756
2757    Some(Dependency {
2758        purl: Some(purl.to_string()),
2759        extracted_requirement: requirement,
2760        scope: Some(scope),
2761        is_runtime: Some(true),
2762        is_optional: Some(is_optional),
2763        is_pinned: Some(is_pinned),
2764        is_direct: Some(true),
2765        resolved_package: None,
2766        extra_data: if extra_data.is_empty() {
2767            None
2768        } else {
2769            Some(extra_data)
2770        },
2771    })
2772}
2773
2774fn normalize_rfc822_requirement(requirement_part: &str) -> Option<String> {
2775    let name = extract_setup_cfg_dependency_name(requirement_part)?;
2776    let trimmed = requirement_part.trim();
2777    let mut remainder = trimmed[name.len()..].trim();
2778
2779    if let Some(stripped) = remainder.strip_prefix('[')
2780        && let Some(end_idx) = stripped.find(']')
2781    {
2782        remainder = stripped[end_idx + 1..].trim();
2783    }
2784
2785    let remainder = remainder
2786        .strip_prefix('(')
2787        .and_then(|value| value.strip_suffix(')'))
2788        .unwrap_or(remainder)
2789        .trim();
2790
2791    if remainder.is_empty() {
2792        return None;
2793    }
2794
2795    let mut specifiers: Vec<String> = remainder
2796        .split(',')
2797        .map(|specifier| specifier.trim().replace(' ', ""))
2798        .filter(|specifier| !specifier.is_empty())
2799        .collect();
2800    specifiers.sort();
2801    Some(specifiers.join(","))
2802}
2803
2804fn parse_rfc822_marker(
2805    marker_part: Option<&str>,
2806    default_scope: &str,
2807    default_optional: bool,
2808) -> (
2809    String,
2810    bool,
2811    Option<String>,
2812    HashMap<String, serde_json::Value>,
2813) {
2814    let Some(marker) = marker_part.filter(|marker| !marker.trim().is_empty()) else {
2815        return (
2816            default_scope.to_string(),
2817            default_optional,
2818            None,
2819            HashMap::new(),
2820        );
2821    };
2822
2823    let extra_re = Regex::new(r#"extra\s*==\s*['\"]([^'\"]+)['\"]"#)
2824        .expect("extra marker regex should compile");
2825    let mut extra_data = HashMap::new();
2826
2827    if let Some(python_version) = extract_marker_field(marker, "python_version") {
2828        extra_data.insert(
2829            "python_version".to_string(),
2830            serde_json::Value::String(python_version),
2831        );
2832    }
2833    if let Some(sys_platform) = extract_marker_field(marker, "sys_platform") {
2834        extra_data.insert(
2835            "sys_platform".to_string(),
2836            serde_json::Value::String(sys_platform),
2837        );
2838    }
2839
2840    if let Some(captures) = extra_re.captures(marker)
2841        && let Some(scope) = captures.get(1)
2842    {
2843        return (
2844            scope.as_str().to_string(),
2845            true,
2846            Some(marker.trim().to_string()),
2847            extra_data,
2848        );
2849    }
2850
2851    (
2852        default_scope.to_string(),
2853        default_optional,
2854        Some(marker.trim().to_string()),
2855        extra_data,
2856    )
2857}
2858
2859fn extract_marker_field(marker: &str, field: &str) -> Option<String> {
2860    let re = Regex::new(&format!(
2861        r#"{}\s*(==|!=|<=|>=|<|>)\s*['\"]([^'\"]+)['\"]"#,
2862        field
2863    ))
2864    .ok()?;
2865    let captures = re.captures(marker)?;
2866    let operator = captures.get(1)?.as_str();
2867    let value = captures.get(2)?.as_str();
2868    Some(format!("{} {}", operator, value))
2869}
2870
2871fn parse_requires_txt(content: &str) -> Vec<Dependency> {
2872    let mut dependencies = Vec::new();
2873    let mut current_scope = "install".to_string();
2874    let mut current_optional = false;
2875    let mut current_marker: Option<String> = None;
2876
2877    for line in content.lines() {
2878        let trimmed = line.trim();
2879        if trimmed.is_empty() || trimmed.starts_with('#') {
2880            continue;
2881        }
2882
2883        if trimmed.starts_with('[') && trimmed.ends_with(']') {
2884            let inner = &trimmed[1..trimmed.len() - 1];
2885            if let Some(rest) = inner.strip_prefix(':') {
2886                current_scope = "install".to_string();
2887                current_optional = false;
2888                current_marker = Some(rest.trim().to_string());
2889            } else if let Some((scope, marker)) = inner.split_once(':') {
2890                current_scope = scope.trim().to_string();
2891                current_optional = true;
2892                current_marker = Some(marker.trim().to_string());
2893            } else {
2894                current_scope = inner.trim().to_string();
2895                current_optional = true;
2896                current_marker = None;
2897            }
2898            continue;
2899        }
2900
2901        if let Some(dependency) = build_python_dependency(
2902            trimmed,
2903            &current_scope,
2904            current_optional,
2905            current_marker.as_deref(),
2906        ) {
2907            dependencies.push(dependency);
2908        }
2909    }
2910
2911    dependencies
2912}
2913
/// Reports whether any classifier equals `Private :: Do Not Upload`,
/// compared without regard to ASCII case.
fn has_private_classifier(classifiers: &[String]) -> bool {
    const PRIVATE_MARKER: &str = "Private :: Do Not Upload";

    for classifier in classifiers {
        if classifier.eq_ignore_ascii_case(PRIVATE_MARKER) {
            return true;
        }
    }
    false
}
2919
2920fn build_setup_py_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
2921    let name = name?;
2922    let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), name).ok()?;
2923    if let Some(version) = version {
2924        package_url.with_version(version).ok()?;
2925    }
2926    Some(package_url.to_string())
2927}
2928
2929fn extract_from_setup_py_regex(content: &str) -> PackageData {
2930    let name = extract_setup_value(content, "name");
2931    let version = extract_setup_value(content, "version");
2932    let license_expression = extract_setup_value(content, "license");
2933
2934    // Extract license statement only - detection happens in separate engine
2935    let declared_license_expression = None;
2936    let declared_license_expression_spdx = None;
2937    let license_detections = Vec::new();
2938    let extracted_license_statement = license_expression.clone();
2939
2940    let dependencies = extract_setup_py_dependencies(content);
2941    let homepage_url = extract_setup_value(content, "url");
2942    let purl = build_setup_py_purl(name.as_deref(), version.as_deref());
2943
2944    PackageData {
2945        package_type: Some(PythonParser::PACKAGE_TYPE),
2946        namespace: None,
2947        name,
2948        version,
2949        qualifiers: None,
2950        subpath: None,
2951        primary_language: None,
2952        description: None,
2953        release_date: None,
2954        parties: Vec::new(),
2955        keywords: Vec::new(),
2956        homepage_url,
2957        download_url: None,
2958        size: None,
2959        sha1: None,
2960        md5: None,
2961        sha256: None,
2962        sha512: None,
2963        bug_tracking_url: None,
2964        code_view_url: None,
2965        vcs_url: None,
2966        copyright: None,
2967        holder: None,
2968        declared_license_expression,
2969        declared_license_expression_spdx,
2970        license_detections,
2971        other_license_expression: None,
2972        other_license_expression_spdx: None,
2973        other_license_detections: Vec::new(),
2974        extracted_license_statement,
2975        notice_text: None,
2976        source_packages: Vec::new(),
2977        file_references: Vec::new(),
2978        is_private: false,
2979        is_virtual: false,
2980        extra_data: None,
2981        dependencies,
2982        repository_homepage_url: None,
2983        repository_download_url: None,
2984        api_data_url: None,
2985        datasource_id: Some(DatasourceId::PypiSetupPy),
2986        purl,
2987    }
2988}
2989
2990fn package_data_to_resolved(pkg: &PackageData) -> crate::models::ResolvedPackage {
2991    crate::models::ResolvedPackage {
2992        package_type: pkg.package_type.unwrap_or(PackageType::Pypi),
2993        namespace: pkg.namespace.clone().unwrap_or_default(),
2994        name: pkg.name.clone().unwrap_or_default(),
2995        version: pkg.version.clone().unwrap_or_default(),
2996        primary_language: pkg.primary_language.clone(),
2997        download_url: pkg.download_url.clone(),
2998        sha1: pkg.sha1.clone(),
2999        sha256: pkg.sha256.clone(),
3000        sha512: pkg.sha512.clone(),
3001        md5: pkg.md5.clone(),
3002        is_virtual: pkg.is_virtual,
3003        extra_data: None,
3004        dependencies: pkg.dependencies.clone(),
3005        repository_homepage_url: pkg.repository_homepage_url.clone(),
3006        repository_download_url: pkg.repository_download_url.clone(),
3007        api_data_url: pkg.api_data_url.clone(),
3008        datasource_id: pkg.datasource_id,
3009        purl: pkg.purl.clone(),
3010    }
3011}
3012
3013fn extract_from_pypi_json(path: &Path) -> PackageData {
3014    let default = PackageData {
3015        package_type: Some(PythonParser::PACKAGE_TYPE),
3016        datasource_id: Some(DatasourceId::PypiJson),
3017        ..Default::default()
3018    };
3019
3020    let content = match read_file_to_string(path) {
3021        Ok(content) => content,
3022        Err(error) => {
3023            warn!("Failed to read pypi.json at {:?}: {}", path, error);
3024            return default;
3025        }
3026    };
3027
3028    let root: serde_json::Value = match serde_json::from_str(&content) {
3029        Ok(value) => value,
3030        Err(error) => {
3031            warn!("Failed to parse pypi.json at {:?}: {}", path, error);
3032            return default;
3033        }
3034    };
3035
3036    let Some(info) = root.get("info").and_then(|value| value.as_object()) else {
3037        warn!("No info object found in pypi.json at {:?}", path);
3038        return default;
3039    };
3040
3041    let name = info
3042        .get("name")
3043        .and_then(|value| value.as_str())
3044        .map(ToOwned::to_owned);
3045    let version = info
3046        .get("version")
3047        .and_then(|value| value.as_str())
3048        .map(ToOwned::to_owned);
3049    let summary = info
3050        .get("summary")
3051        .and_then(|value| value.as_str())
3052        .map(ToOwned::to_owned);
3053    let description = info
3054        .get("description")
3055        .and_then(|value| value.as_str())
3056        .filter(|value| !value.trim().is_empty())
3057        .map(ToOwned::to_owned)
3058        .or(summary);
3059    let mut homepage_url = info
3060        .get("home_page")
3061        .and_then(|value| value.as_str())
3062        .map(ToOwned::to_owned);
3063    let author = info
3064        .get("author")
3065        .and_then(|value| value.as_str())
3066        .filter(|value| !value.trim().is_empty())
3067        .map(ToOwned::to_owned);
3068    let author_email = info
3069        .get("author_email")
3070        .and_then(|value| value.as_str())
3071        .filter(|value| !value.trim().is_empty())
3072        .map(ToOwned::to_owned);
3073    let license = info
3074        .get("license")
3075        .and_then(|value| value.as_str())
3076        .filter(|value| !value.trim().is_empty())
3077        .map(ToOwned::to_owned);
3078    let keywords = parse_setup_cfg_keywords(
3079        info.get("keywords")
3080            .and_then(|value| value.as_str())
3081            .map(ToOwned::to_owned),
3082    );
3083    let classifiers = info
3084        .get("classifiers")
3085        .and_then(|value| value.as_array())
3086        .map(|values| {
3087            values
3088                .iter()
3089                .filter_map(|value| value.as_str().map(ToOwned::to_owned))
3090                .collect::<Vec<_>>()
3091        })
3092        .unwrap_or_default();
3093
3094    let mut parties = Vec::new();
3095    if author.is_some() || author_email.is_some() {
3096        parties.push(Party {
3097            r#type: Some("person".to_string()),
3098            role: Some("author".to_string()),
3099            name: author,
3100            email: author_email,
3101            url: None,
3102            organization: None,
3103            organization_url: None,
3104            timezone: None,
3105        });
3106    }
3107
3108    let mut bug_tracking_url = None;
3109    let mut code_view_url = None;
3110    let mut vcs_url = None;
3111    let mut extra_data = HashMap::new();
3112
3113    let parsed_project_urls = info
3114        .get("project_urls")
3115        .and_then(|value| value.as_object())
3116        .map(|map| {
3117            let mut pairs: Vec<(String, String)> = map
3118                .iter()
3119                .filter_map(|(key, value)| Some((key.clone(), value.as_str()?.to_string())))
3120                .collect();
3121            pairs.sort_by(|left, right| left.0.cmp(&right.0));
3122            pairs
3123        })
3124        .unwrap_or_default();
3125
3126    apply_project_url_mappings(
3127        &parsed_project_urls,
3128        &mut homepage_url,
3129        &mut bug_tracking_url,
3130        &mut code_view_url,
3131        &mut vcs_url,
3132        &mut extra_data,
3133    );
3134
3135    let (download_url, size, sha256) = root
3136        .get("urls")
3137        .and_then(|value| value.as_array())
3138        .map(|urls| select_pypi_json_artifact(urls))
3139        .unwrap_or((None, None, None));
3140
3141    let (repository_homepage_url, repository_download_url, api_data_url, purl) =
3142        build_pypi_urls(name.as_deref(), version.as_deref());
3143
3144    PackageData {
3145        package_type: Some(PythonParser::PACKAGE_TYPE),
3146        namespace: None,
3147        name,
3148        version,
3149        qualifiers: None,
3150        subpath: None,
3151        primary_language: None,
3152        description,
3153        release_date: None,
3154        parties,
3155        keywords,
3156        homepage_url: homepage_url.or(repository_homepage_url.clone()),
3157        download_url,
3158        size,
3159        sha1: None,
3160        md5: None,
3161        sha256,
3162        sha512: None,
3163        bug_tracking_url,
3164        code_view_url,
3165        vcs_url,
3166        copyright: None,
3167        holder: None,
3168        declared_license_expression: None,
3169        declared_license_expression_spdx: None,
3170        license_detections: Vec::new(),
3171        other_license_expression: None,
3172        other_license_expression_spdx: None,
3173        other_license_detections: Vec::new(),
3174        extracted_license_statement: license,
3175        notice_text: None,
3176        source_packages: Vec::new(),
3177        file_references: Vec::new(),
3178        is_private: has_private_classifier(&classifiers),
3179        is_virtual: false,
3180        extra_data: if extra_data.is_empty() {
3181            None
3182        } else {
3183            Some(extra_data)
3184        },
3185        dependencies: Vec::new(),
3186        repository_homepage_url,
3187        repository_download_url,
3188        api_data_url,
3189        datasource_id: Some(DatasourceId::PypiJson),
3190        purl,
3191    }
3192}
3193
3194fn select_pypi_json_artifact(
3195    urls: &[serde_json::Value],
3196) -> (Option<String>, Option<u64>, Option<String>) {
3197    let selected = urls
3198        .iter()
3199        .find(|entry| entry.get("packagetype").and_then(|value| value.as_str()) == Some("sdist"))
3200        .or_else(|| urls.first());
3201
3202    let Some(entry) = selected else {
3203        return (None, None, None);
3204    };
3205
3206    let download_url = entry
3207        .get("url")
3208        .and_then(|value| value.as_str())
3209        .map(ToOwned::to_owned);
3210    let size = entry.get("size").and_then(|value| value.as_u64());
3211    let sha256 = entry
3212        .get("digests")
3213        .and_then(|value| value.as_object())
3214        .and_then(|digests| digests.get("sha256"))
3215        .and_then(|value| value.as_str())
3216        .map(ToOwned::to_owned);
3217
3218    (download_url, size, sha256)
3219}
3220
3221fn extract_from_pip_inspect(path: &Path) -> PackageData {
3222    let content = match read_file_to_string(path) {
3223        Ok(content) => content,
3224        Err(e) => {
3225            warn!("Failed to read pip-inspect.deplock at {:?}: {}", path, e);
3226            return default_package_data();
3227        }
3228    };
3229
3230    let root: serde_json::Value = match serde_json::from_str(&content) {
3231        Ok(value) => value,
3232        Err(e) => {
3233            warn!(
3234                "Failed to parse pip-inspect.deplock JSON at {:?}: {}",
3235                path, e
3236            );
3237            return default_package_data();
3238        }
3239    };
3240
3241    let installed = match root.get("installed").and_then(|v| v.as_array()) {
3242        Some(arr) => arr,
3243        None => {
3244            warn!(
3245                "No 'installed' array found in pip-inspect.deplock at {:?}",
3246                path
3247            );
3248            return default_package_data();
3249        }
3250    };
3251
3252    let pip_version = root
3253        .get("pip_version")
3254        .and_then(|v| v.as_str())
3255        .map(String::from);
3256    let inspect_version = root
3257        .get("version")
3258        .and_then(|v| v.as_str())
3259        .map(String::from);
3260
3261    let mut main_package: Option<PackageData> = None;
3262    let mut dependencies: Vec<Dependency> = Vec::new();
3263
3264    for package_entry in installed {
3265        let metadata = match package_entry.get("metadata") {
3266            Some(m) => m,
3267            None => continue,
3268        };
3269
3270        let is_requested = package_entry
3271            .get("requested")
3272            .and_then(|v| v.as_bool())
3273            .unwrap_or(false);
3274        let has_direct_url = package_entry.get("direct_url").is_some();
3275
3276        let name = metadata
3277            .get("name")
3278            .and_then(|v| v.as_str())
3279            .map(String::from);
3280        let version = metadata
3281            .get("version")
3282            .and_then(|v| v.as_str())
3283            .map(String::from);
3284        let summary = metadata
3285            .get("summary")
3286            .and_then(|v| v.as_str())
3287            .map(String::from);
3288        let home_page = metadata
3289            .get("home_page")
3290            .and_then(|v| v.as_str())
3291            .map(String::from);
3292        let author = metadata
3293            .get("author")
3294            .and_then(|v| v.as_str())
3295            .map(String::from);
3296        let author_email = metadata
3297            .get("author_email")
3298            .and_then(|v| v.as_str())
3299            .map(String::from);
3300        let license = metadata
3301            .get("license")
3302            .and_then(|v| v.as_str())
3303            .map(String::from);
3304        let description = metadata
3305            .get("description")
3306            .and_then(|v| v.as_str())
3307            .map(String::from);
3308        let keywords = metadata
3309            .get("keywords")
3310            .and_then(|v| v.as_array())
3311            .map(|arr| {
3312                arr.iter()
3313                    .filter_map(|k| k.as_str().map(String::from))
3314                    .collect::<Vec<_>>()
3315            })
3316            .unwrap_or_default();
3317
3318        let mut parties = Vec::new();
3319        if author.is_some() || author_email.is_some() {
3320            parties.push(Party {
3321                r#type: Some("person".to_string()),
3322                role: Some("author".to_string()),
3323                name: author,
3324                email: author_email,
3325                url: None,
3326                organization: None,
3327                organization_url: None,
3328                timezone: None,
3329            });
3330        }
3331
3332        // Extract license statement only - detection happens in separate engine
3333        let license_detections = Vec::new();
3334        let declared_license_expression = None;
3335        let declared_license_expression_spdx = None;
3336        let extracted_license_statement = license.clone();
3337
3338        let purl = name.as_ref().and_then(|n| {
3339            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
3340            if let Some(v) = &version {
3341                package_url.with_version(v).ok()?;
3342            }
3343            Some(package_url.to_string())
3344        });
3345
3346        if is_requested && has_direct_url {
3347            let mut extra_data = HashMap::new();
3348            if let Some(pv) = &pip_version {
3349                extra_data.insert(
3350                    "pip_version".to_string(),
3351                    serde_json::Value::String(pv.clone()),
3352                );
3353            }
3354            if let Some(iv) = &inspect_version {
3355                extra_data.insert(
3356                    "inspect_version".to_string(),
3357                    serde_json::Value::String(iv.clone()),
3358                );
3359            }
3360
3361            main_package = Some(PackageData {
3362                package_type: Some(PythonParser::PACKAGE_TYPE),
3363                namespace: None,
3364                name,
3365                version,
3366                qualifiers: None,
3367                subpath: None,
3368                primary_language: Some("Python".to_string()),
3369                description: description.or(summary),
3370                release_date: None,
3371                parties,
3372                keywords,
3373                homepage_url: home_page,
3374                download_url: None,
3375                size: None,
3376                sha1: None,
3377                md5: None,
3378                sha256: None,
3379                sha512: None,
3380                bug_tracking_url: None,
3381                code_view_url: None,
3382                vcs_url: None,
3383                copyright: None,
3384                holder: None,
3385                declared_license_expression,
3386                declared_license_expression_spdx,
3387                license_detections,
3388                other_license_expression: None,
3389                other_license_expression_spdx: None,
3390                other_license_detections: Vec::new(),
3391                extracted_license_statement,
3392                notice_text: None,
3393                source_packages: Vec::new(),
3394                file_references: Vec::new(),
3395                is_private: false,
3396                is_virtual: true,
3397                extra_data: if extra_data.is_empty() {
3398                    None
3399                } else {
3400                    Some(extra_data)
3401                },
3402                dependencies: Vec::new(),
3403                repository_homepage_url: None,
3404                repository_download_url: None,
3405                api_data_url: None,
3406                datasource_id: Some(DatasourceId::PypiInspectDeplock),
3407                purl,
3408            });
3409        } else {
3410            let resolved_package = PackageData {
3411                package_type: Some(PythonParser::PACKAGE_TYPE),
3412                namespace: None,
3413                name: name.clone(),
3414                version: version.clone(),
3415                qualifiers: None,
3416                subpath: None,
3417                primary_language: Some("Python".to_string()),
3418                description: description.or(summary),
3419                release_date: None,
3420                parties,
3421                keywords,
3422                homepage_url: home_page,
3423                download_url: None,
3424                size: None,
3425                sha1: None,
3426                md5: None,
3427                sha256: None,
3428                sha512: None,
3429                bug_tracking_url: None,
3430                code_view_url: None,
3431                vcs_url: None,
3432                copyright: None,
3433                holder: None,
3434                declared_license_expression,
3435                declared_license_expression_spdx,
3436                license_detections,
3437                other_license_expression: None,
3438                other_license_expression_spdx: None,
3439                other_license_detections: Vec::new(),
3440                extracted_license_statement,
3441                notice_text: None,
3442                source_packages: Vec::new(),
3443                file_references: Vec::new(),
3444                is_private: false,
3445                is_virtual: true,
3446                extra_data: None,
3447                dependencies: Vec::new(),
3448                repository_homepage_url: None,
3449                repository_download_url: None,
3450                api_data_url: None,
3451                datasource_id: Some(DatasourceId::PypiInspectDeplock),
3452                purl: purl.clone(),
3453            };
3454
3455            let resolved = package_data_to_resolved(&resolved_package);
3456            dependencies.push(Dependency {
3457                purl,
3458                extracted_requirement: None,
3459                scope: None,
3460                is_runtime: Some(true),
3461                is_optional: Some(false),
3462                is_pinned: Some(true),
3463                is_direct: Some(is_requested),
3464                resolved_package: Some(Box::new(resolved)),
3465                extra_data: None,
3466            });
3467        }
3468    }
3469
3470    if let Some(mut main_pkg) = main_package {
3471        main_pkg.dependencies = dependencies;
3472        main_pkg
3473    } else {
3474        default_package_data()
3475    }
3476}
3477
/// Nested map of string-list values: outer key → inner key → values.
///
/// NOTE(review): presumably section name → option name → values of a parsed
/// `setup.cfg` (see the `get_ini_value(&sections, "metadata", "name")` lookups
/// below) — confirm against `parse_setup_cfg`.
type IniSections = HashMap<String, HashMap<String, Vec<String>>>;
3479
3480fn extract_from_setup_cfg(path: &Path) -> PackageData {
3481    let content = match read_file_to_string(path) {
3482        Ok(content) => content,
3483        Err(e) => {
3484            warn!("Failed to read setup.cfg at {:?}: {}", path, e);
3485            return default_package_data();
3486        }
3487    };
3488
3489    let sections = parse_setup_cfg(&content);
3490    let name = get_ini_value(&sections, "metadata", "name");
3491    let version = get_ini_value(&sections, "metadata", "version");
3492    let description = get_ini_value(&sections, "metadata", "description");
3493    let author = get_ini_value(&sections, "metadata", "author");
3494    let author_email = get_ini_value(&sections, "metadata", "author_email");
3495    let maintainer = get_ini_value(&sections, "metadata", "maintainer");
3496    let maintainer_email = get_ini_value(&sections, "metadata", "maintainer_email");
3497    let license = get_ini_value(&sections, "metadata", "license");
3498    let mut homepage_url = get_ini_value(&sections, "metadata", "url");
3499    let classifiers = get_ini_values(&sections, "metadata", "classifiers");
3500    let keywords = parse_setup_cfg_keywords(get_ini_value(&sections, "metadata", "keywords"));
3501    let python_requires = get_ini_value(&sections, "options", "python_requires");
3502    let parsed_project_urls =
3503        parse_setup_cfg_project_urls(&get_ini_values(&sections, "metadata", "project_urls"));
3504    let (mut bug_tracking_url, mut code_view_url, mut vcs_url) = (None, None, None);
3505    let mut extra_data = HashMap::new();
3506
3507    let mut parties = Vec::new();
3508    if author.is_some() || author_email.is_some() {
3509        parties.push(Party {
3510            r#type: Some("person".to_string()),
3511            role: Some("author".to_string()),
3512            name: author,
3513            email: author_email,
3514            url: None,
3515            organization: None,
3516            organization_url: None,
3517            timezone: None,
3518        });
3519    }
3520
3521    if maintainer.is_some() || maintainer_email.is_some() {
3522        parties.push(Party {
3523            r#type: Some("person".to_string()),
3524            role: Some("maintainer".to_string()),
3525            name: maintainer,
3526            email: maintainer_email,
3527            url: None,
3528            organization: None,
3529            organization_url: None,
3530            timezone: None,
3531        });
3532    }
3533
3534    // Extract license statement only - detection happens in separate engine
3535    let declared_license_expression = None;
3536    let declared_license_expression_spdx = None;
3537    let license_detections = Vec::new();
3538    let extracted_license_statement = license.clone();
3539
3540    let dependencies = extract_setup_cfg_dependencies(&sections);
3541
3542    if let Some(value) = python_requires {
3543        extra_data.insert(
3544            "python_requires".to_string(),
3545            serde_json::Value::String(value),
3546        );
3547    }
3548
3549    apply_project_url_mappings(
3550        &parsed_project_urls,
3551        &mut homepage_url,
3552        &mut bug_tracking_url,
3553        &mut code_view_url,
3554        &mut vcs_url,
3555        &mut extra_data,
3556    );
3557
3558    let extra_data = if extra_data.is_empty() {
3559        None
3560    } else {
3561        Some(extra_data)
3562    };
3563
3564    let purl = name.as_ref().and_then(|n| {
3565        let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), n).ok()?;
3566        if let Some(v) = &version {
3567            package_url.with_version(v).ok()?;
3568        }
3569        Some(package_url.to_string())
3570    });
3571
3572    PackageData {
3573        package_type: Some(PythonParser::PACKAGE_TYPE),
3574        namespace: None,
3575        name,
3576        version,
3577        qualifiers: None,
3578        subpath: None,
3579        primary_language: Some("Python".to_string()),
3580        description,
3581        release_date: None,
3582        parties,
3583        keywords,
3584        homepage_url,
3585        download_url: None,
3586        size: None,
3587        sha1: None,
3588        md5: None,
3589        sha256: None,
3590        sha512: None,
3591        bug_tracking_url,
3592        code_view_url,
3593        vcs_url,
3594        copyright: None,
3595        holder: None,
3596        declared_license_expression,
3597        declared_license_expression_spdx,
3598        license_detections,
3599        other_license_expression: None,
3600        other_license_expression_spdx: None,
3601        other_license_detections: Vec::new(),
3602        extracted_license_statement,
3603        notice_text: None,
3604        source_packages: Vec::new(),
3605        file_references: Vec::new(),
3606        is_private: has_private_classifier(&classifiers),
3607        is_virtual: false,
3608        extra_data,
3609        dependencies,
3610        repository_homepage_url: None,
3611        repository_download_url: None,
3612        api_data_url: None,
3613        datasource_id: Some(DatasourceId::PypiSetupCfg),
3614        purl,
3615    }
3616}
3617
/// Splits a comma-separated `keywords` value into individual keywords.
///
/// Each piece is trimmed and empty pieces are discarded. A missing value
/// (`None`) yields an empty list.
fn parse_setup_cfg_keywords(value: Option<String>) -> Vec<String> {
    let mut keywords = Vec::new();

    if let Some(raw) = value {
        for candidate in raw.split(',') {
            let candidate = candidate.trim();
            if !candidate.is_empty() {
                keywords.push(candidate.to_string());
            }
        }
    }

    keywords
}
3630
/// Turns `label = url` entries into `(label, url)` pairs.
///
/// Each entry is split at its first `=`; both halves are trimmed. Entries
/// without an `=`, or whose label or URL is empty after trimming, are dropped.
/// The input order is preserved.
fn parse_setup_cfg_project_urls(entries: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();

    for entry in entries {
        let Some((raw_label, raw_url)) = entry.split_once('=') else {
            continue;
        };
        let (label, url) = (raw_label.trim(), raw_url.trim());
        if !label.is_empty() && !url.is_empty() {
            pairs.push((label.to_owned(), url.to_owned()));
        }
    }

    pairs
}
3646
3647fn apply_project_url_mappings(
3648    parsed_urls: &[(String, String)],
3649    homepage_url: &mut Option<String>,
3650    bug_tracking_url: &mut Option<String>,
3651    code_view_url: &mut Option<String>,
3652    vcs_url: &mut Option<String>,
3653    extra_data: &mut HashMap<String, serde_json::Value>,
3654) {
3655    for (label, url) in parsed_urls {
3656        let label_lower = label.to_lowercase();
3657
3658        if bug_tracking_url.is_none()
3659            && matches!(
3660                label_lower.as_str(),
3661                "tracker"
3662                    | "bug reports"
3663                    | "bug tracker"
3664                    | "issues"
3665                    | "issue tracker"
3666                    | "github: issues"
3667            )
3668        {
3669            *bug_tracking_url = Some(url.clone());
3670        } else if code_view_url.is_none()
3671            && matches!(label_lower.as_str(), "source" | "source code" | "code")
3672        {
3673            *code_view_url = Some(url.clone());
3674        } else if vcs_url.is_none()
3675            && matches!(
3676                label_lower.as_str(),
3677                "github" | "gitlab" | "github: repo" | "repository"
3678            )
3679        {
3680            *vcs_url = Some(url.clone());
3681        } else if homepage_url.is_none()
3682            && matches!(label_lower.as_str(), "website" | "homepage" | "home")
3683        {
3684            *homepage_url = Some(url.clone());
3685        } else if label_lower == "changelog" {
3686            extra_data.insert(
3687                "changelog_url".to_string(),
3688                serde_json::Value::String(url.clone()),
3689            );
3690        }
3691    }
3692
3693    let project_urls_json: serde_json::Map<String, serde_json::Value> = parsed_urls
3694        .iter()
3695        .map(|(label, url)| (label.clone(), serde_json::Value::String(url.clone())))
3696        .collect();
3697
3698    if !project_urls_json.is_empty() {
3699        extra_data.insert(
3700            "project_urls".to_string(),
3701            serde_json::Value::Object(project_urls_json),
3702        );
3703    }
3704}
3705
3706fn parse_setup_cfg(content: &str) -> IniSections {
3707    let mut sections: IniSections = HashMap::new();
3708    let mut current_section: Option<String> = None;
3709    let mut current_key: Option<String> = None;
3710
3711    for raw_line in content.lines() {
3712        let line = raw_line.trim_end_matches('\r');
3713        let trimmed = line.trim();
3714        if trimmed.is_empty() {
3715            continue;
3716        }
3717
3718        let stripped = line.trim_start();
3719        if stripped.starts_with('#') || stripped.starts_with(';') {
3720            continue;
3721        }
3722
3723        if stripped.starts_with('[') && stripped.ends_with(']') {
3724            let section_name = stripped
3725                .trim_start_matches('[')
3726                .trim_end_matches(']')
3727                .trim()
3728                .to_ascii_lowercase();
3729            current_section = if section_name.is_empty() {
3730                None
3731            } else {
3732                Some(section_name)
3733            };
3734            current_key = None;
3735            continue;
3736        }
3737
3738        if (line.starts_with(' ') || line.starts_with('\t')) && current_key.is_some() {
3739            if let (Some(section), Some(key)) = (current_section.as_ref(), current_key.as_ref()) {
3740                let value = stripped.trim();
3741                if !value.is_empty() {
3742                    sections
3743                        .entry(section.clone())
3744                        .or_default()
3745                        .entry(key.clone())
3746                        .or_default()
3747                        .push(value.to_string());
3748                }
3749            }
3750            continue;
3751        }
3752
3753        if let Some((key, value)) = stripped.split_once('=')
3754            && let Some(section) = current_section.as_ref()
3755        {
3756            let key_name = key.trim().to_ascii_lowercase();
3757            let value_trimmed = value.trim();
3758            let entry = sections
3759                .entry(section.clone())
3760                .or_default()
3761                .entry(key_name.clone())
3762                .or_default();
3763            if !value_trimmed.is_empty() {
3764                entry.push(value_trimmed.to_string());
3765            }
3766            current_key = Some(key_name);
3767        }
3768    }
3769
3770    sections
3771}
3772
3773fn get_ini_value(sections: &IniSections, section: &str, key: &str) -> Option<String> {
3774    sections
3775        .get(&section.to_ascii_lowercase())
3776        .and_then(|values| values.get(&key.to_ascii_lowercase()))
3777        .and_then(|entries| entries.first())
3778        .map(|value| value.trim().to_string())
3779}
3780
3781fn get_ini_values(sections: &IniSections, section: &str, key: &str) -> Vec<String> {
3782    sections
3783        .get(&section.to_ascii_lowercase())
3784        .and_then(|values| values.get(&key.to_ascii_lowercase()))
3785        .cloned()
3786        .unwrap_or_default()
3787}
3788
3789fn extract_setup_cfg_dependencies(sections: &IniSections) -> Vec<Dependency> {
3790    let mut dependencies = Vec::new();
3791
3792    for (sub_section, scope) in [
3793        ("install_requires", "install"),
3794        ("tests_require", "test"),
3795        ("setup_requires", "setup"),
3796    ] {
3797        let reqs = get_ini_values(sections, "options", sub_section);
3798        dependencies.extend(parse_setup_cfg_requirements(&reqs, scope, false));
3799    }
3800
3801    if let Some(extras) = sections.get("options.extras_require") {
3802        let mut extra_items: Vec<_> = extras.iter().collect();
3803        extra_items.sort_by_key(|(name, _)| *name);
3804        for (extra_name, reqs) in extra_items {
3805            dependencies.extend(parse_setup_cfg_requirements(reqs, extra_name, true));
3806        }
3807    }
3808
3809    dependencies
3810}
3811
3812fn parse_setup_cfg_requirements(
3813    reqs: &[String],
3814    scope: &str,
3815    is_optional: bool,
3816) -> Vec<Dependency> {
3817    reqs.iter()
3818        .filter_map(|req| build_setup_cfg_dependency(req, scope, is_optional))
3819        .collect()
3820}
3821
3822fn build_setup_cfg_dependency(req: &str, scope: &str, is_optional: bool) -> Option<Dependency> {
3823    let trimmed = req.trim();
3824    if trimmed.is_empty() || trimmed.starts_with('#') {
3825        return None;
3826    }
3827
3828    let name = extract_setup_cfg_dependency_name(trimmed)?;
3829    let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
3830
3831    Some(Dependency {
3832        purl: Some(purl.to_string()),
3833        extracted_requirement: Some(normalize_setup_cfg_requirement(trimmed)),
3834        scope: Some(scope.to_string()),
3835        is_runtime: Some(true),
3836        is_optional: Some(is_optional),
3837        is_pinned: Some(false),
3838        is_direct: Some(true),
3839        resolved_package: None,
3840        extra_data: None,
3841    })
3842}
3843
/// Extracts the bare package name from a requirement string.
///
/// The name is the leading run of characters up to the first whitespace or
/// the first character that starts another part of a PEP 508 requirement:
/// a version operator (`<`, `>`, `=`, `!`, `~`), a parenthesised version
/// spec (`(`), extras (`[`), an environment marker (`;`) or a direct
/// reference (`@`).
///
/// Returns `None` when the input is blank or starts with one of those
/// delimiters, i.e. when no name is present.
fn extract_setup_cfg_dependency_name(req: &str) -> Option<String> {
    let trimmed = req.trim();

    // `(` and `@` can directly follow a name (`pkg(>=1.0)`, `pkg@https://...`)
    // and are never part of a valid distribution name.
    let end = trimmed
        .find(|c: char| {
            c.is_whitespace()
                || matches!(c, '<' | '>' | '=' | '!' | '~' | ';' | '[' | '(' | '@')
        })
        .unwrap_or(trimmed.len());

    // `trimmed` has no leading whitespace and `end` stops at the first
    // whitespace, so the slice needs no further trimming.
    let name = &trimmed[..end];
    (!name.is_empty()).then(|| name.to_string())
}
3860
/// Returns `req` with every whitespace character removed.
fn normalize_setup_cfg_requirement(req: &str) -> String {
    // Joining the whitespace-separated pieces drops all whitespace in between.
    req.split_whitespace().collect()
}
3864
/// Extracts the string literal assigned to `key` in `setup.py`-like text.
///
/// Searches for `key`, an `=` with at most one space on either side, and an
/// opening quote. Double-quoted forms are tried before single-quoted ones,
/// and within each quote style the spacing variants `=`, ` =`, `= `, ` = `
/// are tried in that order. The value runs up to the next occurrence of the
/// *same* quote character, so a value may contain the other quote character
/// (e.g. `"it's"`).
///
/// Returns `None` when no pattern matches or no matching closing quote exists.
/// Escape sequences and triple-quoted strings are not interpreted.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    const QUOTES: [char; 2] = ['"', '\''];
    const ASSIGNMENTS: [&str; 4] = ["=", " =", "= ", " = "];

    for quote in QUOTES {
        for assignment in ASSIGNMENTS {
            let pattern = format!("{}{}{}", key, assignment, quote);
            let Some(start_idx) = content.find(&pattern) else {
                continue;
            };

            let remaining = &content[start_idx + pattern.len()..];
            // Close on the quote that opened the literal, not on whichever
            // quote character happens to appear first.
            if let Some(end_idx) = remaining.find(quote) {
                return Some(remaining[..end_idx].to_string());
            }
        }
    }

    None
}
3890
3891fn extract_setup_py_dependencies(content: &str) -> Vec<Dependency> {
3892    let mut dependencies = Vec::new();
3893
3894    if let Some(tests_deps) = extract_tests_require(content) {
3895        dependencies.extend(tests_deps);
3896    }
3897
3898    if let Some(extras_deps) = extract_extras_require(content) {
3899        dependencies.extend(extras_deps);
3900    }
3901
3902    dependencies
3903}
3904
3905fn extract_tests_require(content: &str) -> Option<Vec<Dependency>> {
3906    let pattern = r"tests_require\s*=\s*\[([^\]]+)\]";
3907    let re = Regex::new(pattern).ok()?;
3908    let captures = re.captures(content)?;
3909    let deps_str = captures.get(1)?.as_str();
3910
3911    let deps = parse_setup_py_dep_list(deps_str, "test", true);
3912    if deps.is_empty() { None } else { Some(deps) }
3913}
3914
3915fn extract_extras_require(content: &str) -> Option<Vec<Dependency>> {
3916    let pattern = r"extras_require\s*=\s*\{([^}]+)\}";
3917    let re = Regex::new(pattern).ok()?;
3918    let captures = re.captures(content)?;
3919    let dict_content = captures.get(1)?.as_str();
3920
3921    let mut all_deps = Vec::new();
3922
3923    let entry_pattern = r#"['"]([^'"]+)['"]\s*:\s*\[([^\]]+)\]"#;
3924    let entry_re = Regex::new(entry_pattern).ok()?;
3925
3926    for entry_cap in entry_re.captures_iter(dict_content) {
3927        if let (Some(extra_name), Some(deps_str)) = (entry_cap.get(1), entry_cap.get(2)) {
3928            let deps = parse_setup_py_dep_list(deps_str.as_str(), extra_name.as_str(), true);
3929            all_deps.extend(deps);
3930        }
3931    }
3932
3933    if all_deps.is_empty() {
3934        None
3935    } else {
3936        Some(all_deps)
3937    }
3938}
3939
3940fn parse_setup_py_dep_list(deps_str: &str, scope: &str, is_optional: bool) -> Vec<Dependency> {
3941    let dep_pattern = r#"['"]([^'"]+)['"]"#;
3942    let re = match Regex::new(dep_pattern) {
3943        Ok(r) => r,
3944        Err(_) => return Vec::new(),
3945    };
3946
3947    re.captures_iter(deps_str)
3948        .filter_map(|cap| {
3949            let dep_str = cap.get(1)?.as_str().trim();
3950            if dep_str.is_empty() {
3951                return None;
3952            }
3953
3954            let name = extract_setup_cfg_dependency_name(dep_str)?;
3955            let purl = PackageUrl::new(PythonParser::PACKAGE_TYPE.as_str(), &name).ok()?;
3956
3957            Some(Dependency {
3958                purl: Some(purl.to_string()),
3959                extracted_requirement: Some(dep_str.to_string()),
3960                scope: Some(scope.to_string()),
3961                is_runtime: Some(true),
3962                is_optional: Some(is_optional),
3963                is_pinned: Some(false),
3964                is_direct: Some(true),
3965                resolved_package: None,
3966                extra_data: None,
3967            })
3968        })
3969        .collect()
3970}
3971
3972/// Reads and parses a TOML file
3973pub(crate) fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
3974    let content = read_file_to_string(path).map_err(|e| e.to_string())?;
3975    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
3976}
3977
3978/// Calculates file size and SHA256 checksum for integrity verification in SBOMs.
3979///
3980/// Used for .whl and .egg archives to populate `size` and `sha256` fields in PackageData.
3981/// Essential for SBOM compliance and package integrity verification.
3982///
3983/// # Returns
3984///
3985/// - `(Some(size), Some(hash))` on success
3986/// - `(None, None)` if file cannot be opened
3987/// - `(Some(size), None)` if hash calculation fails during read
3988fn calculate_file_checksums(path: &Path) -> (Option<u64>, Option<String>) {
3989    let mut file = match File::open(path) {
3990        Ok(f) => f,
3991        Err(_) => return (None, None),
3992    };
3993
3994    let metadata = match file.metadata() {
3995        Ok(m) => m,
3996        Err(_) => return (None, None),
3997    };
3998    let size = metadata.len();
3999
4000    let mut hasher = Sha256::new();
4001    let mut buffer = vec![0; 8192];
4002
4003    loop {
4004        match file.read(&mut buffer) {
4005            Ok(0) => break,
4006            Ok(n) => hasher.update(&buffer[..n]),
4007            Err(_) => return (Some(size), None),
4008        }
4009    }
4010
4011    let hash = format!("{:x}", hasher.finalize());
4012    (Some(size), Some(hash))
4013}
4014
4015fn default_package_data() -> PackageData {
4016    PackageData::default()
4017}
4018
// Registers the Python parser via the crate-level `register_parser!` macro.
// Arguments, in order: a human-readable description, the path glob patterns,
// the string "pypi", the string "Python", and an optional documentation URL.
// NOTE(review): the macro definition is not visible here; presumably the
// third and fourth arguments are the package type and primary language —
// confirm against the macro.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, setup.cfg, pypi.json, PKG-INFO, METADATA, pip cache origin.json, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/METADATA",
        "**/origin.json",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);