Skip to main content

provenant/parsers/python/
mod.rs

//! Parser for Python package manifests and metadata files.
//!
//! Comprehensive parser supporting multiple Python packaging formats including
//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
//!
//! # Supported Formats
//! - pyproject.toml (PEP 621)
//! - setup.py (AST parsing, no code execution)
//! - setup.cfg (INI format)
//! - PKG-INFO / METADATA (RFC 822 format)
//! - .whl archives (wheel format)
//! - .egg archives (legacy egg format)
//! - requirements.txt
//!
//! # Key Features
//! - Archive safety checks (size limits, compression ratio validation)
//! - AST-based setup.py parsing (no code execution)
//! - RFC 822 metadata parsing for wheels/eggs
//! - Dependency extraction with PEP 508 markers
//! - Party information (authors, maintainers)
//!
//! # Security Features
//! - Archive size limit: 100MB
//! - Per-file size limit: 50MB
//! - Compression ratio limit: 100:1
//! - Total extracted size tracking
//! - No code execution from setup.py or .egg files
//!
//! # Implementation Notes
//! - Uses multiple parsers for different formats
//! - Direct dependencies: all manifest dependencies are direct
//! - Graceful fallback on parse errors with warning logs
33
34mod archive;
35mod pypi_json;
36mod pyproject;
37mod rfc822_meta;
38mod setup_cfg;
39mod setup_py;
40mod utils;
41
42#[cfg(test)]
43mod scan_test;
44#[cfg(test)]
45mod test;
46
47use super::PackageParser;
48use crate::models::{DatasourceId, PackageData, PackageType};
49use std::path::Path;
50
51pub(crate) use self::utils::build_pypi_urls;
52#[cfg(test)]
53pub(crate) use self::utils::extract_requires_dist_dependencies;
54pub(crate) use self::utils::read_toml_file;
55
/// Parser for PyPI packages that dispatches on a file's name or extension.
///
/// Recognizes 11 kinds of input: pyproject.toml, setup.cfg, setup.py (including
/// `*_setup.py` / `*-setup.py` variants), PKG-INFO, `.dist-info/METADATA`,
/// pip cache origin.json, pypi.json, pip-inspect.deplock lockfiles, sdist
/// archives, and .whl / .egg archives.
///
/// # Security
///
/// setup.py files are inspected through AST analysis and are never executed, so
/// scanning a package cannot run arbitrary code. See `extract_from_setup_py_ast`
/// for details.
pub struct PythonParser;
66
67impl PackageParser for PythonParser {
68    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
69
70    fn extract_packages(path: &Path) -> Vec<PackageData> {
71        vec![
72            if path.file_name().unwrap_or_default() == "pyproject.toml" {
73                pyproject::extract_from_pyproject_toml(path)
74            } else if path.file_name().unwrap_or_default() == "setup.cfg" {
75                setup_cfg::extract_from_setup_cfg(path)
76            } else if is_setup_py_like_path(path) {
77                return setup_py::extract_setup_py_packages(path);
78            } else if path.file_name().unwrap_or_default() == "PKG-INFO" {
79                rfc822_meta::extract_from_rfc822_metadata(
80                    path,
81                    utils::detect_pkg_info_datasource_id(path),
82                )
83            } else if is_installed_wheel_metadata_path(path) {
84                rfc822_meta::extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
85            } else if archive::is_pip_cache_origin_json(path) {
86                archive::extract_from_pip_origin_json(path)
87            } else if path.file_name().unwrap_or_default() == "pypi.json" {
88                pypi_json::extract_from_pypi_json(path)
89            } else if path.file_name().unwrap_or_default() == "pip-inspect.deplock" {
90                pypi_json::extract_from_pip_inspect(path)
91            } else if archive::is_python_sdist_archive_path(path) {
92                archive::extract_from_sdist_archive(path)
93            } else if path
94                .extension()
95                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
96            {
97                archive::extract_from_wheel_archive(path)
98            } else if path
99                .extension()
100                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
101            {
102                archive::extract_from_egg_archive(path)
103            } else {
104                utils::default_package_data(path)
105            },
106        ]
107    }
108
109    fn is_match(path: &Path) -> bool {
110        if let Some(filename) = path.file_name()
111            && (filename == "pyproject.toml"
112                || filename == "setup.cfg"
113                || is_setup_py_like_path(path)
114                || filename == "PKG-INFO"
115                || (filename == "METADATA" && is_installed_wheel_metadata_path(path))
116                || filename == "pypi.json"
117                || filename == "pip-inspect.deplock"
118                || archive::is_pip_cache_origin_json(path))
119        {
120            return true;
121        }
122
123        if let Some(extension) = path.extension() {
124            let ext = extension.to_string_lossy().to_lowercase();
125            if (ext == "whl" && archive::is_valid_wheel_archive_path(path))
126                || ext == "egg"
127                || archive::is_python_sdist_archive_path(path)
128            {
129                return true;
130            }
131        }
132
133        false
134    }
135}
136
/// Returns `true` when the final component of `path` is `setup.py` or ends with
/// `_setup.py` / `-setup.py`.
///
/// Paths with no file name, or whose file name is not valid UTF-8, never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    let Some(file_name) = path.file_name().and_then(|os_name| os_name.to_str()) else {
        return false;
    };

    // Whatever precedes the `setup.py` suffix must be empty (exact match) or end
    // with one of the accepted separators.
    file_name
        .strip_suffix("setup.py")
        .is_some_and(|prefix| prefix.is_empty() || prefix.ends_with(['_', '-']))
}
144
145pub(super) fn is_installed_wheel_metadata_path(path: &Path) -> bool {
146    path.file_name().and_then(|name| name.to_str()) == Some("METADATA")
147        && path
148            .parent()
149            .and_then(|parent| parent.file_name())
150            .and_then(|name| name.to_str())
151            .is_some_and(|name| name.ends_with(".dist-info"))
152}
153
154crate::register_parser!(
155    "Python package manifests (pyproject.toml, setup.py, suffixed setup.py variants, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
156    &[
157        "**/pyproject.toml",
158        "**/setup.py",
159        "**/*_setup.py",
160        "**/*-setup.py",
161        "**/setup.cfg",
162        "**/pypi.json",
163        "**/PKG-INFO",
164        "**/*.dist-info/METADATA",
165        "**/origin.json",
166        "**/*.tar.gz",
167        "**/*.tgz",
168        "**/*.tar.bz2",
169        "**/*.tar.xz",
170        "**/*.zip",
171        "**/*.whl",
172        "**/*.egg"
173    ],
174    "pypi",
175    "Python",
176    Some("https://packaging.python.org/"),
177);