Skip to main content

provenant/parsers/python/
mod.rs

1//! Parser for Python package manifests and metadata files.
2//!
3//! Comprehensive parser supporting multiple Python packaging formats including
4//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
5//!
//! # Supported Formats
//! - pyproject.toml (PEP 621)
//! - setup.py, including `*_setup.py` / `*-setup.py` variants (AST parsing, no code execution)
//! - setup.cfg (INI format)
//! - PKG-INFO / `*.dist-info/METADATA` (RFC 822 format)
//! - .whl archives (wheel format)
//! - .egg archives (legacy egg format)
//! - sdist archives
//! - pypi.json, pip-inspect.deplock, and pip cache origin.json
//! - requirements.txt
14//!
15//! # Key Features
16//! - Archive safety checks (size limits, compression ratio validation)
17//! - AST-based setup.py parsing (no code execution)
18//! - RFC 822 metadata parsing for wheels/eggs
19//! - Dependency extraction with PEP 508 markers
20//! - Party information (authors, maintainers)
21//!
22//! # Security Features
23//! - Archive size limit: 100MB
24//! - Per-file size limit: 50MB
25//! - Compression ratio limit: 100:1
26//! - Total extracted size tracking
27//! - No code execution from setup.py or .egg files
28//!
29//! # Implementation Notes
30//! - Uses multiple parsers for different formats
31//! - Direct dependencies: all manifest dependencies are direct
32//! - Graceful fallback on parse errors with warning logs
33
34mod archive;
35mod pypi_json;
36mod pyproject;
37mod rfc822_meta;
38mod setup_cfg;
39mod setup_py;
40mod utils;
41
42#[cfg(test)]
43mod scan_test;
44#[cfg(test)]
45mod test;
46
47use super::PackageParser;
48use crate::models::{DatasourceId, PackageData, PackageType};
49use std::path::Path;
50
51pub(crate) use self::utils::build_pypi_urls;
52#[cfg(test)]
53pub(crate) use self::utils::extract_requires_dist_dependencies;
54pub(crate) use self::utils::read_toml_file;
55
/// Kinds of Python packaging inputs recognized by `classify_python_file`.
///
/// Each variant selects exactly one extraction routine in
/// `PythonParser::extract_packages`.
enum PythonFileKind {
    /// A file named exactly `pyproject.toml`.
    PyprojectToml,
    /// A file named exactly `setup.cfg`.
    SetupCfg,
    /// `setup.py`, or a file whose name ends in `_setup.py` or `-setup.py`.
    SetupPy,
    /// A file named exactly `PKG-INFO`.
    PkgInfo,
    /// A `METADATA` file whose parent directory name ends in `.dist-info`.
    WheelMetadata,
    /// A path accepted by `archive::is_pip_cache_origin_json`.
    PipOriginJson,
    /// A file named exactly `pypi.json`.
    PypiJson,
    /// A file named exactly `pip-inspect.deplock`.
    PipInspectDeplock,
    /// A path accepted by `archive::is_python_sdist_archive_path`.
    SdistArchive,
    /// A file with a `whl` extension (ASCII case-insensitive) that is also
    /// accepted by `archive::is_valid_wheel_archive_path`.
    WheelArchive,
    /// A file with an `egg` extension (ASCII case-insensitive).
    EggArchive,
}
69
70fn classify_python_file(path: &Path) -> Option<PythonFileKind> {
71    let filename = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
72    Some(match filename {
73        "pyproject.toml" => PythonFileKind::PyprojectToml,
74        "setup.cfg" => PythonFileKind::SetupCfg,
75        _ if is_setup_py_like_path(path) => PythonFileKind::SetupPy,
76        "PKG-INFO" => PythonFileKind::PkgInfo,
77        "METADATA" if is_installed_wheel_metadata_path(path) => PythonFileKind::WheelMetadata,
78        "pypi.json" => PythonFileKind::PypiJson,
79        "pip-inspect.deplock" => PythonFileKind::PipInspectDeplock,
80        _ => {
81            if archive::is_pip_cache_origin_json(path) {
82                PythonFileKind::PipOriginJson
83            } else if archive::is_python_sdist_archive_path(path) {
84                PythonFileKind::SdistArchive
85            } else if path
86                .extension()
87                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
88                && archive::is_valid_wheel_archive_path(path)
89            {
90                PythonFileKind::WheelArchive
91            } else if path
92                .extension()
93                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
94            {
95                PythonFileKind::EggArchive
96            } else {
97                return None;
98            }
99        }
100    })
101}
102
/// Parser for Python package manifests, metadata files, and archives.
///
/// Handles the 11 input kinds enumerated by `PythonFileKind`: pyproject.toml,
/// setup.py (and suffixed variants), setup.cfg, PKG-INFO, `.dist-info/METADATA`,
/// pypi.json, pip-inspect lockfiles, pip cache origin.json, sdist archives, and
/// .whl/.egg archives.
///
/// # Security
///
/// setup.py files are inspected through AST analysis instead of being executed,
/// so scanning never runs arbitrary package code. See the `setup_py` module for
/// details.
pub struct PythonParser;
113
114impl PackageParser for PythonParser {
115    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
116
117    fn extract_packages(path: &Path) -> Vec<PackageData> {
118        match classify_python_file(path) {
119            Some(PythonFileKind::PyprojectToml) => pyproject::extract(path),
120            Some(PythonFileKind::SetupCfg) => setup_cfg::extract(path),
121            Some(PythonFileKind::SetupPy) => setup_py::extract(path),
122            Some(PythonFileKind::PkgInfo) => rfc822_meta::extract_from_rfc822_metadata(
123                path,
124                utils::detect_pkg_info_datasource_id(path),
125            ),
126            Some(PythonFileKind::WheelMetadata) => {
127                rfc822_meta::extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
128            }
129            Some(PythonFileKind::PipOriginJson) => archive::extract_from_pip_origin_json(path),
130            Some(PythonFileKind::PypiJson) => pypi_json::extract_from_pypi_json(path),
131            Some(PythonFileKind::PipInspectDeplock) => pypi_json::extract_from_pip_inspect(path),
132            Some(PythonFileKind::SdistArchive) => archive::extract_from_sdist_archive(path),
133            Some(PythonFileKind::WheelArchive) => archive::extract_from_wheel_archive(path),
134            Some(PythonFileKind::EggArchive) => archive::extract_from_egg_archive(path),
135            None => utils::default_package_data(path),
136        }
137    }
138
139    fn is_match(path: &Path) -> bool {
140        classify_python_file(path).is_some()
141    }
142}
143
/// Returns `true` when the file name of `path` is `setup.py` or ends with
/// `_setup.py` or `-setup.py`.
///
/// Paths without a UTF-8 file name never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    let Some(file_name) = path.file_name().and_then(|os_name| os_name.to_str()) else {
        return false;
    };

    // Whatever precedes the trailing "setup.py" must be empty or end with one
    // of the accepted separators.
    match file_name.strip_suffix("setup.py") {
        Some(prefix) => prefix.is_empty() || prefix.ends_with('_') || prefix.ends_with('-'),
        None => false,
    }
}
151
152pub(super) fn is_installed_wheel_metadata_path(path: &Path) -> bool {
153    path.file_name().and_then(|name| name.to_str()) == Some("METADATA")
154        && path
155            .parent()
156            .and_then(|parent| parent.file_name())
157            .and_then(|name| name.to_str())
158            .is_some_and(|name| name.ends_with(".dist-info"))
159}
160
// Registers descriptive metadata for the Python parser via `register_parser!`.
// NOTE(review): the arguments appear to be, in order, a human-readable
// description, the path glob patterns covered, the package type, the primary
// language, and a documentation URL — confirm against the macro definition.
// NOTE(review): `classify_python_file` also recognizes `pip-inspect.deplock`,
// which is absent from both the description and the patterns below — confirm
// whether it is registered elsewhere or should be added here.
// NOTE(review): the archive and `origin.json` globs are broader than what
// `classify_python_file` accepts, since it additionally consults the
// `archive::is_*` path checks before claiming such files.
crate::register_parser!(
    "Python package manifests (pyproject.toml, setup.py, suffixed setup.py variants, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
    &[
        "**/pyproject.toml",
        "**/setup.py",
        "**/*_setup.py",
        "**/*-setup.py",
        "**/setup.cfg",
        "**/pypi.json",
        "**/PKG-INFO",
        "**/*.dist-info/METADATA",
        "**/origin.json",
        "**/*.tar.gz",
        "**/*.tgz",
        "**/*.tar.bz2",
        "**/*.tar.xz",
        "**/*.zip",
        "**/*.whl",
        "**/*.egg"
    ],
    "pypi",
    "Python",
    Some("https://packaging.python.org/"),
);