Skip to main content

provenant/parsers/python/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for Python package manifests and metadata files.
5//!
6//! Comprehensive parser supporting multiple Python packaging formats including
7//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
8//!
//! # Supported Formats
//! - pyproject.toml (PEP 621)
//! - setup.py, plus `*_setup.py` / `*-setup.py` variants (AST parsing, no code execution)
//! - setup.cfg (INI format)
//! - PKG-INFO / `.dist-info/METADATA` (RFC 822 format)
//! - pypi.json
//! - pip-inspect.deplock
//! - pip cache origin.json
//! - sdist archives
//! - .whl archives (wheel format)
//! - .egg archives (legacy egg format)
//! - requirements.txt
17//!
18//! # Key Features
19//! - Archive safety checks (size limits, compression ratio validation)
20//! - AST-based setup.py parsing (no code execution)
21//! - RFC 822 metadata parsing for wheels/eggs
22//! - Dependency extraction with PEP 508 markers
23//! - Party information (authors, maintainers)
24//!
25//! # Security Features
26//! - Archive size limit: 100MB
27//! - Per-file size limit: 50MB
28//! - Compression ratio limit: 100:1
29//! - Total extracted size tracking
30//! - No code execution from setup.py or .egg files
31//!
32//! # Implementation Notes
33//! - Uses multiple parsers for different formats
34//! - Direct dependencies: all manifest dependencies are direct
35//! - Graceful fallback on parse errors with warning logs
36
37mod archive;
38mod pypi_json;
39mod pyproject;
40mod rfc822_meta;
41mod setup_cfg;
42mod setup_py;
43mod utils;
44
45#[cfg(test)]
46mod scan_test;
47#[cfg(test)]
48mod test;
49
50use super::PackageParser;
51use crate::models::{DatasourceId, PackageData, PackageType};
52use std::path::Path;
53
54pub(crate) use self::utils::build_pypi_urls;
55#[cfg(test)]
56pub(crate) use self::utils::extract_requires_dist_dependencies;
57pub(crate) use self::utils::read_toml_file;
58
/// The kinds of Python packaging files this module knows how to dispatch.
///
/// Values are produced by `classify_python_file`; each variant selects one
/// extraction routine in `PythonParser::extract_packages`.
enum PythonFileKind {
    /// A file named exactly `pyproject.toml`.
    PyprojectToml,
    /// A file named exactly `setup.cfg`.
    SetupCfg,
    /// `setup.py`, or a file whose name ends in `_setup.py` or `-setup.py`.
    SetupPy,
    /// A file named exactly `PKG-INFO`.
    PkgInfo,
    /// A file named `METADATA` whose parent directory name ends in `.dist-info`.
    WheelMetadata,
    /// A path accepted by `archive::is_pip_cache_origin_json`.
    PipOriginJson,
    /// A file named exactly `pypi.json`.
    PypiJson,
    /// A file named exactly `pip-inspect.deplock`.
    PipInspectDeplock,
    /// A path accepted by `archive::is_python_sdist_archive_path`.
    SdistArchive,
    /// A `.whl` file (extension compared case-insensitively) that also passes
    /// `archive::is_valid_wheel_archive_path`.
    WheelArchive,
    /// A `.egg` file (extension compared case-insensitively).
    EggArchive,
}
72
73fn classify_python_file(path: &Path) -> Option<PythonFileKind> {
74    let filename = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
75    Some(match filename {
76        "pyproject.toml" => PythonFileKind::PyprojectToml,
77        "setup.cfg" => PythonFileKind::SetupCfg,
78        _ if is_setup_py_like_path(path) => PythonFileKind::SetupPy,
79        "PKG-INFO" => PythonFileKind::PkgInfo,
80        "METADATA" if is_installed_wheel_metadata_path(path) => PythonFileKind::WheelMetadata,
81        "pypi.json" => PythonFileKind::PypiJson,
82        "pip-inspect.deplock" => PythonFileKind::PipInspectDeplock,
83        _ => {
84            if archive::is_pip_cache_origin_json(path) {
85                PythonFileKind::PipOriginJson
86            } else if archive::is_python_sdist_archive_path(path) {
87                PythonFileKind::SdistArchive
88            } else if path
89                .extension()
90                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
91                && archive::is_valid_wheel_archive_path(path)
92            {
93                PythonFileKind::WheelArchive
94            } else if path
95                .extension()
96                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
97            {
98                PythonFileKind::EggArchive
99            } else {
100                return None;
101            }
102        }
103    })
104}
105
/// Parser for Python packaging files, covering 11 recognised file kinds.
///
/// Handles `pyproject.toml`, `setup.py` (and suffixed `*_setup.py` /
/// `*-setup.py` variants), `setup.cfg`, `PKG-INFO`, `.dist-info/METADATA`,
/// `pypi.json`, `pip-inspect.deplock`, pip cache `origin.json`, sdist
/// archives, and `.whl` / `.egg` archives.
///
/// # Security
///
/// `setup.py` files are inspected through AST analysis and are never
/// executed, so scanning a package cannot run arbitrary code from it.
/// See `extract_from_setup_py_ast` for details.
pub struct PythonParser;
116
117impl PackageParser for PythonParser {
118    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
119
120    fn extract_packages(path: &Path) -> Vec<PackageData> {
121        match classify_python_file(path) {
122            Some(PythonFileKind::PyprojectToml) => pyproject::extract(path),
123            Some(PythonFileKind::SetupCfg) => setup_cfg::extract(path),
124            Some(PythonFileKind::SetupPy) => setup_py::extract(path),
125            Some(PythonFileKind::PkgInfo) => rfc822_meta::extract_from_rfc822_metadata(
126                path,
127                utils::detect_pkg_info_datasource_id(path),
128            ),
129            Some(PythonFileKind::WheelMetadata) => {
130                rfc822_meta::extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
131            }
132            Some(PythonFileKind::PipOriginJson) => archive::extract_from_pip_origin_json(path),
133            Some(PythonFileKind::PypiJson) => pypi_json::extract_from_pypi_json(path),
134            Some(PythonFileKind::PipInspectDeplock) => pypi_json::extract_from_pip_inspect(path),
135            Some(PythonFileKind::SdistArchive) => archive::extract_from_sdist_archive(path),
136            Some(PythonFileKind::WheelArchive) => archive::extract_from_wheel_archive(path),
137            Some(PythonFileKind::EggArchive) => archive::extract_from_egg_archive(path),
138            None => utils::default_package_data(path),
139        }
140    }
141
142    fn is_match(path: &Path) -> bool {
143        classify_python_file(path).is_some()
144    }
145}
146
/// Returns `true` when the file name of `path` is `setup.py` or ends in
/// `_setup.py` / `-setup.py`.
///
/// Paths without a file name, or whose file name is not valid UTF-8, yield
/// `false`.
fn is_setup_py_like_path(path: &Path) -> bool {
    let Some(file_name) = path.file_name().and_then(|os_name| os_name.to_str()) else {
        return false;
    };

    // Whatever precedes the trailing "setup.py" must be empty (the plain
    // name) or end with one of the two accepted separators.
    match file_name.strip_suffix("setup.py") {
        Some(prefix) => prefix.is_empty() || prefix.ends_with('_') || prefix.ends_with('-'),
        None => false,
    }
}
154
155pub(super) fn is_installed_wheel_metadata_path(path: &Path) -> bool {
156    path.file_name().and_then(|name| name.to_str()) == Some("METADATA")
157        && path
158            .parent()
159            .and_then(|parent| parent.file_name())
160            .and_then(|name| name.to_str())
161            .is_some_and(|name| name.ends_with(".dist-info"))
162}
163
164crate::register_parser!(
165    "Python package manifests (pyproject.toml, setup.py, suffixed setup.py variants, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
166    &[
167        "**/pyproject.toml",
168        "**/setup.py",
169        "**/*_setup.py",
170        "**/*-setup.py",
171        "**/setup.cfg",
172        "**/pypi.json",
173        "**/PKG-INFO",
174        "**/*.dist-info/METADATA",
175        "**/origin.json",
176        "**/*.tar.gz",
177        "**/*.tgz",
178        "**/*.tar.bz2",
179        "**/*.tar.xz",
180        "**/*.zip",
181        "**/*.whl",
182        "**/*.egg"
183    ],
184    "pypi",
185    "Python",
186    Some("https://packaging.python.org/"),
187);