Skip to main content

provenant/parsers/python/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for Python package manifests and metadata files.
5//!
6//! Comprehensive parser supporting multiple Python packaging formats including
7//! modern (pyproject.toml) and legacy (setup.py, setup.cfg) standards.
8//!
//! # Supported Formats
//! - pyproject.toml (PEP 621)
//! - setup.py (AST parsing, no code execution)
//! - setup.cfg (INI format)
//! - PKG-INFO / METADATA (RFC 822 format)
//! - pypi.json
//! - pip-inspect.deplock
//! - pip cache origin.json
//! - sdist archives
//! - .whl archives (wheel format)
//! - .egg archives (legacy egg format)
//! - requirements.txt
17//!
18//! # Key Features
19//! - Archive safety checks (size limits, compression ratio validation)
20//! - AST-based setup.py parsing (no code execution)
21//! - RFC 822 metadata parsing for wheels/eggs
22//! - Dependency extraction with PEP 508 markers
23//! - Party information (authors, maintainers)
24//!
25//! # Security Features
26//! - Archive size limit: 100MB
27//! - Per-file size limit: 50MB
28//! - Compression ratio limit: 100:1
29//! - Total extracted size tracking
30//! - No code execution from setup.py or .egg files
31//!
32//! # Implementation Notes
33//! - Uses multiple parsers for different formats
34//! - Direct dependencies: all manifest dependencies are direct
35//! - Graceful fallback on parse errors with warning logs
36
37mod archive;
38mod pypi_json;
39mod pyproject;
40mod rfc822_meta;
41mod setup_cfg;
42mod setup_py;
43mod utils;
44
45#[cfg(test)]
46mod scan_test;
47#[cfg(test)]
48mod test;
49
50use super::PackageParser;
51use super::metadata::ParserMetadata;
52use crate::models::{DatasourceId, PackageData, PackageType};
53use std::path::Path;
54
55pub(crate) use self::utils::build_pypi_urls;
56#[cfg(test)]
57pub(crate) use self::utils::extract_requires_dist_dependencies;
58pub(crate) use self::utils::read_toml_file;
59
/// Kind of Python packaging artifact recognized by `classify_python_file`.
///
/// Each variant selects exactly one extractor in `PythonParser::extract_packages`.
enum PythonFileKind {
    /// `pyproject.toml`, or a file name ending in `-pyproject.toml`,
    /// `_pyproject.toml`, or `.pyproject.toml`.
    PyprojectToml,
    /// `setup.cfg`, or a file name ending in `_setup.cfg`, `-setup.cfg`,
    /// or `.setup.cfg`.
    SetupCfg,
    /// `setup.py`, or a file name ending in `_setup.py` or `-setup.py`.
    SetupPy,
    /// A file named exactly `PKG-INFO`.
    PkgInfo,
    /// A file named `METADATA` whose parent directory name ends in `.dist-info`.
    WheelMetadata,
    /// A path accepted by `archive::is_pip_cache_origin_json`.
    PipOriginJson,
    /// A file named exactly `pypi.json`.
    PypiJson,
    /// A file named exactly `pip-inspect.deplock`.
    PipInspectDeplock,
    /// A path accepted by `archive::is_python_sdist_archive_path`.
    SdistArchive,
    /// A `.whl` file (extension compared ASCII case-insensitively) that also
    /// passes `archive::is_valid_wheel_archive_path`.
    WheelArchive,
    /// A `.egg` file (extension compared ASCII case-insensitively).
    EggArchive,
}
73
74fn classify_python_file(path: &Path) -> Option<PythonFileKind> {
75    let filename = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
76    Some(match filename {
77        _ if is_pyproject_toml_like_path(path) => PythonFileKind::PyprojectToml,
78        _ if is_setup_cfg_like_path(path) => PythonFileKind::SetupCfg,
79        _ if is_setup_py_like_path(path) => PythonFileKind::SetupPy,
80        "PKG-INFO" => PythonFileKind::PkgInfo,
81        "METADATA" if is_installed_wheel_metadata_path(path) => PythonFileKind::WheelMetadata,
82        "pypi.json" => PythonFileKind::PypiJson,
83        "pip-inspect.deplock" => PythonFileKind::PipInspectDeplock,
84        _ => {
85            if archive::is_pip_cache_origin_json(path) {
86                PythonFileKind::PipOriginJson
87            } else if archive::is_python_sdist_archive_path(path) {
88                PythonFileKind::SdistArchive
89            } else if path
90                .extension()
91                .is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
92                && archive::is_valid_wheel_archive_path(path)
93            {
94                PythonFileKind::WheelArchive
95            } else if path
96                .extension()
97                .is_some_and(|ext| ext.eq_ignore_ascii_case("egg"))
98            {
99                PythonFileKind::EggArchive
100            } else {
101                return None;
102            }
103        }
104    })
105}
106
/// Parser for Python packaging artifacts, dispatching over 11 recognized file kinds.
///
/// Handles pyproject.toml, setup.py, setup.cfg, PKG-INFO, installed-wheel
/// METADATA, pypi.json, pip cache origin.json, pip-inspect lockfiles, and
/// sdist/.whl/.egg archives.
///
/// # Security
///
/// setup.py files are inspected through AST analysis and are never executed, so
/// scanning a package cannot run arbitrary code. See `extract_from_setup_py_ast`
/// for details.
pub struct PythonParser;
117
118impl PackageParser for PythonParser {
119    const PACKAGE_TYPE: PackageType = PackageType::Pypi;
120
121    fn metadata() -> Vec<ParserMetadata> {
122        vec![ParserMetadata {
123            description: "Python package manifests (pyproject.toml, setup.py, suffixed setup.py variants, setup.cfg, pypi.json, PKG-INFO, .dist-info/METADATA, pip cache origin.json, sdist archives, .whl, .egg)",
124            file_patterns: &[
125                "**/pyproject.toml",
126                "**/setup.py",
127                "**/*_setup.py",
128                "**/*-setup.py",
129                "**/setup.cfg",
130                "**/pypi.json",
131                "**/PKG-INFO",
132                "**/*.dist-info/METADATA",
133                "**/origin.json",
134                "**/*.tar.gz",
135                "**/*.tgz",
136                "**/*.tar.bz2",
137                "**/*.tar.xz",
138                "**/*.zip",
139                "**/*.whl",
140                "**/*.egg",
141            ],
142            package_type: "pypi",
143            primary_language: "Python",
144            documentation_url: Some("https://packaging.python.org/"),
145        }]
146    }
147
148    fn extract_packages(path: &Path) -> Vec<PackageData> {
149        match classify_python_file(path) {
150            Some(PythonFileKind::PyprojectToml) => pyproject::extract(path),
151            Some(PythonFileKind::SetupCfg) => setup_cfg::extract(path),
152            Some(PythonFileKind::SetupPy) => setup_py::extract(path),
153            Some(PythonFileKind::PkgInfo) => rfc822_meta::extract_from_rfc822_metadata(
154                path,
155                utils::detect_pkg_info_datasource_id(path),
156            ),
157            Some(PythonFileKind::WheelMetadata) => {
158                rfc822_meta::extract_from_rfc822_metadata(path, DatasourceId::PypiWheelMetadata)
159            }
160            Some(PythonFileKind::PipOriginJson) => archive::extract_from_pip_origin_json(path),
161            Some(PythonFileKind::PypiJson) => pypi_json::extract_from_pypi_json(path),
162            Some(PythonFileKind::PipInspectDeplock) => pypi_json::extract_from_pip_inspect(path),
163            Some(PythonFileKind::SdistArchive) => archive::extract_from_sdist_archive(path),
164            Some(PythonFileKind::WheelArchive) => archive::extract_from_wheel_archive(path),
165            Some(PythonFileKind::EggArchive) => archive::extract_from_egg_archive(path),
166            None => utils::default_package_data(path),
167        }
168    }
169
170    fn is_match(path: &Path) -> bool {
171        classify_python_file(path).is_some()
172    }
173}
174
/// Returns `true` when the final path component is `pyproject.toml` itself or
/// ends with `-pyproject.toml`, `_pyproject.toml`, or `.pyproject.toml`.
///
/// Paths without a file name, or whose file name is not valid UTF-8, never match.
fn is_pyproject_toml_like_path(path: &Path) -> bool {
    const PREFIXED_ENDINGS: [&str; 3] = ["-pyproject.toml", "_pyproject.toml", ".pyproject.toml"];

    let Some(file_name) = path.file_name().and_then(|os_name| os_name.to_str()) else {
        return false;
    };

    file_name == "pyproject.toml"
        || PREFIXED_ENDINGS
            .iter()
            .any(|ending| file_name.ends_with(*ending))
}
185
/// Returns `true` when the final path component is `setup.py` itself or ends
/// with `_setup.py` or `-setup.py`.
///
/// Paths without a file name, or whose file name is not valid UTF-8, never match.
fn is_setup_py_like_path(path: &Path) -> bool {
    let file_name = match path.file_name().and_then(|os_name| os_name.to_str()) {
        Some(text) => text,
        None => return false,
    };

    if file_name == "setup.py" {
        return true;
    }
    file_name.ends_with("_setup.py") || file_name.ends_with("-setup.py")
}
193
/// Returns `true` when the final path component is `setup.cfg` itself or ends
/// with `_setup.cfg`, `-setup.cfg`, or `.setup.cfg`.
///
/// Paths without a file name, or whose file name is not valid UTF-8, never match.
fn is_setup_cfg_like_path(path: &Path) -> bool {
    const PREFIXED_ENDINGS: [&str; 3] = ["_setup.cfg", "-setup.cfg", ".setup.cfg"];

    match path.file_name().and_then(|os_name| os_name.to_str()) {
        Some("setup.cfg") => true,
        Some(file_name) => PREFIXED_ENDINGS
            .iter()
            .any(|ending| file_name.ends_with(*ending)),
        None => false,
    }
}
204
205pub(super) fn is_installed_wheel_metadata_path(path: &Path) -> bool {
206    path.file_name().and_then(|name| name.to_str()) == Some("METADATA")
207        && path
208            .parent()
209            .and_then(|parent| parent.file_name())
210            .and_then(|name| name.to_str())
211            .is_some_and(|name| name.ends_with(".dist-info"))
212}