scancode_rust/parsers/
python.rs

1use crate::models::{Dependency, LicenseDetection, Match, PackageData, Party};
2use log::warn;
3use packageurl::PackageUrl;
4use std::fs::File;
5use std::io::Read;
6use std::path::Path;
7use toml::Value as TomlValue;
8use toml::map::Map as TomlMap;
9
10use super::PackageParser;
11
// Keys looked up in pyproject.toml metadata tables.

/// Table holding PEP 621 project metadata.
const FIELD_PROJECT: &str = "project";
/// Package name key.
const FIELD_NAME: &str = "name";
/// Package version key.
const FIELD_VERSION: &str = "version";
/// License declaration key (string or table value).
const FIELD_LICENSE: &str = "license";
/// Array of package authors.
const FIELD_AUTHORS: &str = "authors";
/// Array of package maintainers.
const FIELD_MAINTAINERS: &str = "maintainers";
/// Table of project URLs.
const FIELD_URLS: &str = "urls";
/// Homepage URL key.
const FIELD_HOMEPAGE: &str = "homepage";
/// Source repository URL key.
const FIELD_REPOSITORY: &str = "repository";
/// Array of required dependency specifications.
const FIELD_DEPENDENCIES: &str = "dependencies";
/// Table of optional dependency groups.
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
24
25pub struct PythonParser;
26
27impl PackageParser for PythonParser {
28    const PACKAGE_TYPE: &'static str = "pypi";
29
30    fn extract_package_data(path: &Path) -> PackageData {
31        if path.file_name().unwrap_or_default() == "pyproject.toml" {
32            extract_from_pyproject_toml(path)
33        } else if path.file_name().unwrap_or_default() == "setup.py" {
34            extract_from_setup_py(path)
35        } else {
36            default_package_data()
37        }
38    }
39
40    fn is_match(path: &Path) -> bool {
41        if let Some(filename) = path.file_name() {
42            filename == "pyproject.toml" || filename == "setup.py"
43        } else {
44            false
45        }
46    }
47}
48
49fn extract_from_pyproject_toml(path: &Path) -> PackageData {
50    let toml_content = match read_toml_file(path) {
51        Ok(content) => content,
52        Err(e) => {
53            warn!("Failed to read or parse pyproject.toml at {:?}: {}", path, e);
54            return default_package_data();
55        }
56    };
57
58    // Handle both PEP 621 (project table) and poetry formats
59    let project_table = if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
60        // Standard PEP 621 format with [project] table
61        project.clone()
62    } else if toml_content.get(FIELD_NAME).is_some() {
63        // Poetry or other format with top-level fields
64        match toml_content.as_table() {
65            Some(table) => table.clone(),
66            None => {
67                warn!("Failed to convert TOML content to table in {:?}", path);
68                return default_package_data();
69            }
70        }
71    } else {
72        warn!("No project data found in pyproject.toml at {:?}", path);
73        return default_package_data();
74    };
75
76    let name = project_table
77        .get(FIELD_NAME)
78        .and_then(|v| v.as_str())
79        .map(String::from);
80
81    let version = project_table
82        .get(FIELD_VERSION)
83        .and_then(|v| v.as_str())
84        .map(String::from);
85
86    let license_detections = extract_license_info(&project_table);
87    
88    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
89    let (homepage_url, repository_url) = extract_urls(&project_table);
90
91    let (dependencies, optional_dependencies) = extract_dependencies(&project_table);
92
93    // Create package URL
94    let purl = name.as_ref().map(|n| {
95        let mut package_url =
96            PackageUrl::new(PythonParser::PACKAGE_TYPE, n).expect("Failed to create PackageUrl");
97
98        if let Some(v) = &version {
99            package_url.with_version(v);
100        }
101
102        package_url.to_string()
103    });
104
105    PackageData {
106        package_type: Some(PythonParser::PACKAGE_TYPE.to_string()),
107        namespace: None, // Python doesn't typically use namespaces like npm
108        name,
109        version,
110        homepage_url,
111        download_url: repository_url,
112        copyright: None,
113        license_detections,
114        dependencies: [dependencies, optional_dependencies].concat(),
115        parties: extract_parties(&project_table),
116        purl,
117    }
118}
119
120fn extract_license_info(project: &TomlMap<String, TomlValue>) -> Vec<LicenseDetection> {
121    let mut detections = Vec::new();
122
123    // Different projects might specify license in various ways
124    if let Some(license_value) = project.get(FIELD_LICENSE) {
125        match license_value {
126            TomlValue::String(license_str) => {
127                detections.push(create_license_detection(license_str));
128            }
129            TomlValue::Table(license_table) => {
130                if let Some(text) = license_table.get("text").and_then(|v| v.as_str()) {
131                    detections.push(create_license_detection(text));
132                }
133                if let Some(expr) = license_table.get("expression").and_then(|v| v.as_str()) {
134                    detections.push(create_license_detection(expr));
135                }
136            }
137            _ => {}
138        }
139    }
140
141    detections
142}
143
144fn create_license_detection(license_str: &str) -> LicenseDetection {
145    LicenseDetection {
146        license_expression: license_str.to_string(),
147        matches: vec![Match {
148            score: 100.0,
149            start_line: 0, // We don't track exact line numbers with the toml parser
150            end_line: 0,
151            license_expression: license_str.to_string(),
152            rule_identifier: None,
153            matched_text: None,
154        }],
155    }
156}
157
158fn extract_urls(
159    project: &TomlMap<String, TomlValue>,
160) -> (Option<String>, Option<String>) {
161    let mut homepage_url = None;
162    let mut repository_url = None;
163
164    // Check for URLs table
165    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
166        homepage_url = urls
167            .get(FIELD_HOMEPAGE)
168            .and_then(|v| v.as_str())
169            .map(String::from);
170        repository_url = urls
171            .get(FIELD_REPOSITORY)
172            .and_then(|v| v.as_str())
173            .map(String::from);
174    }
175
176    // If not found in URLs table, check for top-level keys
177    if homepage_url.is_none() {
178        homepage_url = project
179            .get(FIELD_HOMEPAGE)
180            .and_then(|v| v.as_str())
181            .map(String::from);
182    }
183
184    if repository_url.is_none() {
185        repository_url = project
186            .get(FIELD_REPOSITORY)
187            .and_then(|v| v.as_str())
188            .map(String::from);
189    }
190
191    (homepage_url, repository_url)
192}
193
194fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
195    let mut parties = Vec::new();
196
197    // Extract authors
198    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
199        for author in authors {
200            if let Some(author_str) = author.as_str() {
201                extract_email_from_author_string(author_str)
202                    .map(|email| parties.push(Party { email }));
203            }
204        }
205    }
206
207    // Extract maintainers
208    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
209        for maintainer in maintainers {
210            if let Some(maintainer_str) = maintainer.as_str() {
211                extract_email_from_author_string(maintainer_str)
212                    .map(|email| parties.push(Party { email }));
213            }
214        }
215    }
216
217    parties
218}
219
/// Extracts the e-mail address from a string of the form
/// `"Name <email@example.com>"`.
///
/// The address is the text between the first `<` and the first `>` that
/// follows it, with surrounding whitespace removed. Returns `None` when there
/// is no such bracket pair or when the brackets enclose nothing.
fn extract_email_from_author_string(author_str: &str) -> Option<String> {
    // Search for `>` only after `<`, so a stray `>` in the name part cannot
    // hide a valid address.
    let (_, after_open) = author_str.split_once('<')?;
    let (candidate, _) = after_open.split_once('>')?;

    let email = candidate.trim();
    if email.is_empty() {
        None
    } else {
        Some(email.to_string())
    }
}
232
233fn extract_dependencies(
234    project: &TomlMap<String, TomlValue>,
235) -> (Vec<Dependency>, Vec<Dependency>) {
236    let mut dependencies = Vec::new();
237    let mut optional_dependencies = Vec::new();
238
239    // Regular dependencies
240    if let Some(deps) = project.get(FIELD_DEPENDENCIES).and_then(|v| v.as_array()) {
241        dependencies = parse_dependency_array(deps, false);
242    }
243
244    // Optional dependencies (often grouped by feature)
245    if let Some(opt_deps_table) = project
246        .get(FIELD_OPTIONAL_DEPENDENCIES)
247        .and_then(|v| v.as_table())
248    {
249        for (_feature, deps) in opt_deps_table {
250            if let Some(deps_array) = deps.as_array() {
251                optional_dependencies.extend(parse_dependency_array(deps_array, true));
252            }
253        }
254    }
255
256    (dependencies, optional_dependencies)
257}
258
259fn parse_dependency_array(array: &[TomlValue], is_optional: bool) -> Vec<Dependency> {
260    array
261        .iter()
262        .filter_map(|dep| {
263            let dep_str = dep.as_str()?;
264
265            // Basic parsing of PEP 508 dependency specifications
266            // For example "requests>=2.0.0", "django==3.2.1", "flask"
267            let mut parts = dep_str.split(|c| c == '>' || c == '=' || c == '<' || c == '~');
268            let name = parts.next()?.trim().to_string();
269            
270            // Extract version if present
271            let version = parts.next().map(|v| v.trim().to_string());
272            
273            let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE, &name) {
274                Ok(purl) => purl,
275                Err(_) => return None,
276            };
277
278            if let Some(ref v) = version {
279                package_url.with_version(v);
280            }
281
282            Some(Dependency {
283                purl: Some(package_url.to_string()),
284                scope: None,
285                is_optional,
286            })
287        })
288        .collect()
289}
290
291fn extract_from_setup_py(path: &Path) -> PackageData {
292    // For setup.py, we do a simple text-based extraction since parsing Python
293    // would be much more complex. This is a basic implementation that could
294    // be improved in the future.
295    let content = match read_file_to_string(path) {
296        Ok(content) => content,
297        Err(e) => {
298            warn!("Failed to read setup.py at {:?}: {}", path, e);
299            return default_package_data();
300        }
301    };
302
303    let name = extract_setup_value(&content, "name");
304    let version = extract_setup_value(&content, "version");
305    let license_expression = extract_setup_value(&content, "license");
306    
307    // Create license detection if we found a license
308    let license_detections = license_expression.as_ref().map_or(Vec::new(), |license| {
309        vec![LicenseDetection {
310            license_expression: license.clone(),
311            matches: vec![Match {
312                score: 100.0,
313                start_line: 0, // We don't track exact line numbers
314                end_line: 0,
315                license_expression: license.clone(),
316                rule_identifier: None,
317                matched_text: None,
318            }],
319        }]
320    });
321
322    // Create package URL
323    let purl = name.as_ref().map(|n| {
324        let mut package_url =
325            PackageUrl::new(PythonParser::PACKAGE_TYPE, n).expect("Failed to create PackageUrl");
326
327        if let Some(v) = &version {
328            package_url.with_version(v);
329        }
330
331        package_url.to_string()
332    });
333
334    PackageData {
335        package_type: Some(PythonParser::PACKAGE_TYPE.to_string()),
336        namespace: None,
337        name,
338        version,
339        homepage_url: extract_setup_value(&content, "url"),
340        download_url: None,
341        copyright: None,
342        license_detections,
343        dependencies: Vec::new(), // For setup.py, parsing dependencies reliably is challenging
344        parties: Vec::new(),      // Same for authors without a proper parser
345        purl,
346    }
347}
348
/// Finds the first `key = "value"` or `key = 'value'` assignment in `content`
/// and returns the quoted value.
///
/// This is a textual scan, not a Python parser:
/// * occurrences are examined in source order and the first usable one wins;
/// * `key` must not be the tail of a longer identifier, so `url` does not
///   match inside `download_url`;
/// * any amount of whitespace is allowed around `=`;
/// * the value ends at the next quote of the same kind that opened it, so
///   `"it's"` yields `it's`.
///
/// Escaped quotes, string prefixes (`r"…"`, `f"…"`) and non-literal values
/// are not understood. Returns `None` when no assignment is found.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    content.match_indices(key).find_map(|(key_start, _)| {
        // Reject matches that are only the suffix of another identifier.
        let inside_identifier = content[..key_start]
            .chars()
            .next_back()
            .map_or(false, |c| c.is_alphanumeric() || c == '_');
        if inside_identifier {
            return None;
        }

        let after_key = content[key_start + key.len()..].trim_start();
        let after_equals = after_key.strip_prefix('=')?.trim_start();

        // The value must open with a quote; remember which one so the same
        // character is required to close it.
        let quote = after_equals
            .chars()
            .next()
            .filter(|c| matches!(*c, '"' | '\''))?;
        let value = &after_equals[quote.len_utf8()..];

        value
            .find(quote)
            .map(|value_end| value[..value_end].to_string())
    })
}
377
378/// Reads and parses a TOML file
379fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
380    let content = read_file_to_string(path)?;
381    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
382}
383
/// Reads the whole file at `path` into a `String`.
///
/// Returns a message prefixed with `Failed to open file: ` when the file
/// cannot be opened, or `Error reading file: ` when reading fails (for
/// example because the contents are not valid UTF-8).
fn read_file_to_string(path: &Path) -> Result<String, String> {
    let mut buffer = String::new();

    match File::open(path) {
        Err(open_err) => Err(format!("Failed to open file: {}", open_err)),
        Ok(mut handle) => match handle.read_to_string(&mut buffer) {
            Ok(_) => Ok(buffer),
            Err(read_err) => Err(format!("Error reading file: {}", read_err)),
        },
    }
}
391
392fn default_package_data() -> PackageData {
393    PackageData {
394        package_type: None,
395        namespace: None,
396        name: None,
397        version: None,
398        homepage_url: None,
399        download_url: None,
400        copyright: None,
401        license_detections: Vec::new(),
402        dependencies: Vec::new(),
403        parties: Vec::new(),
404        purl: None,
405    }
406}