Skip to main content

scancode_rust/parsers/
python.rs

1use crate::models::{Dependency, LicenseDetection, Match, PackageData, Party};
2use log::warn;
3use packageurl::PackageUrl;
4use std::fs::File;
5use std::io::Read;
6use std::path::Path;
7use toml::Value as TomlValue;
8use toml::map::Map as TomlMap;
9
10use super::PackageParser;
11
// Field constants for pyproject.toml
/// Top-level key whose table, when present, is used as the project metadata table.
const FIELD_PROJECT: &str = "project";
/// Key holding the package name; its presence at the top level also triggers the
/// fallback that treats the whole document as the project table.
const FIELD_NAME: &str = "name";
/// Key holding the package version string.
const FIELD_VERSION: &str = "version";
/// Key holding the license, read either as a plain string or as a table.
const FIELD_LICENSE: &str = "license";
/// Key holding the array of author entries.
const FIELD_AUTHORS: &str = "authors";
/// Key holding the array of maintainer entries.
const FIELD_MAINTAINERS: &str = "maintainers";
/// Key holding the table of project URLs.
const FIELD_URLS: &str = "urls";
/// Key of the homepage URL, looked up in the URLs table and then in the project table.
const FIELD_HOMEPAGE: &str = "homepage";
/// Key of the repository URL, looked up in the URLs table and then in the project table.
const FIELD_REPOSITORY: &str = "repository";
/// Key holding the required dependencies, read as either an array or a table.
const FIELD_DEPENDENCIES: &str = "dependencies";
/// Key holding the table of optional dependency groups.
const FIELD_OPTIONAL_DEPENDENCIES: &str = "optional-dependencies";
24
25pub struct PythonParser;
26
27impl PackageParser for PythonParser {
28    const PACKAGE_TYPE: &'static str = "pypi";
29
30    fn extract_package_data(path: &Path) -> PackageData {
31        if path.file_name().unwrap_or_default() == "pyproject.toml" {
32            extract_from_pyproject_toml(path)
33        } else if path.file_name().unwrap_or_default() == "setup.py" {
34            extract_from_setup_py(path)
35        } else {
36            default_package_data()
37        }
38    }
39
40    fn is_match(path: &Path) -> bool {
41        if let Some(filename) = path.file_name() {
42            filename == "pyproject.toml" || filename == "setup.py"
43        } else {
44            false
45        }
46    }
47}
48
49fn extract_from_pyproject_toml(path: &Path) -> PackageData {
50    let toml_content = match read_toml_file(path) {
51        Ok(content) => content,
52        Err(e) => {
53            warn!(
54                "Failed to read or parse pyproject.toml at {:?}: {}",
55                path, e
56            );
57            return default_package_data();
58        }
59    };
60
61    // Handle both PEP 621 (project table) and poetry formats
62    let project_table =
63        if let Some(project) = toml_content.get(FIELD_PROJECT).and_then(|v| v.as_table()) {
64            // Standard PEP 621 format with [project] table
65            project.clone()
66        } else if toml_content.get(FIELD_NAME).is_some() {
67            // Poetry or other format with top-level fields
68            match toml_content.as_table() {
69                Some(table) => table.clone(),
70                None => {
71                    warn!("Failed to convert TOML content to table in {:?}", path);
72                    return default_package_data();
73                }
74            }
75        } else {
76            warn!("No project data found in pyproject.toml at {:?}", path);
77            return default_package_data();
78        };
79
80    let name = project_table
81        .get(FIELD_NAME)
82        .and_then(|v| v.as_str())
83        .map(String::from);
84
85    let version = project_table
86        .get(FIELD_VERSION)
87        .and_then(|v| v.as_str())
88        .map(String::from);
89
90    let license_detections = extract_license_info(&project_table);
91
92    // URLs can be in different formats depending on the tool (poetry, flit, etc.)
93    let (homepage_url, repository_url) = extract_urls(&project_table);
94
95    let (dependencies, optional_dependencies) = extract_dependencies(&project_table);
96
97    // Create package URL
98    let purl = name.as_ref().map(|n| {
99        let mut package_url =
100            PackageUrl::new(PythonParser::PACKAGE_TYPE, n).expect("Failed to create PackageUrl");
101
102        if let Some(v) = &version {
103            package_url.with_version(v).expect("Failed to set version");
104        }
105
106        package_url.to_string()
107    });
108
109    PackageData {
110        package_type: Some(PythonParser::PACKAGE_TYPE.to_string()),
111        namespace: None, // Python doesn't typically use namespaces like npm
112        name,
113        version,
114        homepage_url,
115        download_url: repository_url,
116        copyright: None,
117        license_detections,
118        dependencies: [dependencies, optional_dependencies].concat(),
119        parties: extract_parties(&project_table),
120        purl,
121    }
122}
123
124fn extract_license_info(project: &TomlMap<String, TomlValue>) -> Vec<LicenseDetection> {
125    let mut detections = Vec::new();
126
127    // Different projects might specify license in various ways
128    if let Some(license_value) = project.get(FIELD_LICENSE) {
129        match license_value {
130            TomlValue::String(license_str) => {
131                detections.push(create_license_detection(license_str));
132            }
133            TomlValue::Table(license_table) => {
134                if let Some(text) = license_table.get("text").and_then(|v| v.as_str()) {
135                    detections.push(create_license_detection(text));
136                }
137                if let Some(expr) = license_table.get("expression").and_then(|v| v.as_str()) {
138                    detections.push(create_license_detection(expr));
139                }
140            }
141            _ => {}
142        }
143    }
144
145    detections
146}
147
148fn create_license_detection(license_str: &str) -> LicenseDetection {
149    LicenseDetection {
150        license_expression: license_str.to_string(),
151        matches: vec![Match {
152            score: 100.0,
153            start_line: 0, // We don't track exact line numbers with the toml parser
154            end_line: 0,
155            license_expression: license_str.to_string(),
156            rule_identifier: None,
157            matched_text: None,
158        }],
159    }
160}
161
162fn extract_urls(project: &TomlMap<String, TomlValue>) -> (Option<String>, Option<String>) {
163    let mut homepage_url = None;
164    let mut repository_url = None;
165
166    // Check for URLs table
167    if let Some(urls) = project.get(FIELD_URLS).and_then(|v| v.as_table()) {
168        homepage_url = urls
169            .get(FIELD_HOMEPAGE)
170            .and_then(|v| v.as_str())
171            .map(String::from);
172        repository_url = urls
173            .get(FIELD_REPOSITORY)
174            .and_then(|v| v.as_str())
175            .map(String::from);
176    }
177
178    // If not found in URLs table, check for top-level keys
179    if homepage_url.is_none() {
180        homepage_url = project
181            .get(FIELD_HOMEPAGE)
182            .and_then(|v| v.as_str())
183            .map(String::from);
184    }
185
186    if repository_url.is_none() {
187        repository_url = project
188            .get(FIELD_REPOSITORY)
189            .and_then(|v| v.as_str())
190            .map(String::from);
191    }
192
193    (homepage_url, repository_url)
194}
195
196fn extract_parties(project: &TomlMap<String, TomlValue>) -> Vec<Party> {
197    let mut parties = Vec::new();
198
199    // Extract authors
200    if let Some(authors) = project.get(FIELD_AUTHORS).and_then(|v| v.as_array()) {
201        for author in authors {
202            if let Some(author_str) = author.as_str()
203                && let Some(email) = extract_email_from_author_string(author_str)
204            {
205                parties.push(Party { email })
206            }
207        }
208    }
209
210    // Extract maintainers
211    if let Some(maintainers) = project.get(FIELD_MAINTAINERS).and_then(|v| v.as_array()) {
212        for maintainer in maintainers {
213            if let Some(maintainer_str) = maintainer.as_str()
214                && let Some(email) = extract_email_from_author_string(maintainer_str)
215            {
216                parties.push(Party { email })
217            }
218        }
219    }
220
221    parties
222}
223
/// Extracts the e-mail address from a string like `"Name <email@example.com>"`.
///
/// The address is the text between the first `<` and the first `>` that
/// follows it, with surrounding whitespace removed. Returns `None` when there
/// is no such bracket pair or when the address is empty.
fn extract_email_from_author_string(author_str: &str) -> Option<String> {
    let open = author_str.find('<')?;
    // Search for the closing bracket only after the opening one, so a stray
    // '>' earlier in the name does not hide the address.
    let after_open = &author_str[open + 1..];
    let close = after_open.find('>')?;

    let email = after_open[..close].trim();
    if email.is_empty() {
        None
    } else {
        Some(email.to_string())
    }
}
235
236fn extract_dependencies(
237    project: &TomlMap<String, TomlValue>,
238) -> (Vec<Dependency>, Vec<Dependency>) {
239    let mut dependencies = Vec::new();
240    let mut optional_dependencies = Vec::new();
241
242    // Handle dependencies - can be array or table format
243    if let Some(deps_value) = project.get(FIELD_DEPENDENCIES) {
244        match deps_value {
245            TomlValue::Array(arr) => {
246                dependencies = parse_dependency_array(arr, false);
247            }
248            TomlValue::Table(table) => {
249                dependencies = parse_dependency_table(table, false);
250            }
251            _ => {}
252        }
253    }
254
255    // Handle optional dependencies
256    if let Some(opt_deps_table) = project
257        .get(FIELD_OPTIONAL_DEPENDENCIES)
258        .and_then(|v| v.as_table())
259    {
260        for (_feature, deps) in opt_deps_table {
261            match deps {
262                TomlValue::Array(arr) => {
263                    optional_dependencies.extend(parse_dependency_array(arr, true));
264                }
265                TomlValue::Table(table) => {
266                    optional_dependencies.extend(parse_dependency_table(table, true));
267                }
268                _ => {}
269            }
270        }
271    }
272
273    (dependencies, optional_dependencies)
274}
275
276fn parse_dependency_table(
277    table: &TomlMap<String, TomlValue>,
278    is_optional: bool,
279) -> Vec<Dependency> {
280    table
281        .iter()
282        .filter_map(|(name, version)| {
283            // Create version string if present
284            let version_str = version.as_str().map(|s| s.to_string());
285            // Create package URL with name
286            let mut package_url = PackageUrl::new(PythonParser::PACKAGE_TYPE, name).ok()?;
287
288            // Add version if present
289            if let Some(v) = &version_str {
290                package_url.with_version(v).ok()?;
291            }
292
293            Some(Dependency {
294                purl: Some(package_url.to_string()),
295                scope: None,
296                is_optional,
297            })
298        })
299        .collect()
300}
301
302fn parse_dependency_array(array: &[TomlValue], is_optional: bool) -> Vec<Dependency> {
303    array
304        .iter()
305        .filter_map(|dep| {
306            let dep_str = dep.as_str()?;
307
308            // Basic parsing of PEP 508 dependency specifications
309            // For example "requests>=2.0.0", "django==3.2.1", "flask"
310            let mut parts = dep_str.split(['>', '=', '<', '~']);
311            let name = parts.next()?.trim().to_string();
312
313            // Extract version if present
314            let version = parts.next().map(|v| v.trim().to_string());
315
316            let mut package_url = match PackageUrl::new(PythonParser::PACKAGE_TYPE, &name) {
317                Ok(purl) => purl,
318                Err(_) => return None,
319            };
320
321            if let Some(ref v) = version {
322                package_url.with_version(v).ok()?;
323            }
324
325            Some(Dependency {
326                purl: Some(package_url.to_string()),
327                scope: None,
328                is_optional,
329            })
330        })
331        .collect()
332}
333
334fn extract_from_setup_py(path: &Path) -> PackageData {
335    // For setup.py, we do a simple text-based extraction since parsing Python
336    // would be much more complex. This is a basic implementation that could
337    // be improved in the future.
338    let content = match read_file_to_string(path) {
339        Ok(content) => content,
340        Err(e) => {
341            warn!("Failed to read setup.py at {:?}: {}", path, e);
342            return default_package_data();
343        }
344    };
345
346    let name = extract_setup_value(&content, "name");
347    let version = extract_setup_value(&content, "version");
348    let license_expression = extract_setup_value(&content, "license");
349
350    // Create license detection if we found a license
351    let license_detections = license_expression.as_ref().map_or(Vec::new(), |license| {
352        vec![LicenseDetection {
353            license_expression: license.clone(),
354            matches: vec![Match {
355                score: 100.0,
356                start_line: 0, // We don't track exact line numbers
357                end_line: 0,
358                license_expression: license.clone(),
359                rule_identifier: None,
360                matched_text: None,
361            }],
362        }]
363    });
364
365    // Create package URL
366    let purl = name.as_ref().map(|n| {
367        let mut package_url =
368            PackageUrl::new(PythonParser::PACKAGE_TYPE, n).expect("Failed to create PackageUrl");
369
370        if let Some(v) = &version {
371            package_url.with_version(v).expect("Failed to set version");
372        }
373
374        package_url.to_string()
375    });
376
377    PackageData {
378        package_type: Some(PythonParser::PACKAGE_TYPE.to_string()),
379        namespace: None,
380        name,
381        version,
382        homepage_url: extract_setup_value(&content, "url"),
383        download_url: None,
384        copyright: None,
385        license_detections,
386        dependencies: Vec::new(), // For setup.py, parsing dependencies reliably is challenging
387        parties: Vec::new(),      // Same for authors without a proper parser
388        purl,
389    }
390}
391
/// Finds the first `key = "value"` / `key = 'value'` assignment in `content`.
///
/// This is a deliberately small text scanner, not a Python parser:
///
/// * `key` must not be the tail of a longer identifier, so `url` does not
///   match inside `download_url`;
/// * any amount of whitespace is allowed around `=`;
/// * the value ends at the next occurrence of the same quote character that
///   opened it, so `"it's"` is read in full;
/// * occurrences are tried in document order, and ones that are not followed
///   by `=` and a quoted string (e.g. `key == "x"`, `key=var`) are skipped.
///
/// Returns `None` when no such assignment exists or when `key` is empty.
fn extract_setup_value(content: &str, key: &str) -> Option<String> {
    if key.is_empty() {
        return None;
    }

    let is_identifier_char = |c: char| c.is_alphanumeric() || c == '_';

    for (key_start, _) in content.match_indices(key) {
        // Reject matches that are the suffix of a longer identifier.
        let inside_identifier = content[..key_start]
            .chars()
            .next_back()
            .map_or(false, is_identifier_char);
        if inside_identifier {
            continue;
        }

        let after_key = content[key_start + key.len()..].trim_start();
        let after_equals = match after_key.strip_prefix('=') {
            Some(rest) => rest.trim_start(),
            None => continue,
        };

        // The value must open with a quote; remember which one so that the
        // other quote kind may appear inside the value.
        let mut chars = after_equals.chars();
        let quote = match chars.next() {
            Some(c) if c == '"' || c == '\'' => c,
            _ => continue,
        };

        let value = chars.as_str();
        if let Some(end) = value.find(quote) {
            return Some(value[..end].to_string());
        }
    }

    None
}
420
421/// Reads and parses a TOML file
422fn read_toml_file(path: &Path) -> Result<TomlValue, String> {
423    let content = read_file_to_string(path)?;
424    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
425}
426
/// Reads the whole file at `path` into a `String`.
///
/// Returns a human-readable message when the file cannot be opened or when
/// reading it fails (for example because it is not valid UTF-8).
fn read_file_to_string(path: &Path) -> Result<String, String> {
    let mut buffer = String::new();

    File::open(path)
        .map_err(|err| format!("Failed to open file: {}", err))?
        .read_to_string(&mut buffer)
        .map_err(|err| format!("Error reading file: {}", err))?;

    Ok(buffer)
}
434
435fn default_package_data() -> PackageData {
436    PackageData {
437        package_type: None,
438        namespace: None,
439        name: None,
440        version: None,
441        homepage_url: None,
442        download_url: None,
443        copyright: None,
444        license_detections: Vec::new(),
445        dependencies: Vec::new(),
446        parties: Vec::new(),
447        purl: None,
448    }
449}