Skip to main content

provenant/parsers/
cargo.rs

1//! Parser for Cargo.toml manifest files.
2//!
3//! Extracts package metadata, dependencies, and license information from
4//! Rust Cargo.toml files.
5//!
6//! # Supported Formats
7//! - Cargo.toml (manifest)
8//!
9//! # Key Features
10//! - Dependency extraction with feature flags and optional dependencies
11//! - `is_pinned` analysis (exact version vs range specifiers)
12//! - Package URL (purl) generation
13//! - Workspace inheritance detection (stores `"workspace"` markers in extra_data)
14//!
15//! # Implementation Notes
16//! - Uses toml crate for parsing
17//! - Version pinning: `"1.0.0"` is pinned, `"^1.0.0"` is not
18//! - Graceful error handling with `warn!()` logs
19//! - Direct dependencies: all in manifest are direct (no lockfile)
20
21use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
22use crate::parsers::utils::split_name_email;
23use log::warn;
24use packageurl::PackageUrl;
25use std::fs::File;
26use std::io::Read;
27use std::path::Path;
28use toml::Value;
29
30use super::PackageParser;
31use super::license_normalization::normalize_spdx_declared_license;
32
// Top-level manifest tables read by this parser.
const FIELD_PACKAGE: &str = "package";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";
const FIELD_BUILD_DEPENDENCIES: &str = "build-dependencies";

// Keys looked up inside the `[package]` table: identity.
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_AUTHORS: &str = "authors";

// Keys looked up inside the `[package]` table: licensing and referenced files.
const FIELD_LICENSE: &str = "license";
const FIELD_LICENSE_FILE: &str = "license-file";
const FIELD_README: &str = "readme";

// Keys looked up inside the `[package]` table: links and discovery.
const FIELD_REPOSITORY: &str = "repository";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_CATEGORIES: &str = "categories";

// Keys looked up inside the `[package]` table: toolchain and publishing.
const FIELD_RUST_VERSION: &str = "rust-version";
const FIELD_EDITION: &str = "edition";
const FIELD_PUBLISH: &str = "publish";
51
52/// Rust Cargo.toml manifest parser.
53///
54/// Extracts package metadata including dependencies (regular, dev, build),
55/// license information, and crate-specific fields.
56pub struct CargoParser;
57
58impl PackageParser for CargoParser {
59    const PACKAGE_TYPE: PackageType = PackageType::Cargo;
60
61    fn extract_packages(path: &Path) -> Vec<PackageData> {
62        let toml_content = match read_cargo_toml(path) {
63            Ok(content) => content,
64            Err(e) => {
65                warn!("Failed to read or parse Cargo.toml at {:?}: {}", path, e);
66                return vec![default_package_data()];
67            }
68        };
69
70        let package = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table());
71
72        let name = package
73            .and_then(|p| p.get(FIELD_NAME))
74            .and_then(|v| v.as_str())
75            .map(String::from);
76
77        let version = package
78            .and_then(|p| p.get(FIELD_VERSION))
79            .and_then(|v| v.as_str())
80            .map(String::from);
81
82        let raw_license = package
83            .and_then(|p| p.get(FIELD_LICENSE))
84            .and_then(|v| v.as_str())
85            .map(String::from);
86        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
87            normalize_spdx_declared_license(raw_license.as_deref());
88
89        let extracted_license_statement = raw_license.clone();
90
91        let dependencies = extract_dependencies(&toml_content, FIELD_DEPENDENCIES);
92        let dev_dependencies = extract_dependencies(&toml_content, FIELD_DEV_DEPENDENCIES);
93        let build_dependencies = extract_dependencies(&toml_content, FIELD_BUILD_DEPENDENCIES);
94
95        let purl = create_package_url(&name, &version);
96
97        let homepage_url = package
98            .and_then(|p| p.get(FIELD_HOMEPAGE))
99            .and_then(|v| v.as_str())
100            .map(String::from)
101            .or_else(|| {
102                name.as_ref()
103                    .map(|n| format!("https://crates.io/crates/{}", n))
104            });
105
106        let repository_url = package
107            .and_then(|p| p.get(FIELD_REPOSITORY))
108            .and_then(|v| v.as_str())
109            .map(String::from);
110        let download_url = None;
111
112        let api_data_url = generate_cargo_api_url(&name, &version);
113
114        let repository_homepage_url = name
115            .as_ref()
116            .map(|n| format!("https://crates.io/crates/{}", n));
117
118        let repository_download_url = match (&name, &version) {
119            (Some(n), Some(v)) => Some(format!(
120                "https://crates.io/api/v1/crates/{}/{}/download",
121                n, v
122            )),
123            _ => None,
124        };
125
126        let description = package
127            .and_then(|p| p.get(FIELD_DESCRIPTION))
128            .and_then(|v| v.as_str())
129            .map(|s| s.trim().to_string());
130
131        let keywords = extract_keywords_and_categories(&toml_content);
132
133        let extra_data = extract_extra_data(&toml_content);
134        let file_references = extract_file_references(&toml_content);
135
136        vec![PackageData {
137            package_type: Some(Self::PACKAGE_TYPE),
138            namespace: None,
139            name,
140            version,
141            qualifiers: None,
142            subpath: None,
143            primary_language: Some("Rust".to_string()),
144            description,
145            release_date: None,
146            parties: extract_parties(&toml_content),
147            keywords,
148            homepage_url,
149            download_url,
150            size: None,
151            sha1: None,
152            md5: None,
153            sha256: None,
154            sha512: None,
155            bug_tracking_url: None,
156            code_view_url: None,
157            vcs_url: repository_url,
158            copyright: None,
159            holder: None,
160            declared_license_expression,
161            declared_license_expression_spdx,
162            license_detections,
163            other_license_expression: None,
164            other_license_expression_spdx: None,
165            other_license_detections: Vec::new(),
166            extracted_license_statement,
167            notice_text: None,
168            source_packages: Vec::new(),
169            file_references,
170            is_private: false,
171            is_virtual: false,
172            extra_data,
173            dependencies: [dependencies, dev_dependencies, build_dependencies].concat(),
174            repository_homepage_url,
175            repository_download_url,
176            api_data_url,
177            datasource_id: Some(DatasourceId::CargoToml),
178            purl,
179        }]
180    }
181
182    fn is_match(path: &Path) -> bool {
183        path.file_name()
184            .and_then(|name| name.to_str())
185            .is_some_and(|name| name.eq_ignore_ascii_case("cargo.toml"))
186    }
187}
188
189/// Reads and parses a TOML file
190fn read_cargo_toml(path: &Path) -> Result<Value, String> {
191    let mut file = File::open(path).map_err(|e| format!("Failed to open file: {}", e))?;
192    let mut content = String::new();
193    file.read_to_string(&mut content)
194        .map_err(|e| format!("Error reading file: {}", e))?;
195
196    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
197}
198
/// Builds the crates.io API URL for a crate, or `None` when the name is unknown.
///
/// The version argument is accepted but not used: the URL is keyed on the crate
/// name only.
fn generate_cargo_api_url(name: &Option<String>, _version: &Option<String>) -> Option<String> {
    const REGISTRY: &str = "https://crates.io/api/v1/crates";

    let crate_name = name.as_deref()?;
    Some(format!("{REGISTRY}/{crate_name}"))
}
203
204fn create_package_url(name: &Option<String>, version: &Option<String>) -> Option<String> {
205    name.as_ref().and_then(|name| {
206        let mut package_url = match PackageUrl::new(CargoParser::PACKAGE_TYPE.as_str(), name) {
207            Ok(p) => p,
208            Err(e) => {
209                warn!(
210                    "Failed to create PackageUrl for cargo package '{}': {}",
211                    name, e
212                );
213                return None;
214            }
215        };
216
217        if let Some(v) = version
218            && let Err(e) = package_url.with_version(v)
219        {
220            warn!(
221                "Failed to set version '{}' for cargo package '{}': {}",
222                v, name, e
223            );
224            return None;
225        }
226
227        Some(package_url.to_string())
228    })
229}
230
231/// Extracts party information from the `authors` field
232fn extract_parties(toml_content: &Value) -> Vec<Party> {
233    let mut parties = Vec::new();
234
235    if let Some(package) = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table())
236        && let Some(authors) = package.get(FIELD_AUTHORS).and_then(|v| v.as_array())
237    {
238        for author in authors {
239            if let Some(author_str) = author.as_str() {
240                let (name, email) = split_name_email(author_str);
241                parties.push(Party {
242                    r#type: None,
243                    role: Some("author".to_string()),
244                    name,
245                    email,
246                    url: None,
247                    organization: None,
248                    organization_url: None,
249                    timezone: None,
250                });
251            }
252        }
253    }
254
255    parties
256}
257
/// Determines if a Cargo version specifier is pinned to an exact version.
///
/// A specifier counts as pinned when, after trimming whitespace, it contains no
/// range operator or `*` wildcard and its core is a full numeric
/// `major.minor.patch` triple. A pre-release (`-...`) or build-metadata (`+...`)
/// suffix after the triple is allowed.
///
/// Examples:
/// - Pinned: "1.0.0", "0.8.1", "1.0.0-alpha.1"
/// - NOT pinned: "0.8" (allows patch), "^1.0.0", "~1.0.0", ">=1.0.0", "*",
///   "1.2.x" (wildcard component), "a.b.c" (not a version)
///
/// NOTE(review): any specifier containing `=` is rejected, so an explicit
/// `=1.0.0` is reported as not pinned. That matches the previous behavior of
/// this function; confirm it is the intended convention.
fn is_cargo_version_pinned(version_str: &str) -> bool {
    const RANGE_MARKERS: [char; 6] = ['^', '~', '>', '<', '*', '='];

    let requirement = version_str.trim();

    // Empty specifiers and anything carrying an operator or `*` describe a range.
    if requirement.is_empty() || requirement.contains(&RANGE_MARKERS[..]) {
        return false;
    }

    // Look only at the part before any pre-release / build-metadata suffix, so
    // dots inside e.g. "+build.1.2" cannot make a partial version look complete.
    let core = requirement
        .split(['-', '+'])
        .next()
        .unwrap_or(requirement);

    // Counting dots alone would accept "1.2.x", "a.b.c" or "1..0"; require
    // exactly three non-empty, all-digit components instead.
    let components: Vec<&str> = core.split('.').collect();
    components.len() == 3
        && components
            .iter()
            .all(|part| !part.is_empty() && part.bytes().all(|b| b.is_ascii_digit()))
}
288
289fn extract_dependencies(toml_content: &Value, scope: &str) -> Vec<Dependency> {
290    use serde_json::json;
291
292    let mut dependencies = Vec::new();
293
294    // Determine is_runtime based on scope
295    let is_runtime = !scope.ends_with("dev-dependencies") && !scope.ends_with("build-dependencies");
296
297    if let Some(deps_table) = toml_content.get(scope).and_then(|v| v.as_table()) {
298        for (name, value) in deps_table {
299            let (extracted_requirement, is_optional, extra_data_map, is_pinned) = match value {
300                Value::String(version_str) => {
301                    // Simple string version: "1.0"
302                    let pinned = is_cargo_version_pinned(version_str);
303                    (
304                        Some(version_str.to_string()),
305                        false,
306                        std::collections::HashMap::new(),
307                        pinned,
308                    )
309                }
310                Value::Table(table) => {
311                    // Complex table format: { version = "1.0", optional = true, features = [...] }
312                    let version = table
313                        .get("version")
314                        .and_then(|v| v.as_str())
315                        .map(String::from);
316
317                    let pinned = version.as_ref().is_some_and(|v| is_cargo_version_pinned(v));
318
319                    let is_optional = table
320                        .get("optional")
321                        .and_then(|v| v.as_bool())
322                        .unwrap_or(false);
323
324                    let mut extra_data = std::collections::HashMap::new();
325
326                    // Extract all table fields into extra_data
327                    for (key, val) in table {
328                        match key.as_str() {
329                            "version" => {
330                                // Store version in extra_data
331                                if let Some(v) = val.as_str() {
332                                    extra_data.insert("version".to_string(), json!(v));
333                                }
334                            }
335                            "features" => {
336                                // Extract features array
337                                if let Some(features_array) = val.as_array() {
338                                    let features: Vec<String> = features_array
339                                        .iter()
340                                        .filter_map(|f| f.as_str().map(String::from))
341                                        .collect();
342                                    extra_data.insert("features".to_string(), json!(features));
343                                }
344                            }
345                            "optional" => {
346                                // Skip optional flag, it's handled separately
347                            }
348                            _ => {
349                                // Store other fields (workspace, path, git, branch, tag, rev, etc.)
350                                if let Some(s) = val.as_str() {
351                                    extra_data.insert(key.clone(), json!(s));
352                                } else if let Some(b) = val.as_bool() {
353                                    extra_data.insert(key.clone(), json!(b));
354                                } else if let Some(i) = val.as_integer() {
355                                    extra_data.insert(key.clone(), json!(i));
356                                }
357                            }
358                        }
359                    }
360
361                    (version, is_optional, extra_data, pinned)
362                }
363                _ => {
364                    // Unknown format, skip
365                    continue;
366                }
367            };
368
369            // Only create dependency if we have a version or it's a table with other data
370            if extracted_requirement.is_some() || !extra_data_map.is_empty() {
371                let purl = match PackageUrl::new(CargoParser::PACKAGE_TYPE.as_str(), name) {
372                    Ok(p) => p.to_string(),
373                    Err(e) => {
374                        warn!(
375                            "Failed to create PackageUrl for cargo dependency '{}': {}",
376                            name, e
377                        );
378                        continue; // Skip this dependency
379                    }
380                };
381
382                dependencies.push(Dependency {
383                    purl: Some(purl),
384                    extracted_requirement,
385                    scope: Some(scope.to_string()),
386                    is_runtime: Some(is_runtime),
387                    is_optional: Some(is_optional),
388                    is_pinned: Some(is_pinned),
389                    is_direct: Some(true),
390                    resolved_package: None,
391                    extra_data: if extra_data_map.is_empty() {
392                        None
393                    } else {
394                        Some(extra_data_map)
395                    },
396                });
397            }
398        }
399    }
400
401    dependencies
402}
403
404/// Extracts keywords and categories, merging them into a single keywords array
405fn extract_keywords_and_categories(toml_content: &Value) -> Vec<String> {
406    let mut keywords = Vec::new();
407
408    if let Some(package) = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table()) {
409        // Extract keywords array
410        if let Some(kw_array) = package.get(FIELD_KEYWORDS).and_then(|v| v.as_array()) {
411            for kw in kw_array {
412                if let Some(kw_str) = kw.as_str() {
413                    keywords.push(kw_str.to_string());
414                }
415            }
416        }
417
418        // Extract categories array and merge with keywords
419        if let Some(cat_array) = package.get(FIELD_CATEGORIES).and_then(|v| v.as_array()) {
420            for cat in cat_array {
421                if let Some(cat_str) = cat.as_str() {
422                    keywords.push(cat_str.to_string());
423                }
424            }
425        }
426    }
427
428    keywords
429}
430
431fn extract_file_references(toml_content: &Value) -> Vec<FileReference> {
432    let mut file_references = Vec::new();
433
434    if let Some(package) = toml_content
435        .get(FIELD_PACKAGE)
436        .and_then(|value| value.as_table())
437    {
438        for path in [
439            package
440                .get(FIELD_LICENSE_FILE)
441                .and_then(|value| value.as_str()),
442            package.get(FIELD_README).and_then(|value| value.as_str()),
443        ]
444        .into_iter()
445        .flatten()
446        {
447            if file_references
448                .iter()
449                .any(|reference: &FileReference| reference.path == path)
450            {
451                continue;
452            }
453
454            file_references.push(FileReference {
455                path: path.to_string(),
456                size: None,
457                sha1: None,
458                md5: None,
459                sha256: None,
460                sha512: None,
461                extra_data: None,
462            });
463        }
464    }
465
466    file_references
467}
468
469/// Converts toml::Value to serde_json::Value recursively
470fn toml_to_json(value: &toml::Value) -> serde_json::Value {
471    match value {
472        toml::Value::String(s) => serde_json::json!(s),
473        toml::Value::Integer(i) => serde_json::json!(i),
474        toml::Value::Float(f) => serde_json::json!(f),
475        toml::Value::Boolean(b) => serde_json::json!(b),
476        toml::Value::Array(a) => serde_json::Value::Array(a.iter().map(toml_to_json).collect()),
477        toml::Value::Table(t) => {
478            let map: serde_json::Map<String, serde_json::Value> = t
479                .iter()
480                .map(|(k, v)| (k.clone(), toml_to_json(v)))
481                .collect();
482            serde_json::Value::Object(map)
483        }
484        toml::Value::Datetime(d) => serde_json::json!(d.to_string()),
485    }
486}
487
488/// Extracts extra_data fields (rust-version, edition, documentation, license-file, workspace)
489fn extract_extra_data(
490    toml_content: &Value,
491) -> Option<std::collections::HashMap<String, serde_json::Value>> {
492    use serde_json::json;
493    let mut extra_data = std::collections::HashMap::new();
494
495    if let Some(package) = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table()) {
496        // Extract rust-version (or detect workspace inheritance)
497        if let Some(rust_version_value) = package.get(FIELD_RUST_VERSION) {
498            if let Some(rust_version_str) = rust_version_value.as_str() {
499                extra_data.insert("rust_version".to_string(), json!(rust_version_str));
500            } else if rust_version_value
501                .as_table()
502                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
503            {
504                extra_data.insert("rust-version".to_string(), json!("workspace"));
505            }
506        }
507
508        // Extract edition (or detect workspace inheritance)
509        if let Some(edition_value) = package.get(FIELD_EDITION) {
510            if let Some(edition_str) = edition_value.as_str() {
511                extra_data.insert("rust_edition".to_string(), json!(edition_str));
512            } else if edition_value
513                .as_table()
514                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
515            {
516                extra_data.insert("edition".to_string(), json!("workspace"));
517            }
518        }
519
520        // Extract documentation URL
521        if let Some(documentation) = package.get("documentation").and_then(|v| v.as_str()) {
522            extra_data.insert("documentation_url".to_string(), json!(documentation));
523        }
524
525        // Extract license-file path
526        if let Some(license_file) = package.get(FIELD_LICENSE_FILE).and_then(|v| v.as_str()) {
527            extra_data.insert("license_file".to_string(), json!(license_file));
528        }
529
530        if let Some(readme_value) = package.get(FIELD_README) {
531            if let Some(readme_file) = readme_value.as_str() {
532                extra_data.insert("readme_file".to_string(), json!(readme_file));
533            } else if let Some(readme_enabled) = readme_value.as_bool() {
534                extra_data.insert("readme".to_string(), json!(readme_enabled));
535            } else if readme_value
536                .as_table()
537                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
538            {
539                extra_data.insert("readme".to_string(), json!("workspace"));
540            }
541        }
542
543        if let Some(publish_value) = package.get(FIELD_PUBLISH) {
544            extra_data.insert("publish".to_string(), toml_to_json(publish_value));
545        }
546
547        // Check for workspace inheritance markers for other fields
548        // version
549        if let Some(version_value) = package.get(FIELD_VERSION)
550            && version_value
551                .as_table()
552                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
553        {
554            extra_data.insert("version".to_string(), json!("workspace"));
555        }
556
557        // license
558        if let Some(license_value) = package.get(FIELD_LICENSE)
559            && license_value
560                .as_table()
561                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
562        {
563            extra_data.insert("license".to_string(), json!("workspace"));
564        }
565
566        // homepage
567        if let Some(homepage_value) = package.get(FIELD_HOMEPAGE)
568            && homepage_value
569                .as_table()
570                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
571        {
572            extra_data.insert("homepage".to_string(), json!("workspace"));
573        }
574
575        // repository
576        if let Some(repository_value) = package.get(FIELD_REPOSITORY)
577            && repository_value
578                .as_table()
579                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
580        {
581            extra_data.insert("repository".to_string(), json!("workspace"));
582        }
583
584        // categories
585        if let Some(categories_value) = package.get(FIELD_CATEGORIES)
586            && categories_value
587                .as_table()
588                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
589        {
590            extra_data.insert("categories".to_string(), json!("workspace"));
591        }
592
593        // authors
594        if let Some(authors_value) = package.get(FIELD_AUTHORS)
595            && authors_value
596                .as_table()
597                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
598        {
599            extra_data.insert("authors".to_string(), json!("workspace"));
600        }
601    }
602
603    // Extract workspace table if it exists
604    if let Some(workspace_value) = toml_content.get("workspace") {
605        extra_data.insert("workspace".to_string(), toml_to_json(workspace_value));
606    }
607
608    if extra_data.is_empty() {
609        None
610    } else {
611        Some(extra_data)
612    }
613}
614
615fn default_package_data() -> PackageData {
616    PackageData {
617        package_type: Some(CargoParser::PACKAGE_TYPE),
618        datasource_id: Some(DatasourceId::CargoToml),
619        ..Default::default()
620    }
621}
622
623crate::register_parser!(
624    "Rust Cargo.toml manifest",
625    &["**/Cargo.toml", "**/cargo.toml"],
626    "cargo",
627    "Rust",
628    Some("https://doc.rust-lang.org/cargo/reference/manifest.html"),
629);