Skip to main content

provenant/parsers/
cargo.rs

1//! Parser for Cargo.toml manifest files.
2//!
3//! Extracts package metadata, dependencies, and license information from
4//! Rust Cargo.toml files.
5//!
6//! # Supported Formats
7//! - Cargo.toml (manifest)
8//!
9//! # Key Features
10//! - Dependency extraction with feature flags and optional dependencies
11//! - `is_pinned` analysis (exact version vs range specifiers)
12//! - Package URL (purl) generation
13//! - Workspace inheritance detection (stores `"workspace"` markers in extra_data)
14//!
15//! # Implementation Notes
16//! - Uses toml crate for parsing
17//! - Version pinning: `"1.0.0"` is pinned, `"^1.0.0"` is not
18//! - Graceful error handling with `warn!()` logs
19//! - Direct dependencies: all in manifest are direct (no lockfile)
20
21use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
22use crate::parser_warn as warn;
23use crate::parsers::utils::{
24    MAX_ITERATION_COUNT, read_file_to_string, split_name_email, truncate_field,
25};
26use packageurl::PackageUrl;
27use std::path::Path;
28use toml::Value;
29
30use super::PackageParser;
31use super::license_normalization::{
32    DeclaredLicenseMatchMetadata, build_declared_license_data, empty_declared_license_data,
33    normalize_spdx_expression,
34};
35
// Name of the top-level manifest table that holds the crate's own metadata.
const FIELD_PACKAGE: &str = "package";
// Keys looked up inside the `[package]` table.
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_LICENSE: &str = "license";
const FIELD_LICENSE_FILE: &str = "license-file";
const FIELD_AUTHORS: &str = "authors";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_HOMEPAGE: &str = "homepage";
// Top-level dependency tables. The `_LEGACY` constants are the underscore
// spellings of the dev/build tables, which this parser reads as well.
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";
const FIELD_DEV_DEPENDENCIES_LEGACY: &str = "dev_dependencies";
const FIELD_BUILD_DEPENDENCIES: &str = "build-dependencies";
const FIELD_BUILD_DEPENDENCIES_LEGACY: &str = "build_dependencies";
// Further keys looked up inside the `[package]` table.
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_CATEGORIES: &str = "categories";
const FIELD_RUST_VERSION: &str = "rust-version";
const FIELD_EDITION: &str = "edition";
const FIELD_README: &str = "readme";
const FIELD_PUBLISH: &str = "publish";
56
57/// Rust Cargo.toml manifest parser.
58///
59/// Extracts package metadata including dependencies (regular, dev, build),
60/// license information, and crate-specific fields.
61pub struct CargoParser;
62
63impl PackageParser for CargoParser {
64    const PACKAGE_TYPE: PackageType = PackageType::Cargo;
65
66    fn extract_packages(path: &Path) -> Vec<PackageData> {
67        let toml_content = match read_cargo_toml(path) {
68            Ok(content) => content,
69            Err(_) => return Vec::new(),
70        };
71
72        let package = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table());
73
74        let name = package
75            .and_then(|p| p.get(FIELD_NAME))
76            .and_then(|v| v.as_str())
77            .map(|s| truncate_field(s.to_string()));
78
79        let version = package
80            .and_then(|p| p.get(FIELD_VERSION))
81            .and_then(|v| v.as_str())
82            .map(|s| truncate_field(s.to_string()));
83
84        let raw_license = package
85            .and_then(|p| p.get(FIELD_LICENSE))
86            .and_then(|v| v.as_str())
87            .map(|s| truncate_field(s.to_string()));
88        let file_references = extract_file_references(&toml_content);
89        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
90            raw_license
91                .as_deref()
92                .and_then(normalize_spdx_expression)
93                .map(|normalized| {
94                    build_declared_license_data(
95                        normalized,
96                        DeclaredLicenseMatchMetadata::single_line(
97                            raw_license.as_deref().unwrap_or_default(),
98                        ),
99                    )
100                })
101                .unwrap_or_else(empty_declared_license_data);
102
103        let extracted_license_statement = raw_license.clone();
104
105        let dependencies = extract_dependencies_for_scopes(&toml_content, &[FIELD_DEPENDENCIES]);
106        let dev_dependencies = extract_dependencies_for_scopes(
107            &toml_content,
108            &[FIELD_DEV_DEPENDENCIES, FIELD_DEV_DEPENDENCIES_LEGACY],
109        );
110        let build_dependencies = extract_dependencies_for_scopes(
111            &toml_content,
112            &[FIELD_BUILD_DEPENDENCIES, FIELD_BUILD_DEPENDENCIES_LEGACY],
113        );
114
115        let purl = create_package_url(&name, &version);
116
117        let homepage_url = package
118            .and_then(|p| p.get(FIELD_HOMEPAGE))
119            .and_then(|v| v.as_str())
120            .map(|s| truncate_field(s.to_string()))
121            .or_else(|| {
122                name.as_ref()
123                    .map(|n| format!("https://crates.io/crates/{}", n))
124            });
125
126        let repository_url = package
127            .and_then(|p| p.get(FIELD_REPOSITORY))
128            .and_then(|v| v.as_str())
129            .map(|s| truncate_field(s.to_string()));
130        let download_url = None;
131
132        let api_data_url = generate_cargo_api_url(&name, &version);
133
134        let repository_homepage_url = name
135            .as_ref()
136            .map(|n| format!("https://crates.io/crates/{}", n));
137
138        let repository_download_url = match (&name, &version) {
139            (Some(n), Some(v)) => Some(format!(
140                "https://crates.io/api/v1/crates/{}/{}/download",
141                n, v
142            )),
143            _ => None,
144        };
145
146        let description = package
147            .and_then(|p| p.get(FIELD_DESCRIPTION))
148            .and_then(|v| v.as_str())
149            .map(|s| truncate_field(s.trim().to_string()));
150
151        let keywords = extract_keywords_and_categories(&toml_content);
152
153        let extra_data = extract_extra_data(&toml_content);
154        let is_private = package
155            .and_then(|p| p.get(FIELD_PUBLISH))
156            .is_some_and(|value| matches!(value, Value::Boolean(false)));
157        vec![PackageData {
158            package_type: Some(Self::PACKAGE_TYPE),
159            namespace: None,
160            name,
161            version,
162            qualifiers: None,
163            subpath: None,
164            primary_language: Some("Rust".to_string()),
165            description,
166            release_date: None,
167            parties: extract_parties(&toml_content),
168            keywords,
169            homepage_url,
170            download_url,
171            size: None,
172            sha1: None,
173            md5: None,
174            sha256: None,
175            sha512: None,
176            bug_tracking_url: None,
177            code_view_url: None,
178            vcs_url: repository_url,
179            copyright: None,
180            holder: None,
181            declared_license_expression,
182            declared_license_expression_spdx,
183            license_detections,
184            other_license_expression: None,
185            other_license_expression_spdx: None,
186            other_license_detections: Vec::new(),
187            extracted_license_statement,
188            notice_text: None,
189            source_packages: Vec::new(),
190            file_references,
191            is_private,
192            is_virtual: false,
193            extra_data,
194            dependencies: [dependencies, dev_dependencies, build_dependencies].concat(),
195            repository_homepage_url,
196            repository_download_url,
197            api_data_url,
198            datasource_id: Some(DatasourceId::CargoToml),
199            purl,
200        }]
201    }
202
203    fn is_match(path: &Path) -> bool {
204        path.file_name()
205            .and_then(|name| name.to_str())
206            .is_some_and(|name| name.eq_ignore_ascii_case("cargo.toml"))
207    }
208}
209
210/// Reads and parses a TOML file
211fn read_cargo_toml(path: &Path) -> Result<Value, String> {
212    let content =
213        read_file_to_string(path, None).map_err(|e| format!("Failed to read file: {}", e))?;
214
215    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
216}
217
/// Builds the crates.io API URL for a crate, or `None` when the name is absent.
///
/// The version argument is accepted but not used: the URL is per crate.
fn generate_cargo_api_url(name: &Option<String>, _version: &Option<String>) -> Option<String> {
    const REGISTRY: &str = "https://crates.io/api/v1/crates";
    let crate_name = name.as_deref()?;
    Some(format!("{}/{}", REGISTRY, crate_name))
}
222
223fn create_package_url(name: &Option<String>, version: &Option<String>) -> Option<String> {
224    name.as_ref().and_then(|name| {
225        let mut package_url = match PackageUrl::new(CargoParser::PACKAGE_TYPE.as_str(), name) {
226            Ok(p) => p,
227            Err(e) => {
228                warn!(
229                    "Failed to create PackageUrl for cargo package '{}': {}",
230                    name, e
231                );
232                return None;
233            }
234        };
235
236        if let Some(v) = version
237            && let Err(e) = package_url.with_version(v)
238        {
239            warn!(
240                "Failed to set version '{}' for cargo package '{}': {}",
241                v, name, e
242            );
243            return None;
244        }
245
246        Some(package_url.to_string())
247    })
248}
249
250/// Extracts party information from the `authors` field
251fn extract_parties(toml_content: &Value) -> Vec<Party> {
252    let mut parties = Vec::new();
253
254    if let Some(package) = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table())
255        && let Some(authors) = package.get(FIELD_AUTHORS).and_then(|v| v.as_array())
256    {
257        for author in authors.iter().take(MAX_ITERATION_COUNT) {
258            if let Some(author_str) = author.as_str() {
259                let (name, email) = split_name_email(author_str);
260                parties.push(Party {
261                    r#type: None,
262                    role: Some("author".to_string()),
263                    name,
264                    email,
265                    url: None,
266                    organization: None,
267                    organization_url: None,
268                    timezone: None,
269                });
270            }
271        }
272        if authors.len() > MAX_ITERATION_COUNT {
273            warn!(
274                "Authors array has {} entries, capping at MAX_ITERATION_COUNT ({})",
275                authors.len(),
276                MAX_ITERATION_COUNT
277            );
278        }
279    }
280
281    parties
282}
283
/// Determines if a Cargo version specifier is pinned to an exact version.
///
/// A version is considered pinned if it names a full `major.minor.patch`
/// version, either bare or with Cargo's exact comparator `=`, and contains no
/// range operator. Examples:
/// - Pinned: "1.0.0", "0.8.1", "=1.0.0", "= 1.0.0"
/// - NOT pinned: "0.8" and "=0.8" (allow any patch), "^1.0.0", "~1.0.0",
///   ">=1.0.0", "<=1.0.0", "*", ""
fn is_cargo_version_pinned(version_str: &str) -> bool {
    let trimmed = version_str.trim();

    // A single leading `=` is the exact-version comparator, not a range
    // operator, so it is stripped before looking for range markers. `>=` and
    // `<=` do not start with `=` and are still caught below.
    let bare = trimmed
        .strip_prefix('=')
        .map_or(trimmed, str::trim_start);

    // Empty version (or a lone `=`) is not pinned.
    if bare.is_empty() {
        return false;
    }

    // Any remaining operator character indicates a range (or a malformed
    // requirement such as "==1.0.0"), which is never treated as pinned.
    if bare.contains(&['^', '~', '>', '<', '*', '='][..]) {
        return false;
    }

    // Pinned versions must be full semver: at least two dots ("1.0.0").
    // Partial versions like "0.8" or "1" still match a range of releases.
    bare.matches('.').count() >= 2
}
314
315fn extract_dependencies(toml_content: &Value, scope: &str) -> Vec<Dependency> {
316    use serde_json::json;
317
318    let mut dependencies = Vec::new();
319
320    // Determine is_runtime based on scope
321    let is_runtime = !scope.ends_with("dev-dependencies") && !scope.ends_with("build-dependencies");
322
323    if let Some(deps_table) = toml_content.get(scope).and_then(|v| v.as_table()) {
324        if deps_table.len() > MAX_ITERATION_COUNT {
325            warn!(
326                "Dependency table '{}' has {} entries, capping at MAX_ITERATION_COUNT ({})",
327                scope,
328                deps_table.len(),
329                MAX_ITERATION_COUNT
330            );
331        }
332        for (name, value) in deps_table.iter().take(MAX_ITERATION_COUNT) {
333            let (extracted_requirement, is_optional, extra_data_map, is_pinned) = match value {
334                Value::String(version_str) => {
335                    // Simple string version: "1.0"
336                    let pinned = is_cargo_version_pinned(version_str);
337                    (
338                        Some(version_str.to_string()),
339                        false,
340                        std::collections::HashMap::new(),
341                        pinned,
342                    )
343                }
344                Value::Table(table) => {
345                    // Complex table format: { version = "1.0", optional = true, features = [...] }
346                    let version = table
347                        .get("version")
348                        .and_then(|v| v.as_str())
349                        .map(String::from);
350
351                    let pinned = version.as_ref().is_some_and(|v| is_cargo_version_pinned(v));
352
353                    let is_optional = table
354                        .get("optional")
355                        .and_then(|v| v.as_bool())
356                        .unwrap_or(false);
357
358                    let mut extra_data = std::collections::HashMap::new();
359
360                    // Extract all table fields into extra_data
361                    for (key, val) in table {
362                        match key.as_str() {
363                            "version" => {
364                                // Store version in extra_data
365                                if let Some(v) = val.as_str() {
366                                    extra_data.insert("version".to_string(), json!(v));
367                                }
368                            }
369                            "features" => {
370                                // Extract features array
371                                if let Some(features_array) = val.as_array() {
372                                    let features: Vec<String> = features_array
373                                        .iter()
374                                        .filter_map(|f| f.as_str().map(String::from))
375                                        .collect();
376                                    extra_data.insert("features".to_string(), json!(features));
377                                }
378                            }
379                            "optional" => {
380                                // Skip optional flag, it's handled separately
381                            }
382                            _ => {
383                                // Store other fields (workspace, path, git, branch, tag, rev, etc.)
384                                if let Some(s) = val.as_str() {
385                                    extra_data.insert(key.clone(), json!(s));
386                                } else if let Some(b) = val.as_bool() {
387                                    extra_data.insert(key.clone(), json!(b));
388                                } else if let Some(i) = val.as_integer() {
389                                    extra_data.insert(key.clone(), json!(i));
390                                }
391                            }
392                        }
393                    }
394
395                    (version, is_optional, extra_data, pinned)
396                }
397                _ => {
398                    // Unknown format, skip
399                    continue;
400                }
401            };
402
403            // Only create dependency if we have a version or it's a table with other data
404            if extracted_requirement.is_some() || !extra_data_map.is_empty() {
405                let purl = match PackageUrl::new(CargoParser::PACKAGE_TYPE.as_str(), name) {
406                    Ok(p) => p.to_string(),
407                    Err(e) => {
408                        warn!(
409                            "Failed to create PackageUrl for cargo dependency '{}': {}",
410                            name, e
411                        );
412                        continue; // Skip this dependency
413                    }
414                };
415
416                dependencies.push(Dependency {
417                    purl: Some(purl),
418                    extracted_requirement,
419                    scope: Some(scope.to_string()),
420                    is_runtime: Some(is_runtime),
421                    is_optional: Some(is_optional),
422                    is_pinned: Some(is_pinned),
423                    is_direct: Some(true),
424                    resolved_package: None,
425                    extra_data: if extra_data_map.is_empty() {
426                        None
427                    } else {
428                        Some(extra_data_map)
429                    },
430                });
431            }
432        }
433    }
434
435    dependencies
436}
437
438fn extract_dependencies_for_scopes(toml_content: &Value, scopes: &[&str]) -> Vec<Dependency> {
439    scopes
440        .iter()
441        .flat_map(|scope| extract_dependencies(toml_content, scope))
442        .collect()
443}
444
445/// Extracts keywords and categories, merging them into a single keywords array
446fn extract_keywords_and_categories(toml_content: &Value) -> Vec<String> {
447    let mut keywords = Vec::new();
448
449    if let Some(package) = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table()) {
450        if let Some(kw_array) = package.get(FIELD_KEYWORDS).and_then(|v| v.as_array()) {
451            if kw_array.len() > MAX_ITERATION_COUNT {
452                warn!(
453                    "Keywords array has {} entries, capping at MAX_ITERATION_COUNT ({})",
454                    kw_array.len(),
455                    MAX_ITERATION_COUNT
456                );
457            }
458            for kw in kw_array.iter().take(MAX_ITERATION_COUNT) {
459                if let Some(kw_str) = kw.as_str() {
460                    keywords.push(truncate_field(kw_str.to_string()));
461                }
462            }
463        }
464
465        if let Some(cat_array) = package.get(FIELD_CATEGORIES).and_then(|v| v.as_array()) {
466            if cat_array.len() > MAX_ITERATION_COUNT {
467                warn!(
468                    "Categories array has {} entries, capping at MAX_ITERATION_COUNT ({})",
469                    cat_array.len(),
470                    MAX_ITERATION_COUNT
471                );
472            }
473            for cat in cat_array.iter().take(MAX_ITERATION_COUNT) {
474                if let Some(cat_str) = cat.as_str() {
475                    keywords.push(truncate_field(cat_str.to_string()));
476                }
477            }
478        }
479    }
480
481    keywords
482}
483
484fn extract_file_references(toml_content: &Value) -> Vec<FileReference> {
485    let mut file_references = Vec::new();
486
487    if let Some(package) = toml_content
488        .get(FIELD_PACKAGE)
489        .and_then(|value| value.as_table())
490    {
491        for path in [
492            package
493                .get(FIELD_LICENSE_FILE)
494                .and_then(|value| value.as_str()),
495            package.get(FIELD_README).and_then(|value| value.as_str()),
496        ]
497        .into_iter()
498        .flatten()
499        {
500            if file_references
501                .iter()
502                .any(|reference: &FileReference| reference.path == path)
503            {
504                continue;
505            }
506
507            file_references.push(FileReference {
508                path: path.to_string(),
509                size: None,
510                sha1: None,
511                md5: None,
512                sha256: None,
513                sha512: None,
514                extra_data: None,
515            });
516        }
517    }
518
519    file_references
520}
521
522const MAX_TOML_DEPTH: usize = 50;
523
524fn toml_to_json(value: &toml::Value, depth: usize) -> serde_json::Value {
525    if depth > MAX_TOML_DEPTH {
526        warn!(
527            "TOML nesting depth exceeded {}, returning Null",
528            MAX_TOML_DEPTH
529        );
530        return serde_json::Value::Null;
531    }
532    match value {
533        toml::Value::String(s) => serde_json::json!(s),
534        toml::Value::Integer(i) => serde_json::json!(i),
535        toml::Value::Float(f) => serde_json::json!(f),
536        toml::Value::Boolean(b) => serde_json::json!(b),
537        toml::Value::Array(a) => {
538            serde_json::Value::Array(a.iter().map(|v| toml_to_json(v, depth + 1)).collect())
539        }
540        toml::Value::Table(t) => {
541            let map: serde_json::Map<String, serde_json::Value> = t
542                .iter()
543                .map(|(k, v)| (k.clone(), toml_to_json(v, depth + 1)))
544                .collect();
545            serde_json::Value::Object(map)
546        }
547        toml::Value::Datetime(d) => serde_json::json!(d.to_string()),
548    }
549}
550
551/// Extracts extra_data fields (rust-version, edition, documentation, license-file, workspace)
552fn extract_extra_data(
553    toml_content: &Value,
554) -> Option<std::collections::HashMap<String, serde_json::Value>> {
555    use serde_json::json;
556    let mut extra_data = std::collections::HashMap::new();
557
558    if let Some(package) = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table()) {
559        if package.len() > MAX_ITERATION_COUNT {
560            warn!(
561                "Package table has {} entries, exceeding MAX_ITERATION_COUNT ({})",
562                package.len(),
563                MAX_ITERATION_COUNT
564            );
565        }
566        if let Some(rust_version_value) = package.get(FIELD_RUST_VERSION) {
567            if let Some(rust_version_str) = rust_version_value.as_str() {
568                extra_data.insert("rust_version".to_string(), json!(rust_version_str));
569            } else if rust_version_value
570                .as_table()
571                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
572            {
573                extra_data.insert("rust-version".to_string(), json!("workspace"));
574            }
575        }
576
577        // Extract edition (or detect workspace inheritance)
578        if let Some(edition_value) = package.get(FIELD_EDITION) {
579            if let Some(edition_str) = edition_value.as_str() {
580                extra_data.insert("rust_edition".to_string(), json!(edition_str));
581            } else if edition_value
582                .as_table()
583                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
584            {
585                extra_data.insert("edition".to_string(), json!("workspace"));
586            }
587        }
588
589        // Extract documentation URL
590        if let Some(documentation) = package.get("documentation").and_then(|v| v.as_str()) {
591            extra_data.insert("documentation_url".to_string(), json!(documentation));
592        }
593
594        // Extract license-file path
595        if let Some(license_file) = package.get(FIELD_LICENSE_FILE).and_then(|v| v.as_str()) {
596            extra_data.insert("license_file".to_string(), json!(license_file));
597        }
598
599        if let Some(readme_value) = package.get(FIELD_README) {
600            if let Some(readme_file) = readme_value.as_str() {
601                extra_data.insert("readme_file".to_string(), json!(readme_file));
602            } else if let Some(readme_enabled) = readme_value.as_bool() {
603                extra_data.insert("readme".to_string(), json!(readme_enabled));
604            } else if readme_value
605                .as_table()
606                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
607            {
608                extra_data.insert("readme".to_string(), json!("workspace"));
609            }
610        }
611
612        if let Some(publish_value) = package.get(FIELD_PUBLISH) {
613            extra_data.insert("publish".to_string(), toml_to_json(publish_value, 0));
614        }
615
616        // Check for workspace inheritance markers for other fields
617        // version
618        if let Some(version_value) = package.get(FIELD_VERSION)
619            && version_value
620                .as_table()
621                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
622        {
623            extra_data.insert("version".to_string(), json!("workspace"));
624        }
625
626        // license
627        if let Some(license_value) = package.get(FIELD_LICENSE)
628            && license_value
629                .as_table()
630                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
631        {
632            extra_data.insert("license".to_string(), json!("workspace"));
633        }
634
635        // homepage
636        if let Some(homepage_value) = package.get(FIELD_HOMEPAGE)
637            && homepage_value
638                .as_table()
639                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
640        {
641            extra_data.insert("homepage".to_string(), json!("workspace"));
642        }
643
644        // repository
645        if let Some(repository_value) = package.get(FIELD_REPOSITORY)
646            && repository_value
647                .as_table()
648                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
649        {
650            extra_data.insert("repository".to_string(), json!("workspace"));
651        }
652
653        // categories
654        if let Some(categories_value) = package.get(FIELD_CATEGORIES)
655            && categories_value
656                .as_table()
657                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
658        {
659            extra_data.insert("categories".to_string(), json!("workspace"));
660        }
661
662        // authors
663        if let Some(authors_value) = package.get(FIELD_AUTHORS)
664            && authors_value
665                .as_table()
666                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
667        {
668            extra_data.insert("authors".to_string(), json!("workspace"));
669        }
670    }
671
672    // Extract workspace table if it exists
673    if let Some(workspace_value) = toml_content.get("workspace") {
674        extra_data.insert("workspace".to_string(), toml_to_json(workspace_value, 0));
675    }
676
677    if extra_data.is_empty() {
678        None
679    } else {
680        Some(extra_data)
681    }
682}
683
// Declares this parser through the crate's `register_parser!` macro.
// NOTE(review): the positional arguments look like description, path globs,
// package type, primary language and documentation URL — confirm against the
// macro definition. Both `Cargo.toml` and `cargo.toml` globs are listed.
crate::register_parser!(
    "Rust Cargo.toml manifest",
    &["**/Cargo.toml", "**/cargo.toml"],
    "cargo",
    "Rust",
    Some("https://doc.rust-lang.org/cargo/reference/manifest.html"),
);