Skip to main content

provenant/parsers/
cargo.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for Cargo.toml manifest files.
5//!
6//! Extracts package metadata, dependencies, and license information from
7//! Rust Cargo.toml files.
8//!
9//! # Supported Formats
10//! - Cargo.toml (manifest)
11//!
12//! # Key Features
13//! - Dependency extraction with feature flags and optional dependencies
14//! - `is_pinned` analysis (exact version vs range specifiers)
15//! - Package URL (purl) generation
16//! - Workspace inheritance detection (stores `"workspace"` markers in extra_data)
17//!
18//! # Implementation Notes
19//! - Uses toml crate for parsing
20//! - Version pinning: `"1.0.0"` is pinned, `"^1.0.0"` is not
21//! - Graceful error handling with `warn!()` logs
22//! - Direct dependencies: all in manifest are direct (no lockfile)
23
24use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
25use crate::parser_warn as warn;
26use crate::parsers::utils::{
27    MAX_ITERATION_COUNT, RecursionGuard, read_file_to_string, split_name_email, truncate_field,
28};
29use packageurl::PackageUrl;
30use std::path::Path;
31use toml::Value;
32
33use super::PackageParser;
34use super::license_normalization::{
35    DeclaredLicenseMatchMetadata, build_declared_license_data, empty_declared_license_data,
36    normalize_spdx_expression,
37};
38
// Name of the top-level table that holds the crate metadata.
const FIELD_PACKAGE: &str = "package";
// Keys looked up inside the `[package]` table.
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_LICENSE: &str = "license";
const FIELD_LICENSE_FILE: &str = "license-file";
const FIELD_AUTHORS: &str = "authors";
const FIELD_REPOSITORY: &str = "repository";
const FIELD_HOMEPAGE: &str = "homepage";
// Top-level dependency tables. The `*_LEGACY` constants are the underscore
// spellings of the same tables, which this parser reads as well.
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";
const FIELD_DEV_DEPENDENCIES_LEGACY: &str = "dev_dependencies";
const FIELD_BUILD_DEPENDENCIES: &str = "build-dependencies";
const FIELD_BUILD_DEPENDENCIES_LEGACY: &str = "build_dependencies";
// Further `[package]` keys: descriptive text, merged keyword sources, and
// values that end up in `extra_data` / file references / the privacy flag.
const FIELD_DESCRIPTION: &str = "description";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_CATEGORIES: &str = "categories";
const FIELD_RUST_VERSION: &str = "rust-version";
const FIELD_EDITION: &str = "edition";
const FIELD_README: &str = "readme";
const FIELD_PUBLISH: &str = "publish";
59
/// Parser for Rust `Cargo.toml` manifests.
///
/// A stateless marker type: all behaviour lives in its `PackageParser`
/// implementation, which reads package metadata, license information and the
/// regular, dev and build dependency tables.
pub struct CargoParser;
65
66impl PackageParser for CargoParser {
67    const PACKAGE_TYPE: PackageType = PackageType::Cargo;
68
69    fn extract_packages(path: &Path) -> Vec<PackageData> {
70        let toml_content = match read_cargo_toml(path) {
71            Ok(content) => content,
72            Err(_) => return Vec::new(),
73        };
74
75        let package = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table());
76
77        let name = package
78            .and_then(|p| p.get(FIELD_NAME))
79            .and_then(|v| v.as_str())
80            .map(|s| truncate_field(s.to_string()));
81
82        let version = package
83            .and_then(|p| p.get(FIELD_VERSION))
84            .and_then(|v| v.as_str())
85            .map(|s| truncate_field(s.to_string()));
86
87        let raw_license = package
88            .and_then(|p| p.get(FIELD_LICENSE))
89            .and_then(|v| v.as_str())
90            .map(|s| truncate_field(s.to_string()));
91        let file_references = extract_file_references(&toml_content);
92        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
93            raw_license
94                .as_deref()
95                .and_then(normalize_spdx_expression)
96                .map(|normalized| {
97                    build_declared_license_data(
98                        normalized,
99                        DeclaredLicenseMatchMetadata::single_line(
100                            raw_license.as_deref().unwrap_or_default(),
101                        ),
102                    )
103                })
104                .unwrap_or_else(empty_declared_license_data);
105
106        let extracted_license_statement = raw_license.clone();
107
108        let dependencies = extract_dependencies_for_scopes(&toml_content, &[FIELD_DEPENDENCIES]);
109        let dev_dependencies = extract_dependencies_for_scopes(
110            &toml_content,
111            &[FIELD_DEV_DEPENDENCIES, FIELD_DEV_DEPENDENCIES_LEGACY],
112        );
113        let build_dependencies = extract_dependencies_for_scopes(
114            &toml_content,
115            &[FIELD_BUILD_DEPENDENCIES, FIELD_BUILD_DEPENDENCIES_LEGACY],
116        );
117
118        let purl = create_package_url(&name, &version);
119
120        let homepage_url = package
121            .and_then(|p| p.get(FIELD_HOMEPAGE))
122            .and_then(|v| v.as_str())
123            .map(|s| truncate_field(s.to_string()))
124            .or_else(|| {
125                name.as_ref()
126                    .map(|n| format!("https://crates.io/crates/{}", n))
127            });
128
129        let repository_url = package
130            .and_then(|p| p.get(FIELD_REPOSITORY))
131            .and_then(|v| v.as_str())
132            .map(|s| truncate_field(s.to_string()));
133        let download_url = None;
134
135        let api_data_url = generate_cargo_api_url(&name, &version);
136
137        let repository_homepage_url = name
138            .as_ref()
139            .map(|n| format!("https://crates.io/crates/{}", n));
140
141        let repository_download_url = match (&name, &version) {
142            (Some(n), Some(v)) => Some(format!(
143                "https://crates.io/api/v1/crates/{}/{}/download",
144                n, v
145            )),
146            _ => None,
147        };
148
149        let description = package
150            .and_then(|p| p.get(FIELD_DESCRIPTION))
151            .and_then(|v| v.as_str())
152            .map(|s| truncate_field(s.trim().to_string()));
153
154        let keywords = extract_keywords_and_categories(&toml_content);
155
156        let extra_data = extract_extra_data(&toml_content);
157        let is_private = package
158            .and_then(|p| p.get(FIELD_PUBLISH))
159            .is_some_and(|value| matches!(value, Value::Boolean(false)));
160        vec![PackageData {
161            package_type: Some(Self::PACKAGE_TYPE),
162            namespace: None,
163            name,
164            version,
165            qualifiers: None,
166            subpath: None,
167            primary_language: Some("Rust".to_string()),
168            description,
169            release_date: None,
170            parties: extract_parties(&toml_content),
171            keywords,
172            homepage_url,
173            download_url,
174            size: None,
175            sha1: None,
176            md5: None,
177            sha256: None,
178            sha512: None,
179            bug_tracking_url: None,
180            code_view_url: None,
181            vcs_url: repository_url,
182            copyright: None,
183            holder: None,
184            declared_license_expression,
185            declared_license_expression_spdx,
186            license_detections,
187            other_license_expression: None,
188            other_license_expression_spdx: None,
189            other_license_detections: Vec::new(),
190            extracted_license_statement,
191            notice_text: None,
192            source_packages: Vec::new(),
193            file_references,
194            is_private,
195            is_virtual: false,
196            extra_data,
197            dependencies: [dependencies, dev_dependencies, build_dependencies].concat(),
198            repository_homepage_url,
199            repository_download_url,
200            api_data_url,
201            datasource_id: Some(DatasourceId::CargoToml),
202            purl,
203        }]
204    }
205
206    fn is_match(path: &Path) -> bool {
207        path.file_name()
208            .and_then(|name| name.to_str())
209            .is_some_and(|name| name.eq_ignore_ascii_case("cargo.toml"))
210    }
211
212    fn metadata() -> Vec<super::metadata::ParserMetadata> {
213        vec![super::metadata::ParserMetadata {
214            description: "Rust Cargo.toml manifest",
215            file_patterns: &["**/Cargo.toml", "**/cargo.toml"],
216            package_type: "cargo",
217            primary_language: "Rust",
218            documentation_url: Some("https://doc.rust-lang.org/cargo/reference/manifest.html"),
219        }]
220    }
221}
222
223/// Reads and parses a TOML file
224fn read_cargo_toml(path: &Path) -> Result<Value, String> {
225    let content =
226        read_file_to_string(path, None).map_err(|e| format!("Failed to read file: {}", e))?;
227
228    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
229}
230
/// Builds the crates.io API URL for a crate, or `None` when the name is unknown.
///
/// The version is accepted for signature symmetry but does not influence the URL.
fn generate_cargo_api_url(name: &Option<String>, _version: &Option<String>) -> Option<String> {
    const CRATES_API_BASE: &str = "https://crates.io/api/v1/crates";

    let crate_name = name.as_deref()?;
    Some(format!("{}/{}", CRATES_API_BASE, crate_name))
}
235
236fn create_package_url(name: &Option<String>, version: &Option<String>) -> Option<String> {
237    name.as_ref().and_then(|name| {
238        let mut package_url = match PackageUrl::new(CargoParser::PACKAGE_TYPE.as_str(), name) {
239            Ok(p) => p,
240            Err(e) => {
241                warn!(
242                    "Failed to create PackageUrl for cargo package '{}': {}",
243                    name, e
244                );
245                return None;
246            }
247        };
248
249        if let Some(v) = version
250            && let Err(e) = package_url.with_version(v)
251        {
252            warn!(
253                "Failed to set version '{}' for cargo package '{}': {}",
254                v, name, e
255            );
256            return None;
257        }
258
259        Some(package_url.to_string())
260    })
261}
262
263/// Extracts party information from the `authors` field
264fn extract_parties(toml_content: &Value) -> Vec<Party> {
265    let mut parties = Vec::new();
266
267    if let Some(package) = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table())
268        && let Some(authors) = package.get(FIELD_AUTHORS).and_then(|v| v.as_array())
269    {
270        for author in authors.iter().take(MAX_ITERATION_COUNT) {
271            if let Some(author_str) = author.as_str() {
272                let (name, email) = split_name_email(author_str);
273                parties.push(Party {
274                    r#type: None,
275                    role: Some("author".to_string()),
276                    name,
277                    email,
278                    url: None,
279                    organization: None,
280                    organization_url: None,
281                    timezone: None,
282                });
283            }
284        }
285        if authors.len() > MAX_ITERATION_COUNT {
286            warn!(
287                "Authors array has {} entries, capping at MAX_ITERATION_COUNT ({})",
288                authors.len(),
289                MAX_ITERATION_COUNT
290            );
291        }
292    }
293
294    parties
295}
296
/// Reports whether a Cargo version requirement counts as pinned.
///
/// The requirement is pinned when, after trimming, it is non-empty, contains
/// none of the operator characters `^ ~ > < * =`, and spells out at least
/// major.minor.patch (two or more dots).
///
/// - Pinned: `"1.0.0"`, `"0.8.1"`
/// - Not pinned: `"0.8"`, `"1"`, `"^1.0.0"`, `"~1.0.0"`, `">=1.0.0"`, `"*"`, `""`
fn is_cargo_version_pinned(version_str: &str) -> bool {
    let requirement = version_str.trim();

    let has_operator = requirement
        .chars()
        .any(|c| matches!(c, '^' | '~' | '>' | '<' | '*' | '='));
    let dot_count = requirement.chars().filter(|&c| c == '.').count();

    // An empty requirement has zero dots, so it is rejected by the dot rule too;
    // the explicit emptiness check just keeps the intent visible.
    !requirement.is_empty() && !has_operator && dot_count >= 2
}
327
328fn extract_dependencies(toml_content: &Value, scope: &str) -> Vec<Dependency> {
329    use serde_json::json;
330
331    let mut dependencies = Vec::new();
332
333    // Determine is_runtime based on scope
334    let is_runtime = !scope.ends_with("dev-dependencies") && !scope.ends_with("build-dependencies");
335
336    if let Some(deps_table) = toml_content.get(scope).and_then(|v| v.as_table()) {
337        if deps_table.len() > MAX_ITERATION_COUNT {
338            warn!(
339                "Dependency table '{}' has {} entries, capping at MAX_ITERATION_COUNT ({})",
340                scope,
341                deps_table.len(),
342                MAX_ITERATION_COUNT
343            );
344        }
345        for (name, value) in deps_table.iter().take(MAX_ITERATION_COUNT) {
346            let (extracted_requirement, is_optional, extra_data_map, is_pinned) = match value {
347                Value::String(version_str) => {
348                    // Simple string version: "1.0"
349                    let pinned = is_cargo_version_pinned(version_str);
350                    (
351                        Some(version_str.to_string()),
352                        false,
353                        std::collections::HashMap::new(),
354                        pinned,
355                    )
356                }
357                Value::Table(table) => {
358                    // Complex table format: { version = "1.0", optional = true, features = [...] }
359                    let version = table
360                        .get("version")
361                        .and_then(|v| v.as_str())
362                        .map(String::from);
363
364                    let pinned = version.as_ref().is_some_and(|v| is_cargo_version_pinned(v));
365
366                    let is_optional = table
367                        .get("optional")
368                        .and_then(|v| v.as_bool())
369                        .unwrap_or(false);
370
371                    let mut extra_data = std::collections::HashMap::new();
372
373                    // Extract all table fields into extra_data
374                    for (key, val) in table {
375                        match key.as_str() {
376                            "version" => {
377                                // Store version in extra_data
378                                if let Some(v) = val.as_str() {
379                                    extra_data.insert("version".to_string(), json!(v));
380                                }
381                            }
382                            "features" => {
383                                // Extract features array
384                                if let Some(features_array) = val.as_array() {
385                                    let features: Vec<String> = features_array
386                                        .iter()
387                                        .filter_map(|f| f.as_str().map(String::from))
388                                        .collect();
389                                    extra_data.insert("features".to_string(), json!(features));
390                                }
391                            }
392                            "optional" => {
393                                // Skip optional flag, it's handled separately
394                            }
395                            _ => {
396                                // Store other fields (workspace, path, git, branch, tag, rev, etc.)
397                                if let Some(s) = val.as_str() {
398                                    extra_data.insert(key.clone(), json!(s));
399                                } else if let Some(b) = val.as_bool() {
400                                    extra_data.insert(key.clone(), json!(b));
401                                } else if let Some(i) = val.as_integer() {
402                                    extra_data.insert(key.clone(), json!(i));
403                                }
404                            }
405                        }
406                    }
407
408                    (version, is_optional, extra_data, pinned)
409                }
410                _ => {
411                    // Unknown format, skip
412                    continue;
413                }
414            };
415
416            // Only create dependency if we have a version or it's a table with other data
417            if extracted_requirement.is_some() || !extra_data_map.is_empty() {
418                let purl = match PackageUrl::new(CargoParser::PACKAGE_TYPE.as_str(), name) {
419                    Ok(p) => p.to_string(),
420                    Err(e) => {
421                        warn!(
422                            "Failed to create PackageUrl for cargo dependency '{}': {}",
423                            name, e
424                        );
425                        continue; // Skip this dependency
426                    }
427                };
428
429                dependencies.push(Dependency {
430                    purl: Some(purl),
431                    extracted_requirement,
432                    scope: Some(scope.to_string()),
433                    is_runtime: Some(is_runtime),
434                    is_optional: Some(is_optional),
435                    is_pinned: Some(is_pinned),
436                    is_direct: Some(true),
437                    resolved_package: None,
438                    extra_data: if extra_data_map.is_empty() {
439                        None
440                    } else {
441                        Some(extra_data_map)
442                    },
443                });
444            }
445        }
446    }
447
448    dependencies
449}
450
451fn extract_dependencies_for_scopes(toml_content: &Value, scopes: &[&str]) -> Vec<Dependency> {
452    scopes
453        .iter()
454        .flat_map(|scope| extract_dependencies(toml_content, scope))
455        .collect()
456}
457
458/// Extracts keywords and categories, merging them into a single keywords array
459fn extract_keywords_and_categories(toml_content: &Value) -> Vec<String> {
460    let mut keywords = Vec::new();
461
462    if let Some(package) = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table()) {
463        if let Some(kw_array) = package.get(FIELD_KEYWORDS).and_then(|v| v.as_array()) {
464            if kw_array.len() > MAX_ITERATION_COUNT {
465                warn!(
466                    "Keywords array has {} entries, capping at MAX_ITERATION_COUNT ({})",
467                    kw_array.len(),
468                    MAX_ITERATION_COUNT
469                );
470            }
471            for kw in kw_array.iter().take(MAX_ITERATION_COUNT) {
472                if let Some(kw_str) = kw.as_str() {
473                    keywords.push(truncate_field(kw_str.to_string()));
474                }
475            }
476        }
477
478        if let Some(cat_array) = package.get(FIELD_CATEGORIES).and_then(|v| v.as_array()) {
479            if cat_array.len() > MAX_ITERATION_COUNT {
480                warn!(
481                    "Categories array has {} entries, capping at MAX_ITERATION_COUNT ({})",
482                    cat_array.len(),
483                    MAX_ITERATION_COUNT
484                );
485            }
486            for cat in cat_array.iter().take(MAX_ITERATION_COUNT) {
487                if let Some(cat_str) = cat.as_str() {
488                    keywords.push(truncate_field(cat_str.to_string()));
489                }
490            }
491        }
492    }
493
494    keywords
495}
496
497fn extract_file_references(toml_content: &Value) -> Vec<FileReference> {
498    let mut file_references = Vec::new();
499
500    if let Some(package) = toml_content
501        .get(FIELD_PACKAGE)
502        .and_then(|value| value.as_table())
503    {
504        for path in [
505            package
506                .get(FIELD_LICENSE_FILE)
507                .and_then(|value| value.as_str()),
508            package.get(FIELD_README).and_then(|value| value.as_str()),
509        ]
510        .into_iter()
511        .flatten()
512        {
513            if file_references
514                .iter()
515                .any(|reference: &FileReference| reference.path == path)
516            {
517                continue;
518            }
519
520            file_references.push(FileReference {
521                path: path.to_string(),
522                size: None,
523                sha1: None,
524                md5: None,
525                sha256: None,
526                sha512: None,
527                extra_data: None,
528            });
529        }
530    }
531
532    file_references
533}
534
535fn toml_to_json(value: &toml::Value, guard: &mut RecursionGuard<()>) -> serde_json::Value {
536    if guard.descend() {
537        warn!("TOML nesting depth exceeded, returning Null");
538        return serde_json::Value::Null;
539    }
540    let result = match value {
541        toml::Value::String(s) => serde_json::json!(s),
542        toml::Value::Integer(i) => serde_json::json!(i),
543        toml::Value::Float(f) => serde_json::json!(f),
544        toml::Value::Boolean(b) => serde_json::json!(b),
545        toml::Value::Array(a) => {
546            serde_json::Value::Array(a.iter().map(|v| toml_to_json(v, guard)).collect())
547        }
548        toml::Value::Table(t) => {
549            let map: serde_json::Map<String, serde_json::Value> = t
550                .iter()
551                .map(|(k, v)| (k.clone(), toml_to_json(v, guard)))
552                .collect();
553            serde_json::Value::Object(map)
554        }
555        toml::Value::Datetime(d) => serde_json::json!(d.to_string()),
556    };
557    guard.ascend();
558    result
559}
560
561/// Extracts extra_data fields (rust-version, edition, documentation, license-file, workspace)
562fn extract_extra_data(
563    toml_content: &Value,
564) -> Option<std::collections::HashMap<String, serde_json::Value>> {
565    use serde_json::json;
566    let mut extra_data = std::collections::HashMap::new();
567
568    if let Some(package) = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table()) {
569        if package.len() > MAX_ITERATION_COUNT {
570            warn!(
571                "Package table has {} entries, exceeding MAX_ITERATION_COUNT ({})",
572                package.len(),
573                MAX_ITERATION_COUNT
574            );
575        }
576        if let Some(rust_version_value) = package.get(FIELD_RUST_VERSION) {
577            if let Some(rust_version_str) = rust_version_value.as_str() {
578                extra_data.insert("rust_version".to_string(), json!(rust_version_str));
579            } else if rust_version_value
580                .as_table()
581                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
582            {
583                extra_data.insert("rust-version".to_string(), json!("workspace"));
584            }
585        }
586
587        // Extract edition (or detect workspace inheritance)
588        if let Some(edition_value) = package.get(FIELD_EDITION) {
589            if let Some(edition_str) = edition_value.as_str() {
590                extra_data.insert("rust_edition".to_string(), json!(edition_str));
591            } else if edition_value
592                .as_table()
593                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
594            {
595                extra_data.insert("edition".to_string(), json!("workspace"));
596            }
597        }
598
599        // Extract documentation URL
600        if let Some(documentation) = package.get("documentation").and_then(|v| v.as_str()) {
601            extra_data.insert("documentation_url".to_string(), json!(documentation));
602        }
603
604        // Extract license-file path
605        if let Some(license_file) = package.get(FIELD_LICENSE_FILE).and_then(|v| v.as_str()) {
606            extra_data.insert("license_file".to_string(), json!(license_file));
607        }
608
609        if let Some(readme_value) = package.get(FIELD_README) {
610            if let Some(readme_file) = readme_value.as_str() {
611                extra_data.insert("readme_file".to_string(), json!(readme_file));
612            } else if let Some(readme_enabled) = readme_value.as_bool() {
613                extra_data.insert("readme".to_string(), json!(readme_enabled));
614            } else if readme_value
615                .as_table()
616                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
617            {
618                extra_data.insert("readme".to_string(), json!("workspace"));
619            }
620        }
621
622        if let Some(publish_value) = package.get(FIELD_PUBLISH) {
623            extra_data.insert(
624                "publish".to_string(),
625                toml_to_json(publish_value, &mut RecursionGuard::depth_only()),
626            );
627        }
628
629        // Check for workspace inheritance markers for other fields
630        // version
631        if let Some(version_value) = package.get(FIELD_VERSION)
632            && version_value
633                .as_table()
634                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
635        {
636            extra_data.insert("version".to_string(), json!("workspace"));
637        }
638
639        // license
640        if let Some(license_value) = package.get(FIELD_LICENSE)
641            && license_value
642                .as_table()
643                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
644        {
645            extra_data.insert("license".to_string(), json!("workspace"));
646        }
647
648        // homepage
649        if let Some(homepage_value) = package.get(FIELD_HOMEPAGE)
650            && homepage_value
651                .as_table()
652                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
653        {
654            extra_data.insert("homepage".to_string(), json!("workspace"));
655        }
656
657        // repository
658        if let Some(repository_value) = package.get(FIELD_REPOSITORY)
659            && repository_value
660                .as_table()
661                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
662        {
663            extra_data.insert("repository".to_string(), json!("workspace"));
664        }
665
666        // categories
667        if let Some(categories_value) = package.get(FIELD_CATEGORIES)
668            && categories_value
669                .as_table()
670                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
671        {
672            extra_data.insert("categories".to_string(), json!("workspace"));
673        }
674
675        // authors
676        if let Some(authors_value) = package.get(FIELD_AUTHORS)
677            && authors_value
678                .as_table()
679                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
680        {
681            extra_data.insert("authors".to_string(), json!("workspace"));
682        }
683    }
684
685    // Extract workspace table if it exists
686    if let Some(workspace_value) = toml_content.get("workspace") {
687        extra_data.insert(
688            "workspace".to_string(),
689            toml_to_json(workspace_value, &mut RecursionGuard::depth_only()),
690        );
691    }
692
693    if extra_data.is_empty() {
694        None
695    } else {
696        Some(extra_data)
697    }
698}