Skip to main content

provenant/parsers/
cargo.rs

//! Parser for Cargo.toml manifest files.
//!
//! Extracts package metadata, dependencies, and license information from
//! Rust Cargo.toml files.
//!
//! # Supported Formats
//! - Cargo.toml (manifest)
//!
//! # Key Features
//! - Dependency extraction with feature flags and optional dependencies
//! - `is_pinned` analysis (exact version vs range specifiers)
//! - Package URL (purl) generation
//! - Workspace inheritance detection (stores `"workspace"` markers in extra_data)
//!
//! # Implementation Notes
//! - Uses toml crate for parsing
//! - Version pinning: `"1.0.0"` is pinned, `"^1.0.0"` is not
//! - Graceful error handling with `warn!()` logs
//! - Direct dependencies: all in manifest are direct (no lockfile)
20
21use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
22use crate::parser_warn as warn;
23use crate::parsers::utils::split_name_email;
24use packageurl::PackageUrl;
25use std::fs::File;
26use std::io::Read;
27use std::path::Path;
28use toml::Value;
29
30use super::PackageParser;
31use super::license_normalization::{
32    DeclaredLicenseMatchMetadata, build_declared_license_data, empty_declared_license_data,
33    normalize_spdx_expression,
34};
35
// Top-level manifest tables read by this parser.
const FIELD_PACKAGE: &str = "package";
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_DEV_DEPENDENCIES: &str = "dev-dependencies";
const FIELD_BUILD_DEPENDENCIES: &str = "build-dependencies";

// Keys looked up inside the `[package]` table: identity.
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_DESCRIPTION: &str = "description";
const FIELD_AUTHORS: &str = "authors";

// Keys looked up inside the `[package]` table: licensing and referenced files.
const FIELD_LICENSE: &str = "license";
const FIELD_LICENSE_FILE: &str = "license-file";
const FIELD_README: &str = "readme";

// Keys looked up inside the `[package]` table: links and classification.
const FIELD_REPOSITORY: &str = "repository";
const FIELD_HOMEPAGE: &str = "homepage";
const FIELD_KEYWORDS: &str = "keywords";
const FIELD_CATEGORIES: &str = "categories";

// Keys looked up inside the `[package]` table: toolchain and publishing.
const FIELD_RUST_VERSION: &str = "rust-version";
const FIELD_EDITION: &str = "edition";
const FIELD_PUBLISH: &str = "publish";
54
55/// Rust Cargo.toml manifest parser.
56///
57/// Extracts package metadata including dependencies (regular, dev, build),
58/// license information, and crate-specific fields.
59pub struct CargoParser;
60
61impl PackageParser for CargoParser {
62    const PACKAGE_TYPE: PackageType = PackageType::Cargo;
63
64    fn extract_packages(path: &Path) -> Vec<PackageData> {
65        let toml_content = match read_cargo_toml(path) {
66            Ok(content) => content,
67            Err(e) => {
68                warn!("Failed to read or parse Cargo.toml at {:?}: {}", path, e);
69                return vec![default_package_data()];
70            }
71        };
72
73        let package = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table());
74
75        let name = package
76            .and_then(|p| p.get(FIELD_NAME))
77            .and_then(|v| v.as_str())
78            .map(String::from);
79
80        let version = package
81            .and_then(|p| p.get(FIELD_VERSION))
82            .and_then(|v| v.as_str())
83            .map(String::from);
84
85        let raw_license = package
86            .and_then(|p| p.get(FIELD_LICENSE))
87            .and_then(|v| v.as_str())
88            .map(String::from);
89        let file_references = extract_file_references(&toml_content);
90        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
91            raw_license
92                .as_deref()
93                .and_then(normalize_spdx_expression)
94                .map(|normalized| {
95                    build_declared_license_data(
96                        normalized,
97                        DeclaredLicenseMatchMetadata::single_line(
98                            raw_license.as_deref().unwrap_or_default(),
99                        ),
100                    )
101                })
102                .unwrap_or_else(empty_declared_license_data);
103
104        let extracted_license_statement = raw_license.clone();
105
106        let dependencies = extract_dependencies(&toml_content, FIELD_DEPENDENCIES);
107        let dev_dependencies = extract_dependencies(&toml_content, FIELD_DEV_DEPENDENCIES);
108        let build_dependencies = extract_dependencies(&toml_content, FIELD_BUILD_DEPENDENCIES);
109
110        let purl = create_package_url(&name, &version);
111
112        let homepage_url = package
113            .and_then(|p| p.get(FIELD_HOMEPAGE))
114            .and_then(|v| v.as_str())
115            .map(String::from)
116            .or_else(|| {
117                name.as_ref()
118                    .map(|n| format!("https://crates.io/crates/{}", n))
119            });
120
121        let repository_url = package
122            .and_then(|p| p.get(FIELD_REPOSITORY))
123            .and_then(|v| v.as_str())
124            .map(String::from);
125        let download_url = None;
126
127        let api_data_url = generate_cargo_api_url(&name, &version);
128
129        let repository_homepage_url = name
130            .as_ref()
131            .map(|n| format!("https://crates.io/crates/{}", n));
132
133        let repository_download_url = match (&name, &version) {
134            (Some(n), Some(v)) => Some(format!(
135                "https://crates.io/api/v1/crates/{}/{}/download",
136                n, v
137            )),
138            _ => None,
139        };
140
141        let description = package
142            .and_then(|p| p.get(FIELD_DESCRIPTION))
143            .and_then(|v| v.as_str())
144            .map(|s| s.trim().to_string());
145
146        let keywords = extract_keywords_and_categories(&toml_content);
147
148        let extra_data = extract_extra_data(&toml_content);
149        let is_private = package
150            .and_then(|p| p.get(FIELD_PUBLISH))
151            .is_some_and(|value| matches!(value, Value::Boolean(false)));
152        vec![PackageData {
153            package_type: Some(Self::PACKAGE_TYPE),
154            namespace: None,
155            name,
156            version,
157            qualifiers: None,
158            subpath: None,
159            primary_language: Some("Rust".to_string()),
160            description,
161            release_date: None,
162            parties: extract_parties(&toml_content),
163            keywords,
164            homepage_url,
165            download_url,
166            size: None,
167            sha1: None,
168            md5: None,
169            sha256: None,
170            sha512: None,
171            bug_tracking_url: None,
172            code_view_url: None,
173            vcs_url: repository_url,
174            copyright: None,
175            holder: None,
176            declared_license_expression,
177            declared_license_expression_spdx,
178            license_detections,
179            other_license_expression: None,
180            other_license_expression_spdx: None,
181            other_license_detections: Vec::new(),
182            extracted_license_statement,
183            notice_text: None,
184            source_packages: Vec::new(),
185            file_references,
186            is_private,
187            is_virtual: false,
188            extra_data,
189            dependencies: [dependencies, dev_dependencies, build_dependencies].concat(),
190            repository_homepage_url,
191            repository_download_url,
192            api_data_url,
193            datasource_id: Some(DatasourceId::CargoToml),
194            purl,
195        }]
196    }
197
198    fn is_match(path: &Path) -> bool {
199        path.file_name()
200            .and_then(|name| name.to_str())
201            .is_some_and(|name| name.eq_ignore_ascii_case("cargo.toml"))
202    }
203}
204
205/// Reads and parses a TOML file
206fn read_cargo_toml(path: &Path) -> Result<Value, String> {
207    let mut file = File::open(path).map_err(|e| format!("Failed to open file: {}", e))?;
208    let mut content = String::new();
209    file.read_to_string(&mut content)
210        .map_err(|e| format!("Error reading file: {}", e))?;
211
212    toml::from_str(&content).map_err(|e| format!("Failed to parse TOML: {}", e))
213}
214
/// Builds the crates.io API URL for a crate.
///
/// Returns `None` when `name` is `None`. The version argument is accepted
/// but not used: the generated URL identifies the crate only.
fn generate_cargo_api_url(name: &Option<String>, _version: &Option<String>) -> Option<String> {
    const REGISTRY: &str = "https://crates.io/api/v1/crates";

    let crate_name = name.as_deref()?;
    Some(format!("{}/{}", REGISTRY, crate_name))
}
219
220fn create_package_url(name: &Option<String>, version: &Option<String>) -> Option<String> {
221    name.as_ref().and_then(|name| {
222        let mut package_url = match PackageUrl::new(CargoParser::PACKAGE_TYPE.as_str(), name) {
223            Ok(p) => p,
224            Err(e) => {
225                warn!(
226                    "Failed to create PackageUrl for cargo package '{}': {}",
227                    name, e
228                );
229                return None;
230            }
231        };
232
233        if let Some(v) = version
234            && let Err(e) = package_url.with_version(v)
235        {
236            warn!(
237                "Failed to set version '{}' for cargo package '{}': {}",
238                v, name, e
239            );
240            return None;
241        }
242
243        Some(package_url.to_string())
244    })
245}
246
247/// Extracts party information from the `authors` field
248fn extract_parties(toml_content: &Value) -> Vec<Party> {
249    let mut parties = Vec::new();
250
251    if let Some(package) = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table())
252        && let Some(authors) = package.get(FIELD_AUTHORS).and_then(|v| v.as_array())
253    {
254        for author in authors {
255            if let Some(author_str) = author.as_str() {
256                let (name, email) = split_name_email(author_str);
257                parties.push(Party {
258                    r#type: None,
259                    role: Some("author".to_string()),
260                    name,
261                    email,
262                    url: None,
263                    organization: None,
264                    organization_url: None,
265                    timezone: None,
266                });
267            }
268        }
269    }
270
271    parties
272}
273
/// Determines if a Cargo version specifier is pinned to an exact version.
///
/// A specifier counts as pinned when it names a full `major.minor.patch`
/// version and carries no range operator. A single leading `=` is accepted
/// because it is Cargo's exact-match comparator rather than a range.
///
/// Examples:
/// - Pinned: `"1.0.0"`, `"0.8.1"`, `"=1.0.0"`, `"= 1.0.0"`
/// - NOT pinned: `"0.8"` and `"=1.0"` (patch left open), `"^1.0.0"`,
///   `"~1.0.0"`, `">=1.0.0"`, `"*"`, `""`
///
/// Note that a bare `"1.0.0"` is classified as pinned here by project
/// convention, even though Cargo itself resolves it like `"^1.0.0"`.
fn is_cargo_version_pinned(version_str: &str) -> bool {
    // Characters that make a requirement a range. `=` stays in the list so
    // that a second `=`, or one that is not at the start, is still rejected.
    const RANGE_MARKERS: [char; 6] = ['^', '~', '>', '<', '*', '='];

    let trimmed = version_str.trim();

    // Drop one leading exact-match comparator (and the optional whitespace
    // after it) so that "=1.0.0" is judged like "1.0.0".
    let candidate = match trimmed.strip_prefix('=') {
        Some(rest) => rest.trim_start(),
        None => trimmed,
    };

    // Nothing left to inspect: an empty requirement or a lone "=".
    if candidate.is_empty() {
        return false;
    }

    if candidate.contains(RANGE_MARKERS) {
        return false;
    }

    // Pinned versions must be a full semver with at least two dots
    // (e.g. "1.0.0"); partial versions like "0.8" or "1" leave parts open.
    candidate.matches('.').count() >= 2
}
304
305fn extract_dependencies(toml_content: &Value, scope: &str) -> Vec<Dependency> {
306    use serde_json::json;
307
308    let mut dependencies = Vec::new();
309
310    // Determine is_runtime based on scope
311    let is_runtime = !scope.ends_with("dev-dependencies") && !scope.ends_with("build-dependencies");
312
313    if let Some(deps_table) = toml_content.get(scope).and_then(|v| v.as_table()) {
314        for (name, value) in deps_table {
315            let (extracted_requirement, is_optional, extra_data_map, is_pinned) = match value {
316                Value::String(version_str) => {
317                    // Simple string version: "1.0"
318                    let pinned = is_cargo_version_pinned(version_str);
319                    (
320                        Some(version_str.to_string()),
321                        false,
322                        std::collections::HashMap::new(),
323                        pinned,
324                    )
325                }
326                Value::Table(table) => {
327                    // Complex table format: { version = "1.0", optional = true, features = [...] }
328                    let version = table
329                        .get("version")
330                        .and_then(|v| v.as_str())
331                        .map(String::from);
332
333                    let pinned = version.as_ref().is_some_and(|v| is_cargo_version_pinned(v));
334
335                    let is_optional = table
336                        .get("optional")
337                        .and_then(|v| v.as_bool())
338                        .unwrap_or(false);
339
340                    let mut extra_data = std::collections::HashMap::new();
341
342                    // Extract all table fields into extra_data
343                    for (key, val) in table {
344                        match key.as_str() {
345                            "version" => {
346                                // Store version in extra_data
347                                if let Some(v) = val.as_str() {
348                                    extra_data.insert("version".to_string(), json!(v));
349                                }
350                            }
351                            "features" => {
352                                // Extract features array
353                                if let Some(features_array) = val.as_array() {
354                                    let features: Vec<String> = features_array
355                                        .iter()
356                                        .filter_map(|f| f.as_str().map(String::from))
357                                        .collect();
358                                    extra_data.insert("features".to_string(), json!(features));
359                                }
360                            }
361                            "optional" => {
362                                // Skip optional flag, it's handled separately
363                            }
364                            _ => {
365                                // Store other fields (workspace, path, git, branch, tag, rev, etc.)
366                                if let Some(s) = val.as_str() {
367                                    extra_data.insert(key.clone(), json!(s));
368                                } else if let Some(b) = val.as_bool() {
369                                    extra_data.insert(key.clone(), json!(b));
370                                } else if let Some(i) = val.as_integer() {
371                                    extra_data.insert(key.clone(), json!(i));
372                                }
373                            }
374                        }
375                    }
376
377                    (version, is_optional, extra_data, pinned)
378                }
379                _ => {
380                    // Unknown format, skip
381                    continue;
382                }
383            };
384
385            // Only create dependency if we have a version or it's a table with other data
386            if extracted_requirement.is_some() || !extra_data_map.is_empty() {
387                let purl = match PackageUrl::new(CargoParser::PACKAGE_TYPE.as_str(), name) {
388                    Ok(p) => p.to_string(),
389                    Err(e) => {
390                        warn!(
391                            "Failed to create PackageUrl for cargo dependency '{}': {}",
392                            name, e
393                        );
394                        continue; // Skip this dependency
395                    }
396                };
397
398                dependencies.push(Dependency {
399                    purl: Some(purl),
400                    extracted_requirement,
401                    scope: Some(scope.to_string()),
402                    is_runtime: Some(is_runtime),
403                    is_optional: Some(is_optional),
404                    is_pinned: Some(is_pinned),
405                    is_direct: Some(true),
406                    resolved_package: None,
407                    extra_data: if extra_data_map.is_empty() {
408                        None
409                    } else {
410                        Some(extra_data_map)
411                    },
412                });
413            }
414        }
415    }
416
417    dependencies
418}
419
420/// Extracts keywords and categories, merging them into a single keywords array
421fn extract_keywords_and_categories(toml_content: &Value) -> Vec<String> {
422    let mut keywords = Vec::new();
423
424    if let Some(package) = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table()) {
425        // Extract keywords array
426        if let Some(kw_array) = package.get(FIELD_KEYWORDS).and_then(|v| v.as_array()) {
427            for kw in kw_array {
428                if let Some(kw_str) = kw.as_str() {
429                    keywords.push(kw_str.to_string());
430                }
431            }
432        }
433
434        // Extract categories array and merge with keywords
435        if let Some(cat_array) = package.get(FIELD_CATEGORIES).and_then(|v| v.as_array()) {
436            for cat in cat_array {
437                if let Some(cat_str) = cat.as_str() {
438                    keywords.push(cat_str.to_string());
439                }
440            }
441        }
442    }
443
444    keywords
445}
446
447fn extract_file_references(toml_content: &Value) -> Vec<FileReference> {
448    let mut file_references = Vec::new();
449
450    if let Some(package) = toml_content
451        .get(FIELD_PACKAGE)
452        .and_then(|value| value.as_table())
453    {
454        for path in [
455            package
456                .get(FIELD_LICENSE_FILE)
457                .and_then(|value| value.as_str()),
458            package.get(FIELD_README).and_then(|value| value.as_str()),
459        ]
460        .into_iter()
461        .flatten()
462        {
463            if file_references
464                .iter()
465                .any(|reference: &FileReference| reference.path == path)
466            {
467                continue;
468            }
469
470            file_references.push(FileReference {
471                path: path.to_string(),
472                size: None,
473                sha1: None,
474                md5: None,
475                sha256: None,
476                sha512: None,
477                extra_data: None,
478            });
479        }
480    }
481
482    file_references
483}
484
485/// Converts toml::Value to serde_json::Value recursively
486fn toml_to_json(value: &toml::Value) -> serde_json::Value {
487    match value {
488        toml::Value::String(s) => serde_json::json!(s),
489        toml::Value::Integer(i) => serde_json::json!(i),
490        toml::Value::Float(f) => serde_json::json!(f),
491        toml::Value::Boolean(b) => serde_json::json!(b),
492        toml::Value::Array(a) => serde_json::Value::Array(a.iter().map(toml_to_json).collect()),
493        toml::Value::Table(t) => {
494            let map: serde_json::Map<String, serde_json::Value> = t
495                .iter()
496                .map(|(k, v)| (k.clone(), toml_to_json(v)))
497                .collect();
498            serde_json::Value::Object(map)
499        }
500        toml::Value::Datetime(d) => serde_json::json!(d.to_string()),
501    }
502}
503
504/// Extracts extra_data fields (rust-version, edition, documentation, license-file, workspace)
505fn extract_extra_data(
506    toml_content: &Value,
507) -> Option<std::collections::HashMap<String, serde_json::Value>> {
508    use serde_json::json;
509    let mut extra_data = std::collections::HashMap::new();
510
511    if let Some(package) = toml_content.get(FIELD_PACKAGE).and_then(|v| v.as_table()) {
512        // Extract rust-version (or detect workspace inheritance)
513        if let Some(rust_version_value) = package.get(FIELD_RUST_VERSION) {
514            if let Some(rust_version_str) = rust_version_value.as_str() {
515                extra_data.insert("rust_version".to_string(), json!(rust_version_str));
516            } else if rust_version_value
517                .as_table()
518                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
519            {
520                extra_data.insert("rust-version".to_string(), json!("workspace"));
521            }
522        }
523
524        // Extract edition (or detect workspace inheritance)
525        if let Some(edition_value) = package.get(FIELD_EDITION) {
526            if let Some(edition_str) = edition_value.as_str() {
527                extra_data.insert("rust_edition".to_string(), json!(edition_str));
528            } else if edition_value
529                .as_table()
530                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
531            {
532                extra_data.insert("edition".to_string(), json!("workspace"));
533            }
534        }
535
536        // Extract documentation URL
537        if let Some(documentation) = package.get("documentation").and_then(|v| v.as_str()) {
538            extra_data.insert("documentation_url".to_string(), json!(documentation));
539        }
540
541        // Extract license-file path
542        if let Some(license_file) = package.get(FIELD_LICENSE_FILE).and_then(|v| v.as_str()) {
543            extra_data.insert("license_file".to_string(), json!(license_file));
544        }
545
546        if let Some(readme_value) = package.get(FIELD_README) {
547            if let Some(readme_file) = readme_value.as_str() {
548                extra_data.insert("readme_file".to_string(), json!(readme_file));
549            } else if let Some(readme_enabled) = readme_value.as_bool() {
550                extra_data.insert("readme".to_string(), json!(readme_enabled));
551            } else if readme_value
552                .as_table()
553                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
554            {
555                extra_data.insert("readme".to_string(), json!("workspace"));
556            }
557        }
558
559        if let Some(publish_value) = package.get(FIELD_PUBLISH) {
560            extra_data.insert("publish".to_string(), toml_to_json(publish_value));
561        }
562
563        // Check for workspace inheritance markers for other fields
564        // version
565        if let Some(version_value) = package.get(FIELD_VERSION)
566            && version_value
567                .as_table()
568                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
569        {
570            extra_data.insert("version".to_string(), json!("workspace"));
571        }
572
573        // license
574        if let Some(license_value) = package.get(FIELD_LICENSE)
575            && license_value
576                .as_table()
577                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
578        {
579            extra_data.insert("license".to_string(), json!("workspace"));
580        }
581
582        // homepage
583        if let Some(homepage_value) = package.get(FIELD_HOMEPAGE)
584            && homepage_value
585                .as_table()
586                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
587        {
588            extra_data.insert("homepage".to_string(), json!("workspace"));
589        }
590
591        // repository
592        if let Some(repository_value) = package.get(FIELD_REPOSITORY)
593            && repository_value
594                .as_table()
595                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
596        {
597            extra_data.insert("repository".to_string(), json!("workspace"));
598        }
599
600        // categories
601        if let Some(categories_value) = package.get(FIELD_CATEGORIES)
602            && categories_value
603                .as_table()
604                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
605        {
606            extra_data.insert("categories".to_string(), json!("workspace"));
607        }
608
609        // authors
610        if let Some(authors_value) = package.get(FIELD_AUTHORS)
611            && authors_value
612                .as_table()
613                .is_some_and(|t| t.get("workspace") == Some(&toml::Value::Boolean(true)))
614        {
615            extra_data.insert("authors".to_string(), json!("workspace"));
616        }
617    }
618
619    // Extract workspace table if it exists
620    if let Some(workspace_value) = toml_content.get("workspace") {
621        extra_data.insert("workspace".to_string(), toml_to_json(workspace_value));
622    }
623
624    if extra_data.is_empty() {
625        None
626    } else {
627        Some(extra_data)
628    }
629}
630
631fn default_package_data() -> PackageData {
632    PackageData {
633        package_type: Some(CargoParser::PACKAGE_TYPE),
634        datasource_id: Some(DatasourceId::CargoToml),
635        ..Default::default()
636    }
637}
638
639crate::register_parser!(
640    "Rust Cargo.toml manifest",
641    &["**/Cargo.toml", "**/cargo.toml"],
642    "cargo",
643    "Rust",
644    Some("https://doc.rust-lang.org/cargo/reference/manifest.html"),
645);