Skip to main content

provenant/parsers/
chef.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for Chef cookbook metadata files (JSON and Ruby).
5//!
6//! Extracts package metadata, dependencies, and maintainer information from
7//! Chef cookbook metadata files used by the Chef configuration management tool.
8//!
9//! # Supported Formats
10//! - metadata.json (Chef cookbook metadata in JSON format)
11//! - metadata.rb (Chef cookbook metadata in Ruby DSL format)
12//!
13//! # Key Features
14//! - Maintainer party extraction from maintainer/maintainer_email fields
15//! - Dependency extraction from both `dependencies` and `depends` fields (merged)
16//! - URL construction for Chef Supermarket (download, homepage, API)
17//! - dist-info guard to prevent false positives with Python wheel metadata.json
18//!
19//! # Implementation Notes
20//! - JSON parser uses serde_json for JSON parsing
21//! - Ruby parser uses line-based token extraction (not a full Ruby parser)
22//! - Description from `description` or fallback to `long_description`
23//! - Graceful error handling: logs warnings and returns default on parse failure
24//! - IO.read(...) expressions in Ruby files are skipped (cannot evaluate Ruby code)
25
26use std::collections::HashMap;
27use std::fs::{self, File};
28use std::io::{BufRead, BufReader};
29use std::path::Path;
30use std::sync::LazyLock;
31
32use crate::parser_warn as warn;
33use packageurl::PackageUrl;
34use regex::Regex;
35use serde_json::Value;
36
37use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
38
39use super::PackageParser;
40use super::metadata::ParserMetadata;
41use super::utils::{MAX_ITERATION_COUNT, MAX_MANIFEST_SIZE, read_file_to_string, truncate_field};
42
// Keys looked up in the parsed `metadata.json` document.

// Cookbook identity.
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
// Description, with `long_description` used as the fallback.
const FIELD_DESCRIPTION: &str = "description";
const FIELD_LONG_DESCRIPTION: &str = "long_description";
// Declared license statement.
const FIELD_LICENSE: &str = "license";
// Maintainer party information.
const FIELD_MAINTAINER: &str = "maintainer";
const FIELD_MAINTAINER_EMAIL: &str = "maintainer_email";
// Source repository and issue tracker URLs.
const FIELD_SOURCE_URL: &str = "source_url";
const FIELD_ISSUES_URL: &str = "issues_url";
// Both keys hold a `cookbook name -> version constraint` object; the parser merges them.
const FIELD_DEPENDENCIES: &str = "dependencies";
const FIELD_DEPENDS: &str = "depends";
54
55static RE_FIELD: LazyLock<Regex> =
56    LazyLock::new(|| Regex::new(r#"^\s*(\w+)\s+['"](.+?)['"]"#).expect("valid regex"));
57static RE_DEPENDS: LazyLock<Regex> = LazyLock::new(|| {
58    Regex::new(r#"^\s*depends\s+['"](.+?)['"](?:\s*,\s*['"](.+?)['"])?"#).expect("valid regex")
59});
60static RE_IO_READ: LazyLock<Regex> =
61    LazyLock::new(|| Regex::new(r"IO\.read\(").expect("valid regex"));
62
/// Format-independent bundle of values extracted from a cookbook manifest.
///
/// Both the JSON and the Ruby parser fill one of these and hand it to
/// `build_package`, which turns it into a `PackageData`.
struct ChefPackageFields {
    /// Identifies which manifest flavour the values came from.
    datasource_id: DatasourceId,
    /// Cookbook name.
    name: Option<String>,
    /// Cookbook version.
    version: Option<String>,
    /// Description text (already resolved against the `long_description` fallback).
    description: Option<String>,
    /// License string as written in the manifest.
    extracted_license_statement: Option<String>,
    /// Maintainer display name.
    maintainer_name: Option<String>,
    /// Maintainer e-mail address.
    maintainer_email: Option<String>,
    /// Value of the manifest's `source_url`.
    code_view_url: Option<String>,
    /// Value of the manifest's `issues_url`.
    bug_tracking_url: Option<String>,
    /// Dependency cookbook name mapped to its optional version constraint.
    deps: HashMap<String, Option<String>>,
}
75
76/// Chef metadata.json parser for Chef cookbook manifests.
77///
78/// Extracts metadata from Chef cookbook metadata.json files, including
79/// dependencies from both `dependencies` and `depends` fields.
80pub struct ChefMetadataJsonParser;
81
82impl PackageParser for ChefMetadataJsonParser {
83    const PACKAGE_TYPE: PackageType = PackageType::Chef;
84
85    fn metadata() -> Vec<ParserMetadata> {
86        vec![ParserMetadata {
87            description: "Chef cookbook metadata",
88            file_patterns: &["**/metadata.json", "**/metadata.rb"],
89            package_type: "chef",
90            primary_language: "Ruby",
91            documentation_url: Some("https://docs.chef.io/config_rb_metadata/"),
92        }]
93    }
94
95    fn is_match(path: &Path) -> bool {
96        if path.file_name().is_some_and(|name| name == "metadata.json") {
97            // Check parent directory doesn't end with "dist-info"
98            // to prevent false positives with Python wheel metadata.json files
99            if let Some(parent) = path.parent()
100                && let Some(parent_name) = parent.file_name().and_then(|n| n.to_str())
101            {
102                return !parent_name.ends_with("dist-info");
103            }
104            return true;
105        }
106        false
107    }
108
109    fn extract_packages(path: &Path) -> Vec<PackageData> {
110        let json_content = match read_json_file(path) {
111            Ok(content) => content,
112            Err(e) => {
113                warn!("Failed to read metadata.json at {:?}: {}", path, e);
114                return vec![default_package_data(DatasourceId::ChefCookbookMetadataJson)];
115            }
116        };
117
118        let name = json_content
119            .get(FIELD_NAME)
120            .and_then(|v| v.as_str())
121            .map(|s| s.trim().to_string())
122            .filter(|s| !s.is_empty())
123            .map(truncate_field);
124
125        let version = json_content
126            .get(FIELD_VERSION)
127            .and_then(|v| v.as_str())
128            .map(|s| s.trim().to_string())
129            .filter(|s| !s.is_empty())
130            .map(truncate_field);
131
132        let description = extract_description(&json_content).map(truncate_field);
133
134        let extracted_license_statement = json_content
135            .get(FIELD_LICENSE)
136            .and_then(|v| v.as_str())
137            .map(|s| s.trim().to_string())
138            .filter(|s| !s.is_empty())
139            .map(truncate_field);
140
141        let maintainer_name = json_content
142            .get(FIELD_MAINTAINER)
143            .and_then(|v| v.as_str())
144            .map(|s| s.trim().to_string())
145            .filter(|s| !s.is_empty())
146            .map(truncate_field);
147
148        let maintainer_email = json_content
149            .get(FIELD_MAINTAINER_EMAIL)
150            .and_then(|v| v.as_str())
151            .map(|s| s.trim().to_string())
152            .filter(|s| !s.is_empty())
153            .map(truncate_field);
154
155        let code_view_url = json_content
156            .get(FIELD_SOURCE_URL)
157            .and_then(|v| v.as_str())
158            .map(|s| s.trim().to_string())
159            .filter(|s| !s.is_empty())
160            .map(truncate_field);
161
162        let bug_tracking_url = json_content
163            .get(FIELD_ISSUES_URL)
164            .and_then(|v| v.as_str())
165            .map(|s| s.trim().to_string())
166            .filter(|s| !s.is_empty())
167            .map(truncate_field);
168
169        let mut deps: HashMap<String, Option<String>> = HashMap::new();
170
171        if let Some(deps_obj) = json_content
172            .get(FIELD_DEPENDENCIES)
173            .and_then(|v| v.as_object())
174        {
175            for (dep_name, dep_version) in deps_obj.iter().take(MAX_ITERATION_COUNT) {
176                let version_constraint = dep_version
177                    .as_str()
178                    .map(|s| s.trim().to_string())
179                    .filter(|s| !s.is_empty())
180                    .map(truncate_field);
181                deps.insert(
182                    truncate_field(dep_name.trim().to_string()),
183                    version_constraint,
184                );
185            }
186        }
187
188        if let Some(depends_obj) = json_content.get(FIELD_DEPENDS).and_then(|v| v.as_object()) {
189            for (dep_name, dep_version) in depends_obj.iter().take(MAX_ITERATION_COUNT) {
190                let version_constraint = dep_version
191                    .as_str()
192                    .map(|s| s.trim().to_string())
193                    .filter(|s| !s.is_empty())
194                    .map(truncate_field);
195                deps.insert(
196                    truncate_field(dep_name.trim().to_string()),
197                    version_constraint,
198                );
199            }
200        }
201
202        vec![build_package(ChefPackageFields {
203            datasource_id: DatasourceId::ChefCookbookMetadataJson,
204            name,
205            version,
206            description,
207            extracted_license_statement,
208            maintainer_name,
209            maintainer_email,
210            code_view_url,
211            bug_tracking_url,
212            deps,
213        })]
214    }
215}
216
217fn read_json_file(path: &Path) -> Result<Value, String> {
218    let contents = read_file_to_string(path, None).map_err(|e| e.to_string())?;
219    serde_json::from_str(&contents).map_err(|e| format!("Failed to parse JSON: {}", e))
220}
221
222fn default_package_data(datasource_id: DatasourceId) -> PackageData {
223    PackageData {
224        package_type: Some(ChefMetadataJsonParser::PACKAGE_TYPE),
225        datasource_id: Some(datasource_id),
226        ..Default::default()
227    }
228}
229
230fn extract_description(json: &Value) -> Option<String> {
231    // Try description first, then long_description
232    json.get(FIELD_DESCRIPTION)
233        .and_then(|v| v.as_str())
234        .map(|s| s.trim().to_string())
235        .filter(|s| !s.is_empty())
236        .or_else(|| {
237            json.get(FIELD_LONG_DESCRIPTION)
238                .and_then(|v| v.as_str())
239                .map(|s| s.trim().to_string())
240                .filter(|s| !s.is_empty())
241        })
242}
243
244/// Chef metadata.rb parser for Chef cookbook manifests in Ruby DSL format.
245///
246/// Uses line-based token extraction to parse Ruby DSL without executing Ruby code.
247pub struct ChefMetadataRbParser;
248
249impl PackageParser for ChefMetadataRbParser {
250    const PACKAGE_TYPE: PackageType = PackageType::Chef;
251
252    fn is_match(path: &Path) -> bool {
253        path.file_name().is_some_and(|name| name == "metadata.rb")
254    }
255
256    fn extract_packages(path: &Path) -> Vec<PackageData> {
257        if let Ok(metadata) = fs::metadata(path)
258            && metadata.len() > MAX_MANIFEST_SIZE
259        {
260            warn!(
261                "File {:?} is {} bytes, exceeding the {} byte limit",
262                path,
263                metadata.len(),
264                MAX_MANIFEST_SIZE
265            );
266            return vec![default_package_data(DatasourceId::ChefCookbookMetadataRb)];
267        }
268
269        let file = match File::open(path) {
270            Ok(f) => f,
271            Err(e) => {
272                warn!("Failed to open metadata.rb at {:?}: {}", path, e);
273                return vec![default_package_data(DatasourceId::ChefCookbookMetadataRb)];
274            }
275        };
276
277        let reader = BufReader::new(file);
278        let mut fields: HashMap<String, String> = HashMap::new();
279        let mut deps: HashMap<String, Option<String>> = HashMap::new();
280
281        for line in reader.lines().take(MAX_ITERATION_COUNT) {
282            let line = match line {
283                Ok(l) => l,
284                Err(e) => {
285                    warn!("Skipping non-UTF-8 line in {:?}: {}", path, e);
286                    continue;
287                }
288            };
289
290            let trimmed = line.trim();
291
292            if trimmed.is_empty() || trimmed.starts_with('#') {
293                continue;
294            }
295
296            if RE_IO_READ.is_match(&line) {
297                continue;
298            }
299
300            if let Some(caps) = RE_DEPENDS.captures(&line) {
301                let dep_name = caps
302                    .get(1)
303                    .map(|m| m.as_str().to_string())
304                    .unwrap_or_default();
305                let dep_version = caps.get(2).map(|m| m.as_str().to_string());
306                if !dep_name.is_empty() {
307                    deps.insert(dep_name, dep_version);
308                }
309                continue;
310            }
311
312            if let Some(caps) = RE_FIELD.captures(&line) {
313                let key = caps
314                    .get(1)
315                    .map(|m| m.as_str().to_string())
316                    .unwrap_or_default();
317                let value = caps
318                    .get(2)
319                    .map(|m| m.as_str().to_string())
320                    .unwrap_or_default();
321
322                if !key.is_empty() && !value.is_empty() {
323                    match key.as_str() {
324                        "name" | "version" | "description" | "long_description" | "license"
325                        | "maintainer" | "maintainer_email" | "source_url" | "issues_url" => {
326                            fields.insert(key, value);
327                        }
328                        _ => {}
329                    }
330                }
331            }
332        }
333
334        let name = fields
335            .get("name")
336            .map(|s| s.trim().to_string())
337            .filter(|s| !s.is_empty())
338            .map(truncate_field);
339
340        let version = fields
341            .get("version")
342            .map(|s| s.trim().to_string())
343            .filter(|s| !s.is_empty())
344            .map(truncate_field);
345
346        let description = fields
347            .get("description")
348            .map(|s| s.trim().to_string())
349            .filter(|s| !s.is_empty())
350            .map(truncate_field)
351            .or_else(|| {
352                fields
353                    .get("long_description")
354                    .map(|s| s.trim().to_string())
355                    .filter(|s| !s.is_empty())
356                    .map(truncate_field)
357            });
358
359        let extracted_license_statement = fields
360            .get("license")
361            .map(|s| s.trim().to_string())
362            .filter(|s| !s.is_empty())
363            .map(truncate_field);
364
365        let maintainer_name = fields
366            .get("maintainer")
367            .map(|s| s.trim().to_string())
368            .filter(|s| !s.is_empty())
369            .map(truncate_field);
370
371        let maintainer_email = fields
372            .get("maintainer_email")
373            .map(|s| s.trim().to_string())
374            .filter(|s| !s.is_empty())
375            .map(truncate_field);
376
377        let code_view_url = fields
378            .get("source_url")
379            .map(|s| s.trim().to_string())
380            .filter(|s| !s.is_empty())
381            .map(truncate_field);
382
383        let bug_tracking_url = fields
384            .get("issues_url")
385            .map(|s| s.trim().to_string())
386            .filter(|s| !s.is_empty())
387            .map(truncate_field);
388
389        vec![build_package(ChefPackageFields {
390            datasource_id: DatasourceId::ChefCookbookMetadataRb,
391            name,
392            version,
393            description,
394            extracted_license_statement,
395            maintainer_name,
396            maintainer_email,
397            code_view_url,
398            bug_tracking_url,
399            deps,
400        })]
401    }
402}
403
404fn build_package(fields: ChefPackageFields) -> PackageData {
405    let ChefPackageFields {
406        datasource_id,
407        name,
408        version,
409        description,
410        extracted_license_statement,
411        maintainer_name,
412        maintainer_email,
413        code_view_url,
414        bug_tracking_url,
415        deps,
416    } = fields;
417    let parties = if maintainer_name.is_some() || maintainer_email.is_some() {
418        vec![Party {
419            r#type: None,
420            role: Some(truncate_field("maintainer".to_string())),
421            name: maintainer_name.map(truncate_field),
422            email: maintainer_email.map(truncate_field),
423            url: None,
424            organization: None,
425            organization_url: None,
426            timezone: None,
427        }]
428    } else {
429        Vec::new()
430    };
431
432    let mut dependencies: Vec<Dependency> = deps
433        .into_iter()
434        .map(|(dep_name, version_constraint)| {
435            let purl = PackageUrl::new("chef", &dep_name)
436                .map(|p| truncate_field(p.to_string()))
437                .ok();
438            Dependency {
439                purl,
440                extracted_requirement: version_constraint.map(truncate_field),
441                scope: Some(truncate_field("dependencies".to_string())),
442                is_runtime: Some(true),
443                is_optional: Some(false),
444                is_pinned: None,
445                is_direct: None,
446                resolved_package: None,
447                extra_data: None,
448            }
449        })
450        .collect();
451
452    dependencies.sort_by(|a, b| {
453        let name_a = a.purl.as_deref().unwrap_or("");
454        let name_b = b.purl.as_deref().unwrap_or("");
455        name_a.cmp(name_b)
456    });
457
458    let (download_url, repository_download_url, repository_homepage_url, api_data_url) =
459        if let (Some(n), Some(v)) = (&name, &version) {
460            let download = truncate_field(format!(
461                "https://supermarket.chef.io/cookbooks/{}/versions/{}/download",
462                n, v
463            ));
464            let homepage = truncate_field(format!(
465                "https://supermarket.chef.io/cookbooks/{}/versions/{}/",
466                n, v
467            ));
468            let api = truncate_field(format!(
469                "https://supermarket.chef.io/api/v1/cookbooks/{}/versions/{}",
470                n, v
471            ));
472            (
473                Some(download.clone()),
474                Some(download),
475                Some(homepage),
476                Some(api),
477            )
478        } else {
479            (None, None, None, None)
480        };
481
482    let purl = match (name.as_deref(), version.as_deref()) {
483        (Some(name), Some(version)) => PackageUrl::new("chef", name)
484            .map(|mut p| {
485                let _ = p.with_version(version);
486                truncate_field(p.to_string())
487            })
488            .ok(),
489        _ => None,
490    };
491
492    PackageData {
493        package_type: Some(ChefMetadataJsonParser::PACKAGE_TYPE),
494        datasource_id: Some(datasource_id),
495        name,
496        version,
497        description,
498        extracted_license_statement,
499        parties,
500        code_view_url,
501        bug_tracking_url,
502        dependencies,
503        download_url,
504        repository_download_url,
505        repository_homepage_url,
506        api_data_url,
507        purl,
508        primary_language: Some(truncate_field("Ruby".to_string())),
509        ..Default::default()
510    }
511}