Skip to main content

provenant/parsers/
conda.rs

1//! Parser for Conda/Anaconda package manifest files.
2//!
3//! Extracts package metadata and dependencies from Conda ecosystem manifest files
4//! supporting both recipe definitions and environment specifications.
5//!
6//! # Supported Formats
7//! - meta.yaml (Conda recipe metadata with Jinja2 templating support)
8//! - conda.yaml/environment.yml (Conda environment dependency specifications)
9//!
10//! # Key Features
11//! - YAML parsing for environment files
12//! - Dependency extraction from the `dependencies` (environment files) and `requirements` (meta.yaml) sections
13//! - Channel specification and platform detection
14//! - Version constraint parsing for Conda version specifiers
15//! - Package URL (purl) generation for conda packages
16//! - Limited meta.yaml support (note: Jinja2 templating not fully resolved)
17//!
18//! # Implementation Notes
19//! - Uses YAML parsing via `yaml_serde`
20//! - meta.yaml: Jinja2 templates not evaluated (use rendered YAML if available)
21//! - environment.yml: Full dependency specification support
22//! - Graceful error handling with `warn!()` logs
23//!
24//! # References
25//! - <https://docs.conda.io/projects/conda-build/en/latest/resources/define-metadata.html>
26//! - <https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html>
27
28use std::collections::HashMap;
29use std::fs;
30use std::path::Path;
31
32use crate::parser_warn as warn;
33use regex::Regex;
34use yaml_serde::Value;
35
36use crate::models::{DatasourceId, Dependency, PackageData, PackageType};
37
38use super::PackageParser;
39use super::license_normalization::{
40    DeclaredLicenseMatchMetadata, build_declared_license_data_from_pair,
41    normalize_spdx_declared_license,
42};
43
44fn default_package_data(datasource_id: Option<DatasourceId>) -> PackageData {
45    PackageData {
46        package_type: Some(CondaMetaYamlParser::PACKAGE_TYPE),
47        datasource_id,
48        ..Default::default()
49    }
50}
51
/// Build a Package URL (purl) string for a Conda or PyPI package.
///
/// - `conda`: `pkg:conda/[namespace/]name[@version]`
/// - `pypi`: `pkg:pypi/name[@version]` (the namespace is ignored)
/// - any other type: `pkg:<type>/name` (namespace and version are ignored)
///
/// The qualifier, subpath and extras arguments are accepted for signature
/// compatibility but are not used. The function always returns `Some`.
pub(crate) fn build_purl(
    package_type: &str,
    namespace: Option<&str>,
    name: &str,
    version: Option<&str>,
    _qualifiers: Option<&str>,
    _subpath: Option<&str>,
    _extras: Option<&str>,
) -> Option<String> {
    // "@<version>" when a version is known, otherwise the empty string.
    let version_suffix = version.map(|v| format!("@{}", v)).unwrap_or_default();

    let purl = match (package_type, namespace) {
        ("conda", Some(ns)) => format!("pkg:conda/{}/{}{}", ns, name, version_suffix),
        ("conda", None) => format!("pkg:conda/{}{}", name, version_suffix),
        ("pypi", _) => format!("pkg:pypi/{}{}", name, version_suffix),
        (other, _) => format!("pkg:{}/{}", other, name),
    };

    Some(purl)
}
84
85fn build_conda_package_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
86    let name = name?;
87    build_purl("conda", None, name, version, None, None, None)
88}
89
90fn yaml_value_to_string(value: &Value) -> Option<String> {
91    match value {
92        Value::String(s) => Some(s.clone()),
93        Value::Number(n) => Some(n.to_string()),
94        Value::Bool(b) => Some(b.to_string()),
95        _ => None,
96    }
97}
98
/// Extract the bare package name from a Conda requirement string.
///
/// Drops an optional `channel::` prefix, keeps only the first
/// whitespace-delimited token, and cuts that token at the first version
/// operator character (`=`, `<`, `>`, `!`, `~`). Returns `None` when nothing
/// is left.
fn extract_conda_requirement_name(req: &str) -> Option<String> {
    let trimmed = req.trim();
    if trimmed.is_empty() {
        return None;
    }

    // Everything after the last "::" is the actual requirement.
    let spec = match trimmed.rsplit_once("::") {
        Some((_, tail)) => tail,
        None => trimmed,
    };

    let first_token = spec.split_whitespace().next().unwrap_or(spec);
    let name_end = first_token
        .find(|c: char| matches!(c, '=' | '<' | '>' | '!' | '~'))
        .unwrap_or(first_token.len());
    let name = first_token[..name_end].trim();

    (!name.is_empty()).then(|| name.to_string())
}
122
123/// Conda recipe manifest (meta.yaml) parser.
124///
125/// Extracts package metadata and dependencies from Conda recipe files, which
126/// define how to build a Conda package. Handles Jinja2 templating used in
127/// recipe files for variable substitution.
128pub struct CondaMetaYamlParser;
129
130impl PackageParser for CondaMetaYamlParser {
131    const PACKAGE_TYPE: PackageType = PackageType::Conda;
132
133    fn is_match(path: &Path) -> bool {
134        // Match */meta.yaml following Python reference logic
135        path.file_name()
136            .is_some_and(|name| name == "meta.yaml" || name == "meta.yml")
137    }
138
139    fn extract_packages(path: &Path) -> Vec<PackageData> {
140        let contents = match fs::read_to_string(path) {
141            Ok(c) => c,
142            Err(e) => {
143                warn!("Failed to read {}: {}", path.display(), e);
144                return vec![default_package_data(Some(DatasourceId::CondaMetaYaml))];
145            }
146        };
147
148        // Extract Jinja2 variables and apply crude substitution
149        let variables = extract_jinja2_variables(&contents);
150        let processed_yaml = apply_jinja2_substitutions(&contents, &variables);
151
152        // Parse YAML after Jinja2 processing
153        let yaml: Value = match yaml_serde::from_str(&processed_yaml) {
154            Ok(y) => y,
155            Err(e) => {
156                warn!("Failed to parse YAML in {}: {}", path.display(), e);
157                return vec![default_package_data(Some(DatasourceId::CondaMetaYaml))];
158            }
159        };
160
161        let package_element = yaml.get("package").and_then(|v| v.as_mapping());
162        let name = package_element
163            .and_then(|p| p.get("name"))
164            .and_then(yaml_value_to_string);
165
166        let version = package_element
167            .and_then(|p| p.get("version"))
168            .and_then(yaml_value_to_string);
169
170        let source = yaml.get("source").and_then(|v| v.as_mapping());
171        let download_url = source
172            .and_then(|s| s.get("url"))
173            .and_then(|v| v.as_str())
174            .map(String::from);
175
176        let sha256 = source
177            .and_then(|s| s.get("sha256"))
178            .and_then(|v| v.as_str())
179            .map(String::from);
180
181        let about = yaml.get("about").and_then(|v| v.as_mapping());
182        let homepage_url = about
183            .and_then(|a| a.get("home"))
184            .and_then(|v| v.as_str())
185            .map(String::from);
186
187        let extracted_license_statement = about
188            .and_then(|a| a.get("license"))
189            .and_then(|v| v.as_str())
190            .map(String::from);
191        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
192            normalize_conda_declared_license(extracted_license_statement.as_deref());
193
194        let description = about
195            .and_then(|a| a.get("summary"))
196            .and_then(|v| v.as_str())
197            .map(String::from);
198
199        let vcs_url = about
200            .and_then(|a| a.get("dev_url"))
201            .and_then(|v| v.as_str())
202            .map(String::from);
203        let license_file = about
204            .and_then(|a| a.get("license_file"))
205            .and_then(|v| v.as_str())
206            .map(str::trim)
207            .filter(|value| !value.is_empty())
208            .map(String::from);
209
210        // Extract dependencies from requirements sections
211        let mut dependencies = Vec::new();
212        let mut extra_data: HashMap<String, serde_json::Value> = HashMap::new();
213
214        if let Some(requirements) = yaml.get("requirements").and_then(|v| v.as_mapping()) {
215            for (scope_key, reqs_value) in requirements {
216                let scope = scope_key.as_str().unwrap_or("unknown");
217                if let Some(reqs) = reqs_value.as_sequence() {
218                    for req in reqs {
219                        if let Some(req_str) = req.as_str()
220                            && let Some(dep) = parse_conda_requirement(req_str, scope)
221                        {
222                            // Filter out pip/python from dependencies, add to extra_data
223                            if extract_conda_requirement_name(req_str)
224                                .is_some_and(|n| n == "pip" || n == "python")
225                            {
226                                if let Some(arr) = extra_data
227                                    .entry(scope.to_string())
228                                    .or_insert_with(|| serde_json::Value::Array(vec![]))
229                                    .as_array_mut()
230                                {
231                                    arr.push(serde_json::Value::String(req_str.to_string()))
232                                }
233                            } else {
234                                dependencies.push(dep);
235                            }
236                        }
237                    }
238                }
239            }
240        }
241
242        let mut pkg = default_package_data(Some(DatasourceId::CondaMetaYaml));
243        pkg.package_type = Some(Self::PACKAGE_TYPE);
244        pkg.datasource_id = Some(DatasourceId::CondaMetaYaml);
245        pkg.name = name;
246        pkg.version = version;
247        pkg.purl = build_conda_package_purl(pkg.name.as_deref(), pkg.version.as_deref());
248        pkg.download_url = download_url;
249        pkg.homepage_url = homepage_url;
250        pkg.declared_license_expression = declared_license_expression;
251        pkg.declared_license_expression_spdx = declared_license_expression_spdx;
252        pkg.license_detections = license_detections;
253        pkg.extracted_license_statement = extracted_license_statement;
254        pkg.description = description;
255        pkg.vcs_url = vcs_url;
256        pkg.sha256 = sha256;
257        pkg.dependencies = dependencies;
258        if let Some(license_file) = license_file {
259            extra_data.insert(
260                "license_file".to_string(),
261                serde_json::Value::String(license_file),
262            );
263        }
264        if !extra_data.is_empty() {
265            pkg.extra_data = Some(extra_data);
266        }
267        vec![pkg]
268    }
269}
270
271fn normalize_conda_declared_license(
272    statement: Option<&str>,
273) -> (
274    Option<String>,
275    Option<String>,
276    Vec<crate::models::LicenseDetection>,
277) {
278    match statement.map(str::trim).filter(|value| !value.is_empty()) {
279        Some("Apache Software") => build_declared_license_data_from_pair(
280            "apache-2.0",
281            "Apache-2.0",
282            DeclaredLicenseMatchMetadata::single_line("Apache Software"),
283        ),
284        Some("BSD-3-Clause") => build_declared_license_data_from_pair(
285            "bsd-new",
286            "BSD-3-Clause",
287            DeclaredLicenseMatchMetadata::single_line("BSD-3-Clause"),
288        ),
289        other => normalize_spdx_declared_license(other),
290    }
291}
292
293/// Conda environment file (environment.yml, conda.yaml) parser.
294///
295/// Extracts dependencies from Conda environment files used to define reproducible
296/// environments. Supports both Conda and pip dependencies, with channel specifications.
297pub struct CondaEnvironmentYmlParser;
298
299impl PackageParser for CondaEnvironmentYmlParser {
300    const PACKAGE_TYPE: PackageType = PackageType::Conda;
301
302    fn is_match(path: &Path) -> bool {
303        // Python reference: path_patterns = ('*conda*.yaml', '*env*.yaml', '*environment*.yaml')
304        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
305            let lower = name.to_lowercase();
306            (lower.contains("conda") || lower.contains("env") || lower.contains("environment"))
307                && (lower.ends_with(".yaml") || lower.ends_with(".yml"))
308        } else {
309            false
310        }
311    }
312
313    fn extract_packages(path: &Path) -> Vec<PackageData> {
314        let contents = match fs::read_to_string(path) {
315            Ok(c) => c,
316            Err(e) => {
317                warn!("Failed to read {}: {}", path.display(), e);
318                return vec![default_package_data(Some(DatasourceId::CondaYaml))];
319            }
320        };
321
322        let yaml: Value = match yaml_serde::from_str(&contents) {
323            Ok(y) => y,
324            Err(e) => {
325                warn!("Failed to parse YAML in {}: {}", path.display(), e);
326                return vec![default_package_data(Some(DatasourceId::CondaYaml))];
327            }
328        };
329
330        let name = yaml.get("name").and_then(|v| v.as_str()).map(String::from);
331
332        let dependencies = extract_environment_dependencies(&yaml);
333
334        let mut extra_data = HashMap::new();
335        if let Some(channels) = yaml.get("channels").and_then(|v| v.as_sequence()) {
336            let channels_vec: Vec<String> = channels
337                .iter()
338                .filter_map(|c| c.as_str().map(String::from))
339                .collect();
340            if !channels_vec.is_empty() {
341                extra_data.insert("channels".to_string(), serde_json::json!(channels_vec));
342            }
343        }
344
345        // Environment files are private (not published packages)
346        let mut pkg = default_package_data(Some(DatasourceId::CondaYaml));
347        pkg.package_type = Some(Self::PACKAGE_TYPE);
348        pkg.datasource_id = Some(DatasourceId::CondaYaml);
349        pkg.name = name;
350        pkg.purl = build_conda_package_purl(pkg.name.as_deref(), pkg.version.as_deref());
351        pkg.primary_language = Some("Python".to_string());
352        pkg.dependencies = dependencies;
353        pkg.is_private = true;
354        if !extra_data.is_empty() {
355            pkg.extra_data = Some(extra_data);
356        }
357        vec![pkg]
358    }
359}
360
/// Extract Jinja2 `{% set name = value %}` variables from a Conda meta.yaml.
///
/// Only single-line `set` statements are recognised. Whitespace-control
/// markers (`{%-` / `-%}`) are accepted, surrounding quotes are stripped from
/// the value, and other block tags such as `{% if x == 1 %}` are ignored.
/// When a variable is assigned more than once, the last assignment wins.
///
/// Example:
/// ```ignore
/// {% set version = "0.45.0" %}
/// {% set sha256 = "abc123..." %}
/// ```
pub fn extract_jinja2_variables(content: &str) -> HashMap<String, String> {
    content
        .lines()
        .filter_map(parse_jinja2_set_statement)
        .collect()
}

/// Parse one line as a `{% set key = value %}` statement, returning `(key, value)`.
fn parse_jinja2_set_statement(line: &str) -> Option<(String, String)> {
    let statement = line.trim().strip_prefix("{%")?.strip_suffix("%}")?;

    // Drop Jinja2 whitespace-control markers: `{%- ... -%}`.
    let statement = statement.strip_prefix('-').unwrap_or(statement);
    let statement = statement.strip_suffix('-').unwrap_or(statement);

    // Require the `set` keyword followed by whitespace, so that tags like
    // `{% if a == b %}` or a name such as `settings` are not mistaken for it.
    let assignment = statement.trim().strip_prefix("set")?;
    if !assignment.starts_with(char::is_whitespace) {
        return None;
    }

    let (key, value) = assignment.split_once('=')?;
    let key = key.trim();
    if key.is_empty() {
        return None;
    }

    let value = value.trim().trim_matches('"').trim_matches('\'');
    Some((key.to_string(), value.to_string()))
}
393
/// Apply Jinja2 variable substitutions to YAML content.
///
/// Supports:
/// - `{{ variable }}` - Simple substitution (any spacing inside the braces)
/// - `{{ variable|lower }}` - Lowercase filter (spaces around `|` allowed)
///
/// Single-line `{% ... %}` statements containing `=` are dropped, because
/// their variables are expected to have been extracted already. A line that
/// still contains `{{` after substitution (unknown variable, unsupported
/// filter or expression) is dropped entirely so it cannot break YAML parsing.
/// The surviving lines are joined with `\n`.
pub fn apply_jinja2_substitutions(content: &str, variables: &HashMap<String, String>) -> String {
    let mut rendered_lines = Vec::new();

    for line in content.lines() {
        let trimmed = line.trim();

        // Skip Jinja2 set statements (already extracted)
        if trimmed.starts_with("{%") && trimmed.ends_with("%}") && trimmed.contains('=') {
            continue;
        }

        let rendered = render_jinja2_line(line, variables);

        // Skip lines with unresolved Jinja2 templates (complex expressions we can't handle)
        if rendered.contains("{{") {
            continue;
        }

        rendered_lines.push(rendered);
    }

    rendered_lines.join("\n")
}

/// Expand the `{{ ... }}` expressions of one line, re-expanding a bounded
/// number of times so that values which reference other variables resolve
/// deterministically.
fn render_jinja2_line(line: &str, variables: &HashMap<String, String>) -> String {
    // Upper bound on re-expansion; also guards against self-referencing values.
    const MAX_PASSES: usize = 8;

    let mut current = line.to_string();
    for _ in 0..MAX_PASSES {
        if !current.contains("{{") {
            break;
        }
        let next = expand_jinja2_expressions(&current, variables);
        if next == current {
            break;
        }
        current = next;
    }
    current
}

/// Replace every resolvable `{{ expr }}` in `line` once; unresolvable
/// expressions are copied through unchanged.
fn expand_jinja2_expressions(line: &str, variables: &HashMap<String, String>) -> String {
    let mut output = String::with_capacity(line.len());
    let mut remaining = line;

    while let Some(open) = remaining.find("{{") {
        let after_open = &remaining[open + 2..];
        let Some(close) = after_open.find("}}") else {
            // Unterminated expression: leave the rest of the line untouched.
            break;
        };

        let expression = &after_open[..close];
        output.push_str(&remaining[..open]);
        match evaluate_jinja2_expression(expression, variables) {
            Some(value) => output.push_str(&value),
            None => {
                output.push_str("{{");
                output.push_str(expression);
                output.push_str("}}");
            }
        }
        remaining = &after_open[close + 2..];
    }

    output.push_str(remaining);
    output
}

/// Evaluate `name` or `name|lower`; returns `None` for an unknown variable or
/// an unsupported filter.
fn evaluate_jinja2_expression(
    expression: &str,
    variables: &HashMap<String, String>,
) -> Option<String> {
    let mut parts = expression.split('|').map(str::trim);
    let name = parts.next()?;
    let mut value = variables.get(name)?.clone();

    for filter in parts {
        match filter {
            "lower" => value = value.to_lowercase(),
            _ => return None,
        }
    }

    Some(value)
}
438
439/// Parse a Conda requirement string into a Dependency
440///
441/// Format examples:
442/// - `mccortex ==1.0` - Pinned version with space before operator
443/// - `python >=3.6` - Version constraint
444/// - `conda-forge::numpy=1.15.4` - Namespace and pinned version (no space)
445/// - `bwa` - No version specified
446pub fn parse_conda_requirement(req: &str, scope: &str) -> Option<Dependency> {
447    let req = req.trim();
448
449    // Handle namespace prefix (conda-forge::package)
450    let (namespace, channel_url, req_without_ns) = parse_conda_channel_prefix(req);
451
452    // Split on first space to separate name from version constraint
453    let (name_part, version_constraint) =
454        if let Some((name, constraint)) = req_without_ns.split_once(' ') {
455            (name.trim(), Some(constraint.trim()))
456        } else {
457            (req_without_ns, None)
458        };
459
460    // Check for pinned version with `=` (no space): package=1.0
461    let (name, version, is_pinned, extracted_requirement) = if name_part.contains('=') {
462        let parts: Vec<&str> = name_part.splitn(2, '=').collect();
463        let n = parts[0].trim();
464        let v = if parts.len() > 1 {
465            let parsed = parts[1].trim();
466            if parsed.is_empty() {
467                None
468            } else {
469                Some(parsed.to_string())
470            }
471        } else {
472            None
473        };
474        let req = v
475            .as_ref()
476            .map(|ver| format!("={}", ver))
477            .unwrap_or_default();
478        (n, v, true, Some(req))
479    } else if let Some(constraint) = version_constraint {
480        // Handle space-separated constraints: package >=3.6, package ==1.0
481        let version_opt = if constraint.starts_with("==") {
482            Some(constraint.trim_start_matches("==").trim().to_string())
483        } else {
484            None
485        };
486        (
487            name_part.trim(),
488            version_opt,
489            false,
490            Some(constraint.to_string()),
491        )
492    } else {
493        (name_part.trim(), None, false, Some(String::new()))
494    };
495
496    // Build PURL
497    let purl = build_purl(
498        "conda",
499        namespace,
500        name,
501        version.as_deref(),
502        None,
503        None,
504        None,
505    );
506
507    // Determine is_runtime and is_optional based on scope
508    let (is_runtime, is_optional) = match scope {
509        "run" => (true, false),
510        _ => (false, true), // build, host, test are all optional
511    };
512
513    let mut extra_data = HashMap::new();
514    if let Some(namespace) = namespace {
515        extra_data.insert("channel".to_string(), serde_json::json!(namespace));
516    }
517    if let Some(channel_url) = channel_url {
518        extra_data.insert("channel_url".to_string(), serde_json::json!(channel_url));
519    }
520
521    Some(Dependency {
522        purl,
523        extracted_requirement,
524        scope: Some(scope.to_string()),
525        is_runtime: Some(is_runtime),
526        is_optional: Some(is_optional),
527        is_pinned: Some(is_pinned),
528        is_direct: Some(true),
529        resolved_package: None,
530        extra_data: (!extra_data.is_empty()).then_some(extra_data),
531    })
532}
533
534fn extract_environment_dependencies(yaml: &Value) -> Vec<Dependency> {
535    let dependencies = match yaml.get("dependencies").and_then(|v| v.as_sequence()) {
536        Some(d) => d,
537        None => return Vec::new(),
538    };
539
540    let mut deps = Vec::new();
541    for dep_value in dependencies {
542        if let Some(dep_str) = dep_value.as_str() {
543            if let Some(dep) = parse_environment_string_dependency(dep_str) {
544                deps.push(dep);
545            }
546        } else if let Some(pip_deps) = dep_value.get("pip").and_then(|v| v.as_sequence()) {
547            deps.extend(extract_pip_dependencies(pip_deps));
548        }
549    }
550    deps
551}
552
553fn parse_environment_string_dependency(dep_str: &str) -> Option<Dependency> {
554    let (namespace, channel_url, dep_without_ns) = parse_conda_channel_prefix(dep_str);
555    create_conda_dependency(namespace, channel_url, dep_without_ns, "dependencies")
556}
557
/// Split an optional `channel::` prefix off a Conda requirement.
///
/// Returns `(namespace, channel_url, remainder)`. The split happens at the
/// last `::`. A prefix containing `/` or `:` is reported as a channel URL,
/// any other prefix as a namespace. Without `::` the input is returned
/// unchanged as the remainder.
fn parse_conda_channel_prefix(dep_str: &str) -> (Option<&str>, Option<&str>, &str) {
    match dep_str.rsplit_once("::") {
        None => (None, None, dep_str),
        Some((prefix, spec)) => {
            let looks_like_url = prefix.chars().any(|c| c == '/' || c == ':');
            if looks_like_url {
                (None, Some(prefix), spec)
            } else {
                (Some(prefix), None, spec)
            }
        }
    }
}
569
570fn create_conda_dependency(
571    namespace: Option<&str>,
572    channel_url: Option<&str>,
573    dep_without_ns: &str,
574    scope: &str,
575) -> Option<Dependency> {
576    let dep = dep_without_ns.trim();
577    let name_re = match Regex::new(r"^([A-Za-z0-9_.\-]+)") {
578        Ok(re) => re,
579        Err(_) => return None,
580    };
581
582    let caps = name_re.captures(dep)?;
583    let name_match = caps.get(1)?;
584    let name = name_match.as_str().trim();
585    let rest = dep[name_match.end()..].trim();
586
587    let (version, is_pinned, extracted_requirement) = if rest.is_empty() {
588        (None, false, Some(String::new()))
589    } else {
590        let req_no_space = rest.replace(' ', "");
591        let is_exact = req_no_space.starts_with("=") || req_no_space.starts_with("==");
592        let parsed_version = if is_exact {
593            Some(
594                req_no_space
595                    .trim_start_matches('=')
596                    .trim_start_matches('=')
597                    .to_string(),
598            )
599        } else {
600            None
601        };
602
603        (parsed_version, is_exact, Some(rest.to_string()))
604    };
605
606    if name == "pip" || name == "python" {
607        return None;
608    }
609
610    let purl = build_purl(
611        "conda",
612        namespace,
613        name,
614        version.as_deref(),
615        None,
616        None,
617        None,
618    );
619    let mut extra_data = HashMap::new();
620    if let Some(namespace) = namespace {
621        extra_data.insert("channel".to_string(), serde_json::json!(namespace));
622    }
623    if let Some(channel_url) = channel_url {
624        extra_data.insert("channel_url".to_string(), serde_json::json!(channel_url));
625    }
626
627    Some(Dependency {
628        purl,
629        extracted_requirement,
630        scope: Some(scope.to_string()),
631        is_runtime: Some(true),
632        is_optional: Some(false),
633        is_pinned: Some(is_pinned),
634        is_direct: Some(true),
635        resolved_package: None,
636        extra_data: (!extra_data.is_empty()).then_some(extra_data),
637    })
638}
639
640fn extract_pip_dependencies(pip_deps: &[Value]) -> Vec<Dependency> {
641    pip_deps
642        .iter()
643        .filter_map(|pip_dep| {
644            if let Some(pip_req_str) = pip_dep.as_str()
645                && let Ok(parsed_req) = pip_req_str.parse::<pep508_rs::Requirement>()
646            {
647                create_pip_dependency(parsed_req, "dependencies", Some(pip_req_str))
648            } else {
649                None
650            }
651        })
652        .collect()
653}
654
655fn create_pip_dependency(
656    parsed_req: pep508_rs::Requirement,
657    scope: &str,
658    raw_requirement: Option<&str>,
659) -> Option<Dependency> {
660    let name = parsed_req.name.to_string();
661
662    if name == "pip" || name == "python" {
663        return None;
664    }
665
666    let specs = parsed_req.version_or_url.as_ref().map(|v| match v {
667        pep508_rs::VersionOrUrl::VersionSpecifier(spec) => spec.to_string(),
668        pep508_rs::VersionOrUrl::Url(url) => url.to_string(),
669    });
670
671    let extracted_requirement = if let Some(raw) = raw_requirement {
672        let raw = raw.trim();
673        let suffix = raw.strip_prefix(&name).unwrap_or(raw).trim().to_string();
674        Some(suffix)
675    } else {
676        Some(specs.clone().unwrap_or_default())
677    };
678
679    let version = specs.as_ref().and_then(|spec_str| {
680        if spec_str.starts_with("==") {
681            Some(spec_str.trim_start_matches("==").to_string())
682        } else {
683            None
684        }
685    });
686
687    let is_pinned = specs.as_ref().map(|s| s.contains("==")).unwrap_or(false);
688    let purl = build_purl("pypi", None, &name, version.as_deref(), None, None, None);
689
690    Some(Dependency {
691        purl,
692        extracted_requirement,
693        scope: Some(scope.to_string()),
694        is_runtime: Some(true),
695        is_optional: Some(false),
696        is_pinned: Some(is_pinned),
697        is_direct: Some(true),
698        resolved_package: None,
699        extra_data: None,
700    })
701}
702
// Parser registration metadata: description, file-name glob patterns, package
// type ("conda"), primary language ("Python") and a documentation URL.
// NOTE(review): the glob list presumably mirrors the `is_match` checks of both
// parsers in this file; the specific names (`**/env.yaml`, `**/conda.yml`, ...)
// are also covered by the wildcard patterns below them — confirm against the
// `register_parser!` definition before pruning.
crate::register_parser!(
    "Conda package manifest and environment file",
    &[
        "**/meta.yaml",
        "**/meta.yml",
        "**/environment.yml",
        "**/environment.yaml",
        "**/env.yaml",
        "**/env.yml",
        "**/conda.yaml",
        "**/conda.yml",
        "**/*conda*.yaml",
        "**/*conda*.yml",
        "**/*env*.yaml",
        "**/*env*.yml",
        "**/*environment*.yaml",
        "**/*environment*.yml"
    ],
    "conda",
    "Python",
    Some("https://docs.conda.io/"),
);