// provenant/parsers/conda.rs

//! Parser for Conda/Anaconda package manifest files.
//!
//! Extracts package metadata and dependencies from Conda ecosystem manifest files,
//! supporting both recipe definitions and environment specifications.
//!
//! # Supported Formats
//! - meta.yaml / meta.yml (Conda recipe metadata, with limited Jinja2 templating support)
//! - conda.yaml / environment.yml (Conda environment dependency specifications)
//!
//! # Key Features
//! - YAML parsing for recipe and environment files
//! - Dependency extraction from the recipe `requirements` sections and from the
//!   environment `dependencies` list (including nested `pip` entries)
//! - Channel specification handling (`channels` list and `channel::package` prefixes);
//!   platform detection is not implemented in this module
//! - Version constraint parsing for Conda version specifiers
//! - Package URL (purl) generation for conda and pypi packages
//! - Limited meta.yaml support (note: Jinja2 templating is not fully resolved)
//!
//! # Implementation Notes
//! - Uses YAML parsing via the `serde_yaml` crate
//! - meta.yaml: Jinja2 templates are not evaluated; only `{% set ... %}` variables and
//!   simple `{{ variable }}` / `{{ variable|lower }}` substitutions are applied, and
//!   lines that still contain `{{` afterwards are dropped (use rendered YAML if available)
//! - environment.yml: Full dependency specification support
//! - Graceful error handling with `warn!()` logs
//!
//! # References
//! - <https://docs.conda.io/projects/conda-build/en/latest/resources/define-metadata.html>
//! - <https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html>
27
28use std::collections::HashMap;
29use std::fs;
30use std::path::Path;
31
32use log::warn;
33use regex::Regex;
34use serde_yaml::Value;
35
36use crate::models::{DatasourceId, Dependency, PackageData, PackageType};
37
38use super::PackageParser;
39
40fn default_package_data(datasource_id: Option<DatasourceId>) -> PackageData {
41    PackageData {
42        package_type: Some(CondaMetaYamlParser::PACKAGE_TYPE),
43        datasource_id,
44        ..Default::default()
45    }
46}
47
/// Builds a Package URL (purl) of the form `pkg:<type>/[<namespace>/]<name>[@<version>]`.
///
/// # Arguments
/// * `package_type` - purl type such as `"conda"` or `"pypi"`; any other value is
///   rendered with the same layout.
/// * `namespace` - optional namespace (for Conda, the channel). Ignored for `"pypi"`.
///   An empty string is treated as absent.
/// * `name` - package name, inserted verbatim (no percent-encoding is applied).
/// * `version` - optional version. An empty string is treated as absent so the
///   result never ends in a dangling `@`.
/// * `_qualifiers`, `_subpath`, `_extras` - accepted for call-site compatibility;
///   currently unused.
///
/// # Returns
/// Always `Some(purl)`; the `Option` wrapper is kept for existing callers.
pub(crate) fn build_purl(
    package_type: &str,
    namespace: Option<&str>,
    name: &str,
    version: Option<&str>,
    _qualifiers: Option<&str>,
    _subpath: Option<&str>,
    _extras: Option<&str>,
) -> Option<String> {
    // PyPI purls carry no namespace, so one supplied by the caller is dropped.
    let namespace = if package_type == "pypi" {
        None
    } else {
        namespace.filter(|ns| !ns.is_empty())
    };
    let version = version.filter(|v| !v.is_empty());

    let mut purl = format!("pkg:{}/", package_type);
    if let Some(ns) = namespace {
        purl.push_str(ns);
        purl.push('/');
    }
    purl.push_str(name);
    if let Some(v) = version {
        purl.push('@');
        purl.push_str(v);
    }
    Some(purl)
}
80
81fn build_conda_package_purl(name: Option<&str>, version: Option<&str>) -> Option<String> {
82    let name = name?;
83    build_purl("conda", None, name, version, None, None, None)
84}
85
86fn yaml_value_to_string(value: &Value) -> Option<String> {
87    match value {
88        Value::String(s) => Some(s.clone()),
89        Value::Number(n) => Some(n.to_string()),
90        Value::Bool(b) => Some(b.to_string()),
91        _ => None,
92    }
93}
94
/// Extracts the bare package name from a Conda requirement string.
///
/// Everything up to the last `::` (a channel prefix) is discarded, then the first
/// whitespace-delimited token is cut at the first `=`, `<`, `>`, `!` or `~`.
/// Returns `None` when the input is blank or no name remains.
fn extract_conda_requirement_name(req: &str) -> Option<String> {
    let trimmed = req.trim();
    if trimmed.is_empty() {
        return None;
    }

    let spec = match trimmed.rsplit_once("::") {
        Some((_, tail)) => tail,
        None => trimmed,
    };

    // With no token at all (blank spec) fall back to the spec itself, which trims to "".
    let first_token = spec.split_whitespace().next().unwrap_or(spec);
    let name_end = first_token
        .find(|c: char| matches!(c, '=' | '<' | '>' | '!' | '~'))
        .unwrap_or(first_token.len());
    let name = first_token[..name_end].trim();

    (!name.is_empty()).then(|| name.to_string())
}
118
119/// Conda recipe manifest (meta.yaml) parser.
120///
121/// Extracts package metadata and dependencies from Conda recipe files, which
122/// define how to build a Conda package. Handles Jinja2 templating used in
123/// recipe files for variable substitution.
124pub struct CondaMetaYamlParser;
125
126impl PackageParser for CondaMetaYamlParser {
127    const PACKAGE_TYPE: PackageType = PackageType::Conda;
128
129    fn is_match(path: &Path) -> bool {
130        // Match */meta.yaml following Python reference logic
131        path.file_name()
132            .is_some_and(|name| name == "meta.yaml" || name == "meta.yml")
133    }
134
135    fn extract_packages(path: &Path) -> Vec<PackageData> {
136        let contents = match fs::read_to_string(path) {
137            Ok(c) => c,
138            Err(e) => {
139                warn!("Failed to read {}: {}", path.display(), e);
140                return vec![default_package_data(Some(DatasourceId::CondaMetaYaml))];
141            }
142        };
143
144        // Extract Jinja2 variables and apply crude substitution
145        let variables = extract_jinja2_variables(&contents);
146        let processed_yaml = apply_jinja2_substitutions(&contents, &variables);
147
148        // Parse YAML after Jinja2 processing
149        let yaml: Value = match serde_yaml::from_str(&processed_yaml) {
150            Ok(y) => y,
151            Err(e) => {
152                warn!("Failed to parse YAML in {}: {}", path.display(), e);
153                return vec![default_package_data(Some(DatasourceId::CondaMetaYaml))];
154            }
155        };
156
157        let package_element = yaml.get("package").and_then(|v| v.as_mapping());
158        let name = package_element
159            .and_then(|p| p.get("name"))
160            .and_then(yaml_value_to_string);
161
162        let version = package_element
163            .and_then(|p| p.get("version"))
164            .and_then(yaml_value_to_string);
165
166        let source = yaml.get("source").and_then(|v| v.as_mapping());
167        let download_url = source
168            .and_then(|s| s.get("url"))
169            .and_then(|v| v.as_str())
170            .map(String::from);
171
172        let sha256 = source
173            .and_then(|s| s.get("sha256"))
174            .and_then(|v| v.as_str())
175            .map(String::from);
176
177        let about = yaml.get("about").and_then(|v| v.as_mapping());
178        let homepage_url = about
179            .and_then(|a| a.get("home"))
180            .and_then(|v| v.as_str())
181            .map(String::from);
182
183        let extracted_license_statement = about
184            .and_then(|a| a.get("license"))
185            .and_then(|v| v.as_str())
186            .map(String::from);
187
188        let description = about
189            .and_then(|a| a.get("summary"))
190            .and_then(|v| v.as_str())
191            .map(String::from);
192
193        let vcs_url = about
194            .and_then(|a| a.get("dev_url"))
195            .and_then(|v| v.as_str())
196            .map(String::from);
197
198        // Extract dependencies from requirements sections
199        let mut dependencies = Vec::new();
200        let mut extra_data: HashMap<String, serde_json::Value> = HashMap::new();
201
202        if let Some(requirements) = yaml.get("requirements").and_then(|v| v.as_mapping()) {
203            for (scope_key, reqs_value) in requirements {
204                let scope = scope_key.as_str().unwrap_or("unknown");
205                if let Some(reqs) = reqs_value.as_sequence() {
206                    for req in reqs {
207                        if let Some(req_str) = req.as_str()
208                            && let Some(dep) = parse_conda_requirement(req_str, scope)
209                        {
210                            // Filter out pip/python from dependencies, add to extra_data
211                            if extract_conda_requirement_name(req_str)
212                                .is_some_and(|n| n == "pip" || n == "python")
213                            {
214                                if let Some(arr) = extra_data
215                                    .entry(scope.to_string())
216                                    .or_insert_with(|| serde_json::Value::Array(vec![]))
217                                    .as_array_mut()
218                                {
219                                    arr.push(serde_json::Value::String(req_str.to_string()))
220                                }
221                            } else {
222                                dependencies.push(dep);
223                            }
224                        }
225                    }
226                }
227            }
228        }
229
230        let mut pkg = default_package_data(Some(DatasourceId::CondaMetaYaml));
231        pkg.package_type = Some(Self::PACKAGE_TYPE);
232        pkg.datasource_id = Some(DatasourceId::CondaMetaYaml);
233        pkg.name = name;
234        pkg.version = version;
235        pkg.purl = build_conda_package_purl(pkg.name.as_deref(), pkg.version.as_deref());
236        pkg.download_url = download_url;
237        pkg.homepage_url = homepage_url;
238        pkg.extracted_license_statement = extracted_license_statement;
239        pkg.description = description;
240        pkg.vcs_url = vcs_url;
241        pkg.sha256 = sha256;
242        pkg.dependencies = dependencies;
243        if !extra_data.is_empty() {
244            pkg.extra_data = Some(extra_data);
245        }
246        vec![pkg]
247    }
248}
249
250/// Conda environment file (environment.yml, conda.yaml) parser.
251///
252/// Extracts dependencies from Conda environment files used to define reproducible
253/// environments. Supports both Conda and pip dependencies, with channel specifications.
254pub struct CondaEnvironmentYmlParser;
255
256impl PackageParser for CondaEnvironmentYmlParser {
257    const PACKAGE_TYPE: PackageType = PackageType::Conda;
258
259    fn is_match(path: &Path) -> bool {
260        // Python reference: path_patterns = ('*conda*.yaml', '*env*.yaml', '*environment*.yaml')
261        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
262            let lower = name.to_lowercase();
263            (lower.contains("conda") || lower.contains("env") || lower.contains("environment"))
264                && (lower.ends_with(".yaml") || lower.ends_with(".yml"))
265        } else {
266            false
267        }
268    }
269
270    fn extract_packages(path: &Path) -> Vec<PackageData> {
271        let contents = match fs::read_to_string(path) {
272            Ok(c) => c,
273            Err(e) => {
274                warn!("Failed to read {}: {}", path.display(), e);
275                return vec![default_package_data(Some(DatasourceId::CondaYaml))];
276            }
277        };
278
279        let yaml: Value = match serde_yaml::from_str(&contents) {
280            Ok(y) => y,
281            Err(e) => {
282                warn!("Failed to parse YAML in {}: {}", path.display(), e);
283                return vec![default_package_data(Some(DatasourceId::CondaYaml))];
284            }
285        };
286
287        let name = yaml.get("name").and_then(|v| v.as_str()).map(String::from);
288
289        let dependencies = extract_environment_dependencies(&yaml);
290
291        let mut extra_data = HashMap::new();
292        if let Some(channels) = yaml.get("channels").and_then(|v| v.as_sequence()) {
293            let channels_vec: Vec<String> = channels
294                .iter()
295                .filter_map(|c| c.as_str().map(String::from))
296                .collect();
297            if !channels_vec.is_empty() {
298                extra_data.insert("channels".to_string(), serde_json::json!(channels_vec));
299            }
300        }
301
302        // Environment files are private (not published packages)
303        let mut pkg = default_package_data(Some(DatasourceId::CondaYaml));
304        pkg.package_type = Some(Self::PACKAGE_TYPE);
305        pkg.datasource_id = Some(DatasourceId::CondaYaml);
306        pkg.name = name;
307        pkg.purl = build_conda_package_purl(pkg.name.as_deref(), pkg.version.as_deref());
308        pkg.primary_language = Some("Python".to_string());
309        pkg.dependencies = dependencies;
310        pkg.is_private = true;
311        if !extra_data.is_empty() {
312            pkg.extra_data = Some(extra_data);
313        }
314        vec![pkg]
315    }
316}
317
/// Extracts Jinja2 `{% set name = value %}` assignments from a Conda meta.yaml.
///
/// Only lines that consist solely of a `set` statement are considered. Whitespace
/// control markers (`{%-`, `-%}`, `{%+`, `+%}`) are tolerated, and surrounding
/// single or double quotes are stripped from the value. Other statements that
/// merely contain `=` (for example `{% if a == b %}`) are ignored. When a name is
/// assigned more than once, the last assignment wins.
///
/// Example:
/// ```ignore
/// {% set version = "0.45.0" %}
/// {%- set sha256 = "abc123..." -%}
/// ```
pub fn extract_jinja2_variables(content: &str) -> HashMap<String, String> {
    content
        .lines()
        .filter_map(parse_jinja2_set_statement)
        .collect()
}

/// Parses a single line as a `{% set key = value %}` statement, returning `(key, value)`.
fn parse_jinja2_set_statement(line: &str) -> Option<(String, String)> {
    let inner = line.trim().strip_prefix("{%")?.strip_suffix("%}")?;

    // Drop Jinja2 whitespace-control markers, which sit directly against the delimiters.
    let inner = inner.strip_prefix(['-', '+']).unwrap_or(inner);
    let inner = inner.strip_suffix(['-', '+']).unwrap_or(inner);

    // Require the `set` keyword followed by whitespace so that names such as
    // `settings` are not mistaken for the keyword.
    let assignment = inner.trim().strip_prefix("set")?;
    if !assignment.starts_with(char::is_whitespace) {
        return None;
    }

    let (key, value) = assignment.split_once('=')?;
    let key = key.trim();
    if key.is_empty() {
        return None;
    }
    let value = value.trim().trim_matches('"').trim_matches('\'');

    Some((key.to_string(), value.to_string()))
}
350
/// Applies Jinja2 variable substitutions to YAML content.
///
/// Supports, with any amount of whitespace inside the braces:
/// - `{{ variable }}` - simple substitution
/// - `{{ variable|lower }}` / `{{ variable|upper }}` - case filters (may be chained)
///
/// Statement lines of the form `{% ... = ... %}` (the `set` statements) are removed.
/// Any line that still contains `{{` after substitution (an unknown variable, an
/// unsupported filter or a complex expression) is dropped entirely so that the
/// remaining text stays parseable as YAML. Lines are re-joined with `\n`.
///
/// Each line is scanned once from left to right, so the result does not depend on
/// the iteration order of `variables`, and substituted values are never re-scanned.
pub fn apply_jinja2_substitutions(content: &str, variables: &HashMap<String, String>) -> String {
    let mut kept = Vec::new();

    for line in content.lines() {
        let trimmed = line.trim();

        // Skip Jinja2 set statements (their values are expected in `variables`).
        if trimmed.starts_with("{%") && trimmed.ends_with("%}") && trimmed.contains('=') {
            continue;
        }

        let rendered = render_jinja2_line(line, variables);

        // Skip lines with unresolved Jinja2 templates (expressions we can't handle).
        if rendered.contains("{{") {
            continue;
        }

        kept.push(rendered);
    }

    kept.join("\n")
}

/// Replaces every resolvable `{{ ... }}` expression in `line`; unresolved ones are
/// copied through unchanged so the caller can detect them.
fn render_jinja2_line(line: &str, variables: &HashMap<String, String>) -> String {
    let mut out = String::with_capacity(line.len());
    let mut rest = line;

    while let Some(open) = rest.find("{{") {
        let after_open = &rest[open + 2..];
        let close = match after_open.find("}}") {
            Some(idx) => idx,
            // Unterminated expression: leave the remainder untouched.
            None => break,
        };

        out.push_str(&rest[..open]);
        let expression = &after_open[..close];
        match resolve_jinja2_expression(expression, variables) {
            Some(value) => out.push_str(&value),
            None => {
                out.push_str("{{");
                out.push_str(expression);
                out.push_str("}}");
            }
        }
        rest = &after_open[close + 2..];
    }

    out.push_str(rest);
    out
}

/// Evaluates `name` or `name|filter|...` against `variables`; `None` if the variable
/// is unknown or a filter is not supported.
fn resolve_jinja2_expression(
    expression: &str,
    variables: &HashMap<String, String>,
) -> Option<String> {
    let mut parts = expression.split('|').map(str::trim);
    let name = parts.next()?;
    let mut value = variables.get(name)?.clone();

    for filter in parts {
        value = match filter {
            "lower" => value.to_lowercase(),
            "upper" => value.to_uppercase(),
            _ => return None,
        };
    }

    Some(value)
}
395
396/// Parse a Conda requirement string into a Dependency
397///
398/// Format examples:
399/// - `mccortex ==1.0` - Pinned version with space before operator
400/// - `python >=3.6` - Version constraint
401/// - `conda-forge::numpy=1.15.4` - Namespace and pinned version (no space)
402/// - `bwa` - No version specified
403pub fn parse_conda_requirement(req: &str, scope: &str) -> Option<Dependency> {
404    let req = req.trim();
405
406    // Handle namespace prefix (conda-forge::package)
407    let (namespace, channel_url, req_without_ns) = parse_conda_channel_prefix(req);
408
409    // Split on first space to separate name from version constraint
410    let (name_part, version_constraint) =
411        if let Some((name, constraint)) = req_without_ns.split_once(' ') {
412            (name.trim(), Some(constraint.trim()))
413        } else {
414            (req_without_ns, None)
415        };
416
417    // Check for pinned version with `=` (no space): package=1.0
418    let (name, version, is_pinned, extracted_requirement) = if name_part.contains('=') {
419        let parts: Vec<&str> = name_part.splitn(2, '=').collect();
420        let n = parts[0].trim();
421        let v = if parts.len() > 1 {
422            let parsed = parts[1].trim();
423            if parsed.is_empty() {
424                None
425            } else {
426                Some(parsed.to_string())
427            }
428        } else {
429            None
430        };
431        let req = v
432            .as_ref()
433            .map(|ver| format!("={}", ver))
434            .unwrap_or_default();
435        (n, v, true, Some(req))
436    } else if let Some(constraint) = version_constraint {
437        // Handle space-separated constraints: package >=3.6, package ==1.0
438        let version_opt = if constraint.starts_with("==") {
439            Some(constraint.trim_start_matches("==").trim().to_string())
440        } else {
441            None
442        };
443        (
444            name_part.trim(),
445            version_opt,
446            false,
447            Some(constraint.to_string()),
448        )
449    } else {
450        (name_part.trim(), None, false, Some(String::new()))
451    };
452
453    // Build PURL
454    let purl = build_purl(
455        "conda",
456        namespace,
457        name,
458        version.as_deref(),
459        None,
460        None,
461        None,
462    );
463
464    // Determine is_runtime and is_optional based on scope
465    let (is_runtime, is_optional) = match scope {
466        "run" => (true, false),
467        _ => (false, true), // build, host, test are all optional
468    };
469
470    let mut extra_data = HashMap::new();
471    if let Some(namespace) = namespace {
472        extra_data.insert("channel".to_string(), serde_json::json!(namespace));
473    }
474    if let Some(channel_url) = channel_url {
475        extra_data.insert("channel_url".to_string(), serde_json::json!(channel_url));
476    }
477
478    Some(Dependency {
479        purl,
480        extracted_requirement,
481        scope: Some(scope.to_string()),
482        is_runtime: Some(is_runtime),
483        is_optional: Some(is_optional),
484        is_pinned: Some(is_pinned),
485        is_direct: Some(true),
486        resolved_package: None,
487        extra_data: (!extra_data.is_empty()).then_some(extra_data),
488    })
489}
490
491fn extract_environment_dependencies(yaml: &Value) -> Vec<Dependency> {
492    let dependencies = match yaml.get("dependencies").and_then(|v| v.as_sequence()) {
493        Some(d) => d,
494        None => return Vec::new(),
495    };
496
497    let mut deps = Vec::new();
498    for dep_value in dependencies {
499        if let Some(dep_str) = dep_value.as_str() {
500            if let Some(dep) = parse_environment_string_dependency(dep_str) {
501                deps.push(dep);
502            }
503        } else if let Some(pip_deps) = dep_value.get("pip").and_then(|v| v.as_sequence()) {
504            deps.extend(extract_pip_dependencies(pip_deps));
505        }
506    }
507    deps
508}
509
510fn parse_environment_string_dependency(dep_str: &str) -> Option<Dependency> {
511    let (namespace, channel_url, dep_without_ns) = parse_conda_channel_prefix(dep_str);
512
513    if let Ok(parsed_req) = dep_without_ns.parse::<pep508_rs::Requirement>() {
514        return create_pip_dependency(parsed_req, "dependencies", Some(dep_without_ns));
515    }
516
517    create_conda_dependency(namespace, channel_url, dep_without_ns, "dependencies")
518}
519
/// Splits an optional `channel::` prefix off a dependency string.
///
/// Returns `(namespace, channel_url, remainder)`. The split happens at the last
/// `::`. A prefix containing `/` or `:` is reported as `channel_url`, any other
/// prefix as `namespace`. Without `::` both are `None` and the input is returned
/// unchanged.
fn parse_conda_channel_prefix(dep_str: &str) -> (Option<&str>, Option<&str>, &str) {
    match dep_str.rsplit_once("::") {
        None => (None, None, dep_str),
        Some((prefix, spec)) if prefix.contains(['/', ':']) => (None, Some(prefix), spec),
        Some((prefix, spec)) => (Some(prefix), None, spec),
    }
}
531
532fn create_conda_dependency(
533    namespace: Option<&str>,
534    channel_url: Option<&str>,
535    dep_without_ns: &str,
536    scope: &str,
537) -> Option<Dependency> {
538    let dep = dep_without_ns.trim();
539    let name_re = match Regex::new(r"^([A-Za-z0-9_.\-]+)") {
540        Ok(re) => re,
541        Err(_) => return None,
542    };
543
544    let caps = name_re.captures(dep)?;
545    let name_match = caps.get(1)?;
546    let name = name_match.as_str().trim();
547    let rest = dep[name_match.end()..].trim();
548
549    let (version, is_pinned, extracted_requirement) = if rest.is_empty() {
550        (None, false, Some(String::new()))
551    } else {
552        let req_no_space = rest.replace(' ', "");
553        let is_exact = req_no_space.starts_with("=") || req_no_space.starts_with("==");
554        let parsed_version = if is_exact {
555            Some(
556                req_no_space
557                    .trim_start_matches('=')
558                    .trim_start_matches('=')
559                    .to_string(),
560            )
561        } else {
562            None
563        };
564
565        (parsed_version, is_exact, Some(rest.to_string()))
566    };
567
568    if name == "pip" || name == "python" {
569        return None;
570    }
571
572    let purl = build_purl(
573        "conda",
574        namespace,
575        name,
576        version.as_deref(),
577        None,
578        None,
579        None,
580    );
581    let mut extra_data = HashMap::new();
582    if let Some(namespace) = namespace {
583        extra_data.insert("channel".to_string(), serde_json::json!(namespace));
584    }
585    if let Some(channel_url) = channel_url {
586        extra_data.insert("channel_url".to_string(), serde_json::json!(channel_url));
587    }
588
589    Some(Dependency {
590        purl,
591        extracted_requirement,
592        scope: Some(scope.to_string()),
593        is_runtime: Some(true),
594        is_optional: Some(false),
595        is_pinned: Some(is_pinned),
596        is_direct: Some(true),
597        resolved_package: None,
598        extra_data: (!extra_data.is_empty()).then_some(extra_data),
599    })
600}
601
602fn extract_pip_dependencies(pip_deps: &[Value]) -> Vec<Dependency> {
603    pip_deps
604        .iter()
605        .filter_map(|pip_dep| {
606            if let Some(pip_req_str) = pip_dep.as_str()
607                && let Ok(parsed_req) = pip_req_str.parse::<pep508_rs::Requirement>()
608            {
609                create_pip_dependency(parsed_req, "dependencies", Some(pip_req_str))
610            } else {
611                None
612            }
613        })
614        .collect()
615}
616
617fn create_pip_dependency(
618    parsed_req: pep508_rs::Requirement,
619    scope: &str,
620    raw_requirement: Option<&str>,
621) -> Option<Dependency> {
622    let name = parsed_req.name.to_string();
623
624    if name == "pip" || name == "python" {
625        return None;
626    }
627
628    let specs = parsed_req.version_or_url.as_ref().map(|v| match v {
629        pep508_rs::VersionOrUrl::VersionSpecifier(spec) => spec.to_string(),
630        pep508_rs::VersionOrUrl::Url(url) => url.to_string(),
631    });
632
633    let extracted_requirement = if let Some(raw) = raw_requirement {
634        let raw = raw.trim();
635        let suffix = raw.strip_prefix(&name).unwrap_or(raw).trim().to_string();
636        Some(suffix)
637    } else {
638        Some(specs.clone().unwrap_or_default())
639    };
640
641    let version = specs.as_ref().and_then(|spec_str| {
642        if spec_str.starts_with("==") {
643            Some(spec_str.trim_start_matches("==").to_string())
644        } else {
645            None
646        }
647    });
648
649    let is_pinned = specs.as_ref().map(|s| s.contains("==")).unwrap_or(false);
650    let purl = build_purl("pypi", None, &name, version.as_deref(), None, None, None);
651
652    Some(Dependency {
653        purl,
654        extracted_requirement,
655        scope: Some(scope.to_string()),
656        is_runtime: Some(true),
657        is_optional: Some(false),
658        is_pinned: Some(is_pinned),
659        is_direct: Some(true),
660        resolved_package: None,
661        extra_data: None,
662    })
663}
664
// Invokes the crate-level `register_parser!` macro for this module, passing a
// human-readable description, a list of file-name glob patterns, the package type
// ("conda"), the primary language and a documentation URL.
// NOTE(review): the macro is defined elsewhere; the meaning of each argument is
// inferred from its value — confirm against the macro definition.
// NOTE(review): `CondaEnvironmentYmlParser::is_match` also accepts names such as
// `environment.yaml` or `conda.yml`, which are not listed here — confirm whether
// this pattern list is meant to be exhaustive.
crate::register_parser!(
    "Conda package manifest and environment file",
    &[
        "**/meta.yaml",
        "**/meta.yml",
        "**/environment.yml",
        "**/conda.yaml"
    ],
    "conda",
    "Python",
    Some("https://docs.conda.io/"),
);