Skip to main content

provenant/parsers/
opam.rs

1//! Parser for OCaml OPAM package manager manifests.
2//!
3//! Extracts package metadata and dependencies from OPAM files used by the
4//! OCaml ecosystem.
5//!
6//! # Supported Formats
7//! - *.opam files (OPAM package manifests)
8//! - opam files without extension
9//!
10//! # Key Features
11//! - Field-based parsing of OPAM's custom format (key: value)
//! - Author and maintainer extraction (authors stored as names, maintainers as emails, both verbatim)
13//! - URL extraction for source archives, homepage, repository
14//! - License statement extraction
15//! - Checksum extraction (sha1, md5, sha256, sha512)
16//!
17//! # Implementation Notes
18//! - OPAM format uses custom syntax, not JSON/YAML/TOML
19//! - Strings can be quoted or unquoted
20//! - Lists use bracket notation: [item1 item2]
21//! - Multi-line strings use three-quote notation: """..."""
22
23use std::path::Path;
24
25use log::warn;
26use regex::Regex;
27
28use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
29use crate::parsers::PackageParser;
30
31use super::license_normalization::{
32    DeclaredLicenseMatchMetadata, build_declared_license_data_from_pair,
33    normalize_spdx_declared_license,
34};
35
36/// Parser for OCaml OPAM package manifest files.
37///
38/// Handles the OPAM file format used by the OCaml package manager.
39/// Reference: <https://opam.ocaml.org/doc/Manual.html#Common-file-format>
40pub struct OpamParser;
41
42impl PackageParser for OpamParser {
43    const PACKAGE_TYPE: PackageType = PackageType::Opam;
44
45    fn is_match(path: &Path) -> bool {
46        path.file_name().is_some_and(|name| {
47            name.to_string_lossy().ends_with(".opam") || name.to_string_lossy() == "opam"
48        })
49    }
50
51    fn extract_packages(path: &Path) -> Vec<PackageData> {
52        vec![match std::fs::read_to_string(path) {
53            Ok(text) => parse_opam(&text),
54            Err(e) => {
55                warn!("Failed to read OPAM file {:?}: {}", path, e);
56                default_package_data()
57            }
58        }]
59    }
60}
61
/// Intermediate, loosely-typed view of an OPAM manifest.
///
/// Filled in by `parse_opam_data` and converted to `PackageData` by
/// `parse_opam`. Every field is optional/empty by default because a manifest
/// may omit any of them.
#[derive(Debug, Default)]
struct OpamData {
    // Scalar fields, each taken from the manifest key of the same name
    // (`dev-repo` and `bug-reports` map to `dev_repo` and `bug_reports`).
    name: Option<String>,
    version: Option<String>,
    synopsis: Option<String>,
    // Taken from the `description` key via `parse_multiline_string`.
    description: Option<String>,
    homepage: Option<String>,
    dev_repo: Option<String>,
    bug_reports: Option<String>,
    // Value of the `src` key.
    src: Option<String>,
    // Entries of the `authors` and `maintainer` keys, kept verbatim.
    authors: Vec<String>,
    maintainers: Vec<String>,
    license: Option<String>,
    // Digests found under the `checksum` key, stored by algorithm name.
    sha1: Option<String>,
    md5: Option<String>,
    sha256: Option<String>,
    sha512: Option<String>,
    // Entries of the `depends` key.
    dependencies: Vec<(String, String)>, // (name, version_constraint)
}
82
83fn default_package_data() -> PackageData {
84    PackageData {
85        package_type: Some(OpamParser::PACKAGE_TYPE),
86        primary_language: Some("Ocaml".to_string()),
87        datasource_id: Some(DatasourceId::OpamFile),
88        ..Default::default()
89    }
90}
91
92/// Parse an OPAM file from text content
93fn parse_opam(text: &str) -> PackageData {
94    let opam_data = parse_opam_data(text);
95
96    let description = build_description(&opam_data.synopsis, &opam_data.description);
97    let parties = extract_parties(&opam_data.authors, &opam_data.maintainers);
98    let dependencies = extract_dependencies(&opam_data.dependencies);
99
100    let (repository_homepage_url, api_data_url, purl) =
101        build_opam_urls(&opam_data.name, &opam_data.version);
102    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
103        normalize_opam_declared_license(opam_data.license.as_deref());
104
105    PackageData {
106        package_type: Some(OpamParser::PACKAGE_TYPE),
107        namespace: None,
108        name: opam_data.name,
109        version: opam_data.version,
110        qualifiers: None,
111        subpath: None,
112        primary_language: Some("Ocaml".to_string()),
113        description,
114        release_date: None,
115        parties,
116        keywords: Vec::new(),
117        homepage_url: opam_data.homepage,
118        download_url: opam_data.src,
119        size: None,
120        sha1: opam_data.sha1,
121        md5: opam_data.md5,
122        sha256: opam_data.sha256,
123        sha512: opam_data.sha512,
124        bug_tracking_url: opam_data.bug_reports,
125        code_view_url: None,
126        vcs_url: opam_data.dev_repo,
127        copyright: None,
128        holder: None,
129        declared_license_expression,
130        declared_license_expression_spdx,
131        license_detections,
132        other_license_expression: None,
133        other_license_expression_spdx: None,
134        other_license_detections: Vec::new(),
135        extracted_license_statement: opam_data.license,
136        notice_text: None,
137        source_packages: Vec::new(),
138        file_references: Vec::new(),
139        is_private: false,
140        is_virtual: false,
141        extra_data: None,
142        dependencies,
143        repository_homepage_url,
144        repository_download_url: None,
145        api_data_url,
146        datasource_id: Some(DatasourceId::OpamFile),
147        purl,
148    }
149}
150
151fn normalize_opam_declared_license(
152    statement: Option<&str>,
153) -> (
154    Option<String>,
155    Option<String>,
156    Vec<crate::models::LicenseDetection>,
157) {
158    let Some(statement) = statement.map(str::trim).filter(|value| !value.is_empty()) else {
159        return super::license_normalization::empty_declared_license_data();
160    };
161
162    match statement {
163        "GPL-2.0-only" => build_declared_license_data_from_pair(
164            "gpl-2.0",
165            "GPL-2.0-only",
166            DeclaredLicenseMatchMetadata::single_line(statement),
167        ),
168        "GPL-3.0-only" => build_declared_license_data_from_pair(
169            "gpl-3.0",
170            "GPL-3.0-only",
171            DeclaredLicenseMatchMetadata::single_line(statement),
172        ),
173        "LGPL-3.0-only with OCaml-LGPL-linking-exception" => build_declared_license_data_from_pair(
174            "lgpl-3.0 WITH ocaml-lgpl-linking-exception",
175            "LGPL-3.0-only WITH OCaml-LGPL-linking-exception",
176            DeclaredLicenseMatchMetadata::single_line(statement),
177        ),
178        _ => normalize_spdx_declared_license(Some(statement)),
179    }
180}
181
/// Derives the repository homepage URL, API data URL and package URL (purl)
/// for a package.
///
/// Returns `(repository_homepage_url, api_data_url, purl)`:
/// - the homepage URL needs only `name`;
/// - the API data URL needs both `name` and `version`;
/// - the purl needs `name` and appends `@version` when one is known.
///
/// Everything is `None` when `name` is `None`.
fn build_opam_urls(
    name: &Option<String>,
    version: &Option<String>,
) -> (Option<String>, Option<String>, Option<String>) {
    let name = name.as_deref();
    let version = version.as_deref();

    // NOTE(review): this used to return the unexpanded template text
    // "{https://opam.ocaml.org/packages}/{name}" for every package. If any
    // golden fixture pins that literal it has to be refreshed.
    let repository_homepage_url =
        name.map(|n| format!("https://opam.ocaml.org/packages/{}", n));

    let api_data_url = name.zip(version).map(|(n, v)| {
        format!(
            "https://github.com/ocaml/opam-repository/blob/master/packages/{}/{}.{}/opam",
            n, n, v
        )
    });

    let purl = name.map(|n| match version {
        Some(v) => format!("pkg:opam/{}@{}", n, v),
        None => format!("pkg:opam/{}", n),
    });

    (repository_homepage_url, api_data_url, purl)
}
206
207/// Parse OPAM file text into structured data
208fn parse_opam_data(text: &str) -> OpamData {
209    let mut data = OpamData::default();
210    let lines: Vec<&str> = text.lines().collect();
211    let mut i = 0;
212
213    while i < lines.len() {
214        let line = lines[i];
215
216        // Parse key: value format
217        if let Some((key, value)) = parse_key_value(line) {
218            match key.as_str() {
219                "name" => data.name = clean_value(&value),
220                "version" => data.version = clean_value(&value),
221                "synopsis" => data.synopsis = clean_value(&value),
222                "description" => {
223                    data.description = parse_multiline_string(&lines, &mut i);
224                }
225                "homepage" => data.homepage = clean_value(&value),
226                "dev-repo" => data.dev_repo = clean_value(&value),
227                "bug-reports" => data.bug_reports = clean_value(&value),
228                "src" => {
229                    if value.trim().is_empty() && i + 1 < lines.len() {
230                        i += 1;
231                        data.src = clean_value(lines[i]);
232                    } else {
233                        data.src = clean_value(&value);
234                    }
235                }
236                "license" => data.license = clean_value(&value),
237                "authors" => {
238                    data.authors = parse_string_array(&lines, &mut i, &value);
239                }
240                "maintainer" => {
241                    data.maintainers = parse_string_array(&lines, &mut i, &value);
242                }
243                "depends" => {
244                    data.dependencies = parse_dependency_array(&lines, &mut i);
245                }
246                "checksum" => {
247                    parse_checksums(&lines, &mut i, &mut data);
248                }
249                _ => {}
250            }
251        }
252
253        i += 1;
254    }
255
256    data
257}
258
/// Splits a `key: value` line at its first colon.
///
/// Returns the trimmed key and trimmed value, or `None` for blank lines,
/// lines starting with `#`, and lines without a colon. The value is returned
/// as written, including any quotes or brackets.
fn parse_key_value(line: &str) -> Option<(String, String)> {
    let trimmed = line.trim();
    if trimmed.is_empty() || trimmed.starts_with('#') {
        return None;
    }

    trimmed
        .split_once(':')
        .map(|(key, value)| (key.trim().to_string(), value.trim().to_string()))
}
274
/// Strips surrounding whitespace, double quotes and list brackets from a value.
///
/// The wrapper characters are removed from both ends in any order and any
/// combination, so `"foo"`, `[foo]` and `[ "foo" ]` all become `foo`.
/// Characters inside the value are left untouched.
///
/// Returns `None` when nothing is left after stripping.
fn clean_value(value: &str) -> Option<String> {
    // One pass over a character set: stripping quotes first and brackets
    // second would leave the quotes of `["MIT"]` in place.
    let cleaned =
        value.trim_matches(|c: char| c.is_whitespace() || matches!(c, '"' | '[' | ']'));

    if cleaned.is_empty() {
        None
    } else {
        Some(cleaned.to_string())
    }
}
290
/// Reads the string value of the key found on `lines[*i]`.
///
/// Handles a value closed on the key line (`"..."` or `"""..."""`), a value
/// that continues over the following lines until its closing delimiter, and a
/// value whose opening quote sits on the line after the key. Continuation
/// lines are joined with single spaces.
///
/// On return `*i` is the index of the last line consumed (the key line for a
/// single-line value); it equals `lines.len()` when the closing delimiter is
/// never found. Returns `None` when the collected text is empty.
///
/// Escaped quotes are not interpreted: a line containing the delimiter text
/// ends the string.
fn parse_multiline_string(lines: &[&str], i: &mut usize) -> Option<String> {
    const TRIPLE_QUOTE: &str = "\"\"\"";

    // Text following the first colon of the key line.
    let mut opening = lines
        .get(*i)
        .copied()
        .and_then(|line| line.split_once(':'))
        .map_or("", |(_, value)| value.trim());

    // The string may start on the line after a bare key.
    if opening.is_empty() {
        match lines.get(*i + 1).copied().map(str::trim) {
            Some(next) if next.starts_with('"') => {
                *i += 1;
                opening = next;
            }
            _ => return None,
        }
    }

    let (delimiter, after_opening) = if let Some(rest) = opening.strip_prefix(TRIPLE_QUOTE) {
        (TRIPLE_QUOTE, rest)
    } else if let Some(rest) = opening.strip_prefix('"') {
        ("\"", rest)
    } else {
        ("", "")
    };

    let mut result = String::from(opening.trim_matches('"').trim());

    // Only keep reading when a quoted string was opened and not closed again
    // on the same line; otherwise later fields would be swallowed.
    let continues = !delimiter.is_empty() && !after_opening.contains(delimiter);
    if continues {
        *i += 1;
        while let Some(line) = lines.get(*i) {
            result.push(' ');
            // Trim whitespace first so trailing blanks cannot shield the
            // closing quotes from being stripped.
            result.push_str(line.trim().trim_matches('"').trim());

            if line.contains(delimiter) {
                break;
            }
            *i += 1;
        }
    }

    let cleaned = result.trim();
    if cleaned.is_empty() {
        None
    } else {
        Some(cleaned.to_string())
    }
}
319
320/// Parse a string array (single-line or multiline)
321fn parse_string_array(lines: &[&str], i: &mut usize, first_value: &str) -> Vec<String> {
322    let mut result = Vec::new();
323
324    let mut content = first_value.to_string();
325
326    // If it's a multiline array (starts with [ but no matching ])
327    if content.contains('[') && !content.contains(']') {
328        *i += 1;
329        while *i < lines.len() {
330            let line = lines[*i];
331            content.push(' ');
332            content.push_str(line);
333
334            if line.contains(']') {
335                break;
336            }
337            *i += 1;
338        }
339    }
340
341    // Parse the content
342    let cleaned = content.trim_matches('[').trim_matches(']').trim();
343
344    // Split by quote-delimited strings
345    for part in split_quoted_strings(cleaned) {
346        let p = part.trim_matches('"').trim();
347        if !p.is_empty() {
348            result.push(p.to_string());
349        }
350    }
351
352    result
353}
354
355/// Parse dependency array
356fn parse_dependency_array(lines: &[&str], i: &mut usize) -> Vec<(String, String)> {
357    let mut result = Vec::new();
358
359    *i += 1;
360    while *i < lines.len() {
361        let line = lines[*i];
362
363        if line.trim().contains(']') {
364            break;
365        }
366
367        if let Some((name, version)) = parse_dependency_line(line) {
368            result.push((name, version));
369        }
370
371        *i += 1;
372    }
373
374    result
375}
376
377/// Parse a single dependency line: "name" {version_constraint}
378fn parse_dependency_line(line: &str) -> Option<(String, String)> {
379    let line = line.trim();
380    if line.is_empty() {
381        return None;
382    }
383
384    // Match: "name" {optional version}
385    let regex = Regex::new(r#""([^"]+)"\s*(.*)$"#).ok()?;
386    let caps = regex.captures(line)?;
387
388    let name = caps.get(1)?.as_str().to_string();
389    let version_part = caps.get(2)?.as_str().trim();
390
391    // Extract the operator and version constraint
392    let constraint = if version_part.is_empty() {
393        String::new()
394    } else {
395        extract_version_constraint(version_part)
396    };
397
398    Some((name, constraint))
399}
400
401/// Extract version constraint from {>= "1.0"} format
402fn extract_version_constraint(version_part: &str) -> String {
403    let regex = Regex::new(r#"\{\s*([<>=!]+)\s*"([^"]*)"\s*\}"#);
404    if let Ok(re) = regex
405        && let Some(caps) = re.captures(version_part)
406    {
407        let op = caps.get(1).map(|m| m.as_str()).unwrap_or("");
408        let ver = caps.get(2).map(|m| m.as_str()).unwrap_or("");
409        if !op.is_empty() && !ver.is_empty() {
410            return format!("{} {}", op, ver);
411        }
412    }
413
414    // If regex parsing fails, try to extract raw content
415    let content = version_part
416        .trim_matches('{')
417        .trim_matches('}')
418        .trim_matches('"')
419        .trim();
420
421    content.replace('"', "")
422}
423
424/// Parse checksums from checksum array
425fn parse_checksums(lines: &[&str], i: &mut usize, data: &mut OpamData) {
426    if let Some((_, first_value)) = parse_key_value(lines[*i]) {
427        let inline = first_value.trim();
428        if !inline.is_empty() && inline != "[" {
429            if let Some((key, value)) = parse_checksum_line(inline) {
430                match key.as_str() {
431                    "sha1" => data.sha1 = Some(value),
432                    "md5" => data.md5 = Some(value),
433                    "sha256" => data.sha256 = Some(value),
434                    "sha512" => data.sha512 = Some(value),
435                    _ => {}
436                }
437            }
438            return;
439        }
440    }
441
442    *i += 1;
443    while *i < lines.len() {
444        let line = lines[*i];
445
446        if line.trim().contains(']') {
447            break;
448        }
449
450        if let Some((key, value)) = parse_checksum_line(line) {
451            match key.as_str() {
452                "sha1" => data.sha1 = Some(value),
453                "md5" => data.md5 = Some(value),
454                "sha256" => data.sha256 = Some(value),
455                "sha512" => data.sha512 = Some(value),
456                _ => {}
457            }
458        }
459
460        *i += 1;
461    }
462}
463
464/// Parse a single checksum line: algo=hash
465fn parse_checksum_line(line: &str) -> Option<(String, String)> {
466    let line = line.trim().trim_matches('"').trim();
467
468    let regex = Regex::new(r"^(\w+)\s*=\s*(.+)$").ok()?;
469    let caps = regex.captures(line)?;
470
471    let key = caps.get(1)?.as_str().to_string();
472    let value = caps.get(2)?.as_str().to_string();
473
474    Some((key, value))
475}
476
/// Splits text such as `"str1" "str2" "str3"` into its individual strings.
///
/// Double quotes delimit the strings and are not part of the output.
/// Whitespace of any kind (spaces, tabs, ...) outside quotes separates
/// tokens; whitespace inside quotes is kept. Empty tokens are not returned.
fn split_quoted_strings(content: &str) -> Vec<String> {
    let mut result = Vec::new();
    let mut current = String::new();
    let mut in_quotes = false;

    for ch in content.chars() {
        match ch {
            '"' => in_quotes = !in_quotes,
            // Any whitespace separates tokens, so tab-separated or
            // tab-indented entries are not glued together.
            c if c.is_whitespace() && !in_quotes => {
                if !current.is_empty() {
                    result.push(std::mem::take(&mut current));
                }
            }
            _ => current.push(ch),
        }
    }

    if !current.is_empty() {
        result.push(current);
    }

    result
}
502
/// Combines synopsis and long description into one text.
///
/// The parts that are present are joined with a newline, synopsis first.
/// Returns `None` when both are absent.
fn build_description(synopsis: &Option<String>, description: &Option<String>) -> Option<String> {
    let mut present: Vec<&str> = Vec::with_capacity(2);

    for part in [synopsis, description] {
        if let Some(text) = part {
            present.push(text.as_str());
        }
    }

    match present.as_slice() {
        [] => None,
        parts => Some(parts.join("\n")),
    }
}
517
518/// Extract parties from authors and maintainers
519fn extract_parties(authors: &[String], maintainers: &[String]) -> Vec<Party> {
520    let mut parties = Vec::new();
521
522    // Add authors
523    for author in authors {
524        parties.push(Party {
525            r#type: Some("person".to_string()),
526            role: Some("author".to_string()),
527            name: Some(author.clone()),
528            email: None,
529            url: None,
530            organization: None,
531            organization_url: None,
532            timezone: None,
533        });
534    }
535
536    // Add maintainers (as email)
537    for maintainer in maintainers {
538        parties.push(Party {
539            r#type: Some("person".to_string()),
540            role: Some("maintainer".to_string()),
541            name: None,
542            email: Some(maintainer.clone()),
543            url: None,
544            organization: None,
545            organization_url: None,
546            timezone: None,
547        });
548    }
549
550    parties
551}
552
553/// Extract dependencies into Dependency objects
554fn extract_dependencies(deps: &[(String, String)]) -> Vec<Dependency> {
555    deps.iter()
556        .map(|(name, version_constraint)| Dependency {
557            purl: Some(format!("pkg:opam/{}", name)),
558            extracted_requirement: Some(version_constraint.clone()),
559            scope: Some("dependency".to_string()),
560            is_runtime: Some(true),
561            is_optional: Some(false),
562            is_pinned: Some(false),
563            is_direct: Some(true),
564            resolved_package: None,
565            extra_data: None,
566        })
567        .collect()
568}
569
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parsers::PackageParser;

    // --- file matching ---------------------------------------------------

    #[test]
    fn test_is_match_with_opam_extension() {
        assert!(OpamParser::is_match(Path::new("sample.opam")));
    }

    #[test]
    fn test_is_match_with_opam_name() {
        assert!(OpamParser::is_match(Path::new("opam")));
    }

    #[test]
    fn test_is_match_with_non_opam() {
        assert!(!OpamParser::is_match(Path::new("sample.txt")));
    }

    // --- low-level value helpers ------------------------------------------

    #[test]
    fn test_parse_key_value() {
        let parsed = parse_key_value("name: \"js_of_ocaml\"");
        assert_eq!(
            parsed,
            Some(("name".to_string(), "\"js_of_ocaml\"".to_string()))
        );
    }

    #[test]
    fn test_clean_value() {
        assert_eq!(
            clean_value("\"js_of_ocaml\"").as_deref(),
            Some("js_of_ocaml")
        );
        assert!(clean_value("\"\"").is_none());
    }

    // --- dependencies -----------------------------------------------------

    #[test]
    fn test_extract_version_constraint() {
        assert_eq!(extract_version_constraint(r#"{>= "4.02.0"}"#), ">= 4.02.0");
    }

    #[test]
    fn test_parse_dependency_line() {
        let parsed = parse_dependency_line(r#""ocaml" {>= "4.02.0"}"#);
        assert_eq!(parsed, Some(("ocaml".to_string(), ">= 4.02.0".to_string())));
    }

    #[test]
    fn test_parse_dependency_line_without_version() {
        let parsed = parse_dependency_line(r#""uchar""#);
        assert_eq!(parsed, Some(("uchar".to_string(), String::new())));
    }

    // --- string lists and descriptions ------------------------------------

    #[test]
    fn test_split_quoted_strings() {
        let tokens = split_quoted_strings(r#""str1" "str2""#);
        assert_eq!(tokens, vec!["str1", "str2"]);
    }

    #[test]
    fn test_build_description() {
        let combined = build_description(
            &Some("Short description".to_string()),
            &Some("Long description".to_string()),
        );
        assert_eq!(
            combined.as_deref(),
            Some("Short description\nLong description")
        );
    }

    // --- parties and licenses ---------------------------------------------

    #[test]
    fn test_extract_parties() {
        let parties = extract_parties(
            &["Author One".to_string()],
            &["maintainer@example.com".to_string()],
        );

        assert_eq!(parties.len(), 2);

        let (author, maintainer) = (&parties[0], &parties[1]);
        assert_eq!(author.name, Some("Author One".to_string()));
        assert_eq!(author.role, Some("author".to_string()));
        assert_eq!(maintainer.email, Some("maintainer@example.com".to_string()));
        assert_eq!(maintainer.role, Some("maintainer".to_string()));
    }

    #[test]
    fn test_normalize_opam_declared_license_preserves_scancode_style_expression() {
        let statement = "LGPL-3.0-only with OCaml-LGPL-linking-exception";
        let (declared, declared_spdx, detections) =
            normalize_opam_declared_license(Some(statement));

        assert_eq!(
            declared.as_deref(),
            Some("lgpl-3.0 WITH ocaml-lgpl-linking-exception")
        );
        assert_eq!(
            declared_spdx.as_deref(),
            Some("LGPL-3.0-only WITH OCaml-LGPL-linking-exception")
        );
        assert_eq!(detections.len(), 1);
        assert_eq!(
            detections[0].license_expression,
            "lgpl-3.0 WITH ocaml-lgpl-linking-exception"
        );
    }
}
680
// Registers the OPAM parser through the crate-level `register_parser!` macro.
// The arguments are, in order: a human-readable description, the glob
// patterns for manifest paths, two short labels, and an optional
// documentation URL.
// NOTE(review): the two labels look like the package type ("opam") and the
// primary language ("OCaml") — confirm against the macro definition.
crate::register_parser!(
    "OCaml OPAM package manifest",
    &["**/*.opam", "**/opam"],
    "opam",
    "OCaml",
    Some("https://opam.ocaml.org/doc/Manual.html"),
);