Skip to main content

provenant/parsers/
opam.rs

1//! Parser for OCaml OPAM package manager manifests.
2//!
3//! Extracts package metadata and dependencies from OPAM files used by the
4//! OCaml ecosystem.
5//!
6//! # Supported Formats
7//! - *.opam files (OPAM package manifests)
8//! - opam files without extension
9//!
10//! # Key Features
11//! - Field-based parsing of OPAM's custom format (key: value)
12//! - Author and maintainer extraction with email parsing
13//! - URL extraction for source archives, homepage, repository
14//! - License statement extraction
15//! - Checksum extraction (sha1, md5, sha256, sha512)
16//!
17//! # Implementation Notes
18//! - OPAM format uses custom syntax, not JSON/YAML/TOML
19//! - Strings can be quoted or unquoted
20//! - Lists use bracket notation: [item1 item2]
21//! - Multi-line strings use three-quote notation: """..."""
22
23use std::path::Path;
24
25use crate::parser_warn as warn;
26use regex::Regex;
27
28use crate::models::{
29    DatasourceId, Dependency, Md5Digest, PackageData, PackageType, Party, Sha1Digest, Sha256Digest,
30    Sha512Digest,
31};
32use crate::parsers::PackageParser;
33
34use super::license_normalization::{
35    DeclaredLicenseMatchMetadata, build_declared_license_data_from_pair,
36    normalize_spdx_declared_license,
37};
38
39/// Parser for OCaml OPAM package manifest files.
40///
41/// Handles the OPAM file format used by the OCaml package manager.
42/// Reference: <https://opam.ocaml.org/doc/Manual.html#Common-file-format>
43pub struct OpamParser;
44
45impl PackageParser for OpamParser {
46    const PACKAGE_TYPE: PackageType = PackageType::Opam;
47
48    fn is_match(path: &Path) -> bool {
49        path.file_name().is_some_and(|name| {
50            name.to_string_lossy().ends_with(".opam") || name.to_string_lossy() == "opam"
51        })
52    }
53
54    fn extract_packages(path: &Path) -> Vec<PackageData> {
55        vec![match std::fs::read_to_string(path) {
56            Ok(text) => parse_opam(&text),
57            Err(e) => {
58                warn!("Failed to read OPAM file {:?}: {}", path, e);
59                default_package_data()
60            }
61        }]
62    }
63}
64
/// Intermediate, loosely-typed view of one OPAM file.
///
/// Filled field by field in `parse_opam_data` and converted into a
/// `PackageData` by `parse_opam`. Every field is optional or may be empty
/// because a manifest is free to omit any of them.
#[derive(Debug, Default)]
struct OpamData {
    // Scalar `key: value` fields, stored after `clean_value` stripped quotes/brackets.
    name: Option<String>,
    version: Option<String>,
    synopsis: Option<String>,
    // Collected by `parse_multiline_string`; joined with `synopsis` later on.
    description: Option<String>,
    homepage: Option<String>,
    // Value of `dev-repo`; exposed as the package's `vcs_url`.
    dev_repo: Option<String>,
    // Value of `bug-reports`; exposed as `bug_tracking_url`.
    bug_reports: Option<String>,
    // Value of `src`; exposed as `download_url`.
    src: Option<String>,
    // Entries of `authors` / `maintainer`, one string per quoted item.
    authors: Vec<String>,
    maintainers: Vec<String>,
    // Raw license statement as written in the manifest.
    license: Option<String>,
    // Digests taken from `checksum` entries of the form `algo=hex`.
    sha1: Option<Sha1Digest>,
    md5: Option<Md5Digest>,
    sha256: Option<Sha256Digest>,
    sha512: Option<Sha512Digest>,
    dependencies: Vec<(String, String)>, // (name, version_constraint)
}
85
86fn default_package_data() -> PackageData {
87    PackageData {
88        package_type: Some(OpamParser::PACKAGE_TYPE),
89        primary_language: Some("Ocaml".to_string()),
90        datasource_id: Some(DatasourceId::OpamFile),
91        ..Default::default()
92    }
93}
94
95/// Parse an OPAM file from text content
96fn parse_opam(text: &str) -> PackageData {
97    let opam_data = parse_opam_data(text);
98
99    let description = build_description(&opam_data.synopsis, &opam_data.description);
100    let parties = extract_parties(&opam_data.authors, &opam_data.maintainers);
101    let dependencies = extract_dependencies(&opam_data.dependencies);
102
103    let (repository_homepage_url, api_data_url, purl) =
104        build_opam_urls(&opam_data.name, &opam_data.version);
105    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
106        normalize_opam_declared_license(opam_data.license.as_deref());
107
108    PackageData {
109        package_type: Some(OpamParser::PACKAGE_TYPE),
110        namespace: None,
111        name: opam_data.name,
112        version: opam_data.version,
113        qualifiers: None,
114        subpath: None,
115        primary_language: Some("Ocaml".to_string()),
116        description,
117        release_date: None,
118        parties,
119        keywords: Vec::new(),
120        homepage_url: opam_data.homepage,
121        download_url: opam_data.src,
122        size: None,
123        sha1: opam_data.sha1,
124        md5: opam_data.md5,
125        sha256: opam_data.sha256,
126        sha512: opam_data.sha512,
127        bug_tracking_url: opam_data.bug_reports,
128        code_view_url: None,
129        vcs_url: opam_data.dev_repo,
130        copyright: None,
131        holder: None,
132        declared_license_expression,
133        declared_license_expression_spdx,
134        license_detections,
135        other_license_expression: None,
136        other_license_expression_spdx: None,
137        other_license_detections: Vec::new(),
138        extracted_license_statement: opam_data.license,
139        notice_text: None,
140        source_packages: Vec::new(),
141        file_references: Vec::new(),
142        is_private: false,
143        is_virtual: false,
144        extra_data: None,
145        dependencies,
146        repository_homepage_url,
147        repository_download_url: None,
148        api_data_url,
149        datasource_id: Some(DatasourceId::OpamFile),
150        purl,
151    }
152}
153
154fn normalize_opam_declared_license(
155    statement: Option<&str>,
156) -> (
157    Option<String>,
158    Option<String>,
159    Vec<crate::models::LicenseDetection>,
160) {
161    let Some(statement) = statement.map(str::trim).filter(|value| !value.is_empty()) else {
162        return super::license_normalization::empty_declared_license_data();
163    };
164
165    match statement {
166        "GPL-2.0-only" => build_declared_license_data_from_pair(
167            "gpl-2.0",
168            "GPL-2.0-only",
169            DeclaredLicenseMatchMetadata::single_line(statement),
170        ),
171        "GPL-3.0-only" => build_declared_license_data_from_pair(
172            "gpl-3.0",
173            "GPL-3.0-only",
174            DeclaredLicenseMatchMetadata::single_line(statement),
175        ),
176        "LGPL-3.0-only with OCaml-LGPL-linking-exception" => build_declared_license_data_from_pair(
177            "lgpl-3.0 WITH ocaml-lgpl-linking-exception",
178            "LGPL-3.0-only WITH OCaml-LGPL-linking-exception",
179            DeclaredLicenseMatchMetadata::single_line(statement),
180        ),
181        _ => normalize_spdx_declared_license(Some(statement)),
182    }
183}
184
/// Derive `(repository_homepage_url, api_data_url, purl)` from a package
/// name and optional version.
///
/// - Without a name nothing can be derived and all three values are `None`.
/// - The homepage URL and the purl only need the name.
/// - The API data URL points at the package's `opam` file in the
///   opam-repository GitHub tree and therefore also needs the version.
///
/// The homepage URL used to be the literal, un-interpolated template
/// `"{https://opam.ocaml.org/packages}/{name}"`; it is now built from the
/// actual package name.
/// NOTE(review): if downstream fixtures were recorded against the old
/// literal they need to be regenerated.
fn build_opam_urls(
    name: &Option<String>,
    version: &Option<String>,
) -> (Option<String>, Option<String>, Option<String>) {
    let Some(name) = name.as_deref() else {
        return (None, None, None);
    };

    let repository_homepage_url = Some(format!("https://opam.ocaml.org/packages/{name}"));

    let (api_data_url, purl) = match version.as_deref() {
        Some(version) => (
            Some(format!(
                "https://github.com/ocaml/opam-repository/blob/master/packages/{name}/{name}.{version}/opam"
            )),
            Some(format!("pkg:opam/{name}@{version}")),
        ),
        None => (None, Some(format!("pkg:opam/{name}"))),
    };

    (repository_homepage_url, api_data_url, purl)
}
209
210/// Parse OPAM file text into structured data
211fn parse_opam_data(text: &str) -> OpamData {
212    let mut data = OpamData::default();
213    let lines: Vec<&str> = text.lines().collect();
214    let mut i = 0;
215
216    while i < lines.len() {
217        let line = lines[i];
218
219        // Parse key: value format
220        if let Some((key, value)) = parse_key_value(line) {
221            match key.as_str() {
222                "name" => data.name = clean_value(&value),
223                "version" => data.version = clean_value(&value),
224                "synopsis" => data.synopsis = clean_value(&value),
225                "description" => {
226                    data.description = parse_multiline_string(&lines, &mut i);
227                }
228                "homepage" => data.homepage = clean_value(&value),
229                "dev-repo" => data.dev_repo = clean_value(&value),
230                "bug-reports" => data.bug_reports = clean_value(&value),
231                "src" => {
232                    if value.trim().is_empty() && i + 1 < lines.len() {
233                        i += 1;
234                        data.src = clean_value(lines[i]);
235                    } else {
236                        data.src = clean_value(&value);
237                    }
238                }
239                "license" => data.license = clean_value(&value),
240                "authors" => {
241                    data.authors = parse_string_array(&lines, &mut i, &value);
242                }
243                "maintainer" => {
244                    data.maintainers = parse_string_array(&lines, &mut i, &value);
245                }
246                "depends" => {
247                    data.dependencies = parse_dependency_array(&lines, &mut i);
248                }
249                "checksum" => {
250                    parse_checksums(&lines, &mut i, &mut data);
251                }
252                _ => {}
253            }
254        }
255
256        i += 1;
257    }
258
259    data
260}
261
/// Split a `key: value` line at its first colon.
///
/// Returns `None` for blank lines, lines starting with `#`, and lines without
/// a colon. Key and value are returned trimmed; quotes and brackets are left
/// in place.
fn parse_key_value(line: &str) -> Option<(String, String)> {
    let trimmed = line.trim();
    if trimmed.is_empty() || trimmed.starts_with('#') {
        return None;
    }

    trimmed
        .split_once(':')
        .map(|(key, value)| (key.trim().to_string(), value.trim().to_string()))
}
277
/// Clean a scalar value by removing surrounding quotes and brackets.
///
/// Returns `None` when nothing is left after cleaning.
///
/// Quotes are stripped first and brackets second, so a bracketed single
/// string such as `["MIT"]` used to come back with its quotes still attached.
/// A second quote pass now handles that case. It is skipped when the value
/// still contains interior quotes (for example `"MIT" "ISC"`), so that a
/// multi-element list is not mangled into `MIT" "ISC`.
fn clean_value(value: &str) -> Option<String> {
    let stripped = value
        .trim()
        .trim_matches('"')
        .trim_matches('[')
        .trim_matches(']')
        .trim();

    let unquoted = stripped.trim_matches('"');
    let cleaned = if unquoted.contains('"') {
        stripped
    } else {
        unquoted.trim()
    };

    (!cleaned.is_empty()).then(|| cleaned.to_string())
}
293
/// Parse the string value of the field that starts on `lines[*i]`.
///
/// Handles both a triple-quoted block spanning several lines and a value
/// that is complete on the field's own line. Continuation lines are joined
/// with single spaces. Returns `None` when the result is empty.
///
/// On return `*i` is the index of the last line consumed:
/// - unchanged when the value ends on the field's own line (a plain
///   `"text"` or `"""text"""`);
/// - the line holding the closing `"""`, or `lines.len()` when no closing
///   delimiter exists, for a multi-line block.
///
/// Previously every description was treated as multi-line, so a one-line
/// `description: "text"` consumed all following lines up to the next `"""`
/// (or the end of the file) and the fields on those lines were never parsed.
///
/// Limitation: a regular `"`-quoted string that continues on later lines
/// only contributes its first line.
fn parse_multiline_string(lines: &[&str], i: &mut usize) -> Option<String> {
    const TRIPLE_QUOTE: &str = "\"\"\"";

    // Text after the first colon of the field line (empty if there is none).
    let first_value = lines
        .get(*i)
        .and_then(|line| line.split_once(':'))
        .map_or("", |(_, value)| value.trim());

    // A value is complete on its own line when it is an ordinary string, or
    // when a triple-quoted block is opened and closed on that same line.
    let closed_on_first_line = match first_value.strip_prefix(TRIPLE_QUOTE) {
        Some(rest) => rest.contains(TRIPLE_QUOTE),
        None => !first_value.is_empty(),
    };

    let mut result = String::from(first_value.trim_matches('"').trim());

    if !closed_on_first_line {
        *i += 1;
        while let Some(line) = lines.get(*i) {
            result.push(' ');
            result.push_str(line.trim_matches('"').trim());

            if line.contains(TRIPLE_QUOTE) {
                break;
            }
            *i += 1;
        }
    }

    let cleaned = result.trim();
    (!cleaned.is_empty()).then(|| cleaned.to_string())
}
322
323/// Parse a string array (single-line or multiline)
324fn parse_string_array(lines: &[&str], i: &mut usize, first_value: &str) -> Vec<String> {
325    let mut result = Vec::new();
326
327    let mut content = first_value.to_string();
328
329    // If it's a multiline array (starts with [ but no matching ])
330    if content.contains('[') && !content.contains(']') {
331        *i += 1;
332        while *i < lines.len() {
333            let line = lines[*i];
334            content.push(' ');
335            content.push_str(line);
336
337            if line.contains(']') {
338                break;
339            }
340            *i += 1;
341        }
342    }
343
344    // Parse the content
345    let cleaned = content.trim_matches('[').trim_matches(']').trim();
346
347    // Split by quote-delimited strings
348    for part in split_quoted_strings(cleaned) {
349        let p = part.trim_matches('"').trim();
350        if !p.is_empty() {
351            result.push(p.to_string());
352        }
353    }
354
355    result
356}
357
358/// Parse dependency array
359fn parse_dependency_array(lines: &[&str], i: &mut usize) -> Vec<(String, String)> {
360    let mut result = Vec::new();
361
362    *i += 1;
363    while *i < lines.len() {
364        let line = lines[*i];
365
366        if line.trim().contains(']') {
367            break;
368        }
369
370        if let Some((name, version)) = parse_dependency_line(line) {
371            result.push((name, version));
372        }
373
374        *i += 1;
375    }
376
377    result
378}
379
380/// Parse a single dependency line: "name" {version_constraint}
381fn parse_dependency_line(line: &str) -> Option<(String, String)> {
382    let line = line.trim();
383    if line.is_empty() {
384        return None;
385    }
386
387    // Match: "name" {optional version}
388    let regex = Regex::new(r#""([^"]+)"\s*(.*)$"#).ok()?;
389    let caps = regex.captures(line)?;
390
391    let name = caps.get(1)?.as_str().to_string();
392    let version_part = caps.get(2)?.as_str().trim();
393
394    // Extract the operator and version constraint
395    let constraint = if version_part.is_empty() {
396        String::new()
397    } else {
398        extract_version_constraint(version_part)
399    };
400
401    Some((name, constraint))
402}
403
404/// Extract version constraint from {>= "1.0"} format
405fn extract_version_constraint(version_part: &str) -> String {
406    let regex = Regex::new(r#"\{\s*([<>=!]+)\s*"([^"]*)"\s*\}"#);
407    if let Ok(re) = regex
408        && let Some(caps) = re.captures(version_part)
409    {
410        let op = caps.get(1).map(|m| m.as_str()).unwrap_or("");
411        let ver = caps.get(2).map(|m| m.as_str()).unwrap_or("");
412        if !op.is_empty() && !ver.is_empty() {
413            return format!("{} {}", op, ver);
414        }
415    }
416
417    // If regex parsing fails, try to extract raw content
418    let content = version_part
419        .trim_matches('{')
420        .trim_matches('}')
421        .trim_matches('"')
422        .trim();
423
424    content.replace('"', "")
425}
426
427/// Parse checksums from checksum array
428fn parse_checksums(lines: &[&str], i: &mut usize, data: &mut OpamData) {
429    if let Some((_, first_value)) = parse_key_value(lines[*i]) {
430        let inline = first_value.trim();
431        if !inline.is_empty() && inline != "[" {
432            if let Some((key, value)) = parse_checksum_line(inline) {
433                match key.as_str() {
434                    "sha1" => data.sha1 = Sha1Digest::from_hex(&value).ok(),
435                    "md5" => data.md5 = Md5Digest::from_hex(&value).ok(),
436                    "sha256" => data.sha256 = Sha256Digest::from_hex(&value).ok(),
437                    "sha512" => data.sha512 = Sha512Digest::from_hex(&value).ok(),
438                    _ => {}
439                }
440            }
441            return;
442        }
443    }
444
445    *i += 1;
446    while *i < lines.len() {
447        let line = lines[*i];
448
449        if line.trim().contains(']') {
450            break;
451        }
452
453        if let Some((key, value)) = parse_checksum_line(line) {
454            match key.as_str() {
455                "sha1" => data.sha1 = Sha1Digest::from_hex(&value).ok(),
456                "md5" => data.md5 = Md5Digest::from_hex(&value).ok(),
457                "sha256" => data.sha256 = Sha256Digest::from_hex(&value).ok(),
458                "sha512" => data.sha512 = Sha512Digest::from_hex(&value).ok(),
459                _ => {}
460            }
461        }
462
463        *i += 1;
464    }
465}
466
467/// Parse a single checksum line: algo=hash
468fn parse_checksum_line(line: &str) -> Option<(String, String)> {
469    let line = line.trim().trim_matches('"').trim();
470
471    let regex = Regex::new(r"^(\w+)\s*=\s*(.+)$").ok()?;
472    let caps = regex.captures(line)?;
473
474    let key = caps.get(1)?.as_str().to_string();
475    let value = caps.get(2)?.as_str().to_string();
476
477    Some((key, value))
478}
479
/// Split text such as `"str1" "str2" "str3"` into its items.
///
/// Double quotes toggle "inside a string" and are not part of the output.
/// Outside of quotes any whitespace character separates items; inside quotes
/// whitespace is kept. Empty items are dropped.
///
/// Only the space character used to count as a separator, so items
/// separated by tabs were merged into a single entry.
fn split_quoted_strings(content: &str) -> Vec<String> {
    let mut items = Vec::new();
    let mut current = String::new();
    let mut in_quotes = false;

    for ch in content.chars() {
        match ch {
            '"' => in_quotes = !in_quotes,
            separator if separator.is_whitespace() && !in_quotes => {
                if !current.is_empty() {
                    items.push(std::mem::take(&mut current));
                }
            }
            _ => current.push(ch),
        }
    }

    if !current.is_empty() {
        items.push(current);
    }

    items
}
505
/// Combine synopsis and description into one text.
///
/// When both are present they are joined with a newline, synopsis first;
/// when only one is present it is returned as is; `None` when both are
/// missing.
fn build_description(synopsis: &Option<String>, description: &Option<String>) -> Option<String> {
    match (synopsis.as_deref(), description.as_deref()) {
        (None, None) => None,
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (Some(short), Some(long)) => Some([short, long].join("\n")),
    }
}
520
521/// Extract parties from authors and maintainers
522fn extract_parties(authors: &[String], maintainers: &[String]) -> Vec<Party> {
523    let mut parties = Vec::new();
524
525    // Add authors
526    for author in authors {
527        parties.push(Party {
528            r#type: Some("person".to_string()),
529            role: Some("author".to_string()),
530            name: Some(author.clone()),
531            email: None,
532            url: None,
533            organization: None,
534            organization_url: None,
535            timezone: None,
536        });
537    }
538
539    // Add maintainers (as email)
540    for maintainer in maintainers {
541        parties.push(Party {
542            r#type: Some("person".to_string()),
543            role: Some("maintainer".to_string()),
544            name: None,
545            email: Some(maintainer.clone()),
546            url: None,
547            organization: None,
548            organization_url: None,
549            timezone: None,
550        });
551    }
552
553    parties
554}
555
556/// Extract dependencies into Dependency objects
557fn extract_dependencies(deps: &[(String, String)]) -> Vec<Dependency> {
558    deps.iter()
559        .map(|(name, version_constraint)| Dependency {
560            purl: Some(format!("pkg:opam/{}", name)),
561            extracted_requirement: Some(version_constraint.clone()),
562            scope: Some("dependency".to_string()),
563            is_runtime: Some(true),
564            is_optional: Some(false),
565            is_pinned: Some(false),
566            is_direct: Some(true),
567            resolved_package: None,
568            extra_data: None,
569        })
570        .collect()
571}
572
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parsers::PackageParser;

    // --- file matching -----------------------------------------------------

    #[test]
    fn test_is_match_with_opam_extension() {
        assert!(OpamParser::is_match(Path::new("sample.opam")));
    }

    #[test]
    fn test_is_match_with_opam_name() {
        assert!(OpamParser::is_match(Path::new("opam")));
    }

    #[test]
    fn test_is_match_with_non_opam() {
        assert!(!OpamParser::is_match(Path::new("sample.txt")));
    }

    // --- low-level field helpers ---------------------------------------------

    #[test]
    fn test_parse_key_value() {
        let parsed = parse_key_value("name: \"js_of_ocaml\"");
        assert_eq!(
            parsed,
            Some(("name".to_string(), "\"js_of_ocaml\"".to_string()))
        );
    }

    #[test]
    fn test_clean_value() {
        assert_eq!(
            clean_value("\"js_of_ocaml\"").as_deref(),
            Some("js_of_ocaml")
        );
        assert!(clean_value("\"\"").is_none());
    }

    // --- dependencies --------------------------------------------------------

    #[test]
    fn test_extract_version_constraint() {
        assert_eq!(extract_version_constraint(r#"{>= "4.02.0"}"#), ">= 4.02.0");
    }

    #[test]
    fn test_parse_dependency_line() {
        let parsed = parse_dependency_line(r#""ocaml" {>= "4.02.0"}"#)
            .expect("line with a quoted name must parse");
        assert_eq!(parsed, ("ocaml".to_string(), ">= 4.02.0".to_string()));
    }

    #[test]
    fn test_parse_dependency_line_without_version() {
        let parsed =
            parse_dependency_line(r#""uchar""#).expect("line with a quoted name must parse");
        assert_eq!(parsed, ("uchar".to_string(), String::new()));
    }

    // --- string lists, description, parties ----------------------------------

    #[test]
    fn test_split_quoted_strings() {
        assert_eq!(
            split_quoted_strings(r#""str1" "str2""#),
            ["str1", "str2"]
        );
    }

    #[test]
    fn test_build_description() {
        let combined = build_description(
            &Some("Short description".to_string()),
            &Some("Long description".to_string()),
        );
        assert_eq!(
            combined.as_deref(),
            Some("Short description\nLong description")
        );
    }

    #[test]
    fn test_extract_parties() {
        let parties = extract_parties(
            &["Author One".to_string()],
            &["maintainer@example.com".to_string()],
        );

        assert_eq!(parties.len(), 2);

        let (author, maintainer) = (&parties[0], &parties[1]);
        assert_eq!(author.name.as_deref(), Some("Author One"));
        assert_eq!(author.role.as_deref(), Some("author"));
        assert_eq!(maintainer.email.as_deref(), Some("maintainer@example.com"));
        assert_eq!(maintainer.role.as_deref(), Some("maintainer"));
    }

    // --- licensing -------------------------------------------------------------

    #[test]
    fn test_normalize_opam_declared_license_preserves_scancode_style_expression() {
        let statement = "LGPL-3.0-only with OCaml-LGPL-linking-exception";
        let (declared, declared_spdx, detections) =
            normalize_opam_declared_license(Some(statement));

        assert_eq!(
            declared.as_deref(),
            Some("lgpl-3.0 WITH ocaml-lgpl-linking-exception")
        );
        assert_eq!(
            declared_spdx.as_deref(),
            Some("LGPL-3.0-only WITH OCaml-LGPL-linking-exception")
        );
        assert_eq!(detections.len(), 1);
        assert_eq!(
            detections[0].license_expression,
            "lgpl-3.0 WITH ocaml-lgpl-linking-exception"
        );
    }
}
683
// Registers this parser through the crate-level `register_parser!` macro.
// NOTE(review): the arguments look like (description, path globs, package
// type, primary language, documentation URL) — confirm against the macro
// definition. The two globs cover the same file names as
// `OpamParser::is_match` (`*.opam` and `opam`).
crate::register_parser!(
    "OCaml OPAM package manifest",
    &["**/*.opam", "**/opam"],
    "opam",
    "OCaml",
    Some("https://opam.ocaml.org/doc/Manual.html"),
);