Skip to main content

provenant/parsers/
opam.rs

1//! Parser for OCaml OPAM package manager manifests.
2//!
3//! Extracts package metadata and dependencies from OPAM files used by the
4//! OCaml ecosystem.
5//!
6//! # Supported Formats
7//! - *.opam files (OPAM package manifests)
8//! - opam files without extension
9//!
10//! # Key Features
11//! - Field-based parsing of OPAM's custom format (key: value)
12//! - Author and maintainer extraction (maintainer entries are stored verbatim as emails)
13//! - URL extraction for source archives, homepage, repository
14//! - License statement extraction
15//! - Checksum extraction (sha1, md5, sha256, sha512)
16//!
17//! # Implementation Notes
18//! - OPAM format uses custom syntax, not JSON/YAML/TOML
19//! - Strings can be quoted or unquoted
20//! - Lists use bracket notation: [item1 item2]
21//! - Multi-line strings use three-quote notation: """..."""
22
23use std::path::Path;
24
25use crate::parser_warn as warn;
26use regex::Regex;
27
28use crate::models::{
29    DatasourceId, Dependency, Md5Digest, PackageData, PackageType, Party, Sha1Digest, Sha256Digest,
30    Sha512Digest,
31};
32use crate::parsers::PackageParser;
33use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
34
35use super::license_normalization::{
36    DeclaredLicenseMatchMetadata, build_declared_license_data_from_pair,
37    normalize_spdx_declared_license,
38};
39
40/// Parser for OCaml OPAM package manifest files.
41///
42/// Handles the OPAM file format used by the OCaml package manager.
43/// Reference: <https://opam.ocaml.org/doc/Manual.html#Common-file-format>
44pub struct OpamParser;
45
46impl PackageParser for OpamParser {
47    const PACKAGE_TYPE: PackageType = PackageType::Opam;
48
49    fn is_match(path: &Path) -> bool {
50        path.file_name().is_some_and(|name| {
51            name.to_string_lossy().ends_with(".opam") || name.to_string_lossy() == "opam"
52        })
53    }
54
55    fn extract_packages(path: &Path) -> Vec<PackageData> {
56        vec![match read_file_to_string(path, None) {
57            Ok(text) => parse_opam(&text),
58            Err(e) => {
59                warn!("Failed to read OPAM file {:?}: {}", path, e);
60                default_package_data()
61            }
62        }]
63    }
64}
65
66/// Parsed OPAM file data
67#[derive(Debug, Default)]
68struct OpamData {
69    name: Option<String>,
70    version: Option<String>,
71    synopsis: Option<String>,
72    description: Option<String>,
73    homepage: Option<String>,
74    dev_repo: Option<String>,
75    bug_reports: Option<String>,
76    src: Option<String>,
77    authors: Vec<String>,
78    maintainers: Vec<String>,
79    license: Option<String>,
80    sha1: Option<Sha1Digest>,
81    md5: Option<Md5Digest>,
82    sha256: Option<Sha256Digest>,
83    sha512: Option<Sha512Digest>,
84    dependencies: Vec<(String, String)>, // (name, version_constraint)
85}
86
87fn default_package_data() -> PackageData {
88    PackageData {
89        package_type: Some(OpamParser::PACKAGE_TYPE),
90        primary_language: Some("Ocaml".to_string()),
91        datasource_id: Some(DatasourceId::OpamFile),
92        ..Default::default()
93    }
94}
95
96/// Parse an OPAM file from text content
97fn parse_opam(text: &str) -> PackageData {
98    let opam_data = parse_opam_data(text);
99
100    let description = build_description(&opam_data.synopsis, &opam_data.description);
101    let parties = extract_parties(&opam_data.authors, &opam_data.maintainers);
102    let dependencies = extract_dependencies(&opam_data.dependencies);
103
104    let (repository_homepage_url, api_data_url, purl) =
105        build_opam_urls(&opam_data.name, &opam_data.version);
106    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
107        normalize_opam_declared_license(opam_data.license.as_deref());
108
109    PackageData {
110        package_type: Some(OpamParser::PACKAGE_TYPE),
111        namespace: None,
112        name: opam_data.name,
113        version: opam_data.version,
114        qualifiers: None,
115        subpath: None,
116        primary_language: Some("Ocaml".to_string()),
117        description,
118        release_date: None,
119        parties,
120        keywords: Vec::new(),
121        homepage_url: opam_data.homepage,
122        download_url: opam_data.src,
123        size: None,
124        sha1: opam_data.sha1,
125        md5: opam_data.md5,
126        sha256: opam_data.sha256,
127        sha512: opam_data.sha512,
128        bug_tracking_url: opam_data.bug_reports,
129        code_view_url: None,
130        vcs_url: opam_data.dev_repo,
131        copyright: None,
132        holder: None,
133        declared_license_expression,
134        declared_license_expression_spdx,
135        license_detections,
136        other_license_expression: None,
137        other_license_expression_spdx: None,
138        other_license_detections: Vec::new(),
139        extracted_license_statement: opam_data.license,
140        notice_text: None,
141        source_packages: Vec::new(),
142        file_references: Vec::new(),
143        is_private: false,
144        is_virtual: false,
145        extra_data: None,
146        dependencies,
147        repository_homepage_url,
148        repository_download_url: None,
149        api_data_url,
150        datasource_id: Some(DatasourceId::OpamFile),
151        purl,
152    }
153}
154
155fn normalize_opam_declared_license(
156    statement: Option<&str>,
157) -> (
158    Option<String>,
159    Option<String>,
160    Vec<crate::models::LicenseDetection>,
161) {
162    let Some(statement) = statement.map(str::trim).filter(|value| !value.is_empty()) else {
163        return super::license_normalization::empty_declared_license_data();
164    };
165
166    match statement {
167        "GPL-2.0-only" => build_declared_license_data_from_pair(
168            "gpl-2.0",
169            "GPL-2.0-only",
170            DeclaredLicenseMatchMetadata::single_line(statement),
171        ),
172        "GPL-3.0-only" => build_declared_license_data_from_pair(
173            "gpl-3.0",
174            "GPL-3.0-only",
175            DeclaredLicenseMatchMetadata::single_line(statement),
176        ),
177        "LGPL-3.0-only with OCaml-LGPL-linking-exception" => build_declared_license_data_from_pair(
178            "lgpl-3.0 WITH ocaml-lgpl-linking-exception",
179            "LGPL-3.0-only WITH OCaml-LGPL-linking-exception",
180            DeclaredLicenseMatchMetadata::single_line(statement),
181        ),
182        _ => normalize_spdx_declared_license(Some(statement)),
183    }
184}
185
/// Derives `(repository_homepage_url, api_data_url, purl)` from the package
/// name and version.
///
/// Without a name all three are `None`. The homepage URL needs only the name,
/// the API data URL needs both name and version, and the purl includes the
/// version only when one is known.
fn build_opam_urls(
    name: &Option<String>,
    version: &Option<String>,
) -> (Option<String>, Option<String>, Option<String>) {
    let Some(package) = name.as_deref() else {
        return (None, None, None);
    };

    let repository_homepage_url = Some(format!("https://opam.ocaml.org/packages/{}", package));

    let (api_data_url, purl) = match version.as_deref() {
        Some(release) => (
            Some(format!(
                "https://github.com/ocaml/opam-repository/blob/master/packages/{}/{}.{}/opam",
                package, package, release
            )),
            Some(format!("pkg:opam/{}@{}", package, release)),
        ),
        None => (None, Some(format!("pkg:opam/{}", package))),
    };

    (repository_homepage_url, api_data_url, purl)
}
210
211/// Parse OPAM file text into structured data
212fn parse_opam_data(text: &str) -> OpamData {
213    let mut data = OpamData::default();
214    let lines: Vec<&str> = text.lines().collect();
215    let mut i = 0;
216    let mut iteration_count: usize = 0;
217
218    while i < lines.len() {
219        iteration_count += 1;
220        if iteration_count > MAX_ITERATION_COUNT {
221            warn!("parse_opam_data: exceeded MAX_ITERATION_COUNT, breaking");
222            break;
223        }
224        let line = lines[i];
225
226        // Parse key: value format
227        if let Some((key, value)) = parse_key_value(line) {
228            match key.as_str() {
229                "name" => data.name = clean_value(&value),
230                "version" => data.version = clean_value(&value),
231                "synopsis" => data.synopsis = clean_value(&value),
232                "description" => {
233                    data.description = parse_multiline_string(&lines, &mut i);
234                }
235                "homepage" => data.homepage = clean_value(&value),
236                "dev-repo" => data.dev_repo = clean_value(&value),
237                "bug-reports" => data.bug_reports = clean_value(&value),
238                "src" => {
239                    if value.trim().is_empty() && i + 1 < lines.len() {
240                        i += 1;
241                        data.src = clean_value(lines[i]);
242                    } else {
243                        data.src = clean_value(&value);
244                    }
245                }
246                "license" => data.license = clean_value(&value),
247                "authors" => {
248                    data.authors = parse_string_array(&lines, &mut i, &value);
249                }
250                "maintainer" => {
251                    data.maintainers = parse_string_array(&lines, &mut i, &value);
252                }
253                "depends" => {
254                    data.dependencies = parse_dependency_array(&lines, &mut i);
255                }
256                "checksum" => {
257                    parse_checksums(&lines, &mut i, &mut data);
258                }
259                _ => {}
260            }
261        }
262
263        i += 1;
264    }
265
266    data
267}
268
/// Splits a `key: value` line at its first colon.
///
/// Returns `None` for blank lines, lines whose first non-blank character is
/// `#`, and lines without a colon. Both halves are returned trimmed; the
/// value keeps any quotes or brackets.
fn parse_key_value(line: &str) -> Option<(String, String)> {
    let trimmed = line.trim();
    if trimmed.is_empty() || trimmed.starts_with('#') {
        return None;
    }

    let (key, value) = trimmed.split_once(':')?;
    Some((key.trim().to_string(), value.trim().to_string()))
}
284
285/// Clean a value by removing quotes and brackets
286fn clean_value(value: &str) -> Option<String> {
287    let cleaned = value
288        .trim()
289        .trim_matches('"')
290        .trim_matches('[')
291        .trim_matches(']')
292        .trim();
293
294    if cleaned.is_empty() {
295        None
296    } else {
297        Some(truncate_field(cleaned.to_string()))
298    }
299}
300
301/// Parse a multiline string enclosed in triple quotes
302fn parse_multiline_string(lines: &[&str], i: &mut usize) -> Option<String> {
303    let mut result = String::new();
304    let mut iteration_count: usize = 0;
305
306    if let Some((_, value)) = parse_key_value(lines[*i]) {
307        result.push_str(value.trim_matches('"').trim());
308    }
309
310    *i += 1;
311    while *i < lines.len() {
312        iteration_count += 1;
313        if iteration_count > MAX_ITERATION_COUNT {
314            warn!("parse_multiline_string: exceeded MAX_ITERATION_COUNT, breaking");
315            break;
316        }
317        let line = lines[*i];
318        result.push(' ');
319        result.push_str(line.trim_matches('"').trim());
320
321        if line.contains("\"\"\"") {
322            break;
323        }
324        *i += 1;
325    }
326
327    let cleaned = result.trim().to_string();
328    if cleaned.is_empty() {
329        None
330    } else {
331        Some(truncate_field(cleaned))
332    }
333}
334
335/// Parse a string array (single-line or multiline)
336fn parse_string_array(lines: &[&str], i: &mut usize, first_value: &str) -> Vec<String> {
337    let mut result = Vec::new();
338    let mut iteration_count: usize = 0;
339
340    let mut content = first_value.to_string();
341
342    if content.contains('[') && !content.contains(']') {
343        *i += 1;
344        while *i < lines.len() {
345            iteration_count += 1;
346            if iteration_count > MAX_ITERATION_COUNT {
347                warn!("parse_string_array: exceeded MAX_ITERATION_COUNT, breaking");
348                break;
349            }
350            let line = lines[*i];
351            content.push(' ');
352            content.push_str(line);
353
354            if line.contains(']') {
355                break;
356            }
357            *i += 1;
358        }
359    }
360
361    let cleaned = content.trim_matches('[').trim_matches(']').trim();
362
363    for part in split_quoted_strings(cleaned) {
364        let p = part.trim_matches('"').trim();
365        if !p.is_empty() {
366            result.push(truncate_field(p.to_string()));
367        }
368    }
369
370    result
371}
372
373/// Parse dependency array
374fn parse_dependency_array(lines: &[&str], i: &mut usize) -> Vec<(String, String)> {
375    let mut result = Vec::new();
376    let mut iteration_count: usize = 0;
377
378    *i += 1;
379    while *i < lines.len() {
380        iteration_count += 1;
381        if iteration_count > MAX_ITERATION_COUNT {
382            warn!("parse_dependency_array: exceeded MAX_ITERATION_COUNT, breaking");
383            break;
384        }
385        let line = lines[*i];
386
387        if line.trim().contains(']') {
388            break;
389        }
390
391        if let Some((name, version)) = parse_dependency_line(line) {
392            result.push((name, version));
393        }
394
395        *i += 1;
396    }
397
398    result
399}
400
401/// Parse a single dependency line: "name" {version_constraint}
402fn parse_dependency_line(line: &str) -> Option<(String, String)> {
403    let line = line.trim();
404    if line.is_empty() {
405        return None;
406    }
407
408    // Match: "name" {optional version}
409    let regex = Regex::new(r#""([^"]+)"\s*(.*)$"#).ok()?;
410    let caps = regex.captures(line)?;
411
412    let name = truncate_field(caps.get(1)?.as_str().to_string());
413    let version_part = caps.get(2)?.as_str().trim();
414
415    // Extract the operator and version constraint
416    let constraint = if version_part.is_empty() {
417        String::new()
418    } else {
419        truncate_field(extract_version_constraint(version_part))
420    };
421
422    Some((name, constraint))
423}
424
425/// Extract version constraint from {>= "1.0"} format
426fn extract_version_constraint(version_part: &str) -> String {
427    let regex = Regex::new(r#"\{\s*([<>=!]+)\s*"([^"]*)"\s*\}"#);
428    if let Ok(re) = regex
429        && let Some(caps) = re.captures(version_part)
430    {
431        let op = caps.get(1).map(|m| m.as_str()).unwrap_or("");
432        let ver = caps.get(2).map(|m| m.as_str()).unwrap_or("");
433        if !op.is_empty() && !ver.is_empty() {
434            return format!("{} {}", op, ver);
435        }
436    }
437
438    // If regex parsing fails, try to extract raw content
439    let content = version_part
440        .trim_matches('{')
441        .trim_matches('}')
442        .trim_matches('"')
443        .trim();
444
445    content.replace('"', "")
446}
447
448/// Parse checksums from checksum array
449fn parse_checksums(lines: &[&str], i: &mut usize, data: &mut OpamData) {
450    if let Some((_, first_value)) = parse_key_value(lines[*i]) {
451        let inline = first_value.trim();
452        if !inline.is_empty() && inline != "[" {
453            if let Some((key, value)) = parse_checksum_line(inline) {
454                match key.as_str() {
455                    "sha1" => data.sha1 = Sha1Digest::from_hex(&value).ok(),
456                    "md5" => data.md5 = Md5Digest::from_hex(&value).ok(),
457                    "sha256" => data.sha256 = Sha256Digest::from_hex(&value).ok(),
458                    "sha512" => data.sha512 = Sha512Digest::from_hex(&value).ok(),
459                    _ => {}
460                }
461            }
462            return;
463        }
464    }
465
466    let mut iteration_count: usize = 0;
467    *i += 1;
468    while *i < lines.len() {
469        iteration_count += 1;
470        if iteration_count > MAX_ITERATION_COUNT {
471            warn!("parse_checksums: exceeded MAX_ITERATION_COUNT, breaking");
472            break;
473        }
474        let line = lines[*i];
475
476        if line.trim().contains(']') {
477            break;
478        }
479
480        if let Some((key, value)) = parse_checksum_line(line) {
481            match key.as_str() {
482                "sha1" => data.sha1 = Sha1Digest::from_hex(&value).ok(),
483                "md5" => data.md5 = Md5Digest::from_hex(&value).ok(),
484                "sha256" => data.sha256 = Sha256Digest::from_hex(&value).ok(),
485                "sha512" => data.sha512 = Sha512Digest::from_hex(&value).ok(),
486                _ => {}
487            }
488        }
489
490        *i += 1;
491    }
492}
493
494/// Parse a single checksum line: algo=hash
495fn parse_checksum_line(line: &str) -> Option<(String, String)> {
496    let line = line.trim().trim_matches('"').trim();
497
498    let regex = Regex::new(r"^(\w+)\s*=\s*(.+)$").ok()?;
499    let caps = regex.captures(line)?;
500
501    let key = caps.get(1)?.as_str().to_string();
502    let value = caps.get(2)?.as_str().to_string();
503
504    Some((key, value))
505}
506
/// Splits text such as `"str1" "str2" "str3"` into its items.
///
/// Double quotes toggle a quoted state and are never copied to the output.
/// A space outside quotes ends the current item; every other character,
/// including spaces inside quotes, is kept. Empty items are not emitted.
fn split_quoted_strings(content: &str) -> Vec<String> {
    let mut items: Vec<String> = Vec::new();
    let mut item = String::new();
    let mut quoted = false;

    for c in content.chars() {
        if c == '"' {
            quoted = !quoted;
        } else if c == ' ' && !quoted {
            if !item.is_empty() {
                items.push(std::mem::take(&mut item));
            }
        } else {
            item.push(c);
        }
    }

    if !item.is_empty() {
        items.push(item);
    }

    items
}
532
/// Combines the synopsis and the long description into one text.
///
/// When both are present they are joined with a newline, synopsis first;
/// when only one is present it is returned as is; when neither is present
/// the result is `None`.
fn build_description(synopsis: &Option<String>, description: &Option<String>) -> Option<String> {
    match (synopsis.as_deref(), description.as_deref()) {
        (Some(short), Some(long)) => Some(format!("{}\n{}", short, long)),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
547
548/// Extract parties from authors and maintainers
549fn extract_parties(authors: &[String], maintainers: &[String]) -> Vec<Party> {
550    let mut parties = Vec::new();
551
552    // Add authors
553    for author in authors {
554        parties.push(Party {
555            r#type: Some("person".to_string()),
556            role: Some("author".to_string()),
557            name: Some(truncate_field(author.clone())),
558            email: None,
559            url: None,
560            organization: None,
561            organization_url: None,
562            timezone: None,
563        });
564    }
565
566    // Add maintainers (as email)
567    for maintainer in maintainers {
568        parties.push(Party {
569            r#type: Some("person".to_string()),
570            role: Some("maintainer".to_string()),
571            name: None,
572            email: Some(truncate_field(maintainer.clone())),
573            url: None,
574            organization: None,
575            organization_url: None,
576            timezone: None,
577        });
578    }
579
580    parties
581}
582
583/// Extract dependencies into Dependency objects
584fn extract_dependencies(deps: &[(String, String)]) -> Vec<Dependency> {
585    deps.iter()
586        .map(|(name, version_constraint)| Dependency {
587            purl: Some(truncate_field(format!("pkg:opam/{}", name))),
588            extracted_requirement: Some(truncate_field(version_constraint.clone())),
589            scope: Some("dependency".to_string()),
590            is_runtime: Some(true),
591            is_optional: Some(false),
592            is_pinned: Some(false),
593            is_direct: Some(true),
594            resolved_package: None,
595            extra_data: None,
596        })
597        .collect()
598}
599
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parsers::PackageParser;

    /// Runs `OpamParser::is_match` on a file name given as a string.
    fn is_opam_file(file_name: &str) -> bool {
        OpamParser::is_match(Path::new(file_name))
    }

    #[test]
    fn test_is_match_with_opam_extension() {
        assert!(is_opam_file("sample.opam"));
    }

    #[test]
    fn test_is_match_with_opam_name() {
        assert!(is_opam_file("opam"));
    }

    #[test]
    fn test_is_match_with_non_opam() {
        assert!(!is_opam_file("sample.txt"));
    }

    #[test]
    fn test_parse_key_value() {
        let parsed = parse_key_value("name: \"js_of_ocaml\"");
        assert_eq!(
            parsed,
            Some(("name".to_string(), "\"js_of_ocaml\"".to_string()))
        );
    }

    #[test]
    fn test_clean_value() {
        assert_eq!(
            clean_value("\"js_of_ocaml\"").as_deref(),
            Some("js_of_ocaml")
        );
        assert!(clean_value("\"\"").is_none());
    }

    #[test]
    fn test_extract_version_constraint() {
        assert_eq!(extract_version_constraint(r#"{>= "4.02.0"}"#), ">= 4.02.0");
    }

    #[test]
    fn test_parse_dependency_line() {
        let parsed = parse_dependency_line(r#""ocaml" {>= "4.02.0"}"#);
        assert_eq!(parsed, Some(("ocaml".to_string(), ">= 4.02.0".to_string())));
    }

    #[test]
    fn test_parse_dependency_line_without_version() {
        let parsed = parse_dependency_line(r#""uchar""#);
        assert_eq!(parsed, Some(("uchar".to_string(), String::new())));
    }

    #[test]
    fn test_split_quoted_strings() {
        assert_eq!(
            split_quoted_strings(r#""str1" "str2""#),
            vec!["str1", "str2"]
        );
    }

    #[test]
    fn test_build_description() {
        let combined = build_description(
            &Some("Short description".to_string()),
            &Some("Long description".to_string()),
        );
        assert_eq!(
            combined.as_deref(),
            Some("Short description\nLong description")
        );
    }

    #[test]
    fn test_extract_parties() {
        let parties = extract_parties(
            &["Author One".to_string()],
            &["maintainer@example.com".to_string()],
        );

        assert_eq!(parties.len(), 2);

        let author = &parties[0];
        assert_eq!(author.name, Some("Author One".to_string()));
        assert_eq!(author.role, Some("author".to_string()));

        let maintainer = &parties[1];
        assert_eq!(maintainer.email, Some("maintainer@example.com".to_string()));
        assert_eq!(maintainer.role, Some("maintainer".to_string()));
    }

    #[test]
    fn test_normalize_opam_declared_license_preserves_scancode_style_expression() {
        const EXPECTED_EXPRESSION: &str = "lgpl-3.0 WITH ocaml-lgpl-linking-exception";
        const EXPECTED_SPDX: &str = "LGPL-3.0-only WITH OCaml-LGPL-linking-exception";

        let (declared, declared_spdx, detections) = normalize_opam_declared_license(Some(
            "LGPL-3.0-only with OCaml-LGPL-linking-exception",
        ));

        assert_eq!(declared.as_deref(), Some(EXPECTED_EXPRESSION));
        assert_eq!(declared_spdx.as_deref(), Some(EXPECTED_SPDX));
        assert_eq!(detections.len(), 1);
        assert_eq!(detections[0].license_expression, EXPECTED_EXPRESSION);
    }
}
710
// Registers this parser through the crate-level `register_parser!` macro.
// NOTE(review): the macro definition is not visible here; judging by the
// values, the arguments are presumably a human-readable description, the path
// glob patterns, the package type name, the primary language and an optional
// documentation URL — confirm against the macro definition.
crate::register_parser!(
    "OCaml OPAM package manifest",
    &["**/*.opam", "**/opam"],
    "opam",
    "OCaml",
    Some("https://opam.ocaml.org/doc/Manual.html"),
);