Skip to main content

provenant/parsers/
opam.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for OCaml OPAM package manager manifests.
5//!
6//! Extracts package metadata and dependencies from OPAM files used by the
7//! OCaml ecosystem.
8//!
9//! # Supported Formats
10//! - *.opam files (OPAM package manifests)
11//! - opam files without extension
12//!
13//! # Key Features
14//! - Field-based parsing of OPAM's custom format (key: value)
//! - Author and maintainer extraction (maintainer entries are recorded in the party's email field)
16//! - URL extraction for source archives, homepage, repository
17//! - License statement extraction
18//! - Checksum extraction (sha1, md5, sha256, sha512)
19//!
20//! # Implementation Notes
21//! - OPAM format uses custom syntax, not JSON/YAML/TOML
22//! - Strings can be quoted or unquoted
23//! - Lists use bracket notation: [item1 item2]
24//! - Multi-line strings use three-quote notation: """..."""
25
26use std::path::Path;
27
28use crate::parser_warn as warn;
29use regex::Regex;
30
31use crate::models::{
32    DatasourceId, Dependency, Md5Digest, PackageData, PackageType, Party, Sha1Digest, Sha256Digest,
33    Sha512Digest,
34};
35use crate::parsers::PackageParser;
36use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
37
38use super::license_normalization::{
39    DeclaredLicenseMatchMetadata, build_declared_license_data_from_pair,
40    normalize_spdx_declared_license,
41};
42
/// Parser for OCaml OPAM package manifest files.
///
/// Handles the OPAM file format used by the OCaml package manager.
/// This is a field-less unit type; its behaviour is provided through the
/// [`PackageParser`] implementation.
/// Reference: <https://opam.ocaml.org/doc/Manual.html#Common-file-format>
pub struct OpamParser;
48
49impl PackageParser for OpamParser {
50    const PACKAGE_TYPE: PackageType = PackageType::Opam;
51
52    fn is_match(path: &Path) -> bool {
53        path.file_name().is_some_and(|name| {
54            name.to_string_lossy().ends_with(".opam") || name.to_string_lossy() == "opam"
55        })
56    }
57
58    fn extract_packages(path: &Path) -> Vec<PackageData> {
59        vec![match read_file_to_string(path, None) {
60            Ok(text) => parse_opam(&text),
61            Err(e) => {
62                warn!("Failed to read OPAM file {:?}: {}", path, e);
63                default_package_data()
64            }
65        }]
66    }
67}
68
/// Parsed OPAM file data
///
/// Intermediate, parser-private representation filled in by
/// `parse_opam_data` and converted into a `PackageData` by `parse_opam`.
#[derive(Debug, Default)]
struct OpamData {
    // Value of the `name` field.
    name: Option<String>,
    // Value of the `version` field.
    version: Option<String>,
    // Value of the `synopsis` field.
    synopsis: Option<String>,
    // Value of the `description` field (possibly spanning several lines).
    description: Option<String>,
    // Value of the `homepage` field.
    homepage: Option<String>,
    // Value of the `dev-repo` field; becomes the package's `vcs_url`.
    dev_repo: Option<String>,
    // Value of the `bug-reports` field; becomes the package's `bug_tracking_url`.
    bug_reports: Option<String>,
    // Value of the `src` field; becomes the package's `download_url`.
    src: Option<String>,
    // Entries of the `authors` field.
    authors: Vec<String>,
    // Entries of the `maintainer` field.
    maintainers: Vec<String>,
    // Raw value of the `license` field.
    license: Option<String>,
    // Digests collected from the `checksum` field, one slot per algorithm.
    sha1: Option<Sha1Digest>,
    md5: Option<Md5Digest>,
    sha256: Option<Sha256Digest>,
    sha512: Option<Sha512Digest>,
    // Entries of the `depends` field.
    dependencies: Vec<(String, String)>, // (name, version_constraint)
}
89
90fn default_package_data() -> PackageData {
91    PackageData {
92        package_type: Some(OpamParser::PACKAGE_TYPE),
93        primary_language: Some("Ocaml".to_string()),
94        datasource_id: Some(DatasourceId::OpamFile),
95        ..Default::default()
96    }
97}
98
99/// Parse an OPAM file from text content
100fn parse_opam(text: &str) -> PackageData {
101    let opam_data = parse_opam_data(text);
102
103    let description = build_description(&opam_data.synopsis, &opam_data.description);
104    let parties = extract_parties(&opam_data.authors, &opam_data.maintainers);
105    let dependencies = extract_dependencies(&opam_data.dependencies);
106
107    let (repository_homepage_url, api_data_url, purl) =
108        build_opam_urls(&opam_data.name, &opam_data.version);
109    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
110        normalize_opam_declared_license(opam_data.license.as_deref());
111
112    PackageData {
113        package_type: Some(OpamParser::PACKAGE_TYPE),
114        namespace: None,
115        name: opam_data.name,
116        version: opam_data.version,
117        qualifiers: None,
118        subpath: None,
119        primary_language: Some("Ocaml".to_string()),
120        description,
121        release_date: None,
122        parties,
123        keywords: Vec::new(),
124        homepage_url: opam_data.homepage,
125        download_url: opam_data.src,
126        size: None,
127        sha1: opam_data.sha1,
128        md5: opam_data.md5,
129        sha256: opam_data.sha256,
130        sha512: opam_data.sha512,
131        bug_tracking_url: opam_data.bug_reports,
132        code_view_url: None,
133        vcs_url: opam_data.dev_repo,
134        copyright: None,
135        holder: None,
136        declared_license_expression,
137        declared_license_expression_spdx,
138        license_detections,
139        other_license_expression: None,
140        other_license_expression_spdx: None,
141        other_license_detections: Vec::new(),
142        extracted_license_statement: opam_data.license,
143        notice_text: None,
144        source_packages: Vec::new(),
145        file_references: Vec::new(),
146        is_private: false,
147        is_virtual: false,
148        extra_data: None,
149        dependencies,
150        repository_homepage_url,
151        repository_download_url: None,
152        api_data_url,
153        datasource_id: Some(DatasourceId::OpamFile),
154        purl,
155    }
156}
157
158fn normalize_opam_declared_license(
159    statement: Option<&str>,
160) -> (
161    Option<String>,
162    Option<String>,
163    Vec<crate::models::LicenseDetection>,
164) {
165    let Some(statement) = statement.map(str::trim).filter(|value| !value.is_empty()) else {
166        return super::license_normalization::empty_declared_license_data();
167    };
168
169    match statement {
170        "GPL-2.0-only" => build_declared_license_data_from_pair(
171            "gpl-2.0",
172            "GPL-2.0-only",
173            DeclaredLicenseMatchMetadata::single_line(statement),
174        ),
175        "GPL-3.0-only" => build_declared_license_data_from_pair(
176            "gpl-3.0",
177            "GPL-3.0-only",
178            DeclaredLicenseMatchMetadata::single_line(statement),
179        ),
180        "LGPL-3.0-only with OCaml-LGPL-linking-exception" => build_declared_license_data_from_pair(
181            "lgpl-3.0 WITH ocaml-lgpl-linking-exception",
182            "LGPL-3.0-only WITH OCaml-LGPL-linking-exception",
183            DeclaredLicenseMatchMetadata::single_line(statement),
184        ),
185        _ => normalize_spdx_declared_license(Some(statement)),
186    }
187}
188
/// Derives `(repository_homepage_url, api_data_url, purl)` from the package
/// name and version.
///
/// Without a name all three are `None`. With a name but no version the
/// homepage URL and an unversioned purl are produced and the API URL is
/// `None`. With both, all three values are produced.
fn build_opam_urls(
    name: &Option<String>,
    version: &Option<String>,
) -> (Option<String>, Option<String>, Option<String>) {
    let Some(pkg) = name.as_deref() else {
        return (None, None, None);
    };

    let homepage = format!("https://opam.ocaml.org/packages/{}", pkg);

    match version.as_deref() {
        Some(ver) => {
            let api = format!(
                "https://github.com/ocaml/opam-repository/blob/master/packages/{}/{}.{}/opam",
                pkg, pkg, ver
            );
            let purl = format!("pkg:opam/{}@{}", pkg, ver);
            (Some(homepage), Some(api), Some(purl))
        }
        None => (Some(homepage), None, Some(format!("pkg:opam/{}", pkg))),
    }
}
213
214/// Parse OPAM file text into structured data
215fn parse_opam_data(text: &str) -> OpamData {
216    let mut data = OpamData::default();
217    let lines: Vec<&str> = text.lines().collect();
218    let mut i = 0;
219    let mut iteration_count: usize = 0;
220
221    while i < lines.len() {
222        iteration_count += 1;
223        if iteration_count > MAX_ITERATION_COUNT {
224            warn!("parse_opam_data: exceeded MAX_ITERATION_COUNT, breaking");
225            break;
226        }
227        let line = lines[i];
228
229        // Parse key: value format
230        if let Some((key, value)) = parse_key_value(line) {
231            match key.as_str() {
232                "name" => data.name = clean_value(&value),
233                "version" => data.version = clean_value(&value),
234                "synopsis" => data.synopsis = clean_value(&value),
235                "description" => {
236                    data.description = parse_multiline_string(&lines, &mut i);
237                }
238                "homepage" => data.homepage = clean_value(&value),
239                "dev-repo" => data.dev_repo = clean_value(&value),
240                "bug-reports" => data.bug_reports = clean_value(&value),
241                "src" => {
242                    if value.trim().is_empty() && i + 1 < lines.len() {
243                        i += 1;
244                        data.src = clean_value(lines[i]);
245                    } else {
246                        data.src = clean_value(&value);
247                    }
248                }
249                "license" => data.license = clean_value(&value),
250                "authors" => {
251                    data.authors = parse_string_array(&lines, &mut i, &value);
252                }
253                "maintainer" => {
254                    data.maintainers = parse_string_array(&lines, &mut i, &value);
255                }
256                "depends" => {
257                    data.dependencies = parse_dependency_array(&lines, &mut i);
258                }
259                "checksum" => {
260                    parse_checksums(&lines, &mut i, &mut data);
261                }
262                _ => {}
263            }
264        }
265
266        i += 1;
267    }
268
269    data
270}
271
/// Split a `key: value` line at its first colon.
///
/// Returns `None` for blank lines, lines starting with `#` (after trimming)
/// and lines without a colon. Both halves are returned trimmed; the value
/// keeps any quotes or brackets it contains.
fn parse_key_value(line: &str) -> Option<(String, String)> {
    let trimmed = line.trim();
    if trimmed.is_empty() || trimmed.starts_with('#') {
        return None;
    }

    let (raw_key, raw_value) = trimmed.split_once(':')?;
    Some((raw_key.trim().to_string(), raw_value.trim().to_string()))
}
287
288/// Clean a value by removing quotes and brackets
289fn clean_value(value: &str) -> Option<String> {
290    let cleaned = value
291        .trim()
292        .trim_matches('"')
293        .trim_matches('[')
294        .trim_matches(']')
295        .trim();
296
297    if cleaned.is_empty() {
298        None
299    } else {
300        Some(truncate_field(cleaned.to_string()))
301    }
302}
303
304/// Parse a multiline string enclosed in triple quotes
305fn parse_multiline_string(lines: &[&str], i: &mut usize) -> Option<String> {
306    let mut result = String::new();
307    let mut iteration_count: usize = 0;
308
309    if let Some((_, value)) = parse_key_value(lines[*i]) {
310        result.push_str(value.trim_matches('"').trim());
311    }
312
313    *i += 1;
314    while *i < lines.len() {
315        iteration_count += 1;
316        if iteration_count > MAX_ITERATION_COUNT {
317            warn!("parse_multiline_string: exceeded MAX_ITERATION_COUNT, breaking");
318            break;
319        }
320        let line = lines[*i];
321        result.push(' ');
322        result.push_str(line.trim_matches('"').trim());
323
324        if line.contains("\"\"\"") {
325            break;
326        }
327        *i += 1;
328    }
329
330    let cleaned = result.trim().to_string();
331    if cleaned.is_empty() {
332        None
333    } else {
334        Some(truncate_field(cleaned))
335    }
336}
337
338/// Parse a string array (single-line or multiline)
339fn parse_string_array(lines: &[&str], i: &mut usize, first_value: &str) -> Vec<String> {
340    let mut result = Vec::new();
341    let mut iteration_count: usize = 0;
342
343    let mut content = first_value.to_string();
344
345    if content.contains('[') && !content.contains(']') {
346        *i += 1;
347        while *i < lines.len() {
348            iteration_count += 1;
349            if iteration_count > MAX_ITERATION_COUNT {
350                warn!("parse_string_array: exceeded MAX_ITERATION_COUNT, breaking");
351                break;
352            }
353            let line = lines[*i];
354            content.push(' ');
355            content.push_str(line);
356
357            if line.contains(']') {
358                break;
359            }
360            *i += 1;
361        }
362    }
363
364    let cleaned = content.trim_matches('[').trim_matches(']').trim();
365
366    for part in split_quoted_strings(cleaned) {
367        let p = part.trim_matches('"').trim();
368        if !p.is_empty() {
369            result.push(truncate_field(p.to_string()));
370        }
371    }
372
373    result
374}
375
376/// Parse dependency array
377fn parse_dependency_array(lines: &[&str], i: &mut usize) -> Vec<(String, String)> {
378    let mut result = Vec::new();
379    let mut iteration_count: usize = 0;
380
381    *i += 1;
382    while *i < lines.len() {
383        iteration_count += 1;
384        if iteration_count > MAX_ITERATION_COUNT {
385            warn!("parse_dependency_array: exceeded MAX_ITERATION_COUNT, breaking");
386            break;
387        }
388        let line = lines[*i];
389
390        if line.trim().contains(']') {
391            break;
392        }
393
394        if let Some((name, version)) = parse_dependency_line(line) {
395            result.push((name, version));
396        }
397
398        *i += 1;
399    }
400
401    result
402}
403
404/// Parse a single dependency line: "name" {version_constraint}
405fn parse_dependency_line(line: &str) -> Option<(String, String)> {
406    let line = line.trim();
407    if line.is_empty() {
408        return None;
409    }
410
411    // Match: "name" {optional version}
412    let regex = Regex::new(r#""([^"]+)"\s*(.*)$"#).ok()?;
413    let caps = regex.captures(line)?;
414
415    let name = truncate_field(caps.get(1)?.as_str().to_string());
416    let version_part = caps.get(2)?.as_str().trim();
417
418    // Extract the operator and version constraint
419    let constraint = if version_part.is_empty() {
420        String::new()
421    } else {
422        truncate_field(extract_version_constraint(version_part))
423    };
424
425    Some((name, constraint))
426}
427
428/// Extract version constraint from {>= "1.0"} format
429fn extract_version_constraint(version_part: &str) -> String {
430    let regex = Regex::new(r#"\{\s*([<>=!]+)\s*"([^"]*)"\s*\}"#);
431    if let Ok(re) = regex
432        && let Some(caps) = re.captures(version_part)
433    {
434        let op = caps.get(1).map(|m| m.as_str()).unwrap_or("");
435        let ver = caps.get(2).map(|m| m.as_str()).unwrap_or("");
436        if !op.is_empty() && !ver.is_empty() {
437            return format!("{} {}", op, ver);
438        }
439    }
440
441    // If regex parsing fails, try to extract raw content
442    let content = version_part
443        .trim_matches('{')
444        .trim_matches('}')
445        .trim_matches('"')
446        .trim();
447
448    content.replace('"', "")
449}
450
451/// Parse checksums from checksum array
452fn parse_checksums(lines: &[&str], i: &mut usize, data: &mut OpamData) {
453    if let Some((_, first_value)) = parse_key_value(lines[*i]) {
454        let inline = first_value.trim();
455        if !inline.is_empty() && inline != "[" {
456            if let Some((key, value)) = parse_checksum_line(inline) {
457                match key.as_str() {
458                    "sha1" => data.sha1 = Sha1Digest::from_hex(&value).ok(),
459                    "md5" => data.md5 = Md5Digest::from_hex(&value).ok(),
460                    "sha256" => data.sha256 = Sha256Digest::from_hex(&value).ok(),
461                    "sha512" => data.sha512 = Sha512Digest::from_hex(&value).ok(),
462                    _ => {}
463                }
464            }
465            return;
466        }
467    }
468
469    let mut iteration_count: usize = 0;
470    *i += 1;
471    while *i < lines.len() {
472        iteration_count += 1;
473        if iteration_count > MAX_ITERATION_COUNT {
474            warn!("parse_checksums: exceeded MAX_ITERATION_COUNT, breaking");
475            break;
476        }
477        let line = lines[*i];
478
479        if line.trim().contains(']') {
480            break;
481        }
482
483        if let Some((key, value)) = parse_checksum_line(line) {
484            match key.as_str() {
485                "sha1" => data.sha1 = Sha1Digest::from_hex(&value).ok(),
486                "md5" => data.md5 = Md5Digest::from_hex(&value).ok(),
487                "sha256" => data.sha256 = Sha256Digest::from_hex(&value).ok(),
488                "sha512" => data.sha512 = Sha512Digest::from_hex(&value).ok(),
489                _ => {}
490            }
491        }
492
493        *i += 1;
494    }
495}
496
497/// Parse a single checksum line: algo=hash
498fn parse_checksum_line(line: &str) -> Option<(String, String)> {
499    let line = line.trim().trim_matches('"').trim();
500
501    let regex = Regex::new(r"^(\w+)\s*=\s*(.+)$").ok()?;
502    let caps = regex.captures(line)?;
503
504    let key = caps.get(1)?.as_str().to_string();
505    let value = caps.get(2)?.as_str().to_string();
506
507    Some((key, value))
508}
509
/// Split quoted strings like: "str1" "str2" "str3"
///
/// Double quotes toggle a "quoted" state and are never copied to the output.
/// A space outside quotes ends the current token; spaces inside quotes are
/// kept. Empty tokens are skipped.
fn split_quoted_strings(content: &str) -> Vec<String> {
    let mut tokens: Vec<String> = Vec::new();
    let mut buffer = String::new();
    let mut quoted = false;

    for c in content.chars() {
        if c == '"' {
            quoted = !quoted;
        } else if c == ' ' && !quoted {
            // Token boundary: flush whatever has accumulated so far.
            if !buffer.is_empty() {
                tokens.push(buffer.trim_matches('"').to_string());
                buffer.clear();
            }
        } else {
            buffer.push(c);
        }
    }

    // Flush the trailing token, if any.
    if !buffer.is_empty() {
        tokens.push(buffer.trim_matches('"').to_string());
    }

    tokens
}
535
/// Build description from synopsis and description.
///
/// When both are present they are joined with a newline, synopsis first.
/// When only one is present it is returned as is. `None` when neither is.
fn build_description(synopsis: &Option<String>, description: &Option<String>) -> Option<String> {
    match (synopsis.as_deref(), description.as_deref()) {
        (Some(short), Some(long)) => Some([short, long].join("\n")),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
550
551/// Extract parties from authors and maintainers
552fn extract_parties(authors: &[String], maintainers: &[String]) -> Vec<Party> {
553    let mut parties = Vec::new();
554
555    // Add authors
556    for author in authors {
557        parties.push(Party {
558            r#type: Some("person".to_string()),
559            role: Some("author".to_string()),
560            name: Some(truncate_field(author.clone())),
561            email: None,
562            url: None,
563            organization: None,
564            organization_url: None,
565            timezone: None,
566        });
567    }
568
569    // Add maintainers (as email)
570    for maintainer in maintainers {
571        parties.push(Party {
572            r#type: Some("person".to_string()),
573            role: Some("maintainer".to_string()),
574            name: None,
575            email: Some(truncate_field(maintainer.clone())),
576            url: None,
577            organization: None,
578            organization_url: None,
579            timezone: None,
580        });
581    }
582
583    parties
584}
585
586/// Extract dependencies into Dependency objects
587fn extract_dependencies(deps: &[(String, String)]) -> Vec<Dependency> {
588    deps.iter()
589        .map(|(name, version_constraint)| Dependency {
590            purl: Some(truncate_field(format!("pkg:opam/{}", name))),
591            extracted_requirement: Some(truncate_field(version_constraint.clone())),
592            scope: Some("dependency".to_string()),
593            is_runtime: Some(true),
594            is_optional: Some(false),
595            is_pinned: Some(false),
596            is_direct: Some(true),
597            resolved_package: None,
598            extra_data: None,
599        })
600        .collect()
601}
602
// Unit tests for the file-matching rule and the individual parsing helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parsers::PackageParser;

    // A file name ending in `.opam` is recognised.
    #[test]
    fn test_is_match_with_opam_extension() {
        let path = Path::new("sample.opam");
        assert!(OpamParser::is_match(path));
    }

    // A file named exactly `opam` is recognised.
    #[test]
    fn test_is_match_with_opam_name() {
        let path = Path::new("opam");
        assert!(OpamParser::is_match(path));
    }

    // Any other file name is rejected.
    #[test]
    fn test_is_match_with_non_opam() {
        let path = Path::new("sample.txt");
        assert!(!OpamParser::is_match(path));
    }

    // The key/value splitter keeps the quotes around the value.
    #[test]
    fn test_parse_key_value() {
        let (key, value) = parse_key_value("name: \"js_of_ocaml\"").unwrap();
        assert_eq!(key, "name");
        assert_eq!(value, "\"js_of_ocaml\"");
    }

    // Quotes are stripped; a value that is empty after stripping becomes `None`.
    #[test]
    fn test_clean_value() {
        assert_eq!(
            clean_value("\"js_of_ocaml\""),
            Some("js_of_ocaml".to_string())
        );
        assert_eq!(clean_value("\"\""), None);
    }

    // A single `{op "version"}` filter is rendered as `op version`.
    #[test]
    fn test_extract_version_constraint() {
        let result = extract_version_constraint(r#"{>= "4.02.0"}"#);
        assert_eq!(result, ">= 4.02.0");
    }

    // A dependency line yields the package name and its rendered constraint.
    #[test]
    fn test_parse_dependency_line() {
        let (name, version) = parse_dependency_line(r#""ocaml" {>= "4.02.0"}"#).unwrap();
        assert_eq!(name, "ocaml");
        assert_eq!(version, ">= 4.02.0");
    }

    // Without a filter the constraint is the empty string.
    #[test]
    fn test_parse_dependency_line_without_version() {
        let (name, version) = parse_dependency_line(r#""uchar""#).unwrap();
        assert_eq!(name, "uchar");
        assert_eq!(version, "");
    }

    // Space-separated quoted strings are split and unquoted.
    #[test]
    fn test_split_quoted_strings() {
        let parts = split_quoted_strings(r#""str1" "str2""#);
        assert_eq!(parts, vec!["str1", "str2"]);
    }

    // Synopsis and description are joined with a newline, synopsis first.
    #[test]
    fn test_build_description() {
        let synopsis = Some("Short description".to_string());
        let description = Some("Long description".to_string());
        let result = build_description(&synopsis, &description);
        assert_eq!(
            result,
            Some("Short description\nLong description".to_string())
        );
    }

    // Authors populate `name`, maintainers populate `email`; authors come first.
    #[test]
    fn test_extract_parties() {
        let authors = vec!["Author One".to_string()];
        let maintainers = vec!["maintainer@example.com".to_string()];
        let parties = extract_parties(&authors, &maintainers);

        assert_eq!(parties.len(), 2);
        assert_eq!(parties[0].name, Some("Author One".to_string()));
        assert_eq!(parties[0].role, Some("author".to_string()));
        assert_eq!(parties[1].email, Some("maintainer@example.com".to_string()));
        assert_eq!(parties[1].role, Some("maintainer".to_string()));
    }

    // The lowercase-`with` OPAM spelling maps to the hard-coded expression pair
    // and produces exactly one detection.
    #[test]
    fn test_normalize_opam_declared_license_preserves_scancode_style_expression() {
        let (declared, declared_spdx, detections) = normalize_opam_declared_license(Some(
            "LGPL-3.0-only with OCaml-LGPL-linking-exception",
        ));

        assert_eq!(
            declared.as_deref(),
            Some("lgpl-3.0 WITH ocaml-lgpl-linking-exception")
        );
        assert_eq!(
            declared_spdx.as_deref(),
            Some("LGPL-3.0-only WITH OCaml-LGPL-linking-exception")
        );
        assert_eq!(detections.len(), 1);
        assert_eq!(
            detections[0].license_expression,
            "lgpl-3.0 WITH ocaml-lgpl-linking-exception"
        );
    }
}
713
// Registers this parser's metadata through the crate-level `register_parser!`
// macro. NOTE(review): the arguments look like (description, path globs,
// package type, primary language, documentation URL) — confirm against the
// macro definition, which is not visible from this file.
crate::register_parser!(
    "OCaml OPAM package manifest",
    &["**/*.opam", "**/opam"],
    "opam",
    "OCaml",
    Some("https://opam.ocaml.org/doc/Manual.html"),
);