Skip to main content

provenant/parsers/
opam.rs

1//! Parser for OCaml OPAM package manager manifests.
2//!
3//! Extracts package metadata and dependencies from OPAM files used by the
4//! OCaml ecosystem.
5//!
6//! # Supported Formats
7//! - *.opam files (OPAM package manifests)
8//! - opam files without extension
9//!
10//! # Key Features
11//! - Field-based parsing of OPAM's custom format (key: value)
12//! - Author and maintainer extraction with email parsing
13//! - URL extraction for source archives, homepage, repository
14//! - License statement extraction
15//! - Checksum extraction (sha1, md5, sha256, sha512)
16//!
17//! # Implementation Notes
18//! - OPAM format uses custom syntax, not JSON/YAML/TOML
19//! - Strings can be quoted or unquoted
20//! - Lists use bracket notation: [item1 item2]
21//! - Multi-line strings use three-quote notation: """..."""
22
23use std::path::Path;
24
25use log::warn;
26use regex::Regex;
27
28use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
29use crate::parsers::PackageParser;
30
/// Parser for OCaml OPAM package manifest files.
///
/// Handles the OPAM file format used by the OCaml package manager.
/// Reference: <https://opam.ocaml.org/doc/Manual.html#Common-file-format>
///
/// This is a stateless unit struct; its behaviour is provided by the
/// `PackageParser` implementation in this file.
pub struct OpamParser;
36
37impl PackageParser for OpamParser {
38    const PACKAGE_TYPE: PackageType = PackageType::Opam;
39
40    fn is_match(path: &Path) -> bool {
41        path.file_name().is_some_and(|name| {
42            name.to_string_lossy().ends_with(".opam") || name.to_string_lossy() == "opam"
43        })
44    }
45
46    fn extract_packages(path: &Path) -> Vec<PackageData> {
47        vec![match std::fs::read_to_string(path) {
48            Ok(text) => parse_opam(&text),
49            Err(e) => {
50                warn!("Failed to read OPAM file {:?}: {}", path, e);
51                default_package_data()
52            }
53        }]
54    }
55}
56
/// Parsed OPAM file data
///
/// Intermediate result produced by `parse_opam_data`; `parse_opam` maps these
/// fields onto a `PackageData`. All fields start out as `None` / empty through
/// the derived `Default`.
#[derive(Debug, Default)]
struct OpamData {
    /// Cleaned value of the `name` field.
    name: Option<String>,
    /// Cleaned value of the `version` field.
    version: Option<String>,
    /// Cleaned value of the `synopsis` field.
    synopsis: Option<String>,
    /// Text collected for the `description` field.
    description: Option<String>,
    /// Cleaned value of the `homepage` field.
    homepage: Option<String>,
    /// Cleaned value of the `dev-repo` field.
    dev_repo: Option<String>,
    /// Cleaned value of the `bug-reports` field.
    bug_reports: Option<String>,
    /// Cleaned value of the `src` field (taken from the following line when
    /// the key line carries no value).
    src: Option<String>,
    /// Entries of the `authors` field.
    authors: Vec<String>,
    /// Entries of the `maintainer` field.
    maintainers: Vec<String>,
    /// Cleaned value of the `license` field.
    license: Option<String>,
    /// `sha1` entry of the `checksum` field, if any.
    sha1: Option<String>,
    /// `md5` entry of the `checksum` field, if any.
    md5: Option<String>,
    /// `sha256` entry of the `checksum` field, if any.
    sha256: Option<String>,
    /// `sha512` entry of the `checksum` field, if any.
    sha512: Option<String>,
    /// Entries of the `depends` field.
    dependencies: Vec<(String, String)>, // (name, version_constraint)
}
77
78fn default_package_data() -> PackageData {
79    PackageData {
80        package_type: Some(OpamParser::PACKAGE_TYPE),
81        primary_language: Some("Ocaml".to_string()),
82        datasource_id: Some(DatasourceId::OpamFile),
83        ..Default::default()
84    }
85}
86
87/// Parse an OPAM file from text content
88fn parse_opam(text: &str) -> PackageData {
89    let opam_data = parse_opam_data(text);
90
91    let description = build_description(&opam_data.synopsis, &opam_data.description);
92    let parties = extract_parties(&opam_data.authors, &opam_data.maintainers);
93    let dependencies = extract_dependencies(&opam_data.dependencies);
94
95    let (repository_homepage_url, api_data_url, purl) =
96        build_opam_urls(&opam_data.name, &opam_data.version);
97
98    PackageData {
99        package_type: Some(OpamParser::PACKAGE_TYPE),
100        namespace: None,
101        name: opam_data.name,
102        version: opam_data.version,
103        qualifiers: None,
104        subpath: None,
105        primary_language: Some("Ocaml".to_string()),
106        description,
107        release_date: None,
108        parties,
109        keywords: Vec::new(),
110        homepage_url: opam_data.homepage,
111        download_url: opam_data.src,
112        size: None,
113        sha1: opam_data.sha1,
114        md5: opam_data.md5,
115        sha256: opam_data.sha256,
116        sha512: opam_data.sha512,
117        bug_tracking_url: opam_data.bug_reports,
118        code_view_url: None,
119        vcs_url: opam_data.dev_repo,
120        copyright: None,
121        holder: None,
122        declared_license_expression: None,
123        declared_license_expression_spdx: None,
124        license_detections: Vec::new(),
125        other_license_expression: None,
126        other_license_expression_spdx: None,
127        other_license_detections: Vec::new(),
128        extracted_license_statement: opam_data.license,
129        notice_text: None,
130        source_packages: Vec::new(),
131        file_references: Vec::new(),
132        is_private: false,
133        is_virtual: false,
134        extra_data: None,
135        dependencies,
136        repository_homepage_url,
137        repository_download_url: None,
138        api_data_url,
139        datasource_id: Some(DatasourceId::OpamFile),
140        purl,
141    }
142}
143
/// Derives `(repository_homepage_url, api_data_url, purl)` for a package.
///
/// - The homepage URL is present whenever a name is present.
/// - The API data URL needs both a name and a version.
/// - The purl needs a name; the version is appended as `@version` when known.
///
/// NOTE(review): the homepage URL is emitted as the literal placeholder text
/// `{https://opam.ocaml.org/packages}/{name}` — the package name is never
/// interpolated. This looks deliberate (possibly kept for output parity with
/// another tool); confirm before changing it.
fn build_opam_urls(
    name: &Option<String>,
    version: &Option<String>,
) -> (Option<String>, Option<String>, Option<String>) {
    let name = name.as_deref();
    let version = version.as_deref();

    let repository_homepage_url =
        name.map(|_| String::from("{https://opam.ocaml.org/packages}/{name}"));

    let api_data_url = name.zip(version).map(|(n, v)| {
        format!(
            "https://github.com/ocaml/opam-repository/blob/master/packages/{}/{}.{}/opam",
            n, n, v
        )
    });

    let purl = name.map(|n| match version {
        Some(v) => format!("pkg:opam/{}@{}", n, v),
        None => format!("pkg:opam/{}", n),
    });

    (repository_homepage_url, api_data_url, purl)
}
168
169/// Parse OPAM file text into structured data
170fn parse_opam_data(text: &str) -> OpamData {
171    let mut data = OpamData::default();
172    let lines: Vec<&str> = text.lines().collect();
173    let mut i = 0;
174
175    while i < lines.len() {
176        let line = lines[i];
177
178        // Parse key: value format
179        if let Some((key, value)) = parse_key_value(line) {
180            match key.as_str() {
181                "name" => data.name = clean_value(&value),
182                "version" => data.version = clean_value(&value),
183                "synopsis" => data.synopsis = clean_value(&value),
184                "description" => {
185                    data.description = parse_multiline_string(&lines, &mut i);
186                }
187                "homepage" => data.homepage = clean_value(&value),
188                "dev-repo" => data.dev_repo = clean_value(&value),
189                "bug-reports" => data.bug_reports = clean_value(&value),
190                "src" => {
191                    if value.trim().is_empty() && i + 1 < lines.len() {
192                        i += 1;
193                        data.src = clean_value(lines[i]);
194                    } else {
195                        data.src = clean_value(&value);
196                    }
197                }
198                "license" => data.license = clean_value(&value),
199                "authors" => {
200                    data.authors = parse_string_array(&lines, &mut i, &value);
201                }
202                "maintainer" => {
203                    data.maintainers = parse_string_array(&lines, &mut i, &value);
204                }
205                "depends" => {
206                    data.dependencies = parse_dependency_array(&lines, &mut i);
207                }
208                "checksum" => {
209                    parse_checksums(&lines, &mut i, &mut data);
210                }
211                _ => {}
212            }
213        }
214
215        i += 1;
216    }
217
218    data
219}
220
/// Splits a `key: value` line at its first colon.
///
/// Returns `None` for blank lines, lines starting with `#`, and lines without
/// a colon. Both halves are returned trimmed; the value keeps its quotes.
fn parse_key_value(line: &str) -> Option<(String, String)> {
    let trimmed = line.trim();
    if trimmed.is_empty() || trimmed.starts_with('#') {
        return None;
    }

    trimmed
        .split_once(':')
        .map(|(key, value)| (key.trim().to_string(), value.trim().to_string()))
}
236
/// Strips surrounding whitespace, double quotes and square brackets from a value.
///
/// Returns `None` when nothing is left after stripping.
///
/// The strip order is: whitespace, quotes, `[`, `]`, whitespace. Because quotes
/// are removed before brackets, a bracketed quoted value such as `["x"]` keeps
/// its inner quotes.
fn clean_value(value: &str) -> Option<String> {
    let unquoted = value.trim().trim_matches('"');
    let unbracketed = unquoted.trim_matches('[').trim_matches(']');
    let cleaned = unbracketed.trim();

    (!cleaned.is_empty()).then(|| cleaned.to_string())
}
252
/// Parse a string value that starts on `lines[*i]` and may span several lines.
///
/// The text after the first `:` on the key line is the start of the value:
///
/// - If the string is opened *and* closed on the key line (`"..."` or
///   `"""..."""`), only that line is used and `*i` is left untouched, so the
///   fields that follow are not swallowed.
/// - Otherwise following lines are appended (separated by a single space)
///   up to and including the line holding the closing delimiter: `"""` for a
///   triple-quoted string (also used when the key line carries no quoted
///   value), or an unescaped `"` for a string opened with a single quote.
///   `*i` then points at the last consumed line.
///
/// Surrounding quotes and whitespace are stripped from every line. Returns
/// `None` when the collected text is empty. An out-of-range `*i` yields `None`
/// instead of panicking.
fn parse_multiline_string(lines: &[&str], i: &mut usize) -> Option<String> {
    const TRIPLE_QUOTE: &str = "\"\"\"";

    /// True when `text` contains a `"` that is not preceded by a backslash escape.
    fn has_unescaped_quote(text: &str) -> bool {
        let mut escaped = false;
        for ch in text.chars() {
            match ch {
                '\\' if !escaped => escaped = true,
                '"' if !escaped => return true,
                _ => escaped = false,
            }
        }
        false
    }

    /// Trimmed copy of `text`, or `None` when nothing is left.
    fn non_empty(text: &str) -> Option<String> {
        let text = text.trim();
        (!text.is_empty()).then(|| text.to_string())
    }

    // Value part of the key line (same split as the `key: value` parser).
    let first_value = lines
        .get(*i)
        .map(|line| line.trim())
        .filter(|line| !line.starts_with('#'))
        .and_then(|line| line.split_once(':'))
        .map_or("", |(_, value)| value.trim());

    let triple_quoted = first_value.starts_with(TRIPLE_QUOTE);
    let single_quoted = !triple_quoted && first_value.starts_with('"');

    let closed_on_key_line = match first_value.strip_prefix(TRIPLE_QUOTE) {
        Some(rest) => rest.contains(TRIPLE_QUOTE),
        None => first_value
            .strip_prefix('"')
            .is_some_and(has_unescaped_quote),
    };

    let mut result = String::from(first_value.trim_matches('"').trim());
    if closed_on_key_line {
        // The whole string lives on the key line; do not touch later lines.
        return non_empty(&result);
    }

    *i += 1;
    while let Some(&line) = lines.get(*i) {
        result.push(' ');
        // Trim whitespace first so quotes on indented lines are stripped too.
        result.push_str(line.trim().trim_matches('"').trim());

        let closes = if single_quoted {
            has_unescaped_quote(line)
        } else {
            line.contains(TRIPLE_QUOTE)
        };
        if closes {
            break;
        }
        *i += 1;
    }

    non_empty(&result)
}
281
282/// Parse a string array (single-line or multiline)
283fn parse_string_array(lines: &[&str], i: &mut usize, first_value: &str) -> Vec<String> {
284    let mut result = Vec::new();
285
286    let mut content = first_value.to_string();
287
288    // If it's a multiline array (starts with [ but no matching ])
289    if content.contains('[') && !content.contains(']') {
290        *i += 1;
291        while *i < lines.len() {
292            let line = lines[*i];
293            content.push(' ');
294            content.push_str(line);
295
296            if line.contains(']') {
297                break;
298            }
299            *i += 1;
300        }
301    }
302
303    // Parse the content
304    let cleaned = content.trim_matches('[').trim_matches(']').trim();
305
306    // Split by quote-delimited strings
307    for part in split_quoted_strings(cleaned) {
308        let p = part.trim_matches('"').trim();
309        if !p.is_empty() {
310            result.push(p.to_string());
311        }
312    }
313
314    result
315}
316
317/// Parse dependency array
318fn parse_dependency_array(lines: &[&str], i: &mut usize) -> Vec<(String, String)> {
319    let mut result = Vec::new();
320
321    *i += 1;
322    while *i < lines.len() {
323        let line = lines[*i];
324
325        if line.trim().contains(']') {
326            break;
327        }
328
329        if let Some((name, version)) = parse_dependency_line(line) {
330            result.push((name, version));
331        }
332
333        *i += 1;
334    }
335
336    result
337}
338
339/// Parse a single dependency line: "name" {version_constraint}
340fn parse_dependency_line(line: &str) -> Option<(String, String)> {
341    let line = line.trim();
342    if line.is_empty() {
343        return None;
344    }
345
346    // Match: "name" {optional version}
347    let regex = Regex::new(r#""([^"]+)"\s*(.*)$"#).ok()?;
348    let caps = regex.captures(line)?;
349
350    let name = caps.get(1)?.as_str().to_string();
351    let version_part = caps.get(2)?.as_str().trim();
352
353    // Extract the operator and version constraint
354    let constraint = if version_part.is_empty() {
355        String::new()
356    } else {
357        extract_version_constraint(version_part)
358    };
359
360    Some((name, constraint))
361}
362
363/// Extract version constraint from {>= "1.0"} format
364fn extract_version_constraint(version_part: &str) -> String {
365    let regex = Regex::new(r#"\{\s*([<>=!]+)\s*"([^"]*)"\s*\}"#);
366    if let Ok(re) = regex
367        && let Some(caps) = re.captures(version_part)
368    {
369        let op = caps.get(1).map(|m| m.as_str()).unwrap_or("");
370        let ver = caps.get(2).map(|m| m.as_str()).unwrap_or("");
371        if !op.is_empty() && !ver.is_empty() {
372            return format!("{} {}", op, ver);
373        }
374    }
375
376    // If regex parsing fails, try to extract raw content
377    let content = version_part
378        .trim_matches('{')
379        .trim_matches('}')
380        .trim_matches('"')
381        .trim();
382
383    content.replace('"', "")
384}
385
386/// Parse checksums from checksum array
387fn parse_checksums(lines: &[&str], i: &mut usize, data: &mut OpamData) {
388    if let Some((_, first_value)) = parse_key_value(lines[*i]) {
389        let inline = first_value.trim();
390        if !inline.is_empty() && inline != "[" {
391            if let Some((key, value)) = parse_checksum_line(inline) {
392                match key.as_str() {
393                    "sha1" => data.sha1 = Some(value),
394                    "md5" => data.md5 = Some(value),
395                    "sha256" => data.sha256 = Some(value),
396                    "sha512" => data.sha512 = Some(value),
397                    _ => {}
398                }
399            }
400            return;
401        }
402    }
403
404    *i += 1;
405    while *i < lines.len() {
406        let line = lines[*i];
407
408        if line.trim().contains(']') {
409            break;
410        }
411
412        if let Some((key, value)) = parse_checksum_line(line) {
413            match key.as_str() {
414                "sha1" => data.sha1 = Some(value),
415                "md5" => data.md5 = Some(value),
416                "sha256" => data.sha256 = Some(value),
417                "sha512" => data.sha512 = Some(value),
418                _ => {}
419            }
420        }
421
422        *i += 1;
423    }
424}
425
/// Parse a single checksum line: algo=hash
///
/// Surrounding whitespace and double quotes are ignored, as is whitespace
/// around the `=`. Returns `(algo, hash)`, or `None` when the line has no `=`,
/// the algorithm name is empty or contains non-word characters, or the hash is
/// empty.
fn parse_checksum_line(line: &str) -> Option<(String, String)> {
    let line = line.trim().trim_matches('"').trim();

    // A plain split is enough here; no regex has to be compiled per call.
    let (key, value) = line.split_once('=')?;
    let key = key.trim_end();
    let value = value.trim_start();

    let key_is_word = !key.is_empty() && key.chars().all(|c| c.is_alphanumeric() || c == '_');
    if !key_is_word || value.is_empty() {
        return None;
    }

    Some((key.to_string(), value.to_string()))
}
438
/// Split quoted strings like: "str1" "str2" "str3"
///
/// Items are separated by any whitespace (spaces, tabs, ...) found outside
/// double quotes; whitespace inside quotes is kept. The quote characters
/// themselves are dropped, and empty items are not emitted.
fn split_quoted_strings(content: &str) -> Vec<String> {
    let mut result = Vec::new();
    let mut current = String::new();
    let mut in_quotes = false;

    for ch in content.chars() {
        match ch {
            // Quotes only toggle state; they never become part of an item.
            '"' => in_quotes = !in_quotes,
            separator if separator.is_whitespace() && !in_quotes => {
                if !current.is_empty() {
                    result.push(std::mem::take(&mut current));
                }
            }
            _ => current.push(ch),
        }
    }

    if !current.is_empty() {
        result.push(current);
    }

    result
}
464
/// Combines the synopsis and the long description into one text.
///
/// When both are present they are joined with a newline, synopsis first.
/// When only one is present it is returned as is; `None` when both are absent.
fn build_description(synopsis: &Option<String>, description: &Option<String>) -> Option<String> {
    match (synopsis.as_deref(), description.as_deref()) {
        (None, None) => None,
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (Some(short), Some(long)) => Some(format!("{short}\n{long}")),
    }
}
479
480/// Extract parties from authors and maintainers
481fn extract_parties(authors: &[String], maintainers: &[String]) -> Vec<Party> {
482    let mut parties = Vec::new();
483
484    // Add authors
485    for author in authors {
486        parties.push(Party {
487            r#type: Some("person".to_string()),
488            role: Some("author".to_string()),
489            name: Some(author.clone()),
490            email: None,
491            url: None,
492            organization: None,
493            organization_url: None,
494            timezone: None,
495        });
496    }
497
498    // Add maintainers (as email)
499    for maintainer in maintainers {
500        parties.push(Party {
501            r#type: Some("person".to_string()),
502            role: Some("maintainer".to_string()),
503            name: None,
504            email: Some(maintainer.clone()),
505            url: None,
506            organization: None,
507            organization_url: None,
508            timezone: None,
509        });
510    }
511
512    parties
513}
514
515/// Extract dependencies into Dependency objects
516fn extract_dependencies(deps: &[(String, String)]) -> Vec<Dependency> {
517    deps.iter()
518        .map(|(name, version_constraint)| Dependency {
519            purl: Some(format!("pkg:opam/{}", name)),
520            extracted_requirement: Some(version_constraint.clone()),
521            scope: Some("dependency".to_string()),
522            is_runtime: Some(true),
523            is_optional: Some(false),
524            is_pinned: Some(false),
525            is_direct: Some(true),
526            resolved_package: None,
527            extra_data: None,
528        })
529        .collect()
530}
531
// Unit tests for path matching and the individual parsing helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parsers::PackageParser;

    // --- `is_match`: file-name based detection ---

    #[test]
    fn test_is_match_with_opam_extension() {
        let path = Path::new("sample.opam");
        assert!(OpamParser::is_match(path));
    }

    #[test]
    fn test_is_match_with_opam_name() {
        let path = Path::new("opam");
        assert!(OpamParser::is_match(path));
    }

    #[test]
    fn test_is_match_with_non_opam() {
        let path = Path::new("sample.txt");
        assert!(!OpamParser::is_match(path));
    }

    // --- Field-level helpers ---

    // The value keeps its quotes; stripping them is `clean_value`'s job.
    #[test]
    fn test_parse_key_value() {
        let (key, value) = parse_key_value("name: \"js_of_ocaml\"").unwrap();
        assert_eq!(key, "name");
        assert_eq!(value, "\"js_of_ocaml\"");
    }

    // An empty quoted string cleans down to `None`.
    #[test]
    fn test_clean_value() {
        assert_eq!(
            clean_value("\"js_of_ocaml\""),
            Some("js_of_ocaml".to_string())
        );
        assert_eq!(clean_value("\"\""), None);
    }

    // --- Dependency parsing ---

    #[test]
    fn test_extract_version_constraint() {
        let result = extract_version_constraint(r#"{>= "4.02.0"}"#);
        assert_eq!(result, ">= 4.02.0");
    }

    #[test]
    fn test_parse_dependency_line() {
        let (name, version) = parse_dependency_line(r#""ocaml" {>= "4.02.0"}"#).unwrap();
        assert_eq!(name, "ocaml");
        assert_eq!(version, ">= 4.02.0");
    }

    // A dependency without a constraint yields an empty constraint string.
    #[test]
    fn test_parse_dependency_line_without_version() {
        let (name, version) = parse_dependency_line(r#""uchar""#).unwrap();
        assert_eq!(name, "uchar");
        assert_eq!(version, "");
    }

    // --- List and description helpers ---

    #[test]
    fn test_split_quoted_strings() {
        let parts = split_quoted_strings(r#""str1" "str2""#);
        assert_eq!(parts, vec!["str1", "str2"]);
    }

    // Synopsis and description are joined with a newline, synopsis first.
    #[test]
    fn test_build_description() {
        let synopsis = Some("Short description".to_string());
        let description = Some("Long description".to_string());
        let result = build_description(&synopsis, &description);
        assert_eq!(
            result,
            Some("Short description\nLong description".to_string())
        );
    }

    // Authors populate `name`; maintainers populate `email`.
    #[test]
    fn test_extract_parties() {
        let authors = vec!["Author One".to_string()];
        let maintainers = vec!["maintainer@example.com".to_string()];
        let parties = extract_parties(&authors, &maintainers);

        assert_eq!(parties.len(), 2);
        assert_eq!(parties[0].name, Some("Author One".to_string()));
        assert_eq!(parties[0].role, Some("author".to_string()));
        assert_eq!(parties[1].email, Some("maintainer@example.com".to_string()));
        assert_eq!(parties[1].role, Some("maintainer".to_string()));
    }
}
621
// Registers this parser through the crate-level `register_parser!` macro.
// NOTE(review): the macro is defined elsewhere; the arguments look like
// (description, path glob patterns, package type, primary language,
// documentation URL) — confirm against the macro definition.
crate::register_parser!(
    "OCaml OPAM package manifest",
    &["**/*.opam", "**/opam"],
    "opam",
    "OCaml",
    Some("https://opam.ocaml.org/doc/Manual.html"),
);