Skip to main content

provenant/parsers/
cran.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for CRAN R package DESCRIPTION files.
5//!
6//! Extracts package metadata and dependencies from R package DESCRIPTION files
7//! which use Debian Control File (DCF) format similar to RFC822.
8//!
9//! # Supported Formats
10//! - DESCRIPTION (CRAN R package manifest)
11//!
12//! # Key Features
13//! - Multi-type dependency extraction (Depends, Imports, Suggests, Enhances, LinkingTo)
14//! - Version constraint parsing with operators (>=, <=, >, <, ==)
15//! - Filters out R version requirements (not actual packages)
16//! - Author/Maintainer party extraction with email parsing
17//! - Package URL (purl) generation
18//!
19//! # Implementation Notes
20//! - Uses DCF/RFC822-like format with continuation lines
21//! - Field names are case-sensitive (Package, Version, Description, etc.)
22//! - Dependencies are comma-separated with optional version constraints
23//! - R version requirements (e.g., "R (>= 4.1.0)") are filtered out
24//! - Authors@R field is NOT parsed (requires R interpreter)
25
26use std::collections::HashMap;
27use std::path::Path;
28use std::sync::LazyLock;
29
30use crate::parser_warn as warn;
31use packageurl::PackageUrl;
32use regex::Regex;
33
34use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
35use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
36
37use super::PackageParser;
38
/// CRAN R package DESCRIPTION file parser.
///
/// Extracts package metadata, dependencies, and party information from
/// standard DESCRIPTION files used by R packages in the CRAN ecosystem.
///
/// This is a stateless unit struct: all behaviour is provided through its
/// [`PackageParser`] implementation, whose functions take the file path as
/// their only input.
pub struct CranParser;
44
45impl PackageParser for CranParser {
46    const PACKAGE_TYPE: PackageType = PackageType::Cran;
47
48    fn is_match(path: &Path) -> bool {
49        path.file_name().is_some_and(|name| name == "DESCRIPTION")
50    }
51
52    fn extract_packages(path: &Path) -> Vec<PackageData> {
53        let content = match read_file_to_string(path, None) {
54            Ok(c) => c,
55            Err(e) => {
56                warn!("Failed to read DESCRIPTION at {:?}: {}", path, e);
57                return vec![default_package_data()];
58            }
59        };
60        let fields = parse_dcf(&content);
61
62        let name = fields
63            .get("Package")
64            .map(|s| truncate_field(s.trim().to_string()));
65        let version = fields
66            .get("Version")
67            .map(|s| truncate_field(s.trim().to_string()));
68
69        // Generate PURL
70        let purl = create_package_url(&name, &version);
71
72        // Generate repository URLs
73        let repository_homepage_url = name
74            .as_ref()
75            .map(|n| truncate_field(format!("https://cran.r-project.org/package={}", n)));
76
77        // Build description from Title and Description fields
78        let description = build_description(&fields);
79
80        // Extract license statement
81        let extracted_license_statement = fields
82            .get("License")
83            .map(|s| truncate_field(s.trim().to_string()));
84
85        // Extract URL field
86        let homepage_url = fields
87            .get("URL")
88            .map(|s| truncate_field(s.split(',').next().unwrap_or("").trim().to_string()))
89            .filter(|s| !s.is_empty());
90
91        // Extract parties (Author and Maintainer)
92        let mut parties = Vec::new();
93
94        // Parse Maintainer field
95        if let Some(maintainer_str) = fields.get("Maintainer")
96            && let Some(party) = parse_party(maintainer_str, "maintainer")
97        {
98            parties.push(party);
99        }
100
101        // Parse Author field
102        if let Some(author_str) = fields.get("Author") {
103            for author_part in split_author_entries(author_str) {
104                if let Some(party) = parse_party(author_part, "author") {
105                    parties.push(party);
106                }
107            }
108        }
109
110        // Extract dependencies from all dependency fields
111        let mut dependencies = Vec::new();
112
113        // Process each dependency type
114        for (field_name, scope) in [
115            ("Depends", None),
116            ("Imports", Some("imports")),
117            ("Suggests", Some("suggests")),
118            ("Enhances", Some("enhances")),
119            ("LinkingTo", Some("linkingto")),
120        ] {
121            if let Some(deps_str) = fields.get(field_name) {
122                dependencies.extend(parse_dependencies(deps_str, scope));
123            }
124        }
125
126        vec![PackageData {
127            package_type: Some(Self::PACKAGE_TYPE),
128            namespace: None,
129            name,
130            version,
131            qualifiers: None,
132            subpath: None,
133            primary_language: Some("R".to_string()),
134            description,
135            release_date: None,
136            parties,
137            keywords: Vec::new(),
138            homepage_url,
139            download_url: None,
140            size: None,
141            sha1: None,
142            md5: None,
143            sha256: None,
144            sha512: None,
145            bug_tracking_url: None,
146            code_view_url: None,
147            vcs_url: None,
148            copyright: None,
149            holder: None,
150            declared_license_expression: None,
151            declared_license_expression_spdx: None,
152            license_detections: Vec::new(),
153            other_license_expression: None,
154            other_license_expression_spdx: None,
155            other_license_detections: Vec::new(),
156            extracted_license_statement,
157            notice_text: None,
158            source_packages: Vec::new(),
159            file_references: Vec::new(),
160            is_private: false,
161            is_virtual: false,
162            extra_data: None,
163            dependencies,
164            repository_homepage_url,
165            repository_download_url: None,
166            api_data_url: None,
167            datasource_id: Some(DatasourceId::CranDescription),
168            purl,
169        }]
170    }
171
172    fn metadata() -> Vec<super::metadata::ParserMetadata> {
173        vec![super::metadata::ParserMetadata {
174            description: "CRAN R package DESCRIPTION file",
175            file_patterns: &["**/DESCRIPTION"],
176            package_type: "cran",
177            primary_language: "R",
178            documentation_url: Some(
179                "https://cran.r-project.org/doc/manuals/r-release/R-exts.html#The-DESCRIPTION-file",
180            ),
181        }]
182    }
183}
184
185fn parse_dcf(content: &str) -> HashMap<String, String> {
186    let mut fields: HashMap<String, String> = HashMap::new();
187    let mut current_field: Option<String> = None;
188    let mut current_value = String::new();
189
190    for line in content.lines().take(MAX_ITERATION_COUNT) {
191        // Check if line is a continuation (starts with whitespace)
192        if line.starts_with(' ') || line.starts_with('\t') {
193            if current_field.is_some() {
194                // Append to current value, replacing continuation line indent with space
195                if !current_value.is_empty() {
196                    current_value.push(' ');
197                }
198                current_value.push_str(line.trim_start());
199            }
200        } else if let Some((field_name, field_value)) = line.split_once(':') {
201            // New field: save previous field if any
202            if let Some(field) = current_field.take() {
203                fields.insert(field, truncate_field(current_value.clone()));
204                current_value.clear();
205            }
206
207            // Start new field
208            current_field = Some(field_name.trim().to_string());
209            current_value = field_value.trim_start().to_string();
210        }
211        // Else: empty line or invalid line - ignore
212    }
213
214    // Save the last field
215    if let Some(field) = current_field {
216        fields.insert(field, truncate_field(current_value));
217    }
218
219    fields
220}
221
222/// Parse a comma-separated dependency list with optional version constraints.
223///
224/// Format: "package1 (>= 1.0), package2, package3 (== 2.0)"
225/// Filters out R version requirements like "R (>= 4.1.0)"
226fn parse_dependencies(deps_str: &str, scope: Option<&str>) -> Vec<Dependency> {
227    let mut dependencies = Vec::new();
228
229    for dep in deps_str.split(',').take(MAX_ITERATION_COUNT) {
230        let dep = dep.trim();
231        if dep.is_empty() {
232            continue;
233        }
234
235        let (name, extracted_requirement, is_pinned) = parse_version_constraint(dep);
236
237        // Skip R version requirements (not actual package dependencies)
238        if name == "R" {
239            continue;
240        }
241
242        // Create PURL for dependency
243        let purl = if is_pinned {
244            // For pinned versions, extract version from requirement
245            if let Some(ref req) = extracted_requirement {
246                if let Some(version) = extract_version_from_requirement(req) {
247                    match PackageUrl::new("cran", &name) {
248                        Ok(mut p) => {
249                            if p.with_version(&version).is_ok() {
250                                Some(p.to_string())
251                            } else {
252                                // Failed to set version, create without it
253                                PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
254                            }
255                        }
256                        Err(e) => {
257                            warn!(
258                                "Failed to create PURL for CRAN dependency '{}': {}",
259                                name, e
260                            );
261                            None
262                        }
263                    }
264                } else {
265                    // No version found in requirement
266                    PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
267                }
268            } else {
269                // No requirement
270                PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
271            }
272        } else {
273            // Not pinned, create PURL without version
274            PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
275        };
276
277        dependencies.push(Dependency {
278            purl,
279            extracted_requirement: extracted_requirement.map(truncate_field),
280            scope: scope.map(|s| truncate_field(s.to_string())),
281            is_runtime: Some(scope.is_none() || scope == Some("imports")),
282            is_optional: Some(scope == Some("suggests") || scope == Some("enhances")),
283            is_pinned: Some(is_pinned),
284            is_direct: Some(true),
285            resolved_package: None,
286            extra_data: None,
287        });
288    }
289
290    dependencies
291}
292
/// Matches a whole dependency entry that carries a parenthesised version
/// constraint, e.g. `cli (>= 3.6.2)`. Compiled lazily on first use.
///
/// Capture groups:
/// 1. package name: one or more ASCII letters, digits or dots
/// 2. operator: one or more of the characters `>`, `<`, `=`
/// 3. version: everything up to the closing parenthesis (may include
///    surrounding whitespace)
///
/// Entries without a parenthesised constraint do not match.
static VERSION_CONSTRAINT_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"^([a-zA-Z0-9.]+)\s*\(([><=]+)\s*([^)]+)\)\s*$").expect("valid regex")
});
296
297/// Examples:
298/// - "cli (>= 3.6.2)" -> ("cli", Some(">= 3.6.2"), true)
299/// - "generics" -> ("generics", None, false)
300/// - "glue (== 1.3.2)" -> ("glue", Some("== 1.3.2"), true)
301fn parse_version_constraint(dep: &str) -> (String, Option<String>, bool) {
302    if let Some(captures) = VERSION_CONSTRAINT_RE.captures(dep) {
303        let name = match captures.get(1) {
304            Some(m) => truncate_field(m.as_str().to_string()),
305            None => return (truncate_field(dep.trim().to_string()), None, false),
306        };
307        let operator = match captures.get(2) {
308            Some(m) => m.as_str(),
309            None => return (name, None, false),
310        };
311        let version = match captures.get(3) {
312            Some(m) => m.as_str(),
313            None => return (name, None, false),
314        };
315        let requirement = truncate_field(format!("{} {}", operator, version));
316        let is_pinned = operator == "==";
317
318        (name, Some(requirement), is_pinned)
319    } else {
320        (truncate_field(dep.trim().to_string()), None, false)
321    }
322}
323
324/// Extract version number from a requirement string like ">= 3.6.2" or "== 1.0.0".
325fn extract_version_from_requirement(requirement: &str) -> Option<String> {
326    requirement
327        .split_whitespace()
328        .nth(1)
329        .map(|s| truncate_field(s.to_string()))
330}
331
332/// Build description from Title and Description fields.
333fn build_description(fields: &HashMap<String, String>) -> Option<String> {
334    let title = fields.get("Title").map(|s| s.trim());
335    let desc = fields.get("Description").map(|s| s.trim());
336
337    match (title, desc) {
338        (Some(t), Some(d)) if !t.is_empty() && !d.is_empty() => {
339            Some(truncate_field(format!("{}\n{}", t, d)))
340        }
341        (Some(t), _) if !t.is_empty() => Some(truncate_field(t.to_string())),
342        (_, Some(d)) if !d.is_empty() => Some(truncate_field(d.to_string())),
343        _ => None,
344    }
345}
346
347fn split_author_entries(author_str: &str) -> Vec<&str> {
348    let mut entries = Vec::new();
349    let mut start = 0;
350    let mut bracket_depth: usize = 0;
351    let mut paren_depth: usize = 0;
352
353    for (idx, ch) in author_str.char_indices().take(MAX_ITERATION_COUNT) {
354        match ch {
355            '[' => bracket_depth += 1,
356            ']' => bracket_depth = bracket_depth.saturating_sub(1),
357            '(' => paren_depth += 1,
358            ')' => paren_depth = paren_depth.saturating_sub(1),
359            ',' if bracket_depth == 0 && paren_depth == 0 => {
360                let entry = author_str[start..idx].trim();
361                if !entry.is_empty() {
362                    entries.push(entry);
363                }
364                start = idx + 1;
365            }
366            _ => {}
367        }
368    }
369
370    let final_entry = author_str[start..].trim();
371    if !final_entry.is_empty() {
372        entries.push(final_entry);
373    }
374
375    entries
376}
377
378/// Parse party information from Author or Maintainer field.
379///
380/// Formats supported:
381/// - "Name <email@domain.com>"
382/// - "Name"
383/// - "email@domain.com"
384fn parse_party(info: &str, role: &str) -> Option<Party> {
385    let info = info.trim();
386    if info.is_empty() {
387        return None;
388    }
389
390    // Check for "Name <email>" format
391    if info.contains('<') && info.contains('>') {
392        let parts: Vec<&str> = info.split('<').collect();
393        if parts.len() == 2 {
394            let name = parts[0].trim().to_string();
395            let email = parts[1].trim_end_matches('>').trim().to_string();
396
397            if !email.contains('@') {
398                return Some(Party {
399                    r#type: Some(truncate_field("person".to_string())),
400                    role: Some(truncate_field(role.to_string())),
401                    name: Some(truncate_field(info.to_string())),
402                    email: None,
403                    url: None,
404                    organization: None,
405                    organization_url: None,
406                    timezone: None,
407                });
408            }
409
410            return Some(Party {
411                r#type: Some(truncate_field("person".to_string())),
412                role: Some(truncate_field(role.to_string())),
413                name: if name.is_empty() {
414                    None
415                } else {
416                    Some(truncate_field(name))
417                },
418                email: if email.is_empty() {
419                    None
420                } else {
421                    Some(truncate_field(email))
422                },
423                url: None,
424                organization: None,
425                organization_url: None,
426                timezone: None,
427            });
428        }
429    }
430
431    // Just a name or email
432    Some(Party {
433        r#type: Some(truncate_field("person".to_string())),
434        role: Some(truncate_field(role.to_string())),
435        name: Some(truncate_field(info.to_string())),
436        email: None,
437        url: None,
438        organization: None,
439        organization_url: None,
440        timezone: None,
441    })
442}
443
444/// Create a package URL for a CRAN package.
445fn create_package_url(name: &Option<String>, version: &Option<String>) -> Option<String> {
446    name.as_ref().and_then(|name| {
447        let mut package_url = match PackageUrl::new("cran", name) {
448            Ok(p) => p,
449            Err(e) => {
450                warn!(
451                    "Failed to create PackageUrl for CRAN package '{}': {}",
452                    name, e
453                );
454                return None;
455            }
456        };
457
458        if let Some(v) = version
459            && let Err(e) = package_url.with_version(v)
460        {
461            warn!(
462                "Failed to set version '{}' for CRAN package '{}': {}",
463                v, name, e
464            );
465            return None;
466        }
467
468        Some(package_url.to_string())
469    })
470}
471
/// Fallback package record used when the DESCRIPTION file cannot be read.
///
/// Only the package type, primary language (`"R"`) and datasource id are
/// set; every other field takes its `Default` value.
fn default_package_data() -> PackageData {
    PackageData {
        package_type: Some(CranParser::PACKAGE_TYPE),
        primary_language: Some("R".to_string()),
        datasource_id: Some(DatasourceId::CranDescription),
        ..Default::default()
    }
}
479}