Skip to main content

provenant/parsers/
cran.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for CRAN R package DESCRIPTION files.
5//!
6//! Extracts package metadata and dependencies from R package DESCRIPTION files
7//! which use Debian Control File (DCF) format similar to RFC822.
8//!
9//! # Supported Formats
10//! - DESCRIPTION (CRAN R package manifest)
11//!
12//! # Key Features
13//! - Multi-type dependency extraction (Depends, Imports, Suggests, Enhances, LinkingTo)
14//! - Version constraint parsing with operators (>=, <=, >, <, ==)
15//! - Filters out R version requirements (not actual packages)
16//! - Author/Maintainer party extraction with email parsing
17//! - Package URL (purl) generation
18//!
19//! # Implementation Notes
20//! - Uses DCF/RFC822-like format with continuation lines
21//! - Field names are case-sensitive (Package, Version, Description, etc.)
22//! - Dependencies are comma-separated with optional version constraints
23//! - R version requirements (e.g., "R (>= 4.1.0)") are filtered out
24//! - Authors@R field is NOT parsed (requires R interpreter)
25
26use std::collections::HashMap;
27use std::path::Path;
28use std::sync::LazyLock;
29
30use crate::parser_warn as warn;
31use packageurl::PackageUrl;
32use regex::Regex;
33
34use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
35use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
36
37use super::PackageParser;
38
/// CRAN R package DESCRIPTION file parser.
///
/// Extracts package metadata, dependencies, and party information from
/// standard DESCRIPTION files used by R packages in the CRAN ecosystem.
///
/// This is a stateless unit struct: all behavior is provided through its
/// `PackageParser` implementation.
pub struct CranParser;
44
45impl PackageParser for CranParser {
46    const PACKAGE_TYPE: PackageType = PackageType::Cran;
47
48    fn is_match(path: &Path) -> bool {
49        path.file_name().is_some_and(|name| name == "DESCRIPTION")
50    }
51
52    fn extract_packages(path: &Path) -> Vec<PackageData> {
53        let content = match read_file_to_string(path, None) {
54            Ok(c) => c,
55            Err(e) => {
56                warn!("Failed to read DESCRIPTION at {:?}: {}", path, e);
57                return vec![default_package_data()];
58            }
59        };
60        let fields = parse_dcf(&content);
61
62        let name = fields
63            .get("Package")
64            .map(|s| truncate_field(s.trim().to_string()));
65        let version = fields
66            .get("Version")
67            .map(|s| truncate_field(s.trim().to_string()));
68
69        // Generate PURL
70        let purl = create_package_url(&name, &version);
71
72        // Generate repository URLs
73        let repository_homepage_url = name
74            .as_ref()
75            .map(|n| truncate_field(format!("https://cran.r-project.org/package={}", n)));
76
77        // Build description from Title and Description fields
78        let description = build_description(&fields);
79
80        // Extract license statement
81        let extracted_license_statement = fields
82            .get("License")
83            .map(|s| truncate_field(s.trim().to_string()));
84
85        // Extract URL field
86        let homepage_url = fields
87            .get("URL")
88            .map(|s| truncate_field(s.split(',').next().unwrap_or("").trim().to_string()))
89            .filter(|s| !s.is_empty());
90
91        // Extract parties (Author and Maintainer)
92        let mut parties = Vec::new();
93
94        // Parse Maintainer field
95        if let Some(maintainer_str) = fields.get("Maintainer")
96            && let Some(party) = parse_party(maintainer_str, "maintainer")
97        {
98            parties.push(party);
99        }
100
101        // Parse Author field
102        if let Some(author_str) = fields.get("Author") {
103            for author_part in split_author_entries(author_str) {
104                if let Some(party) = parse_party(author_part, "author") {
105                    parties.push(party);
106                }
107            }
108        }
109
110        // Extract dependencies from all dependency fields
111        let mut dependencies = Vec::new();
112
113        // Process each dependency type
114        for (field_name, scope) in [
115            ("Depends", None),
116            ("Imports", Some("imports")),
117            ("Suggests", Some("suggests")),
118            ("Enhances", Some("enhances")),
119            ("LinkingTo", Some("linkingto")),
120        ] {
121            if let Some(deps_str) = fields.get(field_name) {
122                dependencies.extend(parse_dependencies(deps_str, scope));
123            }
124        }
125
126        vec![PackageData {
127            package_type: Some(Self::PACKAGE_TYPE),
128            namespace: None,
129            name,
130            version,
131            qualifiers: None,
132            subpath: None,
133            primary_language: Some("R".to_string()),
134            description,
135            release_date: None,
136            parties,
137            keywords: Vec::new(),
138            homepage_url,
139            download_url: None,
140            size: None,
141            sha1: None,
142            md5: None,
143            sha256: None,
144            sha512: None,
145            bug_tracking_url: None,
146            code_view_url: None,
147            vcs_url: None,
148            copyright: None,
149            holder: None,
150            declared_license_expression: None,
151            declared_license_expression_spdx: None,
152            license_detections: Vec::new(),
153            other_license_expression: None,
154            other_license_expression_spdx: None,
155            other_license_detections: Vec::new(),
156            extracted_license_statement,
157            notice_text: None,
158            source_packages: Vec::new(),
159            file_references: Vec::new(),
160            is_private: false,
161            is_virtual: false,
162            extra_data: None,
163            dependencies,
164            repository_homepage_url,
165            repository_download_url: None,
166            api_data_url: None,
167            datasource_id: Some(DatasourceId::CranDescription),
168            purl,
169        }]
170    }
171}
172
173fn parse_dcf(content: &str) -> HashMap<String, String> {
174    let mut fields: HashMap<String, String> = HashMap::new();
175    let mut current_field: Option<String> = None;
176    let mut current_value = String::new();
177
178    for line in content.lines().take(MAX_ITERATION_COUNT) {
179        // Check if line is a continuation (starts with whitespace)
180        if line.starts_with(' ') || line.starts_with('\t') {
181            if current_field.is_some() {
182                // Append to current value, replacing continuation line indent with space
183                if !current_value.is_empty() {
184                    current_value.push(' ');
185                }
186                current_value.push_str(line.trim_start());
187            }
188        } else if let Some((field_name, field_value)) = line.split_once(':') {
189            // New field: save previous field if any
190            if let Some(field) = current_field.take() {
191                fields.insert(field, truncate_field(current_value.clone()));
192                current_value.clear();
193            }
194
195            // Start new field
196            current_field = Some(field_name.trim().to_string());
197            current_value = field_value.trim_start().to_string();
198        }
199        // Else: empty line or invalid line - ignore
200    }
201
202    // Save the last field
203    if let Some(field) = current_field {
204        fields.insert(field, truncate_field(current_value));
205    }
206
207    fields
208}
209
210/// Parse a comma-separated dependency list with optional version constraints.
211///
212/// Format: "package1 (>= 1.0), package2, package3 (== 2.0)"
213/// Filters out R version requirements like "R (>= 4.1.0)"
214fn parse_dependencies(deps_str: &str, scope: Option<&str>) -> Vec<Dependency> {
215    let mut dependencies = Vec::new();
216
217    for dep in deps_str.split(',').take(MAX_ITERATION_COUNT) {
218        let dep = dep.trim();
219        if dep.is_empty() {
220            continue;
221        }
222
223        let (name, extracted_requirement, is_pinned) = parse_version_constraint(dep);
224
225        // Skip R version requirements (not actual package dependencies)
226        if name == "R" {
227            continue;
228        }
229
230        // Create PURL for dependency
231        let purl = if is_pinned {
232            // For pinned versions, extract version from requirement
233            if let Some(ref req) = extracted_requirement {
234                if let Some(version) = extract_version_from_requirement(req) {
235                    match PackageUrl::new("cran", &name) {
236                        Ok(mut p) => {
237                            if p.with_version(&version).is_ok() {
238                                Some(p.to_string())
239                            } else {
240                                // Failed to set version, create without it
241                                PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
242                            }
243                        }
244                        Err(e) => {
245                            warn!(
246                                "Failed to create PURL for CRAN dependency '{}': {}",
247                                name, e
248                            );
249                            None
250                        }
251                    }
252                } else {
253                    // No version found in requirement
254                    PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
255                }
256            } else {
257                // No requirement
258                PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
259            }
260        } else {
261            // Not pinned, create PURL without version
262            PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
263        };
264
265        dependencies.push(Dependency {
266            purl,
267            extracted_requirement: extracted_requirement.map(truncate_field),
268            scope: scope.map(|s| truncate_field(s.to_string())),
269            is_runtime: Some(scope.is_none() || scope == Some("imports")),
270            is_optional: Some(scope == Some("suggests") || scope == Some("enhances")),
271            is_pinned: Some(is_pinned),
272            is_direct: Some(true),
273            resolved_package: None,
274            extra_data: None,
275        });
276    }
277
278    dependencies
279}
280
/// Matches a single dependency entry of the form `name (op version)`.
///
/// Capture groups:
/// 1. package name: one or more ASCII letters, digits, or dots
/// 2. operator: one or more of the characters `>`, `<`, `=`
/// 3. version text: everything up to (not including) the closing parenthesis
///
/// The pattern is anchored at both ends, so an entry without a parenthesized
/// constraint does not match. Compiled lazily on first use.
static VERSION_CONSTRAINT_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"^([a-zA-Z0-9.]+)\s*\(([><=]+)\s*([^)]+)\)\s*$").expect("valid regex")
});
284
285/// Examples:
286/// - "cli (>= 3.6.2)" -> ("cli", Some(">= 3.6.2"), true)
287/// - "generics" -> ("generics", None, false)
288/// - "glue (== 1.3.2)" -> ("glue", Some("== 1.3.2"), true)
289fn parse_version_constraint(dep: &str) -> (String, Option<String>, bool) {
290    if let Some(captures) = VERSION_CONSTRAINT_RE.captures(dep) {
291        let name = match captures.get(1) {
292            Some(m) => truncate_field(m.as_str().to_string()),
293            None => return (truncate_field(dep.trim().to_string()), None, false),
294        };
295        let operator = match captures.get(2) {
296            Some(m) => m.as_str(),
297            None => return (name, None, false),
298        };
299        let version = match captures.get(3) {
300            Some(m) => m.as_str(),
301            None => return (name, None, false),
302        };
303        let requirement = truncate_field(format!("{} {}", operator, version));
304        let is_pinned = operator == "==";
305
306        (name, Some(requirement), is_pinned)
307    } else {
308        (truncate_field(dep.trim().to_string()), None, false)
309    }
310}
311
312/// Extract version number from a requirement string like ">= 3.6.2" or "== 1.0.0".
313fn extract_version_from_requirement(requirement: &str) -> Option<String> {
314    requirement
315        .split_whitespace()
316        .nth(1)
317        .map(|s| truncate_field(s.to_string()))
318}
319
320/// Build description from Title and Description fields.
321fn build_description(fields: &HashMap<String, String>) -> Option<String> {
322    let title = fields.get("Title").map(|s| s.trim());
323    let desc = fields.get("Description").map(|s| s.trim());
324
325    match (title, desc) {
326        (Some(t), Some(d)) if !t.is_empty() && !d.is_empty() => {
327            Some(truncate_field(format!("{}\n{}", t, d)))
328        }
329        (Some(t), _) if !t.is_empty() => Some(truncate_field(t.to_string())),
330        (_, Some(d)) if !d.is_empty() => Some(truncate_field(d.to_string())),
331        _ => None,
332    }
333}
334
335fn split_author_entries(author_str: &str) -> Vec<&str> {
336    let mut entries = Vec::new();
337    let mut start = 0;
338    let mut bracket_depth: usize = 0;
339    let mut paren_depth: usize = 0;
340
341    for (idx, ch) in author_str.char_indices().take(MAX_ITERATION_COUNT) {
342        match ch {
343            '[' => bracket_depth += 1,
344            ']' => bracket_depth = bracket_depth.saturating_sub(1),
345            '(' => paren_depth += 1,
346            ')' => paren_depth = paren_depth.saturating_sub(1),
347            ',' if bracket_depth == 0 && paren_depth == 0 => {
348                let entry = author_str[start..idx].trim();
349                if !entry.is_empty() {
350                    entries.push(entry);
351                }
352                start = idx + 1;
353            }
354            _ => {}
355        }
356    }
357
358    let final_entry = author_str[start..].trim();
359    if !final_entry.is_empty() {
360        entries.push(final_entry);
361    }
362
363    entries
364}
365
366/// Parse party information from Author or Maintainer field.
367///
368/// Formats supported:
369/// - "Name <email@domain.com>"
370/// - "Name"
371/// - "email@domain.com"
372fn parse_party(info: &str, role: &str) -> Option<Party> {
373    let info = info.trim();
374    if info.is_empty() {
375        return None;
376    }
377
378    // Check for "Name <email>" format
379    if info.contains('<') && info.contains('>') {
380        let parts: Vec<&str> = info.split('<').collect();
381        if parts.len() == 2 {
382            let name = parts[0].trim().to_string();
383            let email = parts[1].trim_end_matches('>').trim().to_string();
384
385            if !email.contains('@') {
386                return Some(Party {
387                    r#type: Some(truncate_field("person".to_string())),
388                    role: Some(truncate_field(role.to_string())),
389                    name: Some(truncate_field(info.to_string())),
390                    email: None,
391                    url: None,
392                    organization: None,
393                    organization_url: None,
394                    timezone: None,
395                });
396            }
397
398            return Some(Party {
399                r#type: Some(truncate_field("person".to_string())),
400                role: Some(truncate_field(role.to_string())),
401                name: if name.is_empty() {
402                    None
403                } else {
404                    Some(truncate_field(name))
405                },
406                email: if email.is_empty() {
407                    None
408                } else {
409                    Some(truncate_field(email))
410                },
411                url: None,
412                organization: None,
413                organization_url: None,
414                timezone: None,
415            });
416        }
417    }
418
419    // Just a name or email
420    Some(Party {
421        r#type: Some(truncate_field("person".to_string())),
422        role: Some(truncate_field(role.to_string())),
423        name: Some(truncate_field(info.to_string())),
424        email: None,
425        url: None,
426        organization: None,
427        organization_url: None,
428        timezone: None,
429    })
430}
431
432/// Create a package URL for a CRAN package.
433fn create_package_url(name: &Option<String>, version: &Option<String>) -> Option<String> {
434    name.as_ref().and_then(|name| {
435        let mut package_url = match PackageUrl::new("cran", name) {
436            Ok(p) => p,
437            Err(e) => {
438                warn!(
439                    "Failed to create PackageUrl for CRAN package '{}': {}",
440                    name, e
441                );
442                return None;
443            }
444        };
445
446        if let Some(v) = version
447            && let Err(e) = package_url.with_version(v)
448        {
449            warn!(
450                "Failed to set version '{}' for CRAN package '{}': {}",
451                v, name, e
452            );
453            return None;
454        }
455
456        Some(package_url.to_string())
457    })
458}
459
460fn default_package_data() -> PackageData {
461    PackageData {
462        package_type: Some(CranParser::PACKAGE_TYPE),
463        primary_language: Some("R".to_string()),
464        datasource_id: Some(DatasourceId::CranDescription),
465        ..Default::default()
466    }
467}
468
// Register this parser through the crate's `register_parser!` macro.
// NOTE(review): the macro definition is not visible here. The arguments
// appear to be a human-readable description, the path glob patterns, the
// package type identifier, the primary language, and a documentation URL.
// Confirm against the macro definition.
crate::register_parser!(
    "CRAN R package DESCRIPTION file",
    &["**/DESCRIPTION"],
    "cran",
    "R",
    Some("https://cran.r-project.org/doc/manuals/r-release/R-exts.html#The-DESCRIPTION-file"),
);