Skip to main content

provenant/parsers/
cran.rs

1//! Parser for CRAN R package DESCRIPTION files.
2//!
3//! Extracts package metadata and dependencies from R package DESCRIPTION files
4//! which use Debian Control File (DCF) format similar to RFC822.
5//!
6//! # Supported Formats
7//! - DESCRIPTION (CRAN R package manifest)
8//!
9//! # Key Features
10//! - Multi-type dependency extraction (Depends, Imports, Suggests, Enhances, LinkingTo)
11//! - Version constraint parsing with operators (>=, <=, >, <, ==)
12//! - Filters out R version requirements (not actual packages)
13//! - Author/Maintainer party extraction with email parsing
14//! - Package URL (purl) generation
15//!
16//! # Implementation Notes
17//! - Uses DCF/RFC822-like format with continuation lines
18//! - Field names are case-sensitive (Package, Version, Description, etc.)
19//! - Dependencies are comma-separated with optional version constraints
20//! - R version requirements (e.g., "R (>= 4.1.0)") are filtered out
21//! - Authors@R field is NOT parsed (requires R interpreter)
22
23use std::collections::HashMap;
24use std::fs::File;
25use std::io::Read;
26use std::path::Path;
27
28use lazy_static::lazy_static;
29use log::warn;
30use packageurl::PackageUrl;
31use regex::Regex;
32
33use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
34
35use super::PackageParser;
36
/// CRAN R package DESCRIPTION file parser.
///
/// Extracts package metadata, dependencies, and party information from
/// standard DESCRIPTION files used by R packages in the CRAN ecosystem.
///
/// This is a unit struct and carries no data; parsing is exposed through its
/// `PackageParser` implementation.
pub struct CranParser;
42
43impl PackageParser for CranParser {
44    const PACKAGE_TYPE: PackageType = PackageType::Cran;
45
46    fn is_match(path: &Path) -> bool {
47        path.file_name().is_some_and(|name| name == "DESCRIPTION")
48    }
49
50    fn extract_packages(path: &Path) -> Vec<PackageData> {
51        let fields = match read_description_file(path) {
52            Ok(content) => parse_dcf(&content),
53            Err(e) => {
54                warn!("Failed to read DESCRIPTION at {:?}: {}", path, e);
55                return vec![default_package_data()];
56            }
57        };
58
59        let name = fields.get("Package").map(|s| s.trim().to_string());
60        let version = fields.get("Version").map(|s| s.trim().to_string());
61
62        // Generate PURL
63        let purl = create_package_url(&name, &version);
64
65        // Generate repository URLs
66        let repository_homepage_url = name
67            .as_ref()
68            .map(|n| format!("https://cran.r-project.org/package={}", n));
69
70        // Build description from Title and Description fields
71        let description = build_description(&fields);
72
73        // Extract license statement
74        let extracted_license_statement = fields.get("License").map(|s| s.trim().to_string());
75
76        // Extract URL field
77        let homepage_url = fields
78            .get("URL")
79            .map(|s| s.split(',').next().unwrap_or("").trim().to_string())
80            .filter(|s| !s.is_empty());
81
82        // Extract parties (Author and Maintainer)
83        let mut parties = Vec::new();
84
85        // Parse Maintainer field
86        if let Some(maintainer_str) = fields.get("Maintainer")
87            && let Some(party) = parse_party(maintainer_str, "maintainer")
88        {
89            parties.push(party);
90        }
91
92        // Parse Author field
93        if let Some(author_str) = fields.get("Author") {
94            for author_part in split_author_entries(author_str) {
95                if let Some(party) = parse_party(author_part, "author") {
96                    parties.push(party);
97                }
98            }
99        }
100
101        // Extract dependencies from all dependency fields
102        let mut dependencies = Vec::new();
103
104        // Process each dependency type
105        for (field_name, scope) in [
106            ("Depends", None),
107            ("Imports", Some("imports")),
108            ("Suggests", Some("suggests")),
109            ("Enhances", Some("enhances")),
110            ("LinkingTo", Some("linkingto")),
111        ] {
112            if let Some(deps_str) = fields.get(field_name) {
113                dependencies.extend(parse_dependencies(deps_str, scope));
114            }
115        }
116
117        vec![PackageData {
118            package_type: Some(Self::PACKAGE_TYPE),
119            namespace: None,
120            name,
121            version,
122            qualifiers: None,
123            subpath: None,
124            primary_language: Some("R".to_string()),
125            description,
126            release_date: None,
127            parties,
128            keywords: Vec::new(),
129            homepage_url,
130            download_url: None,
131            size: None,
132            sha1: None,
133            md5: None,
134            sha256: None,
135            sha512: None,
136            bug_tracking_url: None,
137            code_view_url: None,
138            vcs_url: None,
139            copyright: None,
140            holder: None,
141            declared_license_expression: None,
142            declared_license_expression_spdx: None,
143            license_detections: Vec::new(),
144            other_license_expression: None,
145            other_license_expression_spdx: None,
146            other_license_detections: Vec::new(),
147            extracted_license_statement,
148            notice_text: None,
149            source_packages: Vec::new(),
150            file_references: Vec::new(),
151            is_private: false,
152            is_virtual: false,
153            extra_data: None,
154            dependencies,
155            repository_homepage_url,
156            repository_download_url: None,
157            api_data_url: None,
158            datasource_id: Some(DatasourceId::CranDescription),
159            purl,
160        }]
161    }
162}
163
/// Load the full text of a DESCRIPTION file.
///
/// # Errors
/// Returns a human-readable message when the file cannot be opened or when its
/// contents cannot be read as UTF-8 text.
fn read_description_file(path: &Path) -> Result<String, String> {
    let mut text = String::new();

    File::open(path)
        .map_err(|err| format!("Failed to open file: {}", err))?
        .read_to_string(&mut text)
        .map_err(|err| format!("Failed to read file: {}", err))?;

    Ok(text)
}
174
/// Turn DCF (Debian Control File) text into a map from field name to value.
///
/// Rules applied, line by line:
/// - A line beginning with a space or tab continues the field currently being
///   read; its leading whitespace is dropped and it is joined to the value with
///   a single space. Continuation lines seen before any field are discarded.
/// - Any other line containing `:` starts a new field. The name is trimmed on
///   both sides; the value only has its leading whitespace removed.
/// - Lines matching neither rule (blank or malformed) are skipped.
///
/// Field names are kept as written (case-sensitive); a repeated name overwrites
/// the earlier value.
fn parse_dcf(content: &str) -> HashMap<String, String> {
    let mut parsed: HashMap<String, String> = HashMap::new();
    // The field still being accumulated, as (name, value so far).
    let mut pending: Option<(String, String)> = None;

    for raw_line in content.lines() {
        let is_continuation = matches!(raw_line.chars().next(), Some(' ') | Some('\t'));

        if is_continuation {
            if let Some((_, value)) = pending.as_mut() {
                if !value.is_empty() {
                    value.push(' ');
                }
                value.push_str(raw_line.trim_start());
            }
            continue;
        }

        let Some((key, rest)) = raw_line.split_once(':') else {
            continue;
        };

        // A new field begins: flush the one that was in progress.
        if let Some((done_key, done_value)) = pending.take() {
            parsed.insert(done_key, done_value);
        }
        pending = Some((key.trim().to_string(), rest.trim_start().to_string()));
    }

    if let Some((last_key, last_value)) = pending {
        parsed.insert(last_key, last_value);
    }

    parsed
}
217
218/// Parse a comma-separated dependency list with optional version constraints.
219///
220/// Format: "package1 (>= 1.0), package2, package3 (== 2.0)"
221/// Filters out R version requirements like "R (>= 4.1.0)"
222fn parse_dependencies(deps_str: &str, scope: Option<&str>) -> Vec<Dependency> {
223    let mut dependencies = Vec::new();
224
225    for dep in deps_str.split(',') {
226        let dep = dep.trim();
227        if dep.is_empty() {
228            continue;
229        }
230
231        let (name, extracted_requirement, is_pinned) = parse_version_constraint(dep);
232
233        // Skip R version requirements (not actual package dependencies)
234        if name == "R" {
235            continue;
236        }
237
238        // Create PURL for dependency
239        let purl = if is_pinned {
240            // For pinned versions, extract version from requirement
241            if let Some(ref req) = extracted_requirement {
242                if let Some(version) = extract_version_from_requirement(req) {
243                    match PackageUrl::new("cran", &name) {
244                        Ok(mut p) => {
245                            if p.with_version(&version).is_ok() {
246                                Some(p.to_string())
247                            } else {
248                                // Failed to set version, create without it
249                                PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
250                            }
251                        }
252                        Err(e) => {
253                            warn!(
254                                "Failed to create PURL for CRAN dependency '{}': {}",
255                                name, e
256                            );
257                            None
258                        }
259                    }
260                } else {
261                    // No version found in requirement
262                    PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
263                }
264            } else {
265                // No requirement
266                PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
267            }
268        } else {
269            // Not pinned, create PURL without version
270            PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
271        };
272
273        dependencies.push(Dependency {
274            purl,
275            extracted_requirement,
276            scope: scope.map(|s| s.to_string()),
277            is_runtime: Some(scope.is_none() || scope == Some("imports")),
278            is_optional: Some(scope == Some("suggests") || scope == Some("enhances")),
279            is_pinned: Some(is_pinned),
280            is_direct: Some(true),
281            resolved_package: None,
282            extra_data: None,
283        });
284    }
285
286    dependencies
287}
288
289lazy_static! {
290    static ref VERSION_CONSTRAINT_RE: Regex =
291        Regex::new(r"^([a-zA-Z0-9.]+)\s*\(([><=]+)\s*([0-9.]+)\)\s*$").unwrap();
292}
293
294/// Examples:
295/// - "cli (>= 3.6.2)" -> ("cli", Some(">= 3.6.2"), true)
296/// - "generics" -> ("generics", None, false)
297/// - "glue (== 1.3.2)" -> ("glue", Some("== 1.3.2"), true)
298fn parse_version_constraint(dep: &str) -> (String, Option<String>, bool) {
299    if let Some(captures) = VERSION_CONSTRAINT_RE.captures(dep) {
300        let name = captures.get(1).unwrap().as_str().to_string();
301        let operator = captures.get(2).unwrap().as_str();
302        let version = captures.get(3).unwrap().as_str();
303        let requirement = format!("{} {}", operator, version);
304        let is_pinned = operator == "==";
305
306        (name, Some(requirement), is_pinned)
307    } else {
308        // No version constraint
309        (dep.trim().to_string(), None, false)
310    }
311}
312
/// Pull the version out of a requirement such as ">= 3.6.2" or "== 1.0.0".
///
/// The requirement is read as whitespace-separated tokens; the second token is
/// the version. Returns `None` when fewer than two tokens are present.
fn extract_version_from_requirement(requirement: &str) -> Option<String> {
    let mut tokens = requirement.split_whitespace();
    let _operator = tokens.next()?;
    tokens.next().map(str::to_owned)
}
317
/// Combine the `Title` and `Description` fields into one description string.
///
/// Both values are trimmed and treated as absent when blank. When both are
/// present they are joined as `"<title>\n<description>"`; when only one is
/// present it is returned alone; otherwise the result is `None`.
fn build_description(fields: &HashMap<String, String>) -> Option<String> {
    /// Look up `key` and return its trimmed value unless it is blank.
    fn non_blank<'a>(fields: &'a HashMap<String, String>, key: &str) -> Option<&'a str> {
        fields
            .get(key)
            .map(|value| value.trim())
            .filter(|value| !value.is_empty())
    }

    let heading = non_blank(fields, "Title");
    let body = non_blank(fields, "Description");

    match (heading, body) {
        (Some(heading), Some(body)) => Some(format!("{}\n{}", heading, body)),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
330
/// Split an `Author` field value into individual author entries.
///
/// Entries are separated by commas, except that commas nested inside `[...]`
/// or `(...)` do not split (e.g. role lists like `[aut, cre]`). Each entry is
/// trimmed and empty entries are dropped. Unmatched closing brackets are
/// ignored rather than driving the nesting depth negative.
fn split_author_entries(author_str: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    let mut segment_start = 0;
    let mut open_brackets: usize = 0;
    let mut open_parens: usize = 0;

    for (pos, c) in author_str.char_indices() {
        if c == '[' {
            open_brackets += 1;
        } else if c == ']' {
            open_brackets = open_brackets.saturating_sub(1);
        } else if c == '(' {
            open_parens += 1;
        } else if c == ')' {
            open_parens = open_parens.saturating_sub(1);
        } else if c == ',' && open_brackets == 0 && open_parens == 0 {
            pieces.push(author_str[segment_start..pos].trim());
            segment_start = pos + c.len_utf8();
        }
    }

    // Whatever follows the last top-level comma is the final entry.
    pieces.push(author_str[segment_start..].trim());
    pieces.retain(|piece| !piece.is_empty());
    pieces
}
361
362/// Parse party information from Author or Maintainer field.
363///
364/// Formats supported:
365/// - "Name <email@domain.com>"
366/// - "Name"
367/// - "email@domain.com"
368fn parse_party(info: &str, role: &str) -> Option<Party> {
369    let info = info.trim();
370    if info.is_empty() {
371        return None;
372    }
373
374    // Check for "Name <email>" format
375    if info.contains('<') && info.contains('>') {
376        let parts: Vec<&str> = info.split('<').collect();
377        if parts.len() == 2 {
378            let name = parts[0].trim().to_string();
379            let email = parts[1].trim_end_matches('>').trim().to_string();
380
381            if !email.contains('@') {
382                return Some(Party {
383                    r#type: Some("person".to_string()),
384                    role: Some(role.to_string()),
385                    name: Some(info.to_string()),
386                    email: None,
387                    url: None,
388                    organization: None,
389                    organization_url: None,
390                    timezone: None,
391                });
392            }
393
394            return Some(Party {
395                r#type: Some("person".to_string()),
396                role: Some(role.to_string()),
397                name: if name.is_empty() { None } else { Some(name) },
398                email: if email.is_empty() { None } else { Some(email) },
399                url: None,
400                organization: None,
401                organization_url: None,
402                timezone: None,
403            });
404        }
405    }
406
407    // Just a name or email
408    Some(Party {
409        r#type: Some("person".to_string()),
410        role: Some(role.to_string()),
411        name: Some(info.to_string()),
412        email: None,
413        url: None,
414        organization: None,
415        organization_url: None,
416        timezone: None,
417    })
418}
419
420/// Create a package URL for a CRAN package.
421fn create_package_url(name: &Option<String>, version: &Option<String>) -> Option<String> {
422    name.as_ref().and_then(|name| {
423        let mut package_url = match PackageUrl::new("cran", name) {
424            Ok(p) => p,
425            Err(e) => {
426                warn!(
427                    "Failed to create PackageUrl for CRAN package '{}': {}",
428                    name, e
429                );
430                return None;
431            }
432        };
433
434        if let Some(v) = version
435            && let Err(e) = package_url.with_version(v)
436        {
437            warn!(
438                "Failed to set version '{}' for CRAN package '{}': {}",
439                v, name, e
440            );
441            return None;
442        }
443
444        Some(package_url.to_string())
445    })
446}
447
448fn default_package_data() -> PackageData {
449    PackageData {
450        package_type: Some(CranParser::PACKAGE_TYPE),
451        primary_language: Some("R".to_string()),
452        datasource_id: Some(DatasourceId::CranDescription),
453        ..Default::default()
454    }
455}
456
// Invokes the crate-level `register_parser!` macro for this parser.
// NOTE(review): the macro definition is not visible here. The arguments appear
// to be a human-readable description, the path glob(s) to match, the package
// type, the primary language, and an optional documentation URL — confirm
// against the macro definition.
crate::register_parser!(
    "CRAN R package DESCRIPTION file",
    &["**/DESCRIPTION"],
    "cran",
    "R",
    Some("https://cran.r-project.org/doc/manuals/r-release/R-exts.html#The-DESCRIPTION-file"),
);