Skip to main content

provenant/parsers/
cran.rs

1//! Parser for CRAN R package DESCRIPTION files.
2//!
3//! Extracts package metadata and dependencies from R package DESCRIPTION files
4//! which use Debian Control File (DCF) format similar to RFC822.
5//!
6//! # Supported Formats
7//! - DESCRIPTION (CRAN R package manifest)
8//!
9//! # Key Features
10//! - Multi-type dependency extraction (Depends, Imports, Suggests, Enhances, LinkingTo)
11//! - Version constraint parsing with operators (>=, <=, >, <, ==)
12//! - Filters out R version requirements (not actual packages)
13//! - Author/Maintainer party extraction with email parsing
14//! - Package URL (purl) generation
15//!
16//! # Implementation Notes
17//! - Uses DCF/RFC822-like format with continuation lines
18//! - Field names are case-sensitive (Package, Version, Description, etc.)
19//! - Dependencies are comma-separated with optional version constraints
20//! - R version requirements (e.g., "R (>= 4.1.0)") are filtered out
21//! - Authors@R field is NOT parsed (requires R interpreter)
22
23use std::collections::HashMap;
24use std::path::Path;
25use std::sync::LazyLock;
26
27use crate::parser_warn as warn;
28use packageurl::PackageUrl;
29use regex::Regex;
30
31use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
32use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
33
34use super::PackageParser;
35
/// CRAN R package DESCRIPTION file parser.
///
/// Extracts package metadata, dependencies, and party information from
/// standard DESCRIPTION files used by R packages in the CRAN ecosystem.
///
/// This is a stateless unit struct; its behavior is provided by the
/// [`PackageParser`] implementation.
pub struct CranParser;
41
42impl PackageParser for CranParser {
43    const PACKAGE_TYPE: PackageType = PackageType::Cran;
44
45    fn is_match(path: &Path) -> bool {
46        path.file_name().is_some_and(|name| name == "DESCRIPTION")
47    }
48
49    fn extract_packages(path: &Path) -> Vec<PackageData> {
50        let content = match read_file_to_string(path, None) {
51            Ok(c) => c,
52            Err(e) => {
53                warn!("Failed to read DESCRIPTION at {:?}: {}", path, e);
54                return vec![default_package_data()];
55            }
56        };
57        let fields = parse_dcf(&content);
58
59        let name = fields
60            .get("Package")
61            .map(|s| truncate_field(s.trim().to_string()));
62        let version = fields
63            .get("Version")
64            .map(|s| truncate_field(s.trim().to_string()));
65
66        // Generate PURL
67        let purl = create_package_url(&name, &version);
68
69        // Generate repository URLs
70        let repository_homepage_url = name
71            .as_ref()
72            .map(|n| truncate_field(format!("https://cran.r-project.org/package={}", n)));
73
74        // Build description from Title and Description fields
75        let description = build_description(&fields);
76
77        // Extract license statement
78        let extracted_license_statement = fields
79            .get("License")
80            .map(|s| truncate_field(s.trim().to_string()));
81
82        // Extract URL field
83        let homepage_url = fields
84            .get("URL")
85            .map(|s| truncate_field(s.split(',').next().unwrap_or("").trim().to_string()))
86            .filter(|s| !s.is_empty());
87
88        // Extract parties (Author and Maintainer)
89        let mut parties = Vec::new();
90
91        // Parse Maintainer field
92        if let Some(maintainer_str) = fields.get("Maintainer")
93            && let Some(party) = parse_party(maintainer_str, "maintainer")
94        {
95            parties.push(party);
96        }
97
98        // Parse Author field
99        if let Some(author_str) = fields.get("Author") {
100            for author_part in split_author_entries(author_str) {
101                if let Some(party) = parse_party(author_part, "author") {
102                    parties.push(party);
103                }
104            }
105        }
106
107        // Extract dependencies from all dependency fields
108        let mut dependencies = Vec::new();
109
110        // Process each dependency type
111        for (field_name, scope) in [
112            ("Depends", None),
113            ("Imports", Some("imports")),
114            ("Suggests", Some("suggests")),
115            ("Enhances", Some("enhances")),
116            ("LinkingTo", Some("linkingto")),
117        ] {
118            if let Some(deps_str) = fields.get(field_name) {
119                dependencies.extend(parse_dependencies(deps_str, scope));
120            }
121        }
122
123        vec![PackageData {
124            package_type: Some(Self::PACKAGE_TYPE),
125            namespace: None,
126            name,
127            version,
128            qualifiers: None,
129            subpath: None,
130            primary_language: Some("R".to_string()),
131            description,
132            release_date: None,
133            parties,
134            keywords: Vec::new(),
135            homepage_url,
136            download_url: None,
137            size: None,
138            sha1: None,
139            md5: None,
140            sha256: None,
141            sha512: None,
142            bug_tracking_url: None,
143            code_view_url: None,
144            vcs_url: None,
145            copyright: None,
146            holder: None,
147            declared_license_expression: None,
148            declared_license_expression_spdx: None,
149            license_detections: Vec::new(),
150            other_license_expression: None,
151            other_license_expression_spdx: None,
152            other_license_detections: Vec::new(),
153            extracted_license_statement,
154            notice_text: None,
155            source_packages: Vec::new(),
156            file_references: Vec::new(),
157            is_private: false,
158            is_virtual: false,
159            extra_data: None,
160            dependencies,
161            repository_homepage_url,
162            repository_download_url: None,
163            api_data_url: None,
164            datasource_id: Some(DatasourceId::CranDescription),
165            purl,
166        }]
167    }
168}
169
170fn parse_dcf(content: &str) -> HashMap<String, String> {
171    let mut fields: HashMap<String, String> = HashMap::new();
172    let mut current_field: Option<String> = None;
173    let mut current_value = String::new();
174
175    for line in content.lines().take(MAX_ITERATION_COUNT) {
176        // Check if line is a continuation (starts with whitespace)
177        if line.starts_with(' ') || line.starts_with('\t') {
178            if current_field.is_some() {
179                // Append to current value, replacing continuation line indent with space
180                if !current_value.is_empty() {
181                    current_value.push(' ');
182                }
183                current_value.push_str(line.trim_start());
184            }
185        } else if let Some((field_name, field_value)) = line.split_once(':') {
186            // New field: save previous field if any
187            if let Some(field) = current_field.take() {
188                fields.insert(field, truncate_field(current_value.clone()));
189                current_value.clear();
190            }
191
192            // Start new field
193            current_field = Some(field_name.trim().to_string());
194            current_value = field_value.trim_start().to_string();
195        }
196        // Else: empty line or invalid line - ignore
197    }
198
199    // Save the last field
200    if let Some(field) = current_field {
201        fields.insert(field, truncate_field(current_value));
202    }
203
204    fields
205}
206
207/// Parse a comma-separated dependency list with optional version constraints.
208///
209/// Format: "package1 (>= 1.0), package2, package3 (== 2.0)"
210/// Filters out R version requirements like "R (>= 4.1.0)"
211fn parse_dependencies(deps_str: &str, scope: Option<&str>) -> Vec<Dependency> {
212    let mut dependencies = Vec::new();
213
214    for dep in deps_str.split(',').take(MAX_ITERATION_COUNT) {
215        let dep = dep.trim();
216        if dep.is_empty() {
217            continue;
218        }
219
220        let (name, extracted_requirement, is_pinned) = parse_version_constraint(dep);
221
222        // Skip R version requirements (not actual package dependencies)
223        if name == "R" {
224            continue;
225        }
226
227        // Create PURL for dependency
228        let purl = if is_pinned {
229            // For pinned versions, extract version from requirement
230            if let Some(ref req) = extracted_requirement {
231                if let Some(version) = extract_version_from_requirement(req) {
232                    match PackageUrl::new("cran", &name) {
233                        Ok(mut p) => {
234                            if p.with_version(&version).is_ok() {
235                                Some(p.to_string())
236                            } else {
237                                // Failed to set version, create without it
238                                PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
239                            }
240                        }
241                        Err(e) => {
242                            warn!(
243                                "Failed to create PURL for CRAN dependency '{}': {}",
244                                name, e
245                            );
246                            None
247                        }
248                    }
249                } else {
250                    // No version found in requirement
251                    PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
252                }
253            } else {
254                // No requirement
255                PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
256            }
257        } else {
258            // Not pinned, create PURL without version
259            PackageUrl::new("cran", &name).ok().map(|p| p.to_string())
260        };
261
262        dependencies.push(Dependency {
263            purl,
264            extracted_requirement: extracted_requirement.map(truncate_field),
265            scope: scope.map(|s| truncate_field(s.to_string())),
266            is_runtime: Some(scope.is_none() || scope == Some("imports")),
267            is_optional: Some(scope == Some("suggests") || scope == Some("enhances")),
268            is_pinned: Some(is_pinned),
269            is_direct: Some(true),
270            resolved_package: None,
271            extra_data: None,
272        });
273    }
274
275    dependencies
276}
277
278static VERSION_CONSTRAINT_RE: LazyLock<Regex> = LazyLock::new(|| {
279    Regex::new(r"^([a-zA-Z0-9.]+)\s*\(([><=]+)\s*([0-9.]+)\)\s*$").expect("valid regex")
280});
281
282/// Examples:
283/// - "cli (>= 3.6.2)" -> ("cli", Some(">= 3.6.2"), true)
284/// - "generics" -> ("generics", None, false)
285/// - "glue (== 1.3.2)" -> ("glue", Some("== 1.3.2"), true)
286fn parse_version_constraint(dep: &str) -> (String, Option<String>, bool) {
287    if let Some(captures) = VERSION_CONSTRAINT_RE.captures(dep) {
288        let name = match captures.get(1) {
289            Some(m) => truncate_field(m.as_str().to_string()),
290            None => return (truncate_field(dep.trim().to_string()), None, false),
291        };
292        let operator = match captures.get(2) {
293            Some(m) => m.as_str(),
294            None => return (name, None, false),
295        };
296        let version = match captures.get(3) {
297            Some(m) => m.as_str(),
298            None => return (name, None, false),
299        };
300        let requirement = truncate_field(format!("{} {}", operator, version));
301        let is_pinned = operator == "==";
302
303        (name, Some(requirement), is_pinned)
304    } else {
305        (truncate_field(dep.trim().to_string()), None, false)
306    }
307}
308
309/// Extract version number from a requirement string like ">= 3.6.2" or "== 1.0.0".
310fn extract_version_from_requirement(requirement: &str) -> Option<String> {
311    requirement
312        .split_whitespace()
313        .nth(1)
314        .map(|s| truncate_field(s.to_string()))
315}
316
317/// Build description from Title and Description fields.
318fn build_description(fields: &HashMap<String, String>) -> Option<String> {
319    let title = fields.get("Title").map(|s| s.trim());
320    let desc = fields.get("Description").map(|s| s.trim());
321
322    match (title, desc) {
323        (Some(t), Some(d)) if !t.is_empty() && !d.is_empty() => {
324            Some(truncate_field(format!("{}\n{}", t, d)))
325        }
326        (Some(t), _) if !t.is_empty() => Some(truncate_field(t.to_string())),
327        (_, Some(d)) if !d.is_empty() => Some(truncate_field(d.to_string())),
328        _ => None,
329    }
330}
331
332fn split_author_entries(author_str: &str) -> Vec<&str> {
333    let mut entries = Vec::new();
334    let mut start = 0;
335    let mut bracket_depth: usize = 0;
336    let mut paren_depth: usize = 0;
337
338    for (idx, ch) in author_str.char_indices().take(MAX_ITERATION_COUNT) {
339        match ch {
340            '[' => bracket_depth += 1,
341            ']' => bracket_depth = bracket_depth.saturating_sub(1),
342            '(' => paren_depth += 1,
343            ')' => paren_depth = paren_depth.saturating_sub(1),
344            ',' if bracket_depth == 0 && paren_depth == 0 => {
345                let entry = author_str[start..idx].trim();
346                if !entry.is_empty() {
347                    entries.push(entry);
348                }
349                start = idx + 1;
350            }
351            _ => {}
352        }
353    }
354
355    let final_entry = author_str[start..].trim();
356    if !final_entry.is_empty() {
357        entries.push(final_entry);
358    }
359
360    entries
361}
362
363/// Parse party information from Author or Maintainer field.
364///
365/// Formats supported:
366/// - "Name <email@domain.com>"
367/// - "Name"
368/// - "email@domain.com"
369fn parse_party(info: &str, role: &str) -> Option<Party> {
370    let info = info.trim();
371    if info.is_empty() {
372        return None;
373    }
374
375    // Check for "Name <email>" format
376    if info.contains('<') && info.contains('>') {
377        let parts: Vec<&str> = info.split('<').collect();
378        if parts.len() == 2 {
379            let name = parts[0].trim().to_string();
380            let email = parts[1].trim_end_matches('>').trim().to_string();
381
382            if !email.contains('@') {
383                return Some(Party {
384                    r#type: Some(truncate_field("person".to_string())),
385                    role: Some(truncate_field(role.to_string())),
386                    name: Some(truncate_field(info.to_string())),
387                    email: None,
388                    url: None,
389                    organization: None,
390                    organization_url: None,
391                    timezone: None,
392                });
393            }
394
395            return Some(Party {
396                r#type: Some(truncate_field("person".to_string())),
397                role: Some(truncate_field(role.to_string())),
398                name: if name.is_empty() {
399                    None
400                } else {
401                    Some(truncate_field(name))
402                },
403                email: if email.is_empty() {
404                    None
405                } else {
406                    Some(truncate_field(email))
407                },
408                url: None,
409                organization: None,
410                organization_url: None,
411                timezone: None,
412            });
413        }
414    }
415
416    // Just a name or email
417    Some(Party {
418        r#type: Some(truncate_field("person".to_string())),
419        role: Some(truncate_field(role.to_string())),
420        name: Some(truncate_field(info.to_string())),
421        email: None,
422        url: None,
423        organization: None,
424        organization_url: None,
425        timezone: None,
426    })
427}
428
429/// Create a package URL for a CRAN package.
430fn create_package_url(name: &Option<String>, version: &Option<String>) -> Option<String> {
431    name.as_ref().and_then(|name| {
432        let mut package_url = match PackageUrl::new("cran", name) {
433            Ok(p) => p,
434            Err(e) => {
435                warn!(
436                    "Failed to create PackageUrl for CRAN package '{}': {}",
437                    name, e
438                );
439                return None;
440            }
441        };
442
443        if let Some(v) = version
444            && let Err(e) = package_url.with_version(v)
445        {
446            warn!(
447                "Failed to set version '{}' for CRAN package '{}': {}",
448                v, name, e
449            );
450            return None;
451        }
452
453        Some(package_url.to_string())
454    })
455}
456
457fn default_package_data() -> PackageData {
458    PackageData {
459        package_type: Some(CranParser::PACKAGE_TYPE),
460        primary_language: Some("R".to_string()),
461        datasource_id: Some(DatasourceId::CranDescription),
462        ..Default::default()
463    }
464}
465
// Invokes `crate::register_parser!` for the CRAN DESCRIPTION parser.
// NOTE(review): the macro definition is not visible here; judging by the
// values, the arguments appear to be a human-readable description, the path
// glob patterns, the package type, the primary language, and an optional
// documentation URL — confirm against the macro definition.
crate::register_parser!(
    "CRAN R package DESCRIPTION file",
    &["**/DESCRIPTION"],
    "cran",
    "R",
    Some("https://cran.r-project.org/doc/manuals/r-release/R-exts.html#The-DESCRIPTION-file"),
);