Skip to main content

provenant/parsers/
utils.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
//! Shared utility functions for package parsers.
//!
//! This module provides common file I/O and parsing utilities
//! used across multiple parser implementations.
8use std::collections::HashSet;
9use std::fs::{self, File};
10use std::hash::Hash;
11use std::io::Read;
12use std::path::Path;
13
14use anyhow::Result;
15use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64_STANDARD};
16use packageurl::PackageUrl;
17
/// Largest non-archive manifest file that will be read, in bytes
/// (100 * 1024 * 1024 = 104,857,600).
pub const MAX_MANIFEST_SIZE: u64 = 100 * 1024 * 1024;

/// Longest individual string field value that is kept, in bytes
/// (10 * 1024 * 1024 = 10,485,760).
pub const MAX_FIELD_LENGTH: usize = 10 * 1024 * 1024;

/// Cap on the number of iterations for loops that process items (100,000).
pub const MAX_ITERATION_COUNT: usize = 100_000;

/// Cap on nesting depth for recursive parsing functions (50 levels).
pub const MAX_RECURSION_DEPTH: usize = 50;
29
30/// A reusable guard that tracks recursion depth and detects cycles.
31///
32/// Use this in any recursive parser function to enforce the ADR 0004
33/// recursion depth limit (50 levels) and optionally detect circular
34/// references via a visited set keyed by `K`.
35///
36/// For depth-only tracking (no cycle detection), use `RecursionGuard<()>`
37/// — the unit type implements `Hash + Eq` as a singleton, so the visited
38/// set never grows and `enter`/`leave` are cheap no-ops.
39///
40/// # Type Parameters
41///
42/// * `K` — The key type for cycle detection (e.g., `usize` for package
43///   indices, `String` for dependency names, `PathBuf` for file paths,
44///   or `()` for depth-only tracking).
45///
46/// # Examples
47///
48/// ```no_run
49/// use provenant::parsers::utils::RecursionGuard;
50///
51/// fn walk_tree(idx: usize, guard: &mut RecursionGuard<usize>) {
52///     if guard.exceeded() { return; }
53///     if guard.enter(idx) { return; } // cycle detected
54///     // ... recurse into children ...
55///     walk_tree(idx + 1, guard);
56///     guard.leave(idx);
57/// }
58/// ```
59pub struct RecursionGuard<K: Hash + Eq> {
60    depth: usize,
61    visited: HashSet<K>,
62}
63
64impl<K: Hash + Eq> RecursionGuard<K> {
65    pub fn new() -> Self {
66        Self {
67            depth: 0,
68            visited: HashSet::new(),
69        }
70    }
71
72    pub fn exceeded(&self) -> bool {
73        self.depth > MAX_RECURSION_DEPTH
74    }
75
76    pub fn depth(&self) -> usize {
77        self.depth
78    }
79
80    pub fn enter(&mut self, key: K) -> bool {
81        if self.visited.contains(&key) {
82            return true;
83        }
84        self.visited.insert(key);
85        self.depth += 1;
86        false
87    }
88
89    pub fn leave(&mut self, key: K) {
90        self.visited.remove(&key);
91        self.depth -= 1;
92    }
93}
94
95impl RecursionGuard<()> {
96    pub fn depth_only() -> Self {
97        Self::new()
98    }
99
100    pub fn descend(&mut self) -> bool {
101        self.depth += 1;
102        self.exceeded()
103    }
104
105    pub fn ascend(&mut self) {
106        self.depth -= 1;
107    }
108}
109
110impl<K: Hash + Eq> Default for RecursionGuard<K> {
111    fn default() -> Self {
112        Self::new()
113    }
114}
115
116/// Truncates a string field value to [`MAX_FIELD_LENGTH`] bytes if it exceeds
117/// the limit, returning the truncated string. Returns the original string if
118/// within limits.
119pub fn truncate_field(value: String) -> String {
120    if value.len() <= MAX_FIELD_LENGTH {
121        return value;
122    }
123    let truncated = &value[..value.floor_char_boundary(MAX_FIELD_LENGTH)];
124    crate::parser_warn!(
125        "Truncated field value from {} bytes to {} bytes (MAX_FIELD_LENGTH)",
126        value.len(),
127        truncated.len()
128    );
129    truncated.to_string()
130}
131
132/// Reads a file's entire contents into a String with ADR 0004 security checks.
133///
134/// Performs the following validations before reading:
135/// 1. **File existence**: checks `fs::metadata()` before opening
136/// 2. **File size**: rejects files exceeding `max_size` (default 100 MB)
137/// 3. **UTF-8 encoding**: on UTF-8 failure, falls back to lossy conversion with a warning
138///
139/// # Arguments
140///
141/// * `path` - Path to the file to read
142/// * `max_size` - Maximum allowed file size in bytes (defaults to [`MAX_MANIFEST_SIZE`])
143///
144/// # Returns
145///
146/// * `Ok(String)` - File contents as UTF-8 string (lossy if non-UTF-8 bytes found)
147/// * `Err` - File doesn't exist, is too large, or cannot be read
148///
149/// # Examples
150///
151/// ```no_run
152/// use std::path::Path;
153/// use provenant::parsers::utils::read_file_to_string;
154///
155/// let content = read_file_to_string(Path::new("path/to/file.txt"), None)?;
156/// # Ok::<(), anyhow::Error>(())
157/// ```
158pub fn read_file_to_string(path: &Path, max_size: Option<u64>) -> Result<String> {
159    let limit = max_size.unwrap_or(MAX_MANIFEST_SIZE);
160
161    let metadata =
162        fs::metadata(path).map_err(|e| anyhow::anyhow!("Cannot stat file {:?}: {}", path, e))?;
163
164    if metadata.len() > limit {
165        anyhow::bail!(
166            "File {:?} is {} bytes, exceeding the {} byte limit",
167            path,
168            metadata.len(),
169            limit
170        );
171    }
172
173    let mut bytes = Vec::with_capacity(metadata.len() as usize);
174    let mut file = File::open(path)?;
175    file.read_to_end(&mut bytes)?;
176
177    match String::from_utf8(bytes) {
178        Ok(s) => Ok(s),
179        Err(err) => {
180            let bytes = err.into_bytes();
181            crate::parser_warn!(
182                "File {:?} contains invalid UTF-8; using lossy conversion",
183                path
184            );
185            Ok(String::from_utf8_lossy(&bytes).into_owned())
186        }
187    }
188}
189
190/// Creates a correctly-formatted npm Package URL for scoped or regular packages.
191///
192/// Handles namespace encoding for scoped packages (e.g., `@babel/core`) and ensures
193/// the slash between namespace and package name is NOT encoded as `%2F`.
194pub fn npm_purl(full_name: &str, version: Option<&str>) -> Option<String> {
195    let (namespace, name) = if full_name.starts_with('@') {
196        let parts: Vec<&str> = full_name.splitn(2, '/').collect();
197        if parts.len() == 2 {
198            (Some(parts[0]), parts[1])
199        } else {
200            (None, full_name)
201        }
202    } else {
203        (None, full_name)
204    };
205
206    let mut purl = PackageUrl::new("npm", name).ok()?;
207
208    if let Some(ns) = namespace {
209        purl.with_namespace(ns).ok()?;
210    }
211
212    if let Some(ver) = version {
213        purl.with_version(ver).ok()?;
214    }
215
216    Some(purl.to_string())
217}
218
219/// Parses Subresource Integrity (SRI) format and returns hash as hex string.
220///
221/// SRI format: "algorithm-base64string" (e.g., "sha512-9NET910DNaIPng...")
222///
223/// Returns the algorithm name and hex-encoded hash digest.
224pub fn parse_sri(integrity: &str) -> Option<(String, String)> {
225    let parts: Vec<&str> = integrity.splitn(2, '-').collect();
226    if parts.len() != 2 {
227        return None;
228    }
229
230    let algorithm = parts[0];
231    let base64_str = parts[1];
232
233    let bytes = BASE64_STANDARD.decode(base64_str).ok()?;
234
235    let hex_string = bytes
236        .iter()
237        .map(|b| format!("{:02x}", b))
238        .collect::<String>();
239
240    Some((algorithm.to_string(), hex_string))
241}
242
/// Parses "Name <email@domain.com>" format into separate components.
///
/// This utility handles common author/maintainer strings found in package manifests
/// where the format combines a human-readable name with an email address in angle brackets.
///
/// The email is the text between the first `<` and the first `>` that follows
/// it, so a `>` appearing earlier in the name does not hide the email.
///
/// # Arguments
///
/// * `s` - A string potentially containing name and email in "Name \<email\>" format
///
/// # Returns
///
/// A tuple of `(Option<String>, Option<String>)` representing `(name, email)`:
/// - If `\<email\>` pattern found: name (trimmed, or None if empty) and email
///   (exactly as written between the brackets, not trimmed)
/// - If no pattern: trimmed input as name, None for email
///
/// # Examples
///
/// ```
/// use provenant::parsers::utils::split_name_email;
///
/// // Full format
/// let (name, email) = split_name_email("John Doe <john@example.com>");
/// assert_eq!(name, Some("John Doe".to_string()));
/// assert_eq!(email, Some("john@example.com".to_string()));
///
/// // Email only in angle brackets
/// let (name, email) = split_name_email("<john@example.com>");
/// assert_eq!(name, None);
/// assert_eq!(email, Some("john@example.com".to_string()));
///
/// // Name only (no angle brackets)
/// let (name, email) = split_name_email("John Doe");
/// assert_eq!(name, Some("John Doe".to_string()));
/// assert_eq!(email, None);
///
/// // A stray `>` before the email does not prevent the match
/// let (name, email) = split_name_email("A > B <ab@example.com>");
/// assert_eq!(name, Some("A > B".to_string()));
/// assert_eq!(email, Some("ab@example.com".to_string()));
/// ```
pub fn split_name_email(s: &str) -> (Option<String>, Option<String>) {
    if let Some((before, after_open)) = s.split_once('<') {
        // Only a `>` located after the `<` can close the email.
        if let Some((email, _)) = after_open.split_once('>') {
            let name = before.trim();
            return (
                (!name.is_empty()).then(|| name.to_string()),
                Some(email.to_string()),
            );
        }
    }
    (Some(s.trim().to_string()), None)
}
297
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;
    use tempfile::{TempDir, tempdir};

    /// Creates `name` with `contents` inside a fresh temporary directory.
    ///
    /// The `TempDir` is returned with the path so the directory stays alive
    /// for as long as the caller holds on to it.
    fn temp_file(name: &str, contents: &[u8]) -> (TempDir, PathBuf) {
        let dir = tempdir().unwrap();
        let path = dir.path().join(name);
        fs::write(&path, contents).unwrap();
        (dir, path)
    }

    /// Shorthand for the owned `Some(String)` values used in expectations.
    fn some(text: &str) -> Option<String> {
        Some(text.to_string())
    }

    // --- read_file_to_string ---

    #[test]
    fn test_read_file_to_string_success() {
        let (_dir, path) = temp_file("test.txt", b"test content");
        assert_eq!(read_file_to_string(&path, None).unwrap(), "test content");
    }

    #[test]
    fn test_read_file_to_string_nonexistent() {
        let outcome = read_file_to_string(Path::new("/nonexistent/file.txt"), None);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_read_file_to_string_empty() {
        let (_dir, path) = temp_file("empty.txt", b"");
        assert_eq!(read_file_to_string(&path, None).unwrap(), "");
    }

    // --- npm_purl ---

    #[test]
    fn test_npm_purl_scoped_with_version() {
        assert_eq!(
            npm_purl("@babel/core", Some("7.0.0")).as_deref(),
            Some("pkg:npm/%40babel/core@7.0.0")
        );
    }

    #[test]
    fn test_npm_purl_scoped_without_version() {
        assert_eq!(
            npm_purl("@babel/core", None).as_deref(),
            Some("pkg:npm/%40babel/core")
        );
    }

    #[test]
    fn test_npm_purl_unscoped_with_version() {
        assert_eq!(
            npm_purl("lodash", Some("4.17.21")).as_deref(),
            Some("pkg:npm/lodash@4.17.21")
        );
    }

    #[test]
    fn test_npm_purl_unscoped_without_version() {
        assert_eq!(npm_purl("lodash", None).as_deref(), Some("pkg:npm/lodash"));
    }

    #[test]
    fn test_npm_purl_scoped_slash_not_encoded() {
        let rendered = npm_purl("@types/node", Some("18.0.0")).unwrap();
        assert!(rendered.contains("/%40types/node"));
        assert!(!rendered.contains("%2F"));
    }

    // --- parse_sri ---

    #[test]
    fn test_parse_sri_sha512() {
        let parsed = parse_sri("sha512-9NET910DNaIPngYnLLPeg+Ogzqsi9uM4mSboU5y6p8S5DzMTVEsJZrawi+BoDNUVBa2DhJqQYUFvMDfgU062LQ==").unwrap();
        assert_eq!(parsed.0, "sha512");
        assert_eq!(parsed.1.len(), 128);
    }

    #[test]
    fn test_parse_sri_sha1() {
        let parsed = parse_sri("sha1-w7M6te42DYbg5ijwRorn7yfWVN8=").unwrap();
        assert_eq!(parsed.0, "sha1");
        assert_eq!(parsed.1.len(), 40);
    }

    #[test]
    fn test_parse_sri_sha256() {
        let parsed = parse_sri("sha256-47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=").unwrap();
        assert_eq!(parsed.0, "sha256");
        assert_eq!(parsed.1.len(), 64);
    }

    #[test]
    fn test_parse_sri_invalid_format() {
        for malformed in ["invalid", "sha512", ""] {
            assert!(parse_sri(malformed).is_none());
        }
    }

    #[test]
    fn test_parse_sri_invalid_base64() {
        assert!(parse_sri("sha512-!!!invalid!!!").is_none());
    }

    // --- split_name_email ---

    #[test]
    fn test_split_name_email_full_format() {
        assert_eq!(
            split_name_email("John Doe <john@example.com>"),
            (some("John Doe"), some("john@example.com"))
        );
    }

    #[test]
    fn test_split_name_email_name_only() {
        assert_eq!(split_name_email("John Doe"), (some("John Doe"), None));
    }

    #[test]
    fn test_split_name_email_email_only_plain() {
        assert_eq!(
            split_name_email("john@example.com"),
            (some("john@example.com"), None)
        );
    }

    #[test]
    fn test_split_name_email_email_only_brackets() {
        assert_eq!(
            split_name_email("<john@example.com>"),
            (None, some("john@example.com"))
        );
    }

    #[test]
    fn test_split_name_email_whitespace_trimming() {
        // Only the name is trimmed; the bracketed email is kept verbatim.
        assert_eq!(
            split_name_email("  John Doe  <  john@example.com  >  "),
            (some("John Doe"), some("  john@example.com  "))
        );
    }

    #[test]
    fn test_split_name_email_empty_string() {
        assert_eq!(split_name_email(""), (some(""), None));
    }

    #[test]
    fn test_split_name_email_whitespace_only() {
        assert_eq!(split_name_email("   "), (some(""), None));
    }

    #[test]
    fn test_split_name_email_invalid_bracket_order() {
        assert_eq!(
            split_name_email("John >email< Doe"),
            (some("John >email< Doe"), None)
        );
    }

    #[test]
    fn test_split_name_email_missing_close_bracket() {
        assert_eq!(
            split_name_email("John Doe <email@example.com"),
            (some("John Doe <email@example.com"), None)
        );
    }

    #[test]
    fn test_split_name_email_missing_open_bracket() {
        assert_eq!(
            split_name_email("John Doe email@example.com>"),
            (some("John Doe email@example.com>"), None)
        );
    }

    // --- read_file_to_string limits and encoding ---

    #[test]
    fn test_read_file_to_string_oversized() {
        let (_dir, path) = temp_file("big.txt", b"x");
        assert!(read_file_to_string(&path, Some(0)).is_err());
    }

    #[test]
    fn test_read_file_to_string_lossy_utf8() {
        let (_dir, path) = temp_file("bad_utf8.txt", b"hello\xffworld");
        let text = read_file_to_string(&path, None).unwrap();
        assert!(text.contains("hello"));
        assert!(text.contains("world"));
    }

    // --- truncate_field ---

    #[test]
    fn test_truncate_field_within_limit() {
        assert_eq!(truncate_field("short value".to_string()), "short value");
    }

    #[test]
    fn test_truncate_field_exceeds_limit() {
        let oversized = "x".repeat(MAX_FIELD_LENGTH + 100);
        assert!(truncate_field(oversized).len() <= MAX_FIELD_LENGTH);
    }
}