Skip to main content

provenant/parsers/
utils.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
//! Shared utility functions for package parsers.
//!
//! This module provides common file I/O and parsing utilities
//! used across multiple parser implementations.
8use std::collections::HashSet;
9use std::fs::{self, File};
10use std::hash::Hash;
11use std::io::Read;
12use std::path::Path;
13
14use anyhow::Result;
15use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64_STANDARD};
16use packageurl::PackageUrl;
17
/// Default maximum file size for non-archive manifest files (100 MB).
///
/// [`read_file_to_string`] falls back to this limit when its `max_size`
/// argument is `None`.
pub const MAX_MANIFEST_SIZE: u64 = 100 * 1024 * 1024;

/// Default maximum length, in bytes, for individual string field values (10 MB).
///
/// [`truncate_field`] shortens any longer value to at most this many bytes.
pub const MAX_FIELD_LENGTH: usize = 10 * 1024 * 1024;

/// Default maximum iteration count for loops processing items (100,000).
pub const MAX_ITERATION_COUNT: usize = 100_000;

/// Default maximum recursion depth for recursive parsing functions (50 levels).
///
/// [`RecursionGuard::exceeded`] reports `true` once the tracked depth is
/// strictly greater than this value.
pub const MAX_RECURSION_DEPTH: usize = 50;
29
30/// A reusable guard that tracks recursion depth and detects cycles.
31///
32/// Use this in any recursive parser function to enforce the ADR 0004
33/// recursion depth limit (50 levels) and optionally detect circular
34/// references via a visited set keyed by `K`.
35///
36/// For depth-only tracking (no cycle detection), use `RecursionGuard<()>`
37/// — the unit type implements `Hash + Eq` as a singleton, so the visited
38/// set never grows and `enter`/`leave` are cheap no-ops.
39///
40/// # Type Parameters
41///
42/// * `K` — The key type for cycle detection (e.g., `usize` for package
43///   indices, `String` for dependency names, `PathBuf` for file paths,
44///   or `()` for depth-only tracking).
45///
46/// # Example
47///
48/// ```no_run
49/// use provenant::parsers::utils::RecursionGuard;
50///
51/// fn walk_tree(idx: usize, guard: &mut RecursionGuard<usize>) {
52///     if guard.exceeded() {
53///         return;
54///     }
55///     if guard.enter(idx) {
56///         return;
57///     }
58///     walk_tree(idx + 1, guard);
59///     guard.leave(idx);
60/// }
61/// ```
62pub struct RecursionGuard<K: Hash + Eq> {
63    depth: usize,
64    visited: HashSet<K>,
65}
66
67impl<K: Hash + Eq> RecursionGuard<K> {
68    pub fn new() -> Self {
69        Self {
70            depth: 0,
71            visited: HashSet::new(),
72        }
73    }
74
75    pub fn exceeded(&self) -> bool {
76        self.depth > MAX_RECURSION_DEPTH
77    }
78
79    pub fn depth(&self) -> usize {
80        self.depth
81    }
82
83    pub fn enter(&mut self, key: K) -> bool {
84        if self.visited.contains(&key) {
85            return true;
86        }
87        self.visited.insert(key);
88        self.depth += 1;
89        false
90    }
91
92    pub fn leave(&mut self, key: K) {
93        self.visited.remove(&key);
94        self.depth -= 1;
95    }
96}
97
98impl RecursionGuard<()> {
99    pub fn depth_only() -> Self {
100        Self::new()
101    }
102
103    pub fn descend(&mut self) -> bool {
104        self.depth += 1;
105        self.exceeded()
106    }
107
108    pub fn ascend(&mut self) {
109        self.depth -= 1;
110    }
111}
112
113impl<K: Hash + Eq> Default for RecursionGuard<K> {
114    fn default() -> Self {
115        Self::new()
116    }
117}
118
119/// Truncates a string field value to [`MAX_FIELD_LENGTH`] bytes if it exceeds
120/// the limit, returning the truncated string. Returns the original string if
121/// within limits.
122pub fn truncate_field(value: String) -> String {
123    if value.len() <= MAX_FIELD_LENGTH {
124        return value;
125    }
126    let truncated = &value[..value.floor_char_boundary(MAX_FIELD_LENGTH)];
127    crate::parser_warn!(
128        "Truncated field value from {} bytes to {} bytes (MAX_FIELD_LENGTH)",
129        value.len(),
130        truncated.len()
131    );
132    truncated.to_string()
133}
134
135/// Reads a file's entire contents into a String with ADR 0004 security checks.
136///
137/// Performs the following validations before reading:
138/// 1. **File existence**: checks `fs::metadata()` before opening
139/// 2. **File size**: rejects files exceeding `max_size` (default 100 MB)
140/// 3. **UTF-8 encoding**: on UTF-8 failure, falls back to lossy conversion with a warning
141///
142/// # Arguments
143///
144/// * `path` - Path to the file to read
145/// * `max_size` - Maximum allowed file size in bytes (defaults to [`MAX_MANIFEST_SIZE`])
146///
147/// # Returns
148///
149/// * `Ok(String)` - File contents as UTF-8 string (lossy if non-UTF-8 bytes found)
150/// * `Err` - File doesn't exist, is too large, or cannot be read
151///
152/// Typical usage is `read_file_to_string(path, None)` for the default size
153/// limit, or `read_file_to_string(path, Some(limit))` when a tighter bound is
154/// needed.
155pub fn read_file_to_string(path: &Path, max_size: Option<u64>) -> Result<String> {
156    let limit = max_size.unwrap_or(MAX_MANIFEST_SIZE);
157
158    let metadata =
159        fs::metadata(path).map_err(|e| anyhow::anyhow!("Cannot stat file {:?}: {}", path, e))?;
160
161    if metadata.len() > limit {
162        anyhow::bail!(
163            "File {:?} is {} bytes, exceeding the {} byte limit",
164            path,
165            metadata.len(),
166            limit
167        );
168    }
169
170    let mut bytes = Vec::with_capacity(metadata.len() as usize);
171    let mut file = File::open(path)?;
172    file.read_to_end(&mut bytes)?;
173
174    match String::from_utf8(bytes) {
175        Ok(s) => Ok(s),
176        Err(err) => {
177            let bytes = err.into_bytes();
178            crate::parser_warn!(
179                "File {:?} contains invalid UTF-8; using lossy conversion",
180                path
181            );
182            Ok(String::from_utf8_lossy(&bytes).into_owned())
183        }
184    }
185}
186
187/// Creates a correctly-formatted npm Package URL for scoped or regular packages.
188///
189/// Handles namespace encoding for scoped packages (e.g., `@babel/core`) and ensures
190/// the slash between namespace and package name is NOT encoded as `%2F`.
191pub fn npm_purl(full_name: &str, version: Option<&str>) -> Option<String> {
192    let (namespace, name) = if full_name.starts_with('@') {
193        let parts: Vec<&str> = full_name.splitn(2, '/').collect();
194        if parts.len() == 2 {
195            (Some(parts[0]), parts[1])
196        } else {
197            (None, full_name)
198        }
199    } else {
200        (None, full_name)
201    };
202
203    let mut purl = PackageUrl::new("npm", name).ok()?;
204
205    if let Some(ns) = namespace {
206        purl.with_namespace(ns).ok()?;
207    }
208
209    if let Some(ver) = version {
210        purl.with_version(ver).ok()?;
211    }
212
213    Some(purl.to_string())
214}
215
216/// Parses Subresource Integrity (SRI) format and returns hash as hex string.
217///
218/// SRI format: "algorithm-base64string" (e.g., "sha512-9NET910DNaIPng...")
219///
220/// Returns the algorithm name and hex-encoded hash digest.
221pub fn parse_sri(integrity: &str) -> Option<(String, String)> {
222    let parts: Vec<&str> = integrity.splitn(2, '-').collect();
223    if parts.len() != 2 {
224        return None;
225    }
226
227    let algorithm = parts[0];
228    let base64_str = parts[1];
229
230    let bytes = BASE64_STANDARD.decode(base64_str).ok()?;
231
232    let hex_string = bytes
233        .iter()
234        .map(|b| format!("{:02x}", b))
235        .collect::<String>();
236
237    Some((algorithm.to_string(), hex_string))
238}
239
/// Parses "Name <email@domain.com>" format into separate components.
///
/// This utility handles common author/maintainer strings found in package manifests
/// where the format combines a human-readable name with an email address in angle brackets.
///
/// The email is the text between the first `<` and the first `>` that
/// follows it. A stray `>` earlier in the string (for example inside the
/// name) therefore does not prevent the email from being recognised.
///
/// # Arguments
///
/// * `s` - A string potentially containing name and email in "Name \<email\>" format
///
/// # Returns
///
/// A tuple of `(Option<String>, Option<String>)` representing `(name, email)`:
/// - If `\<email\>` pattern found: name (trimmed, or None if empty) and email
///   (returned exactly as written between the brackets, without trimming)
/// - If no pattern: trimmed input as name (possibly an empty string), None
///   for email
///
/// Examples: `John Doe <john@example.com>` becomes `(Some("John Doe"),
/// Some("john@example.com"))`, `<john@example.com>` becomes `(None,
/// Some("john@example.com"))`, and `John Doe` becomes
/// `(Some("John Doe"), None)`.
pub fn split_name_email(s: &str) -> (Option<String>, Option<String>) {
    // Look for the closing bracket only AFTER the opening one. `<` is a
    // single ASCII byte, so `open + 1` is always a valid slice boundary.
    let brackets = s.find('<').and_then(|open| {
        s[open + 1..]
            .find('>')
            .map(|offset| (open, open + 1 + offset))
    });

    match brackets {
        Some((open, close)) => {
            let name = s[..open].trim();
            let email = &s[open + 1..close];
            let name = if name.is_empty() {
                None
            } else {
                Some(name.to_string())
            };
            (name, Some(email.to_string()))
        }
        None => (Some(s.trim().to_string()), None),
    }
}
278
// Unit tests for the helpers in this module: `RecursionGuard`, the file
// reader, npm purl construction, SRI parsing, name/email splitting and field
// truncation.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::tempdir;

    // Keyed mode: depth follows enter/leave, and re-entering a key that is
    // still on the stack is reported as a cycle without changing the depth.
    #[test]
    fn test_recursion_guard_tracks_depth_and_cycles() {
        let mut guard = RecursionGuard::new();

        assert_eq!(guard.depth(), 0);
        assert!(!guard.exceeded());

        assert!(!guard.enter("root"));
        assert_eq!(guard.depth(), 1);
        assert!(!guard.enter("child"));
        assert_eq!(guard.depth(), 2);

        // "root" is still on the stack, so this is a cycle.
        assert!(guard.enter("root"));
        assert_eq!(guard.depth(), 2);

        guard.leave("child");
        assert_eq!(guard.depth(), 1);
        guard.leave("root");
        assert_eq!(guard.depth(), 0);
        assert!(!guard.exceeded());
    }

    // Depth-only mode: exactly MAX_RECURSION_DEPTH levels are allowed; the
    // limit counts as exceeded only from MAX_RECURSION_DEPTH + 1 onwards.
    #[test]
    fn test_recursion_guard_depth_limit_and_depth_only_mode() {
        let mut guard = RecursionGuard::<()>::depth_only();

        for _ in 0..MAX_RECURSION_DEPTH {
            assert!(!guard.descend());
        }

        assert_eq!(guard.depth(), MAX_RECURSION_DEPTH);
        assert!(!guard.exceeded());

        assert!(guard.descend());
        assert_eq!(guard.depth(), MAX_RECURSION_DEPTH + 1);
        assert!(guard.exceeded());

        guard.ascend();
        assert_eq!(guard.depth(), MAX_RECURSION_DEPTH);
        assert!(!guard.exceeded());
    }

    // A small UTF-8 file is returned verbatim under the default size limit.
    #[test]
    fn test_read_file_to_string_success() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("test.txt");
        let mut file = File::create(&file_path).unwrap();
        file.write_all(b"test content").unwrap();

        let content = read_file_to_string(&file_path, None).unwrap();
        assert_eq!(content, "test content");
    }

    // A missing file is an error, not a panic.
    #[test]
    fn test_read_file_to_string_nonexistent() {
        let path = Path::new("/nonexistent/file.txt");
        let result = read_file_to_string(path, None);
        assert!(result.is_err());
    }

    // An empty file yields an empty string.
    #[test]
    fn test_read_file_to_string_empty() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("empty.txt");
        File::create(&file_path).unwrap();

        let content = read_file_to_string(&file_path, None).unwrap();
        assert_eq!(content, "");
    }

    // In the expected strings below the scope's leading '@' appears
    // percent-encoded as "%40".
    #[test]
    fn test_npm_purl_scoped_with_version() {
        let purl = npm_purl("@babel/core", Some("7.0.0")).unwrap();
        assert_eq!(purl, "pkg:npm/%40babel/core@7.0.0");
    }

    #[test]
    fn test_npm_purl_scoped_without_version() {
        let purl = npm_purl("@babel/core", None).unwrap();
        assert_eq!(purl, "pkg:npm/%40babel/core");
    }

    #[test]
    fn test_npm_purl_unscoped_with_version() {
        let purl = npm_purl("lodash", Some("4.17.21")).unwrap();
        assert_eq!(purl, "pkg:npm/lodash@4.17.21");
    }

    #[test]
    fn test_npm_purl_unscoped_without_version() {
        let purl = npm_purl("lodash", None).unwrap();
        assert_eq!(purl, "pkg:npm/lodash");
    }

    // The slash between scope and package name must stay a literal '/'.
    #[test]
    fn test_npm_purl_scoped_slash_not_encoded() {
        let purl = npm_purl("@types/node", Some("18.0.0")).unwrap();
        assert!(purl.contains("/%40types/node"));
        assert!(!purl.contains("%2F"));
    }

    // The SRI tests check the hex digest length: two hex characters per
    // digest byte (64 bytes for sha512, 20 for sha1, 32 for sha256).
    #[test]
    fn test_parse_sri_sha512() {
        let (algo, hash) = parse_sri("sha512-9NET910DNaIPngYnLLPeg+Ogzqsi9uM4mSboU5y6p8S5DzMTVEsJZrawi+BoDNUVBa2DhJqQYUFvMDfgU062LQ==").unwrap();
        assert_eq!(algo, "sha512");
        assert_eq!(hash.len(), 128);
    }

    #[test]
    fn test_parse_sri_sha1() {
        let (algo, hash) = parse_sri("sha1-w7M6te42DYbg5ijwRorn7yfWVN8=").unwrap();
        assert_eq!(algo, "sha1");
        assert_eq!(hash.len(), 40);
    }

    #[test]
    fn test_parse_sri_sha256() {
        let (algo, hash) =
            parse_sri("sha256-47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=").unwrap();
        assert_eq!(algo, "sha256");
        assert_eq!(hash.len(), 64);
    }

    // Inputs without a '-' separator are rejected.
    #[test]
    fn test_parse_sri_invalid_format() {
        assert!(parse_sri("invalid").is_none());
        assert!(parse_sri("sha512").is_none());
        assert!(parse_sri("").is_none());
    }

    // A well-formed prefix with an undecodable payload is rejected.
    #[test]
    fn test_parse_sri_invalid_base64() {
        assert!(parse_sri("sha512-!!!invalid!!!").is_none());
    }

    #[test]
    fn test_split_name_email_full_format() {
        let (name, email) = split_name_email("John Doe <john@example.com>");
        assert_eq!(name, Some("John Doe".to_string()));
        assert_eq!(email, Some("john@example.com".to_string()));
    }

    #[test]
    fn test_split_name_email_name_only() {
        let (name, email) = split_name_email("John Doe");
        assert_eq!(name, Some("John Doe".to_string()));
        assert_eq!(email, None);
    }

    // Without angle brackets the whole input is treated as the name, even
    // when it looks like an email address.
    #[test]
    fn test_split_name_email_email_only_plain() {
        let (name, email) = split_name_email("john@example.com");
        assert_eq!(name, Some("john@example.com".to_string()));
        assert_eq!(email, None);
    }

    // An empty name part is reported as `None`, not as an empty string.
    #[test]
    fn test_split_name_email_email_only_brackets() {
        let (name, email) = split_name_email("<john@example.com>");
        assert_eq!(name, None);
        assert_eq!(email, Some("john@example.com".to_string()));
    }

    // Only the name is trimmed; whitespace inside the brackets is kept as-is.
    #[test]
    fn test_split_name_email_whitespace_trimming() {
        let (name, email) = split_name_email("  John Doe  <  john@example.com  >  ");
        assert_eq!(name, Some("John Doe".to_string()));
        assert_eq!(email, Some("  john@example.com  ".to_string()));
    }

    // With no brackets at all, an empty input yields `Some("")` as the name
    // (unlike the bracketed case, which yields `None` for an empty name).
    #[test]
    fn test_split_name_email_empty_string() {
        let (name, email) = split_name_email("");
        assert_eq!(name, Some("".to_string()));
        assert_eq!(email, None);
    }

    #[test]
    fn test_split_name_email_whitespace_only() {
        let (name, email) = split_name_email("   ");
        assert_eq!(name, Some("".to_string()));
        assert_eq!(email, None);
    }

    // A '>' that comes before the '<' does not form an email section.
    #[test]
    fn test_split_name_email_invalid_bracket_order() {
        let (name, email) = split_name_email("John >email< Doe");
        assert_eq!(name, Some("John >email< Doe".to_string()));
        assert_eq!(email, None);
    }

    #[test]
    fn test_split_name_email_missing_close_bracket() {
        let (name, email) = split_name_email("John Doe <email@example.com");
        assert_eq!(name, Some("John Doe <email@example.com".to_string()));
        assert_eq!(email, None);
    }

    #[test]
    fn test_split_name_email_missing_open_bracket() {
        let (name, email) = split_name_email("John Doe email@example.com>");
        assert_eq!(name, Some("John Doe email@example.com>".to_string()));
        assert_eq!(email, None);
    }

    // A limit of 0 bytes makes even a one-byte file oversized.
    #[test]
    fn test_read_file_to_string_oversized() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("big.txt");
        fs::write(&file_path, "x").unwrap();

        let result = read_file_to_string(&file_path, Some(0));
        assert!(result.is_err());
    }

    // 0xFF is never valid in UTF-8, so this exercises the lossy fallback;
    // the valid text on either side of it must survive.
    #[test]
    fn test_read_file_to_string_lossy_utf8() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("bad_utf8.txt");
        let mut file = File::create(&file_path).unwrap();
        file.write_all(b"hello\xffworld").unwrap();

        let content = read_file_to_string(&file_path, None).unwrap();
        assert!(content.contains("hello"));
        assert!(content.contains("world"));
    }

    #[test]
    fn test_truncate_field_within_limit() {
        let s = "short value".to_string();
        assert_eq!(truncate_field(s.clone()), s);
    }

    // Input is 100 bytes over the limit; the result must fit within it.
    #[test]
    fn test_truncate_field_exceeds_limit() {
        let long = "x".repeat(MAX_FIELD_LENGTH + 100);
        let truncated = truncate_field(long);
        assert!(truncated.len() <= MAX_FIELD_LENGTH);
    }
}