Skip to main content

provenant/parsers/
utils.rs

//! Shared utility functions for package parsers.
//!
//! This module provides common file I/O and parsing utilities
//! used across multiple parser implementations.
5use std::collections::HashSet;
6use std::fs::{self, File};
7use std::hash::Hash;
8use std::io::Read;
9use std::path::Path;
10
11use anyhow::Result;
12use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64_STANDARD};
13use packageurl::PackageUrl;
14
/// Largest non-archive manifest file accepted by default: 100 MiB
/// (104,857,600 bytes).
pub const MAX_MANIFEST_SIZE: u64 = 100 << 20;

/// Largest byte length kept for a single string field value by default:
/// 10 MiB (10,485,760 bytes).
pub const MAX_FIELD_LENGTH: usize = 10 << 20;

/// Default cap on the number of items a single processing loop may visit.
pub const MAX_ITERATION_COUNT: usize = 100_000;

/// Default cap on nesting depth for recursive parsing functions (50 levels).
pub const MAX_RECURSION_DEPTH: usize = 50;
26
27/// A reusable guard that tracks recursion depth and detects cycles.
28///
29/// Use this in any recursive parser function to enforce the ADR 0004
30/// recursion depth limit (50 levels) and optionally detect circular
31/// references via a visited set keyed by `K`.
32///
33/// For depth-only tracking (no cycle detection), use `RecursionGuard<()>`
34/// — the unit type implements `Hash + Eq` as a singleton, so the visited
35/// set never grows and `enter`/`leave` are cheap no-ops.
36///
37/// # Type Parameters
38///
39/// * `K` — The key type for cycle detection (e.g., `usize` for package
40///   indices, `String` for dependency names, `PathBuf` for file paths,
41///   or `()` for depth-only tracking).
42///
43/// # Examples
44///
45/// ```no_run
46/// use provenant::parsers::utils::RecursionGuard;
47///
48/// fn walk_tree(idx: usize, guard: &mut RecursionGuard<usize>) {
49///     if guard.exceeded() { return; }
50///     if guard.enter(idx) { return; } // cycle detected
51///     // ... recurse into children ...
52///     walk_tree(idx + 1, guard);
53///     guard.leave(idx);
54/// }
55/// ```
56pub struct RecursionGuard<K: Hash + Eq> {
57    depth: usize,
58    visited: HashSet<K>,
59}
60
61impl<K: Hash + Eq> RecursionGuard<K> {
62    pub fn new() -> Self {
63        Self {
64            depth: 0,
65            visited: HashSet::new(),
66        }
67    }
68
69    pub fn exceeded(&self) -> bool {
70        self.depth > MAX_RECURSION_DEPTH
71    }
72
73    pub fn depth(&self) -> usize {
74        self.depth
75    }
76
77    pub fn enter(&mut self, key: K) -> bool {
78        if self.visited.contains(&key) {
79            return true;
80        }
81        self.visited.insert(key);
82        self.depth += 1;
83        false
84    }
85
86    pub fn leave(&mut self, key: K) {
87        self.visited.remove(&key);
88        self.depth -= 1;
89    }
90}
91
92impl RecursionGuard<()> {
93    pub fn depth_only() -> Self {
94        Self::new()
95    }
96
97    pub fn descend(&mut self) -> bool {
98        self.depth += 1;
99        self.exceeded()
100    }
101
102    pub fn ascend(&mut self) {
103        self.depth -= 1;
104    }
105}
106
107impl<K: Hash + Eq> Default for RecursionGuard<K> {
108    fn default() -> Self {
109        Self::new()
110    }
111}
112
113/// Truncates a string field value to [`MAX_FIELD_LENGTH`] bytes if it exceeds
114/// the limit, returning the truncated string. Returns the original string if
115/// within limits.
116pub fn truncate_field(value: String) -> String {
117    if value.len() <= MAX_FIELD_LENGTH {
118        return value;
119    }
120    let truncated = &value[..value.floor_char_boundary(MAX_FIELD_LENGTH)];
121    crate::parser_warn!(
122        "Truncated field value from {} bytes to {} bytes (MAX_FIELD_LENGTH)",
123        value.len(),
124        truncated.len()
125    );
126    truncated.to_string()
127}
128
129/// Reads a file's entire contents into a String with ADR 0004 security checks.
130///
131/// Performs the following validations before reading:
132/// 1. **File existence**: checks `fs::metadata()` before opening
133/// 2. **File size**: rejects files exceeding `max_size` (default 100 MB)
134/// 3. **UTF-8 encoding**: on UTF-8 failure, falls back to lossy conversion with a warning
135///
136/// # Arguments
137///
138/// * `path` - Path to the file to read
139/// * `max_size` - Maximum allowed file size in bytes (defaults to [`MAX_MANIFEST_SIZE`])
140///
141/// # Returns
142///
143/// * `Ok(String)` - File contents as UTF-8 string (lossy if non-UTF-8 bytes found)
144/// * `Err` - File doesn't exist, is too large, or cannot be read
145///
146/// # Examples
147///
148/// ```no_run
149/// use std::path::Path;
150/// use provenant::parsers::utils::read_file_to_string;
151///
152/// let content = read_file_to_string(Path::new("path/to/file.txt"), None)?;
153/// # Ok::<(), anyhow::Error>(())
154/// ```
155pub fn read_file_to_string(path: &Path, max_size: Option<u64>) -> Result<String> {
156    let limit = max_size.unwrap_or(MAX_MANIFEST_SIZE);
157
158    let metadata =
159        fs::metadata(path).map_err(|e| anyhow::anyhow!("Cannot stat file {:?}: {}", path, e))?;
160
161    if metadata.len() > limit {
162        anyhow::bail!(
163            "File {:?} is {} bytes, exceeding the {} byte limit",
164            path,
165            metadata.len(),
166            limit
167        );
168    }
169
170    let mut bytes = Vec::with_capacity(metadata.len() as usize);
171    let mut file = File::open(path)?;
172    file.read_to_end(&mut bytes)?;
173
174    match String::from_utf8(bytes) {
175        Ok(s) => Ok(s),
176        Err(err) => {
177            let bytes = err.into_bytes();
178            crate::parser_warn!(
179                "File {:?} contains invalid UTF-8; using lossy conversion",
180                path
181            );
182            Ok(String::from_utf8_lossy(&bytes).into_owned())
183        }
184    }
185}
186
187/// Creates a correctly-formatted npm Package URL for scoped or regular packages.
188///
189/// Handles namespace encoding for scoped packages (e.g., `@babel/core`) and ensures
190/// the slash between namespace and package name is NOT encoded as `%2F`.
191pub fn npm_purl(full_name: &str, version: Option<&str>) -> Option<String> {
192    let (namespace, name) = if full_name.starts_with('@') {
193        let parts: Vec<&str> = full_name.splitn(2, '/').collect();
194        if parts.len() == 2 {
195            (Some(parts[0]), parts[1])
196        } else {
197            (None, full_name)
198        }
199    } else {
200        (None, full_name)
201    };
202
203    let mut purl = PackageUrl::new("npm", name).ok()?;
204
205    if let Some(ns) = namespace {
206        purl.with_namespace(ns).ok()?;
207    }
208
209    if let Some(ver) = version {
210        purl.with_version(ver).ok()?;
211    }
212
213    Some(purl.to_string())
214}
215
216/// Parses Subresource Integrity (SRI) format and returns hash as hex string.
217///
218/// SRI format: "algorithm-base64string" (e.g., "sha512-9NET910DNaIPng...")
219///
220/// Returns the algorithm name and hex-encoded hash digest.
221pub fn parse_sri(integrity: &str) -> Option<(String, String)> {
222    let parts: Vec<&str> = integrity.splitn(2, '-').collect();
223    if parts.len() != 2 {
224        return None;
225    }
226
227    let algorithm = parts[0];
228    let base64_str = parts[1];
229
230    let bytes = BASE64_STANDARD.decode(base64_str).ok()?;
231
232    let hex_string = bytes
233        .iter()
234        .map(|b| format!("{:02x}", b))
235        .collect::<String>();
236
237    Some((algorithm.to_string(), hex_string))
238}
239
/// Parses "Name <email@domain.com>" format into separate components.
///
/// This utility handles common author/maintainer strings found in package manifests
/// where the format combines a human-readable name with an email address in angle brackets.
///
/// The email is whatever sits between the first `<` and the first `>` that
/// follows it, so a stray `>` earlier in the string does not prevent the
/// email from being recognized. The email text is returned verbatim (not
/// trimmed).
///
/// # Arguments
///
/// * `s` - A string potentially containing name and email in "Name \<email\>" format
///
/// # Returns
///
/// A tuple of `(Option<String>, Option<String>)` representing `(name, email)`:
/// - If `\<email\>` pattern found: name (trimmed, or None if empty) and email
/// - If no pattern: trimmed input as name, None for email
///
/// # Examples
///
/// ```
/// use provenant::parsers::utils::split_name_email;
///
/// // Full format
/// let (name, email) = split_name_email("John Doe <john@example.com>");
/// assert_eq!(name, Some("John Doe".to_string()));
/// assert_eq!(email, Some("john@example.com".to_string()));
///
/// // Email only in angle brackets
/// let (name, email) = split_name_email("<john@example.com>");
/// assert_eq!(name, None);
/// assert_eq!(email, Some("john@example.com".to_string()));
///
/// // Name only (no angle brackets)
/// let (name, email) = split_name_email("John Doe");
/// assert_eq!(name, Some("John Doe".to_string()));
/// assert_eq!(email, None);
///
/// // A stray `>` before the brackets is treated as part of the name
/// let (name, email) = split_name_email("Foo > Bar <foo@bar.com>");
/// assert_eq!(name, Some("Foo > Bar".to_string()));
/// assert_eq!(email, Some("foo@bar.com".to_string()));
/// ```
pub fn split_name_email(s: &str) -> (Option<String>, Option<String>) {
    // Look for the closing bracket only after the opening one, so that an
    // unrelated `>` earlier in the string cannot hide a valid `<email>`.
    let bracketed = s.split_once('<').and_then(|(before, after)| {
        after
            .split_once('>')
            .map(|(email, _rest)| (before.trim(), email))
    });

    match bracketed {
        Some((name, email)) => (
            (!name.is_empty()).then(|| name.to_string()),
            Some(email.to_string()),
        ),
        None => (Some(s.trim().to_string()), None),
    }
}
294
#[cfg(test)]
mod tests {
    //! Unit tests for the shared parser utilities: the recursion guard,
    //! field truncation, size-limited file reading, npm purl construction,
    //! SRI parsing, and name/email splitting.

    use super::*;
    use tempfile::tempdir;

    // --- RecursionGuard -------------------------------------------------

    #[test]
    fn test_recursion_guard_starts_at_zero() {
        let guard: RecursionGuard<usize> = RecursionGuard::default();
        assert_eq!(guard.depth(), 0);
        assert!(!guard.exceeded());
    }

    #[test]
    fn test_recursion_guard_detects_cycle() {
        let mut guard = RecursionGuard::<usize>::new();
        assert!(!guard.enter(1));
        assert!(!guard.enter(2));
        assert_eq!(guard.depth(), 2);

        // Re-entering a key that is still on the path is a cycle and must
        // not change the depth.
        assert!(guard.enter(1));
        assert_eq!(guard.depth(), 2);

        guard.leave(2);
        guard.leave(1);
        assert_eq!(guard.depth(), 0);

        // Once left, the key can be entered again.
        assert!(!guard.enter(1));
    }

    #[test]
    fn test_recursion_guard_depth_only_limit() {
        let mut guard = RecursionGuard::depth_only();
        for _ in 0..MAX_RECURSION_DEPTH {
            assert!(!guard.descend());
        }
        // One level past the limit is reported as exceeded.
        assert!(guard.descend());

        guard.ascend();
        assert!(!guard.exceeded());
        assert_eq!(guard.depth(), MAX_RECURSION_DEPTH);
    }

    // --- read_file_to_string --------------------------------------------

    #[test]
    fn test_read_file_to_string_success() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("test.txt");
        fs::write(&file_path, b"test content").unwrap();

        let content = read_file_to_string(&file_path, None).unwrap();
        assert_eq!(content, "test content");
    }

    #[test]
    fn test_read_file_to_string_nonexistent() {
        let path = Path::new("/nonexistent/file.txt");
        assert!(read_file_to_string(path, None).is_err());
    }

    #[test]
    fn test_read_file_to_string_empty() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("empty.txt");
        File::create(&file_path).unwrap();

        let content = read_file_to_string(&file_path, None).unwrap();
        assert_eq!(content, "");
    }

    #[test]
    fn test_read_file_to_string_oversized() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("big.txt");
        fs::write(&file_path, "x").unwrap();

        // A one-byte file is over a zero-byte limit.
        assert!(read_file_to_string(&file_path, Some(0)).is_err());
    }

    #[test]
    fn test_read_file_to_string_lossy_utf8() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("bad_utf8.txt");
        fs::write(&file_path, b"hello\xffworld").unwrap();

        let content = read_file_to_string(&file_path, None).unwrap();
        assert!(content.contains("hello"));
        assert!(content.contains("world"));
    }

    // --- truncate_field -------------------------------------------------

    #[test]
    fn test_truncate_field_within_limit() {
        let s = "short value".to_string();
        assert_eq!(truncate_field(s.clone()), s);
    }

    #[test]
    fn test_truncate_field_exceeds_limit() {
        // Single-byte characters: the cut lands exactly on the limit.
        let long = "x".repeat(MAX_FIELD_LENGTH + 100);
        let truncated = truncate_field(long);
        assert_eq!(truncated.len(), MAX_FIELD_LENGTH);
    }

    #[test]
    fn test_truncate_field_respects_char_boundary() {
        // One ASCII byte followed by two-byte characters puts every character
        // start on an odd offset, so the (even) limit falls inside a
        // character and the cut must back up by one byte.
        let long = format!("a{}", "é".repeat(MAX_FIELD_LENGTH / 2 + 1));
        assert!(long.len() > MAX_FIELD_LENGTH);

        let truncated = truncate_field(long);
        assert_eq!(truncated.len(), MAX_FIELD_LENGTH - 1);
        assert!(truncated.ends_with('é'));
    }

    // --- npm_purl -------------------------------------------------------

    #[test]
    fn test_npm_purl_scoped_with_version() {
        let purl = npm_purl("@babel/core", Some("7.0.0")).unwrap();
        assert_eq!(purl, "pkg:npm/%40babel/core@7.0.0");
    }

    #[test]
    fn test_npm_purl_scoped_without_version() {
        let purl = npm_purl("@babel/core", None).unwrap();
        assert_eq!(purl, "pkg:npm/%40babel/core");
    }

    #[test]
    fn test_npm_purl_unscoped_with_version() {
        let purl = npm_purl("lodash", Some("4.17.21")).unwrap();
        assert_eq!(purl, "pkg:npm/lodash@4.17.21");
    }

    #[test]
    fn test_npm_purl_unscoped_without_version() {
        let purl = npm_purl("lodash", None).unwrap();
        assert_eq!(purl, "pkg:npm/lodash");
    }

    #[test]
    fn test_npm_purl_scoped_slash_not_encoded() {
        let purl = npm_purl("@types/node", Some("18.0.0")).unwrap();
        assert!(purl.contains("/%40types/node"));
        assert!(!purl.contains("%2F"));
    }

    // --- parse_sri ------------------------------------------------------

    #[test]
    fn test_parse_sri_sha512() {
        let (algo, hash) = parse_sri("sha512-9NET910DNaIPngYnLLPeg+Ogzqsi9uM4mSboU5y6p8S5DzMTVEsJZrawi+BoDNUVBa2DhJqQYUFvMDfgU062LQ==").unwrap();
        assert_eq!(algo, "sha512");
        assert_eq!(hash.len(), 128);
    }

    #[test]
    fn test_parse_sri_sha1() {
        let (algo, hash) = parse_sri("sha1-w7M6te42DYbg5ijwRorn7yfWVN8=").unwrap();
        assert_eq!(algo, "sha1");
        assert_eq!(hash.len(), 40);
    }

    #[test]
    fn test_parse_sri_sha256() {
        let (algo, hash) =
            parse_sri("sha256-47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=").unwrap();
        assert_eq!(algo, "sha256");
        assert_eq!(hash.len(), 64);
        // This payload is the SHA-256 digest of the empty input, so the exact
        // lowercase hex rendering can be pinned.
        assert_eq!(
            hash,
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
    }

    #[test]
    fn test_parse_sri_invalid_format() {
        assert!(parse_sri("invalid").is_none());
        assert!(parse_sri("sha512").is_none());
        assert!(parse_sri("").is_none());
    }

    #[test]
    fn test_parse_sri_invalid_base64() {
        assert!(parse_sri("sha512-!!!invalid!!!").is_none());
    }

    // --- split_name_email -----------------------------------------------

    #[test]
    fn test_split_name_email_full_format() {
        let (name, email) = split_name_email("John Doe <john@example.com>");
        assert_eq!(name, Some("John Doe".to_string()));
        assert_eq!(email, Some("john@example.com".to_string()));
    }

    #[test]
    fn test_split_name_email_name_only() {
        let (name, email) = split_name_email("John Doe");
        assert_eq!(name, Some("John Doe".to_string()));
        assert_eq!(email, None);
    }

    #[test]
    fn test_split_name_email_email_only_plain() {
        // Without angle brackets the whole input is treated as the name.
        let (name, email) = split_name_email("john@example.com");
        assert_eq!(name, Some("john@example.com".to_string()));
        assert_eq!(email, None);
    }

    #[test]
    fn test_split_name_email_email_only_brackets() {
        let (name, email) = split_name_email("<john@example.com>");
        assert_eq!(name, None);
        assert_eq!(email, Some("john@example.com".to_string()));
    }

    #[test]
    fn test_split_name_email_whitespace_trimming() {
        // Only the name is trimmed; the email is returned verbatim.
        let (name, email) = split_name_email("  John Doe  <  john@example.com  >  ");
        assert_eq!(name, Some("John Doe".to_string()));
        assert_eq!(email, Some("  john@example.com  ".to_string()));
    }

    #[test]
    fn test_split_name_email_empty_string() {
        let (name, email) = split_name_email("");
        assert_eq!(name, Some("".to_string()));
        assert_eq!(email, None);
    }

    #[test]
    fn test_split_name_email_whitespace_only() {
        let (name, email) = split_name_email("   ");
        assert_eq!(name, Some("".to_string()));
        assert_eq!(email, None);
    }

    #[test]
    fn test_split_name_email_invalid_bracket_order() {
        let (name, email) = split_name_email("John >email< Doe");
        assert_eq!(name, Some("John >email< Doe".to_string()));
        assert_eq!(email, None);
    }

    #[test]
    fn test_split_name_email_missing_close_bracket() {
        let (name, email) = split_name_email("John Doe <email@example.com");
        assert_eq!(name, Some("John Doe <email@example.com".to_string()));
        assert_eq!(email, None);
    }

    #[test]
    fn test_split_name_email_missing_open_bracket() {
        let (name, email) = split_name_email("John Doe email@example.com>");
        assert_eq!(name, Some("John Doe email@example.com>".to_string()));
        assert_eq!(email, None);
    }
}
494        let long = "x".repeat(MAX_FIELD_LENGTH + 100);
495        let truncated = truncate_field(long);
496        assert!(truncated.len() <= MAX_FIELD_LENGTH);
497    }
498}