Skip to main content

provenant/utils/
sourcemap.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Source map file processing for scanner text detection.
5//!
6//! Source map files (.js.map, .css.map) are JSON files containing embedded
7//! source code in a `sourcesContent` array. This module extracts that content
8//! so the scanner can detect licenses and parties from the embedded sources
9//! instead of from the raw source map JSON wrapper.
10
11use std::borrow::Cow;
12use std::path::Path;
13
/// Report whether `path` names a source map file.
///
/// Only the final path component is inspected. It is lowercased and then
/// compared against the `.js.map` and `.css.map` suffixes, so the check is
/// case-insensitive. Paths without a file name, or whose file name is not
/// valid UTF-8, are never treated as source maps.
pub fn is_sourcemap(path: &Path) -> bool {
    const SOURCEMAP_SUFFIXES: [&str; 2] = [".js.map", ".css.map"];

    match path.file_name().and_then(|component| component.to_str()) {
        Some(file_name) => {
            let lowered = file_name.to_lowercase();
            SOURCEMAP_SUFFIXES
                .iter()
                .any(|&suffix| lowered.ends_with(suffix))
        }
        None => false,
    }
}
24
25/// Extract source content from a source map JSON file.
26///
27/// Parses the JSON and extracts all entries from `sourcesContent`,
28/// combining them with newlines for license detection.
29///
30/// Returns `Some(combined_text)` if successfully parsed with content.
31/// Returns `None` if JSON parsing fails or no sourcesContent exists.
32pub fn extract_sourcemap_content(json_text: &str) -> Option<String> {
33    let json: serde_json::Value = serde_json::from_str(json_text).ok()?;
34    let sources = json.get("sourcesContent")?.as_array()?;
35
36    let combined: String = sources
37        .iter()
38        .filter_map(|v| v.as_str())
39        .map(replace_verbatim_cr_lf_chars)
40        .collect::<Vec<_>>()
41        .join("\n");
42
43    if combined.is_empty() {
44        None
45    } else {
46        Some(combined)
47    }
48}
49
50/// Return the text scanners should inspect for this file.
51pub fn detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
52    if !is_sourcemap(path) {
53        return Cow::Borrowed(text);
54    }
55
56    extract_sourcemap_content(text)
57        .map(Cow::Owned)
58        .unwrap_or_else(|| Cow::Borrowed(text))
59}
60
/// Turn verbatim (escaped) CR/LF sequences into real newline characters.
///
/// The replacements are applied as separate passes over the whole string, in
/// the order listed below, to match Python's `replace_verbatim_cr_lf_chars()`
/// behavior. The order matters: each pass only sees what earlier passes left
/// behind.
///
/// 1. backslash-backslash-r-backslash-backslash-n → newline
/// 2. backslash-r-backslash-n → newline
/// 3. backslash-backslash-r → newline
/// 4. backslash-backslash-n → newline
/// 5. backslash-r → newline
/// 6. backslash-n → newline
///
/// Actual carriage-return and line-feed characters are left untouched.
fn replace_verbatim_cr_lf_chars(s: &str) -> String {
    // Ordered table of escaped line endings; do not reorder.
    const ESCAPED_LINE_ENDINGS: [&str; 6] = [
        "\\\\r\\\\n",
        "\\r\\n",
        "\\\\r",
        "\\\\n",
        "\\r",
        "\\n",
    ];

    ESCAPED_LINE_ENDINGS
        .iter()
        .fold(s.to_owned(), |text, &escaped| text.replace(escaped, "\n"))
}
80
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    // --- is_sourcemap -----------------------------------------------------

    #[test]
    fn test_is_sourcemap_js_map() {
        assert!(is_sourcemap(&PathBuf::from("app.js.map")));
        assert!(is_sourcemap(&PathBuf::from("APP.JS.MAP")));
    }

    #[test]
    fn test_is_sourcemap_css_map() {
        assert!(is_sourcemap(&PathBuf::from("style.css.map")));
        assert!(is_sourcemap(&PathBuf::from("STYLE.CSS.MAP")));
    }

    #[test]
    fn test_is_sourcemap_not_map() {
        assert!(!is_sourcemap(&PathBuf::from("app.js")));
        assert!(!is_sourcemap(&PathBuf::from("data.json")));
        assert!(!is_sourcemap(&PathBuf::from("other.map")));
    }

    // --- extract_sourcemap_content ----------------------------------------

    #[test]
    fn test_extract_sourcemap_content_basic() {
        let json = r#"{"version":3,"sourcesContent":["hello\nworld"]}"#;
        let content = extract_sourcemap_content(json).expect("content should be extracted");
        assert_eq!(content, "hello\nworld");
    }

    #[test]
    fn test_extract_sourcemap_content_mit_license() {
        let json = r#"{"version":3,"sourcesContent":["Use of this source code is governed by an MIT-style license\nthat can be found in the LICENSE file"]}"#;
        let content = extract_sourcemap_content(json).expect("content should be extracted");
        assert!(content.contains("MIT-style license"));
        assert!(content.contains("LICENSE file"));
        assert!(content.contains("\n"));
    }

    #[test]
    fn test_extract_sourcemap_content_multiple_entries() {
        let json = r#"{"version":3,"sourcesContent":["first\nfile","second\nfile"]}"#;
        let content = extract_sourcemap_content(json).expect("content should be extracted");
        // Entries are joined with a single newline between them.
        assert_eq!(content, "first\nfile\nsecond\nfile");
    }

    #[test]
    fn test_extract_sourcemap_content_no_sources() {
        let json = r#"{"version":3,"sources":[]}"#;
        assert!(extract_sourcemap_content(json).is_none());
    }

    #[test]
    fn test_extract_sourcemap_content_invalid_json() {
        let json = r#"not valid json"#;
        assert!(extract_sourcemap_content(json).is_none());
    }

    #[test]
    fn test_extract_sourcemap_content_null_entries() {
        let json = r#"{"version":3,"sourcesContent":[null,"actual\ncontent"]}"#;
        let content = extract_sourcemap_content(json).expect("content should be extracted");
        // The null entry is skipped and contributes no separator.
        assert_eq!(content, "actual\ncontent");
    }

    // --- detection_text ---------------------------------------------------

    #[test]
    fn test_detection_text_prefers_embedded_sources_for_sourcemaps() {
        let path = PathBuf::from("bundle.js.map");
        let raw = r#"{"version":3,"comment":"Copyright 1999 Wrong Corp.","sourcesContent":["/* Copyright 2024 Example Corp. */\n"]}"#;

        let result = detection_text(&path, raw);

        assert_eq!(result.as_ref(), "/* Copyright 2024 Example Corp. */\n");
    }

    #[test]
    fn test_detection_text_passes_through_non_sourcemaps() {
        let path = PathBuf::from("bundle.js");
        let raw = r#"{"version":3,"sourcesContent":["ignored"]}"#;

        let result = detection_text(&path, raw);

        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(result.as_ref(), raw);
    }

    #[test]
    fn test_detection_text_falls_back_to_raw_text_on_invalid_json() {
        let path = PathBuf::from("bundle.js.map");
        let raw = "not valid json";

        let result = detection_text(&path, raw);

        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(result.as_ref(), raw);
    }

    // --- replace_verbatim_cr_lf_chars -------------------------------------

    #[test]
    fn test_replace_verbatim_cr_lf_chars() {
        // Single-escaped (backslash-n, backslash-r in the string)
        assert_eq!(replace_verbatim_cr_lf_chars("a\\nb"), "a\nb");
        assert_eq!(replace_verbatim_cr_lf_chars("a\\rb"), "a\nb");
        assert_eq!(replace_verbatim_cr_lf_chars("a\\r\\nb"), "a\nb");
        // Double-escaped (literal backslash-backslash-n in the string)
        assert_eq!(replace_verbatim_cr_lf_chars("a\\\\nb"), "a\nb");
        assert_eq!(replace_verbatim_cr_lf_chars("a\\\\rb"), "a\nb");
        assert_eq!(replace_verbatim_cr_lf_chars("a\\\\r\\\\nb"), "a\nb");
    }

    // --- golden-file check ------------------------------------------------

    /// Exercises extraction against a golden test file when it is present.
    ///
    /// The test is skipped (not failed) when the fixture is missing, so it
    /// can run from working directories that do not contain `testdata/`.
    #[test]
    fn test_ar_er_js_map_detection() {
        let path = PathBuf::from("testdata/license-golden/datadriven/lic2/ar-ER.js.map");
        if !path.exists() {
            eprintln!("Skipping test: test file not found");
            return;
        }

        let text = std::fs::read_to_string(&path).expect("Failed to read file");
        eprintln!("Raw text length: {}", text.len());

        let json: serde_json::Value = serde_json::from_str(&text).expect("JSON parse failed");
        let sources = json
            .get("sourcesContent")
            .expect("No sourcesContent")
            .as_array()
            .expect("Not array");
        eprintln!("Sources array length: {}", sources.len());

        if let Some(first) = sources.first().and_then(|v| v.as_str()) {
            eprintln!("First source length: {}", first.len());
            // Take characters rather than slicing by byte offset: a byte
            // slice panics when the cut lands inside a multi-byte UTF-8
            // character.
            let preview: String = first.chars().take(100).collect();
            eprintln!("First 100 chars: {:?}", preview);
        }

        let result = extract_sourcemap_content(&text);
        assert!(result.is_some(), "Should extract content from ar-ER.js.map");

        let content = result.unwrap();
        eprintln!("Extracted content length: {}", content.len());
        assert!(
            content.contains("MIT-style license"),
            "Should contain MIT license text"
        );
    }
}