Skip to main content

provenant/utils/
sourcemap.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Source map file processing for license detection.
5//!
6//! Source map files (.js.map, .css.map) are JSON files containing embedded
7//! source code in a `sourcesContent` array. This module extracts that content
8//! for license detection.
9
10use std::path::Path;
11
/// Report whether `path` names a source map file.
///
/// Only the final path component is inspected. It is lowercased and then
/// tested for a `.js.map` or `.css.map` suffix, so the match is
/// case-insensitive. Paths without a file name, or whose file name is not
/// valid UTF-8, yield `false`.
pub fn is_sourcemap(path: &Path) -> bool {
    const SOURCEMAP_SUFFIXES: [&str; 2] = [".js.map", ".css.map"];

    match path.file_name().and_then(|component| component.to_str()) {
        Some(file_name) => {
            let lowered = file_name.to_lowercase();
            SOURCEMAP_SUFFIXES
                .iter()
                .any(|suffix| lowered.ends_with(*suffix))
        }
        None => false,
    }
}
22
23/// Extract source content from a source map JSON file.
24///
25/// Parses the JSON and extracts all entries from `sourcesContent`,
26/// combining them with newlines for license detection.
27///
28/// Returns `Some(combined_text)` if successfully parsed with content.
29/// Returns `None` if JSON parsing fails or no sourcesContent exists.
30pub fn extract_sourcemap_content(json_text: &str) -> Option<String> {
31    let json: serde_json::Value = serde_json::from_str(json_text).ok()?;
32    let sources = json.get("sourcesContent")?.as_array()?;
33
34    let combined: String = sources
35        .iter()
36        .filter_map(|v| v.as_str())
37        .map(replace_verbatim_cr_lf_chars)
38        .collect::<Vec<_>>()
39        .join("\n");
40
41    if combined.is_empty() {
42        None
43    } else {
44        Some(combined)
45    }
46}
47
/// Turn verbatim (escaped) CR/LF sequences into real newline characters.
///
/// This mirrors Python's `replace_verbatim_cr_lf_chars()`: the patterns below
/// are applied one after another, each over the whole string, and every match
/// becomes a single `'\n'`. In order:
///
/// 1. backslash-backslash-r-backslash-backslash-n (double-escaped CRLF)
/// 2. backslash-r-backslash-n (single-escaped CRLF)
/// 3. backslash-backslash-r (double-escaped CR)
/// 4. backslash-backslash-n (double-escaped LF)
/// 5. backslash-r (single-escaped CR)
/// 6. backslash-n (single-escaped LF)
///
/// Actual CR and LF characters already present in `s` are left untouched.
fn replace_verbatim_cr_lf_chars(s: &str) -> String {
    // Raw strings: every backslash here is a literal backslash character.
    // The order is significant, because an earlier pattern consumes text a
    // later one would otherwise match.
    const ESCAPED_LINE_ENDINGS: [&str; 6] = [
        r"\\r\\n",
        r"\r\n",
        r"\\r",
        r"\\n",
        r"\r",
        r"\n",
    ];

    let mut text = s.to_owned();
    for pattern in ESCAPED_LINE_ENDINGS.iter() {
        text = text.replace(*pattern, "\n");
    }
    text
}
67
#[cfg(test)]
mod tests {
    //! Unit tests for source map detection and content extraction.

    use super::*;
    use std::path::PathBuf;

    /// `.js.map` names are recognised regardless of letter case.
    #[test]
    fn test_is_sourcemap_js_map() {
        assert!(is_sourcemap(&PathBuf::from("app.js.map")));
        assert!(is_sourcemap(&PathBuf::from("APP.JS.MAP")));
    }

    /// `.css.map` names are recognised regardless of letter case.
    #[test]
    fn test_is_sourcemap_css_map() {
        assert!(is_sourcemap(&PathBuf::from("style.css.map")));
        assert!(is_sourcemap(&PathBuf::from("STYLE.CSS.MAP")));
    }

    /// Other extensions, including a bare `.map`, are rejected.
    #[test]
    fn test_is_sourcemap_not_map() {
        assert!(!is_sourcemap(&PathBuf::from("app.js")));
        assert!(!is_sourcemap(&PathBuf::from("data.json")));
        assert!(!is_sourcemap(&PathBuf::from("other.map")));
    }

    /// A JSON-escaped newline is decoded by the JSON parser and preserved.
    #[test]
    fn test_extract_sourcemap_content_basic() {
        let json = r#"{"version":3,"sourcesContent":["hello\nworld"]}"#;
        let content = extract_sourcemap_content(json).expect("sourcesContent should be extracted");
        assert_eq!(content, "hello\nworld");
    }

    /// License text embedded in `sourcesContent` survives extraction.
    #[test]
    fn test_extract_sourcemap_content_mit_license() {
        let json = r#"{"version":3,"sourcesContent":["Use of this source code is governed by an MIT-style license\nthat can be found in the LICENSE file"]}"#;
        let content = extract_sourcemap_content(json).expect("sourcesContent should be extracted");
        assert!(content.contains("MIT-style license"));
        assert!(content.contains("LICENSE file"));
        assert!(content.contains("\n"));
    }

    /// Multiple entries are joined with a single newline between them.
    #[test]
    fn test_extract_sourcemap_content_multiple_entries() {
        let json = r#"{"version":3,"sourcesContent":["first\nfile","second\nfile"]}"#;
        let content = extract_sourcemap_content(json).expect("sourcesContent should be extracted");
        assert_eq!(content, "first\nfile\nsecond\nfile");
    }

    /// A missing or empty `sourcesContent` yields `None`.
    #[test]
    fn test_extract_sourcemap_content_no_sources() {
        let json = r#"{"version":3,"sources":[]}"#;
        assert!(extract_sourcemap_content(json).is_none());

        let empty = r#"{"version":3,"sourcesContent":[]}"#;
        assert!(extract_sourcemap_content(empty).is_none());
    }

    /// Input that is not JSON yields `None` instead of panicking.
    #[test]
    fn test_extract_sourcemap_content_invalid_json() {
        let json = r#"not valid json"#;
        assert!(extract_sourcemap_content(json).is_none());
    }

    /// `null` entries are skipped; only string entries contribute text.
    #[test]
    fn test_extract_sourcemap_content_null_entries() {
        let json = r#"{"version":3,"sourcesContent":[null,"actual\ncontent"]}"#;
        let content = extract_sourcemap_content(json).expect("sourcesContent should be extracted");
        assert_eq!(content, "actual\ncontent");
    }

    /// Every supported escaped CR/LF form collapses to one newline.
    #[test]
    fn test_replace_verbatim_cr_lf_chars() {
        // Single-escaped (backslash-n, backslash-r in the string)
        assert_eq!(replace_verbatim_cr_lf_chars("a\\nb"), "a\nb");
        assert_eq!(replace_verbatim_cr_lf_chars("a\\rb"), "a\nb");
        assert_eq!(replace_verbatim_cr_lf_chars("a\\r\\nb"), "a\nb");
        // Double-escaped (literal backslash-backslash-n in the string)
        assert_eq!(replace_verbatim_cr_lf_chars("a\\\\nb"), "a\nb");
        assert_eq!(replace_verbatim_cr_lf_chars("a\\\\rb"), "a\nb");
        assert_eq!(replace_verbatim_cr_lf_chars("a\\\\r\\\\nb"), "a\nb");
    }

    /// End-to-end check against a golden fixture; skipped when the fixture
    /// file is not present on disk.
    #[test]
    fn test_ar_er_js_map_detection() {
        let path = PathBuf::from("testdata/license-golden/datadriven/lic2/ar-ER.js.map");
        if !path.exists() {
            eprintln!("Skipping test: test file not found");
            return;
        }

        let text = std::fs::read_to_string(&path).expect("Failed to read file");
        eprintln!("Raw text length: {}", text.len());

        let json: serde_json::Value = serde_json::from_str(&text).expect("JSON parse failed");
        let sources = json
            .get("sourcesContent")
            .expect("No sourcesContent")
            .as_array()
            .expect("Not array");
        eprintln!("Sources array length: {}", sources.len());

        if let Some(first) = sources.first().and_then(|v| v.as_str()) {
            eprintln!("First source length: {}", first.len());
            // Take whole characters rather than a byte slice: `&first[..100]`
            // panics when byte 100 falls inside a multi-byte UTF-8 character.
            let preview: String = first.chars().take(100).collect();
            eprintln!("First 100 chars: {:?}", preview);
        }

        let content =
            extract_sourcemap_content(&text).expect("Should extract content from ar-ER.js.map");
        eprintln!("Extracted content length: {}", content.len());
        assert!(
            content.contains("MIT-style license"),
            "Should contain MIT license text"
        );
    }
}