//! Web content source: scan JavaScript, source maps, and WASM binaries at URLs.
//!
//! Fetches web content over HTTP(S) and produces [`Chunk`]s for the scanner.
//! Handles three content types:
//!
//! - **JavaScript**: fetched as text, scanned directly for hardcoded secrets.
//! - **Source maps**: fetched as JSON, each `sourcesContent` entry becomes a
//!   separate chunk tagged with its original filename.
//! - **WASM binaries**: fetched as bytes, printable ASCII strings ≥ 8 chars are
//!   extracted (like the `strings` CLI run with `-n 8`) and scanned as text.
//!
//! # Examples
//!
//! ```rust,no_run
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! use keyhog_sources::WebSource;
//! use keyhog_core::Source;
//!
//! let source = WebSource::new(vec![
//!     "https://example.com/app.js".to_string(),
//!     "https://example.com/app.js.map".to_string(),
//!     "https://example.com/module.wasm".to_string(),
//! ]);
//!
//! for chunk in source.chunks() {
//!     let chunk = chunk?;
//!     println!("{}: {} bytes", chunk.metadata.source_type, chunk.data.len());
//! }
//! # Ok(()) }
//! ```
32use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};
33
/// Minimum printable string length for WASM binary string extraction.
///
/// Shorter runs of printable bytes in a binary are overwhelmingly noise;
/// 8 is the floor passed to `extract_printable_strings` in `handle_wasm`.
const MIN_WASM_STRING_LEN: usize = 8;

/// Maximum response body size to prevent OOM on malicious targets (10 MB).
///
/// Enforced in `read_bytes_response` twice: via the declared Content-Length
/// pre-flight AND via a streamed read cap, so a lying or absent
/// Content-Length header cannot bypass the limit.
const MAX_RESPONSE_BYTES: usize = 10 * 1024 * 1024;

/// WASM magic bytes: `\0asm` — the required first four bytes of every
/// valid WebAssembly module; checked in `handle_wasm` before extraction.
const WASM_MAGIC: &[u8; 4] = b"\x00asm";
42
/// Web content source that fetches JavaScript, source maps, and WASM from URLs.
///
/// URLs ending in `.wasm` are treated as binary and have strings extracted.
/// URLs ending in `.map` are treated as source maps and have `sourcesContent`
/// entries split into individual chunks. Everything else is treated as
/// JavaScript text.
pub struct WebSource {
    // URLs to fetch; scanned in order, each yielding zero or more chunks.
    urls: Vec<String>,
}
52
53impl WebSource {
54    /// Create a web source from a list of URLs to scan.
55    ///
56    /// # Examples
57    ///
58    /// ```rust
59    /// use keyhog_sources::WebSource;
60    /// use keyhog_core::Source;
61    ///
62    /// let source = WebSource::new(vec!["https://example.com/app.js".into()]);
63    /// assert_eq!(source.name(), "web");
64    /// ```
65    pub fn new(urls: Vec<String>) -> Self {
66        Self { urls }
67    }
68
69    /// Create a web source from a single URL.
70    ///
71    /// # Examples
72    ///
73    /// ```rust
74    /// use keyhog_sources::WebSource;
75    /// use keyhog_core::Source;
76    ///
77    /// let source = WebSource::from_url("https://example.com/app.js");
78    /// assert_eq!(source.name(), "web");
79    /// ```
80    pub fn from_url(url: &str) -> Self {
81        Self {
82            urls: vec![url.to_string()],
83        }
84    }
85
86    /// Fetch all URLs and produce chunks.
87    ///
88    /// Uses `reqwest::blocking` directly; the blocking client internally manages
89    /// its own background runtime, so no dedicated thread wrapper is required.
90    fn fetch_all(&self) -> Vec<Result<Chunk, SourceError>> {
91        // Auto-decompression DISABLED — without this, reqwest expands gzip
92        // bodies to completion before we can check size, opening a gzip-bomb
93        // DoS. Decompression is opt-in per call where we explicitly want it.
94        let client = reqwest::blocking::Client::builder()
95            .timeout(crate::timeouts::HTTP_REQUEST)
96            .danger_accept_invalid_certs(false)
97            .redirect(reqwest::redirect::Policy::limited(5))
98            .user_agent("keyhog-web/0.1")
99            .no_gzip()
100            .no_brotli()
101            .no_deflate()
102            .build()
103            .map_err(|e| SourceError::Other(format!("failed to build HTTP client: {e}")));
104
105        let client = match client {
106            Ok(c) => c,
107            Err(e) => return vec![Err(e)],
108        };
109
110        let mut results = Vec::new();
111
112        for url in &self.urls {
113            let chunks = fetch_url(&client, url);
114            results.extend(chunks);
115        }
116
117        results
118    }
119}
120
impl Source for WebSource {
    // Stable identifier for this source kind, as surfaced to consumers
    // of the `Source` trait.
    fn name(&self) -> &str {
        "web"
    }

    // Eagerly fetches every URL via `fetch_all`, then exposes the fully
    // materialized results as an iterator — nothing is fetched lazily
    // per `next()` call.
    fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
        Box::new(self.fetch_all().into_iter())
    }
    // Downcasting hook so callers holding a `dyn Source` can recover the
    // concrete `WebSource` type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}
133
134/// Fetch a single URL and produce one or more chunks based on content type.
135fn fetch_url(client: &reqwest::blocking::Client, url: &str) -> Vec<Result<Chunk, SourceError>> {
136    let resp = match client.get(url).send() {
137        Ok(r) => r,
138        Err(e) => {
139            return vec![Err(SourceError::Other(format!(
140                "failed to fetch {url}: {e}"
141            )))];
142        }
143    };
144
145    let status = resp.status().as_u16();
146    if status != 200 {
147        tracing::warn!(url, status, "non-200 response, skipping");
148        return Vec::new();
149    }
150
151    // Route by URL extension
152    let lower = url.to_lowercase();
153    if lower.ends_with(".wasm") {
154        handle_wasm(resp, url)
155    } else if lower.ends_with(".map") || lower.contains(".map?") {
156        handle_sourcemap(resp, url)
157    } else {
158        handle_js(resp, url)
159    }
160}
161
162/// Handle a JavaScript file: return the full text as a single chunk.
163fn handle_js(resp: reqwest::blocking::Response, url: &str) -> Vec<Result<Chunk, SourceError>> {
164    match read_text_response(resp) {
165        Ok(body) => vec![Ok(Chunk {
166            data: body.into(),
167            metadata: ChunkMetadata {
168                base_offset: 0,
169                source_type: "web:js".to_string(),
170                path: Some(url.to_string()),
171                commit: None,
172                author: None,
173                date: None,
174                            mtime_ns: None,
175                size_bytes: None,
176},
177        })],
178        Err(e) => vec![Err(e)],
179    }
180}
181
182/// Handle a source map: parse JSON and emit each `sourcesContent` entry
183/// as a separate chunk tagged with the original filename.
184fn handle_sourcemap(
185    resp: reqwest::blocking::Response,
186    url: &str,
187) -> Vec<Result<Chunk, SourceError>> {
188    let body = match read_text_response(resp) {
189        Ok(b) => b,
190        Err(e) => return vec![Err(e)],
191    };
192
193    let map: serde_json::Value = match serde_json::from_str(&body) {
194        Ok(v) => v,
195        Err(e) => {
196            tracing::warn!(url, err = %e, "failed to parse source map JSON");
197            // Fall back to treating it as plain JS text
198            return vec![Ok(Chunk {
199                data: body.into(),
200                metadata: ChunkMetadata {
201                    base_offset: 0,
202                    source_type: "web:sourcemap:raw".to_string(),
203                    path: Some(url.to_string()),
204                    commit: None,
205                    author: None,
206                    date: None,
207                                    mtime_ns: None,
208                    size_bytes: None,
209},
210            })];
211        }
212    };
213
214    let sources: Vec<String> = map["sources"]
215        .as_array()
216        .unwrap_or(&vec![])
217        .iter()
218        .filter_map(|v| v.as_str().map(String::from))
219        .collect();
220
221    let contents: Vec<Option<String>> = map["sourcesContent"]
222        .as_array()
223        .map(|arr| arr.iter().map(|v| v.as_str().map(String::from)).collect())
224        .unwrap_or_default();
225
226    let mut chunks = Vec::new();
227
228    for (i, content) in contents.iter().enumerate() {
229        if let Some(code) = content {
230            if code.is_empty() {
231                continue;
232            }
233            let source_name = sources
234                .get(i)
235                .cloned()
236                .unwrap_or_else(|| format!("source_{i}"));
237            chunks.push(Ok(Chunk {
238                data: code.clone().into(),
239                metadata: ChunkMetadata {
240                    base_offset: 0,
241                    source_type: "web:sourcemap".to_string(),
242                    path: Some(format!("{url}!{source_name}")),
243                    commit: None,
244                    author: None,
245                    date: None,
246                    mtime_ns: None,
247                    size_bytes: None,
248                },
249            }));
250        }
251    }
252
253    // If no sourcesContent, treat the raw map as scannable text
254    if chunks.is_empty() {
255        chunks.push(Ok(Chunk {
256            data: body.into(),
257            metadata: ChunkMetadata {
258                base_offset: 0,
259                source_type: "web:sourcemap:raw".to_string(),
260                path: Some(url.to_string()),
261                commit: None,
262                author: None,
263                date: None,
264                            mtime_ns: None,
265                size_bytes: None,
266},
267        }));
268    }
269
270    chunks
271}
272
273/// Handle a WASM binary: extract printable strings and scan as text.
274fn handle_wasm(resp: reqwest::blocking::Response, url: &str) -> Vec<Result<Chunk, SourceError>> {
275    let bytes = match read_bytes_response(resp) {
276        Ok(b) => b,
277        Err(e) => return vec![Err(e)],
278    };
279
280    // Verify WASM magic bytes
281    if bytes.len() < 4 || &bytes[..4] != WASM_MAGIC {
282        tracing::warn!(url, "not a valid WASM file (wrong magic bytes)");
283        return Vec::new();
284    }
285
286    let strings = crate::strings::extract_printable_strings(&bytes, MIN_WASM_STRING_LEN);
287    if strings.is_empty() {
288        return Vec::new();
289    }
290
291    vec![Ok(Chunk {
292        data: keyhog_core::SensitiveString::join(&strings, "\n"),
293        metadata: ChunkMetadata {
294            base_offset: 0,
295            source_type: "web:wasm".to_string(),
296            path: Some(url.to_string()),
297            commit: None,
298            author: None,
299            date: None,
300                    mtime_ns: None,
301            size_bytes: None,
302},
303    })]
304}
305
306/// Read an HTTP response body as text, capping at `MAX_RESPONSE_BYTES`.
307///
308/// Pre-flight Content-Length and streamed cap-aware copy. The previous
309/// version called `.text()` (which auto-decompresses gzip/deflate to
310/// completion) before checking the size — a 1 MB gzip bomb expanding to
311/// 1+ GB would OOM before this check fired. See `audit release-2026-04-26
312/// web.rs:287-301`.
313fn read_text_response(resp: reqwest::blocking::Response) -> Result<String, SourceError> {
314    let bytes = read_bytes_response(resp)?;
315    String::from_utf8(bytes).map_err(|e| SourceError::Other(format!("non-UTF-8 response: {e}")))
316}
317
318/// Read an HTTP response body as bytes, capping at `MAX_RESPONSE_BYTES`
319/// BEFORE decompression to defeat gzip-bomb DoS.
320fn read_bytes_response(resp: reqwest::blocking::Response) -> Result<Vec<u8>, SourceError> {
321    use std::io::Read;
322    let url = resp.url().to_string();
323
324    if let Some(len) = resp.content_length() {
325        if len as usize > MAX_RESPONSE_BYTES {
326            return Err(SourceError::Other(format!(
327                "response from {url} declares {len} bytes (> {} MB limit)",
328                MAX_RESPONSE_BYTES / (1024 * 1024)
329            )));
330        }
331    }
332
333    // Stream into a bounded buffer; abort the moment we exceed the cap.
334    let mut buf = Vec::with_capacity(MAX_RESPONSE_BYTES.min(64 * 1024));
335    let mut taken = resp.take(MAX_RESPONSE_BYTES as u64 + 1);
336    taken
337        .read_to_end(&mut buf)
338        .map_err(|e| SourceError::Other(format!("failed to read bytes from {url}: {e}")))?;
339    if buf.len() > MAX_RESPONSE_BYTES {
340        return Err(SourceError::Other(format!(
341            "response from {url} exceeds {} MB limit",
342            MAX_RESPONSE_BYTES / (1024 * 1024)
343        )));
344    }
345
346    Ok(buf)
347}