use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};
/// Minimum string length handed to `extract_printable_strings` when scanning a WASM payload.
const MIN_WASM_STRING_LEN: usize = 8;
/// Upper bound on the size of a response body (10 * 1024 * 1024 bytes); larger responses are
/// reported as errors by the response readers.
const MAX_RESPONSE_BYTES: usize = 10 * 1024 * 1024;
/// Timeout, in seconds, configured on the HTTP client used for fetching.
const REQUEST_TIMEOUT_SECS: u64 = 30;
/// Four-byte prefix a downloaded payload must start with to be treated as a WASM binary.
const WASM_MAGIC: &[u8; 4] = b"\x00asm";
/// A [`Source`] that downloads a list of URLs and turns the responses (scripts, source maps,
/// WASM binaries) into chunks.
pub struct WebSource {
/// URLs to fetch, processed in order.
urls: Vec<String>,
}
impl WebSource {
    /// Creates a source that will fetch every URL in `urls`.
    pub fn new(urls: Vec<String>) -> Self {
        Self { urls }
    }

    /// Convenience constructor for a source that fetches a single URL.
    pub fn from_url(url: &str) -> Self {
        Self::new(vec![url.to_string()])
    }

    /// Fetches every configured URL, in order, and returns all chunks and errors produced.
    ///
    /// The HTTP client is built and used on a dedicated thread which is joined before this
    /// function returns. If the client cannot be built, or the thread panics, a single
    /// `SourceError::Other` entry is returned instead.
    fn fetch_all(&self) -> Vec<Result<Chunk, SourceError>> {
        let targets = self.urls.clone();

        // NOTE(review): presumably the dedicated thread keeps the blocking client off any
        // async runtime the caller may be running on — confirm.
        let worker = std::thread::spawn(move || {
            let built = reqwest::blocking::Client::builder()
                .timeout(std::time::Duration::from_secs(REQUEST_TIMEOUT_SECS))
                .danger_accept_invalid_certs(false)
                .redirect(reqwest::redirect::Policy::limited(5))
                .user_agent("keyhog-web/0.1")
                .build();

            match built {
                Ok(http) => targets
                    .iter()
                    .flat_map(|target| fetch_url(&http, target))
                    .collect::<Vec<_>>(),
                Err(e) => vec![Err(SourceError::Other(format!(
                    "failed to build HTTP client: {e}"
                )))],
            }
        });

        match worker.join() {
            Ok(collected) => collected,
            Err(_) => vec![Err(SourceError::Other("web fetch thread panicked".into()))],
        }
    }
}
impl Source for WebSource {
    /// Identifier of this source kind.
    fn name(&self) -> &str {
        "web"
    }

    /// Fetches all configured URLs up front and yields the collected chunks and errors.
    fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
        let fetched = self.fetch_all();
        Box::new(fetched.into_iter())
    }
}
/// Kind of payload a URL is expected to serve, judged from the extension of its path.
enum UrlKind {
    /// Path ends in `.wasm`.
    Wasm,
    /// Path ends in `.map`.
    SourceMap,
    /// Anything else; handled as text.
    Script,
}

/// Classifies `url` by the (ASCII case-insensitive) extension of its path component.
///
/// The query string and fragment are ignored, so `app.wasm?v=2` and `bundle.js.map#x` are
/// recognised just like their bare counterparts.
fn classify_url(url: &str) -> UrlKind {
    // Everything from the first `?` or `#` onwards is query/fragment, not path.
    let path_end = url
        .find(|c: char| c == '?' || c == '#')
        .unwrap_or(url.len());
    let path = url[..path_end].to_ascii_lowercase();

    if path.ends_with(".wasm") {
        UrlKind::Wasm
    } else if path.ends_with(".map") {
        UrlKind::SourceMap
    } else {
        UrlKind::Script
    }
}

/// Fetches `url` with `client` and converts the response into chunks.
///
/// Returns a single error entry when the request itself fails, and an empty vector (after
/// logging a warning) when the server answers with any status other than 200. Otherwise the
/// response is dispatched to the WASM, source-map or text handler based on the URL's path
/// extension.
fn fetch_url(client: &reqwest::blocking::Client, url: &str) -> Vec<Result<Chunk, SourceError>> {
    let resp = match client.get(url).send() {
        Ok(r) => r,
        Err(e) => {
            return vec![Err(SourceError::Other(format!(
                "failed to fetch {url}: {e}"
            )))];
        }
    };

    let status = resp.status().as_u16();
    if status != 200 {
        tracing::warn!(url, status, "non-200 response, skipping");
        return Vec::new();
    }

    match classify_url(url) {
        UrlKind::Wasm => handle_wasm(resp, url),
        UrlKind::SourceMap => handle_sourcemap(resp, url),
        UrlKind::Script => handle_js(resp, url),
    }
}
/// Reads the response as text and wraps it in a single `web:js` chunk whose path is `url`.
///
/// A failure to read the body is returned as the single entry of the vector.
fn handle_js(resp: reqwest::blocking::Response, url: &str) -> Vec<Result<Chunk, SourceError>> {
    let outcome = read_text_response(resp).map(|text| Chunk {
        data: text,
        metadata: ChunkMetadata {
            source_type: "web:js".to_string(),
            path: Some(url.to_string()),
            commit: None,
            author: None,
            date: None,
        },
    });
    vec![outcome]
}
/// Builds the fallback chunk that carries an unparsed source map body verbatim.
fn raw_sourcemap_chunk(body: String, url: &str) -> Chunk {
    Chunk {
        data: body,
        metadata: ChunkMetadata {
            source_type: "web:sourcemap:raw".to_string(),
            path: Some(url.to_string()),
            commit: None,
            author: None,
            date: None,
        },
    }
}

/// Reads a source map response and emits one `web:sourcemap` chunk per non-empty string
/// entry of `sourcesContent`.
///
/// Each chunk's path is `"{url}!{name}"`, where `name` is the entry of `sources` at the same
/// index, or `source_{i}` when that entry is missing or not a string. When the body is not
/// valid JSON, or yields no embedded sources, the whole body is emitted as a single
/// `web:sourcemap:raw` chunk instead. A failure to read the body is returned as the single
/// entry of the vector.
fn handle_sourcemap(
    resp: reqwest::blocking::Response,
    url: &str,
) -> Vec<Result<Chunk, SourceError>> {
    let body = match read_text_response(resp) {
        Ok(b) => b,
        Err(e) => return vec![Err(e)],
    };

    let map: serde_json::Value = match serde_json::from_str(&body) {
        Ok(v) => v,
        Err(e) => {
            tracing::warn!(url, err = %e, "failed to parse source map JSON");
            return vec![Ok(raw_sourcemap_chunk(body, url))];
        }
    };

    // Keep `sources` as the original JSON array: dropping non-string entries (e.g. `null`)
    // would shift every later name and mislabel the corresponding `sourcesContent` entry.
    let sources = map["sources"].as_array();

    let chunks: Vec<Result<Chunk, SourceError>> = map["sourcesContent"]
        .as_array()
        .into_iter()
        .flatten()
        .enumerate()
        .filter_map(|(i, content)| {
            // Skip `null`/non-string and empty entries, but keep `i` tied to the position
            // in the original array so it still lines up with `sources`.
            let code = content.as_str().filter(|text| !text.is_empty())?;
            let source_name = sources
                .and_then(|names| names.get(i))
                .and_then(|name| name.as_str())
                .map_or_else(|| format!("source_{i}"), String::from);
            Some(Ok(Chunk {
                data: code.to_owned(),
                metadata: ChunkMetadata {
                    source_type: "web:sourcemap".to_string(),
                    path: Some(format!("{url}!{source_name}")),
                    commit: None,
                    author: None,
                    date: None,
                },
            }))
        })
        .collect();

    if chunks.is_empty() {
        // Nothing embedded: fall back to handing over the raw map.
        return vec![Ok(raw_sourcemap_chunk(body, url))];
    }
    chunks
}
/// Reads a WASM response and emits its printable strings as a single `web:wasm` chunk.
///
/// Payloads that do not begin with `WASM_MAGIC` are skipped with a warning, and binaries
/// with no extracted strings produce no chunk. The extracted strings are joined with
/// newlines. A failure to read the body is returned as the single entry of the vector.
fn handle_wasm(resp: reqwest::blocking::Response, url: &str) -> Vec<Result<Chunk, SourceError>> {
    let payload = match read_bytes_response(resp) {
        Ok(payload) => payload,
        Err(err) => return vec![Err(err)],
    };

    // `starts_with` is false for payloads shorter than the magic, covering the length check.
    if !payload.starts_with(&WASM_MAGIC[..]) {
        tracing::warn!(url, "not a valid WASM file (wrong magic bytes)");
        return Vec::new();
    }

    let found = crate::strings::extract_printable_strings(&payload, MIN_WASM_STRING_LEN);
    if found.is_empty() {
        Vec::new()
    } else {
        let chunk = Chunk {
            data: found.join("\n"),
            metadata: ChunkMetadata {
                source_type: "web:wasm".to_string(),
                path: Some(url.to_string()),
                commit: None,
                author: None,
                date: None,
            },
        };
        vec![Ok(chunk)]
    }
}
/// Reads the response body as text, enforcing `MAX_RESPONSE_BYTES`.
///
/// # Errors
///
/// Returns `SourceError::Other` when the body cannot be read, or when either the declared
/// `Content-Length` or the decoded text exceeds `MAX_RESPONSE_BYTES`.
fn read_text_response(resp: reqwest::blocking::Response) -> Result<String, SourceError> {
    let url = resp.url().to_string();
    let too_large = || {
        SourceError::Other(format!(
            "response from {url} exceeds {} MB limit",
            MAX_RESPONSE_BYTES / (1024 * 1024)
        ))
    };

    // Reject oversized responses before buffering them when the server declares the length.
    // The `usize` -> `u64` cast is a lossless widening.
    if matches!(resp.content_length(), Some(len) if len > MAX_RESPONSE_BYTES as u64) {
        return Err(too_large());
    }

    let body = resp
        .text()
        .map_err(|e| SourceError::Other(format!("failed to read response from {url}: {e}")))?;

    // Bodies without a declared length can only be checked once they have been read.
    if body.len() > MAX_RESPONSE_BYTES {
        return Err(too_large());
    }
    Ok(body)
}
/// Reads the response body as raw bytes, never buffering more than `MAX_RESPONSE_BYTES + 1`
/// bytes.
///
/// # Errors
///
/// Returns `SourceError::Other` when the body cannot be read, or when the declared
/// `Content-Length` or the actual body exceeds `MAX_RESPONSE_BYTES`.
fn read_bytes_response(resp: reqwest::blocking::Response) -> Result<Vec<u8>, SourceError> {
    let url = resp.url().to_string();
    // Lossless widening: `usize` -> `u64`.
    let limit = MAX_RESPONSE_BYTES as u64;
    let too_large = || {
        SourceError::Other(format!(
            "response from {url} exceeds {} MB limit",
            MAX_RESPONSE_BYTES / (1024 * 1024)
        ))
    };

    // Reject oversized responses up front when the server declares the length.
    if matches!(resp.content_length(), Some(len) if len > limit) {
        return Err(too_large());
    }

    // Read at most one byte past the limit: enough to detect an oversized body without
    // buffering all of it.
    let mut capped = std::io::Read::take(resp, limit + 1);
    let mut bytes = Vec::new();
    std::io::Read::read_to_end(&mut capped, &mut bytes)
        .map_err(|e| SourceError::Other(format!("failed to read bytes from {url}: {e}")))?;

    if bytes.len() > MAX_RESPONSE_BYTES {
        return Err(too_large());
    }
    Ok(bytes)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The source reports the fixed name "web".
    #[test]
    fn web_source_name() {
        let web = WebSource::new(Vec::new());
        assert_eq!(web.name(), "web");
    }

    /// `from_url` stores exactly one URL.
    #[test]
    fn from_url_convenience() {
        let single = WebSource::from_url("https://example.com/app.js");
        assert_eq!(single.urls.len(), 1);
    }

    /// With no URLs configured, iterating the source yields nothing.
    #[test]
    fn empty_urls_produces_no_chunks() {
        let web = WebSource::new(Vec::new());
        assert_eq!(web.chunks().count(), 0);
    }
}