use std::path::Path;
use anyhow::{Result, bail};
use async_trait::async_trait;
use regex::Regex;
use serde::{Deserialize, Serialize};
use wasmtime::component::{Component, Linker as ComponentLinker};
use wasmtime::{Engine, Linker, Module, Store, StoreLimitsBuilder};
use super::{SiteContent, SiteMetadata, SiteProvider};
use crate::http_client::AcceleratedClient;
/// Fuel budget handed to every store created by `build_store` via `Store::set_fuel`.
const FUEL_LIMIT: u64 = 100_000_000;
/// Guest memory-size cap in bytes (64 MiB), fed to `StoreLimitsBuilder::memory_size`.
const MEMORY_LIMIT_BYTES: usize = 64 * 1024 * 1024;
/// Article fields handed back by a guest provider.
///
/// Every field is optional. The raw-ABI path deserialises this struct from the
/// JSON document the guest returns, and an empty object (`{}`) is accepted.
#[derive(Debug, Default, Deserialize, Serialize)]
pub struct WasmArticle {
/// Copied into `SiteMetadata::title` by `article_to_site_content`.
pub title: Option<String>,
/// Becomes the markdown of the resulting `SiteContent`; a missing value yields an empty string.
pub content: Option<String>,
/// Copied into `SiteMetadata::author`.
pub author: Option<String>,
/// Copied into `SiteMetadata::published` unchanged (no parsing happens in this file).
pub date: Option<String>,
/// Preferred canonical URL; when absent the request URL is used instead.
pub canonical_url: Option<String>,
}
/// Host-side state stored inside every `Store` built by this file.
struct StoreData {
/// Resource limits; `build_store` registers this field as the store's limiter.
limiter: wasmtime::StoreLimits,
}
// Generates host bindings for the `provider` world declared in `wit/provider.wit`.
// `run_component` relies on the generated `Provider` type (`Provider::instantiate`)
// and on its `nab_provider_extractor().call_extract(..)` accessor.
wasmtime::component::bindgen!({
path: "wit/provider.wit",
world: "provider",
});
/// Site provider backed by a WebAssembly Component (Component Model ABI).
pub struct WitWasmProvider {
/// Provider name returned by `SiteProvider::name`; created with `Box::leak` in `compile`.
static_name: &'static str,
/// Component compiled from the provider's bytes.
component: Component,
/// Engine the component was compiled with; reused to build a fresh store per extraction.
engine: Engine,
/// Pattern tested against a URL by `SiteProvider::matches`.
url_regex: Regex,
}
impl WitWasmProvider {
pub fn from_file(name: &str, wasm_path: &Path, url_pattern: &str) -> Result<Self> {
let engine = build_engine()?;
let bytes = std::fs::read(wasm_path)
.map_err(|e| anyhow::anyhow!("cannot read WASM file {}: {e}", wasm_path.display()))?;
Self::compile(name, &engine, &bytes, url_pattern)
}
pub fn from_bytes(name: &str, wasm_bytes: &[u8], url_pattern: &str) -> Result<Self> {
let engine = build_engine()?;
Self::compile(name, &engine, wasm_bytes, url_pattern)
}
fn compile(name: &str, engine: &Engine, bytes: &[u8], url_pattern: &str) -> Result<Self> {
let component = Component::from_binary(engine, bytes)
.map_err(|e| anyhow::anyhow!("failed to compile Component for '{name}': {e}"))?;
let url_regex = Regex::new(url_pattern).map_err(|e| {
anyhow::anyhow!("invalid URL pattern '{url_pattern}' for provider '{name}': {e}")
})?;
let static_name: &'static str = Box::leak(name.to_owned().into_boxed_str());
Ok(Self {
static_name,
component,
engine: engine.clone(),
url_regex,
})
}
fn run_component(&self, url: &str, html: &str) -> Result<WasmArticle> {
let mut store = build_component_store(&self.engine)?;
let linker: ComponentLinker<StoreData> = ComponentLinker::new(&self.engine);
let bindings = Provider::instantiate(&mut store, &self.component, &linker)
.map_err(|e| anyhow::anyhow!("failed to instantiate Component: {e}"))?;
let result = bindings
.nab_provider_extractor()
.call_extract(&mut store, url, html)
.map_err(|e| anyhow::anyhow!("Component extract() trap: {e}"))?;
match result {
Ok(article) => Ok(WasmArticle {
title: article.title,
content: Some(article.content),
author: article.author,
date: article.date,
canonical_url: None,
}),
Err(reason) => bail!("Component extract() returned error: {reason}"),
}
}
}
#[async_trait]
impl SiteProvider for WitWasmProvider {
fn name(&self) -> &'static str {
self.static_name
}
fn matches(&self, url: &str) -> bool {
self.url_regex.is_match(url)
}
async fn extract(
&self,
url: &str,
_client: &AcceleratedClient,
_cookies: Option<&str>,
prefetched_html: Option<&[u8]>,
) -> Result<SiteContent> {
let html_bytes = prefetched_html.ok_or_else(|| {
anyhow::anyhow!(
"WIT provider '{}' requires pre-fetched HTML but none was provided for {url}",
self.static_name
)
})?;
let html = std::str::from_utf8(html_bytes)
.map_err(|e| anyhow::anyhow!("HTML bytes are not valid UTF-8: {e}"))?;
let article = self.run_component(url, html).map_err(|e| {
anyhow::anyhow!("WIT provider '{}' failed for {url}: {e}", self.static_name)
})?;
Ok(article_to_site_content(
article,
url,
self.static_name,
"wit",
))
}
}
/// Site provider backed by a core WebAssembly module using the raw
/// `alloc` / `extract` / `memory` export ABI.
pub struct WasmProvider {
/// Provider name returned by `SiteProvider::name`; created with `Box::leak` in `compile`.
static_name: &'static str,
/// Module compiled from the provider's bytes.
module: Module,
/// Engine the module was compiled with; reused to build a fresh store per extraction.
engine: Engine,
/// Pattern tested against a URL by `SiteProvider::matches`.
url_regex: Regex,
}
impl WasmProvider {
    /// Builds a provider from a core WASM module stored in a file.
    ///
    /// # Errors
    /// Returns an error when the engine cannot be created, the file cannot be
    /// read, the bytes do not compile as a module, or `url_pattern` is not a
    /// valid regular expression.
    pub fn from_file(name: &str, wasm_path: &Path, url_pattern: &str) -> Result<Self> {
        let engine = build_engine()?;
        let bytes = std::fs::read(wasm_path)
            .map_err(|e| anyhow::anyhow!("cannot read WASM file {}: {e}", wasm_path.display()))?;
        Self::compile(name, &engine, &bytes, url_pattern)
    }

    /// Builds a provider from in-memory module bytes.
    ///
    /// # Errors
    /// Same failure modes as [`Self::from_file`], minus the file read.
    pub fn from_bytes(name: &str, wasm_bytes: &[u8], url_pattern: &str) -> Result<Self> {
        let engine = build_engine()?;
        Self::compile(name, &engine, wasm_bytes, url_pattern)
    }

    /// Compiles the module, then the URL pattern, and assembles the provider.
    fn compile(name: &str, engine: &Engine, bytes: &[u8], url_pattern: &str) -> Result<Self> {
        let module = Module::new(engine, bytes)
            .map_err(|e| anyhow::anyhow!("failed to compile WASM module for '{name}': {e}"))?;
        let url_regex = Regex::new(url_pattern).map_err(|e| {
            anyhow::anyhow!("invalid URL pattern '{url_pattern}' for provider '{name}': {e}")
        })?;
        // `SiteProvider::name` returns `&'static str`, so the name is leaked on
        // purpose. This happens only after both fallible steps have succeeded.
        let static_name: &'static str = Box::leak(name.to_owned().into_boxed_str());
        Ok(Self {
            static_name,
            module,
            engine: engine.clone(),
            url_regex,
        })
    }

    /// Converts a host buffer length into the `i32` the raw ABI uses for lengths.
    ///
    /// A length above `i32::MAX` is rejected instead of being silently
    /// truncated or wrapped, which would make the guest read the wrong number
    /// of bytes.
    fn guest_len(len: usize, what: &str) -> Result<i32> {
        i32::try_from(len).map_err(|_| {
            anyhow::anyhow!("{what} is {len} bytes, which does not fit the raw ABI's i32 length")
        })
    }

    /// Reinterprets an `i32` returned by the guest as an unsigned linear-memory offset.
    ///
    /// The guest's pointers travel through the typed-func signature as `i32`.
    /// A plain `as usize` would sign-extend values with the high bit set, so
    /// the bits are reinterpreted as `u32` first.
    fn guest_offset(ptr: i32) -> Result<usize> {
        let address = u32::from_ne_bytes(ptr.to_ne_bytes());
        usize::try_from(address)
            .map_err(|_| anyhow::anyhow!("guest pointer {address} does not fit in host usize"))
    }

    /// Runs the guest's `extract` export over `html` and `url`.
    ///
    /// The guest must export `alloc(len: i32) -> i32`,
    /// `extract(html_ptr, html_len, url_ptr, url_len) -> i32` and a `memory`.
    /// Both inputs are copied into buffers obtained from `alloc`. The value
    /// returned by `extract` is read as a pointer to a NUL-terminated JSON
    /// document that deserialises into [`WasmArticle`]; `0` means failure.
    ///
    /// # Errors
    /// Fails when an input is too large for an `i32` length, when
    /// instantiation or a required export is missing, when a guest call traps
    /// (including fuel exhaustion), when a pointer is out of bounds, when the
    /// guest returns `0`, or when the returned bytes are not valid JSON.
    fn run_guest(&self, html: &[u8], url: &str) -> Result<WasmArticle> {
        let url_bytes = url.as_bytes();
        // Validate lengths before paying for instantiation.
        let html_len = Self::guest_len(html.len(), "HTML")?;
        let url_len = Self::guest_len(url_bytes.len(), "URL")?;

        let mut store = build_store(&self.engine)?;
        let linker: Linker<StoreData> = Linker::new(&self.engine);
        let instance = linker
            .instantiate(&mut store, &self.module)
            .map_err(|e| anyhow::anyhow!("failed to instantiate WASM module: {e}"))?;
        let alloc = instance
            .get_typed_func::<i32, i32>(&mut store, "alloc")
            .map_err(|e| anyhow::anyhow!("WASM guest must export 'alloc(len: i32) -> i32': {e}"))?;
        let extract_fn = instance
            .get_typed_func::<(i32, i32, i32, i32), i32>(&mut store, "extract")
            .map_err(|e| {
                anyhow::anyhow!(
                    "WASM guest must export 'extract(html_ptr, html_len, url_ptr, url_len) -> i32': {e}"
                )
            })?;
        let memory = instance
            .get_memory(&mut store, "memory")
            .ok_or_else(|| anyhow::anyhow!("WASM guest must export a 'memory'"))?;

        let html_ptr = alloc
            .call(&mut store, html_len)
            .map_err(|e| anyhow::anyhow!("guest alloc() for HTML failed: {e}"))?;
        write_guest_memory(&memory, &mut store, Self::guest_offset(html_ptr)?, html)?;

        let url_ptr = alloc
            .call(&mut store, url_len)
            .map_err(|e| anyhow::anyhow!("guest alloc() for URL failed: {e}"))?;
        write_guest_memory(&memory, &mut store, Self::guest_offset(url_ptr)?, url_bytes)?;

        let result_ptr = extract_fn
            .call(&mut store, (html_ptr, html_len, url_ptr, url_len))
            .map_err(|e| anyhow::anyhow!("WASM extract() call failed: {e}"))?;
        if result_ptr == 0 {
            bail!("WASM guest returned null pointer — extraction failed");
        }

        let json_bytes =
            read_guest_cstring(&memory, &mut store, Self::guest_offset(result_ptr)?)?;
        serde_json::from_slice(&json_bytes).map_err(|e| {
            // Show the payload as text rather than a decimal byte list so the
            // diagnostic is readable; cap it at 200 bytes to bound the message.
            let preview = String::from_utf8_lossy(&json_bytes[..json_bytes.len().min(200)]);
            anyhow::anyhow!("guest returned invalid JSON ({e}): {preview:?}")
        })
    }
}
#[async_trait]
impl SiteProvider for WasmProvider {
fn name(&self) -> &'static str {
self.static_name
}
fn matches(&self, url: &str) -> bool {
self.url_regex.is_match(url)
}
async fn extract(
&self,
url: &str,
_client: &AcceleratedClient,
_cookies: Option<&str>,
prefetched_html: Option<&[u8]>,
) -> Result<SiteContent> {
let html = prefetched_html.ok_or_else(|| {
anyhow::anyhow!(
"WASM provider '{}' requires pre-fetched HTML but none was provided for {url}",
self.static_name
)
})?;
let article = self.run_guest(html, url).map_err(|e| {
anyhow::anyhow!("WASM provider '{}' failed for {url}: {e}", self.static_name)
})?;
Ok(article_to_site_content(
article,
url,
self.static_name,
"wasm",
))
}
}
pub fn load_provider(
name: &str,
wasm_bytes: &[u8],
url_pattern: &str,
) -> Result<Box<dyn SiteProvider>> {
match WitWasmProvider::from_bytes(name, wasm_bytes, url_pattern) {
Ok(p) => {
tracing::debug!("WASM provider '{name}': using Component Model ABI");
Ok(Box::new(p))
}
Err(wit_err) => {
tracing::debug!(
"WASM provider '{name}': Component Model failed ({wit_err}), trying raw ABI"
);
WasmProvider::from_bytes(name, wasm_bytes, url_pattern)
.map(|p| -> Box<dyn SiteProvider> { Box::new(p) })
.map_err(|raw_err| {
anyhow::anyhow!(
"WASM provider '{name}': both ABIs failed — \
component: {wit_err}; raw: {raw_err}"
)
})
}
}
}
pub fn load_provider_from_file(
name: &str,
wasm_path: &Path,
url_pattern: &str,
) -> Result<Box<dyn SiteProvider>> {
let bytes = std::fs::read(wasm_path)
.map_err(|e| anyhow::anyhow!("cannot read WASM file {}: {e}", wasm_path.display()))?;
load_provider(name, &bytes, url_pattern)
}
fn build_engine() -> Result<Engine> {
let mut config = wasmtime::Config::new();
config.consume_fuel(true);
config.wasm_component_model(true);
config.max_wasm_stack(512 * 1024); Engine::new(&config).map_err(|e| anyhow::anyhow!("failed to create wasmtime Engine: {e}"))
}
fn build_store(engine: &Engine) -> Result<Store<StoreData>> {
let data = StoreData {
limiter: StoreLimitsBuilder::new()
.memory_size(MEMORY_LIMIT_BYTES)
.build(),
};
let mut store = Store::new(engine, data);
store
.set_fuel(FUEL_LIMIT)
.map_err(|e| anyhow::anyhow!("failed to set fuel limit: {e}"))?;
store.limiter(|d| &mut d.limiter);
Ok(store)
}
/// Store factory used by the Component Model path; currently delegates
/// straight to `build_store`, so both ABIs get the same fuel and memory limits.
fn build_component_store(engine: &Engine) -> Result<Store<StoreData>> {
build_store(engine)
}
fn write_guest_memory(
memory: &wasmtime::Memory,
store: &mut Store<StoreData>,
offset: usize,
bytes: &[u8],
) -> Result<()> {
memory.write(store, offset, bytes).map_err(|e| {
anyhow::anyhow!(
"cannot write {}-byte slice to guest at offset {offset}: {e}",
bytes.len()
)
})
}
fn read_guest_cstring(
memory: &wasmtime::Memory,
store: &mut Store<StoreData>,
offset: usize,
) -> Result<Vec<u8>> {
let mem_slice = memory.data(store);
let available = mem_slice
.get(offset..)
.ok_or_else(|| anyhow::anyhow!("guest result pointer {offset} is out of memory bounds"))?;
let nul_pos = available.iter().position(|&b| b == 0).ok_or_else(|| {
anyhow::anyhow!("no NUL terminator found in guest result starting at {offset}")
})?;
Ok(available[..nul_pos].to_vec())
}
fn article_to_site_content(
article: WasmArticle,
url: &str,
provider_name: &'static str,
platform_prefix: &str,
) -> SiteContent {
let markdown = article.content.unwrap_or_default();
let metadata = SiteMetadata {
title: article.title,
author: article.author,
published: article.date,
platform: format!("{platform_prefix}:{provider_name}"),
canonical_url: article.canonical_url.unwrap_or_else(|| url.to_string()),
media_urls: Vec::new(),
engagement: None,
};
SiteContent { markdown, metadata }
}
// Unit tests: raw-ABI provider construction and execution, the loader's
// Component-Model-to-raw-ABI fallback, and the article conversion helper.
#[cfg(test)]
mod tests {
use super::*;
// Builds a core module whose `extract` stores a fixed JSON document, byte by
// byte, at linear-memory offset 256, appends a NUL terminator and returns 256.
// `alloc` always returns 0, so the host's input writes land at offset 0.
fn minimal_wasm_bytes() -> Vec<u8> {
let json = br#"{"title":"Test Title","content":"Hello World","author":"Alice","date":"2026-01-01"}"#;
assert!(json.len() < 200, "JSON too long for test WAT module");
let mut stores = String::new();
// One `i32.store8` per JSON byte.
for (i, &b) in json.iter().enumerate() {
stores.push_str(&format!(
"i32.const {}\ni32.const {}\ni32.store8\n",
256 + i,
b
));
}
// Trailing NUL so the host can find the end of the string.
stores.push_str(&format!(
"i32.const {}\ni32.const 0\ni32.store8\n",
256 + json.len()
));
let wat = format!(
r#"(module
(memory (export "memory") 1)
(func (export "alloc") (param i32) (result i32)
i32.const 0)
(func (export "extract") (param i32 i32 i32 i32) (result i32)
{stores}
i32.const 256)
)"#
);
wat::parse_str(&wat).expect("valid WAT")
}
// Module whose `extract` always returns 0, the raw ABI's failure value.
fn failing_wasm_bytes() -> Vec<u8> {
wat::parse_str(
r#"(module
(memory (export "memory") 1)
(func (export "alloc") (param i32) (result i32) i32.const 0)
(func (export "extract") (param i32 i32 i32 i32) (result i32) i32.const 0)
)"#,
)
.expect("valid WAT")
}
// Module whose `extract` never terminates; used to exercise the fuel limit.
fn infinite_loop_wasm_bytes() -> Vec<u8> {
wat::parse_str(
r#"(module
(memory (export "memory") 1)
(func (export "alloc") (param i32) (result i32) i32.const 0)
(func (export "extract") (param i32 i32 i32 i32) (result i32)
(loop $loop (br $loop))
i32.const 0)
)"#,
)
.expect("valid WAT")
}
// --- WasmProvider construction ---
#[test]
fn from_bytes_builds_provider_with_valid_input() {
let wasm = minimal_wasm_bytes();
let provider = WasmProvider::from_bytes("test", &wasm, r"example\.com");
assert!(provider.is_ok());
assert_eq!(provider.unwrap().name(), "test");
}
#[test]
fn from_bytes_rejects_invalid_url_pattern() {
let wasm = minimal_wasm_bytes();
assert!(WasmProvider::from_bytes("test", &wasm, r"[invalid").is_err());
}
#[test]
fn from_bytes_rejects_invalid_wasm() {
assert!(WasmProvider::from_bytes("test", b"not wasm at all", r"example\.com").is_err());
}
// --- URL matching ---
#[test]
fn matches_url_satisfying_pattern() {
let p = WasmProvider::from_bytes("t", &minimal_wasm_bytes(), r"example\.com").unwrap();
assert!(p.matches("https://example.com/article/1"));
}
#[test]
fn does_not_match_url_outside_pattern() {
let p = WasmProvider::from_bytes("t", &minimal_wasm_bytes(), r"example\.com").unwrap();
assert!(!p.matches("https://other.com/article/1"));
}
// --- run_guest ---
#[test]
fn run_guest_returns_article_from_valid_module() {
let p = WasmProvider::from_bytes("t", &minimal_wasm_bytes(), r"example\.com").unwrap();
let article = p
.run_guest(b"<html></html>", "https://example.com")
.unwrap();
assert_eq!(article.title.as_deref(), Some("Test Title"));
assert_eq!(article.content.as_deref(), Some("Hello World"));
assert_eq!(article.author.as_deref(), Some("Alice"));
}
#[test]
fn run_guest_returns_error_when_extract_returns_null() {
let p = WasmProvider::from_bytes("t", &failing_wasm_bytes(), r"example\.com").unwrap();
let result = p.run_guest(b"<html></html>", "https://example.com");
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("null pointer"));
}
// Only asserts that an error is returned; the error text is not inspected.
#[test]
fn run_guest_returns_error_when_fuel_exhausted() {
let p =
WasmProvider::from_bytes("t", &infinite_loop_wasm_bytes(), r"example\.com").unwrap();
let result = p.run_guest(b"<html></html>", "https://example.com");
assert!(result.is_err());
}
// --- SiteProvider::extract on WasmProvider ---
#[tokio::test]
async fn extract_returns_error_without_prefetched_html() {
let p = WasmProvider::from_bytes("t", &minimal_wasm_bytes(), r"example\.com").unwrap();
let client = AcceleratedClient::new().expect("client");
let result = p
.extract("https://example.com/article", &client, None, None)
.await;
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("pre-fetched HTML"));
}
#[tokio::test]
async fn extract_produces_site_content_from_wasm_guest() {
let p = WasmProvider::from_bytes("t", &minimal_wasm_bytes(), r"example\.com").unwrap();
let client = AcceleratedClient::new().expect("client");
let html = b"<html><body><article>Hello</article></body></html>";
let result = p
.extract("https://example.com/article", &client, None, Some(html))
.await
.expect("extract should succeed");
assert_eq!(result.markdown, "Hello World");
assert_eq!(result.metadata.title.as_deref(), Some("Test Title"));
assert_eq!(result.metadata.author.as_deref(), Some("Alice"));
assert_eq!(result.metadata.platform, "wasm:t");
}
#[tokio::test]
async fn extract_uses_request_url_as_canonical_when_guest_omits_it() {
let p = WasmProvider::from_bytes("t", &minimal_wasm_bytes(), r"example\.com").unwrap();
let client = AcceleratedClient::new().expect("client");
let result = p
.extract(
"https://example.com/page",
&client,
None,
Some(b"<html></html>"),
)
.await
.expect("extract should succeed");
assert_eq!(result.metadata.canonical_url, "https://example.com/page");
}
// --- WasmArticle deserialisation ---
#[test]
fn wasm_article_deserialises_all_fields() {
let json = r#"{"title":"T","content":"C","author":"A","date":"D","canonical_url":"U"}"#;
let a: WasmArticle = serde_json::from_str(json).unwrap();
assert_eq!(a.title.as_deref(), Some("T"));
assert_eq!(a.content.as_deref(), Some("C"));
assert_eq!(a.author.as_deref(), Some("A"));
assert_eq!(a.date.as_deref(), Some("D"));
assert_eq!(a.canonical_url.as_deref(), Some("U"));
}
#[test]
fn wasm_article_all_fields_optional() {
let a: WasmArticle = serde_json::from_str("{}").unwrap();
assert!(a.title.is_none());
assert!(a.content.is_none());
}
// --- Engine / store construction ---
#[test]
fn build_engine_succeeds() {
assert!(build_engine().is_ok());
}
// Only checks that construction succeeds; fuel and limiter values are not inspected.
#[test]
fn build_store_has_fuel_and_limiter() {
let engine = build_engine().unwrap();
let store = build_store(&engine);
assert!(store.is_ok());
}
// --- WitWasmProvider construction ---
#[test]
fn wit_provider_from_bytes_rejects_invalid_bytes() {
assert!(WitWasmProvider::from_bytes("t", b"not a component", r"example\.com").is_err());
}
#[test]
fn wit_provider_from_bytes_rejects_plain_module() {
let module_bytes = minimal_wasm_bytes();
assert!(
WitWasmProvider::from_bytes("t", &module_bytes, r"example\.com").is_err(),
"plain module bytes must not be accepted as a Component"
);
}
// The bytes are also invalid here, so component compilation (which runs
// before the regex is parsed) is what actually fails.
#[test]
fn wit_provider_from_bytes_rejects_invalid_url_pattern() {
assert!(WitWasmProvider::from_bytes("t", b"junk", r"[bad regex").is_err());
}
// --- load_provider fallback ---
#[test]
fn load_provider_falls_back_to_raw_abi_for_plain_module() {
let bytes = minimal_wasm_bytes();
let result = load_provider("fallback", &bytes, r"example\.com");
assert!(result.is_ok(), "fallback to raw ABI should succeed");
assert_eq!(result.unwrap().name(), "fallback");
}
#[test]
fn load_provider_returns_error_for_garbage_bytes() {
assert!(load_provider("bad", b"garbage", r"example\.com").is_err());
}
#[test]
fn load_provider_returns_error_for_invalid_url_pattern() {
let bytes = minimal_wasm_bytes();
assert!(load_provider("bad-regex", &bytes, r"[broken").is_err());
}
// --- article_to_site_content ---
#[test]
fn article_to_site_content_uses_platform_prefix() {
let article = WasmArticle {
content: Some("body".to_string()),
title: Some("T".to_string()),
..Default::default()
};
let content = article_to_site_content(article, "https://example.com", "myprov", "wit");
assert_eq!(content.metadata.platform, "wit:myprov");
assert_eq!(content.markdown, "body");
}
#[test]
fn article_to_site_content_falls_back_to_url_for_canonical() {
let article = WasmArticle {
content: Some("x".to_string()),
..Default::default()
};
let content = article_to_site_content(article, "https://example.com/pg", "p", "wasm");
assert_eq!(content.metadata.canonical_url, "https://example.com/pg");
}
#[test]
fn article_to_site_content_prefers_canonical_url_when_present() {
let article = WasmArticle {
content: Some("x".to_string()),
canonical_url: Some("https://canonical.example.com/pg".to_string()),
..Default::default()
};
let content = article_to_site_content(article, "https://other.com/pg", "p", "wasm");
assert_eq!(
content.metadata.canonical_url,
"https://canonical.example.com/pg"
);
}
// NOTE(review): placeholder — the body asserts nothing, so this test always
// passes and gives no coverage of `WitWasmProvider::extract`. Exercising it
// presumably needs a real component fixture; TODO add one or mark the test
// `#[ignore]` until it exists.
#[tokio::test]
async fn wit_extract_returns_error_without_prefetched_html() {
let _ = (); }
}