#[cfg(feature = "agent")]
pub use spider_agent::automation::{
best_effort_parse_json_object,
categories,
clean_html,
clean_html_base,
clean_html_full,
clean_html_raw,
clean_html_slim,
clean_html_with_profile,
clean_html_with_profile_and_intent,
extract_assistant_content,
extract_last_code_block,
extract_last_json_array,
extract_last_json_boundaries,
extract_last_json_object,
extract_usage,
fnv1a64,
is_url_allowed,
merged_config,
smart_clean_html,
truncate_utf8_tail,
ActResult,
ActionRecord,
ActionResult,
ActionType,
AutomationConfig,
AutomationMemory,
AutomationResult,
AutomationUsage,
CaptureProfile,
ChainBuilder,
ChainCondition,
ChainContext,
ChainResult,
ChainStep,
ChainStepResult,
CleaningIntent,
ClipViewport,
ContentAnalysis,
CostTier,
DiscoveredUrl,
EngineError,
EngineResult,
ExtractionSchema,
FormField,
FormInfo,
HtmlCleaningProfile,
InteractiveElement,
MapResult,
MemoryOperation,
ModelEndpoint,
ModelPolicy,
NavigationOption,
PageObservation,
PromptUrlGate,
RecoveryStrategy,
RemoteMultimodalConfig,
RemoteMultimodalConfigs,
RemoteMultimodalEngine,
RetryPolicy,
SelectorCache,
SelectorCacheEntry,
StructuredOutputConfig,
VisionRouteMode,
ACT_SYSTEM_PROMPT,
CHROME_AI_SYSTEM_PROMPT,
CONFIGURATION_SYSTEM_PROMPT,
DEFAULT_SYSTEM_PROMPT,
EXTRACTION_ONLY_SYSTEM_PROMPT,
EXTRACT_SYSTEM_PROMPT,
MAP_SYSTEM_PROMPT,
OBSERVE_SYSTEM_PROMPT,
};
#[cfg(feature = "agent_skills")]
pub use spider_agent::automation::skills;
#[cfg(feature = "agent")]
pub use spider_agent::automation::cache::{CacheStats, CacheValue, SmartCache};
#[cfg(feature = "agent")]
pub use spider_agent::automation::executor::{BatchExecutor, ChainExecutor, PrefetchManager};
#[cfg(feature = "agent")]
pub use spider_agent::automation::router::{
auto_policy, ModelRouter, RoutingDecision, TaskAnalysis, TaskCategory,
};
#[cfg(all(feature = "agent", feature = "agent_chrome"))]
pub use spider_agent::automation::run_remote_multimodal_with_page;
#[cfg(all(feature = "agent", feature = "agent_chrome"))]
pub use spider_agent::automation::{
run_spawn_pages_concurrent, run_spawn_pages_with_factory, run_spawn_pages_with_options,
PageFactory, PageSetupFn, SpawnPageOptions, SpawnedPageResult,
};
#[cfg(feature = "chrome")]
use chromiumoxide::Page;
#[cfg(not(feature = "agent"))]
pub use spider_agent_types::{AutomationResult, AutomationUsage};
/// Fallback error type used when the `agent` feature is disabled; mirrors the
/// shape of the `spider_agent` engine error so downstream signatures compile
/// the same way in both builds.
#[cfg(not(feature = "agent"))]
#[derive(Debug)]
pub enum EngineError {
    /// Transport-level failure from `reqwest`.
    Http(reqwest::Error),
    /// JSON (de)serialization failure; only present with the `serde` feature.
    #[cfg(feature = "serde")]
    Json(serde_json::Error),
    /// A required field was absent from a remote response.
    MissingField(&'static str),
    /// A field was present but malformed.
    InvalidField(&'static str),
    /// Error text reported by the remote model/service.
    Remote(String),
    /// The requested operation is unavailable in this build configuration.
    Unsupported(&'static str),
}
#[cfg(not(feature = "agent"))]
impl std::fmt::Display for EngineError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
EngineError::Http(e) => write!(f, "http error: {e}"),
#[cfg(feature = "serde")]
EngineError::Json(e) => write!(f, "json error: {e}"),
EngineError::MissingField(s) => write!(f, "missing field: {s}"),
EngineError::InvalidField(s) => write!(f, "invalid field: {s}"),
EngineError::Remote(s) => write!(f, "remote error: {s}"),
EngineError::Unsupported(s) => write!(f, "unsupported: {s}"),
}
}
}
// Marker impl: EngineError reports via `Display`/`Debug`; no source chaining.
#[cfg(not(feature = "agent"))]
impl std::error::Error for EngineError {}
/// Allow `?` on `reqwest` calls inside the fallback engine functions.
#[cfg(not(feature = "agent"))]
impl From<reqwest::Error> for EngineError {
    fn from(e: reqwest::Error) -> Self {
        EngineError::Http(e)
    }
}
/// Allow `?` on `serde_json` calls; only compiled when `serde` is enabled
/// (the `Json` variant itself is feature-gated).
#[cfg(all(not(feature = "agent"), feature = "serde"))]
impl From<serde_json::Error> for EngineError {
    fn from(e: serde_json::Error) -> Self {
        EngineError::Json(e)
    }
}
/// Convenience alias matching the `spider_agent` engine result type.
#[cfg(not(feature = "agent"))]
pub type EngineResult<T> = Result<T, EngineError>;
/// Minimal stand-in for the remote multimodal configuration, used when the
/// `agent` feature is disabled. Field layout mirrors the serialized form of
/// the full agent config so persisted configs round-trip across builds.
#[cfg(not(feature = "agent"))]
#[derive(Debug, Clone, Default, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct RemoteMultimodalConfigs {
    /// Base URL of the inference endpoint.
    pub api_url: String,
    /// Optional bearer token for the endpoint.
    pub api_key: Option<String>,
    /// Model identifier sent with each request.
    pub model_name: String,
    /// Force on-device Chrome AI instead of the remote endpoint.
    #[cfg_attr(feature = "serde", serde(default))]
    pub use_chrome_ai: bool,
    /// Optional proxy list for outbound requests.
    #[cfg_attr(
        feature = "serde",
        serde(default, skip_serializing_if = "Option::is_none")
    )]
    pub proxies: Option<Vec<String>>,
}
#[cfg(not(feature = "agent"))]
impl RemoteMultimodalConfigs {
    /// Build a config pointing at `api_url` using `model_name`; all other
    /// fields start at their defaults.
    pub fn new(api_url: impl Into<String>, model_name: impl Into<String>) -> Self {
        Self {
            model_name: model_name.into(),
            api_url: api_url.into(),
            ..Self::default()
        }
    }
    /// Builder: attach (or clear) the API key.
    pub fn with_api_key(mut self, api_key: Option<impl Into<String>>) -> Self {
        self.api_key = match api_key {
            Some(key) => Some(key.into()),
            None => None,
        };
        self
    }
    /// Builder: attach (or clear) the proxy list.
    pub fn with_proxies(mut self, proxies: Option<Vec<String>>) -> Self {
        self.proxies = proxies;
        self
    }
    /// True when Chrome AI should be used: either explicitly requested, or
    /// no remote endpoint and no credentials were configured at all.
    pub fn should_use_chrome_ai(&self) -> bool {
        if self.use_chrome_ai {
            return true;
        }
        self.api_url.is_empty() && self.api_key.is_none()
    }
    /// The fallback build never applies an automation timeout override.
    pub fn automation_timeout(&self) -> Option<std::time::Duration> {
        None
    }
}
/// Crawler configuration produced by an LLM from a natural-language prompt
/// (see `configure_crawler_from_prompt`). Every field is optional so the
/// model only has to emit settings the prompt actually implies.
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct PromptConfiguration {
    /// Honor robots.txt directives.
    pub respect_robots_txt: Option<bool>,
    /// Crawl subdomains of the start host.
    pub subdomains: Option<bool>,
    /// Crawl across TLD variants of the host.
    pub tld: Option<bool>,
    /// Maximum crawl depth.
    pub depth: Option<usize>,
    /// Delay between requests (assumed milliseconds — TODO confirm units
    /// against the crate's `Configuration::delay`).
    pub delay: Option<u64>,
    /// Per-request timeout in milliseconds.
    pub request_timeout_ms: Option<u64>,
    /// Overall crawl timeout in milliseconds.
    pub crawl_timeout_ms: Option<u64>,
    /// URL patterns to exclude.
    pub blacklist_url: Option<Vec<String>>,
    /// URL patterns to restrict the crawl to.
    pub whitelist_url: Option<Vec<String>>,
    /// Extra external domains allowed during the crawl.
    pub external_domains: Option<Vec<String>>,
    /// User-agent header override.
    pub user_agent: Option<String>,
    /// Extra request headers (name -> value).
    pub headers: Option<std::collections::HashMap<String, String>>,
    /// Assume HTTP/2 without upgrade negotiation.
    pub http2_prior_knowledge: Option<bool>,
    /// Accept invalid TLS certificates.
    pub accept_invalid_certs: Option<bool>,
    /// Proxy URLs to route requests through.
    pub proxies: Option<Vec<String>>,
    /// Maximum redirects to follow.
    pub redirect_limit: Option<usize>,
    /// Crawl budget map — presumably path-prefix -> max pages; verify
    /// against the crate's budget configuration.
    pub budget: Option<std::collections::HashMap<String, u32>>,
    /// Maximum page size in bytes.
    pub max_page_bytes: Option<f64>,
    /// Download all resources, not just documents.
    pub full_resources: Option<bool>,
    /// Restrict to HTML responses.
    pub only_html: Option<bool>,
    /// Include discovered links in page output.
    pub return_page_links: Option<bool>,
    /// Render pages with headless Chrome.
    pub use_chrome: Option<bool>,
    /// Stealth preset name for Chrome — accepted values are defined by the
    /// crate's stealth configuration (not visible here).
    pub stealth_mode: Option<String>,
    /// Chrome viewport width in pixels.
    pub viewport_width: Option<u32>,
    /// Chrome viewport height in pixels.
    pub viewport_height: Option<u32>,
    /// Wait for network idle before capture.
    pub wait_for_idle_network: Option<bool>,
    /// Fixed wait after navigation, in milliseconds.
    pub wait_for_delay_ms: Option<u64>,
    /// CSS selector to wait for before capture.
    pub wait_for_selector: Option<String>,
    /// JavaScript injected on every new document.
    pub evaluate_on_new_document: Option<String>,
    /// Share one crawl queue across workers.
    pub shared_queue: Option<bool>,
    /// Retry count for failed requests.
    pub retry: Option<u8>,
}
/// Ask the remote model at `api_url` to translate a natural-language `prompt`
/// into a [`PromptConfiguration`], using an OpenAI-compatible chat-completions
/// request with JSON-object response mode.
///
/// # Errors
/// Returns `EngineError::Http` on transport failure, `EngineError::Remote`
/// on non-success status or unparsable payloads, and
/// `EngineError::MissingField` when the response carries no assistant content.
#[cfg(all(feature = "agent", feature = "serde"))]
pub async fn configure_crawler_from_prompt(
    api_url: &str,
    model_name: &str,
    api_key: Option<&str>,
    prompt: &str,
) -> EngineResult<PromptConfiguration> {
    use serde::Serialize;
    // Minimal request-side mirror of the chat-completions wire schema.
    #[derive(Serialize)]
    struct Message {
        role: String,
        content: String,
    }
    #[derive(Serialize)]
    struct ResponseFormat {
        #[serde(rename = "type")]
        format_type: String,
    }
    #[derive(Serialize)]
    struct InferenceRequest {
        model: String,
        messages: Vec<Message>,
        temperature: f32,
        max_tokens: u16,
        #[serde(skip_serializing_if = "Option::is_none")]
        response_format: Option<ResponseFormat>,
    }
    // One shared HTTP client so connections are reused across calls.
    static CLIENT: std::sync::LazyLock<reqwest::Client> =
        std::sync::LazyLock::new(reqwest::Client::new);
    let request_body = InferenceRequest {
        model: model_name.to_string(),
        messages: vec![
            Message {
                role: "system".into(),
                content: CONFIGURATION_SYSTEM_PROMPT.to_string(),
            },
            Message {
                role: "user".into(),
                content: format!(
                    "Configure a web crawler for the following requirements:\n\n{}",
                    prompt
                ),
            },
        ],
        // Low temperature: configuration output should be near-deterministic.
        temperature: 0.1,
        max_tokens: 2048,
        // Request strict JSON so the parsing below is reliable.
        response_format: Some(ResponseFormat {
            format_type: "json_object".into(),
        }),
    };
    let mut req = CLIENT.post(api_url).json(&request_body);
    if let Some(key) = api_key {
        req = req.bearer_auth(key);
    }
    let http_resp = req.send().await.map_err(EngineError::Http)?;
    // Capture status before consuming the body so error payloads are reportable.
    let status = http_resp.status();
    let raw_body = http_resp.text().await.map_err(EngineError::Http)?;
    if !status.is_success() {
        return Err(EngineError::Remote(format!(
            "non-success status {status}: {raw_body}"
        )));
    }
    let root: serde_json::Value = serde_json::from_str(&raw_body)
        .map_err(|e| EngineError::Remote(format!("JSON parse error: {e}")))?;
    let content = extract_assistant_content(&root)
        .ok_or(EngineError::MissingField("choices[0].message.content"))?;
    // Tolerate models that wrap the JSON in prose or code fences.
    let config_value = best_effort_parse_json_object(&content)?;
    let config: PromptConfiguration = serde_json::from_value(config_value)
        .map_err(|e| EngineError::Remote(format!("Failed to parse configuration: {e}")))?;
    Ok(config)
}
/// Run page-level multimodal automation when a config is present.
///
/// Returns `Ok(None)` when no configuration was supplied. Pages the model
/// judged irrelevant bump the shared relevance-credit counter.
#[cfg(all(feature = "agent", feature = "agent_chrome"))]
pub async fn run_remote_multimodal_if_enabled(
    cfgs: &Option<Box<RemoteMultimodalConfigs>>,
    page: &Page,
    url: &str,
) -> EngineResult<Option<AutomationResult>> {
    let cfgs = match cfgs.as_deref() {
        Some(c) => c,
        None => return Ok(None),
    };
    let result = run_remote_multimodal_with_page(cfgs, page, url).await?;
    if result.relevant == Some(false) {
        // NOTE(review): counter semantics live in spider_agent — presumably a
        // budget for irrelevant-page spend; confirm against its definition.
        cfgs.relevance_credits
            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    }
    Ok(Some(result))
}
/// HTML-only (no live page) extraction path for the remote multimodal engine.
///
/// Returns `Ok(None)` when no config is present or AI extraction
/// (`extra_ai_data`) is disabled. URLs rejected by the configured prompt gate
/// short-circuit with a successful zero-step "url_not_allowed" result instead
/// of calling the model.
#[cfg(all(feature = "agent", feature = "serde"))]
pub async fn run_remote_multimodal_extraction(
    cfgs: &Option<Box<RemoteMultimodalConfigs>>,
    html: &str,
    url: &str,
    title: Option<&str>,
) -> EngineResult<Option<AutomationResult>> {
    let cfgs = match cfgs.as_deref() {
        Some(c) => c,
        None => return Ok(None),
    };
    // Extraction only runs when explicitly enabled in the automation config.
    if !cfgs.cfg.extra_ai_data {
        return Ok(None);
    }
    if let Some(gate) = &cfgs.prompt_url_gate {
        if !gate.is_allowed(url) {
            // Gated out: report success with zero work so callers don't retry.
            return Ok(Some(AutomationResult {
                label: "url_not_allowed".into(),
                steps_executed: 0,
                success: true,
                error: None,
                usage: AutomationUsage::default(),
                extracted: None,
                screenshot: None,
                spawn_pages: Vec::new(),
                relevant: None,
                reasoning: None,
            }));
        }
    }
    // Shared semaphore bounds concurrent model calls across the crawl.
    let sem = cfgs.get_or_init_semaphore();
    let mut engine = RemoteMultimodalEngine::new(
        cfgs.api_url.clone(),
        cfgs.model_name.clone(),
        cfgs.system_prompt.clone(),
    )
    .with_api_key(cfgs.api_key.as_deref());
    engine.with_system_prompt_extra(cfgs.system_prompt_extra.as_deref());
    engine.with_user_message_extra(cfgs.user_message_extra.as_deref());
    engine.with_remote_multimodal_config(cfgs.cfg.clone());
    engine.with_prompt_url_gate(cfgs.prompt_url_gate.clone());
    engine.with_semaphore(sem);
    let result = engine.extract_from_html(html, url, title).await?;
    if result.relevant == Some(false) {
        // Same relevance accounting as the live-page path.
        cfgs.relevance_credits
            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    }
    Ok(Some(result))
}
/// Filter a candidate URL set down to the relevance-gated subset, using the
/// remote model as a batch URL classifier.
///
/// Classification results are memoized in `cfgs.url_prefilter_cache`, keyed
/// by the normalized origin+path(+query) of each URL (see `url_to_cache_key`).
/// On classifier error the whole batch is assumed relevant (fail-open) so an
/// outage never stalls the crawl.
#[cfg(all(feature = "agent", feature = "serde"))]
pub(crate) async fn prefilter_urls(
    cfgs: &RemoteMultimodalConfigs,
    urls: &hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>,
) -> hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString> {
    use case_insensitive_string::CaseInsensitiveString;
    // Prefiltering only applies when both gates are enabled and there is work.
    if !cfgs.cfg.url_prefilter || !cfgs.cfg.relevance_gate || urls.is_empty() {
        return urls.clone();
    }
    let batch_size = cfgs.cfg.url_prefilter_batch_size;
    let max_tokens = cfgs.cfg.url_prefilter_max_tokens;
    let mut relevant_set: hashbrown::HashSet<CaseInsensitiveString> =
        hashbrown::HashSet::with_capacity(urls.len());
    let mut uncached: Vec<CaseInsensitiveString> = Vec::new();
    // Serve cache hits first; only unknown URLs are sent to the model.
    for url in urls {
        let path = url_to_cache_key(url.inner().as_str());
        match cfgs.url_prefilter_cache.get(&path).map(|v| *v.value()) {
            Some(true) => {
                relevant_set.insert(url.clone());
            }
            // Cached as irrelevant: drop silently.
            Some(false) => {}
            None => {
                uncached.push(url.clone());
            }
        }
    }
    if uncached.is_empty() {
        return relevant_set;
    }
    // Shared semaphore bounds concurrent model calls across the crawl.
    let sem = cfgs.get_or_init_semaphore();
    let mut engine = RemoteMultimodalEngine::new(
        cfgs.api_url.clone(),
        cfgs.model_name.clone(),
        cfgs.system_prompt.clone(),
    )
    .with_api_key(cfgs.api_key.as_deref());
    engine.with_semaphore(sem);
    engine.with_vision_model(cfgs.vision_model.clone());
    engine.with_text_model(cfgs.text_model.clone());
    engine.with_vision_route_mode(cfgs.vision_route_mode);
    for batch in uncached.chunks(batch_size) {
        let url_strs: Vec<&str> = batch.iter().map(|u| u.inner().as_str()).collect();
        let classifications = match engine
            .classify_urls(
                &url_strs,
                cfgs.cfg.relevance_prompt.as_deref(),
                cfgs.cfg.extraction_prompt.as_deref(),
                max_tokens,
            )
            .await
        {
            Ok(c) => c,
            Err(e) => {
                // Fail open: a classifier outage must not stall the crawl.
                log::warn!("url_prefilter: classify_urls error, assuming all relevant: {e}");
                relevant_set.extend(batch.iter().cloned());
                continue;
            }
        };
        // Bounded cache: wipe rather than evict to keep the hot path simple.
        if cfgs.url_prefilter_cache.len() >= 50_000 {
            cfgs.url_prefilter_cache.clear();
        }
        for (url, &is_relevant) in batch.iter().zip(classifications.iter()) {
            let path = url_to_cache_key(url.inner().as_str());
            cfgs.url_prefilter_cache.insert(path, is_relevant);
            if is_relevant {
                relevant_set.insert(url.clone());
            }
        }
        // Fix: if the model returned fewer classifications than batch URLs,
        // `zip` previously dropped the unclassified tail silently (neither
        // cached nor included). Treat it as relevant without caching,
        // consistent with the fail-open error path above.
        if classifications.len() < batch.len() {
            relevant_set.extend(batch[classifications.len()..].iter().cloned());
        }
    }
    relevant_set
}
/// Normalize a URL into a prefilter cache key: `scheme://host[:port]path[?query]`.
///
/// Fragments and userinfo are intentionally dropped so equivalent page
/// addresses share one cache entry; unparsable input falls back to the raw
/// string. The key is preallocated and the port is appended with `write!` to
/// avoid the intermediate `to_string()` allocation of the original version.
#[cfg(all(feature = "agent", feature = "serde"))]
fn url_to_cache_key(url: &str) -> String {
    use std::fmt::Write;
    if let Ok(parsed) = url::Url::parse(url) {
        // The key only drops components, so it never exceeds the input length.
        let mut key = String::with_capacity(url.len());
        key.push_str(parsed.scheme());
        key.push_str("://");
        key.push_str(parsed.host_str().unwrap_or_default());
        if let Some(port) = parsed.port() {
            // Writing to a String is infallible; ignore the Ok result.
            let _ = write!(key, ":{port}");
        }
        key.push_str(parsed.path());
        if let Some(query) = parsed.query() {
            key.push('?');
            key.push_str(query);
        }
        return key;
    }
    url.to_string()
}
/// Bridge from the agent's `AutomationResult` into the crate-level
/// `crate::page::AutomationResults` record stored on crawled pages.
#[cfg(feature = "agent")]
pub trait AutomationResultExt {
    /// Convert into the page-level results representation.
    fn to_automation_results(&self) -> crate::page::AutomationResults;
}
/// serde build: forwards the extracted JSON payload as-is.
#[cfg(all(feature = "agent", feature = "serde"))]
impl AutomationResultExt for AutomationResult {
    fn to_automation_results(&self) -> crate::page::AutomationResults {
        crate::page::AutomationResults {
            input: self.label.clone(),
            // Absent extraction becomes JSON null rather than an Option.
            content_output: self.extracted.clone().unwrap_or(serde_json::Value::Null),
            screenshot_output: self.screenshot.clone(),
            error: self.error.clone(),
            usage: Some(self.usage.clone()),
            relevant: self.relevant,
            // Zero steps is collapsed to None to keep output compact.
            steps_executed: (self.steps_executed > 0).then_some(self.steps_executed),
            // Reasoning is trimmed; whitespace-only text becomes None.
            reasoning: self
                .reasoning
                .as_ref()
                .map(|s| s.trim().to_string())
                .filter(|s| !s.is_empty()),
        }
    }
}
/// Non-serde build: there is no JSON value type available, so the content
/// output is an empty string.
#[cfg(all(feature = "agent", not(feature = "serde")))]
impl AutomationResultExt for AutomationResult {
    fn to_automation_results(&self) -> crate::page::AutomationResults {
        crate::page::AutomationResults {
            input: self.label.clone(),
            content_output: String::new(),
            screenshot_output: self.screenshot.clone(),
            error: self.error.clone(),
            usage: Some(self.usage.clone()),
            relevant: self.relevant,
            // Zero steps is collapsed to None to keep output compact.
            steps_executed: (self.steps_executed > 0).then_some(self.steps_executed),
            // Reasoning is trimmed; whitespace-only text becomes None.
            reasoning: self
                .reasoning
                .as_ref()
                .map(|s| s.trim().to_string())
                .filter(|s| !s.is_empty()),
        }
    }
}
/// Spawn one Chrome page per URL and run the remote multimodal automation on
/// each concurrently, returning one `SpawnedPageResult` per input URL.
///
/// Per-page automation settings (`max_rounds`, `screenshot`, extraction
/// prompt, extra user message) come from `options` layered over `mm_cfgs`.
/// When response tracking is enabled in `spider_config`, per-request network
/// byte counts are collected via a CDP `DataReceived` listener.
///
/// Fix: join failures previously reported `url: "unknown"`; the URL is now
/// kept alongside each task handle so every result is attributable.
#[cfg(all(feature = "agent", feature = "agent_chrome"))]
pub async fn process_spawn_pages_with_config(
    browser: &std::sync::Arc<chromiumoxide::browser::Browser>,
    browser_context_id: &Option<chromiumoxide::cdp::browser_protocol::browser::BrowserContextId>,
    urls: Vec<String>,
    mm_cfgs: &RemoteMultimodalConfigs,
    spider_config: &crate::configuration::Configuration,
    options: SpawnPageOptions,
) -> Vec<SpawnedPageResult> {
    use std::sync::Arc;
    if urls.is_empty() {
        return Vec::new();
    }
    // Shared, cheaply-clonable copies for the spawned tasks.
    let mm_cfgs = Arc::new(mm_cfgs.clone());
    let spider_config = Arc::new(spider_config.clone());
    let options = Arc::new(options);
    let browser_context_id = browser_context_id.clone();
    let mut handles = Vec::with_capacity(urls.len());
    let track_events = spider_config
        .track_events
        .as_ref()
        .map(|t| t.responses)
        .unwrap_or(false);
    for url in urls {
        let browser = browser.clone();
        let mm_cfgs = mm_cfgs.clone();
        let spider_config = spider_config.clone();
        let options = options.clone();
        let browser_context_id = browser_context_id.clone();
        let url_clone = url.clone();
        // Keep the URL paired with its handle so a JoinError can still be
        // attributed to the right URL below.
        handles.push((
            url,
            tokio::spawn(async move {
                use chromiumoxide::cdp::browser_protocol::network::EventDataReceived;
                use tokio_stream::StreamExt;
                // Open a fresh page/navigation for the target URL.
                let new_page = match crate::features::chrome::attempt_navigation(
                    &url_clone,
                    &browser,
                    &spider_config.request_timeout,
                    &browser_context_id,
                    &spider_config.viewport,
                )
                .await
                {
                    Ok(page) => page,
                    Err(e) => {
                        return SpawnedPageResult {
                            url: url_clone,
                            result: Err(format!("Failed to create page: {}", e)),
                            bytes_transferred: None,
                            response_map: None,
                        };
                    }
                };
                crate::features::chrome::setup_chrome_events(&new_page, &spider_config).await;
                // Optionally meter bandwidth via CDP data-received events.
                let bytes_listener = if track_events {
                    new_page.event_listener::<EventDataReceived>().await.ok()
                } else {
                    None
                };
                use std::sync::atomic::{AtomicU64, Ordering};
                let total_bytes: std::sync::Arc<AtomicU64> =
                    std::sync::Arc::new(AtomicU64::new(0));
                let response_map: std::sync::Arc<dashmap::DashMap<String, u64>> =
                    std::sync::Arc::new(dashmap::DashMap::new());
                let total_bytes_clone = total_bytes.clone();
                let response_map_clone = response_map.clone();
                let listener_handle = bytes_listener.map(|mut listener| {
                    tokio::spawn(async move {
                        while let Some(event) = listener.next().await {
                            let bytes = event.encoded_data_length as u64;
                            total_bytes_clone.fetch_add(bytes, Ordering::Relaxed);
                            response_map_clone
                                .entry(event.request_id.inner().clone())
                                .and_modify(|v| *v += bytes)
                                .or_insert(bytes);
                        }
                    })
                });
                // Layer per-spawn options over the base automation config.
                let mut page_cfgs = (*mm_cfgs).clone();
                let mut page_cfg = page_cfgs.cfg.clone();
                page_cfg.max_rounds = options.max_rounds.max(1);
                page_cfg.screenshot = options.screenshot;
                if let Some(ref prompt) = options.extraction_prompt {
                    page_cfg.extra_ai_data = true;
                    page_cfg.extraction_prompt = Some(prompt.clone());
                }
                page_cfgs.cfg = page_cfg;
                if let Some(ref msg) = options.user_message_extra {
                    page_cfgs.user_message_extra = Some(msg.clone());
                }
                let result = run_remote_multimodal_with_page(&page_cfgs, &new_page, &url_clone)
                    .await
                    .map_err(|e| format!("Automation failed: {}", e));
                // Stop metering before reading the counters.
                if let Some(handle) = listener_handle {
                    handle.abort();
                }
                let (bytes_transferred, response_map_out) = {
                    let bytes_val = total_bytes.load(Ordering::Relaxed);
                    let bytes = if bytes_val > 0 {
                        Some(bytes_val as f64)
                    } else {
                        None
                    };
                    let map = if !response_map.is_empty() {
                        Some(
                            response_map
                                .iter()
                                .map(|e| (e.key().clone(), *e.value() as f64))
                                .collect(),
                        )
                    } else {
                        None
                    };
                    (bytes, map)
                };
                SpawnedPageResult {
                    url: url_clone,
                    result,
                    bytes_transferred,
                    response_map: response_map_out,
                }
            }),
        ));
    }
    let mut results = Vec::with_capacity(handles.len());
    for (url, handle) in handles {
        match handle.await {
            Ok(spawn_result) => results.push(spawn_result),
            Err(e) => {
                // Task panicked or was cancelled; report it under its URL.
                results.push(SpawnedPageResult {
                    url,
                    result: Err(format!("Task failed: {}", e)),
                    bytes_transferred: None,
                    response_map: None,
                });
            }
        }
    }
    results
}
/// No-op stand-in when Chrome is enabled but the agent bridge is not; keeps
/// call sites compiling without running any automation.
#[cfg(all(feature = "chrome", not(feature = "agent_chrome")))]
pub async fn run_remote_multimodal_if_enabled(
    _cfgs: &Option<Box<RemoteMultimodalConfigs>>,
    _page: &Page,
    _url: &str,
) -> EngineResult<Option<AutomationResult>> {
    Ok(None)
}
/// Same bridge trait as the `agent` build, re-declared for the
/// chrome-without-agent configuration so call sites stay uniform.
#[cfg(all(feature = "chrome", not(feature = "agent")))]
pub trait AutomationResultExt {
    /// Convert into the page-level results representation.
    fn to_automation_results(&self) -> crate::page::AutomationResults;
}
/// chrome-without-agent build: maps the `spider_agent_types` stub result.
#[cfg(all(feature = "chrome", not(feature = "agent")))]
impl AutomationResultExt for AutomationResult {
    fn to_automation_results(&self) -> crate::page::AutomationResults {
        crate::page::AutomationResults {
            input: self.label.clone(),
            content_output: Default::default(),
            screenshot_output: self.screenshot.clone(),
            error: self.error.clone(),
            usage: Some(self.usage.clone()),
            // NOTE(review): hard-coded None while the agent/no-serde impl
            // forwards `self.relevant` — confirm whether the stub result type
            // lacks that field or this is an oversight.
            relevant: None,
            // Zero steps is collapsed to None to keep output compact.
            steps_executed: (self.steps_executed > 0).then_some(self.steps_executed),
            // Reasoning is trimmed; whitespace-only text becomes None.
            reasoning: self
                .reasoning
                .as_ref()
                .map(|s| s.trim().to_string())
                .filter(|s| !s.is_empty()),
        }
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for prompt configuration, spawn-page plumbing, and the
    //! URL prefilter cache behavior.
    use super::*;
    // Default PromptConfiguration must leave every setting unspecified.
    #[test]
    fn test_prompt_configuration_default() {
        let config = PromptConfiguration::default();
        assert!(config.respect_robots_txt.is_none());
        assert!(config.depth.is_none());
    }
    // Partial JSON deserializes thanks to `serde(default)` on the struct.
    #[cfg(feature = "serde")]
    #[test]
    fn test_prompt_configuration_serde() {
        let json = r#"{"respect_robots_txt": true, "depth": 5}"#;
        let config: PromptConfiguration = serde_json::from_str(json).unwrap();
        assert_eq!(config.respect_robots_txt, Some(true));
        assert_eq!(config.depth, Some(5));
    }
    // Spawn-page URLs attach via the builder and are preserved in order.
    #[cfg(feature = "agent")]
    #[test]
    fn test_automation_result_spawn_pages() {
        let result = AutomationResult::success("test", 1).with_spawn_pages(vec![
            "https://example.com/page1".to_string(),
            "https://example.com/page2".to_string(),
        ]);
        assert!(result.has_spawn_pages());
        assert_eq!(result.spawn_pages.len(), 2);
        assert_eq!(result.spawn_pages[0], "https://example.com/page1");
        assert_eq!(result.spawn_pages[1], "https://example.com/page2");
        let result = AutomationResult::success("test", 1);
        assert!(!result.has_spawn_pages());
        assert!(result.spawn_pages.is_empty());
    }
    // SpawnPageOptions builder stores each option it is given.
    #[cfg(all(feature = "agent", feature = "agent_chrome"))]
    #[test]
    fn test_spawn_page_options() {
        let options = SpawnPageOptions::new()
            .with_extraction("Extract content")
            .with_screenshot(true)
            .with_max_rounds(2);
        assert_eq!(
            options.extraction_prompt,
            Some("Extract content".to_string())
        );
        assert!(options.screenshot);
        assert_eq!(options.max_rounds, 2);
    }
    // Accessors on SpawnedPageResult reflect the Ok/Err state consistently.
    #[cfg(all(feature = "agent", feature = "agent_chrome"))]
    #[test]
    fn test_spawned_page_result_accessors() {
        let result = SpawnedPageResult {
            url: "https://example.com".to_string(),
            result: Ok(AutomationResult::success("test", 1)
                .with_extracted(serde_json::json!({"key": "value"}))),
            bytes_transferred: Some(2048.0),
            response_map: None,
        };
        assert!(result.is_ok());
        assert!(result.error().is_none());
        assert_eq!(result.label(), Some("test"));
        assert!(result.extracted().is_some());
        assert_eq!(result.bytes_transferred, Some(2048.0));
        let error_result = SpawnedPageResult {
            url: "https://example.com".to_string(),
            result: Err("Connection error".to_string()),
            bytes_transferred: None,
            response_map: None,
        };
        assert!(!error_result.is_ok());
        assert_eq!(error_result.error(), Some("Connection error"));
        assert!(error_result.label().is_none());
    }
    // Per-request byte counts survive into the response map untouched.
    #[cfg(all(feature = "agent", feature = "agent_chrome"))]
    #[test]
    fn test_spawned_page_result_with_response_map() {
        let mut response_map = std::collections::HashMap::new();
        response_map.insert("req-123".to_string(), 1024.0);
        response_map.insert("req-456".to_string(), 2048.0);
        let result = SpawnedPageResult {
            url: "https://example.com".to_string(),
            result: Ok(AutomationResult::success("test", 1)),
            bytes_transferred: Some(3072.0),
            response_map: Some(response_map),
        };
        assert!(result.bytes_transferred.is_some());
        assert_eq!(result.bytes_transferred.unwrap(), 3072.0);
        assert!(result.response_map.is_some());
        assert_eq!(result.response_map.as_ref().unwrap().len(), 2);
    }
    // Cache keys must differ by host (origin) and preserve path + query.
    #[cfg(all(feature = "agent", feature = "serde"))]
    #[test]
    fn test_url_to_cache_key_includes_origin() {
        let a = url_to_cache_key("https://a.example.com/path");
        let b = url_to_cache_key("https://b.example.com/path");
        assert_ne!(a, b);
        let with_query = url_to_cache_key("https://a.example.com/path?x=1&y=2");
        assert_eq!(with_query, "https://a.example.com/path?x=1&y=2");
    }
    // Fully-cached input: no network call (endpoint is an unroutable port),
    // and only the URLs cached as relevant come back.
    #[cfg(all(feature = "agent", feature = "serde"))]
    #[tokio::test]
    async fn test_prefilter_urls_returns_cached_relevance_without_network() {
        use crate::CaseInsensitiveString;
        use hashbrown::HashSet;
        let mut cfgs = RemoteMultimodalConfigs::new("http://localhost:65535", "test-model");
        cfgs.cfg.relevance_gate = true;
        cfgs.cfg.url_prefilter = true;
        let urls: HashSet<CaseInsensitiveString> = [
            "https://a.example.com/p/1",
            "https://b.example.com/p/2",
            "https://c.example.com/p/3",
        ]
        .into_iter()
        .map(CaseInsensitiveString::from)
        .collect();
        cfgs.url_prefilter_cache
            .insert(url_to_cache_key("https://a.example.com/p/1"), true);
        cfgs.url_prefilter_cache
            .insert(url_to_cache_key("https://b.example.com/p/2"), false);
        cfgs.url_prefilter_cache
            .insert(url_to_cache_key("https://c.example.com/p/3"), true);
        let filtered = prefilter_urls(&cfgs, &urls).await;
        assert_eq!(filtered.len(), 2);
        assert!(filtered.contains(&CaseInsensitiveString::from("https://a.example.com/p/1")));
        assert!(filtered.contains(&CaseInsensitiveString::from("https://c.example.com/p/3")));
        assert!(!filtered.contains(&CaseInsensitiveString::from("https://b.example.com/p/2")));
    }
    // 32 concurrent readers over a warm cache must all see the same count
    // (60 of 120 URLs cached relevant), proving cached reads are stable.
    #[cfg(all(feature = "agent", feature = "serde"))]
    #[tokio::test]
    async fn test_prefilter_urls_concurrent_cached_reads_are_stable() {
        use crate::CaseInsensitiveString;
        use hashbrown::HashSet;
        use std::sync::Arc;
        let mut cfgs = RemoteMultimodalConfigs::new("http://localhost:65535", "test-model");
        cfgs.cfg.relevance_gate = true;
        cfgs.cfg.url_prefilter = true;
        cfgs.cfg.url_prefilter_batch_size = 64;
        let urls: HashSet<CaseInsensitiveString> = (0..120usize)
            .map(|i| format!("https://example.com/p/{i}"))
            .map(CaseInsensitiveString::from)
            .collect();
        // Alternate relevant/irrelevant so exactly half are kept.
        for (idx, url) in urls.iter().enumerate() {
            cfgs.url_prefilter_cache
                .insert(url_to_cache_key(url.inner().as_str()), idx % 2 == 0);
        }
        let cfgs = Arc::new(cfgs);
        let urls = Arc::new(urls);
        let mut tasks = tokio::task::JoinSet::new();
        for _ in 0..32usize {
            let cfgs = cfgs.clone();
            let urls = urls.clone();
            tasks.spawn(async move { prefilter_urls(&cfgs, &urls).await.len() });
        }
        while let Some(res) = tasks.join_next().await {
            let len = res.expect("prefilter task should not panic");
            assert_eq!(len, 60);
        }
    }
}