pub struct RemoteMultimodalEngine {
pub api_url: String,
pub api_key: Option<String>,
pub model_name: String,
pub system_prompt: Option<String>,
pub system_prompt_extra: Option<String>,
pub user_message_extra: Option<String>,
pub cfg: RemoteMultimodalConfig,
pub prompt_url_gate: Option<PromptUrlGate>,
pub semaphore: Option<Arc<Semaphore>>,
pub vision_model: Option<ModelEndpoint>,
pub text_model: Option<ModelEndpoint>,
pub vision_route_mode: VisionRouteMode,
pub use_chrome_ai: bool,
pub chrome_ai_max_user_chars: usize,
pub model_router: Option<ModelRouter>,
pub model_pool: Vec<ModelEndpoint>,
pub client: Option<Client>,
}
Remote multimodal engine for LLM-driven web automation.
This engine makes API calls to OpenAI-compatible endpoints (like OpenRouter) to extract structured data from HTML content. It supports:
- HTML-only extraction (no browser required)
- HTML + screenshot extraction (multimodal)
- Configurable prompts and extraction schemas
- Concurrency limiting via semaphore
§Example
use spider_agent::automation::{RemoteMultimodalEngine, RemoteMultimodalConfig};
let engine = RemoteMultimodalEngine::new(
"https://openrouter.ai/api/v1/chat/completions",
"qwen/qwen-2-vl-72b-instruct",
None,
).with_api_key(Some("your-api-key"));
let result = engine.extract_from_html(
"<html><body><h1>Product</h1><p>$19.99</p></body></html>",
"https://example.com/product",
Some("Product Page"),
).await?;
println!("Extracted: {:?}", result.extracted);
Fields§
api_url: String
Full OpenAI-compatible chat completions endpoint URL.
api_key: Option<String>
Optional bearer token for authenticated endpoints.
model_name: String
Model identifier understood by the endpoint.
system_prompt: Option<String>
Optional base system prompt for the model.
system_prompt_extra: Option<String>
Optional extra system instructions appended at runtime.
user_message_extra: Option<String>
Optional extra user instructions appended at runtime.
cfg: RemoteMultimodalConfig
Runtime configuration controlling capture, retry, and model policy.
prompt_url_gate: Option<PromptUrlGate>
Optional URL-based gate controlling whether automation runs for a given URL and allowing per-URL config overrides.
semaphore: Option<Arc<Semaphore>>
Optional semaphore used to limit concurrent in-flight LLM requests.
vision_model: Option<ModelEndpoint>
Optional vision model endpoint for dual-model routing.
text_model: Option<ModelEndpoint>
Optional text-only model endpoint for dual-model routing.
vision_route_mode: VisionRouteMode
Routing mode controlling when vision vs text model is used.
use_chrome_ai: bool
Use Chrome’s built-in LanguageModel API (Gemini Nano) for inference.
When true, uses page.evaluate() + LanguageModel.create() instead of
HTTP API calls. Also used as a last-resort fallback when api_url is empty.
chrome_ai_max_user_chars: usize
Maximum user-prompt characters for Chrome AI inference.
model_router: Option<ModelRouter>
Optional model router for per-round complexity-based routing.
When set (pool has 3+ models), each round classifies its complexity
and routes to the appropriate cost tier (cheap for simple, expensive
for complex). None delegates to existing resolve_model_for_round.
model_pool: Vec<ModelEndpoint>
Pool of model endpoints for complexity-based routing.
Each entry can have its own API URL and key. The router selects which model to use and this pool resolves the connection details.
client: Option<Client>
Optional pre-built HTTP client (e.g. with proxy configuration).
When None, uses the default static client.
Implementations§
Source§impl RemoteMultimodalEngine
impl RemoteMultimodalEngine
Sourcepub fn new<S: Into<String>>(
api_url: S,
model_name: S,
system_prompt: Option<String>,
) -> Self
pub fn new<S: Into<String>>( api_url: S, model_name: S, system_prompt: Option<String>, ) -> Self
Create a new remote multimodal engine.
§Arguments
- api_url - OpenAI-compatible chat completions endpoint URL
- model_name - Model identifier (e.g., “gpt-4o”, “qwen/qwen-2-vl-72b-instruct”)
- system_prompt - Optional custom system prompt (defaults to built-in)
Sourcepub fn with_api_key(self, key: Option<&str>) -> Self
pub fn with_api_key(self, key: Option<&str>) -> Self
Set/clear the API key (Bearer token).
Sourcepub fn with_config(self, cfg: RemoteMultimodalConfig) -> Self
pub fn with_config(self, cfg: RemoteMultimodalConfig) -> Self
Set the runtime configuration.
Sourcepub fn with_max_inflight_requests(&mut self, n: usize) -> &mut Self
pub fn with_max_inflight_requests(&mut self, n: usize) -> &mut Self
Set maximum concurrent LLM requests.
Sourcepub fn with_semaphore(&mut self, sem: Option<Arc<Semaphore>>) -> &mut Self
pub fn with_semaphore(&mut self, sem: Option<Arc<Semaphore>>) -> &mut Self
Provide a shared semaphore for concurrency control.
Sourcepub fn with_system_prompt_extra(&mut self, extra: Option<&str>) -> &mut Self
pub fn with_system_prompt_extra(&mut self, extra: Option<&str>) -> &mut Self
Set extra system prompt instructions.
Sourcepub fn with_user_message_extra(&mut self, extra: Option<&str>) -> &mut Self
pub fn with_user_message_extra(&mut self, extra: Option<&str>) -> &mut Self
Set extra user message instructions.
Sourcepub fn with_prompt_url_gate(&mut self, gate: Option<PromptUrlGate>) -> &mut Self
pub fn with_prompt_url_gate(&mut self, gate: Option<PromptUrlGate>) -> &mut Self
Set URL-based gating.
Sourcepub fn with_chrome_ai(&mut self, enabled: bool) -> &mut Self
pub fn with_chrome_ai(&mut self, enabled: bool) -> &mut Self
Enable Chrome built-in AI (LanguageModel / Gemini Nano) for inference.
Sourcepub fn with_chrome_ai_max_user_chars(&mut self, chars: usize) -> &mut Self
pub fn with_chrome_ai_max_user_chars(&mut self, chars: usize) -> &mut Self
Set the maximum user-prompt character budget for Chrome AI.
Sourcepub fn should_use_chrome_ai(&self) -> bool
pub fn should_use_chrome_ai(&self) -> bool
Whether Chrome AI should be used for inference.
Returns true when explicitly enabled OR when no API endpoint is
configured (last-resort fallback).
Sourcepub fn with_client(&mut self, client: Option<Client>) -> &mut Self
pub fn with_client(&mut self, client: Option<Client>) -> &mut Self
Set a pre-built HTTP client (e.g. with proxy configuration).
Sourcepub fn with_proxies(&mut self, proxies: Option<&[String]>) -> &mut Self
pub fn with_proxies(&mut self, proxies: Option<&[String]>) -> &mut Self
Set HTTP proxy URLs for LLM API requests.
Builds a reqwest::Client with the given proxies and a 120s timeout.
Invalid proxy URLs are silently skipped.
Sourcepub fn with_remote_multimodal_config(
&mut self,
cfg: RemoteMultimodalConfig,
) -> &mut Self
pub fn with_remote_multimodal_config( &mut self, cfg: RemoteMultimodalConfig, ) -> &mut Self
Set the full runtime configuration.
Sourcepub fn with_extra_ai_data(&mut self, enabled: bool) -> &mut Self
pub fn with_extra_ai_data(&mut self, enabled: bool) -> &mut Self
Enable/disable extraction mode.
Sourcepub fn with_extraction_prompt(&mut self, prompt: Option<&str>) -> &mut Self
pub fn with_extraction_prompt(&mut self, prompt: Option<&str>) -> &mut Self
Set the extraction prompt.
Sourcepub fn with_screenshot(&mut self, enabled: bool) -> &mut Self
pub fn with_screenshot(&mut self, enabled: bool) -> &mut Self
Enable/disable screenshot in results.
Sourcepub fn with_extraction_schema(
&mut self,
schema: Option<ExtractionSchema>,
) -> &mut Self
pub fn with_extraction_schema( &mut self, schema: Option<ExtractionSchema>, ) -> &mut Self
Set extraction schema.
Sourcepub fn config(&self) -> &RemoteMultimodalConfig
pub fn config(&self) -> &RemoteMultimodalConfig
Get current configuration.
Sourcepub fn prompt_url_gate(&self) -> Option<&PromptUrlGate>
pub fn prompt_url_gate(&self) -> Option<&PromptUrlGate>
Get prompt URL gate.
Sourcepub fn clone_with_cfg(&self, cfg: RemoteMultimodalConfig) -> Self
pub fn clone_with_cfg(&self, cfg: RemoteMultimodalConfig) -> Self
Clone with a different configuration.
Sourcepub async fn acquire_llm_permit(&self) -> Option<OwnedSemaphorePermit>
pub async fn acquire_llm_permit(&self) -> Option<OwnedSemaphorePermit>
Acquire LLM permit for concurrency control.
Sourcepub fn analyze_content(&self, html: &str) -> ContentAnalysis
pub fn analyze_content(&self, html: &str) -> ContentAnalysis
Analyze HTML content for extraction decisions.
Sourcepub fn needs_screenshot(&self, html: &str) -> bool
pub fn needs_screenshot(&self, html: &str) -> bool
Quick check if screenshot is likely needed for extraction.
Sourcepub fn system_prompt_compiled(
&self,
effective_cfg: &RemoteMultimodalConfig,
) -> String
pub fn system_prompt_compiled( &self, effective_cfg: &RemoteMultimodalConfig, ) -> String
Compile the system prompt with configuration.
Uses EXTRACTION_ONLY_SYSTEM_PROMPT for single-round extraction mode,
otherwise DEFAULT_SYSTEM_PROMPT is always the base.
Sourcepub fn with_vision_model(
&mut self,
endpoint: Option<ModelEndpoint>,
) -> &mut Self
pub fn with_vision_model( &mut self, endpoint: Option<ModelEndpoint>, ) -> &mut Self
Set the vision model endpoint for dual-model routing.
Sourcepub fn with_text_model(&mut self, endpoint: Option<ModelEndpoint>) -> &mut Self
pub fn with_text_model(&mut self, endpoint: Option<ModelEndpoint>) -> &mut Self
Set the text model endpoint for dual-model routing.
Sourcepub fn with_vision_route_mode(&mut self, mode: VisionRouteMode) -> &mut Self
pub fn with_vision_route_mode(&mut self, mode: VisionRouteMode) -> &mut Self
Set the vision routing mode.
Sourcepub fn has_dual_model_routing(&self) -> bool
pub fn has_dual_model_routing(&self) -> bool
Whether dual-model routing is active.
Sourcepub fn resolve_model_for_round(
&self,
use_vision: bool,
) -> (&str, &str, Option<&str>)
pub fn resolve_model_for_round( &self, use_vision: bool, ) -> (&str, &str, Option<&str>)
Resolve (api_url, model_name, api_key) for the current round.
Delegates to the same logic as [RemoteMultimodalConfigs::resolve_model_for_round]
but uses the engine’s own fields.
Sourcepub fn resolve_model_for_round_with_complexity(
&self,
use_vision: bool,
user_prompt: &str,
html_len: usize,
round_idx: usize,
stagnated: bool,
) -> (&str, &str, Option<&str>)
pub fn resolve_model_for_round_with_complexity( &self, use_vision: bool, user_prompt: &str, html_len: usize, round_idx: usize, stagnated: bool, ) -> (&str, &str, Option<&str>)
Resolve (api_url, model_name, api_key) using complexity-based pool routing.
When model_router is set (3+ models in pool), classifies the round’s
complexity and routes to the appropriate cost tier. Falls back to the
existing resolve_model_for_round when no pool routing is active.
If the routed model doesn’t support vision but use_vision is true,
walks up cost tiers to find a vision-capable model in the pool.
Sourcepub fn pick_fallback_model(
&self,
tried: &[String],
use_vision: bool,
) -> Option<(String, String, Option<String>)>
pub fn pick_fallback_model( &self, tried: &[String], use_vision: bool, ) -> Option<(String, String, Option<String>)>
Pick a fallback model from the pool, excluding already-tried models.
Used by infer_plan_with_retry when a retryable error (502, 503, 429,
timeout) occurs and the pool has alternative endpoints to try.
Returns (api_url, model_name, api_key) or None if all pool models
have been tried.
Sourcepub fn should_use_vision_this_round(
&self,
round_idx: usize,
stagnated: bool,
action_stuck_rounds: usize,
force_vision: bool,
) -> bool
pub fn should_use_vision_this_round( &self, round_idx: usize, stagnated: bool, action_stuck_rounds: usize, force_vision: bool, ) -> bool
Decide whether to use vision this round.
Sourcepub async fn extract_from_html(
&self,
html: &str,
url: &str,
title: Option<&str>,
) -> EngineResult<AutomationResult>
pub async fn extract_from_html( &self, html: &str, url: &str, title: Option<&str>, ) -> EngineResult<AutomationResult>
Extract structured data from raw HTML content (no browser required).
This method enables extraction from HTTP responses without Chrome. It sends the HTML to the multimodal model and returns extracted data.
§Arguments
- html - The raw HTML content to extract from
- url - The URL of the page (for context)
- title - Optional page title
§Returns
An AutomationResult containing the extracted data in the extracted field.
Sourcepub async fn extract_with_screenshot(
&self,
html: &str,
url: &str,
title: Option<&str>,
screenshot_base64: Option<&str>,
) -> EngineResult<AutomationResult>
pub async fn extract_with_screenshot( &self, html: &str, url: &str, title: Option<&str>, screenshot_base64: Option<&str>, ) -> EngineResult<AutomationResult>
Extract structured data from HTML with an optional screenshot.
This method combines HTML text with a screenshot for more accurate extraction, especially useful for pages with visual content that isn’t in the HTML (iframes, videos, canvas, dynamically rendered content).
§Arguments
- html - The raw HTML content
- url - The URL of the page (for context)
- title - Optional page title
- screenshot_base64 - Optional base64-encoded screenshot (PNG/JPEG)
Sourcepub async fn chat_completion(
&self,
system_prompt: &str,
user_message: &str,
) -> EngineResult<(String, AutomationUsage)>
pub async fn chat_completion( &self, system_prompt: &str, user_message: &str, ) -> EngineResult<(String, AutomationUsage)>
Send a raw chat completion request and get the response.
This is a lower-level method for custom use cases.
Sourcepub async fn classify_urls(
&self,
urls: &[&str],
relevance_prompt: Option<&str>,
extraction_prompt: Option<&str>,
max_tokens: u16,
) -> EngineResult<Vec<bool>>
pub async fn classify_urls( &self, urls: &[&str], relevance_prompt: Option<&str>, extraction_prompt: Option<&str>, max_tokens: u16, ) -> EngineResult<Vec<bool>>
Classify a batch of URLs as relevant or irrelevant using the text model.
Returns a Vec<bool> parallel to the input URLs (true = relevant).
Uses resolve_model_for_round(false) to get the cheap/fast text model.
On any failure (HTTP, parse, length mismatch), returns all true (safe fallback).
Sourcepub fn generate_schema_from_examples(
&self,
examples: &[Value],
name: Option<&str>,
description: Option<&str>,
) -> GeneratedSchema
pub fn generate_schema_from_examples( &self, examples: &[Value], name: Option<&str>, description: Option<&str>, ) -> GeneratedSchema
Generate an extraction schema from example data.
Uses the schema generation utilities to create a JSON schema from example outputs. Useful for zero-config extraction setup.
Sourcepub fn infer_schema(&self, example: &Value) -> Value
pub fn infer_schema(&self, example: &Value) -> Value
Infer a JSON schema from a single example value.
Sourcepub fn build_schema_prompt(
&self,
examples: &[Value],
description: Option<&str>,
) -> String
pub fn build_schema_prompt( &self, examples: &[Value], description: Option<&str>, ) -> String
Build a schema generation prompt for LLM-assisted schema creation.
Sourcepub fn parse_tool_calls(&self, response: &Value) -> Vec<ToolCall>
pub fn parse_tool_calls(&self, response: &Value) -> Vec<ToolCall>
Parse tool calls from an LLM response.
Extracts OpenAI-compatible tool calls from a response JSON.
Sourcepub fn tool_calls_to_steps(&self, calls: &[ToolCall]) -> Vec<Value>
pub fn tool_calls_to_steps(&self, calls: &[ToolCall]) -> Vec<Value>
Convert tool calls to automation step actions.
Sourcepub fn action_tool_schemas(&self) -> Vec<ToolDefinition>
pub fn action_tool_schemas(&self) -> Vec<ToolDefinition>
Get all available action tool schemas.
Returns OpenAI-compatible tool definitions for all supported actions.
Sourcepub fn extract_html_context(&self, html: &str, max_bytes: usize) -> String
pub fn extract_html_context(&self, html: &str, max_bytes: usize) -> String
Extract HTML context around selectors for self-healing.
Sourcepub fn create_dependency_graph(
&self,
steps: Vec<DependentStep>,
) -> Result<DependencyGraph, String>
pub fn create_dependency_graph( &self, steps: Vec<DependentStep>, ) -> Result<DependencyGraph, String>
Create a new dependency graph for concurrent execution.
Sourcepub async fn execute_dependency_graph<F, Fut>(
&self,
graph: &mut DependencyGraph,
config: &ConcurrentChainConfig,
executor: F,
) -> ConcurrentChainResult
where
F: Fn(DependentStep) -> Fut + Clone + Send + Sync + 'static,
Fut: Future<Output = StepResult> + Send + 'static,
pub async fn execute_dependency_graph<F, Fut>(
&self,
graph: &mut DependencyGraph,
config: &ConcurrentChainConfig,
executor: F,
) -> ConcurrentChainResult
where
F: Fn(DependentStep) -> Fut + Clone + Send + Sync + 'static,
Fut: Future<Output = StepResult> + Send + 'static,
Execute a dependency graph with the provided executor.
This enables parallel execution of independent steps using tokio::JoinSet.
Trait Implementations§
Source§impl Clone for RemoteMultimodalEngine
impl Clone for RemoteMultimodalEngine
Source§fn clone(&self) -> RemoteMultimodalEngine
fn clone(&self) -> RemoteMultimodalEngine
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more