Skip to main content

a3s_code_core/config/
search.rs

1use serde::{Deserialize, Serialize};
2use std::path::PathBuf;
3
// ============================================================================
// Search / Browser / Document Configuration
// ============================================================================
7
8/// Search engine configuration (a3s-search integration)
9#[derive(Debug, Clone, Serialize, Deserialize)]
10#[serde(rename_all = "camelCase")]
11pub struct SearchConfig {
12    /// Default timeout in seconds for all engines
13    #[serde(default = "default_search_timeout")]
14    pub timeout: u64,
15
16    /// Health monitor configuration
17    #[serde(default, skip_serializing_if = "Option::is_none")]
18    pub health: Option<SearchHealthConfig>,
19
20    /// Engine configurations
21    #[serde(default, rename = "engine")]
22    pub engines: std::collections::HashMap<String, SearchEngineConfig>,
23
24    /// Headless browser configuration for JS-rendered engines (google, baidu, bing_cn).
25    /// When enabled, the browser binary is auto-detected or downloaded.
26    #[serde(default, skip_serializing_if = "Option::is_none")]
27    pub headless: Option<HeadlessConfig>,
28}
29
30/// Browser backend for JS-rendered search engines.
31#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
32#[serde(rename_all = "lowercase")]
33pub enum BrowserBackend {
34    /// Chrome/Chromium headless browser.
35    #[default]
36    Chrome,
37    /// Lightpanda headless browser.
38    Lightpanda,
39}
40
41/// Headless browser configuration for JS-rendered engines.
42/// Uses a3s-search's browser pool, backed by Chrome/Chromium or Lightpanda.
43#[derive(Debug, Clone, Serialize, Deserialize)]
44#[serde(rename_all = "camelCase")]
45pub struct HeadlessConfig {
46    /// Browser backend to use.
47    #[serde(default)]
48    pub backend: BrowserBackend,
49
50    /// Maximum number of concurrent browser tabs.
51    #[serde(default = "default_headless_max_tabs")]
52    pub max_tabs: usize,
53
54    /// Path to the browser executable. If None, auto-detected or downloaded.
55    #[serde(
56        default,
57        alias = "chromePath",
58        alias = "lightpandaPath",
59        alias = "obscuraPath",
60        alias = "playwrightPath",
61        skip_serializing_if = "Option::is_none"
62    )]
63    pub browser_path: Option<String>,
64
65    /// Additional browser launch arguments.
66    #[serde(default, skip_serializing_if = "Vec::is_empty")]
67    pub launch_args: Vec<String>,
68
69    /// Proxy URL for the browser to use.
70    #[serde(default, skip_serializing_if = "Option::is_none")]
71    pub proxy_url: Option<String>,
72}
73
74impl BrowserBackend {
75    pub fn is_lightpanda(self) -> bool {
76        matches!(self, Self::Lightpanda)
77    }
78}
79
80impl Default for HeadlessConfig {
81    fn default() -> Self {
82        Self {
83            backend: BrowserBackend::Chrome,
84            max_tabs: 4,
85            browser_path: None,
86            launch_args: Vec::new(),
87            proxy_url: None,
88        }
89    }
90}
91
92/// Default configuration for built-in document context extraction.
93#[derive(Debug, Clone, Serialize, Deserialize)]
94#[serde(rename_all = "camelCase")]
95pub struct DocumentParserConfig {
96    /// Whether the default document extraction stack is registered in the parser registry.
97    #[serde(default = "default_enabled")]
98    pub enabled: bool,
99
100    /// Maximum file size accepted by the parser, in MiB.
101    #[serde(default = "default_document_parser_max_file_size_mb")]
102    pub max_file_size_mb: u64,
103
104    /// Optional OCR / vision-model settings for image-heavy documents.
105    ///
106    /// These settings control OCR fallback when context extraction reaches
107    /// scanned or image-heavy inputs. Current parsers may not execute OCR for
108    /// every format.
109    #[serde(default, skip_serializing_if = "Option::is_none")]
110    pub ocr: Option<DocumentOcrConfig>,
111
112    /// Optional cache settings for parsed / normalized document context.
113    #[serde(default, skip_serializing_if = "Option::is_none")]
114    pub cache: Option<DocumentCacheConfig>,
115}
116
117impl Default for DocumentParserConfig {
118    fn default() -> Self {
119        Self {
120            enabled: true,
121            max_file_size_mb: default_document_parser_max_file_size_mb(),
122            ocr: None,
123            cache: Some(DocumentCacheConfig::default()),
124        }
125    }
126}
127
128impl DocumentParserConfig {
129    pub fn normalized(&self) -> Self {
130        Self {
131            enabled: self.enabled,
132            max_file_size_mb: self.max_file_size_mb.clamp(1, 1024),
133            ocr: self.ocr.as_ref().map(DocumentOcrConfig::normalized),
134            cache: self.cache.as_ref().map(DocumentCacheConfig::normalized),
135        }
136    }
137}
138
139#[derive(Debug, Clone, Serialize, Deserialize)]
140#[serde(rename_all = "camelCase")]
141pub struct DocumentCacheConfig {
142    #[serde(default = "default_enabled")]
143    pub enabled: bool,
144
145    #[serde(default, skip_serializing_if = "Option::is_none")]
146    pub directory: Option<PathBuf>,
147}
148
149impl Default for DocumentCacheConfig {
150    fn default() -> Self {
151        Self {
152            enabled: true,
153            directory: None,
154        }
155    }
156}
157
158impl DocumentCacheConfig {
159    pub fn normalized(&self) -> Self {
160        Self {
161            enabled: self.enabled,
162            directory: self.directory.clone(),
163        }
164    }
165}
166
167/// OCR / vision-model configuration for built-in document context extraction.
168#[derive(Debug, Clone, Serialize, Deserialize)]
169#[serde(rename_all = "camelCase")]
170pub struct DocumentOcrConfig {
171    /// Whether OCR fallback is enabled for image-heavy documents.
172    #[serde(default = "default_enabled")]
173    pub enabled: bool,
174
175    /// Vision-capable model identifier, for example `openai/gpt-4.1-mini`.
176    #[serde(default, skip_serializing_if = "Option::is_none")]
177    pub model: Option<String>,
178
179    /// Optional custom OCR prompt / extraction instruction.
180    #[serde(default, skip_serializing_if = "Option::is_none")]
181    pub prompt: Option<String>,
182
183    /// Maximum number of rendered images/pages to send for OCR fallback.
184    #[serde(default = "default_document_ocr_max_images")]
185    pub max_images: usize,
186
187    /// Render DPI when rasterizing pages for OCR fallback.
188    #[serde(default = "default_document_ocr_dpi")]
189    pub dpi: u32,
190
191    /// OCR provider backend. Defaults to "vision" when model is set.
192    /// "vision" - Vision API (OpenAI-compatible)
193    /// "builtin" - Local tesseract (requires tesseract + pdftoppm binaries)
194    #[serde(default, skip_serializing_if = "Option::is_none")]
195    pub provider: Option<String>,
196
197    /// Base URL for vision API. Defaults to OpenAI API if not set.
198    #[serde(default, skip_serializing_if = "Option::is_none")]
199    pub base_url: Option<String>,
200
201    /// API key for vision API.
202    #[serde(default, skip_serializing_if = "Option::is_none")]
203    pub api_key: Option<String>,
204}
205
206impl Default for DocumentOcrConfig {
207    fn default() -> Self {
208        Self {
209            enabled: false,
210            model: None,
211            prompt: None,
212            max_images: default_document_ocr_max_images(),
213            dpi: default_document_ocr_dpi(),
214            provider: None,
215            base_url: None,
216            api_key: None,
217        }
218    }
219}
220
221impl DocumentOcrConfig {
222    pub fn normalized(&self) -> Self {
223        Self {
224            enabled: self.enabled,
225            model: self.model.clone(),
226            prompt: self.prompt.clone(),
227            max_images: self.max_images.clamp(1, 64),
228            dpi: self.dpi.clamp(72, 600),
229            provider: self.provider.clone(),
230            base_url: self.base_url.clone(),
231            api_key: self.api_key.clone(),
232        }
233    }
234}
235
236/// Search health monitor configuration
237#[derive(Debug, Clone, Serialize, Deserialize)]
238#[serde(rename_all = "camelCase")]
239pub struct SearchHealthConfig {
240    /// Number of consecutive failures before suspending
241    #[serde(default = "default_max_failures")]
242    pub max_failures: u32,
243
244    /// Suspension duration in seconds
245    #[serde(default = "default_suspend_seconds")]
246    pub suspend_seconds: u64,
247}
248
249/// Per-engine search configuration
250#[derive(Debug, Clone, Serialize, Deserialize)]
251#[serde(rename_all = "camelCase")]
252pub struct SearchEngineConfig {
253    /// Whether the engine is enabled
254    #[serde(default = "default_enabled")]
255    pub enabled: bool,
256
257    /// Weight for ranking (higher = more influence)
258    #[serde(default = "default_weight")]
259    pub weight: f64,
260
261    /// Per-engine timeout override in seconds
262    #[serde(skip_serializing_if = "Option::is_none")]
263    pub timeout: Option<u64>,
264}
265
/// Serde default for `SearchConfig::timeout`: 10 seconds.
pub(crate) fn default_search_timeout() -> u64 {
    const SEARCH_TIMEOUT_SECS: u64 = 10;
    SEARCH_TIMEOUT_SECS
}
269
/// Serde default for `HeadlessConfig::max_tabs`: 4 tabs.
pub(crate) fn default_headless_max_tabs() -> usize {
    const MAX_TABS: usize = 4;
    MAX_TABS
}
273
/// Serde default for `SearchHealthConfig::max_failures`: 3 failures.
fn default_max_failures() -> u32 {
    const MAX_FAILURES: u32 = 3;
    MAX_FAILURES
}
277
/// Serde default for `SearchHealthConfig::suspend_seconds`: 60 seconds.
fn default_suspend_seconds() -> u64 {
    const SUSPEND_SECS: u64 = 60;
    SUSPEND_SECS
}
281
/// Shared serde default for the `enabled` flags in this file: `true`.
pub(crate) fn default_enabled() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
285
/// Serde default for `SearchEngineConfig::weight`: 1.0.
fn default_weight() -> f64 {
    const WEIGHT: f64 = 1.0;
    WEIGHT
}
289
/// Default for `DocumentParserConfig::max_file_size_mb`: 50 MiB.
pub(crate) fn default_document_parser_max_file_size_mb() -> u64 {
    const MAX_FILE_SIZE_MIB: u64 = 50;
    MAX_FILE_SIZE_MIB
}
293
/// Default for `DocumentOcrConfig::max_images`: 8 images/pages.
pub(crate) fn default_document_ocr_max_images() -> usize {
    const MAX_IMAGES: usize = 8;
    MAX_IMAGES
}
297
/// Default for `DocumentOcrConfig::dpi`: 144 DPI.
pub(crate) fn default_document_ocr_dpi() -> u32 {
    const OCR_DPI: u32 = 144;
    OCR_DPI
}