1use schemars::JsonSchema;
4use serde::{Deserialize, Serialize};
5use url::Url;
6
7use crate::extractor::frontmatter::{PageMeta, render as render_frontmatter};
8use crate::extractor::options::{ImagesMode, SampleStrategy, TablesMode};
9use crate::extractor::pipeline::extract;
10use crate::fetcher::cached::{ExtractResult, FetchOptions, fetch_with_cache, sha256_hex};
11use crate::mcp::envelope::{
12 CacheStatus, CountResponse, CountSingleResponse, CountSource, FetchResponse,
13};
14use crate::mcp::error::McpError;
15use crate::mcp::handler::{RoverHandler, resolve_tokenizer};
16use crate::tokenizer;
17
/// Arguments accepted by the fetch tool (consumed by `RoverHandler::fetch_inner`).
///
/// Only `url` is required; every other field falls back to its serde default
/// when omitted. Unknown fields are rejected.
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct FetchArgs {
    /// URL of the page to fetch. A value that does not parse as a URL is
    /// rejected as an invalid URL.
    pub url: String,

    /// Forwarded to the cached fetcher as its `force_refresh` option.
    #[serde(default)]
    pub force_refresh: bool,

    /// Per-call User-Agent override. When omitted, the configured
    /// `fetch.user_agent` is used.
    #[serde(default)]
    pub user_agent: Option<String>,

    /// Per-call request timeout in seconds; must be greater than 0. When
    /// omitted, the configured fetch timeout is used.
    #[serde(default)]
    pub timeout_secs: Option<u64>,

    /// When true, return only the token count instead of the page content.
    #[serde(default)]
    pub count_only: bool,

    /// Tokenizer name used for token counting; resolved against the server
    /// configuration.
    #[serde(default)]
    pub tokenizer: Option<String>,

    /// Token budget for the returned body; must be greater than 0. A body over
    /// budget is summarized automatically with this value as the target. The
    /// call fails if the result is still over budget, or if `summarize` was
    /// requested explicitly and its output is over budget. With `count_only`
    /// (and no `summarize`) the over-budget count is reported as is.
    #[serde(default)]
    pub max_tokens: Option<usize>,

    /// How tables in the extracted body are handled. Defaults to `embed`.
    #[serde(default)]
    pub tables: Option<TablesArg>,

    /// How images in the extracted body are handled. Defaults to
    /// `alt_text_only`.
    #[serde(default)]
    pub images: Option<ImagesArg>,

    /// `skip` replaces the extracted page metadata with empty defaults before
    /// the frontmatter is rendered; otherwise the metadata is included.
    #[serde(default)]
    pub metadata: Option<MetadataArg>,

    /// Explicitly summarize the fetched body with the given options.
    #[serde(default)]
    pub summarize: Option<InlineSummarizeArgs>,

    /// Headless-rendering override. When omitted (or when its `mode` is
    /// omitted) the server's `headless.auto_detect_spa` setting selects
    /// between `auto` and `off`.
    #[serde(default)]
    pub headless: Option<HeadlessArg>,

    /// Per-call security options forwarded to the content guard's assessment.
    #[serde(default)]
    pub security: Option<crate::guard::SecurityArg>,
}
82
/// Options for an explicit, inline summarization of the fetched body.
///
/// Every field is optional; unset values are resolved against the configured
/// summarization defaults. Unknown fields are rejected.
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct InlineSummarizeArgs {
    /// Target size of the summary, in tokens.
    #[serde(default)]
    pub target_tokens: Option<usize>,

    /// Summarization mode override.
    #[serde(default)]
    pub mode: Option<crate::mcp::tools::summarize::SummarizeMode>,

    /// Optional focus hint passed to the summarizer.
    #[serde(default)]
    pub focus: Option<String>,

    /// Preserve options passed to the summarizer; empty when omitted.
    #[serde(default)]
    pub preserve: Vec<crate::mcp::tools::summarize::SummarizePreserve>,

    /// Output style override.
    #[serde(default)]
    pub style: Option<crate::mcp::tools::summarize::SummarizeStyle>,

    /// Summarizer backend name override.
    #[serde(default)]
    pub backend: Option<String>,
}
107
/// Table handling requested by the caller, in its validated, typed form.
///
/// This type is (de)serialized through the flat `TablesArgWire` record by the
/// hand-written serde impls rather than by derive.
#[derive(Debug, Clone)]
pub enum TablesArg {
    /// Wire mode `embed`; also what an omitted `tables` argument resolves to.
    Embed,
    /// Wire mode `drop`.
    Drop,
    /// Wire mode `csv_file`.
    CsvFile,
    /// Wire mode `summarize`.
    Summarize,
    /// Wire mode `sample`, with the row-selection strategy to apply.
    Sample { strategy: SampleArg },
}
122
/// Row-selection strategy for `TablesArg::Sample`.
#[derive(Debug, Clone)]
pub enum SampleArg {
    /// Wire strategy `head_tail`: `head` and `tail` row counts (each defaults
    /// to 5 when omitted on the wire).
    HeadTail { head: usize, tail: usize },
    /// Wire strategy `random_seed`: `rows` row count and `seed` value
    /// (defaults 10 and 42 when omitted on the wire).
    RandomSeed { rows: usize, seed: u64 },
}
128
/// Flat wire representation of the `tables` argument.
///
/// `mode` is required; the remaining fields are only read when `mode` is
/// `sample`, and are ignored otherwise. Unknown fields are rejected.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
struct TablesArgWire {
    /// Table handling mode.
    mode: TablesModeWire,
    /// Sampling strategy for `sample` mode; defaults to `head_tail`.
    #[serde(default)]
    strategy: Option<SampleStrategyWire>,
    /// Row count for the `head_tail` strategy; defaults to 5.
    #[serde(default)]
    head: Option<usize>,
    /// Row count for the `head_tail` strategy; defaults to 5.
    #[serde(default)]
    tail: Option<usize>,
    /// Row count for the `random_seed` strategy; defaults to 10.
    #[serde(default)]
    rows: Option<usize>,
    /// Seed for the `random_seed` strategy; defaults to 42.
    #[serde(default)]
    seed: Option<u64>,
}
144
/// Wire values of the `tables.mode` field (snake_case on the wire).
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
enum TablesModeWire {
    /// `embed`
    Embed,
    /// `drop`
    Drop,
    /// `csv_file`
    CsvFile,
    /// `summarize`
    Summarize,
    /// `sample`; the only mode that reads the sampling fields.
    Sample,
}
154
/// Wire values of the `tables.strategy` field (snake_case on the wire).
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
enum SampleStrategyWire {
    /// `head_tail`; uses the `head` and `tail` fields.
    HeadTail,
    /// `random_seed`; uses the `rows` and `seed` fields.
    RandomSeed,
}
161
162impl<'de> Deserialize<'de> for TablesArg {
163 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
164 where
165 D: serde::Deserializer<'de>,
166 {
167 let w = TablesArgWire::deserialize(deserializer)?;
168 match w.mode {
169 TablesModeWire::Embed => Ok(TablesArg::Embed),
170 TablesModeWire::Drop => Ok(TablesArg::Drop),
171 TablesModeWire::CsvFile => Ok(TablesArg::CsvFile),
172 TablesModeWire::Summarize => Ok(TablesArg::Summarize),
173 TablesModeWire::Sample => {
174 let strategy = w.strategy.unwrap_or(SampleStrategyWire::HeadTail);
175 let inner = match strategy {
176 SampleStrategyWire::HeadTail => SampleArg::HeadTail {
177 head: w.head.unwrap_or_else(default_head),
178 tail: w.tail.unwrap_or_else(default_tail),
179 },
180 SampleStrategyWire::RandomSeed => SampleArg::RandomSeed {
181 rows: w.rows.unwrap_or_else(default_random_rows),
182 seed: w.seed.unwrap_or_else(default_random_seed),
183 },
184 };
185 Ok(TablesArg::Sample { strategy: inner })
186 }
187 }
188 }
189}
190
191impl Serialize for TablesArg {
192 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
193 where
194 S: serde::Serializer,
195 {
196 let w = match self {
197 TablesArg::Embed => TablesArgWire {
198 mode: TablesModeWire::Embed,
199 strategy: None,
200 head: None,
201 tail: None,
202 rows: None,
203 seed: None,
204 },
205 TablesArg::Drop => TablesArgWire {
206 mode: TablesModeWire::Drop,
207 strategy: None,
208 head: None,
209 tail: None,
210 rows: None,
211 seed: None,
212 },
213 TablesArg::CsvFile => TablesArgWire {
214 mode: TablesModeWire::CsvFile,
215 strategy: None,
216 head: None,
217 tail: None,
218 rows: None,
219 seed: None,
220 },
221 TablesArg::Summarize => TablesArgWire {
222 mode: TablesModeWire::Summarize,
223 strategy: None,
224 head: None,
225 tail: None,
226 rows: None,
227 seed: None,
228 },
229 TablesArg::Sample {
230 strategy: SampleArg::HeadTail { head, tail },
231 } => TablesArgWire {
232 mode: TablesModeWire::Sample,
233 strategy: Some(SampleStrategyWire::HeadTail),
234 head: Some(*head),
235 tail: Some(*tail),
236 rows: None,
237 seed: None,
238 },
239 TablesArg::Sample {
240 strategy: SampleArg::RandomSeed { rows, seed },
241 } => TablesArgWire {
242 mode: TablesModeWire::Sample,
243 strategy: Some(SampleStrategyWire::RandomSeed),
244 head: None,
245 tail: None,
246 rows: Some(*rows),
247 seed: Some(*seed),
248 },
249 };
250 w.serialize(serializer)
251 }
252}
253
254impl JsonSchema for TablesArg {
255 fn schema_name() -> std::borrow::Cow<'static, str> {
256 "TablesArg".into()
257 }
258
259 fn schema_id() -> std::borrow::Cow<'static, str> {
260 concat!(module_path!(), "::TablesArg").into()
261 }
262
263 fn json_schema(generator: &mut schemars::SchemaGenerator) -> schemars::Schema {
264 <TablesArgWire as JsonSchema>::json_schema(generator)
265 }
266}
267
/// Default `head` row count for the `head_tail` sample strategy.
fn default_head() -> usize {
    const DEFAULT_HEAD_ROWS: usize = 5;
    DEFAULT_HEAD_ROWS
}
/// Default `tail` row count for the `head_tail` sample strategy.
fn default_tail() -> usize {
    const DEFAULT_TAIL_ROWS: usize = 5;
    DEFAULT_TAIL_ROWS
}
/// Default `rows` count for the `random_seed` sample strategy.
fn default_random_rows() -> usize {
    const DEFAULT_RANDOM_ROWS: usize = 10;
    DEFAULT_RANDOM_ROWS
}
/// Default `seed` for the `random_seed` sample strategy.
fn default_random_seed() -> u64 {
    const DEFAULT_RANDOM_SEED: u64 = 42;
    DEFAULT_RANDOM_SEED
}
280
/// Image handling requested by the caller.
///
/// Internally tagged on the wire: an object whose `mode` field names the
/// variant in snake_case. Unknown fields are rejected.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields, rename_all = "snake_case", tag = "mode")]
pub enum ImagesArg {
    /// Wire mode `keep`.
    Keep,
    /// Wire mode `alt_text_only`; also what an omitted `images` argument
    /// resolves to.
    AltTextOnly,
    /// Wire mode `download`.
    Download,
    /// Wire mode `drop`.
    Drop,
    /// Wire mode `caption`.
    Caption {
        /// Optional captioner name override, carried into the caption filters.
        #[serde(default)]
        captioner: Option<String>,
    },
}
295
/// Whether extracted page metadata is included in the response.
///
/// Serialized as a plain snake_case string (`"include"` / `"skip"`).
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
pub enum MetadataArg {
    /// Keep the extracted metadata (same as omitting the argument).
    Include,
    /// Replace the extracted metadata with empty defaults.
    Skip,
}
302
/// Per-call headless-rendering options. Unknown fields are rejected.
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct HeadlessArg {
    /// Requested headless mode. When omitted, the server configuration
    /// decides.
    #[serde(default)]
    pub mode: Option<HeadlessModeWire>,
}
313
/// Wire values of `headless.mode` (snake_case on the wire); each maps to the
/// fetcher's `HeadlessMode` variant of the same name.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum HeadlessModeWire {
    /// `off`
    Off,
    /// `on`
    On,
    /// `auto`
    Auto,
}
322
323fn tables_mode(arg: Option<&TablesArg>) -> Result<TablesMode, McpError> {
324 Ok(match arg {
325 None | Some(TablesArg::Embed) => TablesMode::Embed,
326 Some(TablesArg::Drop) => TablesMode::Drop,
327 Some(TablesArg::CsvFile) => TablesMode::CsvFile,
328 Some(TablesArg::Sample { strategy }) => match strategy {
329 SampleArg::HeadTail { head, tail } => {
330 if *head == 0 || *tail == 0 {
331 return Err(McpError::InvalidArgs(
332 "tables.sample head/tail must be > 0".into(),
333 ));
334 }
335 TablesMode::Sample(SampleStrategy::HeadTail {
336 head: *head,
337 tail: *tail,
338 })
339 }
340 SampleArg::RandomSeed { rows, seed } => {
341 if *rows == 0 {
342 return Err(McpError::InvalidArgs(
343 "tables.sample rows must be > 0".into(),
344 ));
345 }
346 TablesMode::Sample(SampleStrategy::RandomSeed {
347 rows: *rows,
348 seed: *seed,
349 })
350 }
351 },
352 Some(TablesArg::Summarize) => TablesMode::Summarize,
353 })
354}
355
356fn images_mode(arg: Option<&ImagesArg>) -> Result<(ImagesMode, Option<String>), McpError> {
357 Ok(match arg {
358 None | Some(ImagesArg::AltTextOnly) => (ImagesMode::AltTextOnly, None),
359 Some(ImagesArg::Keep) => (ImagesMode::Keep, None),
360 Some(ImagesArg::Download) => (ImagesMode::Download, None),
361 Some(ImagesArg::Drop) => (ImagesMode::Drop, None),
362 Some(ImagesArg::Caption { captioner }) => (ImagesMode::Caption, captioner.clone()),
363 })
364}
365
366fn build_caption_filters(
369 cfg: &crate::config::ImageCaptionsConfig,
370 override_name: Option<String>,
371) -> crate::extractor::options::ImageCaptionFilters {
372 crate::extractor::options::ImageCaptionFilters {
373 max_per_page: cfg.max_per_page,
374 min_width: cfg.min_width,
375 min_height: cfg.min_height,
376 max_bytes: cfg.max_bytes,
377 max_tokens: cfg.max_tokens,
378 captioner_override: override_name,
379 }
380}
381
382fn resolve_headless(
388 arg: Option<&HeadlessArg>,
389 config: &crate::config::HeadlessConfig,
390) -> crate::fetcher::cached::HeadlessMode {
391 let mode = arg.and_then(|a| a.mode).map(|m| match m {
392 HeadlessModeWire::Off => crate::fetcher::cached::HeadlessMode::Off,
393 HeadlessModeWire::On => crate::fetcher::cached::HeadlessMode::On,
394 HeadlessModeWire::Auto => crate::fetcher::cached::HeadlessMode::Auto,
395 });
396 mode.unwrap_or(if config.auto_detect_spa {
397 crate::fetcher::cached::HeadlessMode::Auto
398 } else {
399 crate::fetcher::cached::HeadlessMode::Off
400 })
401}
402
/// Result of a fetch call.
///
/// Untagged on the wire: the payload is either a full fetch response or a
/// count response, with no wrapper naming the variant.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum FetchOutput {
    /// Page content plus cache/summarization details.
    Full(FetchResponse),
    /// Token count only; returned when `count_only` is set.
    Count(CountResponse),
}
417
418impl JsonSchema for FetchOutput {
419 fn schema_name() -> std::borrow::Cow<'static, str> {
420 "FetchOutput".into()
421 }
422
423 fn schema_id() -> std::borrow::Cow<'static, str> {
424 concat!(module_path!(), "::FetchOutput").into()
425 }
426
427 fn json_schema(generator: &mut schemars::SchemaGenerator) -> schemars::Schema {
428 let full = generator.subschema_for::<FetchResponse>();
429 let count = generator.subschema_for::<CountResponse>();
430 schemars::json_schema!({
431 "type": "object",
432 "oneOf": [full, count],
433 })
434 }
435}
436
/// Record of one summarization pass performed during a fetch.
#[derive(Debug, Clone)]
struct SummarizeOutcome {
    /// Whether the body was summarized; surfaced as the response's
    /// `summarized` / `auto_summarized` flag.
    summarized: bool,
    /// Fallback details reported by the summarizer, if it fell back.
    fallback: Option<crate::mcp::envelope::SummarizerFallbackInfo>,
}
444
445impl RoverHandler {
446 async fn run_compact(
449 &self,
450 body_md: &str,
451 opts: &crate::summarizer::backend::CompactOpts,
452 ) -> Result<(String, Option<crate::mcp::envelope::SummarizerFallbackInfo>), McpError> {
453 let content_hash = format!(
454 "sha256:{}",
455 crate::fetcher::cached::sha256_hex(body_md.as_bytes()),
456 );
457 let r = self
458 .summarizer
459 .compact(&content_hash, body_md, opts)
460 .await?;
461 let fallback = r
465 .fallback
466 .map(|f| crate::mcp::envelope::SummarizerFallbackInfo {
467 from: f.from,
468 reason: f.reason.to_string(),
469 });
470 Ok((r.summary_md, fallback))
471 }
472
473 pub async fn fetch_inner(&self, args: FetchArgs) -> Result<FetchOutput, McpError> {
476 let url = Url::parse(&args.url).map_err(|e| McpError::InvalidUrl(e.to_string()))?;
477 if matches!(args.max_tokens, Some(0)) {
478 return Err(McpError::InvalidArgs(
479 "max_tokens must be greater than 0".into(),
480 ));
481 }
482 if matches!(args.timeout_secs, Some(0)) {
483 return Err(McpError::InvalidArgs(
484 "timeout_secs must be greater than 0".into(),
485 ));
486 }
487
488 let effective_ua = args
494 .user_agent
495 .clone()
496 .unwrap_or_else(|| self.config.fetch.user_agent.clone());
497 let per_call_client;
498 let client: &reqwest::Client = if args.user_agent.is_some() || args.timeout_secs.is_some() {
499 let timeout = args
500 .timeout_secs
501 .map(std::time::Duration::from_secs)
502 .unwrap_or_else(|| self.config.fetch.timeout());
503 per_call_client = crate::fetcher::client::build_http_client(&effective_ua, timeout);
504 &per_call_client
505 } else {
506 &self.client
507 };
508
509 let family = resolve_tokenizer(args.tokenizer.as_deref(), &self.config)?;
510
511 let headless_mode = resolve_headless(args.headless.as_ref(), &self.config.headless);
512
513 #[cfg(feature = "headless")]
524 let headless: Option<crate::fetcher::headless::HeadlessHandle> =
525 if !matches!(headless_mode, crate::fetcher::cached::HeadlessMode::Off) {
526 Some(crate::fetcher::headless::HeadlessHandle::with_cell(
527 self.headless_renderer.clone(),
528 self.config.headless.clone(),
529 ))
530 } else {
531 None
532 };
533
534 let result = fetch_with_cache(
535 &self.db,
536 client,
537 &self.pacer,
538 &self.config.rate_limit,
539 &self.config.robots,
540 &url,
541 &self.config.cache,
542 FetchOptions {
543 force_refresh: args.force_refresh,
544 ssrf_level: self.ssrf_level,
545 ssrf_project_root: self.ssrf_project_root.clone(),
546 har_recorder: self.har_recorder.clone(),
547 ignore_robots: false,
548 user_agent: effective_ua.clone(),
549 #[cfg(feature = "headless")]
550 headless,
551 headless_mode,
552 synchronous_revalidation: false,
553 },
554 |body, base| {
555 let extracted =
556 extract(body, Some(base)).map_err(crate::fetcher::FetcherError::Extract)?;
557 let content_hash = format!("sha256:{}", sha256_hex(extracted.body_md.as_bytes()));
558 Ok(ExtractResult {
559 title: extracted.title,
560 body_md: extracted.body_md,
561 content_hash,
562 metadata: extracted.metadata,
563 })
564 },
565 )
566 .await?;
567
568 tokenizer::ensure_loaded(family).await?;
569
570 let output_paths = std::sync::Arc::new(
572 crate::extractor::output::OutputPaths::resolve(self.config.output.dir.as_deref())
573 .map_err(McpError::Extractor)?,
574 );
575
576 let tables_mode_resolved = tables_mode(args.tables.as_ref())?;
577 let (images_mode_resolved, captioner_override) = images_mode(args.images.as_ref())?;
578 let caption_filters =
579 build_caption_filters(&self.config.image_captions, captioner_override);
580
581 let body_md = result.page.extracted_md.clone();
585 let tables_hook: Option<crate::extractor::tables::TableSummarizeHook> =
586 if matches!(tables_mode_resolved, TablesMode::Summarize) {
587 let summarizer = self.summarizer.clone();
588 let config = self.config.clone();
589 Some(std::sync::Arc::new(move |table_text: &str| {
590 let summarizer = summarizer.clone();
591 let config = config.clone();
592 let table_text = table_text.to_string();
593 Box::pin(async move {
594 let defaults =
595 crate::summarizer::DefaultsHint::from_config(&config.summarization);
596 let opts = crate::summarizer::backend::CompactOpts {
597 mode: defaults.mode,
598 style: crate::summarizer::backend::Style::Bullet,
599 target_tokens: Some(config.summarization.tables.target_tokens),
600 focus: Some(config.summarization.tables.focus.clone()),
601 preserve: vec![],
602 backend_name: defaults.backend.clone(),
603 };
604 let content_hash = format!("sha256:{}", sha256_hex(table_text.as_bytes()));
605 summarizer
606 .compact(&content_hash, &table_text, &opts)
607 .await
608 .map(|r| {
609 let fb =
610 r.fallback.map(|f| crate::extractor::tables::FallbackInfo {
611 from: f.from,
612 reason: f.reason.to_string(),
613 });
614 if let Some(fb) = &fb {
615 tracing::debug!(
616 target: "rover::mcp",
617 from = %fb.from,
618 reason = %fb.reason,
619 "table summarizer fell back to extractive",
620 );
621 }
622 (r.summary_md, fb)
623 })
624 .map_err(|e| e.fallback_reason().to_string())
625 })
626 as std::pin::Pin<
627 Box<
628 dyn std::future::Future<
629 Output = Result<
630 (
631 String,
632 Option<crate::extractor::tables::FallbackInfo>,
633 ),
634 String,
635 >,
636 > + Send,
637 >,
638 >
639 }))
640 } else {
641 None
642 };
643 let (body_md, tables_transformed) = crate::extractor::tables::apply_with_summarizer(
644 &body_md,
645 &tables_mode_resolved,
646 &output_paths,
647 &url,
648 tables_hook.as_ref(),
649 )
650 .await
651 .map_err(McpError::Extractor)?;
652
653 let captioners_opt = if self.captioners.is_empty() {
654 None
655 } else {
656 Some(self.captioners.as_ref())
657 };
658 let images_result = crate::extractor::images::apply(
659 &body_md,
660 &images_mode_resolved,
661 &output_paths,
662 client,
663 captioners_opt,
664 &caption_filters,
665 Some(&self.db),
666 self.ssrf_level,
667 )
668 .await
669 .map_err(McpError::Extractor)?;
670 let body_md = images_result.markdown;
671
672 let guard_assessment = self
675 .guard
676 .assess(url.as_str(), args.security.as_ref(), &body_md);
677 let direct_body = if args.summarize.is_none() {
681 guard_assessment.acted_body.clone()
682 } else {
683 body_md.clone()
684 };
685 let body_md = direct_body;
686
687 let (body_md, summarize_meta): (String, Option<SummarizeOutcome>) = if let Some(inline) =
691 args.summarize.clone()
692 {
693 let defaults = crate::summarizer::DefaultsHint::from_config(&self.config.summarization);
694 let opts = self.summarizer.resolve_defaults(
695 inline.mode.map(Into::into),
696 inline.style.map(Into::into),
697 inline.target_tokens,
698 inline.focus,
699 inline.preserve.into_iter().map(Into::into).collect(),
700 inline.backend,
701 &defaults,
702 );
703 let (summary_md, fallback) = self.run_compact(&body_md, &opts).await?;
704 (
705 summary_md,
706 Some(SummarizeOutcome {
707 summarized: true,
708 fallback,
709 }),
710 )
711 } else {
712 (body_md, None)
713 };
714
715 let tokens = tokenizer::count(&body_md, family)?;
718
719 let (body_md, tokens, auto_meta): (String, usize, Option<SummarizeOutcome>) =
724 if let Some(max) = args.max_tokens {
725 if tokens <= max {
726 (body_md, tokens, None)
727 } else if summarize_meta.is_some() {
728 return Err(McpError::MaxTokensExceeded {
729 actual: tokens,
730 max,
731 was_auto: false,
732 });
733 } else if args.count_only {
734 (body_md, tokens, None)
739 } else {
740 let defaults =
741 crate::summarizer::DefaultsHint::from_config(&self.config.summarization);
742 let opts = self.summarizer.resolve_defaults(
743 None,
744 None,
745 Some(max),
746 None,
747 vec![],
748 None,
749 &defaults,
750 );
751 let (summary_md, fallback) = self.run_compact(&body_md, &opts).await?;
752 let new_tokens = tokenizer::count(&summary_md, family)?;
753 if new_tokens > max {
754 return Err(McpError::MaxTokensExceeded {
755 actual: new_tokens,
756 max,
757 was_auto: true,
758 });
759 }
760 (
761 summary_md,
762 new_tokens,
763 Some(SummarizeOutcome {
764 summarized: true,
765 fallback,
766 }),
767 )
768 }
769 } else {
770 (body_md, tokens, None)
771 };
772
773 let revalidation = match &result.cache_status {
776 crate::fetcher::cached::CacheStatus::Stale {
777 revalidation_task_id: Some(id),
778 } => Some(crate::mcp::envelope::StaleRevalidation {
779 task_id: id.clone(),
780 monitor_command: format!("rover task {id} --monitor"),
781 poll_command: format!("rover task {id}"),
782 hint: "Optional. Revalidation runs in the background regardless.".into(),
783 }),
784 _ => None,
785 };
786
787 let cache_status: CacheStatus = result.cache_status.into();
788
789 if args.count_only {
790 return Ok(FetchOutput::Count(CountResponse::Single(
793 CountSingleResponse {
794 tokens,
795 tokenizer: family.as_str().to_string(),
796 source: CountSource::Url,
797 url: Some(url.as_str().to_string()),
798 content_hash: Some(result.page.content_hash.clone()),
799 fetched_at: Some(
800 jiff::Timestamp::from_second(result.page.fetched_at)
801 .map(|t| t.to_string())
802 .unwrap_or_default(),
803 ),
804 cache_status: Some(cache_status),
805 },
806 )));
807 }
808
809 let canonical = Url::parse(&result.page.canonical_url)
810 .map_err(|e| McpError::InvalidUrl(e.to_string()))?;
811 let metadata: crate::extractor::ExtractedMetadata = result
815 .page
816 .metadata_json
817 .as_deref()
818 .and_then(|s| serde_json::from_str(s).ok())
819 .unwrap_or_default();
820 let metadata = match args.metadata.as_ref() {
823 Some(MetadataArg::Skip) => crate::extractor::ExtractedMetadata::default(),
824 _ => metadata,
825 };
826 let quality = crate::extractor::quality::score(
827 &body_md,
828 body_md.chars().count().max(1),
829 !metadata.is_empty(),
830 result.page.title.is_some(),
831 );
832 let frontmatter = render_frontmatter(&PageMeta {
833 url: &url,
834 canonical_url: &canonical,
835 title: result.page.title.as_deref(),
836 fetched_at: jiff::Timestamp::now(),
837 body: &body_md,
838 tokens,
839 tokenizer_name: family.as_str(),
840 description: metadata.description.as_deref(),
841 author: metadata.author.as_deref(),
842 published: metadata.published.as_deref(),
843 modified: metadata.modified.as_deref(),
844 image: metadata.image.as_deref(),
845 og_type: metadata.og_type.as_deref(),
846 language: metadata.language.as_deref(),
847 schema_types: &metadata.schema_types,
848 extraction_quality: quality,
851 summarized: false,
852 headless_render: result.page.render_reason.as_deref(),
853 tables_transformed: &tables_transformed,
854 images_seen: images_result.images_seen,
855 images_downloaded: images_result.images_downloaded,
856 images_failed: images_result.images_failed,
857 images_processed: images_result.images_processed.clone(),
858 prompt_injection: Some(&guard_assessment.telemetry),
859 });
860
861 let summarized_flag = summarize_meta.as_ref().map(|o| o.summarized);
862 let auto_summarized_flag = auto_meta.as_ref().map(|o| o.summarized);
863 let summarizer_fallback = summarize_meta
864 .and_then(|o| o.fallback)
865 .or_else(|| auto_meta.and_then(|o| o.fallback));
866
867 Ok(FetchOutput::Full(FetchResponse {
868 content: self.guard.finish(
869 &guard_assessment,
870 &frontmatter,
871 &body_md,
872 args.summarize.is_none(),
873 ),
874 cache_status,
875 revalidation,
876 summarized: summarized_flag,
877 auto_summarized: auto_summarized_flag,
878 summarizer_fallback,
879 }))
880 }
881}
882
#[cfg(test)]
mod tests {
    //! Unit tests for argument parsing, schema generation and the
    //! argument-to-extractor mapping helpers. None of them perform I/O.

    use std::str::FromStr;

    use super::*;
    use crate::tokenizer::Tokenizer;

    #[test]
    fn fetch_args_deserialize_minimal() {
        let v: FetchArgs = serde_json::from_str(r#"{"url":"https://example.com"}"#).unwrap();
        assert_eq!(v.url, "https://example.com");
        assert!(!v.force_refresh);
        assert!(!v.count_only);
        assert!(v.tokenizer.is_none());
        assert!(v.max_tokens.is_none());
    }

    #[test]
    fn fetch_args_headless_typed_mode_auto() {
        let v: FetchArgs = serde_json::from_str(
            r#"{
                "url":"https://example.com",
                "headless": { "mode": "auto" }
            }"#,
        )
        .unwrap();
        let h = v.headless.expect("headless parsed");
        assert!(matches!(h.mode, Some(HeadlessModeWire::Auto)));
    }

    #[test]
    fn fetch_args_parse_typed_summarize() {
        let v: FetchArgs = serde_json::from_str(
            r#"{
                "url":"https://example.com",
                "summarize":{
                    "target_tokens":500,
                    "mode":"extractive",
                    "style":"bullet",
                    "preserve":["code","tables"]
                }
            }"#,
        )
        .unwrap();
        let s = v.summarize.expect("summarize parsed");
        assert_eq!(s.target_tokens, Some(500));
        assert!(matches!(
            s.mode,
            Some(crate::mcp::tools::summarize::SummarizeMode::Extractive)
        ));
        assert!(matches!(
            s.style,
            Some(crate::mcp::tools::summarize::SummarizeStyle::Bullet)
        ));
        assert_eq!(s.preserve.len(), 2);
    }

    #[test]
    fn fetch_args_reject_unknown_summarize_field() {
        let r: Result<FetchArgs, _> =
            serde_json::from_str(r#"{"url":"https://x/","summarize":{"bogus":1}}"#);
        assert!(r.is_err());
    }

    #[test]
    fn fetch_args_reject_unknown_fields() {
        let r: Result<FetchArgs, _> =
            serde_json::from_str(r#"{"url":"https://example.com","bogus":1}"#);
        assert!(r.is_err());
    }

    #[test]
    fn fetch_args_parse_user_agent_and_timeout_overrides() {
        let v: FetchArgs = serde_json::from_str(
            r#"{"url":"https://example.com","user_agent":"my-agent/2.0","timeout_secs":42}"#,
        )
        .unwrap();
        assert_eq!(v.user_agent.as_deref(), Some("my-agent/2.0"));
        assert_eq!(v.timeout_secs, Some(42));
    }

    #[test]
    fn fetch_args_default_transport_overrides_are_none() {
        let v: FetchArgs = serde_json::from_str(r#"{"url":"https://example.com"}"#).unwrap();
        assert!(v.user_agent.is_none());
        assert!(v.timeout_secs.is_none());
    }

    #[test]
    fn fetch_args_parse_tokenizer_string() {
        let v: FetchArgs =
            serde_json::from_str(r#"{"url":"https://example.com","tokenizer":"claude"}"#).unwrap();
        assert_eq!(v.tokenizer.as_deref(), Some("claude"));
        // The raw string must also be accepted by the tokenizer parser.
        let t = Tokenizer::from_str(v.tokenizer.as_deref().unwrap()).unwrap();
        assert_eq!(t, Tokenizer::Claude);
    }

    #[test]
    fn fetch_args_schema_contains_all_documented_fields() {
        let schema = schemars::schema_for!(FetchArgs);
        let json = serde_json::to_string(&schema).unwrap();
        // Every field of `FetchArgs` must surface in the published schema.
        for field in [
            "url",
            "force_refresh",
            "user_agent",
            "timeout_secs",
            "count_only",
            "tokenizer",
            "max_tokens",
            "headless",
            "tables",
            "images",
            "metadata",
            "summarize",
            "security",
        ] {
            assert!(json.contains(field), "schema missing field: {field}");
        }
    }

    #[test]
    fn typed_tables_sample_parses() {
        let v: FetchArgs = serde_json::from_str(
            r#"{"url":"https://x/","tables":{"mode":"sample","strategy":"head_tail","head":3,"tail":2}}"#,
        )
        .unwrap();
        match v.tables.unwrap() {
            TablesArg::Sample {
                strategy: SampleArg::HeadTail { head, tail },
            } => {
                assert_eq!(head, 3);
                assert_eq!(tail, 2);
            }
            _ => panic!("wrong variant"),
        }
    }

    #[test]
    fn typed_tables_sample_defaults_to_head_tail() {
        let v: FetchArgs =
            serde_json::from_str(r#"{"url":"https://x/","tables":{"mode":"sample"}}"#).unwrap();
        assert!(matches!(
            v.tables,
            Some(TablesArg::Sample {
                strategy: SampleArg::HeadTail { head: 5, tail: 5 }
            })
        ));
    }

    #[test]
    fn typed_tables_random_seed_uses_defaults() {
        let v: FetchArgs = serde_json::from_str(
            r#"{"url":"https://x/","tables":{"mode":"sample","strategy":"random_seed"}}"#,
        )
        .unwrap();
        assert!(matches!(
            v.tables,
            Some(TablesArg::Sample {
                strategy: SampleArg::RandomSeed { rows: 10, seed: 42 }
            })
        ));
    }

    #[test]
    fn typed_tables_round_trips_through_serde() {
        let original = TablesArg::Sample {
            strategy: SampleArg::RandomSeed { rows: 7, seed: 9 },
        };
        let json = serde_json::to_string(&original).unwrap();
        let back: TablesArg = serde_json::from_str(&json).unwrap();
        assert!(matches!(
            back,
            TablesArg::Sample {
                strategy: SampleArg::RandomSeed { rows: 7, seed: 9 }
            }
        ));
    }

    #[test]
    fn typed_tables_rejects_unknown_field() {
        let r: Result<FetchArgs, _> =
            serde_json::from_str(r#"{"url":"https://x/","tables":{"mode":"embed","bogus":1}}"#);
        assert!(r.is_err());
    }

    #[test]
    fn tables_mode_rejects_zero_sized_samples() {
        let zero_head = TablesArg::Sample {
            strategy: SampleArg::HeadTail { head: 0, tail: 1 },
        };
        let zero_tail = TablesArg::Sample {
            strategy: SampleArg::HeadTail { head: 1, tail: 0 },
        };
        let zero_rows = TablesArg::Sample {
            strategy: SampleArg::RandomSeed { rows: 0, seed: 1 },
        };
        for bad in [&zero_head, &zero_tail, &zero_rows] {
            assert!(matches!(
                tables_mode(Some(bad)),
                Err(McpError::InvalidArgs(_))
            ));
        }
    }

    #[test]
    fn omitted_tables_and_images_use_defaults() {
        assert!(matches!(tables_mode(None), Ok(TablesMode::Embed)));
        assert!(matches!(
            images_mode(None),
            Ok((ImagesMode::AltTextOnly, None))
        ));
    }

    #[test]
    fn typed_images_download_parses() {
        let v: FetchArgs =
            serde_json::from_str(r#"{"url":"https://x/","images":{"mode":"download"}}"#).unwrap();
        assert!(matches!(v.images, Some(ImagesArg::Download)));
    }

    #[test]
    fn typed_images_caption_parses_without_captioner() {
        let v: FetchArgs =
            serde_json::from_str(r#"{"url":"https://x/","images":{"mode":"caption"}}"#).unwrap();
        assert!(matches!(
            v.images,
            Some(ImagesArg::Caption { captioner: None })
        ));
    }

    #[test]
    fn typed_images_caption_parses_with_captioner_override() {
        let v: FetchArgs = serde_json::from_str(
            r#"{"url":"https://x/","images":{"mode":"caption","captioner":"gpt4o"}}"#,
        )
        .unwrap();
        assert!(matches!(
            v.images,
            Some(ImagesArg::Caption { captioner: Some(ref s) }) if s == "gpt4o"
        ));
    }

    #[test]
    fn typed_metadata_skip_parses() {
        let v: FetchArgs =
            serde_json::from_str(r#"{"url":"https://x/","metadata":"skip"}"#).unwrap();
        assert!(matches!(v.metadata, Some(MetadataArg::Skip)));
    }
}