1use crate::{
2 error::ScrapeError,
3 search::SearchProvider,
4 types::{ScrapeOptions, ScrapeRequest, SearchRequest, SearchResponse},
5 validation,
6};
7use axum::Json;
8use futures::{stream, StreamExt};
9use std::env;
10use std::sync::Arc;
11use tracing::{error, info, warn};
12
13pub async fn search_handler(
15 Json(request): Json<SearchRequest>,
16) -> Result<Json<SearchResponse>, ScrapeError> {
17 info!("Search request received for query: {}", request.query);
18
19 validation::validate_search_request(&request)?;
21
22 let provider = SearchProvider::new().map_err(|e| {
24 error!("Failed to create search provider: {}", e);
25 e
26 })?;
27
28 let mut results = provider
30 .search_duckduckgo(&request.query, request.limit)
31 .await
32 .map_err(|e| {
33 error!("Search failed: {}", e);
34 e
35 })?;
36
37 info!("Found {} search results", results.len());
38
39 if request.scrape_results {
41 info!("Scraping {} search results in parallel", results.len());
42
43 let max_parallel = env::var("MAX_PARALLEL_SCRAPES")
45 .ok()
46 .and_then(|v| v.parse::<usize>().ok())
47 .unwrap_or(5);
48
49 info!("Using max_parallel_scrapes = {}", max_parallel);
50
51 let scrape_options = request.scrape_options.as_ref();
53
54 let provider = Arc::new(provider);
56
57 let start_time = std::time::Instant::now();
59
60 results = stream::iter(results)
61 .map(|result| {
62 let scrape_req = build_scrape_request(&result.url, scrape_options);
63 let provider = Arc::clone(&provider);
64 async move { provider.scrape_result(result, &scrape_req).await }
65 })
66 .buffer_unordered(max_parallel) .collect::<Vec<_>>()
68 .await;
69
70 let elapsed = start_time.elapsed();
71 let success_count = results.iter().filter(|r| r.content.is_some()).count();
72 let failure_count = results.len() - success_count;
73
74 info!(
75 "Scraping complete: {} successful, {} failed in {:.2}s ({:.2}s avg per result)",
76 success_count,
77 failure_count,
78 elapsed.as_secs_f64(),
79 elapsed.as_secs_f64() / results.len() as f64
80 );
81
82 if failure_count > 0 {
83 warn!(
84 "{} of {} scrapes failed (returning partial results)",
85 failure_count,
86 results.len()
87 );
88 }
89 }
90
91 Ok(Json(SearchResponse::success(results)))
92}
93
94fn build_scrape_request(url: &str, options: Option<&ScrapeOptions>) -> ScrapeRequest {
96 let opts = options.cloned().unwrap_or_else(|| ScrapeOptions {
97 formats: vec!["markdown".to_string()],
98 only_main_content: true,
99 timeout: 10000,
100 });
101
102 ScrapeRequest {
103 url: url.to_string(),
104 formats: opts.formats,
105 headers: Default::default(),
106 include_tags: vec![],
107 exclude_tags: vec![],
108 only_main_content: opts.only_main_content,
109 timeout: opts.timeout,
110 wait_for: 0,
111 remove_base64_images: true,
112 skip_tls_verification: false,
113 engine: "http".to_string(), wait_for_selector: None,
115 actions: vec![],
116 screenshot: false,
117 screenshot_format: "png".to_string(),
118 }
119}
120
#[cfg(test)]
mod tests {
    use super::*;

    // URL shared by the request-building tests.
    const TARGET: &str = "https://example.com";

    /// With no options supplied, the built request uses the fallback settings.
    #[test]
    fn test_build_scrape_request_default() {
        let built = build_scrape_request(TARGET, None);

        assert_eq!(built.url, TARGET);
        assert_eq!(built.formats, vec!["markdown"]);
        assert_eq!(built.timeout, 10000);
        assert!(built.only_main_content);
    }

    /// Caller-supplied options are copied into the built request.
    #[test]
    fn test_build_scrape_request_custom() {
        let custom = ScrapeOptions {
            formats: vec![String::from("html"), String::from("markdown")],
            only_main_content: false,
            timeout: 5000,
        };

        let built = build_scrape_request(TARGET, Some(&custom));

        assert_eq!(built.formats, vec!["html", "markdown"]);
        assert_eq!(built.timeout, 5000);
        assert!(!built.only_main_content);
    }

    /// A request with an empty query is expected to come back as an error.
    #[tokio::test]
    async fn test_search_handler_empty_query() {
        let blank = SearchRequest {
            query: String::new(),
            limit: 10,
            scrape_results: false,
            scrape_options: None,
        };

        let outcome = search_handler(Json(blank)).await;

        assert!(outcome.is_err());
    }
}