Skip to main content

essence/api/
search.rs

1use crate::{
2    error::ScrapeError,
3    search::SearchProvider,
4    types::{ScrapeOptions, ScrapeRequest, SearchRequest, SearchResponse},
5    validation,
6};
7use axum::Json;
8use futures::{stream, StreamExt};
9use std::env;
10use std::sync::Arc;
11use tracing::{error, info, warn};
12
13/// Handler for POST /api/v1/search
14pub async fn search_handler(
15    Json(request): Json<SearchRequest>,
16) -> Result<Json<SearchResponse>, ScrapeError> {
17    info!("Search request received for query: {}", request.query);
18
19    // Validate request
20    validation::validate_search_request(&request)?;
21
22    // Create search provider
23    let provider = SearchProvider::new().map_err(|e| {
24        error!("Failed to create search provider: {}", e);
25        e
26    })?;
27
28    // Perform search
29    let mut results = provider
30        .search_duckduckgo(&request.query, request.limit)
31        .await
32        .map_err(|e| {
33            error!("Search failed: {}", e);
34            e
35        })?;
36
37    info!("Found {} search results", results.len());
38
39    // Optionally scrape each result
40    if request.scrape_results {
41        info!("Scraping {} search results in parallel", results.len());
42
43        // Get max parallel scrapes from environment (default: 5)
44        let max_parallel = env::var("MAX_PARALLEL_SCRAPES")
45            .ok()
46            .and_then(|v| v.parse::<usize>().ok())
47            .unwrap_or(5);
48
49        info!("Using max_parallel_scrapes = {}", max_parallel);
50
51        // Build scrape request from options
52        let scrape_options = request.scrape_options.as_ref();
53
54        // Create a single shared provider (more efficient than creating one per result)
55        let provider = Arc::new(provider);
56
57        // Scrape results in parallel with chunked buffering
58        let start_time = std::time::Instant::now();
59
60        results = stream::iter(results)
61            .map(|result| {
62                let scrape_req = build_scrape_request(&result.url, scrape_options);
63                let provider = Arc::clone(&provider);
64                async move { provider.scrape_result(result, &scrape_req).await }
65            })
66            .buffer_unordered(max_parallel) // Process max_parallel requests concurrently
67            .collect::<Vec<_>>()
68            .await;
69
70        let elapsed = start_time.elapsed();
71        let success_count = results.iter().filter(|r| r.content.is_some()).count();
72        let failure_count = results.len() - success_count;
73
74        info!(
75            "Scraping complete: {} successful, {} failed in {:.2}s ({:.2}s avg per result)",
76            success_count,
77            failure_count,
78            elapsed.as_secs_f64(),
79            elapsed.as_secs_f64() / results.len() as f64
80        );
81
82        if failure_count > 0 {
83            warn!(
84                "{} of {} scrapes failed (returning partial results)",
85                failure_count,
86                results.len()
87            );
88        }
89    }
90
91    Ok(Json(SearchResponse::success(results)))
92}
93
94/// Build a ScrapeRequest from URL and options
95fn build_scrape_request(url: &str, options: Option<&ScrapeOptions>) -> ScrapeRequest {
96    let opts = options.cloned().unwrap_or_else(|| ScrapeOptions {
97        formats: vec!["markdown".to_string()],
98        only_main_content: true,
99        timeout: 10000,
100    });
101
102    ScrapeRequest {
103        url: url.to_string(),
104        formats: opts.formats,
105        headers: Default::default(),
106        include_tags: vec![],
107        exclude_tags: vec![],
108        only_main_content: opts.only_main_content,
109        timeout: opts.timeout,
110        wait_for: 0,
111        remove_base64_images: true,
112        skip_tls_verification: false,
113        engine: "http".to_string(), // Use HTTP for speed
114        wait_for_selector: None,
115        actions: vec![],
116        screenshot: false,
117        screenshot_format: "png".to_string(),
118    }
119}
120
#[cfg(test)]
mod tests {
    use super::*;

    /// Without options, the built request carries the fallback settings.
    #[test]
    fn test_build_scrape_request_default() {
        let built = build_scrape_request("https://example.com", None);

        assert_eq!(built.url, "https://example.com");
        assert!(built.only_main_content);
        assert_eq!(built.timeout, 10000);
        assert_eq!(built.formats, vec!["markdown"]);
    }

    /// Caller-supplied options are copied into the built request.
    #[test]
    fn test_build_scrape_request_custom() {
        let custom = ScrapeOptions {
            formats: vec![String::from("html"), String::from("markdown")],
            only_main_content: false,
            timeout: 5000,
        };

        let built = build_scrape_request("https://example.com", Some(&custom));

        assert!(!built.only_main_content);
        assert_eq!(built.timeout, 5000);
        assert_eq!(built.formats, vec!["html", "markdown"]);
    }

    /// A request with an empty query must produce an error from the handler.
    #[tokio::test]
    async fn test_search_handler_empty_query() {
        let empty_query = SearchRequest {
            query: String::new(),
            limit: 10,
            scrape_results: false,
            scrape_options: None,
        };

        let outcome = search_handler(Json(empty_query)).await;
        assert!(outcome.is_err());
    }
}
159}