eggsearch 0.3.1

Lightweight MCP metasearch server for AI agents
Documentation
use std::time::Duration;

use reqwest::Client;
use scraper::Html;

use super::error::EngineError;
use super::models::SearchResult;

const ENGINE: &str = "yahoo";
const YAHOO_URL: &str = "https://search.yahoo.com/search";
const MAX_BODY_BYTES: usize = 2 * 1024 * 1024;

pub async fn search(
    client: &Client,
    query: &str,
    max_results: usize,
    timeout: Duration,
) -> Result<Vec<SearchResult>, EngineError> {
    let response = tokio::time::timeout(
        timeout,
        client
            .get(YAHOO_URL)
            .query(&[("p", query)])
            .header("Cookie", "sB=v=1&vm=p&fl=1&vl=lang_en&pn=10")
            .send(),
    )
    .await
    .map_err(|_| EngineError::Timeout { engine: ENGINE })?
    .map_err(|e| EngineError::Http {
        engine: ENGINE,
        source: e,
    })?;

    if !response.status().is_success() {
        return Err(EngineError::BadStatus {
            engine: ENGINE,
            status: response.status().as_u16(),
        });
    }

    let bytes = response.bytes().await.map_err(|e| EngineError::Http {
        engine: ENGINE,
        source: e,
    })?;
    if bytes.len() > MAX_BODY_BYTES {
        return Err(EngineError::ParseFailed {
            engine: ENGINE,
            reason: format!("response body too large: {} bytes", bytes.len()),
        });
    }
    let body = String::from_utf8_lossy(&bytes).into_owned();

    parse(&body, max_results)
}

fn parse(html: &str, max_results: usize) -> Result<Vec<SearchResult>, EngineError> {
    let document = Html::parse_document(html);

    let result_sel = sel(ENGINE, "div.algo-sr")?;
    let link_sel = sel(ENGINE, "div.compTitle a")?;
    let title_sel = sel(ENGINE, "div.compTitle a h3 span")?;
    let snippet_sel = sel(ENGINE, "div.compText")?;

    let mut results = Vec::new();

    for element in document.select(&result_sel) {
        if results.len() >= max_results {
            break;
        }

        let Some(link_el) = element.select(&link_sel).next() else {
            continue;
        };

        let raw_href = link_el.value().attr("href").unwrap_or("");
        let url = parse_yahoo_url(raw_href);
        if url.is_empty() || !url.starts_with("http") {
            continue;
        }

        let title = element
            .select(&title_sel)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string())
            .unwrap_or_default();
        if title.is_empty() {
            continue;
        }

        let snippet = element
            .select(&snippet_sel)
            .next()
            .map(|el| {
                el.text()
                    .collect::<String>()
                    .split_whitespace()
                    .collect::<Vec<_>>()
                    .join(" ")
            })
            .filter(|s| !s.is_empty());

        results.push(SearchResult {
            title,
            url,
            snippet,
            source_engine: ENGINE.to_string(),
        });
    }

    Ok(results)
}

fn parse_yahoo_url(href: &str) -> String {
    let start = match href.find("/RU=") {
        Some(pos) => {
            let after = pos + 4;
            match href[after..].find("http") {
                Some(off) => after + off,
                None => return href.to_string(),
            }
        }
        None => return href.to_string(),
    };

    let slice = &href[start..];
    let end = ["/RS=", "/RK="]
        .iter()
        .filter_map(|marker| slice.find(marker))
        .min()
        .unwrap_or(slice.len());

    urlencoding::decode(&slice[..end])
        .map(|s| s.into_owned())
        .unwrap_or_else(|_| slice[..end].to_string())
}

fn sel(engine: &'static str, s: &str) -> Result<scraper::Selector, EngineError> {
    scraper::Selector::parse(s).map_err(|e| EngineError::ParseFailed {
        engine,
        reason: format!("invalid selector '{s}': {e:?}"),
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_yahoo_url_redirect() {
        let href =
            "https://r.search.yahoo.com/_ylt=abc/RU=https%3A%2F%2Fwww.rust-lang.org%2F/RS=xyz";
        assert_eq!(parse_yahoo_url(href), "https://www.rust-lang.org/");
    }

    #[test]
    fn test_parse_yahoo_url_direct() {
        let href = "https://example.com";
        assert_eq!(parse_yahoo_url(href), "https://example.com");
    }

    #[test]
    fn test_parse_yahoo_url_rk_ending() {
        let href = "https://r.search.yahoo.com/_ylt=abc/RU=https%3A%2F%2Fexample.com/RK=2/RS=xyz";
        assert_eq!(parse_yahoo_url(href), "https://example.com");
    }

    #[test]
    fn test_parse_extracts_results() {
        let html = r#"
            <div class="algo-sr">
                <div class="compTitle">
                    <a href="https://r.search.yahoo.com/_ylt=abc/RU=https%3A%2F%2Fexample.com/RS=xyz">
                        <h3><span>Example Site</span></h3>
                    </a>
                </div>
                <div class="compText">An example website for testing.</div>
            </div>
            <div class="algo-sr">
                <div class="compTitle">
                    <a href="https://r.search.yahoo.com/_ylt=abc/RU=https%3A%2F%2Frust-lang.org/RS=xyz">
                        <h3><span>Rust Language</span></h3>
                    </a>
                </div>
                <div class="compText">Systems programming language.</div>
            </div>
        "#;

        let results = parse(html, 10).unwrap();
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].title, "Example Site");
        assert_eq!(results[0].url, "https://example.com");
        assert_eq!(results[1].url, "https://rust-lang.org");
        assert!(results[0].snippet.is_some());
    }

    #[test]
    fn test_parse_respects_max_results() {
        let item = r#"
            <div class="algo-sr">
                <div class="compTitle">
                    <a href="https://r.search.yahoo.com/_ylt=x/RU=https%3A%2F%2Fexample.com/RS=y">
                        <h3><span>Title</span></h3>
                    </a>
                </div>
            </div>
        "#;
        let html = item.repeat(5);
        let results = parse(&html, 2).unwrap();
        assert_eq!(results.len(), 2);
    }

    #[test]
    fn test_parse_snippet_optional() {
        let html = r#"
            <div class="algo-sr">
                <div class="compTitle">
                    <a href="https://r.search.yahoo.com/_ylt=x/RU=https%3A%2F%2Fexample.com/RS=y">
                        <h3><span>No Snippet</span></h3>
                    </a>
                </div>
            </div>
        "#;
        let results = parse(html, 10).unwrap();
        assert_eq!(results.len(), 1);
        assert!(results[0].snippet.is_none());
    }

    #[test]
    fn test_parse_skips_non_http_urls() {
        let html = r#"
            <div class="algo-sr">
                <div class="compTitle">
                    <a href="/relative-link">
                        <h3><span>Relative</span></h3>
                    </a>
                </div>
            </div>
            <div class="algo-sr">
                <div class="compTitle">
                    <a href="https://r.search.yahoo.com/_ylt=x/RU=https%3A%2F%2Fvalid.com/RS=y">
                        <h3><span>Valid</span></h3>
                    </a>
                </div>
            </div>
        "#;
        let results = parse(html, 10).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].url, "https://valid.com");
    }
}