// daedra 0.3.0 - Docs.rs

//! Common types and data structures used throughout Daedra.
//!
//! This module contains all the shared types, including:
//! - Search arguments and results
//! - Page-visit and site-crawl arguments and results
//! - Error types
//! - Configuration structures
//! - JSON Schemas for the tool arguments

use serde::{Deserialize, Serialize};
use thiserror::Error;

/// Result type alias for Daedra operations.
///
/// Shorthand for `Result<T, DaedraError>`, so fallible functions can write
/// `DaedraResult<T>` instead of repeating the error type.
pub type DaedraResult<T> = Result<T, DaedraError>;

/// Errors that can occur during Daedra operations.
///
/// The `Display` text of each variant comes from its `#[error(...)]`
/// attribute. Variants carrying a `#[from]` field also get a `From`
/// conversion from the wrapped error type, so those errors can be
/// propagated with `?`.
#[derive(Error, Debug)]
pub enum DaedraError {
    /// HTTP request failed (wraps `reqwest::Error`)
    #[error("HTTP request failed: {0}")]
    HttpError(#[from] reqwest::Error),

    /// URL parsing failed (wraps `url::ParseError`)
    #[error("Invalid URL: {0}")]
    UrlParseError(#[from] url::ParseError),

    /// JSON serialization/deserialization failed (wraps `serde_json::Error`)
    #[error("JSON error: {0}")]
    JsonError(#[from] serde_json::Error),

    /// Search operation failed; the string is included in the message
    #[error("Search failed: {0}")]
    SearchError(String),

    /// Page fetch failed; the string is included in the message
    #[error("Failed to fetch page: {0}")]
    FetchError(String),

    /// Invalid arguments provided; the string is included in the message
    #[error("Invalid arguments: {0}")]
    InvalidArguments(String),

    /// Server error; the string is included in the message
    #[error("Server error: {0}")]
    ServerError(String),

    /// IO error (wraps `std::io::Error`)
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),

    /// Content extraction failed; the string is included in the message
    #[error("Content extraction failed: {0}")]
    ExtractionError(String),

    /// Unsupported content type for page extraction; the string is included
    /// in the message
    #[error("Unsupported content type: {0}")]
    UnsupportedContentType(String),

    /// Rate limit exceeded
    #[error("Rate limit exceeded, please try again later")]
    RateLimitExceeded,

    /// Bot protection detected
    #[error("Bot protection detected on target page")]
    BotProtectionDetected,

    /// Timeout occurred
    #[error("Operation timed out")]
    Timeout,
}

/// Safe search filtering levels.
///
/// Serde reads and writes the variants by their uppercase names (`"OFF"`,
/// `"MODERATE"`, `"STRICT"`). `Default` yields [`SafeSearchLevel::Moderate`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "UPPERCASE")]
pub enum SafeSearchLevel {
    /// No filtering
    Off,
    /// Moderate filtering (default)
    #[default]
    Moderate,
    /// Strict filtering
    Strict,
}

impl SafeSearchLevel {
    /// Convert to DuckDuckGo safe search parameter value
    pub fn to_ddg_value(&self) -> i32 {
        match self {
            SafeSearchLevel::Off => -2,
            SafeSearchLevel::Moderate => -1,
            SafeSearchLevel::Strict => 1,
        }
    }
}

impl std::fmt::Display for SafeSearchLevel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            SafeSearchLevel::Off => write!(f, "OFF"),
            SafeSearchLevel::Moderate => write!(f, "MODERATE"),
            SafeSearchLevel::Strict => write!(f, "STRICT"),
        }
    }
}

impl std::str::FromStr for SafeSearchLevel {
    type Err = DaedraError;

    /// Parses a level name, ignoring case.
    ///
    /// Accepts `OFF`, `MODERATE` and `STRICT` in any letter case.
    ///
    /// # Errors
    ///
    /// Returns [`DaedraError::InvalidArguments`] quoting the original input
    /// when it names no known level.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let normalized = s.to_uppercase();
        let level = match normalized.as_str() {
            "OFF" => Self::Off,
            "MODERATE" => Self::Moderate,
            "STRICT" => Self::Strict,
            _ => {
                let message = format!("Invalid safe search level: {}", s);
                return Err(DaedraError::InvalidArguments(message));
            }
        };
        Ok(level)
    }
}

/// Options for search operations.
///
/// When deserializing, a missing `region`, `safe_search` or `num_results`
/// falls back to its default (`"wt-wt"`, `Moderate` and `10` respectively).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchOptions {
    /// Region for search results (e.g., "us-en", "zh-cn")
    #[serde(default = "default_region")]
    pub region: String,

    /// Safe search filtering level
    #[serde(default)]
    pub safe_search: SafeSearchLevel,

    /// Maximum number of results to return
    #[serde(default = "default_num_results")]
    pub num_results: usize,

    /// Time range filter (e.g., "d" for day, "w" for week, "m" for month).
    /// Left out of the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub time_range: Option<String>,
}

impl Default for SearchOptions {
    fn default() -> Self {
        Self {
            region: "wt-wt".to_string(),
            safe_search: SafeSearchLevel::Moderate,
            num_results: 10,
            time_range: None,
        }
    }
}

/// Region used when none is given: `"wt-wt"` (worldwide).
fn default_region() -> String {
    String::from("wt-wt")
}

/// Number of results requested when the caller does not specify one.
fn default_num_results() -> usize {
    const DEFAULT_NUM_RESULTS: usize = 10;
    DEFAULT_NUM_RESULTS
}

/// Arguments for the search tool.
///
/// Only `query` is mandatory.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchArgs {
    /// The search query string
    pub query: String,

    /// Optional search configuration; left out of the serialized output
    /// when `None`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub options: Option<SearchOptions>,
}

/// Arguments for the visit_page tool.
///
/// Only `url` is mandatory.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VisitPageArgs {
    /// URL of the page to visit
    pub url: String,

    /// Optional CSS selector to target specific content; left out of the
    /// serialized output when `None`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub selector: Option<String>,

    /// Whether to include images in the response; `false` when the field is
    /// missing from the input
    #[serde(default)]
    pub include_images: bool,
}

/// Content type classification for search results.
///
/// Serde reads and writes the variants by their lowercase names (for example
/// `"documentation"`). `Default` yields [`ContentType::Other`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum ContentType {
    /// Documentation pages
    Documentation,
    /// Social media content
    Social,
    /// News articles
    Article,
    /// Forum discussions
    Forum,
    /// Video content
    Video,
    /// E-commerce/shopping
    Shopping,
    /// Other/unknown content (default)
    #[default]
    Other,
}

/// Metadata for a search result.
///
/// Optional fields are left out of the serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResultMetadata {
    /// Content type classification (serialized under the key `type`)
    #[serde(rename = "type")]
    pub content_type: ContentType,

    /// Source domain
    pub source: String,

    /// Favicon URL if available
    #[serde(skip_serializing_if = "Option::is_none")]
    pub favicon: Option<String>,

    /// Published date if available
    #[serde(skip_serializing_if = "Option::is_none")]
    pub published_date: Option<String>,
}

/// A single search result: title, URL, snippet and classification metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Title of the result
    pub title: String,

    /// URL of the result
    pub url: String,

    /// Description/snippet
    pub description: String,

    /// Additional metadata (content type, source, favicon, published date)
    pub metadata: ResultMetadata,
}

/// Query analysis information attached to a search response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QueryAnalysis {
    /// Detected language of the query, as a short language code such as `"en"`
    pub language: String,

    /// Detected topics in results
    pub topics: Vec<String>,
}

/// Search context information: the options a search was run with.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchContext {
    /// Region used for search
    pub region: String,

    /// Safe search level applied, as text (`SearchResponse::new` stores the
    /// `Display` form of `SafeSearchLevel` here)
    pub safe_search: String,

    /// Number of results requested; left out of the serialized output when
    /// `None`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub num_results: Option<usize>,
}

/// Metadata about the search operation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchMetadata {
    /// Original search query
    pub query: String,

    /// ISO timestamp of when search was conducted (`SearchResponse::new`
    /// stores the current UTC time in RFC 3339 format)
    pub timestamp: String,

    /// Number of results returned
    pub result_count: usize,

    /// Search context information
    pub search_context: SearchContext,

    /// Query analysis results
    pub query_analysis: QueryAnalysis,
}

/// Complete search response: the results plus metadata about the search.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResponse {
    /// Response type discriminator, serialized under the key `type`
    /// (`SearchResponse::new` sets it to `"search_results"`)
    #[serde(rename = "type")]
    pub response_type: String,

    /// Array of search results
    pub data: Vec<SearchResult>,

    /// Search metadata
    pub metadata: SearchMetadata,
}

impl SearchResponse {
    /// Build a response for `query` from `results` and the `options` used.
    ///
    /// The response is stamped with the current UTC time (RFC 3339), records
    /// how many results there are and which options were in effect, and
    /// carries the detected query language and result topics.
    pub fn new(query: String, results: Vec<SearchResult>, options: &SearchOptions) -> Self {
        let timestamp = chrono::Utc::now().to_rfc3339();

        // Echo back the options the search ran with.
        let search_context = SearchContext {
            region: options.region.clone(),
            safe_search: options.safe_search.to_string(),
            num_results: Some(options.num_results),
        };

        // Language comes from the query text, topics from the results.
        let query_analysis = QueryAnalysis {
            language: detect_language(&query),
            topics: detect_topics(&results),
        };

        let metadata = SearchMetadata {
            query,
            timestamp,
            result_count: results.len(),
            search_context,
            query_analysis,
        };

        Self {
            response_type: String::from("search_results"),
            data: results,
            metadata,
        }
    }
}

/// Result of visiting a page.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageContent {
    /// URL of the page
    pub url: String,

    /// Page title
    pub title: String,

    /// Extracted content in Markdown format
    pub content: String,

    /// ISO timestamp of when page was fetched
    pub timestamp: String,

    /// Word count of extracted content
    pub word_count: usize,

    /// Links found on the page; left out of the serialized output when `None`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub links: Option<Vec<PageLink>>,
}

/// A link found on a page, as a text/URL pair.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageLink {
    /// Link text
    pub text: String,

    /// Link URL
    pub url: String,
}

/// Arguments for the `crawl_site` tool.
///
/// `max_pages` is clamped to `[1, 500]` and `concurrency` to `[1, 16]`
/// inside `crawl::crawl_site` — the declared maxima here are advisory.
///
/// Only `root_url` is mandatory; when missing from the input, `max_pages`
/// defaults to `25` and `concurrency` to `4`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlArgs {
    /// The root URL of the site to crawl
    pub root_url: String,

    /// Upper bound on the number of pages to fetch (default: 25)
    #[serde(default = "default_crawl_max_pages")]
    pub max_pages: usize,

    /// Maximum number of concurrent fetches (default: 4)
    #[serde(default = "default_crawl_concurrency")]
    pub concurrency: usize,
}

/// Page budget used when `max_pages` is missing from the input.
fn default_crawl_max_pages() -> usize {
    const DEFAULT_MAX_PAGES: usize = 25;
    DEFAULT_MAX_PAGES
}
/// Concurrency used when `concurrency` is missing from the input.
fn default_crawl_concurrency() -> usize {
    const DEFAULT_CONCURRENCY: usize = 4;
    DEFAULT_CONCURRENCY
}

/// A single page fetched by `crawl_site`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawledPage {
    /// Absolute URL of the fetched page
    pub url: String,

    /// Extracted page title (may be empty)
    pub title: String,

    /// Content converted to Markdown
    pub markdown: String,

    /// Outbound link URLs discovered on the page (URLs only, no link text)
    pub links: Vec<String>,
}

/// Error record for a URL that could not be fetched during a crawl.
///
/// The failure is kept as plain text rather than as a `DaedraError`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlError {
    /// URL that failed
    pub url: String,

    /// Human-readable error reason
    pub error: String,
}

/// Summary of crawl activity — counts only, no content.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlSummary {
    /// Page budget requested by the caller (post-clamp)
    pub requested: usize,

    /// Number of pages successfully fetched
    pub fetched: usize,

    /// Number of URLs that errored out
    pub failed: usize,
}

/// Return value of `crawl_site`.
///
/// Bundles the fetched pages, the per-URL failures and a counts-only summary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlResult {
    /// The normalized root URL that was crawled
    pub root_url: String,

    /// Whether a sitemap.xml (or alias) was found and used
    pub sitemap_found: bool,

    /// Counts-only activity summary
    pub summary: CrawlSummary,

    /// Successfully fetched pages
    pub pages: Vec<CrawledPage>,

    /// Per-URL errors that were silently dropped during the batch
    pub errors: Vec<CrawlError>,
}

/// A language code together with the Unicode ranges that identify it.
struct LangRange {
    /// Language code reported when a character falls inside `ranges`
    lang: &'static str,
    /// Inclusive `(start, end)` character ranges
    ranges: &'static [(char, char)],
}

/// Script ranges checked by [`detect_language`], in priority order.
///
/// Japanese is listed before Chinese on purpose: Japanese text normally mixes
/// kana with kanji, and kanji fall inside the same U+4E00..=U+9FFF range used
/// to recognise Chinese. Checking kana first keeps such text from being
/// reported as `"zh"`.
const LANG_RANGES: &[LangRange] = &[
    LangRange {
        lang: "ja",
        ranges: &[('\u{3040}', '\u{30ff}')],
    },
    LangRange {
        lang: "zh",
        ranges: &[('\u{4e00}', '\u{9fff}')],
    },
    LangRange {
        lang: "ko",
        ranges: &[('\u{ac00}', '\u{d7af}')],
    },
    LangRange {
        lang: "ru",
        ranges: &[('\u{0400}', '\u{04ff}')],
    },
    LangRange {
        lang: "ar",
        ranges: &[('\u{0600}', '\u{06ff}')],
    },
];

/// Detect language of a query using simple heuristics.
///
/// Returns the code of the first [`LANG_RANGES`] entry for which the query
/// contains at least one character inside the entry's ranges, or `"en"` when
/// nothing matches (including for an empty query).
///
/// This looks at scripts only. Text written purely in kanji is still reported
/// as `"zh"`, and any Cyrillic text is reported as `"ru"`.
fn detect_language(query: &str) -> String {
    let in_script = |entry: &LangRange, c: char| {
        entry
            .ranges
            .iter()
            .any(|&(start, end)| (start..=end).contains(&c))
    };

    LANG_RANGES
        .iter()
        .find(|entry| query.chars().any(|c| in_script(entry, c)))
        .map_or_else(|| "en".to_string(), |entry| entry.lang.to_string())
}

/// One topic classification rule used by `detect_topics`.
struct TopicRule {
    /// Topic label added to the output when the rule matches
    topic: &'static str,
    /// Substrings looked for in the lowercased result URL
    url_patterns: &'static [&'static str],
    /// Substrings looked for in the lowercased result title
    title_patterns: &'static [&'static str],
    /// Content type compared with the result's `metadata.content_type`, if any
    content_type: Option<ContentType>,
}

/// Topic classification rules consulted by `detect_topics`.
///
/// URL and title patterns are written in lowercase because they are compared
/// against lowercased URLs and titles.
const TOPIC_RULES: &[TopicRule] = &[
    // Code hosting / Q&A sites and programming-related titles.
    TopicRule {
        topic: "technology",
        url_patterns: &["github.com", "stackoverflow.com", "gitlab.com"],
        title_patterns: &["programming", "code"],
        content_type: None,
    },
    // Documentation hosts/paths and documentation-like titles.
    TopicRule {
        topic: "documentation",
        url_patterns: &["docs.", "/docs/", "/documentation/"],
        title_patterns: &["documentation", "api reference"],
        content_type: None,
    },
    // News hosts/paths; the only rule that carries a content type.
    TopicRule {
        topic: "news",
        url_patterns: &["news.", "/news/"],
        title_patterns: &[],
        content_type: Some(ContentType::Article),
    },
    // Academic hosts and research-like titles.
    TopicRule {
        topic: "academic",
        url_patterns: &[".edu", "arxiv.org", "scholar.google"],
        title_patterns: &["research", "study"],
        content_type: None,
    },
];

/// Detect topics from search results
fn detect_topics(results: &[SearchResult]) -> Vec<String> {
    let mut topics = std::collections::HashSet::new();
    for result in results {
        let lower_url = result.url.to_lowercase();
        let lower_title = result.title.to_lowercase();
        for rule in TOPIC_RULES {
            let url_match = rule.url_patterns.iter().any(|p| lower_url.contains(p));
            let title_match = rule
                .title_patterns
                .iter()
                .any(|p| lower_title.contains(p));
            let type_match = rule
                .content_type
                .map_or(true, |ct| result.metadata.content_type == ct);
            if url_match || title_match || type_match {
                topics.insert(rule.topic.to_string());
            }
        }
    }
    topics.into_iter().collect()
}

/// JSON Schema for search arguments (used for MCP tool definition).
///
/// Describes an object with a required `query` string and an optional
/// `options` object holding `region`, `safe_search`, `num_results` and
/// `time_range`.
pub fn search_args_schema() -> serde_json::Value {
    // Schema of the nested `options` object, built separately for readability.
    let options_schema = serde_json::json!({
        "type": "object",
        "description": "Optional search configuration",
        "properties": {
            "region": {
                "type": "string",
                "description": "Region for search results (e.g., 'us-en', 'wt-wt' for worldwide)",
                "default": "wt-wt"
            },
            "safe_search": {
                "type": "string",
                "enum": ["OFF", "MODERATE", "STRICT"],
                "description": "Safe search filtering level",
                "default": "MODERATE"
            },
            "num_results": {
                "type": "integer",
                "description": "Maximum number of results to return",
                "default": 10,
                "minimum": 1,
                "maximum": 50
            },
            "time_range": {
                "type": "string",
                "description": "Time range filter (d=day, w=week, m=month, y=year)"
            }
        }
    });

    serde_json::json!({
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "The search query string"
            },
            "options": options_schema
        },
        "required": ["query"]
    })
}

/// JSON Schema for visit_page arguments.
///
/// Describes an object with a required `url` and the optional `selector`
/// and `include_images` fields.
pub fn visit_page_args_schema() -> serde_json::Value {
    // Per-field schemas, built separately for readability.
    let properties = serde_json::json!({
        "url": {
            "type": "string",
            "format": "uri",
            "description": "URL of the page to visit"
        },
        "selector": {
            "type": "string",
            "description": "Optional CSS selector to target specific content"
        },
        "include_images": {
            "type": "boolean",
            "description": "Whether to include image references in the response",
            "default": false
        }
    });

    serde_json::json!({
        "type": "object",
        "properties": properties,
        "required": ["url"]
    })
}

/// JSON Schema for crawl_site arguments.
///
/// Describes an object with a required `root_url` and the optional
/// `max_pages` and `concurrency` integers. The declared bounds mirror the
/// clamping ranges documented on `CrawlArgs` (`[1, 500]` pages and
/// `[1, 16]` concurrent fetches).
pub fn crawl_args_schema() -> serde_json::Value {
    serde_json::json!({
        "type": "object",
        "properties": {
            "root_url": {
                "type": "string",
                "format": "uri",
                "description": "Root URL of the site to crawl (sitemap or homepage)"
            },
            "max_pages": {
                "type": "integer",
                "description": "Maximum number of pages to fetch (default: 25)",
                "default": 25,
                "minimum": 1,
                "maximum": 500
            },
            "concurrency": {
                "type": "integer",
                "description": "Maximum concurrent fetches (default: 4)",
                "default": 4,
                "minimum": 1,
                "maximum": 16
            }
        },
        "required": ["root_url"]
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Every level name parses, and parsing ignores letter case.
    #[test]
    fn test_safe_search_level_parsing() {
        let cases = [
            ("OFF", SafeSearchLevel::Off),
            ("MODERATE", SafeSearchLevel::Moderate),
            ("STRICT", SafeSearchLevel::Strict),
            ("moderate", SafeSearchLevel::Moderate),
        ];

        for &(input, expected) in &cases {
            assert_eq!(input.parse::<SafeSearchLevel>().unwrap(), expected);
        }
    }

    /// Each level maps to its DuckDuckGo parameter value.
    #[test]
    fn test_safe_search_ddg_value() {
        let cases = [
            (SafeSearchLevel::Off, -2),
            (SafeSearchLevel::Moderate, -1),
            (SafeSearchLevel::Strict, 1),
        ];

        for &(level, expected) in &cases {
            assert_eq!(level.to_ddg_value(), expected);
        }
    }

    /// One sample query per supported script, plus the English fallback.
    #[test]
    fn test_language_detection() {
        let cases = [
            ("hello world", "en"),
            ("你好世界", "zh"),
            ("こんにちは", "ja"),
            ("안녕하세요", "ko"),
            ("привет", "ru"),
        ];

        for &(query, expected) in &cases {
            assert_eq!(detect_language(query), expected);
        }
    }

    /// The search schema exposes both top-level properties as objects.
    #[test]
    fn test_search_args_schema() {
        let schema = search_args_schema();
        let properties = &schema["properties"];
        assert!(properties["query"].is_object());
        assert!(properties["options"].is_object());
    }

    /// A response built from one result keeps the query, the result and the
    /// fixed response type.
    #[test]
    fn test_search_response_creation() {
        let metadata = ResultMetadata {
            content_type: ContentType::Article,
            source: "example.com".to_string(),
            favicon: None,
            published_date: None,
        };
        let result = SearchResult {
            title: "Test".to_string(),
            url: "https://example.com".to_string(),
            description: "Test description".to_string(),
            metadata,
        };

        let options = SearchOptions::default();
        let response = SearchResponse::new("test query".to_string(), vec![result], &options);

        assert_eq!(response.response_type, "search_results");
        assert_eq!(response.data.len(), 1);
        assert_eq!(response.metadata.query, "test query");
    }
}