research-master 0.1.40

MCP server for searching and downloading academic papers from multiple research sources
Documentation
//! Google Scholar research source implementation.
//!
//! NOTE: This source is DISABLED by default and requires the GOOGLE_SCHOLAR_ENABLED
//! environment variable to be set to "true" to enable it.
//!
//! Google Scholar does not have an official public API. This implementation uses
//! web scraping which may violate Google's Terms of Service. Use at your own risk.
//!
//! This source is primarily intended for research/educational purposes.

use async_trait::async_trait;
use std::sync::Arc;

use crate::models::{Paper, PaperBuilder, SearchQuery, SearchResponse, SourceType};
use crate::sources::{Source, SourceCapabilities, SourceError};
use crate::utils::{api_retry_config, with_retry, HttpClient};

const GOOGLE_SCHOLAR_URL: &str = "https://scholar.google.com";

/// Environment variable to enable Google Scholar scraping
const GOOGLE_SCHOLAR_ENABLED_VAR: &str = "GOOGLE_SCHOLAR_ENABLED";

/// Google Scholar research source
///
/// WARNING: This source requires GOOGLE_SCHOLAR_ENABLED=true to be set.
/// It uses web scraping which may violate Google's Terms of Service.
/// Use at your own risk.
#[derive(Debug, Clone)]
pub struct GoogleScholarSource {
    client: Arc<HttpClient>,
    enabled: bool,
}

impl GoogleScholarSource {
    pub fn new() -> Result<Self, SourceError> {
        let enabled = std::env::var(GOOGLE_SCHOLAR_ENABLED_VAR).unwrap_or_default() == "true";

        if !enabled {
            tracing::info!(
                "Google Scholar source is disabled. Set {} to enable.",
                GOOGLE_SCHOLAR_ENABLED_VAR
            );
        }

        Ok(Self {
            client: Arc::new(HttpClient::new()?),
            enabled,
        })
    }
}

impl Default for GoogleScholarSource {
    fn default() -> Self {
        Self::new().expect("Failed to create GoogleScholarSource")
    }
}

#[async_trait]
impl Source for GoogleScholarSource {
    fn id(&self) -> &str {
        "google_scholar"
    }

    fn name(&self) -> &str {
        "Google Scholar"
    }

    fn capabilities(&self) -> SourceCapabilities {
        if !self.enabled {
            return SourceCapabilities::empty();
        }
        SourceCapabilities::SEARCH | SourceCapabilities::DOI_LOOKUP
    }

    async fn search(&self, query: &SearchQuery) -> Result<SearchResponse, SourceError> {
        if !self.enabled {
            return Err(SourceError::Other(format!(
                "Google Scholar is disabled. Set {} to enable.",
                GOOGLE_SCHOLAR_ENABLED_VAR
            )));
        }

        let max_results = query.max_results.min(10);

        // Note: Direct scraping of Google Scholar is complex due to dynamic content
        // and anti-bot measures. This is a simplified implementation.
        // In practice, you may want to use a service like SerpApi or similar.

        let url = format!(
            "{}?hl=en&q={}&start={}&num={}",
            GOOGLE_SCHOLAR_URL,
            urlencoding::encode(&query.query),
            0,
            max_results
        );

        let client = Arc::clone(&self.client);
        let url_for_retry = url.clone();

        let response = with_retry(api_retry_config(), || {
            let client = Arc::clone(&client);
            let url = url_for_retry.clone();
            async move {
                let request = client.get(&url).header(
                    "User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                );

                let response = request.send().await.map_err(|e| {
                    SourceError::Network(format!("Failed to search Google Scholar: {}", e))
                })?;

                if !response.status().is_success() {
                    let status = response.status();
                    return Err(SourceError::Api(format!(
                        "Google Scholar returned status: {}",
                        status
                    )));
                }

                let text = response
                    .text()
                    .await
                    .map_err(|e| SourceError::Parse(format!("Failed to read response: {}", e)))?;

                Ok(text)
            }
        })
        .await?;

        let papers = self.parse_results(&response, &query.query)?;
        let papers_len = papers.len();
        let mut search_response = SearchResponse::new(papers, "Google Scholar", &query.query);
        search_response.total_results = Some(papers_len);
        Ok(search_response)
    }

    async fn get_by_doi(&self, doi: &str) -> Result<Paper, SourceError> {
        if !self.enabled {
            return Err(SourceError::Other(format!(
                "Google Scholar is disabled. Set {} to enable.",
                GOOGLE_SCHOLAR_ENABLED_VAR
            )));
        }

        // Search by DOI
        let url = format!(
            "{}?hl=en&q={}&btnI",
            GOOGLE_SCHOLAR_URL,
            urlencoding::encode(doi)
        );

        let client = Arc::clone(&self.client);

        let response = client
            .get(&url)
            .header(
                "User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            )
            .send()
            .await
            .map_err(|e| {
                SourceError::Network(format!("Failed to lookup DOI in Google Scholar: {}", e))
            })?;

        if !response.status().is_success() {
            return Err(SourceError::NotFound(format!(
                "Paper not found in Google Scholar: {}",
                doi
            )));
        }

        // For DOI lookups, we return a basic paper with the DOI info
        let clean_doi = doi
            .replace("https://doi.org/", "")
            .replace("doi:", "")
            .trim()
            .to_string();

        Ok(PaperBuilder::new(
            clean_doi.clone(),
            "Paper from Google Scholar".to_string(),
            format!("https://doi.org/{}", clean_doi),
            SourceType::GoogleScholar,
        )
        .doi(doi)
        .build())
    }
}

impl GoogleScholarSource {
    fn parse_results(&self, _html: &str, _query: &str) -> Result<Vec<Paper>, SourceError> {
        // Simple parsing - in practice, you'd want more robust HTML parsing
        // Google Scholar's HTML structure is complex and changes frequently

        // This is a placeholder that returns no results
        // In a real implementation, you would use scraper or similar to parse the HTML
        tracing::warn!("Google Scholar parsing not fully implemented - returning empty results");

        // For a real implementation, you would:
        // 1. Parse the HTML response
        // 2. Extract paper titles, authors, venues, citations
        // 3. Handle pagination
        // 4. Respect rate limits

        Ok(vec![])
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_source_creation() {
        // Should always succeed (even when disabled)
        let source = GoogleScholarSource::new();
        assert!(source.is_ok());
    }

    #[test]
    fn test_source_disabled_by_default() {
        let source = GoogleScholarSource::new();
        assert!(!source.unwrap().enabled);
    }
}