Skip to main content

essence/crawler/
mapper.rs

1use crate::{
2    crawler::sitemap,
3    error::{Result, ScrapeError},
4    types::MapRequest,
5};
6use reqwest::Client;
7use scraper::{Html, Selector};
8use std::collections::HashSet;
9use url::Url;
10
11/// Discover URLs from a given URL
12pub async fn discover_urls(url: &str, options: &MapRequest) -> Result<Vec<String>> {
13    let base_url =
14        Url::parse(url).map_err(|e| ScrapeError::InvalidUrl(format!("Invalid URL: {}", e)))?;
15
16    let client = Client::builder()
17        .user_agent("Mozilla/5.0 (compatible; Essence/0.1.0; +https://essence.foundation)")
18        .timeout(std::time::Duration::from_secs(30))
19        .build()
20        .map_err(|e| ScrapeError::Internal(format!("Failed to build HTTP client: {}", e)))?;
21
22    let mut all_urls = HashSet::new();
23
24    // 1. Try sitemap discovery first (unless explicitly ignored)
25    if !options.ignore_sitemap.unwrap_or(false) {
26        match sitemap::fetch_sitemap(url, &client).await {
27            Ok(sitemap_urls) => {
28                if !sitemap_urls.is_empty() {
29                    tracing::info!("Found {} URLs from sitemap for {}", sitemap_urls.len(), url);
30                    all_urls.extend(sitemap_urls);
31                } else {
32                    tracing::debug!("No sitemap URLs found for {}", url);
33                }
34            }
35            Err(e) => {
36                tracing::debug!("Sitemap fetch failed for {}: {}", url, e);
37            }
38        }
39    }
40
41    // 2. Fetch the page and extract links from HTML (always do this for comprehensive coverage)
42    let response = client.get(url).send().await.map_err(|e| {
43        if e.is_timeout() {
44            ScrapeError::Timeout
45        } else {
46            ScrapeError::RequestFailed(e)
47        }
48    })?;
49
50    let html_content = response
51        .text()
52        .await
53        .map_err(|e| ScrapeError::Internal(format!("Failed to read HTML content: {}", e)))?;
54
55    // Parse HTML and extract links
56    let document = Html::parse_document(&html_content);
57    let link_selector = Selector::parse("a[href]")
58        .map_err(|e| ScrapeError::Internal(format!("Invalid selector: {:?}", e)))?;
59
60    let mut in_page_links = 0;
61    for element in document.select(&link_selector) {
62        if let Some(href) = element.value().attr("href") {
63            // Resolve relative URLs
64            if let Ok(absolute_url) = base_url.join(href) {
65                let url_str = absolute_url.to_string();
66
67                // Filter by subdomain option
68                if let Some(include_subdomains) = options.include_subdomains {
69                    if !include_subdomains {
70                        // Only include URLs from the same domain (no subdomains)
71                        if let (Some(base_host), Some(url_host)) =
72                            (base_url.host_str(), absolute_url.host_str())
73                        {
74                            if base_host != url_host {
75                                continue;
76                            }
77                        }
78                    } else {
79                        // Include subdomains - check if it's the same base domain
80                        if let (Some(base_host), Some(url_host)) =
81                            (base_url.host_str(), absolute_url.host_str())
82                        {
83                            let base_domain = extract_base_domain(base_host);
84                            let url_domain = extract_base_domain(url_host);
85                            if base_domain != url_domain {
86                                continue;
87                            }
88                        }
89                    }
90                }
91
92                if all_urls.insert(url_str) {
93                    in_page_links += 1;
94                }
95            }
96        }
97    }
98
99    tracing::info!(
100        "Found {} in-page links for {} (total unique: {})",
101        in_page_links,
102        url,
103        all_urls.len()
104    );
105
106    // 3. Filter by search query if provided
107    let mut filtered_urls: Vec<String> = if let Some(search) = &options.search {
108        all_urls
109            .into_iter()
110            .filter(|url| url.to_lowercase().contains(&search.to_lowercase()))
111            .collect()
112    } else {
113        all_urls.into_iter().collect()
114    };
115
116    // 4. Sort for consistent output
117    filtered_urls.sort();
118
119    // 5. Apply limit
120    let limit = options.limit.unwrap_or(5000) as usize;
121    if filtered_urls.len() > limit {
122        filtered_urls.truncate(limit);
123    }
124
125    Ok(filtered_urls)
126}
127
/// Extract base domain from host (e.g., "blog.example.com" -> "example.com").
///
/// The base domain is taken to be the last two dot-separated labels of `host`.
/// Hosts with fewer than two labels (e.g. "localhost") are returned unchanged.
///
/// Special cases:
/// * An IPv4 literal (e.g. "192.168.1.1") is returned whole, since its dotted
///   parts are not domain labels.
/// * A single trailing root dot ("blog.example.com.") is ignored, so the
///   result is "example.com".
///
/// Note: multi-label public suffixes such as "co.uk" are not recognised;
/// "shop.example.co.uk" yields "co.uk".
fn extract_base_domain(host: &str) -> &str {
    // An IP address has no domain hierarchy; never shorten it to "1.1".
    if host.parse::<std::net::Ipv4Addr>().is_ok() {
        return host;
    }

    // Drop the optional root dot of a fully-qualified name before counting labels.
    let trimmed = host.strip_suffix('.').unwrap_or(host);

    // The second dot from the right marks the start of the last two labels.
    match trimmed.rmatch_indices('.').nth(1) {
        Some((dot_index, _)) => &trimmed[dot_index + 1..],
        None => trimmed,
    }
}
137
#[cfg(test)]
mod tests {
    use super::*;

    /// `extract_base_domain` keeps the last two labels and leaves
    /// single-label hosts untouched.
    #[test]
    fn test_extract_base_domain() {
        let cases = [
            ("example.com", "example.com"),
            ("blog.example.com", "example.com"),
            ("api.blog.example.com", "example.com"),
            ("localhost", "localhost"),
        ];

        for &(host, expected) in cases.iter() {
            assert_eq!(extract_base_domain(host), expected);
        }
    }

    /// Sanity-checks the host comparison that the subdomain filter relies on:
    /// only a URL on the exact same host compares equal to the base host.
    #[test]
    fn test_url_filtering() {
        // Parse a URL and return its host as an owned string.
        let host_of = |raw: &str| -> String {
            let parsed = Url::parse(raw).unwrap();
            parsed.host_str().unwrap().to_string()
        };

        let base_host = host_of("https://example.com");

        // Same domain
        assert_eq!(base_host, host_of("https://example.com/page"));
        // Subdomain
        assert_ne!(base_host, host_of("https://blog.example.com/page"));
        // Unrelated domain
        assert_ne!(base_host, host_of("https://different.com/page"));
    }
}