essence/crawler/
mapper.rs1use crate::{
2 crawler::sitemap,
3 error::{Result, ScrapeError},
4 types::MapRequest,
5};
6use reqwest::Client;
7use scraper::{Html, Selector};
8use std::collections::HashSet;
9use url::Url;
10
11pub async fn discover_urls(url: &str, options: &MapRequest) -> Result<Vec<String>> {
13 let base_url =
14 Url::parse(url).map_err(|e| ScrapeError::InvalidUrl(format!("Invalid URL: {}", e)))?;
15
16 let client = Client::builder()
17 .user_agent("Mozilla/5.0 (compatible; Essence/0.1.0; +https://essence.foundation)")
18 .timeout(std::time::Duration::from_secs(30))
19 .build()
20 .map_err(|e| ScrapeError::Internal(format!("Failed to build HTTP client: {}", e)))?;
21
22 let mut all_urls = HashSet::new();
23
24 if !options.ignore_sitemap.unwrap_or(false) {
26 match sitemap::fetch_sitemap(url, &client).await {
27 Ok(sitemap_urls) => {
28 if !sitemap_urls.is_empty() {
29 tracing::info!("Found {} URLs from sitemap for {}", sitemap_urls.len(), url);
30 all_urls.extend(sitemap_urls);
31 } else {
32 tracing::debug!("No sitemap URLs found for {}", url);
33 }
34 }
35 Err(e) => {
36 tracing::debug!("Sitemap fetch failed for {}: {}", url, e);
37 }
38 }
39 }
40
41 let response = client.get(url).send().await.map_err(|e| {
43 if e.is_timeout() {
44 ScrapeError::Timeout
45 } else {
46 ScrapeError::RequestFailed(e)
47 }
48 })?;
49
50 let html_content = response
51 .text()
52 .await
53 .map_err(|e| ScrapeError::Internal(format!("Failed to read HTML content: {}", e)))?;
54
55 let document = Html::parse_document(&html_content);
57 let link_selector = Selector::parse("a[href]")
58 .map_err(|e| ScrapeError::Internal(format!("Invalid selector: {:?}", e)))?;
59
60 let mut in_page_links = 0;
61 for element in document.select(&link_selector) {
62 if let Some(href) = element.value().attr("href") {
63 if let Ok(absolute_url) = base_url.join(href) {
65 let url_str = absolute_url.to_string();
66
67 if let Some(include_subdomains) = options.include_subdomains {
69 if !include_subdomains {
70 if let (Some(base_host), Some(url_host)) =
72 (base_url.host_str(), absolute_url.host_str())
73 {
74 if base_host != url_host {
75 continue;
76 }
77 }
78 } else {
79 if let (Some(base_host), Some(url_host)) =
81 (base_url.host_str(), absolute_url.host_str())
82 {
83 let base_domain = extract_base_domain(base_host);
84 let url_domain = extract_base_domain(url_host);
85 if base_domain != url_domain {
86 continue;
87 }
88 }
89 }
90 }
91
92 if all_urls.insert(url_str) {
93 in_page_links += 1;
94 }
95 }
96 }
97 }
98
99 tracing::info!(
100 "Found {} in-page links for {} (total unique: {})",
101 in_page_links,
102 url,
103 all_urls.len()
104 );
105
106 let mut filtered_urls: Vec<String> = if let Some(search) = &options.search {
108 all_urls
109 .into_iter()
110 .filter(|url| url.to_lowercase().contains(&search.to_lowercase()))
111 .collect()
112 } else {
113 all_urls.into_iter().collect()
114 };
115
116 filtered_urls.sort();
118
119 let limit = options.limit.unwrap_or(5000) as usize;
121 if filtered_urls.len() > limit {
122 filtered_urls.truncate(limit);
123 }
124
125 Ok(filtered_urls)
126}
127
/// Returns the last two dot-separated labels of `host`
/// (e.g. `"api.blog.example.com"` -> `"example.com"`).
///
/// * Hosts with fewer than two labels (e.g. `"localhost"`) are returned whole.
/// * IPv4 literals are returned unchanged; truncating them to their last two
///   octets would make unrelated addresses compare equal.
/// * A single trailing root dot (`"example.com."`) is ignored, so the
///   fully-qualified and plain spellings of a host yield the same value.
///
/// This is a purely textual heuristic with no public-suffix awareness:
/// `"shop.example.co.uk"` yields `"co.uk"`.
fn extract_base_domain(host: &str) -> &str {
    if host.parse::<std::net::Ipv4Addr>().is_ok() {
        return host;
    }

    let name = host.strip_suffix('.').unwrap_or(host);

    // Walking dots from the right, the second one found marks the start of
    // the two-label suffix; with fewer than two dots the whole name is kept.
    match name.rmatch_indices('.').nth(1) {
        Some((dot_index, _)) => &name[dot_index + 1..],
        None => name,
    }
}
137
#[cfg(test)]
mod tests {
    use super::*;

    /// `extract_base_domain` keeps the last two labels of a host and returns
    /// single-label hosts unchanged.
    #[test]
    fn test_extract_base_domain() {
        let cases = [
            ("example.com", "example.com"),
            ("blog.example.com", "example.com"),
            ("api.blog.example.com", "example.com"),
            ("localhost", "localhost"),
        ];

        for &(host, expected) in cases.iter() {
            assert_eq!(extract_base_domain(host), expected, "host: {}", host);
        }
    }

    /// Exercises the two host comparisons used when filtering in-page links:
    /// exact host equality and base-domain equality.
    #[test]
    fn test_url_filtering() {
        let base_url = Url::parse("https://example.com").unwrap();

        let url_same_domain = Url::parse("https://example.com/page").unwrap();
        let url_subdomain = Url::parse("https://blog.example.com/page").unwrap();
        let url_different = Url::parse("https://different.com/page").unwrap();

        let base_host = base_url.host_str().unwrap();
        let same_host = url_same_domain.host_str().unwrap();
        let sub_host = url_subdomain.host_str().unwrap();
        let other_host = url_different.host_str().unwrap();

        // Exact host comparison: only the identical host matches.
        assert_eq!(base_host, same_host);
        assert_ne!(base_host, sub_host);
        assert_ne!(base_host, other_host);

        // Base-domain comparison: the subdomain now matches, while the
        // unrelated domain still does not.
        let base_domain = extract_base_domain(base_host);
        assert_eq!(base_domain, extract_base_domain(same_host));
        assert_eq!(base_domain, extract_base_domain(sub_host));
        assert_ne!(base_domain, extract_base_domain(other_host));
    }
}