// web_analyzer/geo_analysis.rs
use reqwest::Client;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::Duration;
5
/// Aggregate outcome of a GEO (Generative Engine Optimization) analysis
/// of one domain, as produced by [`analyze_geo`]. Serializable so it can
/// be returned directly from an API or stored.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeoAnalysisResult {
    /// Domain exactly as the caller supplied it (may lack a scheme).
    pub domain: String,
    /// Result of probing the well-known `llms.txt` locations.
    pub llms_txt: LlmsTxtResult,
    /// Result of probing for WebMCP endpoints and HTML hints.
    pub webmcp: WebMcpResult,
    /// Per-AI-bot robots.txt directive summary.
    pub ai_crawler_directives: AiCrawlerResult,
    /// Composite score in 0..=100 derived from the three checks above.
    pub geo_score: u32,
    /// Letter grade derived from `geo_score` (e.g. "A (Excellent)").
    pub geo_grade: String,
}
17
/// Which of the probed `llms.txt` locations responded successfully.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmsTxtResult {
    /// True when at least one path in `files` was found.
    pub found: bool,
    /// Paths (e.g. "/llms.txt") that returned a 2xx with a text content type.
    pub files: Vec<String>,
}
23
/// WebMCP / Model Context Protocol discovery signals for the site.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebMcpResult {
    /// True when any endpoint or HTML feature was detected.
    pub found: bool,
    /// Well-known MCP paths (e.g. "/.well-known/mcp") that returned 2xx.
    pub endpoints: Vec<String>,
    /// Human-readable descriptions of MCP-related markers found in the
    /// landing page HTML (e.g. the `navigator.modelContext` API).
    pub html_features: Vec<String>,
}
30
/// Summary of robots.txt directives as they apply to known AI crawlers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AiCrawlerResult {
    /// Overall posture: "Restrictive" when more than half the known bots
    /// are (partially) blocked, otherwise "Permissive".
    pub status: String,
    /// Per-bot status string: "Blocked", "Partially Blocked", "Allowed",
    /// "Allowed (Implicit)", or "Unknown" (robots.txt unreachable).
    pub bots: HashMap<String, String>,
}
36
/// User-agent product tokens of well-known AI crawlers whose robots.txt
/// treatment is checked by [`analyze_geo`].
const AI_BOTS: &[&str] = &[
    "GPTBot",            // OpenAI crawler
    "ChatGPT-User",      // OpenAI on-demand browsing agent
    "ClaudeBot",         // Anthropic crawler
    "Claude-Web",        // Anthropic web agent
    "Applebot-Extended", // Apple AI-training opt-out token
    "OAI-SearchBot",     // OpenAI search crawler
    "PerplexityBot",     // Perplexity crawler
];
48
49pub async fn analyze_geo(
52 domain: &str,
53) -> Result<GeoAnalysisResult, Box<dyn std::error::Error + Send + Sync>> {
54 let base_url = if domain.starts_with("http") {
55 domain.to_string()
56 } else {
57 format!("https://{}", domain)
58 };
59
60 let client = Client::builder()
61 .timeout(Duration::from_secs(10))
62 .danger_accept_invalid_certs(true)
63 .build()?;
64
65 let llms_paths = ["/llms.txt", "/llms-full.txt", "/.well-known/llms.txt"];
67 let mut llms_found = Vec::new();
68 for path in &llms_paths {
69 let url = format!("{}{}", base_url.trim_end_matches('/'), path);
70 if let Ok(resp) = client.get(&url).send().await {
71 if resp.status().is_success() {
72 let ct = resp
73 .headers()
74 .get("content-type")
75 .and_then(|v| v.to_str().ok())
76 .unwrap_or("")
77 .to_lowercase();
78 if ct.contains("text/plain") || ct.contains("text/html") {
79 llms_found.push(path.to_string());
80 }
81 }
82 }
83 }
84
85 let mcp_paths = ["/.well-known/mcp", "/mcp.json"];
87 let mut mcp_found = Vec::new();
88 for path in &mcp_paths {
89 let url = format!("{}{}", base_url.trim_end_matches('/'), path);
90 if let Ok(resp) = client.get(&url).send().await {
91 if resp.status().is_success() {
92 mcp_found.push(path.to_string());
93 }
94 }
95 }
96
97 let mut html_features = Vec::new();
99 if let Ok(resp) = client.get(&base_url).send().await {
100 if resp.status().is_success() {
101 if let Ok(html) = resp.text().await {
102 if html.contains("navigator.modelContext") {
103 html_features.push("navigator.modelContext API".to_string());
104 }
105 let lower = html.to_lowercase();
106 if lower.contains("webmcp") || lower.contains("model context protocol") {
107 html_features
108 .push("WebMCP/Model Context Protocol references in HTML".to_string());
109 }
110 }
111 }
112 }
113
114 let mcp_has_anything = !mcp_found.is_empty() || !html_features.is_empty();
115
116 let mut directives: HashMap<String, String> = AI_BOTS
118 .iter()
119 .map(|b| (b.to_string(), "Unknown".into()))
120 .collect();
121
122 let robots_url = format!("{}/robots.txt", base_url.trim_end_matches('/'));
123 if let Ok(resp) = client.get(&robots_url).send().await {
124 if resp.status().is_success() {
125 if let Ok(body) = resp.text().await {
126 let mut current_agent: Option<String> = None;
127 for line in body.lines() {
128 let line = line.trim();
129 if line.is_empty() || line.starts_with('#') {
130 continue;
131 }
132 let lower = line.to_lowercase();
133
134 if lower.starts_with("user-agent:") {
135 let agent = line.split(':').nth(1).unwrap_or("").trim().to_string();
136 if AI_BOTS.iter().any(|b| *b == agent) {
137 current_agent = Some(agent);
138 } else {
139 current_agent = None;
140 }
141 } else if let Some(ref agent) = current_agent {
142 if lower.starts_with("disallow:") {
143 let path = line.split(':').nth(1).unwrap_or("").trim();
144 if path == "/" {
145 directives.insert(agent.clone(), "Blocked".into());
146 } else if directives.get(agent).map(|s| s.as_str()) == Some("Unknown") {
147 directives.insert(agent.clone(), "Partially Blocked".into());
148 }
149 } else if lower.starts_with("allow:")
150 && directives.get(agent).map(|s| s.as_str()) == Some("Unknown") {
151 directives.insert(agent.clone(), "Allowed".into());
152 }
153 }
154 }
155 for (_, v) in directives.iter_mut() {
157 if *v == "Unknown" {
158 *v = "Allowed (Implicit)".into();
159 }
160 }
161 }
162 }
163 }
164
165 let blocked_count = directives
166 .values()
167 .filter(|v| v.contains("Blocked"))
168 .count();
169 let crawler_status = if blocked_count > AI_BOTS.len() / 2 {
170 "Restrictive"
171 } else {
172 "Permissive"
173 };
174
175 let mut score: u32 = 0;
177
178 if !llms_found.is_empty() {
180 score += 20 + (llms_found.len() as u32 * 10).min(20);
181 }
182
183 if mcp_has_anything {
185 score += 20;
186 if !mcp_found.is_empty() {
187 score += 10;
188 }
189 if !html_features.is_empty() {
190 score += 10;
191 }
192 }
193
194 if crawler_status == "Permissive" {
196 score += 20;
197 }
198
199 let grade = match score {
200 80..=100 => "A (Excellent)".into(),
201 60..=79 => "B (Good)".into(),
202 40..=59 => "C (Fair)".into(),
203 20..=39 => "D (Poor)".into(),
204 _ => "F (None)".into(),
205 };
206
207 Ok(GeoAnalysisResult {
208 domain: domain.to_string(),
209 llms_txt: LlmsTxtResult {
210 found: !llms_found.is_empty(),
211 files: llms_found,
212 },
213 webmcp: WebMcpResult {
214 found: mcp_has_anything,
215 endpoints: mcp_found,
216 html_features,
217 },
218 ai_crawler_directives: AiCrawlerResult {
219 status: crawler_status.to_string(),
220 bots: directives,
221 },
222 geo_score: score,
223 geo_grade: grade,
224 })
225}