// web_analyzer/geo_analysis.rs
1use reqwest::Client;
2use serde::{Deserialize, Serialize};
3use std::collections::HashMap;
4use std::time::Duration;
5
6// ── Data Structures ─────────────────────────────────────────────────────────
7
/// Aggregated outcome of a GEO (Generative Engine Optimization) scan for a
/// single domain, combining all three probe results with the composite score.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeoAnalysisResult {
    /// Domain (or full URL) exactly as supplied by the caller.
    pub domain: String,
    /// Outcome of probing the llms.txt discovery files.
    pub llms_txt: LlmsTxtResult,
    /// Outcome of probing WebMCP / Model Context Protocol support.
    pub webmcp: WebMcpResult,
    /// How the site's robots.txt treats known AI crawlers.
    pub ai_crawler_directives: AiCrawlerResult,
    /// Composite readiness score in 0–100.
    pub geo_score: u32,
    /// Letter grade derived from `geo_score`, e.g. "A (Excellent)".
    pub geo_grade: String,
}
17
/// Result of checking the well-known llms.txt locations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmsTxtResult {
    /// True when at least one llms.txt variant responded successfully.
    pub found: bool,
    /// The paths that responded (e.g. "/llms.txt", "/.well-known/llms.txt").
    pub files: Vec<String>,
}
23
/// Result of probing for WebMCP / Model Context Protocol support.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebMcpResult {
    /// True when any endpoint or HTML feature was detected.
    pub found: bool,
    /// MCP endpoint paths that responded successfully.
    pub endpoints: Vec<String>,
    /// Markers found in the landing-page HTML (e.g. navigator.modelContext).
    pub html_features: Vec<String>,
}
30
/// Summary of robots.txt directives for the AI bots in `AI_BOTS`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AiCrawlerResult {
    /// Overall posture: "Restrictive" when a majority of bots are blocked,
    /// otherwise "Permissive".
    pub status: String,
    /// Per-bot verdict keyed by user-agent name: "Blocked",
    /// "Partially Blocked", "Allowed", "Allowed (Implicit)", or "Unknown"
    /// (robots.txt could not be fetched).
    pub bots: HashMap<String, String>,
}
36
37// ── AI bot list ─────────────────────────────────────────────────────────────
38
/// User-agent product tokens of the AI crawlers whose robots.txt treatment
/// is analyzed and reported in `AiCrawlerResult::bots`.
const AI_BOTS: &[&str] = &[
    "GPTBot",
    "ChatGPT-User",
    "ClaudeBot",
    "Claude-Web",
    "Applebot-Extended",
    "OAI-SearchBot",
    "PerplexityBot",
];
48
49// ── Main function ───────────────────────────────────────────────────────────
50
51pub async fn analyze_geo(
52    domain: &str,
53    progress_tx: Option<tokio::sync::mpsc::Sender<crate::ScanProgress>>,
54) -> Result<GeoAnalysisResult, Box<dyn std::error::Error + Send + Sync>> {
55    let base_url = if domain.starts_with("http") {
56        domain.to_string()
57    } else {
58        format!("https://{}", domain)
59    };
60
61    let client = Client::builder()
62        .timeout(Duration::from_secs(10))
63        .danger_accept_invalid_certs(true)
64        .build()?;
65
66    // ── 1. Check llms.txt ───────────────────────────────────────────────
67    if let Some(t) = &progress_tx { let _ = t.send(crate::ScanProgress { module: "Geo Analysis".into(), percentage: 10.0, message: "Checking for llms.txt presence...".into(), status: "Info".into() }).await; }
68    let llms_paths = ["/llms.txt", "/llms-full.txt", "/.well-known/llms.txt"];
69    let mut llms_found = Vec::new();
70    for path in &llms_paths {
71        let url = format!("{}{}", base_url.trim_end_matches('/'), path);
72        if let Ok(resp) = client.get(&url).send().await {
73            if resp.status().is_success() {
74                let ct = resp
75                    .headers()
76                    .get("content-type")
77                    .and_then(|v| v.to_str().ok())
78                    .unwrap_or("")
79                    .to_lowercase();
80                if ct.contains("text/plain") || ct.contains("text/html") {
81                    llms_found.push(path.to_string());
82                }
83            }
84        }
85    }
86
87    // ── 2. Check WebMCP endpoints + HTML features ───────────────────────
88    if let Some(t) = &progress_tx { let _ = t.send(crate::ScanProgress { module: "Geo Analysis".into(), percentage: 40.0, message: "Scanning for Model Context Protocol (MCP) endpoints...".into(), status: "Info".into() }).await; }
89    let mcp_paths = ["/.well-known/mcp", "/mcp.json"];
90    let mut mcp_found = Vec::new();
91    for path in &mcp_paths {
92        let url = format!("{}{}", base_url.trim_end_matches('/'), path);
93        if let Ok(resp) = client.get(&url).send().await {
94            if resp.status().is_success() {
95                mcp_found.push(path.to_string());
96            }
97        }
98    }
99
100    // Check HTML for navigator.modelContext or WebMCP references
101    let mut html_features = Vec::new();
102    if let Ok(resp) = client.get(&base_url).send().await {
103        if resp.status().is_success() {
104            if let Ok(html) = resp.text().await {
105                if html.contains("navigator.modelContext") {
106                    html_features.push("navigator.modelContext API".to_string());
107                }
108                let lower = html.to_lowercase();
109                if lower.contains("webmcp") || lower.contains("model context protocol") {
110                    html_features
111                        .push("WebMCP/Model Context Protocol references in HTML".to_string());
112                }
113            }
114        }
115    }
116
117    let mcp_has_anything = !mcp_found.is_empty() || !html_features.is_empty();
118
119    // ── 3. Check AI crawler directives in robots.txt ────────────────────
120    if let Some(t) = &progress_tx { let _ = t.send(crate::ScanProgress { module: "Geo Analysis".into(), percentage: 70.0, message: "Analyzing AI crawler directives in robots.txt...".into(), status: "Info".into() }).await; }
121    let mut directives: HashMap<String, String> = AI_BOTS
122        .iter()
123        .map(|b| (b.to_string(), "Unknown".into()))
124        .collect();
125
126    let robots_url = format!("{}/robots.txt", base_url.trim_end_matches('/'));
127    if let Ok(resp) = client.get(&robots_url).send().await {
128        if resp.status().is_success() {
129            if let Ok(body) = resp.text().await {
130                let mut current_agent: Option<String> = None;
131                for line in body.lines() {
132                    let line = line.trim();
133                    if line.is_empty() || line.starts_with('#') {
134                        continue;
135                    }
136                    let lower = line.to_lowercase();
137
138                    if lower.starts_with("user-agent:") {
139                        let agent = line.split(':').nth(1).unwrap_or("").trim().to_string();
140                        if AI_BOTS.iter().any(|b| *b == agent) {
141                            current_agent = Some(agent);
142                        } else {
143                            current_agent = None;
144                        }
145                    } else if let Some(ref agent) = current_agent {
146                        if lower.starts_with("disallow:") {
147                            let path = line.split(':').nth(1).unwrap_or("").trim();
148                            if path == "/" {
149                                directives.insert(agent.clone(), "Blocked".into());
150                            } else if directives.get(agent).map(|s| s.as_str()) == Some("Unknown") {
151                                directives.insert(agent.clone(), "Partially Blocked".into());
152                            }
153                        } else if lower.starts_with("allow:")
154                            && directives.get(agent).map(|s| s.as_str()) == Some("Unknown") {
155                                directives.insert(agent.clone(), "Allowed".into());
156                            }
157                    }
158                }
159                // Mark remaining unknowns as implicit allow
160                for (_, v) in directives.iter_mut() {
161                    if *v == "Unknown" {
162                        *v = "Allowed (Implicit)".into();
163                    }
164                }
165            }
166        }
167    }
168
169    let blocked_count = directives
170        .values()
171        .filter(|v| v.contains("Blocked"))
172        .count();
173    let crawler_status = if blocked_count > AI_BOTS.len() / 2 {
174        "Restrictive"
175    } else {
176        "Permissive"
177    };
178
179    // ── Score calculation ────────────────────────────────────────────────
180    if let Some(t) = &progress_tx { let _ = t.send(crate::ScanProgress { module: "Geo Analysis".into(), percentage: 90.0, message: "Calculating Geofencing AI readiness score...".into(), status: "Info".into() }).await; }
181    let mut score: u32 = 0;
182
183    // llms.txt (up to 40 pts)
184    if !llms_found.is_empty() {
185        score += 20 + (llms_found.len() as u32 * 10).min(20);
186    }
187
188    // WebMCP (up to 40 pts)
189    if mcp_has_anything {
190        score += 20;
191        if !mcp_found.is_empty() {
192            score += 10;
193        }
194        if !html_features.is_empty() {
195            score += 10;
196        }
197    }
198
199    // AI crawlers (20 pts)
200    if crawler_status == "Permissive" {
201        score += 20;
202    }
203
204    let grade = match score {
205        80..=100 => "A (Excellent)".into(),
206        60..=79 => "B (Good)".into(),
207        40..=59 => "C (Fair)".into(),
208        20..=39 => "D (Poor)".into(),
209        _ => "F (None)".into(),
210    };
211
212    Ok(GeoAnalysisResult {
213        domain: domain.to_string(),
214        llms_txt: LlmsTxtResult {
215            found: !llms_found.is_empty(),
216            files: llms_found,
217        },
218        webmcp: WebMcpResult {
219            found: mcp_has_anything,
220            endpoints: mcp_found,
221            html_features,
222        },
223        ai_crawler_directives: AiCrawlerResult {
224            status: crawler_status.to_string(),
225            bots: directives,
226        },
227        geo_score: score,
228        geo_grade: grade,
229    })
230}