//! web_analyzer/geo_analysis.rs — GEO (Generative Engine Optimization) checks:
//! llms.txt discovery, WebMCP detection, and AI-crawler robots.txt directives.

use reqwest::Client;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::Duration;

6// ── Data Structures ─────────────────────────────────────────────────────────
7
8#[derive(Debug, Clone, Serialize, Deserialize)]
9pub struct GeoAnalysisResult {
10    pub domain: String,
11    pub llms_txt: LlmsTxtResult,
12    pub webmcp: WebMcpResult,
13    pub ai_crawler_directives: AiCrawlerResult,
14    pub geo_score: u32,
15    pub geo_grade: String,
16}
17
18#[derive(Debug, Clone, Serialize, Deserialize)]
19pub struct LlmsTxtResult {
20    pub found: bool,
21    pub files: Vec<String>,
22}
23
24#[derive(Debug, Clone, Serialize, Deserialize)]
25pub struct WebMcpResult {
26    pub found: bool,
27    pub endpoints: Vec<String>,
28    pub html_features: Vec<String>,
29}
30
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct AiCrawlerResult {
33    pub status: String,
34    pub bots: HashMap<String, String>,
35}
36
// ── AI bot list ─────────────────────────────────────────────────────────────

/// Canonical `User-agent` tokens of known AI crawlers whose robots.txt
/// directives are tracked. Matching against robots.txt is case-insensitive.
const AI_BOTS: &[&str] = &[
    "GPTBot",
    "ChatGPT-User",
    "ClaudeBot",
    "Claude-Web",
    "Applebot-Extended",
    "OAI-SearchBot",
    "PerplexityBot",
];

49// ── Main function ───────────────────────────────────────────────────────────
50
51pub async fn analyze_geo(
52    domain: &str,
53) -> Result<GeoAnalysisResult, Box<dyn std::error::Error + Send + Sync>> {
54    let base_url = if domain.starts_with("http") {
55        domain.to_string()
56    } else {
57        format!("https://{}", domain)
58    };
59
60    let client = Client::builder()
61        .timeout(Duration::from_secs(10))
62        .danger_accept_invalid_certs(true)
63        .build()?;
64
65    // ── 1. Check llms.txt ───────────────────────────────────────────────
66    let llms_paths = ["/llms.txt", "/llms-full.txt", "/.well-known/llms.txt"];
67    let mut llms_found = Vec::new();
68    for path in &llms_paths {
69        let url = format!("{}{}", base_url.trim_end_matches('/'), path);
70        if let Ok(resp) = client.get(&url).send().await {
71            if resp.status().is_success() {
72                let ct = resp
73                    .headers()
74                    .get("content-type")
75                    .and_then(|v| v.to_str().ok())
76                    .unwrap_or("")
77                    .to_lowercase();
78                if ct.contains("text/plain") || ct.contains("text/html") {
79                    llms_found.push(path.to_string());
80                }
81            }
82        }
83    }
84
85    // ── 2. Check WebMCP endpoints + HTML features ───────────────────────
86    let mcp_paths = ["/.well-known/mcp", "/mcp.json"];
87    let mut mcp_found = Vec::new();
88    for path in &mcp_paths {
89        let url = format!("{}{}", base_url.trim_end_matches('/'), path);
90        if let Ok(resp) = client.get(&url).send().await {
91            if resp.status().is_success() {
92                mcp_found.push(path.to_string());
93            }
94        }
95    }
96
97    // Check HTML for navigator.modelContext or WebMCP references
98    let mut html_features = Vec::new();
99    if let Ok(resp) = client.get(&base_url).send().await {
100        if resp.status().is_success() {
101            if let Ok(html) = resp.text().await {
102                if html.contains("navigator.modelContext") {
103                    html_features.push("navigator.modelContext API".to_string());
104                }
105                let lower = html.to_lowercase();
106                if lower.contains("webmcp") || lower.contains("model context protocol") {
107                    html_features
108                        .push("WebMCP/Model Context Protocol references in HTML".to_string());
109                }
110            }
111        }
112    }
113
114    let mcp_has_anything = !mcp_found.is_empty() || !html_features.is_empty();
115
116    // ── 3. Check AI crawler directives in robots.txt ────────────────────
117    let mut directives: HashMap<String, String> = AI_BOTS
118        .iter()
119        .map(|b| (b.to_string(), "Unknown".into()))
120        .collect();
121
122    let robots_url = format!("{}/robots.txt", base_url.trim_end_matches('/'));
123    if let Ok(resp) = client.get(&robots_url).send().await {
124        if resp.status().is_success() {
125            if let Ok(body) = resp.text().await {
126                let mut current_agent: Option<String> = None;
127                for line in body.lines() {
128                    let line = line.trim();
129                    if line.is_empty() || line.starts_with('#') {
130                        continue;
131                    }
132                    let lower = line.to_lowercase();
133
134                    if lower.starts_with("user-agent:") {
135                        let agent = line.split(':').nth(1).unwrap_or("").trim().to_string();
136                        if AI_BOTS.iter().any(|b| *b == agent) {
137                            current_agent = Some(agent);
138                        } else {
139                            current_agent = None;
140                        }
141                    } else if let Some(ref agent) = current_agent {
142                        if lower.starts_with("disallow:") {
143                            let path = line.split(':').nth(1).unwrap_or("").trim();
144                            if path == "/" {
145                                directives.insert(agent.clone(), "Blocked".into());
146                            } else if directives.get(agent).map(|s| s.as_str()) == Some("Unknown") {
147                                directives.insert(agent.clone(), "Partially Blocked".into());
148                            }
149                        } else if lower.starts_with("allow:")
150                            && directives.get(agent).map(|s| s.as_str()) == Some("Unknown") {
151                                directives.insert(agent.clone(), "Allowed".into());
152                            }
153                    }
154                }
155                // Mark remaining unknowns as implicit allow
156                for (_, v) in directives.iter_mut() {
157                    if *v == "Unknown" {
158                        *v = "Allowed (Implicit)".into();
159                    }
160                }
161            }
162        }
163    }
164
165    let blocked_count = directives
166        .values()
167        .filter(|v| v.contains("Blocked"))
168        .count();
169    let crawler_status = if blocked_count > AI_BOTS.len() / 2 {
170        "Restrictive"
171    } else {
172        "Permissive"
173    };
174
175    // ── Score calculation ────────────────────────────────────────────────
176    let mut score: u32 = 0;
177
178    // llms.txt (up to 40 pts)
179    if !llms_found.is_empty() {
180        score += 20 + (llms_found.len() as u32 * 10).min(20);
181    }
182
183    // WebMCP (up to 40 pts)
184    if mcp_has_anything {
185        score += 20;
186        if !mcp_found.is_empty() {
187            score += 10;
188        }
189        if !html_features.is_empty() {
190            score += 10;
191        }
192    }
193
194    // AI crawlers (20 pts)
195    if crawler_status == "Permissive" {
196        score += 20;
197    }
198
199    let grade = match score {
200        80..=100 => "A (Excellent)".into(),
201        60..=79 => "B (Good)".into(),
202        40..=59 => "C (Fair)".into(),
203        20..=39 => "D (Poor)".into(),
204        _ => "F (None)".into(),
205    };
206
207    Ok(GeoAnalysisResult {
208        domain: domain.to_string(),
209        llms_txt: LlmsTxtResult {
210            found: !llms_found.is_empty(),
211            files: llms_found,
212        },
213        webmcp: WebMcpResult {
214            found: mcp_has_anything,
215            endpoints: mcp_found,
216            html_features,
217        },
218        ai_crawler_directives: AiCrawlerResult {
219            status: crawler_status.to_string(),
220            bots: directives,
221        },
222        geo_score: score,
223        geo_grade: grade,
224    })
225}