//! seo_analysis.rs — advanced SEO analysis module for the web_analyzer crate.
1use regex::Regex;
2use reqwest::Client;
3use scraper::{Html, Selector};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::time::{Duration, Instant};
7
// ── Tracking tool detection patterns ────────────────────────────────────────

/// Third-party tracking/analytics tools paired with the substrings that
/// identify them in raw page HTML (script hosts or characteristic JS globals).
/// A tool is reported as present if ANY of its patterns matches,
/// case-insensitively (see `analyze_analytics`).
const TRACKING_TOOLS: &[(&str, &[&str])] = &[
    (
        "Google Tag Manager",
        &["googletagmanager.com/gtm.js", "dataLayer"],
    ),
    (
        "Google Ads",
        &["googleads.g.doubleclick.net", "googlesyndication.com"],
    ),
    ("Facebook Pixel", &["connect.facebook.net", "fbq("]),
    (
        "LinkedIn Insight",
        &["snap.licdn.com", "_linkedin_partner_id"],
    ),
    ("TikTok Pixel", &["analytics.tiktok.com", "ttq."]),
    ("Hotjar", &["static.hotjar.com", "hjid"]),
    ("Mixpanel", &["cdn.mxpnl.com", "mixpanel.init"]),
    ("Segment", &["cdn.segment.com", "analytics.load"]),
    ("Intercom", &["widget.intercom.io"]),
    ("Zendesk", &["static.zdassets.com"]),
    ("Crisp", &["client.crisp.chat"]),
];

/// Well-known SEO-related files probed at the site root
/// (see `check_seo_resources`).
const SEO_RESOURCES: &[&str] = &["robots.txt", "sitemap.xml", "humans.txt", "ads.txt"];
35
36// ── Data Structures ─────────────────────────────────────────────────────────
37
/// Aggregated output of a full SEO scan of one domain; one field per analysis pass.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SeoAnalysisResult {
    pub domain: String,
    pub basic_seo: BasicSeoResult,
    pub content_analysis: ContentAnalysisResult,
    pub technical_seo: TechnicalSeoResult,
    pub social_media: SocialMediaResult,
    /// Tracking/analytics tool name -> "Found" / "Not Found".
    pub analytics: HashMap<String, String>,
    pub performance: PerformanceResult,
    pub mobile_accessibility: MobileAccessibilityResult,
    /// Well-known resource file (robots.txt, sitemap.xml, …) -> "Found" / "Not Found".
    pub seo_resources: HashMap<String, String>,
    pub schema_markup: SchemaMarkupResult,
    pub link_analysis: LinkAnalysisResult,
    pub image_seo: ImageSeoResult,
    pub page_speed_factors: PageSpeedResult,
    pub seo_score: SeoScoreResult,
}

/// `<title>` tag analysis: text, length and a verdict
/// ("Good" / "Too short" / "Too long" / "Missing").
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TitleAnalysis {
    pub text: String,
    pub length: usize,
    pub status: String,
}

/// Meta-description analysis: text, length and a verdict
/// ("Good" / "Too short" / "Too long" / "Missing").
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetaDescAnalysis {
    pub text: String,
    pub length: usize,
    pub status: String,
}

/// Core on-page SEO tags; absent values are reported as "Not Found"
/// (or "Not specified"/"Unknown" for language/charset).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BasicSeoResult {
    pub title: TitleAnalysis,
    pub meta_description: MetaDescAnalysis,
    pub meta_keywords: String,
    pub canonical_url: String,
    pub meta_robots: String,
    pub viewport: String,
    pub language: String,
    pub charset: String,
}

/// Per-level heading summary: total count plus up to 3 sample texts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HeadingInfo {
    pub count: usize,
    pub texts: Vec<String>,
}

/// A single keyword with its occurrence count and density ("x.xx%").
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KeywordInfo {
    pub word: String,
    pub count: usize,
    pub density: String,
}

/// Page-content metrics: headings, word count, paragraphs, text/HTML ratio
/// and the top keywords by density.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentAnalysisResult {
    /// "H1".."H6" -> heading summary; levels with no headings are absent.
    pub headings: HashMap<String, HeadingInfo>,
    pub heading_issues: Vec<String>,
    pub word_count: usize,
    pub word_count_status: String,
    pub paragraphs: usize,
    pub text_to_html_ratio: String,
    pub top_keywords: Vec<KeywordInfo>,
}

/// Technical metrics: page size, HTTP status, link split, structured data
/// and breadcrumb detection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TechnicalSeoResult {
    pub page_size_bytes: usize,
    pub http_status: u16,
    /// Simplified: 1 if the final URL differed from the requested one, else 0.
    pub redirects: usize,
    pub internal_links: usize,
    pub external_links: usize,
    pub structured_data_count: usize,
    pub has_breadcrumbs: bool,
}

/// Open Graph (`og:*`) and Twitter Card (`twitter:*`) tag values;
/// missing tags map to "Not Found".
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SocialMediaResult {
    pub open_graph: HashMap<String, String>,
    pub twitter_cards: HashMap<String, String>,
}

/// Homepage fetch performance and caching-related response headers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceResult {
    pub load_time_secs: f64,
    /// "Excellent" (<1s) / "Good" (<3s) / "Poor".
    pub load_time_status: String,
    pub content_size_kb: f64,
    pub compression: String,
    pub server: String,
    pub cache_control: String,
    pub etag: bool,
}

/// Image alt-attribute coverage statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AltAttributeResult {
    pub total_images: usize,
    pub images_with_alt: usize,
    pub missing_alt: usize,
    /// Percentage string like "87.5%"; "0%" when the page has no images.
    pub alt_coverage: String,
}

/// Mobile-readiness and accessibility signals.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MobileAccessibilityResult {
    pub viewport_present: bool,
    /// True when the viewport meta contains "width=device-width".
    pub mobile_friendly: bool,
    pub alt_attributes: AltAttributeResult,
    pub aria_labels: usize,
}

/// Structured-data summary: JSON-LD blocks/types and microdata items.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaMarkupResult {
    pub json_ld_count: usize,
    pub json_ld_types: Vec<String>,
    pub microdata_items: usize,
    pub total_structured_data: usize,
}

/// Anchor-tag statistics for the homepage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LinkAnalysisResult {
    pub total_links: usize,
    pub internal_links: usize,
    pub external_links: usize,
    pub nofollow_links: usize,
}

/// Image SEO statistics (lazy loading, alt text, title attributes).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageSeoResult {
    pub total_images: usize,
    pub lazy_loaded: usize,
    pub with_alt_text: usize,
    pub with_title: usize,
    /// Combined lazy+alt coverage as a percentage string.
    pub optimization_score: String,
}

/// Render-blocking resource counts plus the response compression scheme.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageSpeedResult {
    pub css_files: usize,
    pub js_files: usize,
    pub inline_styles: usize,
    pub inline_scripts: usize,
    pub compression: String,
}

/// Final score out of `max_score` with percentage and letter grade (A+–F).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SeoScoreResult {
    pub score: u32,
    pub max_score: u32,
    pub percentage: String,
    pub grade: String,
}
191
192// ── Main function ───────────────────────────────────────────────────────────
193
/// Runs the full SEO audit for `domain`: fetches the homepage, probes
/// well-known SEO resource files, then runs every analysis pass over the
/// parsed HTML. Progress updates are pushed on `progress_tx` when provided
/// (send errors are deliberately ignored — the scan continues regardless).
///
/// `domain` may be a bare host ("example.com") or a full URL; bare hosts are
/// fetched over HTTPS.
pub async fn analyze_advanced_seo(
    domain: &str,
    progress_tx: Option<tokio::sync::mpsc::Sender<crate::ScanProgress>>,
) -> Result<SeoAnalysisResult, Box<dyn std::error::Error + Send + Sync>> {
    // Accept either a bare domain or a full URL; default to HTTPS.
    let url = if domain.starts_with("http") {
        domain.to_string()
    } else {
        format!("https://{}", domain)
    };

    if let Some(t) = &progress_tx { let _ = t.send(crate::ScanProgress { module: "SEO Analysis".into(), percentage: 5.0, message: "Fetching homepage HTML...".into(), status: "Info".into() }).await; }

    // Invalid TLS certs are tolerated on purpose: the audit should still run
    // against misconfigured sites.
    let client = Client::builder()
        .timeout(Duration::from_secs(20))
        .danger_accept_invalid_certs(true)
        .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
        .build()?;

    // Time only the initial request/response; feeds the load-time rating.
    let start = Instant::now();
    let resp = client.get(&url).send().await?;
    let load_time = start.elapsed().as_secs_f64();

    let status_code = resp.status().as_u16();
    let redirects = resp.url().to_string() != url; // simplified
    let headers = resp.headers().clone();
    let content_bytes = resp.bytes().await?;
    let content_size = content_bytes.len();
    // Lossy decode: non-UTF-8 bytes become U+FFFD rather than failing the scan.
    let html_text = String::from_utf8_lossy(&content_bytes).to_string();
    // Bare host used to classify links as internal vs external.
    let base_domain = domain
        .replace("https://", "")
        .replace("http://", "")
        .replace("www.", "");

    if let Some(t) = &progress_tx { let _ = t.send(crate::ScanProgress { module: "SEO Analysis".into(), percentage: 20.0, message: "HTML fetched. Searching for SEO resources (sitemap, robots)...".into(), status: "Success".into() }).await; }

    // ── 8. SEO Resources (await before parsing HTML to avoid Send bounds) ──
    // `Html` is not Send, so all awaits must complete before it is created.
    let seo_resources = check_seo_resources(&client, domain).await;

    if let Some(t) = &progress_tx { let _ = t.send(crate::ScanProgress { module: "SEO Analysis".into(), percentage: 40.0, message: "Parsing HTML document...".into(), status: "Info".into() }).await; }

    let document = Html::parse_document(&html_text);

    // ── 1. Basic SEO ────────────────────────────────────────────────────
    let basic_seo = analyze_basic_seo(&document);

    // ── 2. Content Analysis ─────────────────────────────────────────────
    let content_analysis = analyze_content(&document);

    // ── 3. Technical SEO ────────────────────────────────────────────────
    let technical_seo = analyze_technical(
        &document,
        status_code,
        content_size,
        redirects as usize,
        &base_domain,
    );

    if let Some(t) = &progress_tx { let _ = t.send(crate::ScanProgress { module: "SEO Analysis".into(), percentage: 60.0, message: "Analyzing Social Media & Analytics...".into(), status: "Info".into() }).await; }

    // ── 4. Social Media Tags ────────────────────────────────────────────
    let social_media = analyze_social_tags(&document);

    // ── 5. Analytics & Tracking ─────────────────────────────────────────
    let analytics = analyze_analytics(&html_text);

    if let Some(t) = &progress_tx { let _ = t.send(crate::ScanProgress { module: "SEO Analysis".into(), percentage: 80.0, message: "Calculating SEO Core Web Factors...".into(), status: "Info".into() }).await; }

    // ── 6. Performance ──────────────────────────────────────────────────
    let performance = analyze_performance(&headers, load_time, content_size);

    // ── 7. Mobile & Accessibility ───────────────────────────────────────
    let mobile_accessibility = analyze_mobile(&document);

    // ── 9. Schema Markup ────────────────────────────────────────────────
    let schema_markup = analyze_schema(&document, &html_text);

    // ── 10. Link Analysis ───────────────────────────────────────────────
    let link_analysis = analyze_links(&document, &base_domain);

    // ── 11. Image SEO ───────────────────────────────────────────────────
    let image_seo = analyze_images(&document);

    // ── 12. Page Speed Factors ──────────────────────────────────────────
    let page_speed_factors = analyze_speed_factors(&document, &headers);

    // ── 13. SEO Score ───────────────────────────────────────────────────
    let seo_score = calculate_seo_score(
        &basic_seo,
        &content_analysis,
        &seo_resources,
        &schema_markup,
        &performance,
        &mobile_accessibility,
    );

    Ok(SeoAnalysisResult {
        domain: domain.to_string(),
        basic_seo,
        content_analysis,
        technical_seo,
        social_media,
        analytics,
        performance,
        mobile_accessibility,
        seo_resources,
        schema_markup,
        link_analysis,
        image_seo,
        page_speed_factors,
        seo_score,
    })
}
306
307// ── 1. Basic SEO ────────────────────────────────────────────────────────────
308
309fn analyze_basic_seo(doc: &Html) -> BasicSeoResult {
310    let title_sel = Selector::parse("title").unwrap();
311    let title_text = doc
312        .select(&title_sel)
313        .next()
314        .map(|el| el.text().collect::<String>().trim().to_string())
315        .unwrap_or_default();
316
317    let title_len = title_text.len();
318    let title_status = if title_text.is_empty() {
319        "Missing"
320    } else if title_len < 30 {
321        "Too short"
322    } else if title_len > 60 {
323        "Too long"
324    } else {
325        "Good"
326    };
327
328    let desc = get_meta_content(doc, "name", "description");
329    let desc_len = if desc == "Not Found" { 0 } else { desc.len() };
330    let desc_status = if desc == "Not Found" {
331        "Missing"
332    } else if desc_len < 120 {
333        "Too short"
334    } else if desc_len > 160 {
335        "Too long"
336    } else {
337        "Good"
338    };
339
340    BasicSeoResult {
341        title: TitleAnalysis {
342            text: if title_text.is_empty() {
343                "Missing".into()
344            } else {
345                title_text
346            },
347            length: title_len,
348            status: title_status.into(),
349        },
350        meta_description: MetaDescAnalysis {
351            text: desc.clone(),
352            length: desc_len,
353            status: desc_status.into(),
354        },
355        meta_keywords: get_meta_content(doc, "name", "keywords"),
356        canonical_url: get_link_href(doc, "canonical"),
357        meta_robots: get_meta_content(doc, "name", "robots"),
358        viewport: get_meta_content(doc, "name", "viewport"),
359        language: doc
360            .root_element()
361            .value()
362            .attr("lang")
363            .unwrap_or("Not specified")
364            .to_string(),
365        charset: get_charset(doc),
366    }
367}
368
369fn get_meta_content(doc: &Html, attr: &str, value: &str) -> String {
370    let selector_str = format!("meta[{}=\"{}\"]", attr, value);
371    if let Ok(sel) = Selector::parse(&selector_str) {
372        if let Some(el) = doc.select(&sel).next() {
373            if let Some(content) = el.value().attr("content") {
374                return content.trim().to_string();
375            }
376        }
377    }
378    "Not Found".into()
379}
380
381fn get_link_href(doc: &Html, rel: &str) -> String {
382    let selector_str = format!("link[rel=\"{}\"]", rel);
383    if let Ok(sel) = Selector::parse(&selector_str) {
384        if let Some(el) = doc.select(&sel).next() {
385            if let Some(href) = el.value().attr("href") {
386                return href.trim().to_string();
387            }
388        }
389    }
390    "Not Found".into()
391}
392
393fn get_charset(doc: &Html) -> String {
394    if let Ok(sel) = Selector::parse("meta[charset]") {
395        if let Some(el) = doc.select(&sel).next() {
396            if let Some(cs) = el.value().attr("charset") {
397                return cs.to_string();
398            }
399        }
400    }
401    if let Ok(sel) = Selector::parse("meta[http-equiv=\"Content-Type\"]") {
402        if let Some(el) = doc.select(&sel).next() {
403            if let Some(content) = el.value().attr("content") {
404                if let Some(cs) = Regex::new(r"charset=([^;]+)")
405                    .ok()
406                    .and_then(|r| r.captures(content))
407                {
408                    return cs.get(1).unwrap().as_str().to_string();
409                }
410            }
411        }
412    }
413    "Unknown".into()
414}
415
416// ── 2. Content Analysis ─────────────────────────────────────────────────────
417
/// Analyzes page content: heading structure (H1–H6), word count, paragraph
/// count, text-to-HTML ratio and top keyword densities.
fn analyze_content(doc: &Html) -> ContentAnalysisResult {
    let mut headings = HashMap::new();
    // Flat (level, text) list fed to the hierarchy checks below.
    let mut hierarchy: Vec<(u8, String)> = Vec::new();

    let h_selectors = [
        (1u8, Selector::parse("h1").unwrap()),
        (2, Selector::parse("h2").unwrap()),
        (3, Selector::parse("h3").unwrap()),
        (4, Selector::parse("h4").unwrap()),
        (5, Selector::parse("h5").unwrap()),
        (6, Selector::parse("h6").unwrap()),
    ];

    for (i, sel) in &h_selectors {
        let elements: Vec<_> = doc.select(sel).collect();
        if !elements.is_empty() {
            // Keep at most 3 sample texts per level, truncated to 100 chars.
            let texts: Vec<String> = elements
                .iter()
                .take(3)
                .map(|e| {
                    let t = e.text().collect::<String>();
                    t.trim().chars().take(100).collect()
                })
                .collect();
            headings.insert(
                format!("H{}", i),
                HeadingInfo {
                    count: elements.len(),
                    texts,
                },
            );
            for e in &elements {
                let t = e.text().collect::<String>().trim().to_string();
                hierarchy.push((*i, t));
            }
        }
    }

    // NOTE(review): `hierarchy` is built level-by-level (all H1s, then all
    // H2s, …), not in document order, so the skipped-level check operates on
    // grouped order — confirm whether document order was intended.
    let heading_issues = check_heading_issues(&hierarchy);

    let text = doc.root_element().text().collect::<String>();
    let words: Vec<&str> = text.split_whitespace().collect();
    let word_count = words.len();

    let p_sel = Selector::parse("p").unwrap();
    let paragraphs = doc.select(&p_sel).count();

    // Share of visible text within the full markup (byte lengths).
    let html_len = doc.html().len();
    let text_len = text.len();
    let ratio = if html_len > 0 {
        (text_len as f64 / html_len as f64) * 100.0
    } else {
        0.0
    };

    let top_keywords = analyze_keyword_density(&words);

    ContentAnalysisResult {
        headings,
        heading_issues,
        word_count,
        // 300 words is a common minimum-content heuristic.
        word_count_status: if word_count >= 300 {
            "Good"
        } else {
            "Too short"
        }
        .into(),
        paragraphs,
        text_to_html_ratio: format!("{:.1}%", ratio),
        top_keywords,
    }
}
490
/// Validates the heading list and reports structural problems: no headings at
/// all, missing or duplicate H1, and jumps that skip a level.
fn check_heading_issues(hierarchy: &[(u8, String)]) -> Vec<String> {
    if hierarchy.is_empty() {
        return vec!["No headings found".into()];
    }

    let mut problems = Vec::new();

    // Exactly one H1 is expected.
    match hierarchy.iter().filter(|(level, _)| *level == 1).count() {
        0 => problems.push("Missing H1 tag".into()),
        1 => {}
        n => problems.push(format!("Multiple H1 tags ({})", n)),
    }

    // Flag consecutive pairs that skip a level (e.g. H2 followed by H4).
    for pair in hierarchy.windows(2) {
        let (from, to) = (pair[0].0, pair[1].0);
        if to > from + 1 {
            problems.push(format!("Skipped heading level (from H{} to H{})", from, to));
        }
    }

    problems
}
517
518fn analyze_keyword_density(words: &[&str]) -> Vec<KeywordInfo> {
519    let total = words.len();
520    if total == 0 {
521        return vec![];
522    }
523
524    let mut freq: HashMap<String, usize> = HashMap::new();
525    for &w in words {
526        let lower = w.to_lowercase();
527        if lower.len() > 3 {
528            *freq.entry(lower).or_insert(0) += 1;
529        }
530    }
531
532    let mut sorted: Vec<_> = freq.into_iter().collect();
533    sorted.sort_by(|a, b| b.1.cmp(&a.1));
534
535    sorted
536        .into_iter()
537        .take(5)
538        .map(|(word, count)| KeywordInfo {
539            word,
540            count,
541            density: format!("{:.2}%", (count as f64 / total as f64) * 100.0),
542        })
543        .collect()
544}
545
546// ── 3. Technical SEO ────────────────────────────────────────────────────────
547
548fn analyze_technical(
549    doc: &Html,
550    status: u16,
551    size: usize,
552    redirects: usize,
553    base_domain: &str,
554) -> TechnicalSeoResult {
555    let link_sel = Selector::parse("a[href]").unwrap();
556    let mut internal = 0;
557    let mut external = 0;
558
559    for el in doc.select(&link_sel) {
560        if let Some(href) = el.value().attr("href") {
561            if href.starts_with("http") && !href.contains(base_domain) {
562                external += 1;
563            } else if !href.starts_with("mailto:")
564                && !href.starts_with("tel:")
565                && !href.starts_with('#')
566            {
567                internal += 1;
568            }
569        }
570    }
571
572    let json_ld = Selector::parse("script[type=\"application/ld+json\"]")
573        .ok()
574        .map(|s| doc.select(&s).count())
575        .unwrap_or(0);
576    let microdata = Selector::parse("[itemtype]")
577        .ok()
578        .map(|s| doc.select(&s).count())
579        .unwrap_or(0);
580
581    let breadcrumb = Selector::parse("[typeof=\"BreadcrumbList\"]")
582        .ok()
583        .map(|s| doc.select(&s).next().is_some())
584        .unwrap_or(false)
585        || doc.html().to_lowercase().contains("breadcrumb");
586
587    TechnicalSeoResult {
588        page_size_bytes: size,
589        http_status: status,
590        redirects,
591        internal_links: internal,
592        external_links: external,
593        structured_data_count: json_ld + microdata,
594        has_breadcrumbs: breadcrumb,
595    }
596}
597
598// ── 4. Social Media Tags ────────────────────────────────────────────────────
599
600fn analyze_social_tags(doc: &Html) -> SocialMediaResult {
601    let og_keys = [
602        "og:title",
603        "og:description",
604        "og:image",
605        "og:url",
606        "og:type",
607        "og:site_name",
608    ];
609    let tw_keys = [
610        "twitter:card",
611        "twitter:title",
612        "twitter:description",
613        "twitter:image",
614        "twitter:site",
615    ];
616
617    let mut og = HashMap::new();
618    for key in &og_keys {
619        og.insert(key.to_string(), get_meta_content(doc, "property", key));
620    }
621
622    let mut tw = HashMap::new();
623    for key in &tw_keys {
624        tw.insert(key.to_string(), get_meta_content(doc, "name", key));
625    }
626
627    SocialMediaResult {
628        open_graph: og,
629        twitter_cards: tw,
630    }
631}
632
633// ── 5. Analytics & Tracking ─────────────────────────────────────────────────
634
635fn analyze_analytics(html: &str) -> HashMap<String, String> {
636    let mut results = HashMap::new();
637
638    // Google Analytics
639    let has_ga4 = Regex::new(r#"gtag\(['"]config['"],\s*['"]G-[A-Z0-9]+['"]\)"#)
640        .ok()
641        .map(|r| r.is_match(html))
642        .unwrap_or(false);
643    let has_ua = Regex::new(r#"gtag\(['"]config['"],\s*['"]UA-[0-9-]+['"]\)"#)
644        .ok()
645        .map(|r| r.is_match(html))
646        .unwrap_or(false);
647    results.insert(
648        "Google Analytics GA4".into(),
649        if has_ga4 { "Found" } else { "Not Found" }.into(),
650    );
651    results.insert(
652        "Google Analytics UA".into(),
653        if has_ua { "Found" } else { "Not Found" }.into(),
654    );
655
656    // Other tracking tools
657    let lower = html.to_lowercase();
658    for &(name, patterns) in TRACKING_TOOLS {
659        let found = patterns.iter().any(|p| lower.contains(&p.to_lowercase()));
660        results.insert(
661            name.to_string(),
662            if found { "Found" } else { "Not Found" }.into(),
663        );
664    }
665
666    results
667}
668
669// ── 6. Performance ──────────────────────────────────────────────────────────
670
671fn analyze_performance(
672    headers: &reqwest::header::HeaderMap,
673    load_time: f64,
674    size: usize,
675) -> PerformanceResult {
676    let status = if load_time < 1.0 {
677        "Excellent"
678    } else if load_time < 3.0 {
679        "Good"
680    } else {
681        "Poor"
682    };
683
684    PerformanceResult {
685        load_time_secs: (load_time * 100.0).round() / 100.0,
686        load_time_status: status.into(),
687        content_size_kb: (size as f64 / 1024.0 * 100.0).round() / 100.0,
688        compression: headers
689            .get("content-encoding")
690            .and_then(|v| v.to_str().ok())
691            .unwrap_or("None")
692            .into(),
693        server: headers
694            .get("server")
695            .and_then(|v| v.to_str().ok())
696            .unwrap_or("Unknown")
697            .into(),
698        cache_control: headers
699            .get("cache-control")
700            .and_then(|v| v.to_str().ok())
701            .unwrap_or("Not Set")
702            .into(),
703        etag: headers.contains_key("etag"),
704    }
705}
706
707// ── 7. Mobile & Accessibility ───────────────────────────────────────────────
708
709fn analyze_mobile(doc: &Html) -> MobileAccessibilityResult {
710    let viewport_content = get_meta_content(doc, "name", "viewport");
711    let has_viewport = viewport_content != "Not Found";
712    let mobile_friendly = viewport_content.contains("width=device-width");
713
714    let img_sel = Selector::parse("img").unwrap();
715    let images: Vec<_> = doc.select(&img_sel).collect();
716    let total = images.len();
717    let with_alt = images
718        .iter()
719        .filter(|i| i.value().attr("alt").is_some())
720        .count();
721
722    let aria_sel = Selector::parse("[aria-label]").unwrap();
723    let aria_count = doc.select(&aria_sel).count();
724
725    MobileAccessibilityResult {
726        viewport_present: has_viewport,
727        mobile_friendly,
728        alt_attributes: AltAttributeResult {
729            total_images: total,
730            images_with_alt: with_alt,
731            missing_alt: total - with_alt,
732            alt_coverage: if total > 0 {
733                format!("{:.1}%", (with_alt as f64 / total as f64) * 100.0)
734            } else {
735                "0%".into()
736            },
737        },
738        aria_labels: aria_count,
739    }
740}
741
742// ── 8. SEO Resources ────────────────────────────────────────────────────────
743
744async fn check_seo_resources(client: &Client, domain: &str) -> HashMap<String, String> {
745    let mut results = HashMap::new();
746    for &file in SEO_RESOURCES {
747        let url = format!("https://{}/{}", domain, file);
748        let found = match client.get(&url).send().await {
749            Ok(r) if r.status().is_success() => "Found",
750            _ => "Not Found",
751        };
752        results.insert(file.to_string(), found.into());
753    }
754    results
755}
756
757// ── 9. Schema Markup ────────────────────────────────────────────────────────
758
/// Detects structured data: counts JSON-LD script blocks (extracting their
/// `@type` values recursively), counts microdata items, and falls back to a
/// raw-HTML regex for any `"@type"` values the DOM parser missed.
fn analyze_schema(doc: &Html, html: &str) -> SchemaMarkupResult {
    let json_ld_sel = Selector::parse("script[type=\"application/ld+json\"]").unwrap();
    let json_lds: Vec<_> = doc.select(&json_ld_sel).collect();
    let json_ld_count = json_lds.len();

    // Collect every @type found anywhere in each JSON-LD document.
    // Blocks that fail to parse as JSON are silently skipped.
    let mut types = Vec::new();
    for script in &json_lds {
        let text = script.text().collect::<String>();
        if let Ok(val) = serde_json::from_str::<serde_json::Value>(&text) {
            extract_types(&val, &mut types);
        }
    }

    let microdata = Selector::parse("[itemtype]")
        .ok()
        .map(|s| doc.select(&s).count())
        .unwrap_or(0);

    // Also check for inline JSON-LD in raw HTML (in case scraper misses it)
    let additional = Regex::new(r#""@type"\s*:\s*"([^"]+)""#)
        .ok()
        .map(|r| {
            r.captures_iter(html)
                .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();

    // De-duplicate against the types already found via the parser.
    for t in additional {
        if !types.contains(&t) {
            types.push(t);
        }
    }

    SchemaMarkupResult {
        json_ld_count,
        json_ld_types: types,
        microdata_items: microdata,
        total_structured_data: json_ld_count + microdata,
    }
}
800
801fn extract_types(val: &serde_json::Value, types: &mut Vec<String>) {
802    match val {
803        serde_json::Value::Object(map) => {
804            if let Some(t) = map.get("@type").and_then(|v| v.as_str()) {
805                types.push(t.to_string());
806            }
807            for (_, v) in map {
808                extract_types(v, types);
809            }
810        }
811        serde_json::Value::Array(arr) => {
812            for v in arr {
813                extract_types(v, types);
814            }
815        }
816        _ => {}
817    }
818}
819
820// ── 10. Link Analysis ───────────────────────────────────────────────────────
821
822fn analyze_links(doc: &Html, base_domain: &str) -> LinkAnalysisResult {
823    let link_sel = Selector::parse("a[href]").unwrap();
824    let mut internal = 0;
825    let mut external = 0;
826    let mut nofollow = 0;
827    let mut total = 0;
828
829    for el in doc.select(&link_sel) {
830        total += 1;
831        if let Some(href) = el.value().attr("href") {
832            if href.starts_with("http") && !href.contains(base_domain) {
833                external += 1;
834            } else if !href.starts_with("mailto:")
835                && !href.starts_with("tel:")
836                && !href.starts_with('#')
837            {
838                internal += 1;
839            }
840        }
841        if let Some(rel) = el.value().attr("rel") {
842            if rel.contains("nofollow") {
843                nofollow += 1;
844            }
845        }
846    }
847
848    LinkAnalysisResult {
849        total_links: total,
850        internal_links: internal,
851        external_links: external,
852        nofollow_links: nofollow,
853    }
854}
855
856// ── 11. Image SEO ───────────────────────────────────────────────────────────
857
858fn analyze_images(doc: &Html) -> ImageSeoResult {
859    let img_sel = Selector::parse("img").unwrap();
860    let images: Vec<_> = doc.select(&img_sel).collect();
861    let total = images.len();
862    let lazy = images
863        .iter()
864        .filter(|i| i.value().attr("loading") == Some("lazy"))
865        .count();
866    let alt = images
867        .iter()
868        .filter(|i| i.value().attr("alt").is_some())
869        .count();
870    let title = images
871        .iter()
872        .filter(|i| i.value().attr("title").is_some())
873        .count();
874
875    let opt_score = if total > 0 {
876        format!("{:.1}%", ((lazy + alt) as f64 / (total * 2) as f64) * 100.0)
877    } else {
878        "0%".into()
879    };
880
881    ImageSeoResult {
882        total_images: total,
883        lazy_loaded: lazy,
884        with_alt_text: alt,
885        with_title: title,
886        optimization_score: opt_score,
887    }
888}
889
890// ── 12. Page Speed Factors ──────────────────────────────────────────────────
891
892fn analyze_speed_factors(doc: &Html, headers: &reqwest::header::HeaderMap) -> PageSpeedResult {
893    let css_sel = Selector::parse("link[rel=\"stylesheet\"]").unwrap();
894    let js_sel = Selector::parse("script[src]").unwrap();
895    let style_sel = Selector::parse("style").unwrap();
896    let inline_js_sel = Selector::parse("script:not([src])").unwrap();
897
898    PageSpeedResult {
899        css_files: doc.select(&css_sel).count(),
900        js_files: doc.select(&js_sel).count(),
901        inline_styles: doc.select(&style_sel).count(),
902        inline_scripts: doc.select(&inline_js_sel).count(),
903        compression: headers
904            .get("content-encoding")
905            .and_then(|v| v.to_str().ok())
906            .unwrap_or("None")
907            .into(),
908    }
909}
910
911// ── 13. SEO Score ───────────────────────────────────────────────────────────
912
913fn calculate_seo_score(
914    basic: &BasicSeoResult,
915    content: &ContentAnalysisResult,
916    resources: &HashMap<String, String>,
917    schema: &SchemaMarkupResult,
918    perf: &PerformanceResult,
919    mobile: &MobileAccessibilityResult,
920) -> SeoScoreResult {
921    let mut score: u32 = 0;
922
923    // Basic SEO (30 pts)
924    if basic.title.status == "Good" {
925        score += 10;
926    }
927    if basic.meta_description.status == "Good" {
928        score += 10;
929    }
930    if basic.canonical_url != "Not Found" {
931        score += 5;
932    }
933    if basic.viewport != "Not Found" {
934        score += 5;
935    }
936
937    // Content (20 pts)
938    if content.word_count_status == "Good" {
939        score += 10;
940    }
941    if content.headings.contains_key("H1") {
942        score += 10;
943    }
944
945    // Technical (20 pts)
946    if resources.get("robots.txt").map(|s| s.as_str()) == Some("Found") {
947        score += 5;
948    }
949    if resources.get("sitemap.xml").map(|s| s.as_str()) == Some("Found") {
950        score += 5;
951    }
952    if schema.total_structured_data > 0 {
953        score += 10;
954    }
955
956    // Performance (15 pts)
957    match perf.load_time_status.as_str() {
958        "Excellent" | "Good" => score += 15,
959        _ => {}
960    }
961
962    // Security (10 pts) — counted from headers presence (simplified)
963    score += 5; // base
964
965    // Mobile (5 pts)
966    if mobile.mobile_friendly {
967        score += 5;
968    }
969
970    let max_score = 100u32;
971    let pct = (score as f64 / max_score as f64) * 100.0;
972    let grade = if pct >= 90.0 {
973        "A+"
974    } else if pct >= 80.0 {
975        "A"
976    } else if pct >= 70.0 {
977        "B"
978    } else if pct >= 60.0 {
979        "C"
980    } else if pct >= 50.0 {
981        "D"
982    } else {
983        "F"
984    };
985
986    SeoScoreResult {
987        score,
988        max_score,
989        percentage: format!("{:.1}%", pct),
990        grade: grade.into(),
991    }
992}