//! web_analyzer/seo_analysis.rs — advanced on-page SEO audit (fetch, parse,
//! score) for a single domain.

use std::collections::HashMap;
use std::time::{Duration, Instant};

use regex::Regex;
use reqwest::Client;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};

// ── Tracking tool detection patterns ────────────────────────────────────────

/// Third-party tracking/analytics integrations and the HTML substrings that
/// betray their presence. Each entry is `(tool name, detection patterns)`;
/// a tool is reported as "Found" when ANY of its patterns occurs in the page
/// HTML (matched case-insensitively — see `analyze_analytics`).
const TRACKING_TOOLS: &[(&str, &[&str])] = &[
    (
        "Google Tag Manager",
        &["googletagmanager.com/gtm.js", "dataLayer"],
    ),
    (
        "Google Ads",
        &["googleads.g.doubleclick.net", "googlesyndication.com"],
    ),
    ("Facebook Pixel", &["connect.facebook.net", "fbq("]),
    (
        "LinkedIn Insight",
        &["snap.licdn.com", "_linkedin_partner_id"],
    ),
    ("TikTok Pixel", &["analytics.tiktok.com", "ttq."]),
    ("Hotjar", &["static.hotjar.com", "hjid"]),
    ("Mixpanel", &["cdn.mxpnl.com", "mixpanel.init"]),
    ("Segment", &["cdn.segment.com", "analytics.load"]),
    ("Intercom", &["widget.intercom.io"]),
    ("Zendesk", &["static.zdassets.com"]),
    ("Crisp", &["client.crisp.chat"]),
];

/// Well-known files probed at the HTTPS site root by `check_seo_resources`;
/// each is reported as "Found" (2xx response) or "Not Found".
const SEO_RESOURCES: &[&str] = &["robots.txt", "sitemap.xml", "humans.txt", "ads.txt"];

// ── Data Structures ─────────────────────────────────────────────────────────

/// Aggregate result of a full SEO audit for one domain — one field per
/// analysis section, in the order the sections run in `analyze_advanced_seo`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SeoAnalysisResult {
    pub domain: String,
    pub basic_seo: BasicSeoResult,
    pub content_analysis: ContentAnalysisResult,
    pub technical_seo: TechnicalSeoResult,
    pub social_media: SocialMediaResult,
    /// Tool name → "Found" / "Not Found" (see `analyze_analytics`).
    pub analytics: HashMap<String, String>,
    pub performance: PerformanceResult,
    pub mobile_accessibility: MobileAccessibilityResult,
    /// File name (e.g. "robots.txt") → "Found" / "Not Found".
    pub seo_resources: HashMap<String, String>,
    pub schema_markup: SchemaMarkupResult,
    pub link_analysis: LinkAnalysisResult,
    pub image_seo: ImageSeoResult,
    pub page_speed_factors: PageSpeedResult,
    pub seo_score: SeoScoreResult,
}

/// `<title>` text plus its length grade.
/// `status` is one of "Missing" / "Too short" / "Too long" / "Good".
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TitleAnalysis {
    pub text: String,
    pub length: usize,
    pub status: String,
}

/// Meta description text plus its length grade.
/// `status` is one of "Missing" / "Too short" / "Too long" / "Good".
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetaDescAnalysis {
    pub text: String,
    pub length: usize,
    pub status: String,
}

/// Core on-page tags; absent values are reported as "Not Found"
/// (or "Not specified" / "Unknown" for language and charset).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BasicSeoResult {
    pub title: TitleAnalysis,
    pub meta_description: MetaDescAnalysis,
    pub meta_keywords: String,
    pub canonical_url: String,
    pub meta_robots: String,
    pub viewport: String,
    pub language: String,
    pub charset: String,
}

/// Per-level heading summary: total count and up to three sample texts
/// (each capped at 100 characters).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HeadingInfo {
    pub count: usize,
    pub texts: Vec<String>,
}

/// One entry of the top-keyword list; `density` is a preformatted
/// percentage string like "2.31%".
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KeywordInfo {
    pub word: String,
    pub count: usize,
    pub density: String,
}

/// Textual content audit: heading outline, word/paragraph counts,
/// text-to-HTML ratio and the five most frequent keywords.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentAnalysisResult {
    /// Keyed "H1".."H6"; levels with no occurrences are omitted.
    pub headings: HashMap<String, HeadingInfo>,
    pub heading_issues: Vec<String>,
    pub word_count: usize,
    /// "Good" (≥ 300 words) or "Too short".
    pub word_count_status: String,
    pub paragraphs: usize,
    pub text_to_html_ratio: String,
    pub top_keywords: Vec<KeywordInfo>,
}

/// Technical facts about the fetched page and its link/structured-data
/// footprint.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TechnicalSeoResult {
    pub page_size_bytes: usize,
    pub http_status: u16,
    /// Simplified: 1 if the final URL differed from the requested URL, else 0.
    pub redirects: usize,
    pub internal_links: usize,
    pub external_links: usize,
    pub structured_data_count: usize,
    pub has_breadcrumbs: bool,
}

/// Open Graph and Twitter Card meta tags, keyed by property name;
/// missing tags are mapped to "Not Found".
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SocialMediaResult {
    pub open_graph: HashMap<String, String>,
    pub twitter_cards: HashMap<String, String>,
}

/// Load-time measurement plus performance-relevant response headers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceResult {
    /// Rounded to two decimal places.
    pub load_time_secs: f64,
    /// "Excellent" (< 1 s) / "Good" (< 3 s) / "Poor".
    pub load_time_status: String,
    pub content_size_kb: f64,
    pub compression: String,
    pub server: String,
    pub cache_control: String,
    pub etag: bool,
}

/// Image alt-attribute coverage; `alt_coverage` is a preformatted
/// percentage string ("0%" when the page has no images).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AltAttributeResult {
    pub total_images: usize,
    pub images_with_alt: usize,
    pub missing_alt: usize,
    pub alt_coverage: String,
}

/// Mobile-readiness (viewport meta) and basic accessibility signals.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MobileAccessibilityResult {
    pub viewport_present: bool,
    /// True when the viewport meta contains "width=device-width".
    pub mobile_friendly: bool,
    pub alt_attributes: AltAttributeResult,
    pub aria_labels: usize,
}

/// Structured-data inventory: JSON-LD scripts (with their `@type`s) and
/// microdata items.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaMarkupResult {
    pub json_ld_count: usize,
    pub json_ld_types: Vec<String>,
    pub microdata_items: usize,
    pub total_structured_data: usize,
}

/// Anchor classification; `mailto:`, `tel:` and fragment links count in
/// `total_links` but are neither internal nor external.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LinkAnalysisResult {
    pub total_links: usize,
    pub internal_links: usize,
    pub external_links: usize,
    pub nofollow_links: usize,
}

/// `<img>` survey; `optimization_score` = (lazy + alt) / (2 × total),
/// preformatted as a percentage string.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageSeoResult {
    pub total_images: usize,
    pub lazy_loaded: usize,
    pub with_alt_text: usize,
    pub with_title: usize,
    pub optimization_score: String,
}

/// Counts of render-blocking resources plus the response compression scheme.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageSpeedResult {
    pub css_files: usize,
    pub js_files: usize,
    pub inline_styles: usize,
    pub inline_scripts: usize,
    pub compression: String,
}

/// Final weighted score out of `max_score` (100) with a letter grade
/// from "A+" down to "F".
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SeoScoreResult {
    pub score: u32,
    pub max_score: u32,
    pub percentage: String,
    pub grade: String,
}

// ── Main function ───────────────────────────────────────────────────────────

/// Runs the complete advanced SEO audit for `domain` and assembles all
/// section results into a single [`SeoAnalysisResult`].
///
/// `domain` may be a bare host ("example.com") or a full URL; bare hosts are
/// fetched over HTTPS. When `progress_tx` is provided, coarse progress
/// updates (5% → 100%) are emitted via `try_send`; send failures are
/// deliberately ignored so a slow or closed receiver can never stall the scan.
///
/// # Errors
/// Fails only if the HTTP client cannot be built or the homepage request
/// itself fails; the auxiliary resource probes (robots.txt etc.) are
/// best-effort and never fail the audit.
pub async fn analyze_advanced_seo(
    domain: &str,
    progress_tx: Option<tokio::sync::mpsc::Sender<crate::ScanProgress>>,
) -> Result<SeoAnalysisResult, Box<dyn std::error::Error + Send + Sync>> {
    let url = if domain.starts_with("http") {
        domain.to_string()
    } else {
        format!("https://{}", domain)
    };

    if let Some(t) = &progress_tx { let _ = t.try_send(crate::ScanProgress { module: "SEO Analysis".into(), percentage: 5.0, message: "Fetching homepage HTML...".into(), status: "Info".into() }); }

    // NOTE(review): certificate validation is disabled so misconfigured sites
    // can still be scanned — deliberate trade-off, confirm it is acceptable.
    let client = Client::builder()
        .timeout(Duration::from_secs(20))
        .danger_accept_invalid_certs(true)
        .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
        .build()?;

    // Time only the homepage round-trip; parsing is excluded from load_time.
    let start = Instant::now();
    let resp = client.get(&url).send().await?;
    let load_time = start.elapsed().as_secs_f64();

    let status_code = resp.status().as_u16();
    // Simplified redirect detection: did the final URL differ from the one we
    // requested? (Cast to usize below, so this reports 0 or 1, not a count.)
    let redirects = resp.url().to_string() != url; // simplified
    let headers = resp.headers().clone();
    let content_bytes = resp.bytes().await?;
    let content_size = content_bytes.len();
    let html_text = String::from_utf8_lossy(&content_bytes).to_string();
    // Strip scheme and "www." so link classification can match by substring.
    let base_domain = domain
        .replace("https://", "")
        .replace("http://", "")
        .replace("www.", "");

    if let Some(t) = &progress_tx { let _ = t.try_send(crate::ScanProgress { module: "SEO Analysis".into(), percentage: 20.0, message: "HTML fetched. Searching for SEO resources (sitemap, robots)...".into(), status: "Success".into() }); }

    // ── 8. SEO Resources (await before parsing HTML to avoid Send bounds) ──
    // `Html` is not Send, so every `.await` must happen before it is created.
    let seo_resources = check_seo_resources(&client, domain).await;

    if let Some(t) = &progress_tx { let _ = t.try_send(crate::ScanProgress { module: "SEO Analysis".into(), percentage: 40.0, message: "Parsing HTML document...".into(), status: "Info".into() }); }

    let document = Html::parse_document(&html_text);

    // ── 1. Basic SEO ────────────────────────────────────────────────────
    let basic_seo = analyze_basic_seo(&document);

    // ── 2. Content Analysis ─────────────────────────────────────────────
    let content_analysis = analyze_content(&document);

    // ── 3. Technical SEO ────────────────────────────────────────────────
    let technical_seo = analyze_technical(
        &document,
        status_code,
        content_size,
        redirects as usize,
        &base_domain,
    );

    if let Some(t) = &progress_tx { let _ = t.try_send(crate::ScanProgress { module: "SEO Analysis".into(), percentage: 60.0, message: "Analyzing Social Media & Analytics...".into(), status: "Info".into() }); }

    // ── 4. Social Media Tags ────────────────────────────────────────────
    let social_media = analyze_social_tags(&document);

    // ── 5. Analytics & Tracking ─────────────────────────────────────────
    let analytics = analyze_analytics(&html_text);

    if let Some(t) = &progress_tx { let _ = t.try_send(crate::ScanProgress { module: "SEO Analysis".into(), percentage: 80.0, message: "Calculating SEO Core Web Factors...".into(), status: "Info".into() }); }

    // ── 6. Performance ──────────────────────────────────────────────────
    let performance = analyze_performance(&headers, load_time, content_size);

    // ── 7. Mobile & Accessibility ───────────────────────────────────────
    let mobile_accessibility = analyze_mobile(&document);

    // ── 9. Schema Markup ────────────────────────────────────────────────
    let schema_markup = analyze_schema(&document, &html_text);

    // ── 10. Link Analysis ───────────────────────────────────────────────
    let link_analysis = analyze_links(&document, &base_domain);

    // ── 11. Image SEO ───────────────────────────────────────────────────
    let image_seo = analyze_images(&document);

    // ── 12. Page Speed Factors ──────────────────────────────────────────
    let page_speed_factors = analyze_speed_factors(&document, &headers);

    // ── 13. SEO Score ───────────────────────────────────────────────────
    let seo_score = calculate_seo_score(
        &basic_seo,
        &content_analysis,
        &seo_resources,
        &schema_markup,
        &performance,
        &mobile_accessibility,
    );

    if let Some(t) = &progress_tx { let _ = t.try_send(crate::ScanProgress { module: "SEO Analysis".into(), percentage: 100.0, message: "SEO Analysis successfully completed.".into(), status: "Success".into() }); }

    Ok(SeoAnalysisResult {
        domain: domain.to_string(),
        basic_seo,
        content_analysis,
        technical_seo,
        social_media,
        analytics,
        performance,
        mobile_accessibility,
        seo_resources,
        schema_markup,
        link_analysis,
        image_seo,
        page_speed_factors,
        seo_score,
    })
}

309// ── 1. Basic SEO ────────────────────────────────────────────────────────────
310
311fn analyze_basic_seo(doc: &Html) -> BasicSeoResult {
312    let title_sel = Selector::parse("title").unwrap();
313    let title_text = doc
314        .select(&title_sel)
315        .next()
316        .map(|el| el.text().collect::<String>().trim().to_string())
317        .unwrap_or_default();
318
319    let title_len = title_text.len();
320    let title_status = if title_text.is_empty() {
321        "Missing"
322    } else if title_len < 30 {
323        "Too short"
324    } else if title_len > 60 {
325        "Too long"
326    } else {
327        "Good"
328    };
329
330    let desc = get_meta_content(doc, "name", "description");
331    let desc_len = if desc == "Not Found" { 0 } else { desc.len() };
332    let desc_status = if desc == "Not Found" {
333        "Missing"
334    } else if desc_len < 120 {
335        "Too short"
336    } else if desc_len > 160 {
337        "Too long"
338    } else {
339        "Good"
340    };
341
342    BasicSeoResult {
343        title: TitleAnalysis {
344            text: if title_text.is_empty() {
345                "Missing".into()
346            } else {
347                title_text
348            },
349            length: title_len,
350            status: title_status.into(),
351        },
352        meta_description: MetaDescAnalysis {
353            text: desc.clone(),
354            length: desc_len,
355            status: desc_status.into(),
356        },
357        meta_keywords: get_meta_content(doc, "name", "keywords"),
358        canonical_url: get_link_href(doc, "canonical"),
359        meta_robots: get_meta_content(doc, "name", "robots"),
360        viewport: get_meta_content(doc, "name", "viewport"),
361        language: doc
362            .root_element()
363            .value()
364            .attr("lang")
365            .unwrap_or("Not specified")
366            .to_string(),
367        charset: get_charset(doc),
368    }
369}
370
371fn get_meta_content(doc: &Html, attr: &str, value: &str) -> String {
372    let selector_str = format!("meta[{}=\"{}\"]", attr, value);
373    if let Ok(sel) = Selector::parse(&selector_str) {
374        if let Some(el) = doc.select(&sel).next() {
375            if let Some(content) = el.value().attr("content") {
376                return content.trim().to_string();
377            }
378        }
379    }
380    "Not Found".into()
381}
382
383fn get_link_href(doc: &Html, rel: &str) -> String {
384    let selector_str = format!("link[rel=\"{}\"]", rel);
385    if let Ok(sel) = Selector::parse(&selector_str) {
386        if let Some(el) = doc.select(&sel).next() {
387            if let Some(href) = el.value().attr("href") {
388                return href.trim().to_string();
389            }
390        }
391    }
392    "Not Found".into()
393}
394
395fn get_charset(doc: &Html) -> String {
396    if let Ok(sel) = Selector::parse("meta[charset]") {
397        if let Some(el) = doc.select(&sel).next() {
398            if let Some(cs) = el.value().attr("charset") {
399                return cs.to_string();
400            }
401        }
402    }
403    if let Ok(sel) = Selector::parse("meta[http-equiv=\"Content-Type\"]") {
404        if let Some(el) = doc.select(&sel).next() {
405            if let Some(content) = el.value().attr("content") {
406                if let Some(cs) = Regex::new(r"charset=([^;]+)")
407                    .ok()
408                    .and_then(|r| r.captures(content))
409                {
410                    return cs.get(1).unwrap().as_str().to_string();
411                }
412            }
413        }
414    }
415    "Unknown".into()
416}
417
418// ── 2. Content Analysis ─────────────────────────────────────────────────────
419
420fn analyze_content(doc: &Html) -> ContentAnalysisResult {
421    let mut headings = HashMap::new();
422    let mut hierarchy: Vec<(u8, String)> = Vec::new();
423
424    let h_selectors = [
425        (1u8, Selector::parse("h1").unwrap()),
426        (2, Selector::parse("h2").unwrap()),
427        (3, Selector::parse("h3").unwrap()),
428        (4, Selector::parse("h4").unwrap()),
429        (5, Selector::parse("h5").unwrap()),
430        (6, Selector::parse("h6").unwrap()),
431    ];
432
433    for (i, sel) in &h_selectors {
434        let elements: Vec<_> = doc.select(sel).collect();
435        if !elements.is_empty() {
436            let texts: Vec<String> = elements
437                .iter()
438                .take(3)
439                .map(|e| {
440                    let t = e.text().collect::<String>();
441                    t.trim().chars().take(100).collect()
442                })
443                .collect();
444            headings.insert(
445                format!("H{}", i),
446                HeadingInfo {
447                    count: elements.len(),
448                    texts,
449                },
450            );
451            for e in &elements {
452                let t = e.text().collect::<String>().trim().to_string();
453                hierarchy.push((*i, t));
454            }
455        }
456    }
457
458    let heading_issues = check_heading_issues(&hierarchy);
459
460    let text = doc.root_element().text().collect::<String>();
461    let words: Vec<&str> = text.split_whitespace().collect();
462    let word_count = words.len();
463
464    let p_sel = Selector::parse("p").unwrap();
465    let paragraphs = doc.select(&p_sel).count();
466
467    let html_len = doc.html().len();
468    let text_len = text.len();
469    let ratio = if html_len > 0 {
470        (text_len as f64 / html_len as f64) * 100.0
471    } else {
472        0.0
473    };
474
475    let top_keywords = analyze_keyword_density(&words);
476
477    ContentAnalysisResult {
478        headings,
479        heading_issues,
480        word_count,
481        word_count_status: if word_count >= 300 {
482            "Good"
483        } else {
484            "Too short"
485        }
486        .into(),
487        paragraphs,
488        text_to_html_ratio: format!("{:.1}%", ratio),
489        top_keywords,
490    }
491}
492
/// Flags structural problems in a heading outline: no headings at all,
/// missing or duplicated H1, and skipped levels (e.g. H2 → H4).
fn check_heading_issues(hierarchy: &[(u8, String)]) -> Vec<String> {
    if hierarchy.is_empty() {
        return vec!["No headings found".into()];
    }

    let mut issues = Vec::new();

    match hierarchy.iter().filter(|(level, _)| *level == 1).count() {
        0 => issues.push("Missing H1 tag".into()),
        1 => {}
        n => issues.push(format!("Multiple H1 tags ({})", n)),
    }

    // A downward jump of more than one level breaks the document outline.
    let mut prev = 0u8;
    for &(level, _) in hierarchy {
        if prev > 0 && level > prev + 1 {
            issues.push(format!(
                "Skipped heading level (from H{} to H{})",
                prev, level
            ));
        }
        prev = level;
    }
    issues
}

520fn analyze_keyword_density(words: &[&str]) -> Vec<KeywordInfo> {
521    let total = words.len();
522    if total == 0 {
523        return vec![];
524    }
525
526    let mut freq: HashMap<String, usize> = HashMap::new();
527    for &w in words {
528        let lower = w.to_lowercase();
529        if lower.len() > 3 {
530            *freq.entry(lower).or_insert(0) += 1;
531        }
532    }
533
534    let mut sorted: Vec<_> = freq.into_iter().collect();
535    sorted.sort_by(|a, b| b.1.cmp(&a.1));
536
537    sorted
538        .into_iter()
539        .take(5)
540        .map(|(word, count)| KeywordInfo {
541            word,
542            count,
543            density: format!("{:.2}%", (count as f64 / total as f64) * 100.0),
544        })
545        .collect()
546}
547
548// ── 3. Technical SEO ────────────────────────────────────────────────────────
549
550fn analyze_technical(
551    doc: &Html,
552    status: u16,
553    size: usize,
554    redirects: usize,
555    base_domain: &str,
556) -> TechnicalSeoResult {
557    let link_sel = Selector::parse("a[href]").unwrap();
558    let mut internal = 0;
559    let mut external = 0;
560
561    for el in doc.select(&link_sel) {
562        if let Some(href) = el.value().attr("href") {
563            if href.starts_with("http") && !href.contains(base_domain) {
564                external += 1;
565            } else if !href.starts_with("mailto:")
566                && !href.starts_with("tel:")
567                && !href.starts_with('#')
568            {
569                internal += 1;
570            }
571        }
572    }
573
574    let json_ld = Selector::parse("script[type=\"application/ld+json\"]")
575        .ok()
576        .map(|s| doc.select(&s).count())
577        .unwrap_or(0);
578    let microdata = Selector::parse("[itemtype]")
579        .ok()
580        .map(|s| doc.select(&s).count())
581        .unwrap_or(0);
582
583    let breadcrumb = Selector::parse("[typeof=\"BreadcrumbList\"]")
584        .ok()
585        .map(|s| doc.select(&s).next().is_some())
586        .unwrap_or(false)
587        || doc.html().to_lowercase().contains("breadcrumb");
588
589    TechnicalSeoResult {
590        page_size_bytes: size,
591        http_status: status,
592        redirects,
593        internal_links: internal,
594        external_links: external,
595        structured_data_count: json_ld + microdata,
596        has_breadcrumbs: breadcrumb,
597    }
598}
599
600// ── 4. Social Media Tags ────────────────────────────────────────────────────
601
602fn analyze_social_tags(doc: &Html) -> SocialMediaResult {
603    let og_keys = [
604        "og:title",
605        "og:description",
606        "og:image",
607        "og:url",
608        "og:type",
609        "og:site_name",
610    ];
611    let tw_keys = [
612        "twitter:card",
613        "twitter:title",
614        "twitter:description",
615        "twitter:image",
616        "twitter:site",
617    ];
618
619    let mut og = HashMap::new();
620    for key in &og_keys {
621        og.insert(key.to_string(), get_meta_content(doc, "property", key));
622    }
623
624    let mut tw = HashMap::new();
625    for key in &tw_keys {
626        tw.insert(key.to_string(), get_meta_content(doc, "name", key));
627    }
628
629    SocialMediaResult {
630        open_graph: og,
631        twitter_cards: tw,
632    }
633}
634
635// ── 5. Analytics & Tracking ─────────────────────────────────────────────────
636
637fn analyze_analytics(html: &str) -> HashMap<String, String> {
638    let mut results = HashMap::new();
639
640    // Google Analytics
641    let has_ga4 = Regex::new(r#"gtag\(['"]config['"],\s*['"]G-[A-Z0-9]+['"]\)"#)
642        .ok()
643        .map(|r| r.is_match(html))
644        .unwrap_or(false);
645    let has_ua = Regex::new(r#"gtag\(['"]config['"],\s*['"]UA-[0-9-]+['"]\)"#)
646        .ok()
647        .map(|r| r.is_match(html))
648        .unwrap_or(false);
649    results.insert(
650        "Google Analytics GA4".into(),
651        if has_ga4 { "Found" } else { "Not Found" }.into(),
652    );
653    results.insert(
654        "Google Analytics UA".into(),
655        if has_ua { "Found" } else { "Not Found" }.into(),
656    );
657
658    // Other tracking tools
659    let lower = html.to_lowercase();
660    for &(name, patterns) in TRACKING_TOOLS {
661        let found = patterns.iter().any(|p| lower.contains(&p.to_lowercase()));
662        results.insert(
663            name.to_string(),
664            if found { "Found" } else { "Not Found" }.into(),
665        );
666    }
667
668    results
669}
670
671// ── 6. Performance ──────────────────────────────────────────────────────────
672
673fn analyze_performance(
674    headers: &reqwest::header::HeaderMap,
675    load_time: f64,
676    size: usize,
677) -> PerformanceResult {
678    let status = if load_time < 1.0 {
679        "Excellent"
680    } else if load_time < 3.0 {
681        "Good"
682    } else {
683        "Poor"
684    };
685
686    PerformanceResult {
687        load_time_secs: (load_time * 100.0).round() / 100.0,
688        load_time_status: status.into(),
689        content_size_kb: (size as f64 / 1024.0 * 100.0).round() / 100.0,
690        compression: headers
691            .get("content-encoding")
692            .and_then(|v| v.to_str().ok())
693            .unwrap_or("None")
694            .into(),
695        server: headers
696            .get("server")
697            .and_then(|v| v.to_str().ok())
698            .unwrap_or("Unknown")
699            .into(),
700        cache_control: headers
701            .get("cache-control")
702            .and_then(|v| v.to_str().ok())
703            .unwrap_or("Not Set")
704            .into(),
705        etag: headers.contains_key("etag"),
706    }
707}
708
709// ── 7. Mobile & Accessibility ───────────────────────────────────────────────
710
711fn analyze_mobile(doc: &Html) -> MobileAccessibilityResult {
712    let viewport_content = get_meta_content(doc, "name", "viewport");
713    let has_viewport = viewport_content != "Not Found";
714    let mobile_friendly = viewport_content.contains("width=device-width");
715
716    let img_sel = Selector::parse("img").unwrap();
717    let images: Vec<_> = doc.select(&img_sel).collect();
718    let total = images.len();
719    let with_alt = images
720        .iter()
721        .filter(|i| i.value().attr("alt").is_some())
722        .count();
723
724    let aria_sel = Selector::parse("[aria-label]").unwrap();
725    let aria_count = doc.select(&aria_sel).count();
726
727    MobileAccessibilityResult {
728        viewport_present: has_viewport,
729        mobile_friendly,
730        alt_attributes: AltAttributeResult {
731            total_images: total,
732            images_with_alt: with_alt,
733            missing_alt: total - with_alt,
734            alt_coverage: if total > 0 {
735                format!("{:.1}%", (with_alt as f64 / total as f64) * 100.0)
736            } else {
737                "0%".into()
738            },
739        },
740        aria_labels: aria_count,
741    }
742}
743
744// ── 8. SEO Resources ────────────────────────────────────────────────────────
745
746async fn check_seo_resources(client: &Client, domain: &str) -> HashMap<String, String> {
747    let mut results = HashMap::new();
748    for &file in SEO_RESOURCES {
749        let url = format!("https://{}/{}", domain, file);
750        let found = match client.get(&url).send().await {
751            Ok(r) if r.status().is_success() => "Found",
752            _ => "Not Found",
753        };
754        results.insert(file.to_string(), found.into());
755    }
756    results
757}
758
759// ── 9. Schema Markup ────────────────────────────────────────────────────────
760
761fn analyze_schema(doc: &Html, html: &str) -> SchemaMarkupResult {
762    let json_ld_sel = Selector::parse("script[type=\"application/ld+json\"]").unwrap();
763    let json_lds: Vec<_> = doc.select(&json_ld_sel).collect();
764    let json_ld_count = json_lds.len();
765
766    let mut types = Vec::new();
767    for script in &json_lds {
768        let text = script.text().collect::<String>();
769        if let Ok(val) = serde_json::from_str::<serde_json::Value>(&text) {
770            extract_types(&val, &mut types);
771        }
772    }
773
774    let microdata = Selector::parse("[itemtype]")
775        .ok()
776        .map(|s| doc.select(&s).count())
777        .unwrap_or(0);
778
779    // Also check for inline JSON-LD in raw HTML (in case scraper misses it)
780    let additional = Regex::new(r#""@type"\s*:\s*"([^"]+)""#)
781        .ok()
782        .map(|r| {
783            r.captures_iter(html)
784                .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
785                .collect::<Vec<_>>()
786        })
787        .unwrap_or_default();
788
789    for t in additional {
790        if !types.contains(&t) {
791            types.push(t);
792        }
793    }
794
795    SchemaMarkupResult {
796        json_ld_count,
797        json_ld_types: types,
798        microdata_items: microdata,
799        total_structured_data: json_ld_count + microdata,
800    }
801}
802
803fn extract_types(val: &serde_json::Value, types: &mut Vec<String>) {
804    match val {
805        serde_json::Value::Object(map) => {
806            if let Some(t) = map.get("@type").and_then(|v| v.as_str()) {
807                types.push(t.to_string());
808            }
809            for (_, v) in map {
810                extract_types(v, types);
811            }
812        }
813        serde_json::Value::Array(arr) => {
814            for v in arr {
815                extract_types(v, types);
816            }
817        }
818        _ => {}
819    }
820}
821
822// ── 10. Link Analysis ───────────────────────────────────────────────────────
823
824fn analyze_links(doc: &Html, base_domain: &str) -> LinkAnalysisResult {
825    let link_sel = Selector::parse("a[href]").unwrap();
826    let mut internal = 0;
827    let mut external = 0;
828    let mut nofollow = 0;
829    let mut total = 0;
830
831    for el in doc.select(&link_sel) {
832        total += 1;
833        if let Some(href) = el.value().attr("href") {
834            if href.starts_with("http") && !href.contains(base_domain) {
835                external += 1;
836            } else if !href.starts_with("mailto:")
837                && !href.starts_with("tel:")
838                && !href.starts_with('#')
839            {
840                internal += 1;
841            }
842        }
843        if let Some(rel) = el.value().attr("rel") {
844            if rel.contains("nofollow") {
845                nofollow += 1;
846            }
847        }
848    }
849
850    LinkAnalysisResult {
851        total_links: total,
852        internal_links: internal,
853        external_links: external,
854        nofollow_links: nofollow,
855    }
856}
857
858// ── 11. Image SEO ───────────────────────────────────────────────────────────
859
860fn analyze_images(doc: &Html) -> ImageSeoResult {
861    let img_sel = Selector::parse("img").unwrap();
862    let images: Vec<_> = doc.select(&img_sel).collect();
863    let total = images.len();
864    let lazy = images
865        .iter()
866        .filter(|i| i.value().attr("loading") == Some("lazy"))
867        .count();
868    let alt = images
869        .iter()
870        .filter(|i| i.value().attr("alt").is_some())
871        .count();
872    let title = images
873        .iter()
874        .filter(|i| i.value().attr("title").is_some())
875        .count();
876
877    let opt_score = if total > 0 {
878        format!("{:.1}%", ((lazy + alt) as f64 / (total * 2) as f64) * 100.0)
879    } else {
880        "0%".into()
881    };
882
883    ImageSeoResult {
884        total_images: total,
885        lazy_loaded: lazy,
886        with_alt_text: alt,
887        with_title: title,
888        optimization_score: opt_score,
889    }
890}
891
892// ── 12. Page Speed Factors ──────────────────────────────────────────────────
893
894fn analyze_speed_factors(doc: &Html, headers: &reqwest::header::HeaderMap) -> PageSpeedResult {
895    let css_sel = Selector::parse("link[rel=\"stylesheet\"]").unwrap();
896    let js_sel = Selector::parse("script[src]").unwrap();
897    let style_sel = Selector::parse("style").unwrap();
898    let inline_js_sel = Selector::parse("script:not([src])").unwrap();
899
900    PageSpeedResult {
901        css_files: doc.select(&css_sel).count(),
902        js_files: doc.select(&js_sel).count(),
903        inline_styles: doc.select(&style_sel).count(),
904        inline_scripts: doc.select(&inline_js_sel).count(),
905        compression: headers
906            .get("content-encoding")
907            .and_then(|v| v.to_str().ok())
908            .unwrap_or("None")
909            .into(),
910    }
911}
912
913// ── 13. SEO Score ───────────────────────────────────────────────────────────
914
915fn calculate_seo_score(
916    basic: &BasicSeoResult,
917    content: &ContentAnalysisResult,
918    resources: &HashMap<String, String>,
919    schema: &SchemaMarkupResult,
920    perf: &PerformanceResult,
921    mobile: &MobileAccessibilityResult,
922) -> SeoScoreResult {
923    let mut score: u32 = 0;
924
925    // Basic SEO (30 pts)
926    if basic.title.status == "Good" {
927        score += 10;
928    }
929    if basic.meta_description.status == "Good" {
930        score += 10;
931    }
932    if basic.canonical_url != "Not Found" {
933        score += 5;
934    }
935    if basic.viewport != "Not Found" {
936        score += 5;
937    }
938
939    // Content (20 pts)
940    if content.word_count_status == "Good" {
941        score += 10;
942    }
943    if content.headings.contains_key("H1") {
944        score += 10;
945    }
946
947    // Technical (20 pts)
948    if resources.get("robots.txt").map(|s| s.as_str()) == Some("Found") {
949        score += 5;
950    }
951    if resources.get("sitemap.xml").map(|s| s.as_str()) == Some("Found") {
952        score += 5;
953    }
954    if schema.total_structured_data > 0 {
955        score += 10;
956    }
957
958    // Performance (15 pts)
959    match perf.load_time_status.as_str() {
960        "Excellent" | "Good" => score += 15,
961        _ => {}
962    }
963
964    // Security (10 pts) — counted from headers presence (simplified)
965    score += 5; // base
966
967    // Mobile (5 pts)
968    if mobile.mobile_friendly {
969        score += 5;
970    }
971
972    let max_score = 100u32;
973    let pct = (score as f64 / max_score as f64) * 100.0;
974    let grade = if pct >= 90.0 {
975        "A+"
976    } else if pct >= 80.0 {
977        "A"
978    } else if pct >= 70.0 {
979        "B"
980    } else if pct >= 60.0 {
981        "C"
982    } else if pct >= 50.0 {
983        "D"
984    } else {
985        "F"
986    };
987
988    SeoScoreResult {
989        score,
990        max_score,
991        percentage: format!("{:.1}%", pct),
992        grade: grade.into(),
993    }
994}