Skip to main content

web_analyzer/
seo_analysis.rs

1use regex::Regex;
2use reqwest::Client;
3use scraper::{Html, Selector};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::time::{Duration, Instant};
7
8// ── Tracking tool detection patterns ────────────────────────────────────────
9
/// Third-party tracking/marketing tools paired with the script-URL or
/// JS-snippet substrings whose presence in the page HTML indicates them.
/// Consumed by `analyze_analytics`, which does case-insensitive substring
/// matching ("any pattern matches" per tool).
const TRACKING_TOOLS: &[(&str, &[&str])] = &[
    (
        "Google Tag Manager",
        &["googletagmanager.com/gtm.js", "dataLayer"],
    ),
    (
        "Google Ads",
        &["googleads.g.doubleclick.net", "googlesyndication.com"],
    ),
    ("Facebook Pixel", &["connect.facebook.net", "fbq("]),
    (
        "LinkedIn Insight",
        &["snap.licdn.com", "_linkedin_partner_id"],
    ),
    ("TikTok Pixel", &["analytics.tiktok.com", "ttq."]),
    ("Hotjar", &["static.hotjar.com", "hjid"]),
    ("Mixpanel", &["cdn.mxpnl.com", "mixpanel.init"]),
    ("Segment", &["cdn.segment.com", "analytics.load"]),
    ("Intercom", &["widget.intercom.io"]),
    ("Zendesk", &["static.zdassets.com"]),
    ("Crisp", &["client.crisp.chat"]),
];
32
/// Well-known root-level SEO files probed by `check_seo_resources`;
/// each is reported "Found" on an HTTP 2xx response, "Not Found" otherwise.
const SEO_RESOURCES: &[&str] = &["robots.txt", "sitemap.xml", "humans.txt", "ads.txt"];
35
36// ── Data Structures ─────────────────────────────────────────────────────────
37
/// Aggregate result of the full SEO analysis pipeline produced by
/// `analyze_advanced_seo`; each field is filled by one analysis stage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SeoAnalysisResult {
    pub domain: String,
    pub basic_seo: BasicSeoResult,
    pub content_analysis: ContentAnalysisResult,
    pub technical_seo: TechnicalSeoResult,
    pub social_media: SocialMediaResult,
    // Tool name -> "Found" / "Not Found".
    pub analytics: HashMap<String, String>,
    pub performance: PerformanceResult,
    pub mobile_accessibility: MobileAccessibilityResult,
    // Resource file name (e.g. "robots.txt") -> "Found" / "Not Found".
    pub seo_resources: HashMap<String, String>,
    pub schema_markup: SchemaMarkupResult,
    pub link_analysis: LinkAnalysisResult,
    pub image_seo: ImageSeoResult,
    pub page_speed_factors: PageSpeedResult,
    pub seo_score: SeoScoreResult,
}
55
/// `<title>` tag evaluation: text, length, and a length verdict
/// ("Missing" / "Too short" / "Too long" / "Good").
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TitleAnalysis {
    pub text: String,
    pub length: usize,
    pub status: String,
}
62
/// Meta-description evaluation: text, length, and a length verdict
/// ("Missing" / "Too short" / "Too long" / "Good").
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetaDescAnalysis {
    pub text: String,
    pub length: usize,
    pub status: String,
}
69
/// Core on-page SEO signals extracted from the document head.
/// String fields hold "Not Found" when the corresponding tag is absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BasicSeoResult {
    pub title: TitleAnalysis,
    pub meta_description: MetaDescAnalysis,
    pub meta_keywords: String,
    pub canonical_url: String,
    pub meta_robots: String,
    pub viewport: String,
    // From the root element's `lang` attribute; "Not specified" when absent.
    pub language: String,
    pub charset: String,
}
81
/// Per-level heading summary: total count plus up to three sample texts
/// (each truncated to 100 characters).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HeadingInfo {
    pub count: usize,
    pub texts: Vec<String>,
}
87
/// One entry in the top-keyword list: the word, its occurrence count,
/// and its density as a formatted percentage (e.g. "2.35%").
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KeywordInfo {
    pub word: String,
    pub count: usize,
    pub density: String,
}
94
/// Page-content metrics: heading structure, word statistics, and keywords.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentAnalysisResult {
    // Keyed "H1".."H6"; levels with no headings are absent.
    pub headings: HashMap<String, HeadingInfo>,
    pub heading_issues: Vec<String>,
    pub word_count: usize,
    // "Good" (>= 300 words) or "Too short".
    pub word_count_status: String,
    pub paragraphs: usize,
    // Visible-text bytes vs. total HTML bytes, formatted like "12.3%".
    pub text_to_html_ratio: String,
    pub top_keywords: Vec<KeywordInfo>,
}
105
/// Technical signals from the HTTP response and document structure.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TechnicalSeoResult {
    pub page_size_bytes: usize,
    pub http_status: u16,
    // Simplified: 1 if the final URL differs from the requested URL, else 0.
    pub redirects: usize,
    pub internal_links: usize,
    pub external_links: usize,
    // JSON-LD script tags plus elements carrying `itemtype`.
    pub structured_data_count: usize,
    pub has_breadcrumbs: bool,
}
116
/// Open Graph and Twitter Card meta tags; value is "Not Found" when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SocialMediaResult {
    pub open_graph: HashMap<String, String>,
    pub twitter_cards: HashMap<String, String>,
}
122
/// Response-time and response-header performance indicators.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceResult {
    // Rounded to 2 decimal places.
    pub load_time_secs: f64,
    // "Excellent" (< 1 s), "Good" (< 3 s), or "Poor".
    pub load_time_status: String,
    pub content_size_kb: f64,
    // `Content-Encoding` header value, or "None".
    pub compression: String,
    pub server: String,
    pub cache_control: String,
    pub etag: bool,
}
133
/// Image `alt`-attribute coverage statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AltAttributeResult {
    pub total_images: usize,
    pub images_with_alt: usize,
    pub missing_alt: usize,
    // Percentage string such as "87.5%"; "0%" when there are no images.
    pub alt_coverage: String,
}
141
/// Mobile-readiness and basic accessibility indicators.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MobileAccessibilityResult {
    pub viewport_present: bool,
    // True when the viewport meta contains "width=device-width".
    pub mobile_friendly: bool,
    pub alt_attributes: AltAttributeResult,
    // Count of elements with an `aria-label` attribute.
    pub aria_labels: usize,
}
149
/// Structured-data (schema.org) presence summary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaMarkupResult {
    pub json_ld_count: usize,
    // Deduplicated `@type` values discovered in JSON-LD payloads.
    pub json_ld_types: Vec<String>,
    pub microdata_items: usize,
    pub total_structured_data: usize,
}
157
/// Anchor-tag breakdown for the page.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LinkAnalysisResult {
    pub total_links: usize,
    pub internal_links: usize,
    pub external_links: usize,
    pub nofollow_links: usize,
}
165
/// Image optimization summary (lazy loading, alt text, title attributes).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageSeoResult {
    pub total_images: usize,
    pub lazy_loaded: usize,
    pub with_alt_text: usize,
    pub with_title: usize,
    // (lazy + alt) / (2 * total) as a percentage string; "0%" with no images.
    pub optimization_score: String,
}
174
/// Resource counts that influence page-load speed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageSpeedResult {
    pub css_files: usize,
    pub js_files: usize,
    // `<style>` blocks and `<script>` tags without a `src`, respectively.
    pub inline_styles: usize,
    pub inline_scripts: usize,
    pub compression: String,
}
183
/// Final weighted SEO score with a letter grade (A+ .. F).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SeoScoreResult {
    pub score: u32,
    pub max_score: u32,
    pub percentage: String,
    pub grade: String,
}
191
192// ── Main function ───────────────────────────────────────────────────────────
193
194pub async fn analyze_advanced_seo(
195    domain: &str,
196) -> Result<SeoAnalysisResult, Box<dyn std::error::Error + Send + Sync>> {
197    let url = if domain.starts_with("http") {
198        domain.to_string()
199    } else {
200        format!("https://{}", domain)
201    };
202
203    let client = Client::builder()
204        .timeout(Duration::from_secs(20))
205        .danger_accept_invalid_certs(true)
206        .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
207        .build()?;
208
209    let start = Instant::now();
210    let resp = client.get(&url).send().await?;
211    let load_time = start.elapsed().as_secs_f64();
212
213    let status_code = resp.status().as_u16();
214    let redirects = resp.url().to_string() != url; // simplified
215    let headers = resp.headers().clone();
216    let content_bytes = resp.bytes().await?;
217    let content_size = content_bytes.len();
218    let html_text = String::from_utf8_lossy(&content_bytes).to_string();
219    let base_domain = domain
220        .replace("https://", "")
221        .replace("http://", "")
222        .replace("www.", "");
223
224    // ── 8. SEO Resources (await before parsing HTML to avoid Send bounds) ──
225    let seo_resources = check_seo_resources(&client, domain).await;
226
227    let document = Html::parse_document(&html_text);
228
229    // ── 1. Basic SEO ────────────────────────────────────────────────────
230    let basic_seo = analyze_basic_seo(&document);
231
232    // ── 2. Content Analysis ─────────────────────────────────────────────
233    let content_analysis = analyze_content(&document);
234
235    // ── 3. Technical SEO ────────────────────────────────────────────────
236    let technical_seo = analyze_technical(
237        &document,
238        status_code,
239        content_size,
240        redirects as usize,
241        &base_domain,
242    );
243
244    // ── 4. Social Media Tags ────────────────────────────────────────────
245    let social_media = analyze_social_tags(&document);
246
247    // ── 5. Analytics & Tracking ─────────────────────────────────────────
248    let analytics = analyze_analytics(&html_text);
249
250    // ── 6. Performance ──────────────────────────────────────────────────
251    let performance = analyze_performance(&headers, load_time, content_size);
252
253    // ── 7. Mobile & Accessibility ───────────────────────────────────────
254    let mobile_accessibility = analyze_mobile(&document);
255
256    // ── 9. Schema Markup ────────────────────────────────────────────────
257    let schema_markup = analyze_schema(&document, &html_text);
258
259    // ── 10. Link Analysis ───────────────────────────────────────────────
260    let link_analysis = analyze_links(&document, &base_domain);
261
262    // ── 11. Image SEO ───────────────────────────────────────────────────
263    let image_seo = analyze_images(&document);
264
265    // ── 12. Page Speed Factors ──────────────────────────────────────────
266    let page_speed_factors = analyze_speed_factors(&document, &headers);
267
268    // ── 13. SEO Score ───────────────────────────────────────────────────
269    let seo_score = calculate_seo_score(
270        &basic_seo,
271        &content_analysis,
272        &seo_resources,
273        &schema_markup,
274        &performance,
275        &mobile_accessibility,
276    );
277
278    Ok(SeoAnalysisResult {
279        domain: domain.to_string(),
280        basic_seo,
281        content_analysis,
282        technical_seo,
283        social_media,
284        analytics,
285        performance,
286        mobile_accessibility,
287        seo_resources,
288        schema_markup,
289        link_analysis,
290        image_seo,
291        page_speed_factors,
292        seo_score,
293    })
294}
295
296// ── 1. Basic SEO ────────────────────────────────────────────────────────────
297
298fn analyze_basic_seo(doc: &Html) -> BasicSeoResult {
299    let title_sel = Selector::parse("title").unwrap();
300    let title_text = doc
301        .select(&title_sel)
302        .next()
303        .map(|el| el.text().collect::<String>().trim().to_string())
304        .unwrap_or_default();
305
306    let title_len = title_text.len();
307    let title_status = if title_text.is_empty() {
308        "Missing"
309    } else if title_len < 30 {
310        "Too short"
311    } else if title_len > 60 {
312        "Too long"
313    } else {
314        "Good"
315    };
316
317    let desc = get_meta_content(doc, "name", "description");
318    let desc_len = if desc == "Not Found" { 0 } else { desc.len() };
319    let desc_status = if desc == "Not Found" {
320        "Missing"
321    } else if desc_len < 120 {
322        "Too short"
323    } else if desc_len > 160 {
324        "Too long"
325    } else {
326        "Good"
327    };
328
329    BasicSeoResult {
330        title: TitleAnalysis {
331            text: if title_text.is_empty() {
332                "Missing".into()
333            } else {
334                title_text
335            },
336            length: title_len,
337            status: title_status.into(),
338        },
339        meta_description: MetaDescAnalysis {
340            text: desc.clone(),
341            length: desc_len,
342            status: desc_status.into(),
343        },
344        meta_keywords: get_meta_content(doc, "name", "keywords"),
345        canonical_url: get_link_href(doc, "canonical"),
346        meta_robots: get_meta_content(doc, "name", "robots"),
347        viewport: get_meta_content(doc, "name", "viewport"),
348        language: doc
349            .root_element()
350            .value()
351            .attr("lang")
352            .unwrap_or("Not specified")
353            .to_string(),
354        charset: get_charset(doc),
355    }
356}
357
358fn get_meta_content(doc: &Html, attr: &str, value: &str) -> String {
359    let selector_str = format!("meta[{}=\"{}\"]", attr, value);
360    if let Ok(sel) = Selector::parse(&selector_str) {
361        if let Some(el) = doc.select(&sel).next() {
362            if let Some(content) = el.value().attr("content") {
363                return content.trim().to_string();
364            }
365        }
366    }
367    "Not Found".into()
368}
369
370fn get_link_href(doc: &Html, rel: &str) -> String {
371    let selector_str = format!("link[rel=\"{}\"]", rel);
372    if let Ok(sel) = Selector::parse(&selector_str) {
373        if let Some(el) = doc.select(&sel).next() {
374            if let Some(href) = el.value().attr("href") {
375                return href.trim().to_string();
376            }
377        }
378    }
379    "Not Found".into()
380}
381
382fn get_charset(doc: &Html) -> String {
383    if let Ok(sel) = Selector::parse("meta[charset]") {
384        if let Some(el) = doc.select(&sel).next() {
385            if let Some(cs) = el.value().attr("charset") {
386                return cs.to_string();
387            }
388        }
389    }
390    if let Ok(sel) = Selector::parse("meta[http-equiv=\"Content-Type\"]") {
391        if let Some(el) = doc.select(&sel).next() {
392            if let Some(content) = el.value().attr("content") {
393                if let Some(cs) = Regex::new(r"charset=([^;]+)")
394                    .ok()
395                    .and_then(|r| r.captures(content))
396                {
397                    return cs.get(1).unwrap().as_str().to_string();
398                }
399            }
400        }
401    }
402    "Unknown".into()
403}
404
405// ── 2. Content Analysis ─────────────────────────────────────────────────────
406
/// Analyze page content: heading structure (H1–H6), word/paragraph counts,
/// text-to-HTML ratio, and the top-5 keyword densities.
fn analyze_content(doc: &Html) -> ContentAnalysisResult {
    // Per-level summaries keyed "H1".."H6"; only non-empty levels inserted.
    let mut headings = HashMap::new();
    // Flat (level, text) sequence, in per-level selector order, used for
    // structural checks in `check_heading_issues`.
    let mut hierarchy: Vec<(u8, String)> = Vec::new();

    let h_selectors = [
        (1u8, Selector::parse("h1").unwrap()),
        (2, Selector::parse("h2").unwrap()),
        (3, Selector::parse("h3").unwrap()),
        (4, Selector::parse("h4").unwrap()),
        (5, Selector::parse("h5").unwrap()),
        (6, Selector::parse("h6").unwrap()),
    ];

    for (i, sel) in &h_selectors {
        let elements: Vec<_> = doc.select(sel).collect();
        if !elements.is_empty() {
            // Keep at most 3 sample texts per level, truncated to 100 chars.
            let texts: Vec<String> = elements
                .iter()
                .take(3)
                .map(|e| {
                    let t = e.text().collect::<String>();
                    t.trim().chars().take(100).collect()
                })
                .collect();
            headings.insert(
                format!("H{}", i),
                HeadingInfo {
                    count: elements.len(),
                    texts,
                },
            );
            for e in &elements {
                let t = e.text().collect::<String>().trim().to_string();
                hierarchy.push((*i, t));
            }
        }
    }

    // NOTE(review): `hierarchy` is built level-by-level, not in document
    // order, so the "skipped level" check below reflects selector order —
    // confirm this matches the intended semantics.
    let heading_issues = check_heading_issues(&hierarchy);

    // NOTE(review): this collects every text node under the root, which
    // likely includes <script>/<style> contents and inflates the word
    // count — verify against representative pages.
    let text = doc.root_element().text().collect::<String>();
    let words: Vec<&str> = text.split_whitespace().collect();
    let word_count = words.len();

    let p_sel = Selector::parse("p").unwrap();
    let paragraphs = doc.select(&p_sel).count();

    // Ratio of extracted text bytes to serialized HTML bytes.
    let html_len = doc.html().len();
    let text_len = text.len();
    let ratio = if html_len > 0 {
        (text_len as f64 / html_len as f64) * 100.0
    } else {
        0.0
    };

    let top_keywords = analyze_keyword_density(&words);

    ContentAnalysisResult {
        headings,
        heading_issues,
        word_count,
        // 300 words is the threshold for "Good".
        word_count_status: if word_count >= 300 {
            "Good"
        } else {
            "Too short"
        }
        .into(),
        paragraphs,
        text_to_html_ratio: format!("{:.1}%", ratio),
        top_keywords,
    }
}
479
/// Validate heading structure: H1 must exist exactly once, and no level
/// may jump by more than one between consecutive entries.
fn check_heading_issues(hierarchy: &[(u8, String)]) -> Vec<String> {
    if hierarchy.is_empty() {
        return vec!["No headings found".into()];
    }

    let mut problems = Vec::new();

    // Exactly one H1 is expected.
    match hierarchy.iter().filter(|(level, _)| *level == 1).count() {
        0 => problems.push("Missing H1 tag".into()),
        1 => {}
        n => problems.push(format!("Multiple H1 tags ({})", n)),
    }

    // A jump of more than one level between neighbours is a skipped level.
    for pair in hierarchy.windows(2) {
        let (current, following) = (pair[0].0, pair[1].0);
        if following > current + 1 {
            problems.push(format!(
                "Skipped heading level (from H{} to H{})",
                current, following
            ));
        }
    }

    problems
}
506
507fn analyze_keyword_density(words: &[&str]) -> Vec<KeywordInfo> {
508    let total = words.len();
509    if total == 0 {
510        return vec![];
511    }
512
513    let mut freq: HashMap<String, usize> = HashMap::new();
514    for &w in words {
515        let lower = w.to_lowercase();
516        if lower.len() > 3 {
517            *freq.entry(lower).or_insert(0) += 1;
518        }
519    }
520
521    let mut sorted: Vec<_> = freq.into_iter().collect();
522    sorted.sort_by(|a, b| b.1.cmp(&a.1));
523
524    sorted
525        .into_iter()
526        .take(5)
527        .map(|(word, count)| KeywordInfo {
528            word,
529            count,
530            density: format!("{:.2}%", (count as f64 / total as f64) * 100.0),
531        })
532        .collect()
533}
534
535// ── 3. Technical SEO ────────────────────────────────────────────────────────
536
/// Collect technical signals: link counts, structured-data counts, and a
/// breadcrumb heuristic, combined with the HTTP status/size passed in.
fn analyze_technical(
    doc: &Html,
    status: u16,
    size: usize,
    redirects: usize,
    base_domain: &str,
) -> TechnicalSeoResult {
    let link_sel = Selector::parse("a[href]").unwrap();
    let mut internal = 0;
    let mut external = 0;

    // NOTE(review): classification is heuristic — `contains(base_domain)`
    // also matches the domain appearing anywhere in an external URL, and
    // protocol-relative ("//…") or "javascript:" hrefs count as internal.
    // The same logic is duplicated in `analyze_links`; keep them in sync.
    for el in doc.select(&link_sel) {
        if let Some(href) = el.value().attr("href") {
            if href.starts_with("http") && !href.contains(base_domain) {
                external += 1;
            } else if !href.starts_with("mailto:")
                && !href.starts_with("tel:")
                && !href.starts_with('#')
            {
                internal += 1;
            }
        }
    }

    // Structured data = JSON-LD script tags + elements with `itemtype`.
    let json_ld = Selector::parse("script[type=\"application/ld+json\"]")
        .ok()
        .map(|s| doc.select(&s).count())
        .unwrap_or(0);
    let microdata = Selector::parse("[itemtype]")
        .ok()
        .map(|s| doc.select(&s).count())
        .unwrap_or(0);

    // Breadcrumb heuristic: RDFa typeof, or the word "breadcrumb" anywhere
    // in the markup. NOTE(review): the fallback lowercases a full copy of
    // the document and matches CSS class names etc. — broad by design?
    let breadcrumb = Selector::parse("[typeof=\"BreadcrumbList\"]")
        .ok()
        .map(|s| doc.select(&s).next().is_some())
        .unwrap_or(false)
        || doc.html().to_lowercase().contains("breadcrumb");

    TechnicalSeoResult {
        page_size_bytes: size,
        http_status: status,
        redirects,
        internal_links: internal,
        external_links: external,
        structured_data_count: json_ld + microdata,
        has_breadcrumbs: breadcrumb,
    }
}
586
587// ── 4. Social Media Tags ────────────────────────────────────────────────────
588
589fn analyze_social_tags(doc: &Html) -> SocialMediaResult {
590    let og_keys = [
591        "og:title",
592        "og:description",
593        "og:image",
594        "og:url",
595        "og:type",
596        "og:site_name",
597    ];
598    let tw_keys = [
599        "twitter:card",
600        "twitter:title",
601        "twitter:description",
602        "twitter:image",
603        "twitter:site",
604    ];
605
606    let mut og = HashMap::new();
607    for key in &og_keys {
608        og.insert(key.to_string(), get_meta_content(doc, "property", key));
609    }
610
611    let mut tw = HashMap::new();
612    for key in &tw_keys {
613        tw.insert(key.to_string(), get_meta_content(doc, "name", key));
614    }
615
616    SocialMediaResult {
617        open_graph: og,
618        twitter_cards: tw,
619    }
620}
621
622// ── 5. Analytics & Tracking ─────────────────────────────────────────────────
623
624fn analyze_analytics(html: &str) -> HashMap<String, String> {
625    let mut results = HashMap::new();
626
627    // Google Analytics
628    let has_ga4 = Regex::new(r#"gtag\(['"]config['"],\s*['"]G-[A-Z0-9]+['"]\)"#)
629        .ok()
630        .map(|r| r.is_match(html))
631        .unwrap_or(false);
632    let has_ua = Regex::new(r#"gtag\(['"]config['"],\s*['"]UA-[0-9-]+['"]\)"#)
633        .ok()
634        .map(|r| r.is_match(html))
635        .unwrap_or(false);
636    results.insert(
637        "Google Analytics GA4".into(),
638        if has_ga4 { "Found" } else { "Not Found" }.into(),
639    );
640    results.insert(
641        "Google Analytics UA".into(),
642        if has_ua { "Found" } else { "Not Found" }.into(),
643    );
644
645    // Other tracking tools
646    let lower = html.to_lowercase();
647    for &(name, patterns) in TRACKING_TOOLS {
648        let found = patterns.iter().any(|p| lower.contains(&p.to_lowercase()));
649        results.insert(
650            name.to_string(),
651            if found { "Found" } else { "Not Found" }.into(),
652        );
653    }
654
655    results
656}
657
658// ── 6. Performance ──────────────────────────────────────────────────────────
659
660fn analyze_performance(
661    headers: &reqwest::header::HeaderMap,
662    load_time: f64,
663    size: usize,
664) -> PerformanceResult {
665    let status = if load_time < 1.0 {
666        "Excellent"
667    } else if load_time < 3.0 {
668        "Good"
669    } else {
670        "Poor"
671    };
672
673    PerformanceResult {
674        load_time_secs: (load_time * 100.0).round() / 100.0,
675        load_time_status: status.into(),
676        content_size_kb: (size as f64 / 1024.0 * 100.0).round() / 100.0,
677        compression: headers
678            .get("content-encoding")
679            .and_then(|v| v.to_str().ok())
680            .unwrap_or("None")
681            .into(),
682        server: headers
683            .get("server")
684            .and_then(|v| v.to_str().ok())
685            .unwrap_or("Unknown")
686            .into(),
687        cache_control: headers
688            .get("cache-control")
689            .and_then(|v| v.to_str().ok())
690            .unwrap_or("Not Set")
691            .into(),
692        etag: headers.contains_key("etag"),
693    }
694}
695
696// ── 7. Mobile & Accessibility ───────────────────────────────────────────────
697
698fn analyze_mobile(doc: &Html) -> MobileAccessibilityResult {
699    let viewport_content = get_meta_content(doc, "name", "viewport");
700    let has_viewport = viewport_content != "Not Found";
701    let mobile_friendly = viewport_content.contains("width=device-width");
702
703    let img_sel = Selector::parse("img").unwrap();
704    let images: Vec<_> = doc.select(&img_sel).collect();
705    let total = images.len();
706    let with_alt = images
707        .iter()
708        .filter(|i| i.value().attr("alt").is_some())
709        .count();
710
711    let aria_sel = Selector::parse("[aria-label]").unwrap();
712    let aria_count = doc.select(&aria_sel).count();
713
714    MobileAccessibilityResult {
715        viewport_present: has_viewport,
716        mobile_friendly,
717        alt_attributes: AltAttributeResult {
718            total_images: total,
719            images_with_alt: with_alt,
720            missing_alt: total - with_alt,
721            alt_coverage: if total > 0 {
722                format!("{:.1}%", (with_alt as f64 / total as f64) * 100.0)
723            } else {
724                "0%".into()
725            },
726        },
727        aria_labels: aria_count,
728    }
729}
730
731// ── 8. SEO Resources ────────────────────────────────────────────────────────
732
733async fn check_seo_resources(client: &Client, domain: &str) -> HashMap<String, String> {
734    let mut results = HashMap::new();
735    for &file in SEO_RESOURCES {
736        let url = format!("https://{}/{}", domain, file);
737        let found = match client.get(&url).send().await {
738            Ok(r) if r.status().is_success() => "Found",
739            _ => "Not Found",
740        };
741        results.insert(file.to_string(), found.into());
742    }
743    results
744}
745
746// ── 9. Schema Markup ────────────────────────────────────────────────────────
747
/// Inventory structured data: JSON-LD scripts (with their `@type` values)
/// plus microdata (`itemtype`) elements, with a raw-HTML regex fallback.
fn analyze_schema(doc: &Html, html: &str) -> SchemaMarkupResult {
    let json_ld_sel = Selector::parse("script[type=\"application/ld+json\"]").unwrap();
    let json_lds: Vec<_> = doc.select(&json_ld_sel).collect();
    let json_ld_count = json_lds.len();

    // Parse each JSON-LD payload and recursively collect `@type` values;
    // invalid JSON is silently skipped.
    let mut types = Vec::new();
    for script in &json_lds {
        let text = script.text().collect::<String>();
        if let Ok(val) = serde_json::from_str::<serde_json::Value>(&text) {
            extract_types(&val, &mut types);
        }
    }

    let microdata = Selector::parse("[itemtype]")
        .ok()
        .map(|s| doc.select(&s).count())
        .unwrap_or(0);

    // Also check for inline JSON-LD in raw HTML (in case scraper misses it).
    // The dedup loop below prevents double-counting types already parsed.
    let additional = Regex::new(r#""@type"\s*:\s*"([^"]+)""#)
        .ok()
        .map(|r| {
            r.captures_iter(html)
                .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();

    for t in additional {
        if !types.contains(&t) {
            types.push(t);
        }
    }

    SchemaMarkupResult {
        json_ld_count,
        json_ld_types: types,
        microdata_items: microdata,
        total_structured_data: json_ld_count + microdata,
    }
}
789
790fn extract_types(val: &serde_json::Value, types: &mut Vec<String>) {
791    match val {
792        serde_json::Value::Object(map) => {
793            if let Some(t) = map.get("@type").and_then(|v| v.as_str()) {
794                types.push(t.to_string());
795            }
796            for (_, v) in map {
797                extract_types(v, types);
798            }
799        }
800        serde_json::Value::Array(arr) => {
801            for v in arr {
802                extract_types(v, types);
803            }
804        }
805        _ => {}
806    }
807}
808
809// ── 10. Link Analysis ───────────────────────────────────────────────────────
810
811fn analyze_links(doc: &Html, base_domain: &str) -> LinkAnalysisResult {
812    let link_sel = Selector::parse("a[href]").unwrap();
813    let mut internal = 0;
814    let mut external = 0;
815    let mut nofollow = 0;
816    let mut total = 0;
817
818    for el in doc.select(&link_sel) {
819        total += 1;
820        if let Some(href) = el.value().attr("href") {
821            if href.starts_with("http") && !href.contains(base_domain) {
822                external += 1;
823            } else if !href.starts_with("mailto:")
824                && !href.starts_with("tel:")
825                && !href.starts_with('#')
826            {
827                internal += 1;
828            }
829        }
830        if let Some(rel) = el.value().attr("rel") {
831            if rel.contains("nofollow") {
832                nofollow += 1;
833            }
834        }
835    }
836
837    LinkAnalysisResult {
838        total_links: total,
839        internal_links: internal,
840        external_links: external,
841        nofollow_links: nofollow,
842    }
843}
844
845// ── 11. Image SEO ───────────────────────────────────────────────────────────
846
847fn analyze_images(doc: &Html) -> ImageSeoResult {
848    let img_sel = Selector::parse("img").unwrap();
849    let images: Vec<_> = doc.select(&img_sel).collect();
850    let total = images.len();
851    let lazy = images
852        .iter()
853        .filter(|i| i.value().attr("loading") == Some("lazy"))
854        .count();
855    let alt = images
856        .iter()
857        .filter(|i| i.value().attr("alt").is_some())
858        .count();
859    let title = images
860        .iter()
861        .filter(|i| i.value().attr("title").is_some())
862        .count();
863
864    let opt_score = if total > 0 {
865        format!("{:.1}%", ((lazy + alt) as f64 / (total * 2) as f64) * 100.0)
866    } else {
867        "0%".into()
868    };
869
870    ImageSeoResult {
871        total_images: total,
872        lazy_loaded: lazy,
873        with_alt_text: alt,
874        with_title: title,
875        optimization_score: opt_score,
876    }
877}
878
879// ── 12. Page Speed Factors ──────────────────────────────────────────────────
880
881fn analyze_speed_factors(doc: &Html, headers: &reqwest::header::HeaderMap) -> PageSpeedResult {
882    let css_sel = Selector::parse("link[rel=\"stylesheet\"]").unwrap();
883    let js_sel = Selector::parse("script[src]").unwrap();
884    let style_sel = Selector::parse("style").unwrap();
885    let inline_js_sel = Selector::parse("script:not([src])").unwrap();
886
887    PageSpeedResult {
888        css_files: doc.select(&css_sel).count(),
889        js_files: doc.select(&js_sel).count(),
890        inline_styles: doc.select(&style_sel).count(),
891        inline_scripts: doc.select(&inline_js_sel).count(),
892        compression: headers
893            .get("content-encoding")
894            .and_then(|v| v.to_str().ok())
895            .unwrap_or("None")
896            .into(),
897    }
898}
899
900// ── 13. SEO Score ───────────────────────────────────────────────────────────
901
/// Combine the stage results into a weighted score out of 100 and map it
/// to a letter grade.
///
/// NOTE(review): the point buckets sum to 95 (basic 30 + content 20 +
/// technical 20 + performance 15 + security 5 + mobile 5) against a
/// declared max of 100, so 100% / "A+" at 90+ is reachable but a perfect
/// score is not — confirm whether the security bucket was meant to be 10.
fn calculate_seo_score(
    basic: &BasicSeoResult,
    content: &ContentAnalysisResult,
    resources: &HashMap<String, String>,
    schema: &SchemaMarkupResult,
    perf: &PerformanceResult,
    mobile: &MobileAccessibilityResult,
) -> SeoScoreResult {
    let mut score: u32 = 0;

    // Basic SEO (30 pts)
    if basic.title.status == "Good" {
        score += 10;
    }
    if basic.meta_description.status == "Good" {
        score += 10;
    }
    if basic.canonical_url != "Not Found" {
        score += 5;
    }
    if basic.viewport != "Not Found" {
        score += 5;
    }

    // Content (20 pts)
    if content.word_count_status == "Good" {
        score += 10;
    }
    if content.headings.contains_key("H1") {
        score += 10;
    }

    // Technical (20 pts)
    if resources.get("robots.txt").map(|s| s.as_str()) == Some("Found") {
        score += 5;
    }
    if resources.get("sitemap.xml").map(|s| s.as_str()) == Some("Found") {
        score += 5;
    }
    if schema.total_structured_data > 0 {
        score += 10;
    }

    // Performance (15 pts)
    match perf.load_time_status.as_str() {
        "Excellent" | "Good" => score += 15,
        _ => {}
    }

    // Security (10 pts) — counted from headers presence (simplified)
    // Only the unconditional 5-point base is awarded; see NOTE above.
    score += 5; // base

    // Mobile (5 pts)
    if mobile.mobile_friendly {
        score += 5;
    }

    let max_score = 100u32;
    let pct = (score as f64 / max_score as f64) * 100.0;
    // Standard 10-point grade bands from 90 (A+) down to below 50 (F).
    let grade = if pct >= 90.0 {
        "A+"
    } else if pct >= 80.0 {
        "A"
    } else if pct >= 70.0 {
        "B"
    } else if pct >= 60.0 {
        "C"
    } else if pct >= 50.0 {
        "D"
    } else {
        "F"
    };

    SeoScoreResult {
        score,
        max_score,
        percentage: format!("{:.1}%", pct),
        grade: grade.into(),
    }
}