halldyll_parser/
fingerprint.rs

1//! Content fingerprinting and change detection for halldyll-parser
2//!
3//! This module handles:
4//! - Content hashing for change detection
5//! - Structural fingerprinting
6//! - AMP page detection
7//! - Content comparison
8//! - Cache control hints
9
10use scraper::{Html, Selector};
11use serde::{Deserialize, Serialize};
12use std::collections::hash_map::DefaultHasher;
13use std::hash::{Hash, Hasher};
14
15use crate::selector::SELECTORS;
16use crate::types::ParserResult;
17
18// ============================================================================
19// TYPES
20// ============================================================================
21
/// Content fingerprint for change detection
///
/// A bundle of hashes and size counters describing one HTML document.
/// `generate_fingerprint` fills every field; the derived `Default` yields an
/// all-zero fingerprint, which the convenience wrappers in this module use
/// when fingerprinting returns an error.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct ContentFingerprint {
    /// Hash of the full HTML content (the raw input string)
    pub full_hash: u64,
    /// Hash of just the main content (excluding nav, footer, etc.)
    ///
    /// This is the only field `has_changed` compares.
    pub content_hash: u64,
    /// Hash of the text content only (no tags), whitespace-normalised
    pub text_hash: u64,
    /// Hash of the document structure (tag hierarchy)
    pub structure_hash: u64,
    /// Number of elements in the document
    pub element_count: usize,
    /// Number of text nodes (whitespace-only nodes are not counted)
    pub text_node_count: usize,
    /// Content length in bytes (length of the raw input string)
    pub content_length: usize,
    /// Main content length in bytes (length of the string behind `content_hash`)
    pub main_content_length: usize,
}
42
43impl ContentFingerprint {
44    /// Check if content has changed compared to another fingerprint
45    pub fn has_changed(&self, other: &ContentFingerprint) -> bool {
46        self.content_hash != other.content_hash
47    }
48
49    /// Check if only minor changes occurred (same structure, different content)
50    pub fn has_minor_changes(&self, other: &ContentFingerprint) -> bool {
51        self.structure_hash == other.structure_hash && 
52        self.content_hash != other.content_hash
53    }
54
55    /// Check if structure changed (major change)
56    pub fn has_structural_changes(&self, other: &ContentFingerprint) -> bool {
57        self.structure_hash != other.structure_hash
58    }
59
60    /// Get similarity percentage (0.0 to 1.0)
61    pub fn similarity(&self, other: &ContentFingerprint) -> f64 {
62        let mut matches = 0.0;
63        let total = 4.0;
64
65        if self.content_hash == other.content_hash { matches += 1.0; }
66        if self.text_hash == other.text_hash { matches += 1.0; }
67        if self.structure_hash == other.structure_hash { matches += 1.0; }
68        
69        // Element count similarity
70        let count_diff = (self.element_count as i64 - other.element_count as i64).abs();
71        let max_count = self.element_count.max(other.element_count) as f64;
72        if max_count > 0.0 {
73            matches += 1.0 - (count_diff as f64 / max_count);
74        } else {
75            matches += 1.0;
76        }
77
78        matches / total
79    }
80}
81
/// AMP (Accelerated Mobile Pages) information
///
/// Produced by `extract_amp_info`. The derived `Default` has every flag
/// `false`, every `Option` `None` and an empty component list.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub struct AmpInfo {
    /// Whether this page is an AMP page
    ///
    /// Note: `extract_amp_info` also sets this to `true` for a non-AMP page
    /// that links to an AMP version; use `is_amp_html` to test the page itself.
    pub is_amp: bool,
    /// Whether this is an AMP HTML page (⚡ or amp attribute)
    pub is_amp_html: bool,
    /// URL to the AMP version of this page (if not AMP)
    pub amp_url: Option<String>,
    /// URL to the canonical (non-AMP) version (if this is AMP)
    pub canonical_url: Option<String>,
    /// AMP version detected
    pub amp_version: Option<String>,
    /// Whether AMP runtime is included
    pub has_amp_runtime: bool,
    /// AMP components used (no duplicates, in discovery order)
    pub components: Vec<String>,
}
100
101impl AmpInfo {
102    pub fn new() -> Self {
103        Self::default()
104    }
105
106    /// Check if page has AMP version available
107    pub fn has_amp_version(&self) -> bool {
108        self.amp_url.is_some()
109    }
110}
111
/// Cache hints extracted from the page
///
/// Returned by `extract_cache_hints`, which reads `<meta http-equiv=...>`
/// tags only. That function never assigns `etag` or `last_modified`, so
/// those two fields stay `None` unless a caller sets them.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub struct CacheHints {
    /// ETag value if found
    pub etag: Option<String>,
    /// Last-Modified header value
    pub last_modified: Option<String>,
    /// Cache-Control directives (the raw `content` attribute text)
    pub cache_control: Option<String>,
    /// Whether page indicates it shouldn't be cached
    /// (`no-cache` / `no-store` in Cache-Control, or `no-cache` in Pragma)
    pub no_cache: bool,
    /// Max-age value in seconds
    pub max_age: Option<u32>,
}
126
127// ============================================================================
128// FINGERPRINTING FUNCTIONS
129// ============================================================================
130
131/// Generate content fingerprint from HTML
132pub fn generate_fingerprint(html: &str) -> ParserResult<ContentFingerprint> {
133    let document = Html::parse_document(html);
134    
135    // Main content hash (excluding boilerplate)
136    let main_content = extract_main_content(&document);
137    
138    // Text-only hash
139    let text_content = extract_text_only(&document);
140    
141    // Structure hash
142    let structure = extract_structure(&document);
143    
144    let fingerprint = ContentFingerprint {
145        full_hash: hash_string(html),
146        content_length: html.len(),
147        content_hash: hash_string(&main_content),
148        main_content_length: main_content.len(),
149        text_hash: hash_string(&text_content),
150        structure_hash: hash_string(&structure),
151        element_count: count_elements(&document),
152        text_node_count: count_text_nodes(&document),
153    };
154    
155    Ok(fingerprint)
156}
157
158/// Generate fingerprint from parsed document
159pub fn fingerprint_document(document: &Html) -> ContentFingerprint {
160    let html = document.html();
161    generate_fingerprint(&html).unwrap_or_default()
162}
163
/// Hash a string using DefaultHasher
///
/// Each call starts from a fresh `DefaultHasher::new()`, so equal strings
/// produce equal values within one build of the program.
/// NOTE(review): the standard library does not promise that `DefaultHasher`
/// keeps the same algorithm across Rust releases — confirm before comparing
/// persisted hashes produced by different toolchains.
fn hash_string(s: &str) -> u64 {
    let mut state = DefaultHasher::new();
    Hash::hash(s, &mut state);
    state.finish()
}
170
171/// Extract main content area (excluding nav, header, footer, sidebar)
172fn extract_main_content(document: &Html) -> String {
173    // Try to find main content area
174    let main_selectors = [
175        "main",
176        "article",
177        "[role='main']",
178        ".content",
179        "#content",
180        ".post-content",
181        ".article-content",
182        ".entry-content",
183    ];
184
185    for selector_str in main_selectors {
186        if let Ok(sel) = Selector::parse(selector_str) {
187            let content: String = document.select(&sel)
188                .map(|el| el.html())
189                .collect();
190            
191            if !content.is_empty() {
192                return content;
193            }
194        }
195    }
196
197    // Fallback to body without boilerplate
198    if let Some(body) = document.select(&SELECTORS.body).next() {
199        let mut content = body.html();
200        
201        // Remove common boilerplate elements
202        let boilerplate = ["<nav", "<header", "<footer", "<aside", "<script", "<style"];
203        for bp in boilerplate {
204            if let Some(start) = content.find(bp) {
205                if let Some(end) = content[start..].find('>') {
206                    // Find closing tag - simplified removal
207                    let tag_end = start + end + 1;
208                    content = format!("{}{}", &content[..start], &content[tag_end..]);
209                }
210            }
211        }
212        
213        return content;
214    }
215
216    document.html()
217}
218
219/// Extract text content only (no HTML tags)
220fn extract_text_only(document: &Html) -> String {
221    document.root_element()
222        .text()
223        .collect::<String>()
224        .split_whitespace()
225        .collect::<Vec<_>>()
226        .join(" ")
227}
228
229/// Extract document structure (tag hierarchy)
230fn extract_structure(document: &Html) -> String {
231    let mut structure = String::new();
232    extract_structure_recursive(document.root_element(), &mut structure, 0);
233    structure
234}
235
236/// Recursively extract structure
237fn extract_structure_recursive(
238    element: scraper::ElementRef,
239    structure: &mut String,
240    depth: usize,
241) {
242    // Add tag name with depth indicator
243    structure.push_str(&format!("{}:{}", depth, element.value().name()));
244    
245    // Add significant attributes
246    for attr in ["id", "class", "role"] {
247        if let Some(val) = element.value().attr(attr) {
248            // Only include first class and id for fingerprinting
249            let short_val: String = val.split_whitespace().take(1).collect();
250            if !short_val.is_empty() {
251                structure.push_str(&format!("[{}={}]", attr, short_val));
252            }
253        }
254    }
255    
256    structure.push(';');
257    
258    // Recurse into children (limit depth for performance)
259    if depth < 10 {
260        for child in element.children() {
261            if let Some(el) = scraper::ElementRef::wrap(child) {
262                // Skip script and style
263                let name = el.value().name();
264                if name != "script" && name != "style" && name != "noscript" {
265                    extract_structure_recursive(el, structure, depth + 1);
266                }
267            }
268        }
269    }
270}
271
272/// Count total elements in document
273fn count_elements(document: &Html) -> usize {
274    if let Ok(sel) = Selector::parse("*") {
275        document.select(&sel).count()
276    } else {
277        0
278    }
279}
280
281/// Count text nodes in document
282fn count_text_nodes(document: &Html) -> usize {
283    document.root_element()
284        .text()
285        .filter(|t| !t.trim().is_empty())
286        .count()
287}
288
289// ============================================================================
290// AMP DETECTION
291// ============================================================================
292
293/// Extract AMP information from document
294pub fn extract_amp_info(document: &Html, base_url: Option<&url::Url>) -> ParserResult<AmpInfo> {
295    let mut info = AmpInfo::new();
296
297    // Check if this is an AMP page
298    info.is_amp_html = detect_is_amp_page(document);
299    info.is_amp = info.is_amp_html;
300
301    // Get AMP URL (if this is not an AMP page)
302    if !info.is_amp {
303        info.amp_url = extract_amp_link(document, base_url);
304        if info.amp_url.is_some() {
305            info.is_amp = true; // Has AMP version
306        }
307    }
308
309    // Get canonical URL (if this is an AMP page)
310    if info.is_amp_html {
311        info.canonical_url = extract_canonical_link(document, base_url);
312    }
313
314    // Check for AMP runtime
315    info.has_amp_runtime = detect_amp_runtime(document);
316
317    // Extract AMP components
318    info.components = extract_amp_components(document);
319
320    // Try to detect AMP version
321    info.amp_version = detect_amp_version(document);
322
323    Ok(info)
324}
325
326/// Check if document is an AMP HTML page
327fn detect_is_amp_page(document: &Html) -> bool {
328    // Check for ⚡ or amp attribute on html element
329    if let Some(html) = document.select(&SELECTORS.html).next() {
330        // Check for amp or ⚡ attribute
331        if html.value().attr("amp").is_some() || html.value().attr("⚡").is_some() {
332            return true;
333        }
334        
335        // Check class
336        if html.value().classes().any(|c| c == "amp" || c == "⚡") {
337            return true;
338        }
339    }
340
341    // Check for AMP boilerplate in head
342    let html_str = document.html();
343    html_str.contains("amp-boilerplate") || 
344    html_str.contains("cdn.ampproject.org")
345}
346
347/// Extract link to AMP version
348fn extract_amp_link(document: &Html, base_url: Option<&url::Url>) -> Option<String> {
349    if let Ok(sel) = Selector::parse("link[rel='amphtml']") {
350        if let Some(el) = document.select(&sel).next() {
351            if let Some(href) = el.value().attr("href") {
352                return resolve_url(href, base_url);
353            }
354        }
355    }
356    None
357}
358
359/// Extract canonical link
360fn extract_canonical_link(document: &Html, base_url: Option<&url::Url>) -> Option<String> {
361    if let Ok(sel) = Selector::parse("link[rel='canonical']") {
362        if let Some(el) = document.select(&sel).next() {
363            if let Some(href) = el.value().attr("href") {
364                return resolve_url(href, base_url);
365            }
366        }
367    }
368    None
369}
370
371/// Check for AMP runtime script
372fn detect_amp_runtime(document: &Html) -> bool {
373    if let Ok(sel) = Selector::parse("script[src*='cdn.ampproject.org']") {
374        return document.select(&sel).next().is_some();
375    }
376    false
377}
378
379/// Extract AMP component names
380fn extract_amp_components(document: &Html) -> Vec<String> {
381    let mut components = Vec::new();
382
383    // Find custom element scripts
384    if let Ok(sel) = Selector::parse("script[custom-element]") {
385        for el in document.select(&sel) {
386            if let Some(name) = el.value().attr("custom-element") {
387                if !components.contains(&name.to_string()) {
388                    components.push(name.to_string());
389                }
390            }
391        }
392    }
393
394    // Also check for amp-* tags in the document
395    let html = document.html().to_lowercase();
396    let amp_tags = [
397        "amp-img", "amp-video", "amp-audio", "amp-carousel",
398        "amp-accordion", "amp-sidebar", "amp-lightbox",
399        "amp-analytics", "amp-ad", "amp-social-share",
400        "amp-form", "amp-list", "amp-bind", "amp-state",
401    ];
402
403    for tag in amp_tags {
404        if html.contains(&format!("<{}", tag)) {
405            let tag_str = tag.to_string();
406            if !components.contains(&tag_str) {
407                components.push(tag_str);
408            }
409        }
410    }
411
412    components
413}
414
415/// Detect AMP version from runtime URL
416fn detect_amp_version(document: &Html) -> Option<String> {
417    if let Ok(sel) = Selector::parse("script[src*='cdn.ampproject.org']") {
418        if let Some(el) = document.select(&sel).next() {
419            if let Some(src) = el.value().attr("src") {
420                // Try to extract version from URL like /v0.js or /v0/amp-component-0.1.js
421                if src.contains("/v0") {
422                    return Some("v0".to_string());
423                }
424                // Could parse more specific versions here
425            }
426        }
427    }
428    None
429}
430
431/// Resolve URL helper
432fn resolve_url(href: &str, base_url: Option<&url::Url>) -> Option<String> {
433    if href.starts_with("http://") || href.starts_with("https://") {
434        return Some(href.to_string());
435    }
436
437    if let Some(base) = base_url {
438        return base.join(href).ok().map(|u| u.to_string());
439    }
440
441    None
442}
443
444// ============================================================================
445// CACHE HINTS EXTRACTION
446// ============================================================================
447
448/// Extract cache hints from meta tags
449pub fn extract_cache_hints(document: &Html) -> CacheHints {
450    let mut hints = CacheHints::default();
451
452    // Check for no-cache meta tag
453    if let Ok(sel) = Selector::parse("meta[http-equiv='Cache-Control']") {
454        if let Some(el) = document.select(&sel).next() {
455            if let Some(content) = el.value().attr("content") {
456                hints.cache_control = Some(content.to_string());
457                hints.no_cache = content.to_lowercase().contains("no-cache") ||
458                                 content.to_lowercase().contains("no-store");
459                
460                // Try to extract max-age
461                if let Some(pos) = content.to_lowercase().find("max-age=") {
462                    let start = pos + 8;
463                    let num: String = content[start..]
464                        .chars()
465                        .take_while(|c| c.is_ascii_digit())
466                        .collect();
467                    hints.max_age = num.parse().ok();
468                }
469            }
470        }
471    }
472
473    // Check for Pragma: no-cache
474    if let Ok(sel) = Selector::parse("meta[http-equiv='Pragma']") {
475        if let Some(el) = document.select(&sel).next() {
476            if let Some(content) = el.value().attr("content") {
477                if content.to_lowercase().contains("no-cache") {
478                    hints.no_cache = true;
479                }
480            }
481        }
482    }
483
484    hints
485}
486
487// ============================================================================
488// CONVENIENCE FUNCTIONS
489// ============================================================================
490
491/// Check if content has changed between two HTML strings
492pub fn has_content_changed(old_html: &str, new_html: &str) -> bool {
493    let old_fp = generate_fingerprint(old_html).unwrap_or_default();
494    let new_fp = generate_fingerprint(new_html).unwrap_or_default();
495    old_fp.has_changed(&new_fp)
496}
497
498/// Get content similarity between two HTML strings (0.0 to 1.0)
499pub fn content_similarity(html1: &str, html2: &str) -> f64 {
500    let fp1 = generate_fingerprint(html1).unwrap_or_default();
501    let fp2 = generate_fingerprint(html2).unwrap_or_default();
502    fp1.similarity(&fp2)
503}
504
/// Check if page is an AMP page
///
/// Public wrapper around the module's internal AMP detection
/// (`detect_is_amp_page`); it reports on the page itself and does not look
/// for a link to a separate AMP version.
pub fn is_amp_page(document: &Html) -> bool {
    detect_is_amp_page(document)
}
509
/// Get AMP URL if available
///
/// Looks up the page's `amphtml` link without a base URL. Because no base is
/// supplied, only targets starting with `http://` or `https://` are
/// returned; a relative target yields `None`.
pub fn get_amp_url(document: &Html) -> Option<String> {
    // `None` = no base URL to resolve relative hrefs against.
    extract_amp_link(document, None)
}
514
/// Quick hash of HTML content
///
/// Hashes the raw string as-is (no parsing or normalisation) with the same
/// helper that `generate_fingerprint` uses for `full_hash`.
pub fn quick_hash(html: &str) -> u64 {
    hash_string(html)
}
519
520// ============================================================================
521// TESTS
522// ============================================================================
523
#[cfg(test)]
mod tests {
    use super::*;

    // Shared helper: parse a full HTML document for the tests below.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    // Smoke test: every hash and the basic counters are populated.
    #[test]
    fn test_generate_fingerprint() {
        let html = "<html><body><p>Hello world</p></body></html>";
        let fp = generate_fingerprint(html).unwrap();

        assert!(fp.full_hash != 0);
        assert!(fp.content_hash != 0);
        assert!(fp.text_hash != 0);
        assert!(fp.structure_hash != 0);
        assert!(fp.element_count > 0);
        assert!(fp.content_length > 0);
    }

    // Identical input must give an unchanged fingerprint and similarity 1.0.
    #[test]
    fn test_fingerprint_same_content() {
        let html1 = "<html><body><p>Hello world</p></body></html>";
        let html2 = "<html><body><p>Hello world</p></body></html>";

        let fp1 = generate_fingerprint(html1).unwrap();
        let fp2 = generate_fingerprint(html2).unwrap();

        assert!(!fp1.has_changed(&fp2));
        assert_eq!(fp1.similarity(&fp2), 1.0);
    }

    // Same tags, different text: a "minor" change with no structural change.
    #[test]
    fn test_fingerprint_different_content() {
        let html1 = "<html><body><p>Hello world</p></body></html>";
        let html2 = "<html><body><p>Goodbye world</p></body></html>";

        let fp1 = generate_fingerprint(html1).unwrap();
        let fp2 = generate_fingerprint(html2).unwrap();

        assert!(fp1.has_changed(&fp2));
        // Structure should be the same
        assert!(!fp1.has_structural_changes(&fp2));
        assert!(fp1.has_minor_changes(&fp2));
    }

    // Wrapping the paragraph in a <div> alters the tag hierarchy.
    #[test]
    fn test_fingerprint_structural_change() {
        let html1 = "<html><body><p>Hello</p></body></html>";
        let html2 = "<html><body><div><p>Hello</p></div></body></html>";

        let fp1 = generate_fingerprint(html1).unwrap();
        let fp2 = generate_fingerprint(html2).unwrap();

        assert!(fp1.has_structural_changes(&fp2));
    }

    // `<html amp>` plus the runtime script is detected as AMP.
    #[test]
    fn test_detect_amp_page() {
        let amp_html = r#"
            <!DOCTYPE html>
            <html amp>
            <head>
                <script async src="https://cdn.ampproject.org/v0.js"></script>
            </head>
            <body></body>
            </html>
        "#;

        let doc = parse_html(amp_html);
        assert!(detect_is_amp_page(&doc));
    }

    // No runtime script here: detection must rest on the ⚡ attribute alone.
    #[test]
    fn test_detect_amp_page_lightning() {
        let amp_html = r#"
            <!DOCTYPE html>
            <html ⚡>
            <head></head>
            <body></body>
            </html>
        "#;

        let doc = parse_html(amp_html);
        assert!(detect_is_amp_page(&doc));
    }

    // A plain page with no AMP markers is not AMP.
    #[test]
    fn test_not_amp_page() {
        let html = "<html><body><p>Regular page</p></body></html>";
        let doc = parse_html(html);
        assert!(!detect_is_amp_page(&doc));
    }

    // An absolute amphtml href is returned verbatim even without a base URL.
    #[test]
    fn test_extract_amp_link() {
        let html = r#"
            <html>
            <head>
                <link rel="amphtml" href="https://example.com/page.amp">
            </head>
            </html>
        "#;

        let doc = parse_html(html);
        let amp_url = extract_amp_link(&doc, None);
        assert_eq!(amp_url, Some("https://example.com/page.amp".to_string()));
    }

    // Components come from both `custom-element` scripts and amp-* tags.
    #[test]
    fn test_extract_amp_components() {
        let html = r#"
            <html amp>
            <head>
                <script custom-element="amp-carousel" src="..."></script>
                <script custom-element="amp-analytics" src="..."></script>
            </head>
            <body>
                <amp-img src="test.jpg"></amp-img>
            </body>
            </html>
        "#;

        let doc = parse_html(html);
        let components = extract_amp_components(&doc);

        assert!(components.contains(&"amp-carousel".to_string()));
        assert!(components.contains(&"amp-analytics".to_string()));
        assert!(components.contains(&"amp-img".to_string()));
    }

    // A relative amphtml href is resolved against the supplied base URL.
    #[test]
    fn test_extract_amp_info() {
        let html = r#"
            <html>
            <head>
                <link rel="amphtml" href="/amp/page">
                <link rel="canonical" href="/page">
            </head>
            </html>
        "#;

        let doc = parse_html(html);
        let base = url::Url::parse("https://example.com/").unwrap();
        let info = extract_amp_info(&doc, Some(&base)).unwrap();

        assert!(info.has_amp_version());
        assert_eq!(info.amp_url, Some("https://example.com/amp/page".to_string()));
    }

    // `max-age` is parsed and a cacheable page is not flagged no_cache.
    #[test]
    fn test_extract_cache_hints() {
        let html = r#"
            <html>
            <head>
                <meta http-equiv="Cache-Control" content="max-age=3600, public">
            </head>
            </html>
        "#;

        let doc = parse_html(html);
        let hints = extract_cache_hints(&doc);

        assert!(!hints.no_cache);
        assert_eq!(hints.max_age, Some(3600));
    }

    // `no-cache` / `no-store` directives set the no_cache flag.
    #[test]
    fn test_cache_no_cache() {
        let html = r#"
            <html>
            <head>
                <meta http-equiv="Cache-Control" content="no-cache, no-store">
            </head>
            </html>
        "#;

        let doc = parse_html(html);
        let hints = extract_cache_hints(&doc);

        assert!(hints.no_cache);
    }

    // Convenience wrapper: differing text is a change, identical input is not.
    #[test]
    fn test_has_content_changed() {
        let html1 = "<html><body><p>Version 1</p></body></html>";
        let html2 = "<html><body><p>Version 2</p></body></html>";

        assert!(has_content_changed(html1, html2));
        assert!(!has_content_changed(html1, html1));
    }

    // Identical input scores 1.0; same structure with different text scores
    // strictly between 0 and 1.
    #[test]
    fn test_content_similarity() {
        let html1 = "<html><body><p>Hello world</p></body></html>";
        let html2 = "<html><body><p>Hello world</p></body></html>";

        assert_eq!(content_similarity(html1, html2), 1.0);

        let html3 = "<html><body><p>Different content entirely</p></body></html>";
        let sim = content_similarity(html1, html3);
        assert!(sim < 1.0);
        assert!(sim > 0.0);
    }

    // quick_hash is deterministic and distinguishes different inputs.
    #[test]
    fn test_quick_hash() {
        let html1 = "<html><body>Test</body></html>";
        let html2 = "<html><body>Test</body></html>";
        let html3 = "<html><body>Different</body></html>";

        assert_eq!(quick_hash(html1), quick_hash(html2));
        assert_ne!(quick_hash(html1), quick_hash(html3));
    }

    // Whatever the inputs, the similarity score stays within [0.0, 1.0].
    #[test]
    fn test_fingerprint_similarity_range() {
        let html1 = "<html><body><div><p>Test</p></div></body></html>";
        let html2 = "<html><body><span><p>Test</p></span></body></html>";

        let fp1 = generate_fingerprint(html1).unwrap();
        let fp2 = generate_fingerprint(html2).unwrap();

        let sim = fp1.similarity(&fp2);
        assert!(sim >= 0.0 && sim <= 1.0);
    }
}