1use scraper::{Html, Selector};
11use serde::{Deserialize, Serialize};
12use std::collections::hash_map::DefaultHasher;
13use std::hash::{Hash, Hasher};
14
15use crate::selector::SELECTORS;
16use crate::types::ParserResult;
17
18#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
24pub struct ContentFingerprint {
25 pub full_hash: u64,
27 pub content_hash: u64,
29 pub text_hash: u64,
31 pub structure_hash: u64,
33 pub element_count: usize,
35 pub text_node_count: usize,
37 pub content_length: usize,
39 pub main_content_length: usize,
41}
42
43impl ContentFingerprint {
44 pub fn has_changed(&self, other: &ContentFingerprint) -> bool {
46 self.content_hash != other.content_hash
47 }
48
49 pub fn has_minor_changes(&self, other: &ContentFingerprint) -> bool {
51 self.structure_hash == other.structure_hash &&
52 self.content_hash != other.content_hash
53 }
54
55 pub fn has_structural_changes(&self, other: &ContentFingerprint) -> bool {
57 self.structure_hash != other.structure_hash
58 }
59
60 pub fn similarity(&self, other: &ContentFingerprint) -> f64 {
62 let mut matches = 0.0;
63 let total = 4.0;
64
65 if self.content_hash == other.content_hash { matches += 1.0; }
66 if self.text_hash == other.text_hash { matches += 1.0; }
67 if self.structure_hash == other.structure_hash { matches += 1.0; }
68
69 let count_diff = (self.element_count as i64 - other.element_count as i64).abs();
71 let max_count = self.element_count.max(other.element_count) as f64;
72 if max_count > 0.0 {
73 matches += 1.0 - (count_diff as f64 / max_count);
74 } else {
75 matches += 1.0;
76 }
77
78 matches / total
79 }
80}
81
82#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
84pub struct AmpInfo {
85 pub is_amp: bool,
87 pub is_amp_html: bool,
89 pub amp_url: Option<String>,
91 pub canonical_url: Option<String>,
93 pub amp_version: Option<String>,
95 pub has_amp_runtime: bool,
97 pub components: Vec<String>,
99}
100
101impl AmpInfo {
102 pub fn new() -> Self {
103 Self::default()
104 }
105
106 pub fn has_amp_version(&self) -> bool {
108 self.amp_url.is_some()
109 }
110}
111
112#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
114pub struct CacheHints {
115 pub etag: Option<String>,
117 pub last_modified: Option<String>,
119 pub cache_control: Option<String>,
121 pub no_cache: bool,
123 pub max_age: Option<u32>,
125}
126
127pub fn generate_fingerprint(html: &str) -> ParserResult<ContentFingerprint> {
133 let document = Html::parse_document(html);
134
135 let main_content = extract_main_content(&document);
137
138 let text_content = extract_text_only(&document);
140
141 let structure = extract_structure(&document);
143
144 let fingerprint = ContentFingerprint {
145 full_hash: hash_string(html),
146 content_length: html.len(),
147 content_hash: hash_string(&main_content),
148 main_content_length: main_content.len(),
149 text_hash: hash_string(&text_content),
150 structure_hash: hash_string(&structure),
151 element_count: count_elements(&document),
152 text_node_count: count_text_nodes(&document),
153 };
154
155 Ok(fingerprint)
156}
157
158pub fn fingerprint_document(document: &Html) -> ContentFingerprint {
160 let html = document.html();
161 generate_fingerprint(&html).unwrap_or_default()
162}
163
/// Hashes `s` with the standard library's `DefaultHasher`.
///
/// NOTE(review): the std docs leave `DefaultHasher`'s algorithm unspecified
/// across Rust releases — confirm that is acceptable if these hashes are
/// persisted.
fn hash_string(s: &str) -> u64 {
    let mut state = DefaultHasher::new();
    Hash::hash(s, &mut state);
    Hasher::finish(&state)
}
170
171fn extract_main_content(document: &Html) -> String {
173 let main_selectors = [
175 "main",
176 "article",
177 "[role='main']",
178 ".content",
179 "#content",
180 ".post-content",
181 ".article-content",
182 ".entry-content",
183 ];
184
185 for selector_str in main_selectors {
186 if let Ok(sel) = Selector::parse(selector_str) {
187 let content: String = document.select(&sel)
188 .map(|el| el.html())
189 .collect();
190
191 if !content.is_empty() {
192 return content;
193 }
194 }
195 }
196
197 if let Some(body) = document.select(&SELECTORS.body).next() {
199 let mut content = body.html();
200
201 let boilerplate = ["<nav", "<header", "<footer", "<aside", "<script", "<style"];
203 for bp in boilerplate {
204 if let Some(start) = content.find(bp) {
205 if let Some(end) = content[start..].find('>') {
206 let tag_end = start + end + 1;
208 content = format!("{}{}", &content[..start], &content[tag_end..]);
209 }
210 }
211 }
212
213 return content;
214 }
215
216 document.html()
217}
218
219fn extract_text_only(document: &Html) -> String {
221 document.root_element()
222 .text()
223 .collect::<String>()
224 .split_whitespace()
225 .collect::<Vec<_>>()
226 .join(" ")
227}
228
229fn extract_structure(document: &Html) -> String {
231 let mut structure = String::new();
232 extract_structure_recursive(document.root_element(), &mut structure, 0);
233 structure
234}
235
236fn extract_structure_recursive(
238 element: scraper::ElementRef,
239 structure: &mut String,
240 depth: usize,
241) {
242 structure.push_str(&format!("{}:{}", depth, element.value().name()));
244
245 for attr in ["id", "class", "role"] {
247 if let Some(val) = element.value().attr(attr) {
248 let short_val: String = val.split_whitespace().take(1).collect();
250 if !short_val.is_empty() {
251 structure.push_str(&format!("[{}={}]", attr, short_val));
252 }
253 }
254 }
255
256 structure.push(';');
257
258 if depth < 10 {
260 for child in element.children() {
261 if let Some(el) = scraper::ElementRef::wrap(child) {
262 let name = el.value().name();
264 if name != "script" && name != "style" && name != "noscript" {
265 extract_structure_recursive(el, structure, depth + 1);
266 }
267 }
268 }
269 }
270}
271
272fn count_elements(document: &Html) -> usize {
274 if let Ok(sel) = Selector::parse("*") {
275 document.select(&sel).count()
276 } else {
277 0
278 }
279}
280
281fn count_text_nodes(document: &Html) -> usize {
283 document.root_element()
284 .text()
285 .filter(|t| !t.trim().is_empty())
286 .count()
287}
288
289pub fn extract_amp_info(document: &Html, base_url: Option<&url::Url>) -> ParserResult<AmpInfo> {
295 let mut info = AmpInfo::new();
296
297 info.is_amp_html = detect_is_amp_page(document);
299 info.is_amp = info.is_amp_html;
300
301 if !info.is_amp {
303 info.amp_url = extract_amp_link(document, base_url);
304 if info.amp_url.is_some() {
305 info.is_amp = true; }
307 }
308
309 if info.is_amp_html {
311 info.canonical_url = extract_canonical_link(document, base_url);
312 }
313
314 info.has_amp_runtime = detect_amp_runtime(document);
316
317 info.components = extract_amp_components(document);
319
320 info.amp_version = detect_amp_version(document);
322
323 Ok(info)
324}
325
326fn detect_is_amp_page(document: &Html) -> bool {
328 if let Some(html) = document.select(&SELECTORS.html).next() {
330 if html.value().attr("amp").is_some() || html.value().attr("⚡").is_some() {
332 return true;
333 }
334
335 if html.value().classes().any(|c| c == "amp" || c == "⚡") {
337 return true;
338 }
339 }
340
341 let html_str = document.html();
343 html_str.contains("amp-boilerplate") ||
344 html_str.contains("cdn.ampproject.org")
345}
346
347fn extract_amp_link(document: &Html, base_url: Option<&url::Url>) -> Option<String> {
349 if let Ok(sel) = Selector::parse("link[rel='amphtml']") {
350 if let Some(el) = document.select(&sel).next() {
351 if let Some(href) = el.value().attr("href") {
352 return resolve_url(href, base_url);
353 }
354 }
355 }
356 None
357}
358
359fn extract_canonical_link(document: &Html, base_url: Option<&url::Url>) -> Option<String> {
361 if let Ok(sel) = Selector::parse("link[rel='canonical']") {
362 if let Some(el) = document.select(&sel).next() {
363 if let Some(href) = el.value().attr("href") {
364 return resolve_url(href, base_url);
365 }
366 }
367 }
368 None
369}
370
371fn detect_amp_runtime(document: &Html) -> bool {
373 if let Ok(sel) = Selector::parse("script[src*='cdn.ampproject.org']") {
374 return document.select(&sel).next().is_some();
375 }
376 false
377}
378
379fn extract_amp_components(document: &Html) -> Vec<String> {
381 let mut components = Vec::new();
382
383 if let Ok(sel) = Selector::parse("script[custom-element]") {
385 for el in document.select(&sel) {
386 if let Some(name) = el.value().attr("custom-element") {
387 if !components.contains(&name.to_string()) {
388 components.push(name.to_string());
389 }
390 }
391 }
392 }
393
394 let html = document.html().to_lowercase();
396 let amp_tags = [
397 "amp-img", "amp-video", "amp-audio", "amp-carousel",
398 "amp-accordion", "amp-sidebar", "amp-lightbox",
399 "amp-analytics", "amp-ad", "amp-social-share",
400 "amp-form", "amp-list", "amp-bind", "amp-state",
401 ];
402
403 for tag in amp_tags {
404 if html.contains(&format!("<{}", tag)) {
405 let tag_str = tag.to_string();
406 if !components.contains(&tag_str) {
407 components.push(tag_str);
408 }
409 }
410 }
411
412 components
413}
414
415fn detect_amp_version(document: &Html) -> Option<String> {
417 if let Ok(sel) = Selector::parse("script[src*='cdn.ampproject.org']") {
418 if let Some(el) = document.select(&sel).next() {
419 if let Some(src) = el.value().attr("src") {
420 if src.contains("/v0") {
422 return Some("v0".to_string());
423 }
424 }
426 }
427 }
428 None
429}
430
431fn resolve_url(href: &str, base_url: Option<&url::Url>) -> Option<String> {
433 if href.starts_with("http://") || href.starts_with("https://") {
434 return Some(href.to_string());
435 }
436
437 if let Some(base) = base_url {
438 return base.join(href).ok().map(|u| u.to_string());
439 }
440
441 None
442}
443
444pub fn extract_cache_hints(document: &Html) -> CacheHints {
450 let mut hints = CacheHints::default();
451
452 if let Ok(sel) = Selector::parse("meta[http-equiv='Cache-Control']") {
454 if let Some(el) = document.select(&sel).next() {
455 if let Some(content) = el.value().attr("content") {
456 hints.cache_control = Some(content.to_string());
457 hints.no_cache = content.to_lowercase().contains("no-cache") ||
458 content.to_lowercase().contains("no-store");
459
460 if let Some(pos) = content.to_lowercase().find("max-age=") {
462 let start = pos + 8;
463 let num: String = content[start..]
464 .chars()
465 .take_while(|c| c.is_ascii_digit())
466 .collect();
467 hints.max_age = num.parse().ok();
468 }
469 }
470 }
471 }
472
473 if let Ok(sel) = Selector::parse("meta[http-equiv='Pragma']") {
475 if let Some(el) = document.select(&sel).next() {
476 if let Some(content) = el.value().attr("content") {
477 if content.to_lowercase().contains("no-cache") {
478 hints.no_cache = true;
479 }
480 }
481 }
482 }
483
484 hints
485}
486
487pub fn has_content_changed(old_html: &str, new_html: &str) -> bool {
493 let old_fp = generate_fingerprint(old_html).unwrap_or_default();
494 let new_fp = generate_fingerprint(new_html).unwrap_or_default();
495 old_fp.has_changed(&new_fp)
496}
497
498pub fn content_similarity(html1: &str, html2: &str) -> f64 {
500 let fp1 = generate_fingerprint(html1).unwrap_or_default();
501 let fp2 = generate_fingerprint(html2).unwrap_or_default();
502 fp1.similarity(&fp2)
503}
504
505pub fn is_amp_page(document: &Html) -> bool {
507 detect_is_amp_page(document)
508}
509
510pub fn get_amp_url(document: &Html) -> Option<String> {
512 extract_amp_link(document, None)
513}
514
515pub fn quick_hash(html: &str) -> u64 {
517 hash_string(html)
518}
519
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses a full HTML document for use in the tests below.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    #[test]
    fn test_generate_fingerprint() {
        let fp = generate_fingerprint("<html><body><p>Hello world</p></body></html>").unwrap();

        assert_ne!(fp.full_hash, 0);
        assert_ne!(fp.content_hash, 0);
        assert_ne!(fp.text_hash, 0);
        assert_ne!(fp.structure_hash, 0);
        assert!(fp.element_count > 0);
        assert!(fp.content_length > 0);
    }

    #[test]
    fn test_fingerprint_same_content() {
        let html = "<html><body><p>Hello world</p></body></html>";

        let first = generate_fingerprint(html).unwrap();
        let second = generate_fingerprint(html).unwrap();

        assert!(!first.has_changed(&second));
        assert_eq!(first.similarity(&second), 1.0);
    }

    #[test]
    fn test_fingerprint_different_content() {
        let before = generate_fingerprint("<html><body><p>Hello world</p></body></html>").unwrap();
        let after = generate_fingerprint("<html><body><p>Goodbye world</p></body></html>").unwrap();

        assert!(before.has_changed(&after));
        // Same skeleton, different text: a minor change, not a structural one.
        assert!(!before.has_structural_changes(&after));
        assert!(before.has_minor_changes(&after));
    }

    #[test]
    fn test_fingerprint_structural_change() {
        let flat = generate_fingerprint("<html><body><p>Hello</p></body></html>").unwrap();
        let wrapped =
            generate_fingerprint("<html><body><div><p>Hello</p></div></body></html>").unwrap();

        assert!(flat.has_structural_changes(&wrapped));
    }

    #[test]
    fn test_detect_amp_page() {
        let doc = parse_html(
            r#"
            <!DOCTYPE html>
            <html amp>
            <head>
                <script async src="https://cdn.ampproject.org/v0.js"></script>
            </head>
            <body></body>
            </html>
        "#,
        );

        assert!(detect_is_amp_page(&doc));
    }

    #[test]
    fn test_detect_amp_page_lightning() {
        let doc = parse_html(
            r#"
            <!DOCTYPE html>
            <html ⚡>
            <head></head>
            <body></body>
            </html>
        "#,
        );

        assert!(detect_is_amp_page(&doc));
    }

    #[test]
    fn test_not_amp_page() {
        let doc = parse_html("<html><body><p>Regular page</p></body></html>");
        assert!(!detect_is_amp_page(&doc));
    }

    #[test]
    fn test_extract_amp_link() {
        let doc = parse_html(
            r#"
            <html>
            <head>
                <link rel="amphtml" href="https://example.com/page.amp">
            </head>
            </html>
        "#,
        );

        assert_eq!(
            extract_amp_link(&doc, None),
            Some("https://example.com/page.amp".to_string())
        );
    }

    #[test]
    fn test_extract_amp_components() {
        let doc = parse_html(
            r#"
            <html amp>
            <head>
                <script custom-element="amp-carousel" src="..."></script>
                <script custom-element="amp-analytics" src="..."></script>
            </head>
            <body>
                <amp-img src="test.jpg"></amp-img>
            </body>
            </html>
        "#,
        );

        let components = extract_amp_components(&doc);

        for expected in ["amp-carousel", "amp-analytics", "amp-img"] {
            assert!(components.contains(&expected.to_string()));
        }
    }

    #[test]
    fn test_extract_amp_info() {
        let doc = parse_html(
            r#"
            <html>
            <head>
                <link rel="amphtml" href="/amp/page">
                <link rel="canonical" href="/page">
            </head>
            </html>
        "#,
        );
        let base = url::Url::parse("https://example.com/").unwrap();

        let info = extract_amp_info(&doc, Some(&base)).unwrap();

        assert!(info.has_amp_version());
        assert_eq!(info.amp_url, Some("https://example.com/amp/page".to_string()));
    }

    #[test]
    fn test_extract_cache_hints() {
        let doc = parse_html(
            r#"
            <html>
            <head>
                <meta http-equiv="Cache-Control" content="max-age=3600, public">
            </head>
            </html>
        "#,
        );

        let hints = extract_cache_hints(&doc);

        assert!(!hints.no_cache);
        assert_eq!(hints.max_age, Some(3600));
    }

    #[test]
    fn test_cache_no_cache() {
        let doc = parse_html(
            r#"
            <html>
            <head>
                <meta http-equiv="Cache-Control" content="no-cache, no-store">
            </head>
            </html>
        "#,
        );

        let hints = extract_cache_hints(&doc);

        assert!(hints.no_cache);
    }

    #[test]
    fn test_has_content_changed() {
        let v1 = "<html><body><p>Version 1</p></body></html>";
        let v2 = "<html><body><p>Version 2</p></body></html>";

        assert!(has_content_changed(v1, v2));
        assert!(!has_content_changed(v1, v1));
    }

    #[test]
    fn test_content_similarity() {
        let original = "<html><body><p>Hello world</p></body></html>";
        let identical = "<html><body><p>Hello world</p></body></html>";
        assert_eq!(content_similarity(original, identical), 1.0);

        let different = "<html><body><p>Different content entirely</p></body></html>";
        let sim = content_similarity(original, different);
        assert!(sim < 1.0);
        assert!(sim > 0.0);
    }

    #[test]
    fn test_quick_hash() {
        let same_a = "<html><body>Test</body></html>";
        let same_b = "<html><body>Test</body></html>";
        let other = "<html><body>Different</body></html>";

        assert_eq!(quick_hash(same_a), quick_hash(same_b));
        assert_ne!(quick_hash(same_a), quick_hash(other));
    }

    #[test]
    fn test_fingerprint_similarity_range() {
        let with_div = generate_fingerprint("<html><body><div><p>Test</p></div></body></html>")
            .unwrap();
        let with_span = generate_fingerprint("<html><body><span><p>Test</p></span></body></html>")
            .unwrap();

        let sim = with_div.similarity(&with_span);
        assert!((0.0..=1.0).contains(&sim));
    }
}