1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
use std::collections::HashSet;
/// Classification label for a paragraph.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ClassType {
Good,
Bad,
Short,
NearGood,
}
/// A classified text paragraph extracted from HTML.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct Paragraph {
/// Dot-separated DOM path without ordinals (e.g., "body.div.p").
pub dom_path: String,
/// XPath with ordinals (e.g., "/html[1]/body[1]/div[2]/p[1]").
pub xpath: String,
/// Normalized text content.
pub text: String,
/// Word count (whitespace-split).
pub words_count: usize,
/// Character count inside `<a>` tags.
pub chars_count_in_links: usize,
/// Count of inline (non-block-level) tags within this paragraph.
pub tags_count: usize,
/// Final classification (set by revision stage).
pub class_type: ClassType,
/// Context-free classification before neighbor-based revision.
pub initial_class: ClassType,
/// Whether this paragraph is a heading.
pub heading: bool,
}
impl Paragraph {
/// Create a new paragraph with the given path and text.
pub(crate) fn new(
dom_path: String,
xpath: String,
text: String,
chars_count_in_links: usize,
tags_count: usize,
) -> Self {
let words_count = text.split_whitespace().count();
Self {
dom_path,
xpath,
text,
words_count,
chars_count_in_links,
tags_count,
class_type: ClassType::Short,
initial_class: ClassType::Short,
heading: false,
}
}
/// Returns `true` if this paragraph is classified as boilerplate.
pub fn is_boilerplate(&self) -> bool {
self.class_type != ClassType::Good
}
/// Returns `true` if the dom_path contains a heading tag (h0-h9).
///
/// dom_path is dot-separated ASCII tags (e.g. "html.body.div.h1").
/// A segment matches if it is exactly two bytes: `h` followed by an ASCII digit.
/// This mirrors Python's `\bh\d\b` regex where dots act as word boundaries.
pub fn is_heading(&self) -> bool {
self.dom_path.split('.').any(|seg| {
let b = seg.as_bytes();
b.len() == 2 && b[0] == b'h' && b[1].is_ascii_digit()
})
}
/// Link density: chars_count_in_links / text char count. Returns 0.0 if text is empty.
///
/// Uses Unicode codepoint count (not byte length) to match Python's `len()` semantics.
pub fn links_density(&self) -> f64 {
let char_count = self.text.chars().count();
if char_count == 0 {
0.0
} else {
self.chars_count_in_links as f64 / char_count as f64
}
}
/// Count of words present in the stoplist (case-insensitive).
pub fn stopwords_count(&self, stoplist: &HashSet<String>) -> usize {
self.text
.split_whitespace()
.filter(|word| stoplist.contains(&word.to_lowercase()))
.count()
}
/// Stopword density: stopwords_count / words_count. Returns 0.0 if no words.
pub fn stopwords_density(&self, stoplist: &HashSet<String>) -> f64 {
if self.words_count == 0 {
0.0
} else {
self.stopwords_count(stoplist) as f64 / self.words_count as f64
}
}
}