1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
use serde::{Deserialize, Serialize};
/// Configuration knobs for the decruft extraction pipeline.
///
/// The type is `#[non_exhaustive]`, so code outside this crate cannot build
/// it with a struct literal. Obtain a value through [`Default`] and then
/// overwrite the public fields that should differ.
#[derive(Clone, Debug)]
#[non_exhaustive]
// Each bool below is a separate on/off switch, so the lint is expected here.
#[expect(clippy::struct_excessive_bools)]
pub struct DecruftOptions {
    /// Address of the page being parsed; used when relative URLs are resolved.
    pub url: Option<String>,

    /// Turns on debug logging and adds removal details to the output.
    pub debug: bool,

    /// Drop elements that match exact CSS selectors.
    pub remove_exact_selectors: bool,

    /// Drop elements whose class/id matches a partial pattern.
    pub remove_partial_selectors: bool,

    /// Strip every image from the output.
    pub remove_images: bool,

    /// Drop elements that CSS hides.
    pub remove_hidden_elements: bool,

    /// Drop non-content blocks that score low.
    pub remove_low_scoring: bool,

    /// Drop small images (< 33px).
    pub remove_small_images: bool,

    /// Normalise heading levels, code blocks, and similar structures.
    pub standardize: bool,

    /// Drop recurring content patterns such as bylines and read-time notes.
    pub remove_content_patterns: bool,

    /// CSS selector that overrides the choice of content root.
    pub content_selector: Option<String>,

    /// Produce Markdown output.
    pub markdown: bool,

    /// Emit Markdown in addition to the HTML content.
    pub separate_markdown: bool,

    /// Keep replies/comments as part of the extracted content.
    pub include_replies: bool,
}
impl Default for DecruftOptions {
    /// Returns the baseline configuration.
    ///
    /// No URL or content-selector override is set, the opt-in switches
    /// (`debug`, `remove_images`, `markdown`, `separate_markdown`) are off,
    /// and every other switch is on.
    fn default() -> Self {
        Self {
            // Unset until the caller supplies them.
            url: None,
            content_selector: None,

            // Opt-in switches start disabled.
            debug: false,
            remove_images: false,
            markdown: false,
            separate_markdown: false,

            // Everything else is enabled out of the box.
            remove_exact_selectors: true,
            remove_partial_selectors: true,
            remove_hidden_elements: true,
            remove_low_scoring: true,
            remove_small_images: true,
            standardize: true,
            remove_content_patterns: true,
            include_replies: true,
        }
    }
}
/// Everything the decruft extraction pipeline produces for one page.
///
/// Optional fields tagged with `skip_serializing_if` are left out of the
/// serialized form while they are `None`.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[non_exhaustive]
pub struct DecruftResult {
    /// The cleaned-up HTML content.
    pub content: String,

    /// Title of the page.
    pub title: String,

    /// Description of the page.
    pub description: String,

    /// Domain name.
    pub domain: String,

    /// URL of the favicon.
    pub favicon: String,

    /// URL of the primary image.
    pub image: String,

    /// Language of the content.
    pub language: String,

    /// Time spent parsing, in milliseconds.
    pub parse_time_ms: u64,

    /// Publication date.
    pub published: String,

    /// Name of the author.
    pub author: String,

    /// Name of the site.
    pub site: String,

    /// Markdown rendering of the content; present when `markdown` or
    /// `separate_markdown` is enabled.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content_markdown: Option<String>,

    /// Number of words in the extracted content.
    pub word_count: usize,

    /// Schema.org data, when any was found.
    ///
    /// This is the only optional field without `skip_serializing_if`, so a
    /// `None` here is still written out on serialization.
    pub schema_org_data: Option<serde_json::Value>,

    /// Every meta tag found on the page; filled in only in debug mode.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub meta_tags: Option<Vec<MetaTag>>,

    /// The site-specific extractor that produced this result, if one did.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extractor_type: Option<String>,

    /// Debug details; present only in debug mode.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub debug: Option<DebugInfo>,
}
/// One meta tag taken from the page.
///
/// All three fields are optional, and a `None` field is still written out on
/// serialization because no `skip_serializing_if` is applied here.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct MetaTag {
    /// The tag's `name`, if it had one (presumably the HTML `name` attribute).
    pub name: Option<String>,

    /// The tag's `property`, if it had one (presumably the HTML `property` attribute).
    pub property: Option<String>,

    /// The tag's `content`, if it had one (presumably the HTML `content` attribute).
    pub content: Option<String>,
}
/// Diagnostic details describing how the extraction went.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct DebugInfo {
    /// CSS selector path pointing at the element picked as content.
    pub content_selector: String,

    /// The elements that were removed, one [`Removal`] per element.
    pub removals: Vec<Removal>,
}
/// Describes a single element that was removed.
///
/// `selector` and `reason` are left out of the serialized form while they
/// are `None`.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Removal {
    /// Step the removal is attributed to (presumably the pipeline stage name).
    pub step: String,

    /// Selector associated with the removal, when there is one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub selector: Option<String>,

    /// Reason recorded for the removal, when there is one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reason: Option<String>,

    /// Text recorded for the removal (presumably taken from the removed element).
    pub text: String,
}
/// Crate-internal representation of the metadata extracted from a page.
///
/// Every field is a plain `String`, so the derived `Default` yields a value
/// whose fields are all empty.
#[derive(Clone, Debug, Default, Serialize)]
pub(crate) struct Metadata {
    /// Page title.
    pub title: String,

    /// Page description.
    pub description: String,

    /// Domain name.
    pub domain: String,

    /// Favicon URL.
    pub favicon: String,

    /// Primary image URL.
    pub image: String,

    /// Content language.
    pub language: String,

    /// Publication date.
    pub published: String,

    /// Author name.
    pub author: String,

    /// Site name (presumably what ends up in `DecruftResult::site`).
    pub site_name: String,
}