1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
use serde::{Deserialize, Serialize};
use crate::{errors::ValidationError, sanitization::SanitizeOption};
/// `CastType` is an enum that represents the type of the (final) result value.
#[derive(Serialize, Deserialize, Default, Debug, Clone, Copy)]
#[serde(rename_all = "lowercase")]
pub enum CastType {
#[default]
String,
Bool,
Int,
Float,
}
/// `Config` is a struct that represents the configuration of the `Finder`.
#[derive(Serialize, Deserialize, Default, Debug)]
pub struct Config {
/// represents a key for result the and every inline element if it is presented.
pub name: String,
/// is a selector's path to the element to handle. May be omitted if the `inherit` option is set to `true`.
#[serde(default)]
pub base_path: String,
/// represents what finder needs to extract. Accepted values are `text`, `inner_text`, `html`, `inner_html`, `immediate_text` or an html-attribute name.
#[serde(default)]
pub extract: String,
/// is a type of the result value. Accepted values are `bool`, `int`, `float` or `string` (default, and should be omitted).
#[serde(default)]
pub cast: CastType,
/// is a separator for joining the result values. Works only when `many` is set to `true` and there is no descendant config.
#[serde(default)]
pub join_sep: String,
/// is a flag that indicates whether the result is expecting to be an array or not.
#[serde(default)]
pub many: bool,
/// adds a index field to the result if it is an array of objects.
#[serde(default)]
pub enumerate: bool,
/// parent's `base_path` (and parent's selector) will be used if it is set to `true`.
#[serde(default)]
pub inherit: bool,
/// if it is `true` then `.parent()` method of the matcher will be applied. It means it will use direct parent of the selection. It is distinct from `inherit` option.
#[serde(default)]
pub parent: bool,
///if it is `true` then finder will stop parsing descendant selections when it will encounter the first non-empty result.
#[serde(default)]
pub first_occurrence: bool,
/// when it is `true` finder will remove a matching selection from the document (html).
#[serde(default)]
pub remove_selection: bool,
/// if it is `true` then finder will unpack descendant map into parent map.
#[serde(default)]
pub flatten: bool,
/// if it is `true` then finder will split `base_path` by `,` for more flexibility. Not implemented yet.
#[serde(default)]
pub split_path: bool,
/// is a one of [`SanitizeOption`] variants. If set, it will sanitize the matching node before extracting html.
#[serde(default)]
pub sanitize_policy: SanitizeOption,
/// is a list of predefined procedures to apply to the result.
/// Each procedure (pipeline element) is represented by a list of strings.
/// Currently supported procedures are:
/// `regex`, `replace`, `extract_json`, `trim_space`,
/// `trim`, `html_unescape`, `normalize_spaces`.
#[serde(default)]
pub pipeline: Vec<Vec<String>>,
/// is a list of descendant `Config`.
#[serde(default)]
pub children: Vec<Config>,
}
impl Config {
/// Creates a new `Config` instance from the given YAML string.
///
/// # Arguments
/// * `data` - a YAML string that represents the configuration.
/// # Examples
/// ```
/// use dom_finder::Config;
/// let cfg_yml: &str = r#"
/// name: all_links
/// base_path: html body a[href]
/// many: true
/// extract: href
/// "#;
/// let cfg = Config::from_yaml(cfg_yml);
/// assert!(cfg.is_ok());
/// ```
pub fn from_yaml(data: &str) -> Result<Self, serde_yaml::Error> {
serde_yaml::from_str(data)
}
/// Validates the `Config` instance.
pub fn validate(&self) -> Result<(), ValidationError> {
if self.name.is_empty() {
return Err(ValidationError::FieldIsMissing("name".to_string()));
}
if self.base_path.is_empty() && !self.inherit {
// The case when base_path is empty and inherit is true, resolved in Finder::new
return Err(ValidationError::FieldIsMissing("base_path".to_string()));
}
let must_extract = !self.extract.is_empty();
let must_dive = !self.children.is_empty();
if must_extract == must_dive {
return Err(ValidationError::ExtractOrDive);
}
Ok(())
}
}
#[cfg(feature = "json_cfg")]
impl Config {
/// Creates a new `Config` instance from the given JSON string.
/// This method is available only if the `json_cfg` feature is enabled.
/// # Arguments
/// * `data` - a JSON string that represents the configuration.
/// # Examples
/// ```
/// use dom_finder::Config;
/// let cfg_json: &str = r#"
/// {
/// "name": "all_links",
/// "base_path": "html body a[href]",
/// "many": true,
/// "extract": "href"
/// }
/// "#;
/// let cfg = Config::from_json(cfg_json);
/// assert!(cfg.is_ok());
/// ```
pub fn from_json(data: &str) -> Result<Self, serde_json::Error> {
serde_json::from_str(data)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn config_is_valid() {
let cfg_yml: &str = r"
name: footer_links
base_path: footer a[href]
many: true
extract: href
";
let cfg = Config::from_yaml(cfg_yml).unwrap();
assert!(cfg.validate().is_ok());
}
#[test]
#[cfg(feature = "json_cfg")]
fn config_json_is_valid() {
let cfg_json: &str = r#"
{
"name": "footer_links",
"base_path": "footer a[href]",
"many": true,
"extract": "href"
}
"#;
let cfg = Config::from_json(cfg_json).unwrap();
assert!(cfg.validate().is_ok());
}
#[test]
fn config_with_children_is_valid() {
let cfg_yml: &str = r"
name: root
base_path: body
children:
- name: footer_links
base_path: footer a[href]
many: true
extract: href
";
let cfg = Config::from_yaml(cfg_yml).unwrap();
assert!(cfg.validate().is_ok());
}
#[test]
#[should_panic]
fn config_missing_name_panic() {
let cfg_yml: &str = r"
base_path: a[href]
many: true
extract: href
";
let cfg = Config::from_yaml(cfg_yml).unwrap();
assert!(cfg.validate().is_err());
}
#[test]
fn config_missing_name() {
let cfg_yml: &str = r"
name:
base_path: a[href]
many: true
extract: href
";
let cfg = Config::from_yaml(cfg_yml).unwrap();
assert!(cfg.validate().is_err());
}
#[test]
fn config_missing_base_path() {
let cfg_yml: &str = r"
name: footer_links
base_path:
many: true
extract: href
";
let cfg = Config::from_yaml(cfg_yml).unwrap();
assert!(cfg.validate().is_err());
}
#[test]
fn config_no_extract_no_children() {
let cfg_yml: &str = r"
name: footer_links
base_path: a[href]
many: true
";
let cfg = Config::from_yaml(cfg_yml).unwrap();
assert!(cfg.validate().is_err());
}
#[test]
fn config_with_extract_with_children() {
let cfg_yml: &str = r"
name: footer_links
base_path: footer p:has(a[href])
many: true
extract: text
children:
- name: link
base_path: a[href]
extract: href
";
let cfg = Config::from_yaml(cfg_yml).unwrap();
assert!(cfg.validate().is_err());
}
}