mlscraper_rust/
util.rs

1use crate::selectors::Selector;
2use rand::Rng;
3use std::borrow::Cow;
4use tl::VDom;
5use tl::{HTMLTag, Node, NodeHandle, Parser};
6
7#[cfg(feature = "serde")]
8use serde::{Deserialize, Serialize};
9
10/// Find parent of node using a brute force search in the parser's node table
11pub(crate) fn find_parent(handle: NodeHandle, parser: &Parser) -> Option<NodeHandle> {
12    let inner = handle.get_inner();
13    let mut next_id = inner - 1;
14    let mut optional_node = parser.resolve_node_id(next_id);
15    while let Some(node) = optional_node {
16        let children = node.children();
17        if children.is_some()
18            && children
19                .unwrap()
20                .top()
21                .iter()
22                .any(|child_handle| child_handle.get_inner() == inner)
23        {
24            return Some(NodeHandle::new(next_id));
25        }
26        next_id = next_id.checked_sub(1)?;
27        optional_node = parser.resolve_node_id(next_id);
28    }
29    None
30}
31
32/// Find root node handle
33pub fn find_root<'a>(dom: &'a VDom<'a>) -> Option<&'a NodeHandle> {
34    dom.children()
35        .iter()
36        .find(|node| node_is_tag(node, dom.parser()))
37}
38
39/// Returns true if the parser node is a HTML tag
40pub(crate) fn node_is_tag(node: &NodeHandle, parser: &Parser) -> bool {
41    node.get(parser)
42        .map(|node| matches!(node, Node::Tag(..)))
43        .unwrap_or(false)
44}
45
46/// Returns the inner text of the node, but no text of any child nodes!
47pub(crate) fn get_direct_inner_text(tag: &HTMLTag, parser: &Parser) -> String {
48    tag.children()
49        .top()
50        .iter()
51        .filter_map(|child| {
52            child
53                .get(parser)
54                .and_then(|node| node.as_raw())
55                .map(|raw| raw.as_utf8_str())
56        })
57        .collect()
58}
59
60/// Returns the trimmed value of attribute `attr`, if it exists and is not empty
61/// (NOTE <node attr=""/> will thus return `None`.)
62pub(crate) fn get_trimmed_attr_value(tag: &HTMLTag, attr: &str) -> Option<String> {
63    let attrv = tag.attributes().get(attr).flatten();
64    if let Some(attrv) = attrv {
65        let attrv = attrv.as_utf8_str();
66        let trimmed_attrv = attrv.trim();
67        if !trimmed_attrv.is_empty() {
68            return Some(trimmed_attrv.to_string());
69        }
70    }
71    None
72}
73
74/// Returns the trimmed value of the first attribute whose name starts with `attr_prefix`,
75pub(crate) fn get_trimmed_attr_prefix_value(tag: &HTMLTag, attr_prefix: &str) -> Option<String> {
76    let attrv = tag
77        .attributes()
78        .iter()
79        .find(|(attr, _)| attr.starts_with(attr_prefix))
80        .map(|(_, val)| val)
81        .flatten();
82    if let Some(attrv) = attrv {
83        let trimmed_attrv = attrv.trim();
84        if !trimmed_attrv.is_empty() {
85            return Some(trimmed_attrv.to_string());
86        }
87    }
88    None
89}
90
91/// Searches for a node whose inner text matches the given text.
92/// NOTE that this also includes the inner text of any child nodes!
93/// Both strings are trimmed before comparison.
94pub fn find_node_with_text(dom: &VDom, text: &str) -> Option<NodeHandle> {
95    dom.nodes()
96        .iter()
97        .enumerate()
98        .find(|(_, node)| {
99            node.as_tag().is_some()
100                && node
101                    .as_tag()
102                    .unwrap()
103                    .inner_text(dom.parser())
104                    .as_ref()
105                    .trim()
106                    == text.trim()
107        })
108        .map(|(i, _)| NodeHandle::new(i as u32))
109}
110
111/// Get the id of a node
112pub(crate) fn get_id<'p>(handle: NodeHandle, parser: &'p Parser<'p>) -> Option<Cow<'p, str>> {
113    Some(
114        handle
115            .get(parser)?
116            .as_tag()?
117            .attributes()
118            .id()?
119            .as_utf8_str(),
120    )
121}
122
123/// Get all classes of a node as a single string
124#[allow(dead_code)] // used in unit tests
125pub(crate) fn get_classes<'p>(handle: NodeHandle, parser: &'p Parser<'p>) -> Option<Cow<'p, str>> {
126    Some(
127        handle
128            .get(parser)?
129            .as_tag()?
130            .attributes()
131            .class()?
132            .as_utf8_str(),
133    )
134}
135
136/// Highlight the given selector's selection by adding a red border to it
137/// Returns true if successful.
138pub(crate) fn style_selected_element(selector: &Selector, dom: &mut VDom) -> bool {
139    if let Some(node) = selector.try_select(*find_root(dom).unwrap(), dom.parser()) {
140        let attributes = node
141            .get_mut(dom.parser_mut())
142            .unwrap()
143            .as_tag_mut()
144            .unwrap()
145            .attributes_mut();
146        if let Some(Some(style)) = attributes.get_mut("style") {
147            // Add to pre-existing style
148            let new_style = format!("{}; border: 1px solid red;", style.as_utf8_str()).into_bytes();
149            style.set(new_style).is_ok()
150        } else {
151            attributes.insert("style", Some("border: 1px solid red;"));
152            true
153        }
154    } else {
155        false
156    }
157}
158
159pub(crate) fn random_index_weighted<R: Rng>(rng: &mut R, weights: &[f32]) -> usize {
160    let random: f32 = rng.gen();
161    let mut sum = 0f32;
162    for (i, weight) in weights.iter().enumerate() {
163        sum += weight;
164        if sum >= random {
165            return i;
166        }
167    }
168    panic!("this should not happen: {:?} {} {}", weights, random, sum);
169}
170
/// The ways in which text can be read from a node.
/// Selectors are generated for every node that yields text matching the expected attribute value.
#[derive(Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum TextRetrievalOption {
    /// Use the node's inner text.
    InnerText,
    /// Use the value of the named attribute.
    /// For example, `Attribute("title")` treats the value of the `title` attribute as searchable text.
    Attribute(String),
    /// Use the value of the first attribute whose name starts with the given prefix.
    /// For example, `AttributeStartsWith("data-")` treats the value of the first attribute whose
    /// name starts with `data-` as searchable text.
    AttributeStartsWith(String),
}

/// A list of [`TextRetrievalOption`]s.
pub type TextRetrievalOptions = Vec<TextRetrievalOption>;
188
189/// Returns the node's text value, as specified by the given [`TextRetrievalOptions`].
190pub fn get_node_text(
191    vdom: &VDom,
192    node: NodeHandle,
193    text_retrieval_options: &TextRetrievalOptions,
194) -> Option<String> {
195    node.get(vdom.parser())
196        .and_then(|node| node.as_tag())
197        .and_then(|tag| {
198            for option in text_retrieval_options {
199                match option {
200                    TextRetrievalOption::InnerText => {
201                        let inner_text = get_direct_inner_text(tag, vdom.parser());
202                        let trimmed_inner_text = inner_text.trim();
203                        if !trimmed_inner_text.is_empty() {
204                            return Some(trimmed_inner_text.to_string());
205                        }
206                    }
207                    TextRetrievalOption::Attribute(name) => {
208                        let value = get_trimmed_attr_value(tag, &name);
209                        if value.is_some() {
210                            return value;
211                        }
212                    }
213                    TextRetrievalOption::AttributeStartsWith(prefix) => {
214                        let value = get_trimmed_attr_prefix_value(tag, &prefix);
215                        if value.is_some() {
216                            return value;
217                        }
218                    }
219                }
220            }
221
222            None
223        })
224}