html_languageservice/parser/
html_document.rs

1use std::collections::HashMap;
2
3use super::html_scanner::TokenType;
4
/// A node of the parsed HTML tree.
///
/// A node stores no reference to its parent; the lookup functions report
/// ancestors through their `parent_list` argument instead.
#[derive(Debug, Clone, PartialEq)]
pub struct Node {
    /// Tag name. It is `None` only for a freshly created node (see `Node::new`).
    pub tag: Option<String>,
    /// Offset at which the node starts.
    pub start: usize,
    /// Offset at which the node ends. The lookup helpers treat it as exclusive
    /// (an offset belongs to the node while `offset < end`).
    pub end: usize,
    /// Child nodes. The lookup helpers scan them front to back and stop at the
    /// first child starting after the offset, so they are presumably kept in
    /// ascending `start` order.
    pub children: Vec<Node>,
    /// Whether part of the end tag exists.
    pub closed: bool,
    /// `None` only when the node is new or its start tag is missing its closing part.
    ///
    /// NOTE(review): `find_token_type_in_node` treats `start_tag_end - 1` as the
    /// last offset of the start tag, so this looks like the offset just past the
    /// start tag - confirm against the parser.
    pub start_tag_end: Option<usize>,
    /// `None` when the tag is self-closing or the end tag is missing;
    /// otherwise the offset at which the end tag starts.
    pub end_tag_start: Option<usize>,
    /// Attributes of the node, keyed by attribute name.
    pub attributes: HashMap<String, NodeAttribute>,
}
20
/// # Node attribute
///
/// The `value` is the value of the attribute after the '=' sign, including quotes.
///
/// The '=' sign must immediately follow the attribute name, otherwise it will be ignored.
///
/// ## Boundary cases
///
/// * The value is `None` if there is no '=' sign.
///
/// * The value is `Some("".to_string())` if the '=' sign is followed by a space.
#[derive(Debug, Clone, PartialEq)]
pub struct NodeAttribute {
    /// Attribute value, including its quotes; `None` when there is no '=' sign.
    pub value: Option<String>,
    /// Start offset of the attribute name.
    pub offset: usize,
}
39
40impl NodeAttribute {
41    pub fn new(value: Option<String>, offset: usize) -> NodeAttribute {
42        NodeAttribute { value, offset }
43    }
44}
45
46impl Node {
47    pub fn new(start: usize, end: usize, children: Vec<Node>) -> Node {
48        Node {
49            tag: None,
50            start,
51            end,
52            children,
53            closed: false,
54            start_tag_end: None,
55            end_tag_start: None,
56            attributes: HashMap::new(),
57        }
58    }
59
60    pub fn attribute_names(&self) -> Vec<&String> {
61        self.attributes.keys().collect()
62    }
63
64    pub fn attribute_names_by_order(&self) -> Vec<&String> {
65        let mut attributes = self.attribute_names();
66        attributes.sort_by(|a, b| {
67            let a = self.attributes.get(*a).unwrap().offset;
68            let b = self.attributes.get(*b).unwrap().offset;
69            a.cmp(&b)
70        });
71        attributes
72    }
73
74    pub fn is_self_closing(&self) -> bool {
75        self.end_tag_start.is_none()
76    }
77
78    /// if case_sensitive is false, then other_tag should be lowercase
79    pub fn is_same_tag(&self, target_tag: Option<&str>, case_sensitive: bool) -> bool {
80        if self.tag.is_none() {
81            target_tag.is_none()
82        } else {
83            let tag: &str = &self.tag.as_ref().unwrap();
84            let tag = if case_sensitive {
85                tag
86            } else {
87                &tag.to_lowercase()
88            };
89            target_tag.is_some_and(|target_tag| tag == target_tag)
90        }
91    }
92
93    pub fn first_child(&self) -> Option<&Node> {
94        Some(self.children.first()?)
95    }
96
97    pub fn last_child(&self) -> Option<&Node> {
98        Some(self.children.last()?)
99    }
100
101    pub fn find_node_before<'a>(
102        node: &'a Node,
103        offset: usize,
104        parent_list: &mut Vec<&'a Node>,
105    ) -> &'a Node {
106        let mut idx = node.children.len();
107        for (i, child) in node.children.iter().enumerate() {
108            if offset <= child.start {
109                idx = i;
110                break;
111            }
112        }
113        if idx > 0 {
114            let child = &node.children[idx - 1];
115            if offset > child.start {
116                if offset < child.end {
117                    parent_list.push(&node);
118                    return Node::find_node_before(child, offset, parent_list);
119                }
120                if let Some(last_child) = child.last_child() {
121                    if last_child.end == child.end {
122                        parent_list.push(&node);
123                        return Node::find_node_before(child, offset, parent_list);
124                    }
125                }
126                parent_list.push(&node);
127                return child;
128            }
129        }
130        node
131    }
132
133    pub fn find_node_at<'a>(
134        node: &'a Node,
135        offset: usize,
136        parent_list: &mut Vec<&'a Node>,
137    ) -> &'a Node {
138        let mut idx = node.children.len();
139        for (i, child) in node.children.iter().enumerate() {
140            if offset < child.start {
141                idx = i;
142                break;
143            }
144        }
145
146        if idx > 0 {
147            let child = &node.children[idx - 1];
148            if offset >= child.start && offset < child.end {
149                parent_list.push(&node);
150                return Node::find_node_at(child, offset, parent_list);
151            }
152        }
153        node
154    }
155
156    /// Find TokenType in node at offset
157    ///
158    /// it return StartTagOpen, StartTag, StartTagClose, StartTagSelfClose, Content, EndTagOpen, EndTag, EndTagClose, Unknown
159    ///
160    /// if offset in children, then it's Content
161    /// if offset outside of node then it's Unknown
162    pub fn find_token_type_in_node(node: &Node, offset: usize) -> TokenType {
163        if node.start > offset || node.end <= offset {
164            return TokenType::Unknown;
165        }
166        let tag = node.tag.as_ref().unwrap();
167        if node.start == offset {
168            return TokenType::StartTagOpen;
169        }
170        if offset < node.start + 1 + tag.len() {
171            return TokenType::StartTag;
172        }
173        let start_tag_end = *node.start_tag_end.as_ref().unwrap();
174        if offset >= start_tag_end {
175            if let Some(end_tag_start) = node.end_tag_start {
176                if offset < end_tag_start {
177                    return TokenType::Content;
178                } else if offset == end_tag_start || offset == end_tag_start + 1 {
179                    return TokenType::EndTagOpen;
180                } else if offset < node.end - 1 {
181                    return TokenType::EndTag;
182                } else {
183                    return TokenType::EndTagClose;
184                }
185            } else if start_tag_end == node.end {
186                if offset >= node.end - 2 {
187                    return TokenType::StartTagSelfClose;
188                }
189            }
190        } else {
191            if start_tag_end == node.end {
192                if offset >= start_tag_end - 2 {
193                    return TokenType::StartTagSelfClose;
194                }
195            } else {
196                if offset >= start_tag_end - 1 {
197                    return TokenType::StartTagClose;
198                }
199            }
200        }
201        TokenType::Unknown
202    }
203}
204
/// A tree of nodes for an HTML document
///
/// A `Node` holds no reference to its parent node.
/// Instead, the associated functions `find_node_before` and `find_node_at` record all parents of the target node in `parent_list`.
/// To get the parent node of the target node, you can do this:
///
/// ```rust
/// use html_languageservice::{parser::html_parse, HTMLDataManager};
///
/// let html_document = html_parse::parse_html_document("<div><h1>title</h1></div>", "html", &HTMLDataManager::default(), false);
///
/// let mut parent_list = vec![];
/// let node = html_document.find_node_at(9, &mut parent_list);
/// assert_eq!(node.unwrap().tag, Some("h1".to_string()));
///
/// let parent = parent_list.pop();
/// assert_eq!(parent.unwrap().tag, Some("div".to_string()));
///
/// let parent = parent_list.pop();
/// assert!(parent.is_none());
/// ```
///
/// If `parent` is `None`, the node is a root, i.e. its parent is the `HTMLDocument` itself.
#[derive(Debug, Clone, PartialEq)]
pub struct HTMLDocument {
    /// Top-level nodes of the document. The lookup methods scan them front to
    /// back, so they are presumably kept in ascending `start` order.
    pub roots: Vec<Node>,
}
232
233impl HTMLDocument {
234    /// Find the node before the node where the given 'offset' is located
235    ///
236    /// `parent_list` is a list of parent nodes and the previous node is the parent node of the latter node.
237    /// If you don't care about the parent node, you can use `&mut vec![]`.
238    pub fn find_node_before<'a>(
239        &'a self,
240        offset: usize,
241        parent_list: &mut Vec<&'a Node>,
242    ) -> Option<&'a Node> {
243        let mut idx = self.roots.len();
244        for (i, child) in self.roots.iter().enumerate() {
245            if offset <= child.start {
246                idx = i;
247                break;
248            }
249        }
250        if idx > 0 {
251            let child = &self.roots[idx - 1];
252            if offset > child.start {
253                if offset < child.end {
254                    return Some(Node::find_node_before(child, offset, parent_list));
255                }
256                if let Some(last_child) = child.last_child() {
257                    if last_child.end == child.end {
258                        return Some(Node::find_node_before(child, offset, parent_list));
259                    }
260                }
261                return Some(child);
262            }
263        }
264        None
265    }
266
267    /// Find the node at the given 'offset' location
268    ///
269    /// `parent_list` is a list where the previous node is the parent node of the latter node.
270    /// If you don't care about the parent node, you can use `&mut vec![]`.
271    pub fn find_node_at<'a>(
272        &'a self,
273        offset: usize,
274        parent_list: &mut Vec<&'a Node>,
275    ) -> Option<&'a Node> {
276        let mut idx = self.roots.len();
277        for (i, child) in self.roots.iter().enumerate() {
278            if offset < child.start {
279                idx = i;
280                break;
281            }
282        }
283
284        if idx > 0 {
285            let child = &self.roots[idx - 1];
286            if offset >= child.start && offset < child.end {
287                return Some(Node::find_node_at(child, offset, parent_list));
288            }
289        }
290        None
291    }
292
293    pub fn find_root_at(&self, offset: usize) -> Option<&Node> {
294        for root in &self.roots {
295            if offset <= root.end {
296                return Some(root);
297            }
298        }
299        None
300    }
301}