// html_languageservice/parser/html_document.rs

1use std::collections::HashMap;
2
3use super::html_scanner::TokenType;
4
5#[derive(Debug, Clone)]
6pub struct Node {
7    /// It's None only when new
8    pub tag: Option<String>,
9    pub start: usize,
10    pub end: usize,
11    pub children: Vec<Node>,
12    /// Whether part of end tag exists
13    pub closed: bool,
14    /// It's None only when new, it larger than end of start tag
15    pub start_tag_end: Option<usize>,
16    /// It's None only when it's self-closing tag or it miss part of end tag, it equals start of end tag
17    pub end_tag_start: Option<usize>,
18    pub attributes: HashMap<String, NodeAttribute>,
19}
20
/// An attribute attached to a [`Node`].
#[derive(Debug, Clone, PartialEq)]
pub struct NodeAttribute {
    /// Attribute value, including its surrounding quotes.
    pub value: Option<String>,
    /// Start offset of the attribute name.
    pub offset: usize,
}

impl NodeAttribute {
    /// Creates an attribute from its (still quoted) `value` and the start `offset` of its name.
    pub fn new(value: Option<String>, offset: usize) -> Self {
        Self { value, offset }
    }
}
34
35impl Node {
36    pub fn new(start: usize, end: usize, children: Vec<Node>) -> Node {
37        Node {
38            tag: None,
39            start,
40            end,
41            children,
42            closed: false,
43            start_tag_end: None,
44            end_tag_start: None,
45            attributes: HashMap::new(),
46        }
47    }
48
49    pub fn attribute_names(&self) -> Vec<&String> {
50        self.attributes.keys().collect()
51    }
52
53    pub fn attribute_names_by_order(&self) -> Vec<&String> {
54        let mut attributes = self.attribute_names();
55        attributes.sort_by(|a, b| {
56            let a = self.attributes.get(*a).unwrap().offset;
57            let b = self.attributes.get(*b).unwrap().offset;
58            a.cmp(&b)
59        });
60        attributes
61    }
62
63    pub fn is_self_closing(&self) -> bool {
64        self.end_tag_start.is_none()
65    }
66
67    pub fn is_same_tag(&self, tag_in_lowercase: Option<&str>) -> bool {
68        if self.tag.is_none() {
69            tag_in_lowercase.is_none()
70        } else {
71            let tag: &str = &self.tag.as_ref().unwrap();
72            tag_in_lowercase.is_some_and(|tag_in_lowercase| {
73                tag.len() == tag_in_lowercase.len() && tag.to_lowercase() == tag_in_lowercase
74            })
75        }
76    }
77
78    pub fn first_child(&self) -> Option<&Node> {
79        Some(self.children.first()?)
80    }
81
82    pub fn last_child(&self) -> Option<&Node> {
83        Some(self.children.last()?)
84    }
85
86    pub fn find_node_before<'a>(
87        node: &'a Node,
88        offset: usize,
89        parent_list: &mut Vec<&'a Node>,
90    ) -> &'a Node {
91        let mut idx = node.children.len();
92        for (i, child) in node.children.iter().enumerate() {
93            if offset <= child.start {
94                idx = i;
95                break;
96            }
97        }
98        if idx > 0 {
99            let child = &node.children[idx - 1];
100            if offset > child.start {
101                if offset < child.end {
102                    parent_list.push(&node);
103                    return Node::find_node_before(child, offset, parent_list);
104                }
105                if let Some(last_child) = child.last_child() {
106                    if last_child.end == child.end {
107                        parent_list.push(&node);
108                        return Node::find_node_before(child, offset, parent_list);
109                    }
110                }
111                parent_list.push(&node);
112                return child;
113            }
114        }
115        node
116    }
117
118    pub fn find_node_at<'a>(
119        node: &'a Node,
120        offset: usize,
121        parent_list: &mut Vec<&'a Node>,
122    ) -> &'a Node {
123        let mut idx = node.children.len();
124        for (i, child) in node.children.iter().enumerate() {
125            if offset < child.start {
126                idx = i;
127                break;
128            }
129        }
130
131        if idx > 0 {
132            let child = &node.children[idx - 1];
133            if offset >= child.start && offset < child.end {
134                parent_list.push(&node);
135                return Node::find_node_at(child, offset, parent_list);
136            }
137        }
138        node
139    }
140
141    /// Find TokenType in node at offset
142    ///
143    /// it return StartTagOpen, StartTag, StartTagClose, StartTagSelfClose, Content, EndTagOpen, EndTag, EndTagClose, Unknown
144    ///
145    /// if offset in children, then it's Content
146    /// if offset outside of node then it's Unknown
147    pub fn find_token_type_in_node(node: &Node, offset: usize) -> TokenType {
148        if node.start > offset || node.end <= offset {
149            return TokenType::Unknown;
150        }
151        let tag = node.tag.as_ref().unwrap();
152        if node.start == offset {
153            return TokenType::StartTagOpen;
154        }
155        if offset < node.start + 1 + tag.len() {
156            return TokenType::StartTag;
157        }
158        let start_tag_end = *node.start_tag_end.as_ref().unwrap();
159        if offset >= start_tag_end {
160            if let Some(end_tag_start) = node.end_tag_start {
161                if offset < end_tag_start {
162                    return TokenType::Content;
163                } else if offset == end_tag_start || offset == end_tag_start + 1 {
164                    return TokenType::EndTagOpen;
165                } else if offset < node.end - 1 {
166                    return TokenType::EndTag;
167                } else {
168                    return TokenType::EndTagClose;
169                }
170            } else if start_tag_end == node.end {
171                if offset >= node.end - 2 {
172                    return TokenType::StartTagSelfClose;
173                }
174            }
175        } else {
176            if start_tag_end == node.end {
177                if offset >= start_tag_end - 2 {
178                    return TokenType::StartTagSelfClose;
179                }
180            } else {
181                if offset >= start_tag_end - 1 {
182                    return TokenType::StartTagClose;
183                }
184            }
185        }
186        TokenType::Unknown
187    }
188}
189
190/// A tree of nodes for an HTML document
191///
192/// There is no reference to the parent node in the Node.
193/// The associated functions `find_node_before` and `find_node_at` keep a record of all parents of the target node.
194/// To get the parent node of the target node, you can like this:
195///
196/// ```rust
197/// use html_languageservice::{parse_html_document, HTMLDataManager};
198///
199/// let html_document = parse_html_document("<div><h1>title</h1></div>", "html", &HTMLDataManager::default());
200///
201/// let mut parent_list = vec![];
202/// let node = html_document.find_node_at(9, &mut parent_list);
203/// assert_eq!(node.unwrap().tag, Some("h1".to_string()));
204///
205/// let parent = parent_list.pop();
206/// assert_eq!(parent.unwrap().tag, Some("div".to_string()));
207///
208/// let parent = parent_list.pop();
209/// assert!(parent.is_none());
210/// ```
211///
212/// If 'parent' is 'None', then its parent node is HTMLDocument.
213#[derive(Clone)]
214pub struct HTMLDocument {
215    pub roots: Vec<Node>,
216}
217
218impl HTMLDocument {
219    /// Find the node before the node where the given 'offset' is located
220    ///
221    /// `parent_list` is a list of parent nodes and the previous node is the parent node of the latter node.
222    /// If you don't care about the parent node, you can use `&mut vec![]`.
223    pub fn find_node_before<'a>(
224        &'a self,
225        offset: usize,
226        parent_list: &mut Vec<&'a Node>,
227    ) -> Option<&'a Node> {
228        let mut idx = self.roots.len();
229        for (i, child) in self.roots.iter().enumerate() {
230            if offset <= child.start {
231                idx = i;
232                break;
233            }
234        }
235        if idx > 0 {
236            let child = &self.roots[idx - 1];
237            if offset > child.start {
238                if offset < child.end {
239                    return Some(Node::find_node_before(child, offset, parent_list));
240                }
241                if let Some(last_child) = child.last_child() {
242                    if last_child.end == child.end {
243                        return Some(Node::find_node_before(child, offset, parent_list));
244                    }
245                }
246                return Some(child);
247            }
248        }
249        None
250    }
251
252    /// Find the node at the given 'offset' location
253    ///
254    /// `parent_list` is a list where the previous node is the parent node of the latter node.
255    /// If you don't care about the parent node, you can use `&mut vec![]`.
256    pub fn find_node_at<'a>(
257        &'a self,
258        offset: usize,
259        parent_list: &mut Vec<&'a Node>,
260    ) -> Option<&'a Node> {
261        let mut idx = self.roots.len();
262        for (i, child) in self.roots.iter().enumerate() {
263            if offset < child.start {
264                idx = i;
265                break;
266            }
267        }
268
269        if idx > 0 {
270            let child = &self.roots[idx - 1];
271            if offset >= child.start && offset < child.end {
272                return Some(Node::find_node_at(child, offset, parent_list));
273            }
274        }
275        None
276    }
277
278    pub fn find_root_at(&self, offset: usize) -> Option<&Node> {
279        for root in &self.roots {
280            if offset <= root.end {
281                return Some(root);
282            }
283        }
284        None
285    }
286}