Skip to main content

tree_sitter_htmlx_html/
lib.rs

1//! Tree-sitter grammar for HTML following the WHATWG HTML Living Standard
2//!
3//! This grammar provides spec-compliant HTML parsing including:
4//!
5//! - **Void elements** (§13.1.2): area, base, br, col, embed, hr, img, input, link, meta, source, track, wbr
6//! - **Raw text elements** (§13.1.2.1): script, style
7//! - **Escapable raw text elements** (§13.1.2.2): textarea, title
8//! - **Optional end tags** (§13.1.2.4): Proper implicit closing
9//! - **Character references** (§13.5): Named, decimal, and hex entities
10//!
11//! ## Example
12//!
13//! ```rust
14//! use tree_sitter_htmlx_html::LANGUAGE;
15//!
16//! let mut parser = tree_sitter::Parser::new();
17//! parser.set_language(&LANGUAGE.into()).expect("Failed to load HTML grammar");
18//!
19//! let source = r#"<!DOCTYPE html>
20//! <html>
21//! <head>
22//!   <title>Hello World</title>
23//! </head>
24//! <body>
25//!   <p>Welcome to <strong>HTML</strong>!</p>
26//!   <img src="logo.png" alt="Logo">
27//! </body>
28//! </html>"#;
29//!
30//! let tree = parser.parse(source, None).unwrap();
31//! assert!(!tree.root_node().has_error());
32//! ```
33
34use tree_sitter_language::LanguageFn;
35
// Foreign declaration of the HTML grammar's entry point, using the C ABI.
// NOTE(review): presumably this symbol is provided by the generated C parser that is
// compiled and linked with this crate — confirm against the build script.
extern "C" {
    // Returns an untyped pointer; in this file it is only passed to `LanguageFn::from_raw`.
    fn tree_sitter_html() -> *const ();
}
39
/// The tree-sitter [`LanguageFn`] for HTML.
///
/// Wraps the raw `tree_sitter_html` entry point. Convert it with `.into()` to obtain a
/// `tree_sitter::Language` that can be handed to `Parser::set_language`.
// SAFETY: `LanguageFn::from_raw` must only be given a language function produced by the
// tree-sitter generator. NOTE(review): `tree_sitter_html` is assumed to be exactly that
// (the generated parser's exported entry point) — confirm it is what gets linked in.
pub const LANGUAGE: LanguageFn = unsafe { LanguageFn::from_raw(tree_sitter_html) };
42
43/// The tree-sitter language for HTML.
44pub fn language() -> tree_sitter::Language {
45    LANGUAGE.into()
46}
47
/// The syntax highlighting query for HTML.
///
/// The text is embedded at compile time from `../queries/highlights.scm`, resolved
/// relative to this source file.
pub const HIGHLIGHTS_QUERY: &str = include_str!("../queries/highlights.scm");
50
/// The injection query for HTML (for script/style content).
///
/// The text is embedded at compile time from `../queries/injections.scm`, resolved
/// relative to this source file.
pub const INJECTIONS_QUERY: &str = include_str!("../queries/injections.scm");
53
/// The content of the node-types.json file for HTML.
///
/// The text is embedded at compile time from `node-types.json`, which sits next to this
/// source file.
pub const NODE_TYPES: &str = include_str!("node-types.json");
56
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates a parser that already has the HTML grammar loaded.
    fn html_parser() -> tree_sitter::Parser {
        let mut html = tree_sitter::Parser::new();
        html.set_language(&LANGUAGE.into()).unwrap();
        html
    }

    /// Parses `markup` with a fresh parser, asserts that the resulting tree contains no
    /// error nodes, and hands the tree back for any further checks.
    fn parse_clean(markup: &str) -> tree_sitter::Tree {
        let tree = html_parser().parse(markup, None).unwrap();
        assert!(!tree.root_node().has_error());
        tree
    }

    /// The grammar can be installed into a parser at all.
    #[test]
    fn test_can_load_grammar() {
        let mut parser = tree_sitter::Parser::new();
        let loaded = parser.set_language(&LANGUAGE.into());
        loaded.expect("Failed to load HTML grammar");
    }

    /// A single element parses cleanly and the root node is a `document`.
    #[test]
    fn test_parse_simple_html() {
        let tree = parse_clean("<div>Hello</div>");
        let root = tree.root_node();
        assert_eq!(root.kind(), "document");
    }

    /// Void elements written without a trailing slash.
    #[test]
    fn test_parse_void_elements() {
        parse_clean(r#"<img src="test.png"><br><input type="text"><hr>"#);
    }

    /// Void elements written in self-closing form.
    #[test]
    fn test_parse_self_closing_void() {
        parse_clean(r#"<img src="test.png" /><br /><input type="text" />"#);
    }

    /// Markup-like text inside `<script>` must not produce errors.
    #[test]
    fn test_parse_script_element() {
        parse_clean(r#"<script>const x = "<div></div>";</script>"#);
    }

    /// CSS braces inside `<style>` must not produce errors.
    #[test]
    fn test_parse_style_element() {
        parse_clean("<style>div { color: red; }</style>");
    }

    /// Tags inside `<textarea>` must not produce errors.
    #[test]
    fn test_parse_textarea_element() {
        let tree = parse_clean("<textarea>Some <b>text</b> here</textarea>");

        // Only the kind of the first top-level child is checked here.
        let root = tree.root_node();
        let first = root.child(0).unwrap();
        assert_eq!(first.kind(), "element");
    }

    /// A stray `<Title>` inside `<title>` must not produce errors.
    #[test]
    fn test_parse_title_element() {
        let tree = parse_clean("<title>Page <Title> Test</title>");

        // Only the kind of the first top-level child is checked here.
        let root = tree.root_node();
        let first = root.child(0).unwrap();
        assert_eq!(first.kind(), "element");
    }

    /// `<li>` items without explicit end tags.
    #[test]
    fn test_parse_implicit_end_tags() {
        parse_clean("<ul><li>One<li>Two<li>Three</ul>");
    }

    /// Text following a void element is a sibling of that element, not its child.
    #[test]
    fn test_parse_void_elements_do_not_capture_following_text() {
        let tree = parse_clean("<div><input>x</div>");

        let expected = "(document (element (start_tag name: (tag_name)) (element (start_tag name: (tag_name))) (text) (end_tag name: (tag_name))))";
        assert_eq!(tree.root_node().to_sexp(), expected);
    }

    /// An unclosed `<p>` followed by a block element.
    #[test]
    fn test_parse_p_implicit_close() {
        parse_clean("<p>Paragraph<div>Block</div>");
    }

    /// Named, decimal and hexadecimal character references.
    #[test]
    fn test_parse_entities() {
        parse_clean("<p>&amp; &lt; &gt; &#60; &#x3C;</p>");
    }

    /// A doctype declaration ahead of the root element.
    #[test]
    fn test_parse_doctype() {
        parse_clean("<!DOCTYPE html><html></html>");
    }

    /// Comments at the top level and nested inside an element.
    #[test]
    fn test_parse_comments() {
        parse_clean("<!-- Comment --><div><!-- Another --></div>");
    }

    /// Double-quoted, single-quoted, unquoted and valueless attributes.
    #[test]
    fn test_parse_attributes() {
        parse_clean(r#"<div id="test" class='cls' data-value=123 disabled></div>"#);
    }
}