Skip to main content

normalize_languages/
html.rs

1//! HTML language support with symbol extraction.
2//!
3//! HTML elements are extracted as symbols: elements with child elements become
4//! Modules (containers), leaf elements become Variables. Tag name is the symbol name.
5
6use crate::{Language, LanguageEmbedded, LanguageSymbols};
7use tree_sitter::Node;
8
/// Unit type on which the HTML implementations of [`Language`],
/// [`LanguageSymbols`] and [`LanguageEmbedded`] are defined.
pub struct Html;
11
12impl Language for Html {
13    fn name(&self) -> &'static str {
14        "HTML"
15    }
16    fn extensions(&self) -> &'static [&'static str] {
17        &["html", "htm"]
18    }
19    fn grammar_name(&self) -> &'static str {
20        "html"
21    }
22
23    fn as_symbols(&self) -> Option<&dyn LanguageSymbols> {
24        Some(self)
25    }
26
27    fn as_embedded(&self) -> Option<&dyn LanguageEmbedded> {
28        Some(self)
29    }
30
31    fn refine_kind(
32        &self,
33        node: &Node,
34        _content: &str,
35        tag_kind: crate::SymbolKind,
36    ) -> crate::SymbolKind {
37        if node.kind() == "element" && has_child_elements(node) {
38            return crate::SymbolKind::Module;
39        }
40        tag_kind
41    }
42
43    fn node_name<'a>(&self, node: &Node, content: &'a str) -> Option<&'a str> {
44        if node.kind() == "element"
45            || node.kind() == "script_element"
46            || node.kind() == "style_element"
47        {
48            return extract_html_tag_name(node, content);
49        }
50        None
51    }
52
53    fn container_body<'a>(&self, node: &'a Node<'a>) -> Option<Node<'a>> {
54        // For elements with children, the element itself is the container body
55        // (child elements are direct children of the element node)
56        if node.kind() == "element" && has_child_elements(node) {
57            return Some(*node);
58        }
59        None
60    }
61
62    fn build_signature(&self, node: &Node, content: &str) -> String {
63        if let Some(tag) = self.node_name(node, content) {
64            // Include key attributes (id, class) in signature
65            if let Some(attrs) = extract_key_attributes(node, content) {
66                return format!("<{} {}>", tag, attrs);
67            }
68            return format!("<{}>", tag);
69        }
70        content[node.byte_range()]
71            .lines()
72            .next()
73            .unwrap_or("")
74            .trim()
75            .to_string()
76    }
77}
78
79impl LanguageSymbols for Html {}
80
81impl LanguageEmbedded for Html {
82    fn embedded_content(&self, node: &Node, content: &str) -> Option<crate::EmbeddedBlock> {
83        match node.kind() {
84            "script_element" => {
85                let raw = find_raw_text_child(node)?;
86                let grammar = detect_script_type(node, content);
87                Some(crate::EmbeddedBlock {
88                    grammar,
89                    content: content[raw.byte_range()].to_string(),
90                    start_line: raw.start_position().row + 1,
91                })
92            }
93            "style_element" => {
94                let raw = find_raw_text_child(node)?;
95                Some(crate::EmbeddedBlock {
96                    grammar: "css",
97                    content: content[raw.byte_range()].to_string(),
98                    start_line: raw.start_position().row + 1,
99                })
100            }
101            _ => None,
102        }
103    }
104}
105
106/// Check if an element has child elements (not just text).
107fn has_child_elements(node: &Node) -> bool {
108    let mut cursor = node.walk();
109    node.children(&mut cursor).any(|child| {
110        child.kind() == "element"
111            || child.kind() == "script_element"
112            || child.kind() == "style_element"
113    })
114}
115
116/// Extract tag name from start_tag or self_closing_tag.
117fn extract_html_tag_name<'a>(node: &Node, content: &'a str) -> Option<&'a str> {
118    let mut cursor = node.walk();
119    for child in node.children(&mut cursor) {
120        if child.kind() == "start_tag" || child.kind() == "self_closing_tag" {
121            let mut inner = child.walk();
122            for part in child.children(&mut inner) {
123                if part.kind() == "tag_name" {
124                    return Some(&content[part.byte_range()]);
125                }
126            }
127        }
128    }
129    None
130}
131
132/// Extract id and class attributes for the signature.
133fn extract_key_attributes(node: &Node, content: &str) -> Option<String> {
134    let mut cursor = node.walk();
135    for child in node.children(&mut cursor) {
136        if child.kind() == "start_tag" || child.kind() == "self_closing_tag" {
137            let mut parts = Vec::new();
138            let mut inner = child.walk();
139            for attr in child.children(&mut inner) {
140                if attr.kind() == "attribute" {
141                    let mut attr_cursor = attr.walk();
142                    let mut attr_name = None;
143                    let mut attr_val = None;
144                    for part in attr.children(&mut attr_cursor) {
145                        if part.kind() == "attribute_name" {
146                            attr_name = Some(&content[part.byte_range()]);
147                        } else if part.kind() == "quoted_attribute_value" {
148                            attr_val = Some(&content[part.byte_range()]);
149                        }
150                    }
151                    if let (Some(name), Some(val)) = (attr_name, attr_val)
152                        && (name == "id" || name == "class")
153                    {
154                        parts.push(format!("{}={}", name, val));
155                    }
156                }
157            }
158            if !parts.is_empty() {
159                return Some(parts.join(" "));
160            }
161        }
162    }
163    None
164}
165
166/// Find the raw_text child of a script/style element.
167fn find_raw_text_child<'a>(node: &'a Node<'a>) -> Option<Node<'a>> {
168    let mut cursor = node.walk();
169    node.children(&mut cursor)
170        .find(|&child| child.kind() == "raw_text")
171}
172
173/// Detect script type from the type attribute (e.g., <script type="module">).
174/// HTML scripts default to JavaScript; type="module" is still JavaScript.
175fn detect_script_type(node: &Node, content: &str) -> &'static str {
176    if let Some(script_type) = get_type_attribute(node, content) {
177        match script_type {
178            "text/typescript" => return "typescript",
179            "module" | "text/javascript" | "application/javascript" => return "javascript",
180            _ => {}
181        }
182    }
183    "javascript"
184}
185
186/// Get the type attribute value from a script element.
187fn get_type_attribute<'a>(node: &Node, content: &'a str) -> Option<&'a str> {
188    let mut cursor = node.walk();
189    for child in node.children(&mut cursor) {
190        // Look for start_tag which contains the attributes
191        if child.kind() == "start_tag" {
192            let mut inner_cursor = child.walk();
193            for attr in child.children(&mut inner_cursor) {
194                if attr.kind() == "attribute" {
195                    // Check if this is a type attribute
196                    let mut attr_cursor = attr.walk();
197                    let mut is_type = false;
198                    for part in attr.children(&mut attr_cursor) {
199                        if part.kind() == "attribute_name" {
200                            let name = &content[part.byte_range()];
201                            is_type = name == "type";
202                        } else if is_type && part.kind() == "quoted_attribute_value" {
203                            // Get the value inside quotes
204                            let value = &content[part.byte_range()];
205                            return Some(value.trim_matches('"').trim_matches('\''));
206                        }
207                    }
208                }
209            }
210        }
211    }
212    None
213}
214
#[cfg(test)]
mod tests {
    use super::*;
    use crate::validate_unused_kinds_audit;

    /// Runs the crate's unused-node-kinds audit against [`Html`], passing the
    /// list of kinds this module declares as knowingly unused.
    #[test]
    fn unused_node_kinds_audit() {
        // NOTE(review): presumably node kinds the HTML support intentionally
        // does not handle; confirm against `validate_unused_kinds_audit`.
        const DOCUMENTED_UNUSED: &[&str] = &["doctype"];

        let audit = validate_unused_kinds_audit(&Html, DOCUMENTED_UNUSED);
        audit.expect("HTML unused node kinds audit failed");
    }
}