html_linter/
lib.rs

1use html5ever::driver::ParseOpts;
2use html5ever::parse_document;
3use html5ever::tendril::TendrilSink;
4use markup5ever_rcdom::RcDom;
5use regex::Regex;
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8use thiserror::Error;
9
10mod checks;
11mod dom;
12
13use dom::{DOMIndex, IndexedNode};
14
15#[derive(Error, Debug)]
16pub enum LinterError {
17    #[error("Parse error: {0}")]
18    ParseError(String),
19    #[error("Rule error: {0}")]
20    RuleError(String),
21    #[error("Invalid selector: {0}")]
22    SelectorError(String),
23    #[error("IO error: {0}")]
24    IoError(#[from] std::io::Error),
25}
26
27#[derive(Debug, Serialize, Deserialize, Clone)]
28pub enum RuleType {
29    ElementPresence,
30    AttributePresence,
31    AttributeValue,
32    ElementOrder,
33    TextContent,
34    ElementContent,
35    WhiteSpace,
36    Nesting,
37    Semantics,
38    Compound,
39    Custom(String),
40    DocumentStructure,
41    ElementCount,
42    ElementCase,
43    AttributeQuotes,
44}
45
46#[derive(Debug, Serialize, Deserialize, Clone)]
47pub struct Rule {
48    pub name: String,
49    pub rule_type: RuleType,
50    pub severity: Severity,
51    pub selector: String,  // CSS-like selector
52    pub condition: String, // Rule-specific condition
53    pub message: String,   // Error message
54    #[serde(default)]
55    pub options: HashMap<String, String>, // Additional rule options
56}
57
58#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
59pub enum Severity {
60    Error,
61    Warning,
62    Info,
63}
64
65#[derive(Debug, Clone)]
66pub struct LintResult {
67    pub rule: String,
68    pub severity: Severity,
69    pub message: String,
70    pub location: Location,
71    pub source: String,
72}
73
/// Position of a node within the linted document.
#[derive(Clone, Debug)]
pub struct Location {
    /// Line recorded for the node by the DOM index.
    pub line: usize,
    /// Column recorded for the node by the DOM index.
    pub column: usize,
    /// Tag name of the node, as resolved through the DOM index.
    pub element: String,
}
80
81#[derive(Debug, Serialize, Deserialize, Clone, Default)]
82pub struct LinterOptions {
83    pub ignore_files: Vec<String>,
84    pub custom_selectors: HashMap<String, String>,
85    pub max_line_length: Option<usize>,
86    pub allow_inline_styles: bool,
87}
88
89#[derive(Debug, Serialize, Deserialize)]
90struct MetaTagRule {
91    name: Option<String>,     // name attribute
92    property: Option<String>, // property attribute (for Open Graph etc.)
93    pattern: MetaTagPattern,  // pattern to match against
94    required: bool,           // whether this meta tag is required
95}
96
97#[derive(Debug, Serialize, Deserialize)]
98#[serde(tag = "type", content = "value")]
99enum MetaTagPattern {
100    Regex(String),      // Regular expression pattern
101    MinLength(usize),   // Minimum content length
102    MaxLength(usize),   // Maximum content length
103    NonEmpty,           // Must not be empty
104    Exact(String),      // Exact match
105    OneOf(Vec<String>), // Must match one of these values
106    Contains(String),   // Must contain this string
107    StartsWith(String), // Must start with this string
108    EndsWith(String),   // Must end with this string
109}
110
111#[derive(Debug, Serialize, Deserialize)]
112#[serde(tag = "type")]
113pub enum CompoundCondition {
114    TextContent {
115        pattern: String,
116    },
117    AttributeValue {
118        attribute: String,
119        pattern: String,
120    },
121    AttributeReference {
122        attribute: String,
123        reference_must_exist: bool,
124    },
125    ElementPresence {
126        selector: String,
127    },
128}
129
130pub struct HtmlLinter {
131    pub(crate) rules: Vec<Rule>,
132    options: LinterOptions,
133}
134
135impl HtmlLinter {
136    pub fn new(rules: Vec<Rule>, options: Option<LinterOptions>) -> Self {
137        Self {
138            rules,
139            options: options.unwrap_or_default(),
140        }
141    }
142
143    pub fn lint(&self, html: &str) -> Result<Vec<LintResult>, LinterError> {
144        let dom = parse_document(RcDom::default(), ParseOpts::default())
145            .from_utf8()
146            .read_from(&mut html.as_bytes())
147            .map_err(|e| LinterError::ParseError(e.to_string()))?;
148
149        let index = DOMIndex::new(&dom, html);
150        let mut results = Vec::new();
151
152        // Process rules in parallel using rayon
153        for rule in &self.rules {
154            if !self.should_ignore_rule(&rule.name) {
155                results.extend(self.process_rule(rule, &index)?);
156            }
157        }
158
159        Ok(results)
160    }
161
162    pub fn from_json(json: &str, options: Option<LinterOptions>) -> Result<Self, LinterError> {
163        let rules: Vec<Rule> = serde_json::from_str(json)
164            .map_err(|e| LinterError::ParseError(format!("Failed to parse rules JSON: {}", e)))?;
165        Ok(Self::new(rules, options))
166    }
167
168    pub fn from_json_file(path: &str, options: Option<LinterOptions>) -> Result<Self, LinterError> {
169        let content = std::fs::read_to_string(path)?;
170        Self::from_json(&content, options)
171    }
172
173    fn should_ignore_rule(&self, rule_name: &str) -> bool {
174        self.options.ignore_files.iter().any(|pattern| {
175            if let Ok(regex) = Regex::new(pattern) {
176                regex.is_match(rule_name)
177            } else {
178                pattern == rule_name
179            }
180        })
181    }
182
183    fn process_rule(&self, rule: &Rule, index: &DOMIndex) -> Result<Vec<LintResult>, LinterError> {
184        match rule.rule_type {
185            RuleType::ElementPresence => self.check_element_presence(rule, index),
186            RuleType::AttributePresence => self.check_attribute_presence(rule, index),
187            RuleType::AttributeValue => self.check_attribute_value(rule, index),
188            RuleType::ElementOrder => self.check_element_order(rule, index),
189            RuleType::TextContent => self.check_text_content(rule, index),
190            RuleType::ElementContent => self.check_element_content(rule, index),
191            RuleType::WhiteSpace => self.check_whitespace(rule, index),
192            RuleType::Nesting => self.check_nesting(rule, index),
193            RuleType::Semantics => self.check_semantics(rule, index),
194            RuleType::Compound => self.check_compound(rule, index),
195            RuleType::Custom(ref validator) => self.check_custom(rule, validator, index),
196            RuleType::DocumentStructure => self.check_document_structure(rule, index),
197            RuleType::ElementCount => self.check_element_count(rule, index),
198            RuleType::ElementCase => self.check_element_case(rule, index),
199            RuleType::AttributeQuotes => self.check_attribute_quotes(rule, index),
200        }
201    }
202
203    fn create_lint_result(&self, rule: &Rule, node: &IndexedNode, index: &DOMIndex) -> LintResult {
204        LintResult {
205            rule: rule.name.clone(),
206            severity: rule.severity.clone(),
207            message: rule.message.clone(),
208            location: Location {
209                line: node.source_info.line,
210                column: node.source_info.column,
211                element: index
212                    .resolve_symbol(node.tag_name)
213                    .unwrap_or_default()
214                    .to_string(),
215            },
216            source: node.source_info.source.clone(),
217        }
218    }
219
220    pub fn get_rules(&self) -> Vec<Rule> {
221        self.rules.clone()
222    }
223}
224
#[cfg(test)]
mod tests {
    use super::*;

    /// Markup with a single `<img>` that lacks an `alt` attribute.
    const IMG_WITHOUT_ALT: &str = r#"<img src="test.jpg">"#;

    /// Builds the "image must have alt" rule under the given rule name.
    fn img_alt_rule(name: &str) -> Rule {
        Rule {
            name: name.to_string(),
            rule_type: RuleType::AttributePresence,
            severity: Severity::Error,
            selector: "img".to_string(),
            condition: "alt-missing".to_string(),
            message: "Image must have alt attribute".to_string(),
            options: HashMap::new(),
        }
    }

    /// Options whose only non-default setting is one ignore pattern.
    fn ignoring(pattern: &str) -> LinterOptions {
        LinterOptions {
            ignore_files: vec![pattern.to_string()],
            ..LinterOptions::default()
        }
    }

    #[test]
    fn test_basic_linting() {
        let linter = HtmlLinter::new(vec![img_alt_rule("img-alt")], None);
        let results = linter.lint(IMG_WITHOUT_ALT).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].severity, Severity::Error);
    }

    #[test]
    fn test_rule_ignored_by_regex_pattern() {
        let linter = HtmlLinter::new(vec![img_alt_rule("img-alt")], Some(ignoring("^img-.*$")));
        assert!(linter.lint(IMG_WITHOUT_ALT).unwrap().is_empty());
    }

    #[test]
    fn test_rule_ignored_by_literal_fallback() {
        // "img-alt(" is not a valid regular expression (unclosed group), so
        // the pattern is compared with the rule name for exact equality.
        let linter = HtmlLinter::new(vec![img_alt_rule("img-alt(")], Some(ignoring("img-alt(")));
        assert!(linter.lint(IMG_WITHOUT_ALT).unwrap().is_empty());
    }

    #[test]
    fn test_from_json_parses_rules() {
        let json = r#"[{
            "name": "img-alt",
            "rule_type": "AttributePresence",
            "severity": "Error",
            "selector": "img",
            "condition": "alt-missing",
            "message": "Image must have alt attribute"
        }]"#;
        let linter = HtmlLinter::from_json(json, None).expect("valid rules JSON");
        let rules = linter.get_rules();
        assert_eq!(rules.len(), 1);
        assert_eq!(rules[0].name, "img-alt");
        // `options` is omitted above and must default to an empty map.
        assert!(rules[0].options.is_empty());
    }

    #[test]
    fn test_from_json_rejects_invalid_json() {
        assert!(matches!(
            HtmlLinter::from_json("not json", None),
            Err(LinterError::ParseError(_))
        ));
    }

    #[test]
    fn test_from_json_file_reports_io_error() {
        assert!(matches!(
            HtmlLinter::from_json_file("this/path/does/not/exist/rules.json", None),
            Err(LinterError::IoError(_))
        ));
    }

    #[test]
    #[ignore = "placeholder: compound rule coverage has not been written yet"]
    fn test_compound_rule() {
        // TODO: add comprehensive tests for `RuleType::Compound`.
    }
}