lucid_lint/
engine.rs

1//! The linting engine: orchestrates parsing, rule execution, and output.
2
3use std::fs;
4use std::num::NonZeroU32;
5use std::path::Path;
6
7use thiserror::Error;
8
9use crate::condition::ConditionTag;
10use crate::config::Profile;
11use crate::language::{default_language, detect_language};
12use crate::parser::{parse_markdown, parse_plain, word_count};
13use crate::rules::lexicon::unexplained_abbreviation::{self, UnexplainedAbbreviation};
14use crate::rules::readability::score::{self, FormulaChoice, ReadabilityScore};
15use crate::rules::structure::excessive_commas::{self, ExcessiveCommas};
16use crate::rules::{
17    default_rules, filter_by_conditions, filter_by_experimental, ExperimentalOptIn, Rule,
18};
19use crate::scoring::{self, Scorecard, ScoringConfig};
20use crate::types::{Diagnostic, Language, SourceFile};
21
22/// Aggregated result of a lint run over a single document.
23///
24/// Pairs the list of diagnostics with the [`Scorecard`] that aggregates them
25/// and the word count used as the density denominator.
26#[derive(Debug, Clone)]
27pub struct Report {
28    /// Diagnostics emitted, in rule-discovery order.
29    pub diagnostics: Vec<Diagnostic>,
30    /// Global and per-category scores for this document.
31    pub scorecard: Scorecard,
32    /// Word count used to normalize scoring density.
33    pub word_count: u32,
34}
35
36/// The linting engine.
37///
38/// Bundles a profile and a set of rules, and exposes methods to lint strings,
39/// files, and stdin-sourced text.
40pub struct Engine {
41    profile: Profile,
42    rules: Vec<Box<dyn Rule>>,
43    scoring_config: ScoringConfig,
44}
45
46impl Engine {
47    /// Build an engine with the default rule set for the given profile.
48    #[must_use]
49    pub fn with_profile(profile: Profile) -> Self {
50        Self {
51            profile,
52            rules: default_rules(profile),
53            scoring_config: ScoringConfig::default(),
54        }
55    }
56
57    /// Build an engine for the given profile, restricting the rule set to
58    /// rules tagged `general` plus those whose condition tags intersect with
59    /// `conditions` (F71 + F72).
60    #[must_use]
61    pub fn with_profile_and_conditions(profile: Profile, conditions: &[ConditionTag]) -> Self {
62        Self::with_profile_conditions_and_experimental(
63            profile,
64            conditions,
65            &ExperimentalOptIn::None,
66        )
67    }
68
69    /// Build an engine with the F71/F72 condition filter and the F139
70    /// experimental opt-in applied together.
71    ///
72    /// Filtering order: experimental first (drops [`crate::rules::Status::Experimental`]
73    /// rules the user did not opt in to), then conditions. Order is
74    /// observably equivalent — both filters are pure subset operations
75    /// — but experimental-first keeps the cardinality cheaper.
76    #[must_use]
77    pub fn with_profile_conditions_and_experimental(
78        profile: Profile,
79        conditions: &[ConditionTag],
80        experimental: &ExperimentalOptIn,
81    ) -> Self {
82        let rules = filter_by_experimental(default_rules(profile), experimental);
83        let rules = filter_by_conditions(rules, conditions);
84        Self {
85            profile,
86            rules,
87            scoring_config: ScoringConfig::default(),
88        }
89    }
90
91    /// Build an engine with a custom rule set.
92    #[must_use]
93    pub fn with_rules(profile: Profile, rules: Vec<Box<dyn Rule>>) -> Self {
94        Self {
95            profile,
96            rules,
97            scoring_config: ScoringConfig::default(),
98        }
99    }
100
101    /// Attach a custom [`ScoringConfig`], overriding the defaults.
102    #[must_use]
103    pub fn with_scoring_config(mut self, scoring_config: ScoringConfig) -> Self {
104        self.scoring_config = scoring_config;
105        self
106    }
107
108    /// Override the [`ReadabilityScore`] rule's formula choice (F11).
109    ///
110    /// When `FormulaChoice::Auto` is passed the engine keeps the default
111    /// per-language selection; other variants pin a concrete formula
112    /// regardless of the document's detected language.
113    ///
114    /// If the rule set does not currently include a `readability-score`
115    /// rule (e.g., it was filtered out), this is a no-op — the rule will
116    /// not be re-added.
117    #[must_use]
118    pub fn with_readability_formula(mut self, formula: FormulaChoice) -> Self {
119        let config = score::Config::for_profile(self.profile).with_formula(formula);
120        self.replace_rule(
121            ReadabilityScore::ID,
122            Box::new(ReadabilityScore::new(config)),
123        );
124        self
125    }
126
127    /// Extend the [`UnexplainedAbbreviation`] rule's user whitelist
128    /// with project-specific entries (F31). The extras are additive
129    /// over the profile baseline — callers typically use this to
130    /// restore the narrower acronyms that F31 moved out of the shipped
131    /// `dev-doc` baseline (`WCAG`, `ARIA`, `ADHD`, `LLM`, …).
132    ///
133    /// If the rule set does not currently include an
134    /// `unexplained-abbreviation` rule, this is a no-op.
135    #[must_use]
136    pub fn with_unexplained_whitelist(mut self, extra: Vec<String>) -> Self {
137        if extra.is_empty() {
138            return self;
139        }
140        let config =
141            unexplained_abbreviation::Config::for_profile(self.profile).with_extra_whitelist(extra);
142        self.replace_rule(
143            UnexplainedAbbreviation::ID,
144            Box::new(UnexplainedAbbreviation::new(config)),
145        );
146        self
147    }
148
149    /// Override the [`ExcessiveCommas`] rule's `max_commas` threshold.
150    ///
151    /// If the rule set does not currently include an `excessive-commas`
152    /// rule (e.g., it was filtered out), this is a no-op.
153    #[must_use]
154    pub fn with_excessive_commas_max_commas(mut self, max_commas: NonZeroU32) -> Self {
155        let config =
156            excessive_commas::Config::for_profile(self.profile).with_max_commas(max_commas);
157        self.replace_rule(ExcessiveCommas::ID, Box::new(ExcessiveCommas::new(config)));
158        self
159    }
160
161    /// Replace the rule matching `id` in-place with `replacement`, preserving
162    /// its position so rule-discovery order (and therefore diagnostic order)
163    /// stays stable. No-op when no rule with that id is present — this is the
164    /// contract the public `with_*` helpers rely on when the target rule has
165    /// been filtered out by condition tags.
166    fn replace_rule(&mut self, id: &str, replacement: Box<dyn Rule>) {
167        if let Some(slot) = self.rules.iter_mut().find(|r| r.id() == id) {
168            *slot = replacement;
169        }
170    }
171
172    /// The profile this engine was configured with.
173    #[must_use]
174    pub const fn profile(&self) -> Profile {
175        self.profile
176    }
177
178    /// Lint a string input. Markdown syntax is assumed.
179    #[must_use]
180    pub fn lint_str(&self, input: &str) -> Report {
181        self.lint_with_source(input, SourceFile::Anonymous, true)
182    }
183
184    /// Lint stdin-like input.
185    #[must_use]
186    pub fn lint_stdin(&self, input: &str) -> Report {
187        self.lint_with_source(input, SourceFile::Stdin, true)
188    }
189
190    /// Lint a file from disk.
191    ///
192    /// Markdown is assumed for `.md` and `.markdown` extensions; other files
193    /// are treated as plain text.
194    ///
195    /// # Errors
196    ///
197    /// Returns [`EngineError::Io`] if the file cannot be read.
198    pub fn lint_file(&self, path: &Path) -> Result<Report, EngineError> {
199        let contents = fs::read_to_string(path).map_err(EngineError::Io)?;
200        let is_markdown = path
201            .extension()
202            .and_then(|e| e.to_str())
203            .is_some_and(|ext| matches!(ext, "md" | "markdown"));
204        let source = SourceFile::Path(path.to_path_buf());
205        Ok(self.lint_with_source(&contents, source, is_markdown))
206    }
207
208    fn lint_with_source(&self, input: &str, source: SourceFile, is_markdown: bool) -> Report {
209        let normalized = normalize_input(input);
210        let input = normalized.as_ref();
211        let language = match detect_language(input) {
212            Language::Unknown => default_language(),
213            detected => detected,
214        };
215
216        let document = if is_markdown {
217            parse_markdown(input, source)
218        } else {
219            parse_plain(input, source)
220        };
221
222        let mut diagnostics = Vec::new();
223        for rule in &self.rules {
224            diagnostics.extend(rule.check(&document, language));
225        }
226        diagnostics.retain(|d| {
227            !document
228                .directives
229                .iter()
230                .any(|dir| dir.rule_id == d.rule_id && dir.covers(d.location.line))
231        });
232
233        let words = word_count(input);
234        let scorecard = scoring::compute(&diagnostics, words, &self.scoring_config);
235
236        Report {
237            diagnostics,
238            scorecard,
239            word_count: words,
240        }
241    }
242}
243
244/// Normalize input at the engine boundary so every rule consumes the same
245/// shape of text: leading UTF-8 BOM stripped (F110), and NFC-normalized so
246/// `café` (precomposed) and `café` (decomposed) hash to the same key (F111).
247fn normalize_input(input: &str) -> std::borrow::Cow<'_, str> {
248    use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization};
249
250    let stripped = input.strip_prefix('\u{FEFF}');
251    let body = stripped.unwrap_or(input);
252    match is_nfc_quick(body.chars()) {
253        IsNormalized::Yes if stripped.is_none() => std::borrow::Cow::Borrowed(input),
254        IsNormalized::Yes => std::borrow::Cow::Owned(body.to_string()),
255        _ => std::borrow::Cow::Owned(body.nfc().collect()),
256    }
257}
258
259/// Errors returned by the engine.
260#[derive(Debug, Error)]
261pub enum EngineError {
262    /// I/O error reading a file.
263    #[error("failed to read input file")]
264    Io(#[source] std::io::Error),
265}
266
#[cfg(test)]
mod tests {
    use std::borrow::Cow;

    use super::*;
    use crate::types::Severity;

    /// Rule id targeted by the sentence-length and directive tests.
    const SENTENCE_TOO_LONG: &str = "structure.sentence-too-long";

    /// 25 words: triggers Public (22) but not DevDoc (30) for
    /// sentence-too-long.
    const LONG_SENTENCE: &str =
        "This is a long sentence that keeps adding more and more words until it \
         exceeds the public profile threshold by a comfortable margin of safety.";

    /// Engine with the default rules of the Public profile.
    fn public_engine() -> Engine {
        Engine::with_profile(Profile::Public)
    }

    /// Number of diagnostics in `report` that were emitted by `rule_id`.
    fn hits(report: &Report, rule_id: &str) -> usize {
        report
            .diagnostics
            .iter()
            .filter(|diag| diag.rule_id == rule_id)
            .count()
    }

    #[test]
    fn engine_applies_default_rules() {
        let report = public_engine().lint_str(
            "This is a rather long sentence that keeps adding more and more words \
             until it exceeds the public profile threshold by a comfortable margin.",
        );
        assert!(!report.diagnostics.is_empty());
        let has_warning = report
            .diagnostics
            .iter()
            .any(|diag| diag.severity == Severity::Warning);
        assert!(has_warning);
    }

    #[test]
    fn engine_returns_no_warnings_for_clean_text() {
        let report = public_engine().lint_str("Short clean sentence. Another fine one.");
        let has_non_info = report
            .diagnostics
            .iter()
            .any(|diag| diag.severity != Severity::Info);
        assert!(!has_non_info);
    }

    #[test]
    fn engine_respects_profile() {
        let public_report = public_engine().lint_str(LONG_SENTENCE);
        let dev_report = Engine::with_profile(Profile::DevDoc).lint_str(LONG_SENTENCE);
        assert!(hits(&public_report, SENTENCE_TOO_LONG) > 0);
        assert_eq!(hits(&dev_report, SENTENCE_TOO_LONG), 0);
    }

    #[test]
    fn inline_disable_suppresses_matching_diagnostic() {
        let text = format!(
            "Intro paragraph.\n\n\
             <!-- lucid-lint disable-next-line structure.sentence-too-long -->\n\
             {LONG_SENTENCE}\n"
        );
        let report = public_engine().lint_str(&text);
        assert_eq!(
            hits(&report, SENTENCE_TOO_LONG),
            0,
            "expected directive to suppress sentence-too-long, got: {:?}",
            report.diagnostics
        );
    }

    #[test]
    fn inline_disable_does_not_affect_other_rules_or_lines() {
        // The directive names a different rule, so the long sentence must
        // still be reported.
        let text = format!(
            "Intro.\n\n\
             <!-- lucid-lint disable-next-line weasel-words -->\n\
             {LONG_SENTENCE}\n"
        );
        let report = public_engine().lint_str(&text);
        assert_eq!(hits(&report, SENTENCE_TOO_LONG), 1);
    }

    #[test]
    fn block_disable_suppresses_diagnostics_within_scope() {
        let text = format!(
            "Intro.\n\n\
             <!-- lucid-lint-disable structure.sentence-too-long -->\n\n\
             {LONG_SENTENCE}\n\n\
             {LONG_SENTENCE}\n\n\
             <!-- lucid-lint-enable -->\n\n\
             {LONG_SENTENCE}\n",
        );
        let report = public_engine().lint_str(&text);
        // Both sentences between the disable/enable comments are silenced;
        // only the one after the enable comment is reported.
        assert_eq!(
            hits(&report, SENTENCE_TOO_LONG),
            1,
            "expected block directive to suppress 2 of 3 diagnostics, got: {:?}",
            report.diagnostics,
        );
    }

    #[test]
    fn engine_profile_accessor() {
        assert_eq!(Engine::with_profile(Profile::Falc).profile(), Profile::Falc);
    }

    #[test]
    fn with_excessive_commas_max_commas_overrides_threshold() {
        let rule_id = "structure.excessive-commas";
        let text = "Alpha, beta, gamma are three items in a short list.";
        let one = NonZeroU32::new(1).unwrap();

        let base_hits = hits(&public_engine().lint_str(text), rule_id);
        let tightened = public_engine().with_excessive_commas_max_commas(one);
        let tight_hits = hits(&tightened.lint_str(text), rule_id);

        assert!(
            tight_hits > base_hits,
            "tightened max_commas=1 should flag more than the Public baseline (base={base_hits}, tight={tight_hits})"
        );
    }

    #[test]
    fn with_unexplained_whitelist_suppresses_extra_acronym() {
        let rule_id = "lexicon.unexplained-abbreviation";
        let wcag_text = "WCAG is the relevant reference for accessibility compliance.";
        let base_hits = hits(&public_engine().lint_str(wcag_text), rule_id);

        if base_hits > 0 {
            let extended = public_engine().with_unexplained_whitelist(vec!["WCAG".to_owned()]);
            let extended_hits = hits(&extended.lint_str(wcag_text), rule_id);
            assert!(extended_hits < base_hits);
            return;
        }

        // The Public baseline already whitelists `WCAG`; fall back to an
        // acronym the rule is expected to flag and compare again.
        let fallback_text = "XYZZY governs that procedure as a policy baseline.";
        let extended = public_engine().with_unexplained_whitelist(vec!["XYZZY".to_owned()]);
        let baseline_hits = hits(&public_engine().lint_str(fallback_text), rule_id);
        let extended_hits = hits(&extended.lint_str(fallback_text), rule_id);
        assert!(
            baseline_hits > extended_hits,
            "extra whitelist entry should suppress at least one diagnostic"
        );
    }

    #[test]
    fn override_helpers_are_no_ops_when_rule_filtered_out() {
        // With an empty rule set there is nothing to replace, so each
        // override helper must pass the engine through without adding rules.
        let engine = Engine::with_rules(Profile::Public, Vec::new())
            .with_readability_formula(FormulaChoice::Auto)
            .with_unexplained_whitelist(vec!["NASA".to_owned()])
            .with_excessive_commas_max_commas(NonZeroU32::new(1).unwrap());
        let report = engine.lint_str("Anything.");
        assert!(report.diagnostics.is_empty());
    }

    #[test]
    fn normalize_input_passes_through_clean_ascii_borrowed() {
        // No BOM and already NFC: the input is handed back without copying.
        let input = "Plain ASCII sentence.";
        let normalized = normalize_input(input);
        assert!(matches!(normalized, Cow::Borrowed(_)));
        assert_eq!(normalized.as_ref(), input);
    }

    #[test]
    fn normalize_input_passes_through_nfc_unicode_borrowed() {
        // Precomposed accented text without a BOM is also returned borrowed.
        let input = "Le café est prêt.";
        let normalized = normalize_input(input);
        assert!(matches!(normalized, Cow::Borrowed(_)));
        assert_eq!(normalized.as_ref(), input);
    }

    #[test]
    fn normalize_input_strips_leading_bom_only() {
        assert_eq!(normalize_input("\u{FEFF}hello").as_ref(), "hello");
    }

    #[test]
    fn normalize_input_does_not_strip_inner_bom() {
        // A U+FEFF that is not at the very start (zero-width no-break space)
        // must survive, so prose is never silently rewritten.
        let input = "hello\u{FEFF}world";
        assert_eq!(normalize_input(input).as_ref(), input);
    }

    #[test]
    fn normalize_input_nfc_normalizes_decomposed_text() {
        // Decomposed `e` + U+0301 becomes the precomposed `é`.
        assert_eq!(normalize_input("cafe\u{0301}").as_ref(), "café");
    }

    #[test]
    fn normalize_input_strips_bom_and_nfc_normalizes() {
        // Leading BOM and a decomposed body in the same input.
        assert_eq!(normalize_input("\u{FEFF}cafe\u{0301}").as_ref(), "café");
    }

    #[test]
    fn normalize_input_handles_empty_string() {
        let normalized = normalize_input("");
        assert_eq!(normalized.as_ref(), "");
        assert!(matches!(normalized, Cow::Borrowed(_)));
    }

    #[test]
    fn bom_prefix_does_not_shift_diagnostics() {
        let engine = public_engine();
        let with_bom = format!("\u{FEFF}{LONG_SENTENCE}");
        let plain = engine.lint_str(LONG_SENTENCE);
        let bommed = engine.lint_str(&with_bom);

        assert_eq!(plain.diagnostics.len(), bommed.diagnostics.len());
        for (expected, actual) in plain.diagnostics.iter().zip(&bommed.diagnostics) {
            assert_eq!(expected.rule_id, actual.rule_id);
            assert_eq!(expected.location.line, actual.location.line);
            assert_eq!(expected.location.column, actual.location.column);
            assert_eq!(expected.message, actual.message);
        }
    }

    #[test]
    fn nfd_input_yields_same_diagnostics_as_nfc() {
        // Same sentence with "café" precomposed versus decomposed (e +
        // combining acute). Without normalization at the engine boundary,
        // rules that key a HashMap on words (e.g. low-lexical-diversity)
        // would count the two spellings as different words.
        let precomposed = "Le café est bon. Le café est chaud. Le café est noir. Le café est fort.";
        let decomposed = "Le cafe\u{0301} est bon. Le cafe\u{0301} est chaud. \
                          Le cafe\u{0301} est noir. Le cafe\u{0301} est fort.";
        let engine = public_engine();
        let from_nfc = engine.lint_str(precomposed);
        let from_nfd = engine.lint_str(decomposed);

        assert_eq!(from_nfc.diagnostics.len(), from_nfd.diagnostics.len());
        for (expected, actual) in from_nfc.diagnostics.iter().zip(&from_nfd.diagnostics) {
            assert_eq!(expected.rule_id, actual.rule_id);
            assert_eq!(expected.location.line, actual.location.line);
        }
    }

    #[test]
    fn lone_cr_line_endings_are_normalized() {
        // Classic Mac line endings: a bare \r separates the paragraphs. The
        // parser is expected to map \r to \n (src/parser/mod.rs); this test
        // pins that so a later refactor cannot drop it unnoticed.
        let engine = public_engine();
        let with_lf = engine.lint_str("First paragraph.\n\nSecond paragraph.\n\nThird.");
        let with_cr = engine.lint_str("First paragraph.\r\rSecond paragraph.\r\rThird.");
        assert_eq!(with_lf.word_count, with_cr.word_count);
        assert_eq!(with_lf.diagnostics.len(), with_cr.diagnostics.len());
    }

    #[test]
    fn zero_width_chars_inside_words_pin_behaviour() {
        // Zero-width characters (U+200B/200C/200D) can survive copy-paste
        // from social-media or PDF sources. The only behaviour pinned here is
        // that such input goes through the engine without panicking and
        // yields a Report. The word count is deliberately not asserted:
        // `nfc()` keeps these characters, and `unicode-segmentation`'s word
        // boundary rules decide whether they split tokens.
        let report = public_engine().lint_str("Hello\u{200B}world. Bonjour\u{200C}le\u{200D}monde.");
        let _ = report.word_count;
    }

    #[test]
    fn engine_produces_scorecard_with_fixed_max() {
        let report = public_engine().lint_str("Short clean sentence. Another fine one.");
        let scorecard = &report.scorecard;
        assert_eq!(
            scorecard.global.max,
            crate::scoring::DEFAULT_CATEGORY_MAX * 5
        );
        assert_eq!(scorecard.per_category.len(), 5);
    }
}
lucid_lint/engine.rs

lucid_lint/
engine.rs