lucid-lint 0.2.4

A cognitive accessibility linter for prose. Bilingual EN/FR. CI-native.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
//! The linting engine: orchestrates parsing, rule execution, and output.

use std::fs;
use std::num::NonZeroU32;
use std::path::Path;

use thiserror::Error;

use crate::condition::ConditionTag;
use crate::config::Profile;
use crate::language::{default_language, detect_language};
use crate::parser::{parse_markdown, parse_plain, word_count};
use crate::rules::lexicon::unexplained_abbreviation::{self, UnexplainedAbbreviation};
use crate::rules::readability::score::{self, FormulaChoice, ReadabilityScore};
use crate::rules::structure::excessive_commas::{self, ExcessiveCommas};
use crate::rules::{
    default_rules, filter_by_conditions, filter_by_experimental, ExperimentalOptIn, Rule,
};
use crate::scoring::{self, Scorecard, ScoringConfig};
use crate::types::{Diagnostic, Language, SourceFile};

/// Aggregated result of a lint run over a single document.
///
/// Pairs the list of diagnostics with the [`Scorecard`] that aggregates them
/// and the word count used as the density denominator.
///
/// Produced by [`Engine::lint_str`], [`Engine::lint_stdin`] and
/// [`Engine::lint_file`].
#[derive(Debug, Clone)]
pub struct Report {
    /// Diagnostics emitted, in rule-discovery order. Diagnostics suppressed
    /// by a matching in-document directive have already been removed.
    pub diagnostics: Vec<Diagnostic>,
    /// Global and per-category scores for this document.
    pub scorecard: Scorecard,
    /// Word count used to normalize scoring density.
    pub word_count: u32,
}

/// The linting engine.
///
/// Bundles a profile and a set of rules, and exposes methods to lint strings,
/// files, and stdin-sourced text.
///
/// Build one with a `with_profile*` constructor or [`Engine::with_rules`],
/// then optionally chain the consuming `with_*` overrides.
pub struct Engine {
    /// Profile chosen at construction; the `with_*` rule overrides derive
    /// their rule configuration from it.
    profile: Profile,
    /// Rules run against every document, in this order.
    rules: Vec<Box<dyn Rule>>,
    /// Scoring parameters handed to `scoring::compute` for every report.
    scoring_config: ScoringConfig,
}

impl Engine {
    /// Build an engine with the default rule set for the given profile.
    #[must_use]
    pub fn with_profile(profile: Profile) -> Self {
        Self {
            profile,
            rules: default_rules(profile),
            scoring_config: ScoringConfig::default(),
        }
    }

    /// Build an engine for `profile` whose rule set is narrowed by condition
    /// tags (F71 + F72): rules tagged `general` are kept, together with those
    /// whose condition tags intersect `conditions`.
    ///
    /// Shorthand for [`Engine::with_profile_conditions_and_experimental`]
    /// called with [`ExperimentalOptIn::None`].
    #[must_use]
    pub fn with_profile_and_conditions(profile: Profile, conditions: &[ConditionTag]) -> Self {
        let no_experimental = ExperimentalOptIn::None;
        Self::with_profile_conditions_and_experimental(profile, conditions, &no_experimental)
    }

    /// Build an engine for `profile` with both rule filters applied: the F139
    /// experimental opt-in and the F71/F72 condition tags.
    ///
    /// The experimental filter runs first, dropping
    /// [`crate::rules::Status::Experimental`] rules the user did not opt in
    /// to; the condition filter then runs on what is left. Both filters are
    /// pure subset operations, so the order is observably equivalent —
    /// experimental-first simply hands the condition filter fewer rules.
    #[must_use]
    pub fn with_profile_conditions_and_experimental(
        profile: Profile,
        conditions: &[ConditionTag],
        experimental: &ExperimentalOptIn,
    ) -> Self {
        let opted_in = filter_by_experimental(default_rules(profile), experimental);
        let applicable = filter_by_conditions(opted_in, conditions);
        Self {
            rules: applicable,
            scoring_config: ScoringConfig::default(),
            profile,
        }
    }

    /// Build an engine that runs exactly `rules`, in the order given.
    ///
    /// `profile` is stored as-is; the `with_*` rule overrides later use it to
    /// derive per-rule configuration. Scoring starts from
    /// [`ScoringConfig::default`].
    #[must_use]
    pub fn with_rules(profile: Profile, rules: Vec<Box<dyn Rule>>) -> Self {
        let scoring_config = ScoringConfig::default();
        Self {
            profile,
            rules,
            scoring_config,
        }
    }

    /// Attach a custom [`ScoringConfig`], overriding the defaults.
    #[must_use]
    pub fn with_scoring_config(mut self, scoring_config: ScoringConfig) -> Self {
        self.scoring_config = scoring_config;
        self
    }

    /// Pin the formula used by the [`ReadabilityScore`] rule (F11).
    ///
    /// Passing `FormulaChoice::Auto` keeps the default per-language
    /// selection; any other variant forces that concrete formula regardless
    /// of the document's detected language.
    ///
    /// The rule is rebuilt from this engine's profile and swapped in at its
    /// existing position. When the rule set holds no `readability-score`
    /// rule (e.g., it was filtered out) this is a no-op — the rule is not
    /// re-added.
    #[must_use]
    pub fn with_readability_formula(mut self, formula: FormulaChoice) -> Self {
        let replacement =
            ReadabilityScore::new(score::Config::for_profile(self.profile).with_formula(formula));
        self.replace_rule(ReadabilityScore::ID, Box::new(replacement));
        self
    }

    /// Add project-specific entries to the [`UnexplainedAbbreviation`]
    /// rule's user whitelist (F31).
    ///
    /// The extras are additive over the profile baseline — callers typically
    /// use this to restore the narrower acronyms that F31 moved out of the
    /// shipped `dev-doc` baseline (`WCAG`, `ARIA`, `ADHD`, `LLM`, …).
    ///
    /// An empty `extra` returns the engine unchanged. If the rule set does
    /// not currently include an `unexplained-abbreviation` rule, this is
    /// also a no-op.
    #[must_use]
    pub fn with_unexplained_whitelist(mut self, extra: Vec<String>) -> Self {
        if !extra.is_empty() {
            let replacement = UnexplainedAbbreviation::new(
                unexplained_abbreviation::Config::for_profile(self.profile)
                    .with_extra_whitelist(extra),
            );
            self.replace_rule(UnexplainedAbbreviation::ID, Box::new(replacement));
        }
        self
    }

    /// Set the `max_commas` threshold of the [`ExcessiveCommas`] rule.
    ///
    /// The rule is rebuilt from this engine's profile with the new threshold
    /// and swapped in at its existing position. If the rule set does not
    /// currently include an `excessive-commas` rule (e.g., it was filtered
    /// out), this is a no-op.
    #[must_use]
    pub fn with_excessive_commas_max_commas(mut self, max_commas: NonZeroU32) -> Self {
        let replacement = ExcessiveCommas::new(
            excessive_commas::Config::for_profile(self.profile).with_max_commas(max_commas),
        );
        self.replace_rule(ExcessiveCommas::ID, Box::new(replacement));
        self
    }

    /// Swap the first rule whose id equals `id` for `replacement`, keeping
    /// its slot in the list so rule-discovery order (and therefore
    /// diagnostic order) stays stable.
    ///
    /// Does nothing when no rule carries that id — the public `with_*`
    /// helpers rely on this when their target rule has been filtered out by
    /// condition tags.
    fn replace_rule(&mut self, id: &str, replacement: Box<dyn Rule>) {
        let Some(index) = self.rules.iter().position(|rule| rule.id() == id) else {
            return;
        };
        self.rules[index] = replacement;
    }

    /// The profile this engine was configured with.
    #[must_use]
    pub const fn profile(&self) -> Profile {
        self.profile
    }

    /// Lint in-memory text, parsed as Markdown.
    ///
    /// The parser is given [`SourceFile::Anonymous`] as the source.
    #[must_use]
    pub fn lint_str(&self, input: &str) -> Report {
        let is_markdown = true;
        self.lint_with_source(input, SourceFile::Anonymous, is_markdown)
    }

    /// Lint text that was read from stdin, parsed as Markdown.
    ///
    /// The parser is given [`SourceFile::Stdin`] as the source.
    #[must_use]
    pub fn lint_stdin(&self, input: &str) -> Report {
        let is_markdown = true;
        self.lint_with_source(input, SourceFile::Stdin, is_markdown)
    }

    /// Lint a file from disk.
    ///
    /// The file is parsed as Markdown when its extension is `md` or
    /// `markdown`, compared ASCII case-insensitively so `README.MD` and
    /// `notes.Markdown` qualify too. Files with any other extension, no
    /// extension, or a non-UTF-8 extension are treated as plain text.
    ///
    /// # Errors
    ///
    /// Returns [`EngineError::Io`] if the file cannot be read.
    pub fn lint_file(&self, path: &Path) -> Result<Report, EngineError> {
        let contents = fs::read_to_string(path).map_err(EngineError::Io)?;
        // Extensions are commonly upper- or mixed-case on case-insensitive
        // filesystems, so the comparison must not depend on letter case.
        let is_markdown = path
            .extension()
            .and_then(|ext| ext.to_str())
            .is_some_and(|ext| {
                ["md", "markdown"]
                    .iter()
                    .any(|known| ext.eq_ignore_ascii_case(known))
            });
        let source = SourceFile::Path(path.to_path_buf());
        Ok(self.lint_with_source(&contents, source, is_markdown))
    }

    fn lint_with_source(&self, input: &str, source: SourceFile, is_markdown: bool) -> Report {
        let normalized = normalize_input(input);
        let input = normalized.as_ref();
        let language = match detect_language(input) {
            Language::Unknown => default_language(),
            detected => detected,
        };

        let document = if is_markdown {
            parse_markdown(input, source)
        } else {
            parse_plain(input, source)
        };

        let mut diagnostics = Vec::new();
        for rule in &self.rules {
            diagnostics.extend(rule.check(&document, language));
        }
        diagnostics.retain(|d| {
            !document
                .directives
                .iter()
                .any(|dir| dir.rule_id == d.rule_id && dir.covers(d.location.line))
        });

        let words = word_count(input);
        let scorecard = scoring::compute(&diagnostics, words, &self.scoring_config);

        Report {
            diagnostics,
            scorecard,
            word_count: words,
        }
    }
}

/// Normalize input at the engine boundary so every rule consumes the same
/// shape of text.
///
/// Two things happen:
/// - a single leading UTF-8 BOM (U+FEFF) is stripped (F110); U+FEFF anywhere
///   else in the text is left alone;
/// - the text is NFC-normalized, so `café` spelled with a precomposed `é`
///   (U+00E9) and with `e` + combining acute (U+0301) hash to the same key
///   (F111).
///
/// Returns a borrowed slice of `input` whenever the quick check reports the
/// text (after the BOM, if any) is already NFC, so the common case does not
/// allocate. Otherwise it returns a freshly normalized `String`.
fn normalize_input(input: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;
    use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization};

    // `strip_prefix` yields a sub-slice of `input`, so `body` can be borrowed
    // for the same lifetime whether or not a BOM was present.
    let body = input.strip_prefix('\u{FEFF}').unwrap_or(input);
    match is_nfc_quick(body.chars()) {
        IsNormalized::Yes => Cow::Borrowed(body),
        // `No` or `Maybe`: run the full normalization.
        _ => Cow::Owned(body.nfc().collect()),
    }
}

/// Errors returned by the engine.
///
/// Within this module it is only produced by [`Engine::lint_file`].
#[derive(Debug, Error)]
pub enum EngineError {
    /// I/O error reading a file.
    ///
    /// The display message is fixed and does not name the file; the
    /// underlying [`std::io::Error`] is exposed as the error source.
    #[error("failed to read input file")]
    Io(#[source] std::io::Error),
}

// Unit tests for the engine: rule execution and profiles, inline/block
// suppression directives, the `with_*` override helpers, input
// normalization (BOM, NFC, line endings), and scorecard shape.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::Severity;

    // A sentence well over the Public sentence-length threshold must yield
    // at least one warning-level diagnostic.
    #[test]
    fn engine_applies_default_rules() {
        let engine = Engine::with_profile(Profile::Public);
        let text = "This is a rather long sentence that keeps adding more and more words \
                    until it exceeds the public profile threshold by a comfortable margin.";
        let report = engine.lint_str(text);
        assert!(!report.diagnostics.is_empty());
        assert!(report
            .diagnostics
            .iter()
            .any(|d| d.severity == Severity::Warning));
    }

    // Clean text may still produce diagnostics, but only at `Info` severity.
    #[test]
    fn engine_returns_no_warnings_for_clean_text() {
        let engine = Engine::with_profile(Profile::Public);
        let report = engine.lint_str("Short clean sentence. Another fine one.");
        assert!(report
            .diagnostics
            .iter()
            .all(|d| d.severity == Severity::Info));
    }

    /// Count the diagnostics in `diags` whose rule id equals `rule_id`.
    fn diags_for_rule(diags: &[Diagnostic], rule_id: &str) -> usize {
        diags.iter().filter(|d| d.rule_id == rule_id).count()
    }

    #[test]
    fn engine_respects_profile() {
        let public = Engine::with_profile(Profile::Public);
        let dev = Engine::with_profile(Profile::DevDoc);
        // 25 words: triggers Public (22) but not DevDoc (30) for sentence-too-long.
        let text = "This is a long sentence that keeps adding more and more words until it \
                    exceeds the public profile threshold by a comfortable margin of safety.";
        assert!(
            diags_for_rule(
                &public.lint_str(text).diagnostics,
                "structure.sentence-too-long"
            ) > 0
        );
        assert_eq!(
            diags_for_rule(
                &dev.lint_str(text).diagnostics,
                "structure.sentence-too-long"
            ),
            0
        );
    }

    // A `disable-next-line` directive naming the rule suppresses the
    // diagnostic on the following line.
    #[test]
    fn inline_disable_suppresses_matching_diagnostic() {
        let engine = Engine::with_profile(Profile::Public);
        let text = "Intro paragraph.\n\n\
                    <!-- lucid-lint disable-next-line structure.sentence-too-long -->\n\
                    This is a long sentence that keeps adding more and more words until it \
                    exceeds the public profile threshold by a comfortable margin of safety.\n";
        let report = engine.lint_str(text);
        assert_eq!(
            diags_for_rule(&report.diagnostics, "structure.sentence-too-long"),
            0,
            "expected directive to suppress sentence-too-long, got: {:?}",
            report.diagnostics
        );
    }

    // A directive naming a different rule must leave sentence-too-long alone.
    #[test]
    fn inline_disable_does_not_affect_other_rules_or_lines() {
        let engine = Engine::with_profile(Profile::Public);
        let text = "Intro.\n\n\
                    <!-- lucid-lint disable-next-line weasel-words -->\n\
                    This is a long sentence that keeps adding more and more words until it \
                    exceeds the public profile threshold by a comfortable margin of safety.\n";
        let report = engine.lint_str(text);
        assert_eq!(
            diags_for_rule(&report.diagnostics, "structure.sentence-too-long"),
            1
        );
    }

    #[test]
    fn block_disable_suppresses_diagnostics_within_scope() {
        let engine = Engine::with_profile(Profile::Public);
        let long_sentence = "This is a long sentence that keeps adding more and more words \
                             until it exceeds the public profile threshold by a comfortable \
                             margin of safety.";
        let text = format!(
            "Intro.\n\n\
             <!-- lucid-lint-disable structure.sentence-too-long -->\n\n\
             {long_sentence}\n\n\
             {long_sentence}\n\n\
             <!-- lucid-lint-enable -->\n\n\
             {long_sentence}\n",
        );
        let report = engine.lint_str(&text);
        // The two long sentences inside the block are suppressed; the one
        // after the enable comment still triggers.
        assert_eq!(
            diags_for_rule(&report.diagnostics, "structure.sentence-too-long"),
            1,
            "expected block directive to suppress 2 of 3 diagnostics, got: {:?}",
            report.diagnostics,
        );
    }

    #[test]
    fn engine_profile_accessor() {
        let engine = Engine::with_profile(Profile::Falc);
        assert_eq!(engine.profile(), Profile::Falc);
    }

    // Tightening `max_commas` to 1 must flag strictly more than the Public
    // baseline on a two-comma sentence.
    #[test]
    fn with_excessive_commas_max_commas_overrides_threshold() {
        let base = Engine::with_profile(Profile::Public);
        let tightened = Engine::with_profile(Profile::Public)
            .with_excessive_commas_max_commas(NonZeroU32::new(1).unwrap());
        let text = "Alpha, beta, gamma are three items in a short list.";
        let base_hits = diags_for_rule(
            &base.lint_str(text).diagnostics,
            "structure.excessive-commas",
        );
        let tight_hits = diags_for_rule(
            &tightened.lint_str(text).diagnostics,
            "structure.excessive-commas",
        );
        assert!(
            tight_hits > base_hits,
            "tightened max_commas=1 should flag more than the Public baseline (base={base_hits}, tight={tight_hits})"
        );
    }

    // Written to hold whether or not the Public baseline already whitelists
    // `WCAG`: the branch taken depends on the baseline's behaviour.
    #[test]
    fn with_unexplained_whitelist_suppresses_extra_acronym() {
        let text = "WCAG is the relevant reference for accessibility compliance.";
        let rule_id = "lexicon.unexplained-abbreviation";
        let base = Engine::with_profile(Profile::Public);
        let base_hits = diags_for_rule(&base.lint_str(text).diagnostics, rule_id);
        if base_hits == 0 {
            // The Public baseline already whitelists `WCAG`; pick any other
            // acronym the rule would flag and re-run.
            let text2 = "XYZZY governs that procedure as a policy baseline.";
            let extended = Engine::with_profile(Profile::Public)
                .with_unexplained_whitelist(vec!["XYZZY".into()]);
            let baseline = Engine::with_profile(Profile::Public);
            assert!(
                diags_for_rule(&baseline.lint_str(text2).diagnostics, rule_id)
                    > diags_for_rule(&extended.lint_str(text2).diagnostics, rule_id),
                "extra whitelist entry should suppress at least one diagnostic"
            );
        } else {
            let extended = Engine::with_profile(Profile::Public)
                .with_unexplained_whitelist(vec!["WCAG".into()]);
            let extended_hits = diags_for_rule(&extended.lint_str(text).diagnostics, rule_id);
            assert!(extended_hits < base_hits);
        }
    }

    #[test]
    fn override_helpers_are_no_ops_when_rule_filtered_out() {
        // An engine built with an empty rule set accepts the override helpers
        // silently — contract documented on `replace_rule`.
        let engine = Engine::with_rules(Profile::Public, Vec::new())
            .with_readability_formula(FormulaChoice::Auto)
            .with_unexplained_whitelist(vec!["NASA".into()])
            .with_excessive_commas_max_commas(NonZeroU32::new(1).unwrap());
        assert!(engine.lint_str("Anything.").diagnostics.is_empty());
    }

    #[test]
    fn normalize_input_passes_through_clean_ascii_borrowed() {
        // Fast path: already-NFC + no BOM → Cow::Borrowed, no allocation.
        let input = "Plain ASCII sentence.";
        let out = normalize_input(input);
        assert!(matches!(out, std::borrow::Cow::Borrowed(_)));
        assert_eq!(out.as_ref(), input);
    }

    #[test]
    fn normalize_input_passes_through_nfc_unicode_borrowed() {
        // Already-NFC accented text without a BOM also stays borrowed.
        let input = "Le café est prêt.";
        let out = normalize_input(input);
        assert!(matches!(out, std::borrow::Cow::Borrowed(_)));
        assert_eq!(out.as_ref(), input);
    }

    #[test]
    fn normalize_input_strips_leading_bom_only() {
        let out = normalize_input("\u{FEFF}hello");
        assert_eq!(out.as_ref(), "hello");
    }

    #[test]
    fn normalize_input_does_not_strip_inner_bom() {
        // Only the *leading* BOM is stripped; inner U+FEFF (zero-width
        // no-break space) is preserved so it doesn't silently mutate prose.
        let input = "hello\u{FEFF}world";
        let out = normalize_input(input);
        assert_eq!(out.as_ref(), input);
    }

    #[test]
    fn normalize_input_nfc_normalizes_decomposed_text() {
        // NFD `cafe + U+0301` → NFC `café`.
        let out = normalize_input("cafe\u{0301}");
        assert_eq!(out.as_ref(), "café");
    }

    #[test]
    fn normalize_input_strips_bom_and_nfc_normalizes() {
        // Combined path: leading BOM + NFD body.
        let out = normalize_input("\u{FEFF}cafe\u{0301}");
        assert_eq!(out.as_ref(), "café");
    }

    #[test]
    fn normalize_input_handles_empty_string() {
        let out = normalize_input("");
        assert_eq!(out.as_ref(), "");
        assert!(matches!(out, std::borrow::Cow::Borrowed(_)));
    }

    // End-to-end check: a leading BOM must not change which diagnostics are
    // emitted, nor their line/column or message.
    #[test]
    fn bom_prefix_does_not_shift_diagnostics() {
        let engine = Engine::with_profile(Profile::Public);
        let body = "This is a long sentence that keeps adding more and more words until it \
                    exceeds the public profile threshold by a comfortable margin of safety.";
        let with_bom = format!("\u{FEFF}{body}");
        let plain = engine.lint_str(body);
        let bommed = engine.lint_str(&with_bom);
        assert_eq!(plain.diagnostics.len(), bommed.diagnostics.len());
        for (a, b) in plain.diagnostics.iter().zip(bommed.diagnostics.iter()) {
            assert_eq!(a.rule_id, b.rule_id);
            assert_eq!(a.location.line, b.location.line);
            assert_eq!(a.location.column, b.location.column);
            assert_eq!(a.message, b.message);
        }
    }

    #[test]
    fn nfd_input_yields_same_diagnostics_as_nfc() {
        // "café" precomposed (NFC) vs decomposed (NFD: e + combining acute).
        // Rules using HashMap keys (e.g. low-lexical-diversity) would treat
        // the two as different words without normalization at the boundary.
        let engine = Engine::with_profile(Profile::Public);
        let nfc = "Le café est bon. Le café est chaud. Le café est noir. Le café est fort.";
        let nfd = "Le cafe\u{0301} est bon. Le cafe\u{0301} est chaud. Le cafe\u{0301} est noir. \
                   Le cafe\u{0301} est fort.";
        let a = engine.lint_str(nfc);
        let b = engine.lint_str(nfd);
        assert_eq!(a.diagnostics.len(), b.diagnostics.len());
        for (x, y) in a.diagnostics.iter().zip(b.diagnostics.iter()) {
            assert_eq!(x.rule_id, y.rule_id);
            assert_eq!(x.location.line, y.location.line);
        }
    }

    #[test]
    fn lone_cr_line_endings_are_normalized() {
        // Classic Mac line endings: bare \r between paragraphs.
        // Parser already maps \r → \n at src/parser/mod.rs; this pins the
        // behaviour so a future refactor can't silently drop it.
        let engine = Engine::with_profile(Profile::Public);
        let lf = "First paragraph.\n\nSecond paragraph.\n\nThird.";
        let cr = "First paragraph.\r\rSecond paragraph.\r\rThird.";
        let a = engine.lint_str(lf);
        let b = engine.lint_str(cr);
        assert_eq!(a.word_count, b.word_count);
        assert_eq!(a.diagnostics.len(), b.diagnostics.len());
    }

    #[test]
    fn zero_width_chars_inside_words_pin_behaviour() {
        // Zero-width chars (U+200B/200C/200D) sometimes survive copy-paste
        // from social-media or PDF sources. Pin observed behaviour: input
        // round-trips through the engine without panicking and produces a
        // valid Report. The exact word count is not asserted — `nfc()` does
        // not strip them, and `unicode-segmentation`'s word boundary rules
        // decide whether they split tokens.
        let engine = Engine::with_profile(Profile::Public);
        let text = "Hello\u{200B}world. Bonjour\u{200C}le\u{200D}monde.";
        let report = engine.lint_str(text);
        let _ = report.word_count;
    }

    // The global maximum is five times the per-category maximum, and the
    // scorecard carries five per-category entries.
    #[test]
    fn engine_produces_scorecard_with_fixed_max() {
        let engine = Engine::with_profile(Profile::Public);
        let report = engine.lint_str("Short clean sentence. Another fine one.");
        assert_eq!(
            report.scorecard.global.max,
            crate::scoring::DEFAULT_CATEGORY_MAX * 5
        );
        assert_eq!(report.scorecard.per_category.len(), 5);
    }
}