Skip to main content

harn_rules/
engine.rs

1//! Compile a [`Rule`] into a runnable matcher and run it against source.
2//!
3//! The atomic tier supports three matcher forms, all reduced to a single
4//! [`RuleMatch`] stream:
5//!
6//! - `pattern` → compiled to a tree-sitter query via [`crate::pattern`].
7//! - `kind` → the trivial query `(<kind>) @__match`.
8//! - `regex` → a text regex over the source, yielding spans with no AST
9//!   metavar bindings.
10
11use std::collections::BTreeMap;
12
13use harn_hostlib::ast::Language;
14
15use crate::constraint::CompiledConstraint;
16use crate::error::RulesError;
17use crate::evaluator::CompiledRuleTree;
18use crate::fix::{interpolate, splice, AppliedEdit};
19use crate::model::{Applicability, Rule, Safety, Severity};
20use crate::transform::CompiledTransform;
21
22/// A byte + row/col span. Rows/cols are 0-based, matching the rest of the
23/// Harn AST wire format.
24#[derive(Debug, Clone, Copy, PartialEq, Eq)]
25pub struct Span {
26    /// Start byte offset.
27    pub start_byte: usize,
28    /// End byte offset (exclusive).
29    pub end_byte: usize,
30    /// 0-based start row.
31    pub start_row: usize,
32    /// 0-based start column.
33    pub start_col: usize,
34    /// 0-based end row.
35    pub end_row: usize,
36    /// 0-based end column.
37    pub end_col: usize,
38}
39
40impl Span {
41    pub(crate) fn of(node: tree_sitter::Node<'_>) -> Self {
42        let start = node.start_position();
43        let end = node.end_position();
44        Span {
45            start_byte: node.start_byte(),
46            end_byte: node.end_byte(),
47            start_row: start.row,
48            start_col: start.column,
49            end_row: end.row,
50            end_col: end.column,
51        }
52    }
53}
54
55/// A metavariable binding: the captured text plus where it lives.
56#[derive(Debug, Clone)]
57pub struct Binding {
58    /// The captured source text.
59    pub text: String,
60    /// The captured node's span.
61    pub span: Span,
62}
63
64/// One match of a rule against a file.
65#[derive(Debug, Clone)]
66pub struct RuleMatch {
67    /// The rule that produced this match.
68    pub rule_id: String,
69    /// The whole matched range (the pattern root, or the regex span).
70    pub span: Span,
71    /// The matched source text.
72    pub text: String,
73    /// Metavar bindings, keyed by name (without the leading `$`). Empty for
74    /// `kind` and `regex` matchers.
75    pub bindings: BTreeMap<String, Binding>,
76}
77
78/// The result of applying a codemod rule's `fix` to a source string.
79#[derive(Debug, Clone)]
80pub struct CodemodResult {
81    /// The rewritten source (equals the input when nothing matched).
82    pub rewritten: String,
83    /// The per-match edits that were spliced in, in document order.
84    pub edits: Vec<AppliedEdit>,
85    /// Whether the rewrite changed the source.
86    pub changed: bool,
87    /// The rule's declared safety tier.
88    pub safety: Safety,
89    /// Whether the fix may be auto-applied or is opt-in only.
90    pub applicability: Applicability,
91    /// Whether re-running the fix on `rewritten` yields no further change
92    /// (a fix should reach a fixed point).
93    pub idempotent: bool,
94}
95
96/// A rule whose matcher has been compiled and is ready to run.
97pub struct CompiledRule {
98    rule_id: String,
99    language: Language,
100    execution: Execution,
101    /// `where` predicates; a match survives only when all hold.
102    constraints: Vec<CompiledConstraint>,
103    /// `transform` definitions: (new metavar name, compiled transform).
104    transforms: Vec<(String, CompiledTransform)>,
105    /// The `fix` replacement template, if this is a codemod.
106    fix: Option<String>,
107    /// The fix's safety tier (gates auto-apply).
108    safety: Safety,
109    /// The diagnostic message (empty for search-only rules).
110    message: String,
111    /// The diagnostic severity.
112    severity: Severity,
113}
114
115/// A diagnostic produced by running a rule — the mapping surface onto the
116/// linter's `LintDiagnostic` / `FixEdit` (Epic C / the LSP reuse this).
117#[derive(Debug, Clone)]
118pub struct Diagnostic {
119    /// The rule id (also the diagnostic code).
120    pub rule_id: String,
121    /// The diagnostic message.
122    pub message: String,
123    /// The severity.
124    pub severity: Severity,
125    /// The flagged span.
126    pub span: Span,
127    /// Whether a fix, if present, is auto-applicable or a suggestion.
128    pub applicability: Applicability,
129    /// The interpolated fix replacement for this match, if the rule has a
130    /// `fix`.
131    pub fix: Option<String>,
132}
133
134enum Execution {
135    /// A top-level pure-`regex` rule: scan the raw source text (grep-style),
136    /// independent of the tree. Its match span is the regex match range.
137    SourceRegex(regex::Regex),
138    /// The full matching algebra (atomic + relational + composite).
139    Tree(Box<CompiledRuleTree>),
140}
141
142impl CompiledRule {
143    /// Resolve the rule's language and grammar, then compile its matcher.
144    pub fn compile(rule: &Rule) -> Result<Self, RulesError> {
145        let language =
146            Language::from_name(&rule.language).ok_or_else(|| RulesError::UnknownLanguage {
147                rule: rule.id.clone(),
148                language: rule.language.clone(),
149            })?;
150
151        // A top-level pure-`regex` rule greps the source text directly; any
152        // other shape (pattern / kind / relational / composite) compiles to
153        // the tree-walking algebra.
154        let execution = if rule.rule.is_pure_regex() {
155            let pattern = rule.rule.regex.as_ref().expect("pure regex");
156            Execution::SourceRegex(regex::Regex::new(pattern).map_err(|err| {
157                RulesError::PatternCompile {
158                    rule: rule.id.clone(),
159                    message: format!("invalid regex `{pattern}`: {err}"),
160                }
161            })?)
162        } else {
163            Execution::Tree(Box::new(CompiledRuleTree::compile(
164                &rule.id,
165                language,
166                &rule.rule,
167                &rule.utils,
168            )?))
169        };
170
171        let constraints = rule
172            .where_constraints
173            .iter()
174            .map(|c| CompiledConstraint::compile(&rule.id, language, c))
175            .collect::<Result<Vec<_>, _>>()?;
176
177        let transforms = rule
178            .transform
179            .iter()
180            .map(|(name, t)| {
181                CompiledTransform::compile(&rule.id, name, t).map(|c| (name.clone(), c))
182            })
183            .collect::<Result<Vec<_>, _>>()?;
184
185        Ok(CompiledRule {
186            rule_id: rule.id.clone(),
187            language,
188            execution,
189            constraints,
190            transforms,
191            fix: rule.fix.clone(),
192            safety: rule.safety,
193            message: rule.message.clone(),
194            severity: rule.severity,
195        })
196    }
197
198    /// The language this rule targets.
199    pub fn language(&self) -> Language {
200        self.language
201    }
202
203    /// The fix's declared safety tier.
204    pub fn safety(&self) -> Safety {
205        self.safety
206    }
207
208    /// Whether this rule's fix may be auto-applied (machine-applicable) or
209    /// is opt-in only (suggestion).
210    pub fn applicability(&self) -> Applicability {
211        self.safety.applicability()
212    }
213
214    /// The rule's id.
215    pub fn id(&self) -> &str {
216        &self.rule_id
217    }
218
219    /// The rule's diagnostic severity (the default for any diagnostic the
220    /// rule produces).
221    pub fn severity(&self) -> Severity {
222        self.severity
223    }
224
225    /// The rule's static diagnostic message (empty for a search-only rule).
226    pub fn message(&self) -> &str {
227        &self.message
228    }
229
230    /// Run the compiled rule against `source`, returning matches in
231    /// document order. Matches that fail any `where` constraint are dropped.
232    pub fn run(&self, source: &str) -> Result<Vec<RuleMatch>, RulesError> {
233        let mut matches = match &self.execution {
234            Execution::SourceRegex(regex) => self.run_regex(regex, source),
235            Execution::Tree(tree) => tree
236                .find(&self.rule_id, self.language, source)?
237                .into_iter()
238                .map(|m| RuleMatch {
239                    rule_id: self.rule_id.clone(),
240                    span: m.span,
241                    text: m.text,
242                    bindings: m.bindings,
243                })
244                .collect(),
245        };
246        if !self.constraints.is_empty() {
247            matches.retain(|m| self.satisfies_constraints(m));
248        }
249        Ok(matches)
250    }
251
252    /// True when every `where` constraint holds for this match. A
253    /// constraint whose metavar is unbound (not captured) fails closed.
254    fn satisfies_constraints(&self, m: &RuleMatch) -> bool {
255        self.constraints.iter().all(|c| {
256            m.bindings
257                .get(&c.metavar)
258                .is_some_and(|b| c.evaluate(&b.text))
259        })
260    }
261
262    /// Apply this codemod rule's `fix` to `source`, returning the rewritten
263    /// text plus the per-match edits. Each match's `fix` template is
264    /// interpolated from its captured metavars plus any `transform`-derived
265    /// ones. Errors if the rule has no `fix`.
266    ///
267    /// This computes the preview only — it does not enforce the safety gate.
268    /// Use [`CompiledRule::auto_apply`] to refuse non-machine-applicable
269    /// fixes, or [`CompiledRule::apply_checked`] to also assert idempotency.
270    pub fn apply(&self, source: &str) -> Result<CodemodResult, RulesError> {
271        let (rewritten, edits) = self.rewrite(source)?;
272        let changed = rewritten != source;
273        // Idempotency: re-running the fix on its own output must not change
274        // it further (the fix should reach a fixed point).
275        let (twice, _) = self.rewrite(&rewritten)?;
276        let idempotent = twice == rewritten;
277        Ok(CodemodResult {
278            rewritten,
279            edits,
280            changed,
281            safety: self.safety,
282            applicability: self.applicability(),
283            idempotent,
284        })
285    }
286
287    /// Like [`CompiledRule::apply`], but refuses to apply a fix whose
288    /// `safety` is above the machine-applicable threshold (`scope-local` and
289    /// riskier). This is the gate `harn codemod --apply` uses by default.
290    pub fn auto_apply(&self, source: &str) -> Result<CodemodResult, RulesError> {
291        if !self.safety.is_auto_applicable() {
292            return Err(RulesError::NotAutoApplicable {
293                rule: self.rule_id.clone(),
294                safety: format!("{:?}", self.safety),
295            });
296        }
297        self.apply(source)
298    }
299
300    /// Like [`CompiledRule::apply`], but fails if the fix is not idempotent.
301    /// Used by the codemod runner and the rule-test harness to assert a fix
302    /// reaches a fixed point.
303    pub fn apply_checked(&self, source: &str) -> Result<CodemodResult, RulesError> {
304        let result = self.apply(source)?;
305        if !result.idempotent {
306            return Err(RulesError::NotIdempotent {
307                rule: self.rule_id.clone(),
308            });
309        }
310        Ok(result)
311    }
312
313    /// Run the rule and produce one [`Diagnostic`] per match — the surface
314    /// the linter (Epic C) and the LSP convert into `LintDiagnostic` /
315    /// `FixEdit`. Each diagnostic carries the interpolated fix (if any) and
316    /// its applicability tier.
317    pub fn diagnostics(&self, source: &str) -> Result<Vec<Diagnostic>, RulesError> {
318        let applicability = self.applicability();
319        let matches = self.run(source)?;
320        Ok(matches
321            .iter()
322            .map(|m| Diagnostic {
323                rule_id: self.rule_id.clone(),
324                message: self.message.clone(),
325                severity: self.severity,
326                span: m.span,
327                applicability,
328                fix: self.fix.as_ref().map(|template| {
329                    let vars = self.metavars_for(m);
330                    interpolate(template, &vars)
331                }),
332            })
333            .collect())
334    }
335
336    /// The core rewrite: run the rule and splice each match's interpolated
337    /// fix. Returns the rewritten text and the edits.
338    fn rewrite(&self, source: &str) -> Result<(String, Vec<AppliedEdit>), RulesError> {
339        let template = self
340            .fix
341            .as_ref()
342            .ok_or_else(|| RulesError::PatternCompile {
343                rule: self.rule_id.clone(),
344                message: "apply requires a `fix` template; this rule has none".into(),
345            })?;
346
347        let matches = dedupe_overlapping(self.run(source)?);
348        let edits: Vec<AppliedEdit> = matches
349            .iter()
350            .map(|m| {
351                let vars = self.metavars_for(m);
352                AppliedEdit {
353                    span: m.span,
354                    before: m.text.clone(),
355                    replacement: interpolate(template, &vars),
356                }
357            })
358            .collect();
359        Ok((splice(source, &edits), edits))
360    }
361
362    /// Build the full metavar map for a match: captured bindings plus the
363    /// `transform`-synthesized metavars (which may shadow captures).
364    fn metavars_for(&self, m: &RuleMatch) -> BTreeMap<String, String> {
365        let mut vars: BTreeMap<String, String> = m
366            .bindings
367            .iter()
368            .map(|(name, binding)| (name.clone(), binding.text.clone()))
369            .collect();
370        for (name, transform) in &self.transforms {
371            let input = m
372                .bindings
373                .get(&transform.source)
374                .map(|b| b.text.as_str())
375                .unwrap_or("");
376            vars.insert(name.clone(), transform.apply(input));
377        }
378        vars
379    }
380
381    fn run_regex(&self, regex: &regex::Regex, source: &str) -> Vec<RuleMatch> {
382        let mut matches = Vec::new();
383        for m in regex.find_iter(source) {
384            let span = byte_span(source, m.start(), m.end());
385            matches.push(RuleMatch {
386                rule_id: self.rule_id.clone(),
387                span,
388                text: m.as_str().to_string(),
389                bindings: BTreeMap::new(),
390            });
391        }
392        matches
393    }
394}
395
396/// Drop matches that overlap an already-kept match, keeping the **outermost**
397/// (and, among equal extents, the first). A tree query naturally yields nested
398/// matches — e.g. `$X + $Y` matches both `(a+b)+c` and its inner `a+b` — and
399/// splicing both would apply overlapping byte edits, corrupting the output or
400/// panicking `replace_range` on a stale offset. A codemod must rewrite each
401/// region once, so we keep the enclosing match and discard anything nested in
402/// or straddling it. Matches are returned in document (start-byte) order.
403fn dedupe_overlapping(mut matches: Vec<RuleMatch>) -> Vec<RuleMatch> {
404    // Outermost-first: smallest start, then largest end (widest span wins a tie
405    // on start). After filtering we restore document order.
406    matches.sort_by(|a, b| {
407        a.span
408            .start_byte
409            .cmp(&b.span.start_byte)
410            .then(b.span.end_byte.cmp(&a.span.end_byte))
411    });
412    let mut kept: Vec<RuleMatch> = Vec::with_capacity(matches.len());
413    let mut covered_to = 0usize; // exclusive end of the last kept span
414    for m in matches {
415        // Keep only when this match starts at or after the end of the last kept
416        // one (adjacency is fine; any earlier start means it is nested in, or
417        // straddles, an already-kept span).
418        if m.span.start_byte >= covered_to {
419            covered_to = m.span.end_byte.max(covered_to);
420            kept.push(m);
421        }
422    }
423    kept
424}
425
426/// Compute a [`Span`] for a byte range by counting rows/cols. Used by the
427/// regex matcher, which has no tree-sitter node to read positions from.
428fn byte_span(source: &str, start: usize, end: usize) -> Span {
429    let (start_row, start_col) = row_col(source, start);
430    let (end_row, end_col) = row_col(source, end);
431    Span {
432        start_byte: start,
433        end_byte: end,
434        start_row,
435        start_col,
436        end_row,
437        end_col,
438    }
439}
440
441fn row_col(source: &str, byte: usize) -> (usize, usize) {
442    let mut row = 0;
443    let mut col = 0;
444    for (i, ch) in source.char_indices() {
445        if i >= byte {
446            break;
447        }
448        if ch == '\n' {
449            row += 1;
450            col = 0;
451        } else {
452            col += 1;
453        }
454    }
455    (row, col)
456}
457
458#[cfg(test)]
459mod tests {
460    use super::*;
461    use crate::model::Rule;
462
463    fn rule(toml: &str) -> CompiledRule {
464        let parsed = Rule::from_toml_str(toml).expect("rule parses");
465        CompiledRule::compile(&parsed).expect("rule compiles")
466    }
467
468    #[test]
469    fn pattern_rule_binds_metavars() {
470        let compiled = rule(
471            r#"
472            id = "destructure-default"
473            language = "typescript"
474            fix = "{ $KEY: $SRC }"
475            [rule]
476            pattern = "$SRC?.$KEY ?? $DEFAULT"
477            "#,
478        );
479        let matches = compiled
480            .run("const a = cfg?.timeout ?? 30;\nconst b = opts?.retries ?? 3;\n")
481            .unwrap();
482        assert_eq!(matches.len(), 2);
483        assert_eq!(matches[0].bindings["SRC"].text, "cfg");
484        assert_eq!(matches[0].bindings["KEY"].text, "timeout");
485        assert_eq!(matches[0].bindings["DEFAULT"].text, "30");
486        assert_eq!(matches[1].bindings["SRC"].text, "opts");
487        // The match span covers the whole expression.
488        assert_eq!(matches[0].text, "cfg?.timeout ?? 30");
489        assert_eq!(matches[0].span.start_row, 0);
490        assert_eq!(matches[1].span.start_row, 1);
491    }
492
493    #[test]
494    fn nested_matches_do_not_corrupt_or_panic_on_apply() {
495        // `$X + $Y` matches the outer `(a+b)+c` AND the inner `a+b` (distinct
496        // spans). Without overlap resolution, splicing both byte-edits panics
497        // `replace_range` on a stale offset or corrupts the output. The engine
498        // must keep the outermost match and rewrite the region exactly once.
499        let compiled = rule(
500            r#"
501            id = "sum-binop"
502            language = "typescript"
503            fix = "sum($X, $Y)"
504            [rule]
505            pattern = "$X + $Y"
506            "#,
507        );
508        // More than one match exists (outer + inner) before dedup.
509        assert!(compiled.run("const z = a + b + c;\n").unwrap().len() >= 2);
510        let result = compiled.apply("const z = a + b + c;\n").unwrap();
511        // Exactly one outer rewrite; inner `a + b` survives verbatim inside $X.
512        assert_eq!(result.rewritten, "const z = sum(a + b, c);\n");
513        assert_eq!(result.edits.len(), 1);
514        assert!(result.changed);
515    }
516
517    #[test]
518    fn dedupe_overlapping_keeps_outermost_in_document_order() {
519        let span = |s: usize, e: usize| Span {
520            start_byte: s,
521            end_byte: e,
522            start_row: 0,
523            start_col: s,
524            end_row: 0,
525            end_col: e,
526        };
527        let m = |s: usize, e: usize| RuleMatch {
528            rule_id: "r".into(),
529            span: span(s, e),
530            text: String::new(),
531            bindings: BTreeMap::new(),
532        };
533        // outer [0,9) contains inner [0,5); [10,14) is disjoint.
534        let kept = dedupe_overlapping(vec![m(0, 5), m(0, 9), m(10, 14)]);
535        let spans: Vec<_> = kept
536            .iter()
537            .map(|m| (m.span.start_byte, m.span.end_byte))
538            .collect();
539        assert_eq!(spans, vec![(0, 9), (10, 14)]);
540    }
541
542    #[test]
543    fn kind_rule_matches_node_kind() {
544        let compiled = rule(
545            r#"
546            id = "find-calls"
547            language = "python"
548            [rule]
549            kind = "call"
550            "#,
551        );
552        let matches = compiled.run("print(x)\nlog(y)\n").unwrap();
553        assert_eq!(matches.len(), 2);
554        assert_eq!(matches[0].text, "print(x)");
555        assert!(matches[0].bindings.is_empty());
556    }
557
558    #[test]
559    fn regex_rule_matches_text() {
560        let compiled = rule(
561            r#"
562            id = "todo"
563            language = "rust"
564            message = "Found a TODO"
565            [rule]
566            regex = "TODO\\(\\w+\\)"
567            "#,
568        );
569        let matches = compiled
570            .run("fn f() {\n    // TODO(ken) fix\n    // todo lower\n}\n")
571            .unwrap();
572        assert_eq!(matches.len(), 1);
573        assert_eq!(matches[0].text, "TODO(ken)");
574        assert_eq!(matches[0].span.start_row, 1);
575    }
576
577    #[test]
578    fn unknown_language_is_an_error() {
579        let parsed = Rule::from_toml_str(
580            r#"
581            id = "x"
582            language = "cobol"
583            [rule]
584            kind = "foo"
585            "#,
586        )
587        .unwrap();
588        assert!(matches!(
589            CompiledRule::compile(&parsed),
590            Err(RulesError::UnknownLanguage { .. })
591        ));
592    }
593
594    #[test]
595    fn invalid_pattern_surfaces_compile_error() {
596        let parsed = Rule::from_toml_str(
597            r#"
598            id = "x"
599            language = "typescript"
600            [rule]
601            pattern = "foo($$$ARGS)"
602            "#,
603        )
604        .unwrap();
605        assert!(matches!(
606            CompiledRule::compile(&parsed),
607            Err(RulesError::PatternCompile { .. })
608        ));
609    }
610}