Skip to main content

syara_x/
compiled_rules.rs

//! Execution engine for compiled rules.
//!
//! Runs matchers in cost order: strings → similarity → phash → classifier → LLM.
//! LLM calls are short-circuited via `is_identifier_needed`.
5use std::collections::HashMap;
6use std::path::Path;
7
8use crate::cache::TextCache;
9use crate::condition;
10use crate::config::Registry;
11use crate::engine::string_matcher::StringMatcher;
12#[cfg(any(feature = "sbert", feature = "phash", feature = "classifier", feature = "llm", feature = "burn-llm"))]
13use crate::error::SyaraError;
14use crate::models::{Match, MatchDetail, Rule};
15
16pub struct CompiledRules {
17    pub(crate) rules: Vec<Rule>,
18    pub(crate) registry: Registry,
19}
20
21impl CompiledRules {
22    /// Number of compiled rules.
23    pub fn rule_count(&self) -> usize {
24        self.rules.len()
25    }
26
27    /// Iterate over the names of every compiled rule in source order.
28    pub fn rule_names(&self) -> impl Iterator<Item = &str> {
29        self.rules.iter().map(|r| r.name.as_str())
30    }
31}
32
33impl CompiledRules {
34    pub(crate) fn new(rules: Vec<Rule>, registry: Registry) -> Self {
35        Self { rules, registry }
36    }
37
38    /// Match text against all text-based rules.
39    /// PHash rules are skipped (require a file path).
40    pub fn scan(&self, text: &str) -> Vec<Match> {
41        let mut cache = TextCache::new();
42        let mut string_matcher = StringMatcher::new();
43        let mut results = Vec::with_capacity(self.rules.len());
44
45        for rule in &self.rules {
46            let m = self.execute_rule(rule, text, None, &mut cache, &mut string_matcher);
47            results.push(m);
48        }
49
50        cache.clear();
51        #[cfg(any(feature = "llm", feature = "burn-llm"))]
52        self.registry.clear_llm_caches();
53        results
54    }
55
56    /// Match a file against rules that contain phash patterns.
57    ///
58    /// BUG-008: reads file content as text so string patterns also match
59    /// in rules that combine phash + string conditions.
60    pub fn scan_file(&self, path: &Path) -> Vec<Match> {
61        let mut cache = TextCache::new();
62        let mut string_matcher = StringMatcher::new();
63        let mut results = Vec::new();
64
65        // Read file content once for string matching
66        let text = std::fs::read(path)
67            .map(|bytes| String::from_utf8_lossy(&bytes).into_owned())
68            .unwrap_or_default();
69
70        for rule in &self.rules {
71            if !rule.phash.is_empty() {
72                let m =
73                    self.execute_rule(rule, &text, Some(path), &mut cache, &mut string_matcher);
74                results.push(m);
75            }
76        }
77
78        cache.clear();
79        #[cfg(any(feature = "llm", feature = "burn-llm"))]
80        self.registry.clear_llm_caches();
81        results
82    }
83
84    #[allow(unused_variables)]
85    fn execute_rule(
86        &self,
87        rule: &Rule,
88        text: &str,
89        file_path: Option<&Path>,
90        cache: &mut TextCache,
91        string_matcher: &mut StringMatcher,
92    ) -> Match {
93        // Initialise every declared identifier with an empty list so that
94        // `all of them` and `all of ($prefix*)` correctly see failed identifiers.
95        let mut pattern_matches: HashMap<String, Vec<MatchDetail>> = HashMap::new();
96        for r in &rule.strings {
97            pattern_matches.insert(r.identifier.clone(), vec![]);
98        }
99        for r in &rule.similarity {
100            pattern_matches.insert(r.identifier.clone(), vec![]);
101        }
102        for r in &rule.phash {
103            pattern_matches.insert(r.identifier.clone(), vec![]);
104        }
105        for r in &rule.classifier {
106            pattern_matches.insert(r.identifier.clone(), vec![]);
107        }
108        for r in &rule.llm {
109            pattern_matches.insert(r.identifier.clone(), vec![]);
110        }
111
112        // 1. String patterns (cheapest)
113        for string_rule in &rule.strings {
114            match string_matcher.match_rule(string_rule, text) {
115                Ok(hits) if !hits.is_empty() => {
116                    pattern_matches.insert(string_rule.identifier.clone(), hits);
117                }
118                _ => {}
119            }
120        }
121
122        // 2. Similarity patterns (moderate cost) — requires sbert feature
123        #[cfg(feature = "sbert")]
124        for sim_rule in &rule.similarity {
125            if let Ok(hits) = self.execute_similarity(sim_rule, text, cache) {
126                if !hits.is_empty() {
127                    pattern_matches.insert(sim_rule.identifier.clone(), hits);
128                }
129            }
130        }
131
132        // 3. PHash patterns (moderate-to-high cost)
133        #[cfg(feature = "phash")]
134        if let Some(fp) = file_path {
135            for phash_rule in &rule.phash {
136                if let Ok(hits) = self.execute_phash(phash_rule, fp) {
137                    if !hits.is_empty() {
138                        pattern_matches.insert(phash_rule.identifier.clone(), hits);
139                    }
140                }
141            }
142        }
143
144        // 4. Classifier patterns (higher cost)
145        #[cfg(feature = "classifier")]
146        for cls_rule in &rule.classifier {
147            if let Ok(hits) = self.execute_classifier(cls_rule, text, cache) {
148                if !hits.is_empty() {
149                    pattern_matches.insert(cls_rule.identifier.clone(), hits);
150                }
151            }
152        }
153
154        // 5. LLM patterns (highest cost) — short-circuit if not needed
155        #[cfg(any(feature = "llm", feature = "burn-llm"))]
156        if let Some(ref expr) = rule.compiled_condition {
157            for llm_rule in &rule.llm {
158                if condition::is_identifier_needed(
159                    &llm_rule.identifier,
160                    expr,
161                    &pattern_matches,
162                ) {
163                    if let Ok(hits) = self.execute_llm(llm_rule, text, cache) {
164                        if !hits.is_empty() {
165                            pattern_matches.insert(llm_rule.identifier.clone(), hits);
166                        }
167                    }
168                }
169            }
170        }
171
172        // Evaluate condition using pre-compiled AST
173        let matched = self.evaluate_condition(rule, &pattern_matches);
174
175        Match {
176            rule_name: rule.name.clone(),
177            tags: rule.tags.clone(),
178            meta: rule.meta.clone(),
179            matched,
180            matched_patterns: if matched { pattern_matches } else { HashMap::new() },
181        }
182    }
183
184    fn evaluate_condition(
185        &self,
186        rule: &Rule,
187        pattern_matches: &HashMap<String, Vec<MatchDetail>>,
188    ) -> bool {
189        match rule.compiled_condition {
190            Some(ref expr) => condition::evaluate(expr, pattern_matches),
191            None => false,
192        }
193    }
194
195    // ── Feature-gated execution helpers ──────────────────────────────────────
196
197    #[cfg(feature = "sbert")]
198    fn execute_similarity(
199        &self,
200        rule: &crate::models::SimilarityRule,
201        text: &str,
202        cache: &mut TextCache,
203    ) -> Result<Vec<MatchDetail>, SyaraError> {
204        let cleaner = self.registry.get_cleaner(&rule.cleaner_name)?;
205        let chunker = self.registry.get_chunker(&rule.chunker_name)?;
206        let cleaned = get_cleaned(cache, text, cleaner, &rule.cleaner_name);
207        let chunks = chunker.chunk(&cleaned);
208        let matcher = self.registry.get_semantic_matcher(&rule.matcher_name)?;
209        matcher.match_chunks(rule, &chunks)
210    }
211
212    #[cfg(feature = "phash")]
213    fn execute_phash(
214        &self,
215        rule: &crate::models::PHashRule,
216        file_path: &Path,
217    ) -> Result<Vec<MatchDetail>, SyaraError> {
218        let matcher = self.registry.get_phash_matcher(&rule.phash_name)?;
219        matcher.match_rule(rule, file_path)
220    }
221
222    #[cfg(feature = "classifier")]
223    fn execute_classifier(
224        &self,
225        rule: &crate::models::ClassifierRule,
226        text: &str,
227        cache: &mut TextCache,
228    ) -> Result<Vec<MatchDetail>, SyaraError> {
229        let cleaner = self.registry.get_cleaner(&rule.cleaner_name)?;
230        let chunker = self.registry.get_chunker(&rule.chunker_name)?;
231        let cleaned = get_cleaned(cache, text, cleaner, &rule.cleaner_name);
232        let chunks = chunker.chunk(&cleaned);
233        let classifier = self.registry.get_classifier(&rule.classifier_name)?;
234        classifier.classify_chunks(rule, &chunks)
235    }
236
237    #[cfg(any(feature = "llm", feature = "burn-llm"))]
238    fn execute_llm(
239        &self,
240        rule: &crate::models::LLMRule,
241        text: &str,
242        cache: &mut TextCache,
243    ) -> Result<Vec<MatchDetail>, SyaraError> {
244        let cleaner = self.registry.get_cleaner(&rule.cleaner_name)?;
245        let chunker = self.registry.get_chunker(&rule.chunker_name)?;
246        let cleaned = get_cleaned(cache, text, cleaner, &rule.cleaner_name);
247        let chunks = chunker.chunk(&cleaned);
248        let evaluator = self.registry.get_llm_evaluator(&rule.llm_name)?;
249        evaluator.evaluate_chunks(rule, &chunks)
250    }
251
252    // ── Public registration API ───────────────────────────────────────────────
253
254    pub fn register_cleaner(
255        &mut self,
256        name: impl Into<String>,
257        cleaner: Box<dyn crate::engine::cleaner::TextCleaner>,
258    ) {
259        self.registry.register_cleaner(name, cleaner);
260    }
261
262    pub fn register_chunker(
263        &mut self,
264        name: impl Into<String>,
265        chunker: Box<dyn crate::engine::chunker::Chunker>,
266    ) {
267        self.registry.register_chunker(name, chunker);
268    }
269
270    #[cfg(feature = "sbert")]
271    pub fn register_semantic_matcher(
272        &mut self,
273        name: impl Into<String>,
274        matcher: Box<dyn crate::engine::semantic_matcher::SemanticMatcher>,
275    ) {
276        self.registry.register_semantic_matcher(name, matcher);
277    }
278
279    #[cfg(feature = "classifier")]
280    pub fn register_classifier(
281        &mut self,
282        name: impl Into<String>,
283        classifier: Box<dyn crate::engine::classifier::TextClassifier>,
284    ) {
285        self.registry.register_classifier(name, classifier);
286    }
287
288    #[cfg(any(feature = "llm", feature = "burn-llm"))]
289    pub fn register_llm_evaluator(
290        &mut self,
291        name: impl Into<String>,
292        evaluator: Box<dyn crate::engine::llm_evaluator::LLMEvaluator>,
293    ) {
294        self.registry.register_llm_evaluator(name, evaluator);
295    }
296
297    #[cfg(feature = "phash")]
298    pub fn register_phash_matcher(
299        &mut self,
300        name: impl Into<String>,
301        matcher: Box<dyn crate::engine::phash_matcher::PHashMatcher>,
302    ) {
303        self.registry.register_phash_matcher(name, matcher);
304    }
305}
306
307#[allow(dead_code)]
308/// Get cleaned text from cache, or clean and store.
309fn get_cleaned(
310    cache: &mut TextCache,
311    text: &str,
312    cleaner: &dyn crate::engine::cleaner::TextCleaner,
313    cleaner_name: &str,
314) -> String {
315    if let Some(cached) = cache.get(text, cleaner_name) {
316        return cached.to_owned();
317    }
318    let cleaned = cleaner.clean(text);
319    cache.insert(text, cleaner_name, cleaned.clone());
320    cleaned
321}