libmagic_rs/evaluator/
mod.rs

1// Copyright (c) 2025-2026 the libmagic-rs contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Rule evaluation engine
5//!
6//! This module provides the public interface for magic rule evaluation,
7//! including data types for evaluation state and match results, and
8//! re-exports the core evaluation functions from submodules.
9
10use crate::{EvaluationConfig, LibmagicError};
11use serde::Serialize;
12
13mod engine;
14pub mod offset;
15pub mod operators;
16pub mod strength;
17pub mod types;
18
19pub use engine::{evaluate_rules, evaluate_rules_with_config, evaluate_single_rule};
20
21/// Shared environment attached to an [`EvaluationContext`] so the engine can
22/// resolve whole-database operations (currently: `Use` subroutine lookups;
23/// eventually `indirect` whole-tree re-entry).
24///
25/// Stored as an `Arc` so cloning a context across recursive calls is cheap
26/// and the rule data can be shared safely across threads.
27#[derive(Debug, Clone)]
28pub(crate) struct RuleEnvironment {
29    /// Named subroutine table, keyed by identifier.
30    pub(crate) name_table: std::sync::Arc<crate::parser::name_table::NameTable>,
31    /// Top-level rule list retained for future whole-database operations.
32    #[allow(dead_code)]
33    pub(crate) root_rules: std::sync::Arc<[crate::parser::ast::MagicRule]>,
34}
35
36/// Context for maintaining evaluation state during rule processing
37///
38/// The `EvaluationContext` tracks the current state of rule evaluation,
39/// including the current offset position, recursion depth for nested rules,
40/// and configuration settings that control evaluation behavior.
41///
42/// # Examples
43///
44/// ```rust
45/// use libmagic_rs::evaluator::EvaluationContext;
46/// use libmagic_rs::EvaluationConfig;
47///
48/// let config = EvaluationConfig::default();
49/// let context = EvaluationContext::new(config);
50///
51/// assert_eq!(context.current_offset(), 0);
52/// assert_eq!(context.recursion_depth(), 0);
53/// ```
54#[derive(Debug, Clone)]
55#[non_exhaustive]
56pub struct EvaluationContext {
57    /// Current offset position in the file buffer
58    current_offset: usize,
59    /// End offset of the most recent successful match.
60    ///
61    /// This is the GNU `file`/libmagic anchor used to resolve relative
62    /// (`&+N` / `&-N`) offsets. It is updated to the end of the most
63    /// recently matched rule -- the value may *increase or decrease* as
64    /// successive rules match at different positions; it is not a
65    /// high-watermark. A fresh context starts with this set to 0, which
66    /// matches libmagic's behavior of resolving top-level relative offsets
67    /// from the file start.
68    last_match_end: usize,
69    /// Current recursion depth for nested rule evaluation
70    recursion_depth: u32,
71    /// Configuration settings for evaluation behavior
72    config: EvaluationConfig,
73    /// Optional rule environment (name table + root rules) threaded from
74    /// [`MagicDatabase`](crate::MagicDatabase). Evaluations that come in
75    /// through the low-level [`evaluate_rules`] / [`evaluate_rules_with_config`]
76    /// surface (tests, programmatic consumers) run with `rule_env = None`,
77    /// in which case `MetaType::Use` rules are silent no-ops.
78    rule_env: Option<std::sync::Arc<RuleEnvironment>>,
79    /// Base offset applied to absolute offset resolution.
80    ///
81    /// Normally 0. When evaluating a subroutine body via `MetaType::Use`,
82    /// this is set to the use-site offset so that the subroutine's
83    /// `OffsetSpec::Absolute(n)` rules resolve to `base + n` (matching
84    /// magic(5) / libmagic semantics: subroutines see offsets relative
85    /// to the caller's invocation point, not absolute file positions).
86    /// Restored to the caller's value on subroutine exit via the
87    /// `SubroutineScope` RAII guard in `engine/mod.rs`, which saves
88    /// and restores both `last_match_end` and `base_offset` together.
89    base_offset: usize,
90    /// One-shot flag set by `MetaType::Indirect` dispatch before
91    /// re-entering the root rule list. When true, the next entry to
92    /// `evaluate_rules` treats the iteration as a top-level sibling
93    /// chain (anchor chains across siblings per GOTCHAS S3.8) rather
94    /// than as a continuation list (anchor resets between siblings).
95    /// Consumed at entry — children of a matched rule inside the
96    /// re-entry see the flag cleared, so their own continuation-reset
97    /// semantics kick in via the `recursion_depth > 0` gate.
98    ///
99    /// Without this flag, `indirect` wrapping re-entry under
100    /// `RecursionGuard` forces `recursion_depth > 0`, which forces
101    /// continuation-reset semantics on the root rule list — wrong,
102    /// because top-level rules in the re-entered database should
103    /// chain sibling anchors like any other top-level evaluation.
104    indirect_reentry: bool,
105}
106
107impl EvaluationContext {
108    /// Create a new evaluation context with the given configuration
109    ///
110    /// # Arguments
111    ///
112    /// * `config` - Configuration settings for evaluation behavior
113    ///
114    /// # Examples
115    ///
116    /// ```rust
117    /// use libmagic_rs::evaluator::EvaluationContext;
118    /// use libmagic_rs::EvaluationConfig;
119    ///
120    /// let config = EvaluationConfig::default();
121    /// let context = EvaluationContext::new(config);
122    /// ```
123    #[must_use]
124    pub fn new(mut config: EvaluationConfig) -> Self {
125        // Defensive clamp on `max_string_length`: `EvaluationConfig::validate()`
126        // rejects 0, but callers can bypass validation by setting the field
127        // via struct-literal syntax (or via the `with_max_string_length`
128        // builder, which doesn't validate). Without this clamp, a `cap = 0`
129        // would silently produce zero-byte reads on every scan-mode `string x`
130        // rule and disable the CWE-770 control documented at this field.
131        //
132        // The clamp rewrites an invalid 0 to
133        // `crate::evaluator::types::DEFAULT_MAX_STRING_LENGTH` (8192,
134        // matching `EvaluationConfig::default()`). A `warn!` records the
135        // correction so embedders see it in logs. Closes PR #304 review
136        // finding SF-1.
137        if config.max_string_length == 0 {
138            log::warn!(
139                "EvaluationContext::new received max_string_length=0 \
140                 (likely a struct-literal or builder bypass of \
141                 EvaluationConfig::validate); clamping to {} (the documented \
142                 default). Construct the config via EvaluationConfig::new() \
143                 / EvaluationConfig::default() and use the with_* builders \
144                 to avoid this warning.",
145                crate::evaluator::types::DEFAULT_MAX_STRING_LENGTH,
146            );
147            config.max_string_length = crate::evaluator::types::DEFAULT_MAX_STRING_LENGTH;
148        }
149        Self {
150            current_offset: 0,
151            last_match_end: 0,
152            recursion_depth: 0,
153            config,
154            rule_env: None,
155            base_offset: 0,
156            indirect_reentry: false,
157        }
158    }
159
160    /// Read-only access to the subroutine base offset. Non-zero only
161    /// during a `MetaType::Use` body evaluation.
162    #[must_use]
163    pub(crate) const fn base_offset(&self) -> usize {
164        self.base_offset
165    }
166
167    /// Set the subroutine base offset.
168    ///
169    /// `pub(crate)` and owned by the engine's `SubroutineScope` RAII
170    /// guard -- no external caller should set this directly.
171    pub(crate) fn set_base_offset(&mut self, offset: usize) {
172        self.base_offset = offset;
173    }
174
175    /// Read-and-clear the indirect-reentry flag. Used by `evaluate_rules`
176    /// at entry to decide whether the iteration is a top-level re-entry
177    /// (no anchor reset between siblings) or a continuation list (reset
178    /// between siblings). Cleared on read so children of a matched rule
179    /// inside the re-entry see the flag as false and fall back to the
180    /// `recursion_depth > 0` gate for their own continuation semantics.
181    pub(crate) fn take_indirect_reentry(&mut self) -> bool {
182        std::mem::take(&mut self.indirect_reentry)
183    }
184
185    /// Set the indirect-reentry flag.
186    ///
187    /// `pub(crate)` and owned by the `MetaType::Indirect` dispatch in
188    /// `engine/mod.rs`. Callers should set this true exactly once
189    /// before invoking `evaluate_rules` on the root rule list.
190    pub(crate) fn set_indirect_reentry(&mut self, flag: bool) {
191        self.indirect_reentry = flag;
192    }
193
194    /// Attach a rule environment to this context.
195    ///
196    /// The environment carries the name-subroutine table and root rule list
197    /// so the engine can resolve `MetaType::Use` rules and (eventually)
198    /// `MetaType::Indirect` re-entries. Intended to be called once by
199    /// [`MagicDatabase`](crate::MagicDatabase) before handing the context
200    /// to [`evaluate_rules`].
201    #[must_use]
202    pub(crate) fn with_rule_env(mut self, env: std::sync::Arc<RuleEnvironment>) -> Self {
203        self.rule_env = Some(env);
204        self
205    }
206
207    /// Read-only access to the attached rule environment, if any.
208    #[must_use]
209    pub(crate) fn rule_env(&self) -> Option<&RuleEnvironment> {
210        self.rule_env.as_deref()
211    }
212
213    /// Get the current offset position
214    ///
215    /// # Returns
216    ///
217    /// The current offset position in the file buffer
218    #[must_use]
219    pub const fn current_offset(&self) -> usize {
220        self.current_offset
221    }
222
223    /// Set the current offset position
224    ///
225    /// # Arguments
226    ///
227    /// * `offset` - The new offset position
228    pub fn set_current_offset(&mut self, offset: usize) {
229        self.current_offset = offset;
230    }
231
232    /// Get the end offset of the most recent successful match.
233    ///
234    /// This is the GNU `file`/libmagic anchor used to resolve relative
235    /// (`&+N` / `&-N`) offset specifications. A fresh context returns 0,
236    /// which makes top-level relative offsets resolve from the file start.
237    ///
238    /// `pub(crate)` because the anchor is an internal engine detail; external
239    /// consumers should not couple to it.
240    #[must_use]
241    pub(crate) const fn last_match_end(&self) -> usize {
242        self.last_match_end
243    }
244
245    /// Set the end offset of the most recent successful match.
246    ///
247    /// Called by the evaluation engine after a rule matches, to advance the
248    /// anchor used by subsequent relative offset resolution. The new value
249    /// is typically `match_offset + bytes_consumed_by_type`.
250    ///
251    /// `pub(crate)` because external callers should not be able to inject
252    /// arbitrary anchor state. External callers that need to clear the
253    /// anchor between buffer evaluations should call
254    /// `EvaluationContext::reset()`, which resets the anchor, current
255    /// offset, and recursion depth together.
256    pub(crate) fn set_last_match_end(&mut self, offset: usize) {
257        self.last_match_end = offset;
258    }
259
260    /// Get the current recursion depth
261    ///
262    /// # Returns
263    ///
264    /// The current recursion depth for nested rule evaluation
265    #[must_use]
266    pub const fn recursion_depth(&self) -> u32 {
267        self.recursion_depth
268    }
269
270    /// Increment the recursion depth
271    ///
272    /// # Returns
273    ///
274    /// `Ok(())` if the recursion depth is within limits, or `Err(LibmagicError)`
275    /// if the maximum recursion depth would be exceeded
276    ///
277    /// # Errors
278    ///
279    /// Returns `LibmagicError::EvaluationError` if incrementing would exceed
280    /// the maximum recursion depth configured in the evaluation config.
281    pub(crate) fn increment_recursion_depth(&mut self) -> Result<(), LibmagicError> {
282        if self.recursion_depth >= self.config.max_recursion_depth {
283            return Err(LibmagicError::EvaluationError(
284                crate::error::EvaluationError::recursion_limit_exceeded(self.recursion_depth),
285            ));
286        }
287        self.recursion_depth += 1;
288        Ok(())
289    }
290
291    /// Decrement the recursion depth
292    ///
293    /// # Errors
294    ///
295    /// Returns an error if the recursion depth is already 0, as this indicates
296    /// a programming error in the evaluation logic (mismatched increment/decrement calls).
297    pub(crate) fn decrement_recursion_depth(&mut self) -> Result<(), LibmagicError> {
298        if self.recursion_depth == 0 {
299            return Err(LibmagicError::EvaluationError(
300                crate::error::EvaluationError::internal_error(
301                    "Attempted to decrement recursion depth below 0",
302                ),
303            ));
304        }
305        self.recursion_depth -= 1;
306        Ok(())
307    }
308
309    /// Get a reference to the evaluation configuration
310    ///
311    /// # Returns
312    ///
313    /// A reference to the `EvaluationConfig` used by this context
314    #[must_use]
315    pub const fn config(&self) -> &EvaluationConfig {
316        &self.config
317    }
318
319    /// Check if evaluation should stop at the first match
320    ///
321    /// # Returns
322    ///
323    /// `true` if evaluation should stop at the first match, `false` otherwise
324    #[must_use]
325    pub const fn should_stop_at_first_match(&self) -> bool {
326        self.config.stop_at_first_match
327    }
328
329    /// Get the maximum string length allowed for scan-mode string reads.
330    ///
331    /// Threaded into both string-read dispatchers
332    /// (`read_typed_value_with_pattern` for the unflagged `(None, _)` arm
333    /// and `read_pattern_match` for the flagged `/c`/`/C`/`/w`/`/W`/`/T`/`/f`
334    /// arm) so they cap the buffer-length allocation against this value.
335    /// Does NOT apply to `TypeKind::PString` (which errors on oversized
336    /// length prefixes per GOTCHAS S6.1) or `TypeKind::String16` (capped
337    /// at a hardcoded `STRING16_MAX_UNITS = 8192` ceiling).
338    ///
339    /// # Returns
340    ///
341    /// The configured `max_string_length` (default 8192 bytes per
342    /// `EvaluationConfig::default()`).
343    #[must_use]
344    pub const fn max_string_length(&self) -> usize {
345        self.config.max_string_length
346    }
347
348    /// Check if MIME type mapping is enabled
349    ///
350    /// # Returns
351    ///
352    /// `true` if MIME type mapping should be performed, `false` otherwise
353    #[must_use]
354    pub const fn enable_mime_types(&self) -> bool {
355        self.config.enable_mime_types
356    }
357
358    /// Get the evaluation timeout in milliseconds
359    ///
360    /// # Returns
361    ///
362    /// The timeout duration in milliseconds, or `None` if no timeout is set
363    #[must_use]
364    pub const fn timeout_ms(&self) -> Option<u64> {
365        self.config.timeout_ms
366    }
367
368    /// Reset the context to initial state while preserving configuration
369    ///
370    /// This resets the current offset and recursion depth to 0, but keeps
371    /// the same configuration settings.
372    pub fn reset(&mut self) {
373        self.current_offset = 0;
374        self.last_match_end = 0;
375        self.recursion_depth = 0;
376        self.base_offset = 0;
377        self.indirect_reentry = false;
378    }
379}
380
381/// RAII guard that increments recursion depth on entry and decrements on drop.
382///
383/// Replaces the manual `increment_recursion_depth` / `decrement_recursion_depth`
384/// pair with a scope-based guard, eliminating the risk of mismatched calls and
385/// the need to swallow cleanup errors on error-return paths.
386///
387/// Obtain a guard via [`RecursionGuard::enter`], which borrows the context
388/// mutably for the guard's lifetime. Use [`RecursionGuard::context`] to access
389/// the borrowed context for the duration of the recursive call. The guard
390/// automatically decrements the recursion depth when it goes out of scope.
391///
392/// The guard is `pub(crate)` because recursion-depth management is an internal
393/// detail of the evaluation engine.
394pub(crate) struct RecursionGuard<'a> {
395    context: &'a mut EvaluationContext,
396}
397
398impl<'a> RecursionGuard<'a> {
399    /// Enter a new recursion level, incrementing the context's recursion depth.
400    ///
401    /// # Errors
402    ///
403    /// Returns `LibmagicError::EvaluationError` if incrementing would exceed
404    /// the maximum recursion depth configured in the evaluation config.
405    pub(crate) fn enter(context: &'a mut EvaluationContext) -> Result<Self, LibmagicError> {
406        context.increment_recursion_depth()?;
407        Ok(Self { context })
408    }
409
410    /// Access the underlying context for the duration of the guard.
411    pub(crate) fn context(&mut self) -> &mut EvaluationContext {
412        self.context
413    }
414}
415
416impl Drop for RecursionGuard<'_> {
417    fn drop(&mut self) {
418        // Safe to ignore: `decrement_recursion_depth` only fails when the
419        // depth is already 0, which is impossible here because `enter` just
420        // incremented it and the depth is only mutated through guard pairs.
421        let result = self.context.decrement_recursion_depth();
422        debug_assert!(
423            result.is_ok(),
424            "RecursionGuard invariant violated: decrement failed after successful enter()"
425        );
426    }
427}
428
429/// Result of evaluating a magic rule
430///
431/// Contains information extracted from a successful rule match, including
432/// the matched value, position, and confidence score.
433///
434/// This type derives `Serialize` so callers can convert evaluation results
435/// to JSON, but intentionally does NOT derive `Deserialize`: a
436/// reconstructed `RuleMatch` would lack the buffer context it was
437/// produced against, so deserialization is not a meaningful operation.
438/// The output-side conversion layer (`output::MatchResult` /
439/// `output::json::JsonMatchResult`) is the documented JSON contract.
440#[derive(Debug, Clone, PartialEq, Serialize)]
441#[non_exhaustive]
442pub struct RuleMatch {
443    /// The message associated with the matching rule
444    pub message: String,
445    /// The offset where the match occurred
446    pub offset: usize,
447    /// The rule level (depth in hierarchy)
448    pub level: u32,
449    /// The matched value
450    pub value: crate::parser::ast::Value,
451    /// The type used to read the matched value
452    ///
453    /// Carries the source `TypeKind` so downstream consumers (e.g., output
454    /// formatting) can determine the on-disk width of the matched value.
455    ///
456    /// `#[serde(skip)]` keeps the parser AST out of JSON output produced
457    /// by serializing `EvaluationResult` directly via
458    /// `serde_json::to_string(&result)`. The documented JSON contract is
459    /// `JsonMatchResult` in `src/output/json.rs`, which omits this field.
460    /// Origin findings 1B-H2 / 2A-M1 (CWE-200 information exposure).
461    /// Rust-side consumers continue to access `type_kind` via field access
462    /// for runtime needs (`format_magic_message` width-masking,
463    /// `bit_width()` derivation).
464    #[serde(skip)]
465    pub type_kind: crate::parser::ast::TypeKind,
466    /// Confidence score (0.0 to 1.0)
467    ///
468    /// Calculated based on match depth in the rule hierarchy.
469    /// Deeper matches indicate more specific file type identification
470    /// and thus higher confidence.
471    pub confidence: f64,
472}
473
474impl RuleMatch {
475    /// Construct a new `RuleMatch`.
476    ///
477    /// `confidence` is typically derived from `level` via
478    /// [`RuleMatch::calculate_confidence`]; pass it explicitly here so
479    /// callers can supply an alternative score when needed (e.g. when
480    /// post-processing a series of matches).
481    #[must_use]
482    pub fn new(
483        message: String,
484        offset: usize,
485        level: u32,
486        value: crate::parser::ast::Value,
487        type_kind: crate::parser::ast::TypeKind,
488        confidence: f64,
489    ) -> Self {
490        Self {
491            message,
492            offset,
493            level,
494            value,
495            type_kind,
496            confidence,
497        }
498    }
499
500    /// Calculate confidence score based on rule depth
501    ///
502    /// Formula: min(1.0, 0.3 + (level * 0.2))
503    /// - Level 0 (root): 0.3
504    /// - Level 1: 0.5
505    /// - Level 2: 0.7
506    /// - Level 3: 0.9
507    /// - Level 4+: 1.0 (capped)
508    ///
509    /// # Examples
510    ///
511    /// ```
512    /// use libmagic_rs::evaluator::RuleMatch;
513    ///
514    /// assert!((RuleMatch::calculate_confidence(0) - 0.3).abs() < 0.001);
515    /// assert!((RuleMatch::calculate_confidence(3) - 0.9).abs() < 0.001);
516    /// assert!((RuleMatch::calculate_confidence(10) - 1.0).abs() < 0.001);
517    /// ```
518    #[must_use]
519    pub fn calculate_confidence(level: u32) -> f64 {
520        (0.3 + (f64::from(level) * 0.2)).min(1.0)
521    }
522}
523
524#[cfg(test)]
525mod tests;
libmagic_rs/evaluator/mod.rs

libmagic_rs/evaluator/
mod.rs