libmagic_rs/evaluator/mod.rs
1// Copyright (c) 2025-2026 the libmagic-rs contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Rule evaluation engine
5//!
6//! This module provides the public interface for magic rule evaluation,
7//! including data types for evaluation state and match results, and
8//! re-exports the core evaluation functions from submodules.
9
10use crate::{EvaluationConfig, LibmagicError};
11use serde::Serialize;
12
13mod engine;
14pub mod offset;
15pub mod operators;
16pub mod strength;
17pub mod types;
18
19pub use engine::{evaluate_rules, evaluate_rules_with_config, evaluate_single_rule};
20
21/// Shared environment attached to an [`EvaluationContext`] so the engine can
22/// resolve whole-database operations (currently: `Use` subroutine lookups;
23/// eventually `indirect` whole-tree re-entry).
24///
25/// Stored as an `Arc` so cloning a context across recursive calls is cheap
26/// and the rule data can be shared safely across threads.
27#[derive(Debug, Clone)]
28pub(crate) struct RuleEnvironment {
29 /// Named subroutine table, keyed by identifier.
30 pub(crate) name_table: std::sync::Arc<crate::parser::name_table::NameTable>,
31 /// Top-level rule list retained for future whole-database operations.
32 #[allow(dead_code)]
33 pub(crate) root_rules: std::sync::Arc<[crate::parser::ast::MagicRule]>,
34}
35
36/// Context for maintaining evaluation state during rule processing
37///
38/// The `EvaluationContext` tracks the current state of rule evaluation,
39/// including the current offset position, recursion depth for nested rules,
40/// and configuration settings that control evaluation behavior.
41///
42/// # Examples
43///
44/// ```rust
45/// use libmagic_rs::evaluator::EvaluationContext;
46/// use libmagic_rs::EvaluationConfig;
47///
48/// let config = EvaluationConfig::default();
49/// let context = EvaluationContext::new(config);
50///
51/// assert_eq!(context.current_offset(), 0);
52/// assert_eq!(context.recursion_depth(), 0);
53/// ```
54#[derive(Debug, Clone)]
55#[non_exhaustive]
56pub struct EvaluationContext {
57 /// Current offset position in the file buffer
58 current_offset: usize,
59 /// End offset of the most recent successful match.
60 ///
61 /// This is the GNU `file`/libmagic anchor used to resolve relative
62 /// (`&+N` / `&-N`) offsets. It is updated to the end of the most
63 /// recently matched rule -- the value may *increase or decrease* as
64 /// successive rules match at different positions; it is not a
65 /// high-watermark. A fresh context starts with this set to 0, which
66 /// matches libmagic's behavior of resolving top-level relative offsets
67 /// from the file start.
68 last_match_end: usize,
69 /// Current recursion depth for nested rule evaluation
70 recursion_depth: u32,
71 /// Configuration settings for evaluation behavior
72 config: EvaluationConfig,
73 /// Optional rule environment (name table + root rules) threaded from
74 /// [`MagicDatabase`](crate::MagicDatabase). Evaluations that come in
75 /// through the low-level [`evaluate_rules`] / [`evaluate_rules_with_config`]
76 /// surface (tests, programmatic consumers) run with `rule_env = None`,
77 /// in which case `MetaType::Use` rules are silent no-ops.
78 rule_env: Option<std::sync::Arc<RuleEnvironment>>,
79 /// Base offset applied to absolute offset resolution.
80 ///
81 /// Normally 0. When evaluating a subroutine body via `MetaType::Use`,
82 /// this is set to the use-site offset so that the subroutine's
83 /// `OffsetSpec::Absolute(n)` rules resolve to `base + n` (matching
84 /// magic(5) / libmagic semantics: subroutines see offsets relative
85 /// to the caller's invocation point, not absolute file positions).
86 /// Restored to the caller's value on subroutine exit via the
87 /// `SubroutineScope` RAII guard in `engine/mod.rs`, which saves
88 /// and restores both `last_match_end` and `base_offset` together.
89 base_offset: usize,
90 /// One-shot flag set by `MetaType::Indirect` dispatch before
91 /// re-entering the root rule list. When true, the next entry to
92 /// `evaluate_rules` treats the iteration as a top-level sibling
93 /// chain (anchor chains across siblings per GOTCHAS S3.8) rather
94 /// than as a continuation list (anchor resets between siblings).
95 /// Consumed at entry — children of a matched rule inside the
96 /// re-entry see the flag cleared, so their own continuation-reset
97 /// semantics kick in via the `recursion_depth > 0` gate.
98 ///
99 /// Without this flag, `indirect` wrapping re-entry under
100 /// `RecursionGuard` forces `recursion_depth > 0`, which forces
101 /// continuation-reset semantics on the root rule list — wrong,
102 /// because top-level rules in the re-entered database should
103 /// chain sibling anchors like any other top-level evaluation.
104 indirect_reentry: bool,
105}
106
107impl EvaluationContext {
108 /// Create a new evaluation context with the given configuration
109 ///
110 /// # Arguments
111 ///
112 /// * `config` - Configuration settings for evaluation behavior
113 ///
114 /// # Examples
115 ///
116 /// ```rust
117 /// use libmagic_rs::evaluator::EvaluationContext;
118 /// use libmagic_rs::EvaluationConfig;
119 ///
120 /// let config = EvaluationConfig::default();
121 /// let context = EvaluationContext::new(config);
122 /// ```
123 #[must_use]
124 pub fn new(mut config: EvaluationConfig) -> Self {
125 // Defensive clamp on `max_string_length`: `EvaluationConfig::validate()`
126 // rejects 0, but callers can bypass validation by setting the field
127 // via struct-literal syntax (or via the `with_max_string_length`
128 // builder, which doesn't validate). Without this clamp, a `cap = 0`
129 // would silently produce zero-byte reads on every scan-mode `string x`
130 // rule and disable the CWE-770 control documented at this field.
131 //
132 // The clamp rewrites an invalid 0 to
133 // `crate::evaluator::types::DEFAULT_MAX_STRING_LENGTH` (8192,
134 // matching `EvaluationConfig::default()`). A `warn!` records the
135 // correction so embedders see it in logs. Closes PR #304 review
136 // finding SF-1.
137 if config.max_string_length == 0 {
138 log::warn!(
139 "EvaluationContext::new received max_string_length=0 \
140 (likely a struct-literal or builder bypass of \
141 EvaluationConfig::validate); clamping to {} (the documented \
142 default). Construct the config via EvaluationConfig::new() \
143 / EvaluationConfig::default() and use the with_* builders \
144 to avoid this warning.",
145 crate::evaluator::types::DEFAULT_MAX_STRING_LENGTH,
146 );
147 config.max_string_length = crate::evaluator::types::DEFAULT_MAX_STRING_LENGTH;
148 }
149 Self {
150 current_offset: 0,
151 last_match_end: 0,
152 recursion_depth: 0,
153 config,
154 rule_env: None,
155 base_offset: 0,
156 indirect_reentry: false,
157 }
158 }
159
160 /// Read-only access to the subroutine base offset. Non-zero only
161 /// during a `MetaType::Use` body evaluation.
162 #[must_use]
163 pub(crate) const fn base_offset(&self) -> usize {
164 self.base_offset
165 }
166
167 /// Set the subroutine base offset.
168 ///
169 /// `pub(crate)` and owned by the engine's `SubroutineScope` RAII
170 /// guard -- no external caller should set this directly.
171 pub(crate) fn set_base_offset(&mut self, offset: usize) {
172 self.base_offset = offset;
173 }
174
175 /// Read-and-clear the indirect-reentry flag. Used by `evaluate_rules`
176 /// at entry to decide whether the iteration is a top-level re-entry
177 /// (no anchor reset between siblings) or a continuation list (reset
178 /// between siblings). Cleared on read so children of a matched rule
179 /// inside the re-entry see the flag as false and fall back to the
180 /// `recursion_depth > 0` gate for their own continuation semantics.
181 pub(crate) fn take_indirect_reentry(&mut self) -> bool {
182 std::mem::take(&mut self.indirect_reentry)
183 }
184
185 /// Set the indirect-reentry flag.
186 ///
187 /// `pub(crate)` and owned by the `MetaType::Indirect` dispatch in
188 /// `engine/mod.rs`. Callers should set this true exactly once
189 /// before invoking `evaluate_rules` on the root rule list.
190 pub(crate) fn set_indirect_reentry(&mut self, flag: bool) {
191 self.indirect_reentry = flag;
192 }
193
194 /// Attach a rule environment to this context.
195 ///
196 /// The environment carries the name-subroutine table and root rule list
197 /// so the engine can resolve `MetaType::Use` rules and (eventually)
198 /// `MetaType::Indirect` re-entries. Intended to be called once by
199 /// [`MagicDatabase`](crate::MagicDatabase) before handing the context
200 /// to [`evaluate_rules`].
201 #[must_use]
202 pub(crate) fn with_rule_env(mut self, env: std::sync::Arc<RuleEnvironment>) -> Self {
203 self.rule_env = Some(env);
204 self
205 }
206
207 /// Read-only access to the attached rule environment, if any.
208 #[must_use]
209 pub(crate) fn rule_env(&self) -> Option<&RuleEnvironment> {
210 self.rule_env.as_deref()
211 }
212
213 /// Get the current offset position
214 ///
215 /// # Returns
216 ///
217 /// The current offset position in the file buffer
218 #[must_use]
219 pub const fn current_offset(&self) -> usize {
220 self.current_offset
221 }
222
223 /// Set the current offset position
224 ///
225 /// # Arguments
226 ///
227 /// * `offset` - The new offset position
228 pub fn set_current_offset(&mut self, offset: usize) {
229 self.current_offset = offset;
230 }
231
232 /// Get the end offset of the most recent successful match.
233 ///
234 /// This is the GNU `file`/libmagic anchor used to resolve relative
235 /// (`&+N` / `&-N`) offset specifications. A fresh context returns 0,
236 /// which makes top-level relative offsets resolve from the file start.
237 ///
238 /// `pub(crate)` because the anchor is an internal engine detail; external
239 /// consumers should not couple to it.
240 #[must_use]
241 pub(crate) const fn last_match_end(&self) -> usize {
242 self.last_match_end
243 }
244
245 /// Set the end offset of the most recent successful match.
246 ///
247 /// Called by the evaluation engine after a rule matches, to advance the
248 /// anchor used by subsequent relative offset resolution. The new value
249 /// is typically `match_offset + bytes_consumed_by_type`.
250 ///
251 /// `pub(crate)` because external callers should not be able to inject
252 /// arbitrary anchor state. External callers that need to clear the
253 /// anchor between buffer evaluations should call
254 /// `EvaluationContext::reset()`, which resets the anchor, current
255 /// offset, and recursion depth together.
256 pub(crate) fn set_last_match_end(&mut self, offset: usize) {
257 self.last_match_end = offset;
258 }
259
260 /// Get the current recursion depth
261 ///
262 /// # Returns
263 ///
264 /// The current recursion depth for nested rule evaluation
265 #[must_use]
266 pub const fn recursion_depth(&self) -> u32 {
267 self.recursion_depth
268 }
269
270 /// Increment the recursion depth
271 ///
272 /// # Returns
273 ///
274 /// `Ok(())` if the recursion depth is within limits, or `Err(LibmagicError)`
275 /// if the maximum recursion depth would be exceeded
276 ///
277 /// # Errors
278 ///
279 /// Returns `LibmagicError::EvaluationError` if incrementing would exceed
280 /// the maximum recursion depth configured in the evaluation config.
281 pub(crate) fn increment_recursion_depth(&mut self) -> Result<(), LibmagicError> {
282 if self.recursion_depth >= self.config.max_recursion_depth {
283 return Err(LibmagicError::EvaluationError(
284 crate::error::EvaluationError::recursion_limit_exceeded(self.recursion_depth),
285 ));
286 }
287 self.recursion_depth += 1;
288 Ok(())
289 }
290
291 /// Decrement the recursion depth
292 ///
293 /// # Errors
294 ///
295 /// Returns an error if the recursion depth is already 0, as this indicates
296 /// a programming error in the evaluation logic (mismatched increment/decrement calls).
297 pub(crate) fn decrement_recursion_depth(&mut self) -> Result<(), LibmagicError> {
298 if self.recursion_depth == 0 {
299 return Err(LibmagicError::EvaluationError(
300 crate::error::EvaluationError::internal_error(
301 "Attempted to decrement recursion depth below 0",
302 ),
303 ));
304 }
305 self.recursion_depth -= 1;
306 Ok(())
307 }
308
309 /// Get a reference to the evaluation configuration
310 ///
311 /// # Returns
312 ///
313 /// A reference to the `EvaluationConfig` used by this context
314 #[must_use]
315 pub const fn config(&self) -> &EvaluationConfig {
316 &self.config
317 }
318
319 /// Check if evaluation should stop at the first match
320 ///
321 /// # Returns
322 ///
323 /// `true` if evaluation should stop at the first match, `false` otherwise
324 #[must_use]
325 pub const fn should_stop_at_first_match(&self) -> bool {
326 self.config.stop_at_first_match
327 }
328
329 /// Get the maximum string length allowed for scan-mode string reads.
330 ///
331 /// Threaded into both string-read dispatchers
332 /// (`read_typed_value_with_pattern` for the unflagged `(None, _)` arm
333 /// and `read_pattern_match` for the flagged `/c`/`/C`/`/w`/`/W`/`/T`/`/f`
334 /// arm) so they cap the buffer-length allocation against this value.
335 /// Does NOT apply to `TypeKind::PString` (which errors on oversized
336 /// length prefixes per GOTCHAS S6.1) or `TypeKind::String16` (capped
337 /// at a hardcoded `STRING16_MAX_UNITS = 8192` ceiling).
338 ///
339 /// # Returns
340 ///
341 /// The configured `max_string_length` (default 8192 bytes per
342 /// `EvaluationConfig::default()`).
343 #[must_use]
344 pub const fn max_string_length(&self) -> usize {
345 self.config.max_string_length
346 }
347
348 /// Check if MIME type mapping is enabled
349 ///
350 /// # Returns
351 ///
352 /// `true` if MIME type mapping should be performed, `false` otherwise
353 #[must_use]
354 pub const fn enable_mime_types(&self) -> bool {
355 self.config.enable_mime_types
356 }
357
358 /// Get the evaluation timeout in milliseconds
359 ///
360 /// # Returns
361 ///
362 /// The timeout duration in milliseconds, or `None` if no timeout is set
363 #[must_use]
364 pub const fn timeout_ms(&self) -> Option<u64> {
365 self.config.timeout_ms
366 }
367
368 /// Reset the context to initial state while preserving configuration
369 ///
370 /// This resets the current offset and recursion depth to 0, but keeps
371 /// the same configuration settings.
372 pub fn reset(&mut self) {
373 self.current_offset = 0;
374 self.last_match_end = 0;
375 self.recursion_depth = 0;
376 self.base_offset = 0;
377 self.indirect_reentry = false;
378 }
379}
380
381/// RAII guard that increments recursion depth on entry and decrements on drop.
382///
383/// Replaces the manual `increment_recursion_depth` / `decrement_recursion_depth`
384/// pair with a scope-based guard, eliminating the risk of mismatched calls and
385/// the need to swallow cleanup errors on error-return paths.
386///
387/// Obtain a guard via [`RecursionGuard::enter`], which borrows the context
388/// mutably for the guard's lifetime. Use [`RecursionGuard::context`] to access
389/// the borrowed context for the duration of the recursive call. The guard
390/// automatically decrements the recursion depth when it goes out of scope.
391///
392/// The guard is `pub(crate)` because recursion-depth management is an internal
393/// detail of the evaluation engine.
394pub(crate) struct RecursionGuard<'a> {
395 context: &'a mut EvaluationContext,
396}
397
398impl<'a> RecursionGuard<'a> {
399 /// Enter a new recursion level, incrementing the context's recursion depth.
400 ///
401 /// # Errors
402 ///
403 /// Returns `LibmagicError::EvaluationError` if incrementing would exceed
404 /// the maximum recursion depth configured in the evaluation config.
405 pub(crate) fn enter(context: &'a mut EvaluationContext) -> Result<Self, LibmagicError> {
406 context.increment_recursion_depth()?;
407 Ok(Self { context })
408 }
409
410 /// Access the underlying context for the duration of the guard.
411 pub(crate) fn context(&mut self) -> &mut EvaluationContext {
412 self.context
413 }
414}
415
416impl Drop for RecursionGuard<'_> {
417 fn drop(&mut self) {
418 // Safe to ignore: `decrement_recursion_depth` only fails when the
419 // depth is already 0, which is impossible here because `enter` just
420 // incremented it and the depth is only mutated through guard pairs.
421 let result = self.context.decrement_recursion_depth();
422 debug_assert!(
423 result.is_ok(),
424 "RecursionGuard invariant violated: decrement failed after successful enter()"
425 );
426 }
427}
428
429/// Result of evaluating a magic rule
430///
431/// Contains information extracted from a successful rule match, including
432/// the matched value, position, and confidence score.
433///
434/// This type derives `Serialize` so callers can convert evaluation results
435/// to JSON, but intentionally does NOT derive `Deserialize`: a
436/// reconstructed `RuleMatch` would lack the buffer context it was
437/// produced against, so deserialization is not a meaningful operation.
438/// The output-side conversion layer (`output::MatchResult` /
439/// `output::json::JsonMatchResult`) is the documented JSON contract.
440#[derive(Debug, Clone, PartialEq, Serialize)]
441#[non_exhaustive]
442pub struct RuleMatch {
443 /// The message associated with the matching rule
444 pub message: String,
445 /// The offset where the match occurred
446 pub offset: usize,
447 /// The rule level (depth in hierarchy)
448 pub level: u32,
449 /// The matched value
450 pub value: crate::parser::ast::Value,
451 /// The type used to read the matched value
452 ///
453 /// Carries the source `TypeKind` so downstream consumers (e.g., output
454 /// formatting) can determine the on-disk width of the matched value.
455 ///
456 /// `#[serde(skip)]` keeps the parser AST out of JSON output produced
457 /// by serializing `EvaluationResult` directly via
458 /// `serde_json::to_string(&result)`. The documented JSON contract is
459 /// `JsonMatchResult` in `src/output/json.rs`, which omits this field.
460 /// Origin findings 1B-H2 / 2A-M1 (CWE-200 information exposure).
461 /// Rust-side consumers continue to access `type_kind` via field access
462 /// for runtime needs (`format_magic_message` width-masking,
463 /// `bit_width()` derivation).
464 #[serde(skip)]
465 pub type_kind: crate::parser::ast::TypeKind,
466 /// Confidence score (0.0 to 1.0)
467 ///
468 /// Calculated based on match depth in the rule hierarchy.
469 /// Deeper matches indicate more specific file type identification
470 /// and thus higher confidence.
471 pub confidence: f64,
472}
473
474impl RuleMatch {
475 /// Construct a new `RuleMatch`.
476 ///
477 /// `confidence` is typically derived from `level` via
478 /// [`RuleMatch::calculate_confidence`]; pass it explicitly here so
479 /// callers can supply an alternative score when needed (e.g. when
480 /// post-processing a series of matches).
481 #[must_use]
482 pub fn new(
483 message: String,
484 offset: usize,
485 level: u32,
486 value: crate::parser::ast::Value,
487 type_kind: crate::parser::ast::TypeKind,
488 confidence: f64,
489 ) -> Self {
490 Self {
491 message,
492 offset,
493 level,
494 value,
495 type_kind,
496 confidence,
497 }
498 }
499
500 /// Calculate confidence score based on rule depth
501 ///
502 /// Formula: min(1.0, 0.3 + (level * 0.2))
503 /// - Level 0 (root): 0.3
504 /// - Level 1: 0.5
505 /// - Level 2: 0.7
506 /// - Level 3: 0.9
507 /// - Level 4+: 1.0 (capped)
508 ///
509 /// # Examples
510 ///
511 /// ```
512 /// use libmagic_rs::evaluator::RuleMatch;
513 ///
514 /// assert!((RuleMatch::calculate_confidence(0) - 0.3).abs() < 0.001);
515 /// assert!((RuleMatch::calculate_confidence(3) - 0.9).abs() < 0.001);
516 /// assert!((RuleMatch::calculate_confidence(10) - 1.0).abs() < 0.001);
517 /// ```
518 #[must_use]
519 pub fn calculate_confidence(level: u32) -> f64 {
520 (0.3 + (f64::from(level) * 0.2)).min(1.0)
521 }
522}
523
524#[cfg(test)]
525mod tests;