libmagic-rs 0.6.0

A pure-Rust implementation of libmagic for file type identification
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
// Copyright (c) 2025-2026 the libmagic-rs contributors
// SPDX-License-Identifier: Apache-2.0

//! Rule evaluation engine
//!
//! This module provides the public interface for magic rule evaluation,
//! including data types for evaluation state and match results, and
//! re-exports the core evaluation functions from submodules.

use crate::{EvaluationConfig, LibmagicError};
use serde::{Deserialize, Serialize};

mod engine;
pub mod offset;
pub mod operators;
pub mod strength;
pub mod types;

pub use engine::{evaluate_rules, evaluate_rules_with_config, evaluate_single_rule};

/// Shared environment attached to an [`EvaluationContext`] so the engine can
/// resolve whole-database operations (currently: `Use` subroutine lookups;
/// eventually `indirect` whole-tree re-entry).
///
/// Contexts hold the environment behind an `Arc`, and both fields below are
/// `Arc`s as well, so cloning a context across recursive calls only bumps
/// reference counts instead of copying rule data, and the rule data can be
/// shared safely across threads.
//
// NOTE(review): the `indirect_reentry` field docs on `EvaluationContext`
// describe `MetaType::Indirect` dispatch as re-entering the root rule list.
// If that dispatch already reads `root_rules`, the "eventually" wording above
// and the `dead_code` allowance below are stale -- confirm against `engine`.
#[derive(Debug, Clone)]
pub(crate) struct RuleEnvironment {
    /// Named subroutine table, keyed by identifier.
    pub(crate) name_table: std::sync::Arc<crate::parser::name_table::NameTable>,
    /// Top-level rule list retained for future whole-database operations.
    // `dead_code` is allowed because nothing in this module reads the field;
    // per the doc line above it is kept for operations not yet wired up.
    #[allow(dead_code)]
    pub(crate) root_rules: std::sync::Arc<[crate::parser::ast::MagicRule]>,
}

/// Context for maintaining evaluation state during rule processing
///
/// The `EvaluationContext` carries the state the engine reads and updates
/// while it walks a rule tree: the current offset position, the end of the
/// most recent match (the anchor for relative offsets), the recursion depth
/// for nested rules, the subroutine base offset, and the one-shot indirect
/// re-entry flag. It also owns the [`EvaluationConfig`] that controls
/// evaluation behavior and, optionally, a shared rule environment.
///
/// All fields are private; state is read and written through the accessor
/// methods on this type.
///
/// # Examples
///
/// ```rust
/// use libmagic_rs::evaluator::EvaluationContext;
/// use libmagic_rs::EvaluationConfig;
///
/// let config = EvaluationConfig::default();
/// let context = EvaluationContext::new(config);
///
/// assert_eq!(context.current_offset(), 0);
/// assert_eq!(context.recursion_depth(), 0);
/// ```
#[derive(Debug, Clone)]
pub struct EvaluationContext {
    /// Current offset position in the file buffer
    current_offset: usize,
    /// End offset of the most recent successful match.
    ///
    /// This is the GNU `file`/libmagic anchor used to resolve relative
    /// (`&+N` / `&-N`) offsets. It is updated to the end of the most
    /// recently matched rule -- the value may *increase or decrease* as
    /// successive rules match at different positions; it is not a
    /// high-watermark. A fresh context starts with this set to 0, which
    /// matches libmagic's behavior of resolving top-level relative offsets
    /// from the file start.
    last_match_end: usize,
    /// Current recursion depth for nested rule evaluation.
    ///
    /// Starts at 0 and is bounded by `config.max_recursion_depth`; see
    /// `increment_recursion_depth` / `decrement_recursion_depth`.
    recursion_depth: u32,
    /// Configuration settings for evaluation behavior
    config: EvaluationConfig,
    /// Optional rule environment (name table + root rules) threaded from
    /// [`MagicDatabase`](crate::MagicDatabase). Evaluations that come in
    /// through the low-level [`evaluate_rules`] / [`evaluate_rules_with_config`]
    /// surface (tests, programmatic consumers) run with `rule_env = None`,
    /// in which case `MetaType::Use` rules are silent no-ops.
    ///
    /// Left untouched by `reset()`, so an attached environment survives
    /// re-use of the context.
    rule_env: Option<std::sync::Arc<RuleEnvironment>>,
    /// Base offset applied to absolute offset resolution.
    ///
    /// Normally 0. When evaluating a subroutine body via `MetaType::Use`,
    /// this is set to the use-site offset so that the subroutine's
    /// `OffsetSpec::Absolute(n)` rules resolve to `base + n` (matching
    /// magic(5) / libmagic semantics: subroutines see offsets relative
    /// to the caller's invocation point, not absolute file positions).
    /// Restored to the caller's value on subroutine exit via the
    /// `SubroutineScope` RAII guard in `engine/mod.rs`, which saves
    /// and restores both `last_match_end` and `base_offset` together.
    base_offset: usize,
    /// One-shot flag set by `MetaType::Indirect` dispatch before
    /// re-entering the root rule list. When true, the next entry to
    /// `evaluate_rules` treats the iteration as a top-level sibling
    /// chain (anchor chains across siblings per GOTCHAS S3.8) rather
    /// than as a continuation list (anchor resets between siblings).
    /// Consumed at entry — children of a matched rule inside the
    /// re-entry see the flag cleared, so their own continuation-reset
    /// semantics kick in via the `recursion_depth > 0` gate.
    ///
    /// Without this flag, `indirect` wrapping re-entry under
    /// `RecursionGuard` forces `recursion_depth > 0`, which forces
    /// continuation-reset semantics on the root rule list — wrong,
    /// because top-level rules in the re-entered database should
    /// chain sibling anchors like any other top-level evaluation.
    indirect_reentry: bool,
}

impl EvaluationContext {
    /// Create a new evaluation context with the given configuration
    ///
    /// # Arguments
    ///
    /// * `config` - Configuration settings for evaluation behavior
    ///
    /// # Examples
    ///
    /// ```rust
    /// use libmagic_rs::evaluator::EvaluationContext;
    /// use libmagic_rs::EvaluationConfig;
    ///
    /// let config = EvaluationConfig::default();
    /// let context = EvaluationContext::new(config);
    /// ```
    #[must_use]
    pub const fn new(config: EvaluationConfig) -> Self {
        Self {
            current_offset: 0,
            last_match_end: 0,
            recursion_depth: 0,
            config,
            rule_env: None,
            base_offset: 0,
            indirect_reentry: false,
        }
    }

    /// Read-only access to the subroutine base offset. Non-zero only
    /// during a `MetaType::Use` body evaluation.
    #[must_use]
    pub(crate) const fn base_offset(&self) -> usize {
        self.base_offset
    }

    /// Set the subroutine base offset.
    ///
    /// `pub(crate)` and owned by the engine's `SubroutineScope` RAII
    /// guard -- no external caller should set this directly.
    pub(crate) fn set_base_offset(&mut self, offset: usize) {
        self.base_offset = offset;
    }

    /// Read-and-clear the indirect-reentry flag. Used by `evaluate_rules`
    /// at entry to decide whether the iteration is a top-level re-entry
    /// (no anchor reset between siblings) or a continuation list (reset
    /// between siblings). Cleared on read so children of a matched rule
    /// inside the re-entry see the flag as false and fall back to the
    /// `recursion_depth > 0` gate for their own continuation semantics.
    pub(crate) fn take_indirect_reentry(&mut self) -> bool {
        std::mem::take(&mut self.indirect_reentry)
    }

    /// Set the indirect-reentry flag.
    ///
    /// `pub(crate)` and owned by the `MetaType::Indirect` dispatch in
    /// `engine/mod.rs`. Callers should set this true exactly once
    /// before invoking `evaluate_rules` on the root rule list.
    pub(crate) fn set_indirect_reentry(&mut self, flag: bool) {
        self.indirect_reentry = flag;
    }

    /// Attach a rule environment to this context.
    ///
    /// The environment carries the name-subroutine table and root rule list
    /// so the engine can resolve `MetaType::Use` rules and (eventually)
    /// `MetaType::Indirect` re-entries. Intended to be called once by
    /// [`MagicDatabase`](crate::MagicDatabase) before handing the context
    /// to [`evaluate_rules`].
    #[must_use]
    pub(crate) fn with_rule_env(mut self, env: std::sync::Arc<RuleEnvironment>) -> Self {
        self.rule_env = Some(env);
        self
    }

    /// Read-only access to the attached rule environment, if any.
    #[must_use]
    pub(crate) fn rule_env(&self) -> Option<&RuleEnvironment> {
        self.rule_env.as_deref()
    }

    /// Get the current offset position
    ///
    /// # Returns
    ///
    /// The current offset position in the file buffer
    #[must_use]
    pub const fn current_offset(&self) -> usize {
        self.current_offset
    }

    /// Set the current offset position
    ///
    /// # Arguments
    ///
    /// * `offset` - The new offset position
    pub fn set_current_offset(&mut self, offset: usize) {
        self.current_offset = offset;
    }

    /// Get the end offset of the most recent successful match.
    ///
    /// This is the GNU `file`/libmagic anchor used to resolve relative
    /// (`&+N` / `&-N`) offset specifications. A fresh context returns 0,
    /// which makes top-level relative offsets resolve from the file start.
    ///
    /// `pub(crate)` because the anchor is an internal engine detail; external
    /// consumers should not couple to it.
    #[must_use]
    pub(crate) const fn last_match_end(&self) -> usize {
        self.last_match_end
    }

    /// Set the end offset of the most recent successful match.
    ///
    /// Called by the evaluation engine after a rule matches, to advance the
    /// anchor used by subsequent relative offset resolution. The new value
    /// is typically `match_offset + bytes_consumed_by_type`.
    ///
    /// `pub(crate)` because external callers should not be able to inject
    /// arbitrary anchor state. External callers that need to clear the
    /// anchor between buffer evaluations should call
    /// `EvaluationContext::reset()`, which resets the anchor, current
    /// offset, and recursion depth together.
    pub(crate) fn set_last_match_end(&mut self, offset: usize) {
        self.last_match_end = offset;
    }

    /// Get the current recursion depth
    ///
    /// # Returns
    ///
    /// The current recursion depth for nested rule evaluation
    #[must_use]
    pub const fn recursion_depth(&self) -> u32 {
        self.recursion_depth
    }

    /// Increment the recursion depth
    ///
    /// # Returns
    ///
    /// `Ok(())` if the recursion depth is within limits, or `Err(LibmagicError)`
    /// if the maximum recursion depth would be exceeded
    ///
    /// # Errors
    ///
    /// Returns `LibmagicError::EvaluationError` if incrementing would exceed
    /// the maximum recursion depth configured in the evaluation config.
    pub(crate) fn increment_recursion_depth(&mut self) -> Result<(), LibmagicError> {
        if self.recursion_depth >= self.config.max_recursion_depth {
            return Err(LibmagicError::EvaluationError(
                crate::error::EvaluationError::recursion_limit_exceeded(self.recursion_depth),
            ));
        }
        self.recursion_depth += 1;
        Ok(())
    }

    /// Decrement the recursion depth
    ///
    /// # Errors
    ///
    /// Returns an error if the recursion depth is already 0, as this indicates
    /// a programming error in the evaluation logic (mismatched increment/decrement calls).
    pub(crate) fn decrement_recursion_depth(&mut self) -> Result<(), LibmagicError> {
        if self.recursion_depth == 0 {
            return Err(LibmagicError::EvaluationError(
                crate::error::EvaluationError::internal_error(
                    "Attempted to decrement recursion depth below 0",
                ),
            ));
        }
        self.recursion_depth -= 1;
        Ok(())
    }

    /// Get a reference to the evaluation configuration
    ///
    /// # Returns
    ///
    /// A reference to the `EvaluationConfig` used by this context
    #[must_use]
    pub const fn config(&self) -> &EvaluationConfig {
        &self.config
    }

    /// Check if evaluation should stop at the first match
    ///
    /// # Returns
    ///
    /// `true` if evaluation should stop at the first match, `false` otherwise
    #[must_use]
    pub const fn should_stop_at_first_match(&self) -> bool {
        self.config.stop_at_first_match
    }

    /// Get the maximum string length allowed
    ///
    /// # Returns
    ///
    /// The maximum string length that should be read during evaluation
    #[must_use]
    pub const fn max_string_length(&self) -> usize {
        self.config.max_string_length
    }

    /// Check if MIME type mapping is enabled
    ///
    /// # Returns
    ///
    /// `true` if MIME type mapping should be performed, `false` otherwise
    #[must_use]
    pub const fn enable_mime_types(&self) -> bool {
        self.config.enable_mime_types
    }

    /// Get the evaluation timeout in milliseconds
    ///
    /// # Returns
    ///
    /// The timeout duration in milliseconds, or `None` if no timeout is set
    #[must_use]
    pub const fn timeout_ms(&self) -> Option<u64> {
        self.config.timeout_ms
    }

    /// Reset the context to initial state while preserving configuration
    ///
    /// This resets the current offset and recursion depth to 0, but keeps
    /// the same configuration settings.
    pub fn reset(&mut self) {
        self.current_offset = 0;
        self.last_match_end = 0;
        self.recursion_depth = 0;
        self.base_offset = 0;
        self.indirect_reentry = false;
    }
}

/// RAII guard that increments recursion depth on entry and decrements on drop.
///
/// Replaces the manual `increment_recursion_depth` / `decrement_recursion_depth`
/// pair with a scope-based guard, eliminating the risk of mismatched calls and
/// the need to swallow cleanup errors on error-return paths.
///
/// Obtain a guard via [`RecursionGuard::enter`], which borrows the context
/// mutably for the guard's lifetime. Use [`RecursionGuard::context`] to access
/// the borrowed context for the duration of the recursive call. The guard
/// automatically decrements the recursion depth when it goes out of scope.
///
/// The guard must be bound to a named variable for as long as the recursive
/// call runs. Discarding it (for example `RecursionGuard::enter(ctx)?;` as a
/// bare statement) drops it immediately, which undoes the increment before
/// any nested evaluation happens. The type is therefore marked `#[must_use]`
/// so the compiler reports such a discard.
///
/// The guard is `pub(crate)` because recursion-depth management is an internal
/// detail of the evaluation engine.
#[must_use = "the recursion depth is decremented again as soon as the guard is dropped"]
pub(crate) struct RecursionGuard<'a> {
    /// Context whose recursion depth this guard incremented; the depth is
    /// decremented again through this borrow when the guard is dropped.
    context: &'a mut EvaluationContext,
}

impl<'a> RecursionGuard<'a> {
    /// Bump the recursion depth of `context` and wrap it in a guard.
    ///
    /// The returned guard holds the mutable borrow of `context`; reach the
    /// context through [`RecursionGuard::context`] while the guard is alive.
    ///
    /// # Errors
    ///
    /// Propagates the `LibmagicError::EvaluationError` from
    /// `increment_recursion_depth` when the maximum recursion depth
    /// configured in the evaluation config would be exceeded. No guard is
    /// created in that case.
    pub(crate) fn enter(context: &'a mut EvaluationContext) -> Result<Self, LibmagicError> {
        context
            .increment_recursion_depth()
            .map(|()| Self { context })
    }

    /// Reborrow the guarded context for as long as the guard itself is borrowed.
    pub(crate) fn context(&mut self) -> &mut EvaluationContext {
        &mut *self.context
    }
}

impl Drop for RecursionGuard<'_> {
    /// Undo the increment performed by `RecursionGuard::enter`.
    fn drop(&mut self) {
        // The decrement can only fail when the depth is already 0. That is
        // not expected here: `enter` incremented the depth before this guard
        // existed, and the depth is only mutated through guard pairs. The
        // outcome is therefore checked in debug builds only and otherwise
        // ignored, since `drop` has no way to report an error.
        let decremented = self.context.decrement_recursion_depth().is_ok();
        debug_assert!(
            decremented,
            "RecursionGuard invariant violated: decrement failed after successful enter()"
        );
    }
}

/// Result of evaluating a magic rule
///
/// Contains information extracted from a successful rule match, including
/// the rule's message, the matched value and the type used to read it, the
/// match position and hierarchy level, and a confidence score.
///
/// The type derives `Serialize` / `Deserialize`, so match results can be
/// converted to and from serde-supported formats.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct RuleMatch {
    /// The message associated with the matching rule
    pub message: String,
    /// The offset where the match occurred
    pub offset: usize,
    /// The rule level (depth in hierarchy); level 0 is a root rule
    pub level: u32,
    /// The matched value
    pub value: crate::parser::ast::Value,
    /// The type used to read the matched value
    ///
    /// Carries the source `TypeKind` so downstream consumers (e.g., output
    /// formatting) can determine the on-disk width of the matched value.
    pub type_kind: crate::parser::ast::TypeKind,
    /// Confidence score (0.0 to 1.0)
    ///
    /// Calculated based on match depth in the rule hierarchy.
    /// Deeper matches indicate more specific file type identification
    /// and thus higher confidence. Scores produced by
    /// [`RuleMatch::calculate_confidence`] start at 0.3 for level 0 and are
    /// capped at 1.0.
    pub confidence: f64,
}

impl RuleMatch {
    /// Derive a confidence score from a rule's depth in the hierarchy.
    ///
    /// The score starts at 0.3 for a root rule, grows by 0.2 per level and
    /// is capped at 1.0, i.e. `min(1.0, 0.3 + level * 0.2)`:
    ///
    /// | Level | Score        |
    /// |-------|--------------|
    /// | 0     | 0.3          |
    /// | 1     | 0.5          |
    /// | 2     | 0.7          |
    /// | 3     | 0.9          |
    /// | 4+    | 1.0 (capped) |
    ///
    /// # Examples
    ///
    /// ```
    /// use libmagic_rs::evaluator::RuleMatch;
    ///
    /// assert!((RuleMatch::calculate_confidence(0) - 0.3).abs() < 0.001);
    /// assert!((RuleMatch::calculate_confidence(3) - 0.9).abs() < 0.001);
    /// assert!((RuleMatch::calculate_confidence(10) - 1.0).abs() < 0.001);
    /// ```
    #[must_use]
    pub fn calculate_confidence(level: u32) -> f64 {
        /// Score assigned to a root-level (level 0) match.
        const ROOT_SCORE: f64 = 0.3;
        /// Score added for each additional level of depth.
        const SCORE_PER_LEVEL: f64 = 0.2;
        /// Upper bound of the score.
        const MAX_SCORE: f64 = 1.0;

        let uncapped = ROOT_SCORE + f64::from(level) * SCORE_PER_LEVEL;
        uncapped.min(MAX_SCORE)
    }
}

#[cfg(test)]
mod tests;