libmagic-rs 0.6.0

A pure-Rust implementation of libmagic for file type identification
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
// Copyright (c) 2025-2026 the libmagic-rs contributors
// SPDX-License-Identifier: Apache-2.0

//! Printf-style format specifier substitution for magic rule messages.
//!
//! Magic file messages frequently contain C-style format specifiers such as
//! `%lld`, `%02x`, or `%s` that reference the rule's read value. GNU `file`
//! renders the message with the value substituted at the specifier's
//! position; without this pass libmagic-rs would emit the literal
//! specifier tokens (e.g., `at_offset %lld`) and diverge visibly from
//! `file(1)` output.
//!
//! The substitution is intentionally narrow: it supports the subset of
//! C's `printf` syntax that appears in shipping magic corpora (notably
//! `third_party/tests/searchbug.magic` and the GNU `file` `Magdir`
//! collection). Unrecognized specifiers pass through literally with a
//! `debug!` log rather than erroring -- matching the evaluator's
//! graceful-skip discipline.
//!
//! Width masking for hex and octal specifiers uses
//! [`crate::parser::ast::TypeKind::bit_width`] so that e.g. a signed byte
//! holding `-1` rendered with `%02x` produces the unsigned 8-bit
//! interpretation (`ff`, not `ffffffffffffffff`).
//!
//! See the project plan at
//! `docs/plans/2026-04-22-001-feat-meta-type-offset-and-format-substitution-plan.md`
//! for scope, and GOTCHAS.md S14.2 for historical context.

use log::debug;

use crate::parser::ast::{TypeKind, Value};

/// Substitute printf-style format specifiers in a magic rule message.
///
/// Walks `template` left to right. Plain text is copied verbatim; on
/// each `%`, the full specifier (`%[flags][width][.precision][length]<conv>`)
/// is parsed and substituted from `value`. `%%` emits a single `%`.
///
/// Specifiers that cannot be rendered are passed through literally with a
/// `debug!` log, and scanning continues after them:
///
/// - a malformed or unrecognized specifier (no valid conversion character)
///   keeps its `%` as ordinary text, so later valid specifiers in the same
///   template are still substituted;
/// - a recognized specifier whose conversion does not apply to `value`
///   (e.g. `%d` against a string) is copied unchanged.
///
/// Every specifier in the template is rendered against the same `value`.
///
/// `type_kind` is consulted only for the `%x`, `%X` and `%o` conversions,
/// which need the natural bit width of the underlying read to mask
/// sign-extended values correctly. For other specifiers it is ignored.
///
/// # Examples
///
/// ```
/// use libmagic_rs::output::format::format_magic_message;
/// use libmagic_rs::parser::ast::{TypeKind, Value};
///
/// let out = format_magic_message(
///     "at_offset %lld",
///     &Value::Uint(11),
///     &TypeKind::Byte { signed: false },
/// );
/// assert_eq!(out, "at_offset 11");
///
/// let out = format_magic_message(
///     "followed_by 0x%02x",
///     &Value::Uint(0x31),
///     &TypeKind::Byte { signed: false },
/// );
/// assert_eq!(out, "followed_by 0x31");
///
/// // Unknown specifier falls through literally.
/// let out = format_magic_message("%q", &Value::Uint(0), &TypeKind::Byte { signed: false });
/// assert_eq!(out, "%q");
///
/// // An unknown specifier does not stop later ones from being substituted.
/// let out = format_magic_message("%q=%d", &Value::Uint(7), &TypeKind::Byte { signed: false });
/// assert_eq!(out, "%q=7");
///
/// // `%%` is an escaped literal percent.
/// let out = format_magic_message("100%% sure", &Value::Uint(0), &TypeKind::Byte { signed: false });
/// assert_eq!(out, "100% sure");
/// ```
#[must_use]
pub fn format_magic_message(template: &str, value: &Value, type_kind: &TypeKind) -> String {
    let mut out = String::with_capacity(template.len());
    let bytes = template.as_bytes();
    let mut i = 0;
    // Start of the pending run of literal text. The run is copied as a
    // string slice rather than byte-by-byte so non-ASCII UTF-8 code points
    // survive intact. Scanning still happens at the byte level, which is
    // safe because `%` is ASCII 0x25 and can never be part of a multi-byte
    // UTF-8 sequence; every index we slice at therefore sits on a character
    // boundary.
    let mut plain_start = 0;

    while i < bytes.len() {
        if bytes[i] != b'%' {
            i += 1;
            continue;
        }

        let Some(parsed_spec) = parse_spec(bytes, i + 1) else {
            // Malformed specifier (e.g., a trailing `%`, or a sequence that
            // does not end in a valid conversion char). Leave the `%` inside
            // the pending literal run and resume scanning right after it, so
            // one bad specifier cannot suppress later valid ones.
            debug!(
                "format_magic_message: malformed specifier at byte {i} in template {template:?}; passing through literally",
            );
            i += 1;
            continue;
        };

        // Flush the literal text that precedes this specifier.
        out.push_str(&template[plain_start..i]);

        let spec_end = parsed_spec.end;
        if let Some(rendered) = render(&parsed_spec, value, type_kind) {
            out.push_str(&rendered);
        } else {
            // Type mismatch or unsupported conversion; pass through the
            // literal specifier and log.
            let literal = &template[i..spec_end];
            debug!(
                "format_magic_message: unsupported specifier {literal:?} for value {value:?}; passing through literally",
            );
            out.push_str(literal);
        }
        i = spec_end;
        plain_start = spec_end;
    }

    // Flush any trailing literal text (a no-op when nothing is pending).
    out.push_str(&template[plain_start..]);

    out
}

/// Kinds of conversion characters we recognize.
///
/// Produced by `parse_spec` from the final character of a specifier and
/// dispatched on by `render`. Length modifiers (`l`, `ll`, `h`, ...) are
/// consumed during parsing and do not influence which variant is chosen.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Conv {
    /// `%d`, `%i`, `%ld`, `%lld` -- signed decimal.
    Signed,
    /// `%u`, `%lu`, `%llu` -- unsigned decimal.
    Unsigned,
    /// `%x` -- lowercase hex.
    HexLower,
    /// `%X` -- uppercase hex.
    HexUpper,
    /// `%o` -- octal.
    Octal,
    /// `%s` -- string.
    Str,
    /// `%c` -- single character (full 0x00-0xff byte range via Latin-1 code points).
    Char,
    /// `%%` -- literal percent.
    Percent,
}

/// Parsed format specifier.
///
/// Built by `parse_spec`. Precision and length modifiers are consumed while
/// parsing but are not recorded here.
#[derive(Debug, Clone)]
struct Spec {
    /// The `0` flag was present (zero-padding requested).
    zero_pad: bool,
    /// The `-` flag was present (left-align within the field width).
    left_align: bool,
    /// The `#` flag was present (alternate form, e.g. a `0x` prefix for hex).
    alt_form: bool,
    /// Minimum field width; `0` when no width was given. Never exceeds
    /// `MAX_FORMAT_WIDTH`.
    width: usize,
    /// Which conversion the specifier requests.
    conv: Conv,
    /// Byte index of the character *after* this specifier in the template.
    end: usize,
}

/// Maximum width value accepted from a format specifier.
///
/// Caps the field width to prevent crafted magic rules with enormous widths
/// (e.g., `%999999999d`) from driving unbounded padding allocations in the
/// padding helpers. Larger widths are clamped to this value by `parse_spec`
/// rather than rejected. 4096 is generous for any real magic-corpus usage.
const MAX_FORMAT_WIDTH: usize = 4096;

/// Parse the format specifier whose first byte after the leading `%` is at
/// index `start`.
///
/// Grammar, consumed in order: flags (`0`, `-`, `#`, `+`, space), decimal
/// width, optional `.precision`, any run of length-modifier letters
/// (`l`, `h`, `j`, `z`, `t`), then exactly one conversion character.
///
/// Returns `None` when the input ends before a conversion character is
/// found or when that character is not one we recognize. On success,
/// `Spec::end` is the index just past the conversion character.
fn parse_spec(bytes: &[u8], start: usize) -> Option<Spec> {
    let mut cursor = start;

    // Flags. `+` and space are accepted for syntactic completeness but have
    // no rendering effect in the current subset, so they are not recorded.
    let mut zero_pad = false;
    let mut left_align = false;
    let mut alt_form = false;
    while let Some(&flag) = bytes.get(cursor) {
        match flag {
            b'0' => zero_pad = true,
            b'-' => left_align = true,
            b'#' => alt_form = true,
            b'+' | b' ' => {}
            _ => break,
        }
        cursor += 1;
    }

    // Width. Clamped on every step so a crafted digit run can neither
    // overflow nor exceed MAX_FORMAT_WIDTH.
    let mut width = 0_usize;
    while let Some(digit) = bytes.get(cursor).copied().filter(u8::is_ascii_digit) {
        width = width
            .saturating_mul(10)
            .saturating_add(usize::from(digit - b'0'))
            .min(MAX_FORMAT_WIDTH);
        cursor += 1;
    }

    // Precision is recognized and skipped; nothing downstream consumes it.
    if bytes.get(cursor) == Some(&b'.') {
        cursor += 1;
        while bytes.get(cursor).is_some_and(u8::is_ascii_digit) {
            cursor += 1;
        }
    }

    // Length modifiers are skipped too: numeric rendering always works on
    // the full 64-bit value.
    while matches!(bytes.get(cursor), Some(b'l' | b'h' | b'j' | b'z' | b't')) {
        cursor += 1;
    }

    // The conversion character decides the outcome; running off the end of
    // the input or hitting an unknown character means "not a specifier".
    let conv = match *bytes.get(cursor)? {
        b'd' | b'i' => Conv::Signed,
        b'u' => Conv::Unsigned,
        b'x' => Conv::HexLower,
        b'X' => Conv::HexUpper,
        b'o' => Conv::Octal,
        b's' => Conv::Str,
        b'c' => Conv::Char,
        b'%' => Conv::Percent,
        _ => return None,
    };

    Some(Spec {
        zero_pad,
        left_align,
        alt_form,
        width,
        conv,
        end: cursor + 1,
    })
}

/// Render the specifier against `value`, or return `None` if the value
/// is type-incompatible with the conversion.
fn render(spec: &Spec, value: &Value, type_kind: &TypeKind) -> Option<String> {
    match spec.conv {
        Conv::Percent => Some("%".to_string()),
        Conv::Str => Some(render_string(value)),
        Conv::Signed => {
            let n = coerce_to_i64(value)?;
            Some(pad_numeric(&n.to_string(), spec))
        }
        Conv::Unsigned => {
            let n = coerce_to_u64(value)?;
            Some(pad_numeric(&n.to_string(), spec))
        }
        Conv::HexLower => {
            let n = coerce_to_u64_masked(value, type_kind)?;
            // C printf suppresses the `0x`/`0X` alt-form prefix when the
            // value is zero: `printf("%#x", 0)` emits `"0"`, not `"0x0"`.
            let prefix = if spec.alt_form && n != 0 { "0x" } else { "" };
            Some(render_prefixed_int(&format!("{n:x}"), prefix, spec))
        }
        Conv::HexUpper => {
            let n = coerce_to_u64_masked(value, type_kind)?;
            let prefix = if spec.alt_form && n != 0 { "0X" } else { "" };
            Some(render_prefixed_int(&format!("{n:X}"), prefix, spec))
        }
        Conv::Octal => {
            let n = coerce_to_u64_masked(value, type_kind)?;
            // C printf uses a single "0" prefix for %#o (not Rust's "0o"),
            // and suppresses the prefix when the value itself is zero --
            // the resulting digit `0` already satisfies the "starts with
            // 0" invariant that the alt-form is meant to guarantee.
            let prefix = if spec.alt_form && n != 0 { "0" } else { "" };
            Some(render_prefixed_int(&format!("{n:o}"), prefix, spec))
        }
        Conv::Char => {
            let n = coerce_to_u64(value)?;
            let byte = u8::try_from(n).ok()?;
            // GNU `file` / C printf `%c` converts the int argument to
            // unsigned char and emits it directly for all byte values
            // 0x00-0xff. Rust's `String` must be valid UTF-8, so we
            // embed bytes >= 0x80 as their Latin-1 code points (U+0080
            // through U+00FF) via `char::from(u8)` which is infallible
            // and lossless. Consumers with UTF-8 terminals see the
            // 2-byte UTF-8 encoding of that code point; consumers
            // iterating the returned bytes directly can recover the
            // original byte by re-encoding the code point as Latin-1.
            //
            // POSIX: the `0` flag is ignored for `%c` -- zero-padding only
            // applies to numeric/float conversions. Always use space-padding
            // for `%c`, matching C printf behavior.
            Some(pad_non_numeric(&char::from(byte).to_string(), spec))
        }
    }
}

/// Produce the text substituted for a `%s` specifier.
///
/// - string values are copied as-is;
/// - byte sequences are decoded as UTF-8, with invalid sequences replaced
///   by U+FFFD;
/// - integer and float values are rendered through their `Display` form
///   (plain decimal for the integer variants).
///
/// Every [`Value`] variant has a rendering, so this never fails.
fn render_string(value: &Value) -> String {
    match value {
        Value::Bytes(raw) => String::from_utf8_lossy(raw).into_owned(),
        Value::String(text) => text.to_owned(),
        Value::Int(signed) => signed.to_string(),
        Value::Uint(unsigned) => unsigned.to_string(),
        Value::Float(real) => real.to_string(),
    }
}

/// Convert a numeric [`Value`] into an `i64` for signed-decimal rendering.
///
/// - `Int` is returned unchanged.
/// - `Uint` is reinterpreted bit-for-bit, so values above `i64::MAX` come
///   out negative.
/// - `Float` is converted with an `as` cast, i.e. truncated toward zero.
/// - `String` and `Bytes` have no numeric meaning and yield `None`.
#[allow(
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss,
    clippy::cast_possible_wrap
)]
fn coerce_to_i64(value: &Value) -> Option<i64> {
    let converted = match *value {
        // Non-numeric payloads cannot be rendered by a numeric conversion.
        Value::String(_) | Value::Bytes(_) => return None,
        // Fractional part is discarded (truncation toward zero).
        Value::Float(real) => real as i64,
        // Same 64 bits, viewed as signed.
        Value::Uint(unsigned) => unsigned as i64,
        Value::Int(signed) => signed,
    };
    Some(converted)
}

/// Convert a numeric [`Value`] into a `u64` for unsigned rendering.
///
/// The unsigned counterpart of [`coerce_to_i64`]:
///
/// - `Uint` is returned unchanged.
/// - `Int` is reinterpreted bit-for-bit, so `-1` becomes `u64::MAX`.
/// - `Float` is converted with an `as` cast (fraction discarded; negative
///   inputs saturate to zero).
/// - `String` and `Bytes` yield `None`.
#[allow(
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss,
    clippy::cast_precision_loss
)]
fn coerce_to_u64(value: &Value) -> Option<u64> {
    let converted = match *value {
        // Non-numeric payloads cannot be rendered by a numeric conversion.
        Value::String(_) | Value::Bytes(_) => return None,
        Value::Float(real) => real as u64,
        // Same 64 bits, viewed as unsigned.
        Value::Int(signed) => signed as u64,
        Value::Uint(unsigned) => unsigned,
    };
    Some(converted)
}

/// Convert a numeric [`Value`] into a `u64` truncated to the bit width that
/// `type_kind` reports.
///
/// Used by the hex and octal conversions so that a sign-extended narrow
/// value (e.g. a byte holding `-1`) renders as `ff` rather than
/// `ffffffffffffffff`.
///
/// Widths of 8, 16 and 32 bits are masked; any other reported width, or no
/// width at all, leaves the coerced value untouched. Returns `None` exactly
/// when [`coerce_to_u64`] does.
fn coerce_to_u64_masked(value: &Value, type_kind: &TypeKind) -> Option<u64> {
    coerce_to_u64(value).map(|raw| match type_kind.bit_width() {
        Some(8) => raw & 0xff,
        Some(16) => raw & 0xffff,
        Some(32) => raw & 0xffff_ffff,
        // 64-bit or unreported width: the full value is already correct.
        _ => raw,
    })
}

/// Lay out already-formatted `digits` with an optional alt-form `prefix`
/// (`0x`, `0X`, `0`, or empty) inside the field width from `spec`.
///
/// The field width counts prefix and digits together. When they already
/// fill the field, they are returned unpadded. Otherwise the missing
/// columns go in exactly one place, following C printf:
///
/// - `-` flag: spaces after the digits (`%-#6x` -> `0xab  `); this wins
///   over `0`.
/// - `0` flag: zeros between prefix and digits (`%#06x` -> `0x00ab`).
/// - neither: spaces before the prefix (`%#6x` -> `  0xab`).
fn render_prefixed_int(digits: &str, prefix: &str, spec: &Spec) -> String {
    let content_len = prefix.len() + digits.len();
    let missing = spec.width.saturating_sub(content_len);
    let zero_fill = spec.zero_pad && !spec.left_align;

    let mut rendered = String::with_capacity(content_len + missing);
    if !zero_fill && !spec.left_align {
        rendered.push_str(&" ".repeat(missing));
    }
    rendered.push_str(prefix);
    if zero_fill {
        rendered.push_str(&"0".repeat(missing));
    }
    rendered.push_str(digits);
    if spec.left_align {
        rendered.push_str(&" ".repeat(missing));
    }
    rendered
}

/// Apply width and alignment to a non-numeric rendered body using space-only padding.
///
/// Used for `%c` (and any other non-numeric conversion where the POSIX `0` flag
/// must be ignored). Zero-padding is not applied regardless of `spec.zero_pad`.
///
/// The field width is measured in characters, not UTF-8 bytes. `%c` embeds
/// bytes >= 0x80 as Latin-1 code points that occupy two bytes in the output
/// string; counting bytes would under-pad those by one column (`%3c` with
/// 0xa9 must yield two spaces followed by one character).
///
/// A body that already fills the field is returned unchanged; it is never
/// truncated.
fn pad_non_numeric(body: &str, spec: &Spec) -> String {
    let width = spec.width;
    // `std::fmt` width padding counts `char`s and fills with spaces by
    // default, which is exactly the layout required here.
    if spec.left_align {
        format!("{body:<width$}")
    } else {
        format!("{body:>width$}")
    }
}

/// Apply width and padding to an already-rendered numeric body.
///
/// For zero-padded right-aligned formatting, a leading `-` sign is kept at
/// the front while zeros are inserted between the sign and the magnitude
/// digits -- matching C printf semantics (e.g., `%05d` with `-7` β†’ `-0007`,
/// not `000-7`).
fn pad_numeric(body: &str, spec: &Spec) -> String {
    if body.len() >= spec.width {
        return body.to_string();
    }
    // C printf sign-aware zero-padding: sign goes before the zeros.
    if spec.zero_pad
        && !spec.left_align
        && let Some(digits) = body.strip_prefix('-')
    {
        let needed = spec.width.saturating_sub(1 + digits.len());
        if needed == 0 {
            return body.to_string();
        }
        let zeros: String = std::iter::repeat_n('0', needed).collect();
        return format!("-{zeros}{digits}");
    }
    let pad = spec.width - body.len();
    let pad_char = if spec.zero_pad && !spec.left_align {
        '0'
    } else {
        ' '
    };
    let padding: String = std::iter::repeat_n(pad_char, pad).collect();
    if spec.left_align {
        format!("{body}{padding}")
    } else {
        format!("{padding}{body}")
    }
}

// Unit tests for `format_magic_message`. Every test drives the public entry
// point, so `parse_spec`, `render` and the padding helpers are covered
// through it rather than called directly.
#[cfg(test)]
mod tests {
    use super::*;

    // Unsigned 8-bit type: hex/octal output is masked to 8 bits.
    fn byte_t() -> TypeKind {
        TypeKind::Byte { signed: false }
    }

    // Signed little-endian `Long`, used where byte-width masking is not
    // wanted.
    fn long_t() -> TypeKind {
        TypeKind::Long {
            endian: crate::parser::ast::Endianness::Little,
            signed: true,
        }
    }

    // ---- happy path --------------------------------------------------

    #[test]
    fn test_signed_decimal_substitution() {
        // Covers %d, %i, %ld, %lld (length modifiers are accepted and ignored).
        let cases = [
            ("v=%d", Value::Int(-7), "v=-7"),
            ("v=%i", Value::Int(42), "v=42"),
            ("v=%ld", Value::Int(10), "v=10"),
            ("at_offset %lld", Value::Uint(11), "at_offset 11"),
        ];
        for (tmpl, val, expected) in cases {
            assert_eq!(
                format_magic_message(tmpl, &val, &byte_t()),
                expected,
                "template {tmpl:?} with value {val:?}",
            );
        }
    }

    #[test]
    fn test_unsigned_decimal_substitution() {
        let out = format_magic_message("n=%u", &Value::Uint(200), &byte_t());
        assert_eq!(out, "n=200");

        // i64::MIN as unsigned should come through as 2^63.
        let out = format_magic_message("n=%llu", &Value::Int(i64::MIN), &long_t());
        assert_eq!(out, "n=9223372036854775808");
    }

    #[test]
    fn test_hex_substitution_with_byte_width_masking() {
        // The canonical searchbug.result case: ubyte `%02x`.
        let out = format_magic_message("0x%02x", &Value::Uint(0x31), &byte_t());
        assert_eq!(out, "0x31");

        // Byte -1 (sign-extended to u64::MAX in Value::Int) must render as "ff",
        // not "ffffffffffffffff", when the underlying type is a byte.
        let out = format_magic_message("0x%02x", &Value::Int(-1), &byte_t());
        assert_eq!(out, "0xff");

        // %X is uppercase.
        let out = format_magic_message("%X", &Value::Uint(0xdead_beef), &long_t());
        assert_eq!(out, "DEADBEEF");

        // %#x emits the "0x" prefix via alt form.
        let out = format_magic_message("%#x", &Value::Uint(0xab), &byte_t());
        assert_eq!(out, "0xab");

        // %#06x: zero-pad inserts between prefix and digits (C printf semantics),
        // not before the prefix. Regression guard for correctness review COR-002.
        let out = format_magic_message("%#06x", &Value::Uint(0xab), &byte_t());
        assert_eq!(out, "0x00ab");

        // Space-padded width with alt-form prefix: spaces go before prefix.
        let out = format_magic_message("%#6x", &Value::Uint(0xab), &byte_t());
        assert_eq!(out, "  0xab");

        // Left-aligned with alt-form prefix: spaces trail the digits.
        let out = format_magic_message("%-#6x|", &Value::Uint(0xab), &byte_t());
        assert_eq!(out, "0xab  |");

        // %#08o: zero-pad inserts between C-style "0" prefix and digits.
        // C printf uses a single "0" prefix for %#o (not Rust's "0o").
        let out = format_magic_message("%#08o", &Value::Uint(8), &byte_t());
        assert_eq!(out, "00000010");

        // %#X: uppercase alt-form uses "0X" prefix to match the specifier case.
        let out = format_magic_message("%#X", &Value::Uint(0xab), &byte_t());
        assert_eq!(out, "0XAB");
    }

    #[test]
    fn test_string_substitution() {
        let out = format_magic_message(
            "hello %s",
            &Value::String("world".to_string()),
            &TypeKind::String { max_length: None },
        );
        assert_eq!(out, "hello world");

        // Bytes go through lossy UTF-8.
        let out = format_magic_message(
            "data=%s",
            &Value::Bytes(b"abc".to_vec()),
            &TypeKind::String { max_length: None },
        );
        assert_eq!(out, "data=abc");
    }

    #[test]
    fn test_alt_form_prefix_suppressed_on_zero_value() {
        // C printf special-cases `%#o`, `%#x`, `%#X` with value 0: the
        // alt-form prefix is suppressed because the rendered digit
        // already begins with `0`. Regression guard after pr-review
        // caught that our implementation emitted `"00"` / `"0x0"` /
        // `"0X0"` for zero values.
        let out = format_magic_message("%#o", &Value::Uint(0), &byte_t());
        assert_eq!(out, "0", "%#o with 0 must emit single '0', not '00'");

        let out = format_magic_message("%#x", &Value::Uint(0), &byte_t());
        assert_eq!(out, "0", "%#x with 0 must emit single '0', not '0x0'");

        let out = format_magic_message("%#X", &Value::Uint(0), &byte_t());
        assert_eq!(out, "0", "%#X with 0 must emit single '0', not '0X0'");

        // Non-zero values still get the prefix.
        let out = format_magic_message("%#x", &Value::Uint(1), &byte_t());
        assert_eq!(out, "0x1");
    }

    #[test]
    fn test_octal_substitution() {
        let out = format_magic_message("%o", &Value::Uint(8), &byte_t());
        assert_eq!(out, "10");
        // C printf %#o uses a single "0" prefix, not Rust's "0o".
        let out = format_magic_message("%#o", &Value::Uint(8), &byte_t());
        assert_eq!(out, "010");
    }

    #[test]
    fn test_char_substitution() {
        let out = format_magic_message("[%c]", &Value::Uint(u64::from(b'A')), &byte_t());
        assert_eq!(out, "[A]");

        // Full 0x00-0xff range: bytes >= 0x80 are embedded as Latin-1 code points.
        let out = format_magic_message("%c", &Value::Uint(0xa9), &byte_t());
        assert_eq!(out, "\u{00a9}"); // U+00A9 COPYRIGHT SIGN

        // Width with space-padding (right-aligned).
        let out = format_magic_message("%3c", &Value::Uint(u64::from(b'A')), &byte_t());
        assert_eq!(out, "  A");

        // Left-aligned width.
        let out = format_magic_message("%-3c|", &Value::Uint(u64::from(b'A')), &byte_t());
        assert_eq!(out, "A  |");
    }

    #[test]
    fn test_char_zero_flag_ignored() {
        // POSIX: the `0` flag is ignored for `%c` -- zero-padding applies only to
        // numeric conversions. `%03c` must produce space-padded "  A", not "00A".
        // Regression guard: an earlier revision called `pad_numeric` for `Conv::Char`,
        // which applied zero-padding and diverged from C printf semantics.
        let out = format_magic_message("%03c", &Value::Uint(u64::from(b'A')), &byte_t());
        assert_eq!(out, "  A", "%03c must use space-padding, not zero-padding");

        // Combined zero and left-align: `-` overrides `0` for numerics; for %c
        // `0` was never active, but `-` still triggers left-alignment.
        let out = format_magic_message("%-03c|", &Value::Uint(u64::from(b'A')), &byte_t());
        assert_eq!(out, "A  |", "%-03c must left-align with spaces");
    }

    #[test]
    fn test_percent_escape() {
        let out = format_magic_message("100%% sure", &Value::Uint(0), &byte_t());
        assert_eq!(out, "100% sure");
    }

    #[test]
    fn test_non_ascii_template_preserved() {
        // Regression guard: earlier revisions iterated by byte and
        // pushed each `b as char`, which re-encoded non-ASCII UTF-8
        // continuation bytes as Latin-1 code points and mangled the
        // output (e.g., "café" -> "cafÃ©"). The plain-run flush path
        // must copy slices of the original template to preserve the
        // original UTF-8 byte sequences.
        let out = format_magic_message("cafΓ© %d", &Value::Int(42), &long_t());
        assert_eq!(out, "cafΓ© 42");

        // Non-ASCII around a specifier on both sides.
        let out = format_magic_message("β†’ %s ←", &Value::String("ok".into()), &byte_t());
        assert_eq!(out, "β†’ ok ←");

        // Non-ASCII only, no specifiers.
        let out = format_magic_message("ΓΌber", &Value::Uint(0), &byte_t());
        assert_eq!(out, "ΓΌber");
    }

    #[test]
    fn test_multiple_specifiers_in_one_template() {
        // Note: current implementation binds every specifier to the single
        // `value`; multiple specifiers are rendered against the same value.
        // This matches libmagic's single-argument model -- magic rules only
        // expose one read value per rule.
        let out = format_magic_message("a=%d b=%d", &Value::Int(5), &long_t());
        assert_eq!(out, "a=5 b=5");
    }

    #[test]
    fn test_width_padding() {
        // Zero-padded width with negative value: sign must precede zeros.
        // Regression guard for sign-aware zero-padding (C printf semantics).
        let out = format_magic_message("%05d", &Value::Int(-7), &long_t());
        assert_eq!(out, "-0007");
        let out = format_magic_message("%06d", &Value::Int(-42), &long_t());
        assert_eq!(out, "-00042");
        // Zero-padded width.
        let out = format_magic_message("%05d", &Value::Int(42), &long_t());
        assert_eq!(out, "00042");
        // Space-padded width.
        let out = format_magic_message("%5d", &Value::Int(42), &long_t());
        assert_eq!(out, "   42");
        // Negative with space-padding: sign stays in the body, spaces lead.
        let out = format_magic_message("%5d", &Value::Int(-7), &long_t());
        assert_eq!(out, "   -7");
        // Left-aligned (zero flag ignored when `-` is set).
        let out = format_magic_message("%-5d|", &Value::Int(42), &long_t());
        assert_eq!(out, "42   |");
        // Left-aligned negative: body left-aligned, spaces trail.
        let out = format_magic_message("%-6d|", &Value::Int(-7), &long_t());
        assert_eq!(out, "-7    |");
    }

    #[test]
    fn test_width_cap_prevents_large_allocation() {
        // A width larger than MAX_FORMAT_WIDTH must be silently clamped.
        // The output should be valid (the value rendered, possibly padded)
        // rather than triggering a huge allocation.
        let huge_width = format!("%{}d", usize::MAX);
        let out = format_magic_message(&huge_width, &Value::Int(1), &long_t());
        // After clamping, the output is at most MAX_FORMAT_WIDTH+1 chars.
        assert!(
            out.len() <= MAX_FORMAT_WIDTH + 1,
            "output too long: {}",
            out.len()
        );
        assert!(out.ends_with('1'), "rendered value must appear: {out:?}");
    }

    // ---- edge cases --------------------------------------------------

    #[test]
    fn test_empty_template() {
        assert_eq!(
            format_magic_message("", &Value::Uint(0), &byte_t()),
            String::new()
        );
    }

    #[test]
    fn test_literal_with_no_specifiers() {
        assert_eq!(
            format_magic_message("hello world", &Value::Uint(0), &byte_t()),
            "hello world"
        );
    }

    #[test]
    fn test_trailing_percent_with_no_spec() {
        // A stray `%` at end-of-string: pass through literally.
        let out = format_magic_message("done %", &Value::Uint(0), &byte_t());
        assert_eq!(out, "done %");
    }

    #[test]
    fn test_unknown_specifier_pass_through() {
        // `%q` is not in our subset.
        let out = format_magic_message("bad %q end", &Value::Uint(0), &byte_t());
        assert_eq!(out, "bad %q end");
    }

    #[test]
    fn test_type_mismatch_string_conv_on_uint_still_renders() {
        // `%s` against an integer value -- GNU `file` renders the number
        // as decimal; libmagic-rs matches that behavior via `render_string`.
        let out = format_magic_message("v=%s", &Value::Uint(42), &byte_t());
        assert_eq!(out, "v=42");
    }

    #[test]
    fn test_type_mismatch_numeric_conv_on_string_passes_through() {
        // `%d` against a string has no sensible coercion -> literal.
        let out = format_magic_message(
            "v=%d",
            &Value::String("hi".to_string()),
            &TypeKind::String { max_length: None },
        );
        assert_eq!(out, "v=%d");
    }

    #[test]
    fn test_char_specifier_accepts_full_byte_range() {
        // `%c` emits every byte value 0x00..=0xff directly, matching
        // GNU `file` / C printf semantics. Bytes 0x80-0xff are embedded
        // as their Latin-1 code points via `char::from(u8)`.
        // 0xff maps to U+00FF ('ÿ'); UTF-8 encoding is 0xc3 0xbf.
        let out = format_magic_message("[%c]", &Value::Uint(0xff), &byte_t());
        assert_eq!(out, "[\u{00ff}]");

        // ASCII boundary stays unchanged.
        let out = format_magic_message("[%c]", &Value::Uint(u64::from(b'A')), &byte_t());
        assert_eq!(out, "[A]");

        // Out-of-range (doesn't fit u8) passes through literally.
        let out = format_magic_message("[%c]", &Value::Uint(0x1_0000), &byte_t());
        assert_eq!(out, "[%c]");
    }

    #[test]
    fn test_byte_width_masking_on_negative_signed_byte() {
        // Regression guard: a signed byte carrying -1 (the representation
        // on the Value side is Int(-1)) must NOT render as a 64-bit mask.
        let out = format_magic_message("%x", &Value::Int(-1), &byte_t());
        assert_eq!(out, "ff");
    }

    #[test]
    fn test_hex_width_masking_respects_16bit() {
        let short_t = TypeKind::Short {
            endian: crate::parser::ast::Endianness::Little,
            signed: true,
        };
        let out = format_magic_message("%x", &Value::Int(-1), &short_t);
        assert_eq!(out, "ffff");
    }
}