libmagic_rs/output/
format.rs

1// Copyright (c) 2025-2026 the libmagic-rs contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Printf-style format specifier substitution for magic rule messages.
5//!
6//! Magic file messages frequently contain C-style format specifiers such as
7//! `%lld`, `%02x`, or `%s` that reference the rule's read value. GNU `file`
8//! renders the message with the value substituted at the specifier's
9//! position; without this pass libmagic-rs would emit the literal
10//! specifier tokens (e.g., `at_offset %lld`) and diverge visibly from
11//! `file(1)` output.
12//!
13//! The substitution is intentionally narrow: it supports the subset of
14//! C's `printf` syntax that appears in shipping magic corpora (notably
15//! `third_party/tests/searchbug.magic` and the GNU `file` `Magdir`
16//! collection). Unrecognized specifiers pass through literally with a
17//! `debug!` log rather than erroring -- matching the evaluator's
18//! graceful-skip discipline.
19//!
//! Width masking for hex and octal specifiers uses
//! [`crate::parser::ast::TypeKind::bit_width`] so that e.g. a signed byte
//! rendered with `%02x` produces the unsigned 8-bit interpretation (`0xff`,
//! not `0xffffffffffffffff`).
23//!
24//! See the project plan at
25//! `docs/plans/2026-04-22-001-feat-meta-type-offset-and-format-substitution-plan.md`
26//! for scope, and GOTCHAS.md S14.2 for historical context.
27
28use log::debug;
29
30use crate::parser::ast::{TypeKind, Value};
31
32/// Substitute printf-style format specifiers in a magic rule message.
33///
34/// Walks `template` left to right. Plain text is copied verbatim; on
35/// each `%`, the full specifier (`%[flags][width][.precision][length]<conv>`)
36/// is parsed and substituted from `value`. `%%` emits a single `%`.
37/// Unrecognized or malformed specifiers are passed through literally
38/// with a `debug!` log.
39///
40/// `type_kind` is consulted only for hex specifiers, which need the
41/// natural bit width of the underlying read to mask sign-extended
42/// values correctly. For non-hex specifiers `type_kind` is ignored.
43///
44/// # Examples
45///
46/// ```
47/// use libmagic_rs::output::format::format_magic_message;
48/// use libmagic_rs::parser::ast::{TypeKind, Value};
49///
50/// let out = format_magic_message(
51///     "at_offset %lld",
52///     &Value::Uint(11),
53///     &TypeKind::Byte { signed: false },
54/// );
55/// assert_eq!(out, "at_offset 11");
56///
57/// let out = format_magic_message(
58///     "followed_by 0x%02x",
59///     &Value::Uint(0x31),
60///     &TypeKind::Byte { signed: false },
61/// );
62/// assert_eq!(out, "followed_by 0x31");
63///
64/// // Unknown specifier falls through literally.
65/// let out = format_magic_message("%q", &Value::Uint(0), &TypeKind::Byte { signed: false });
66/// assert_eq!(out, "%q");
67///
68/// // `%%` is an escaped literal percent.
69/// let out = format_magic_message("100%% sure", &Value::Uint(0), &TypeKind::Byte { signed: false });
70/// assert_eq!(out, "100% sure");
71/// ```
72#[must_use]
73pub fn format_magic_message(template: &str, value: &Value, type_kind: &TypeKind) -> String {
74    let mut out = String::with_capacity(template.len());
75    let bytes = template.as_bytes();
76    let mut i = 0;
77    // Start of the most recent run of non-`%` bytes. We copy the run
78    // as a string slice rather than byte-by-byte so non-ASCII UTF-8
79    // code points survive intact. Scanning still happens at the byte
80    // level (safe because `%` is ASCII 0x25 and cannot appear as a
81    // UTF-8 continuation byte, which is always >= 0x80).
82    let mut plain_start = 0;
83
84    while i < bytes.len() {
85        if bytes[i] != b'%' {
86            i += 1;
87            continue;
88        }
89
90        // Flush any pending plain-text run as a single UTF-8 slice.
91        if plain_start < i {
92            out.push_str(&template[plain_start..i]);
93        }
94
95        // Start of a format specifier at position i.
96        let spec_start = i;
97        let Some(parsed_spec) = parse_spec(bytes, i + 1) else {
98            // Malformed specifier (e.g., trailing `%` with nothing after,
99            // or a sequence that doesn't end in a valid conversion char).
100            // Pass through the remaining literal and stop scanning.
101            debug!(
102                "format_magic_message: malformed specifier at byte {i} in template {template:?}; passing through remainder literally",
103            );
104            out.push_str(&template[i..]);
105            // Skip the trailing flush -- we have already emitted the
106            // remainder above.
107            plain_start = bytes.len();
108            break;
109        };
110        let next_i = parsed_spec.end;
111        if let Some(rendered) = render(&parsed_spec, value, type_kind) {
112            out.push_str(&rendered);
113        } else {
114            // Type mismatch or unsupported conversion; pass through the
115            // literal specifier and log.
116            let literal = &template[spec_start..next_i];
117            debug!(
118                "format_magic_message: unsupported specifier {literal:?} for value {value:?}; passing through literally",
119            );
120            out.push_str(literal);
121        }
122        i = next_i;
123        plain_start = i;
124    }
125
126    // Flush any trailing plain-text run.
127    if plain_start < bytes.len() {
128        out.push_str(&template[plain_start..]);
129    }
130
131    out
132}
133
/// Kinds of conversion characters we recognize.
///
/// `parse_spec` derives the variant from the final character of a
/// specifier; `render` selects the rendering path from it. Length
/// modifiers (`l`, `ll`, `h`, ...) are accepted in front of any
/// conversion character and do not affect which variant is chosen.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Conv {
    /// `%d`, `%i`, `%ld`, `%lld` -- signed decimal.
    Signed,
    /// `%u`, `%lu`, `%llu` -- unsigned decimal.
    Unsigned,
    /// `%x` -- lowercase hex.
    HexLower,
    /// `%X` -- uppercase hex.
    HexUpper,
    /// `%o` -- octal.
    Octal,
    /// `%s` -- string.
    Str,
    /// `%c` -- single character (full 0x00-0xff byte range via Latin-1 code points).
    Char,
    /// `%%` -- literal percent.
    Percent,
}
154
/// Parsed format specifier.
///
/// Built by `parse_spec`. Precision and length modifiers are consumed
/// during parsing but not stored, because no rendering path uses them.
#[derive(Debug, Clone)]
struct Spec {
    /// `true` when the `0` flag was present (zero-pad numeric output).
    zero_pad: bool,
    /// `true` when the `-` flag was present (pad on the right).
    left_align: bool,
    /// `true` when the `#` flag was present (alternate form, e.g. `0x` prefix).
    alt_form: bool,
    /// Minimum field width; `0` when the specifier gives none. Never
    /// exceeds `MAX_FORMAT_WIDTH`.
    width: usize,
    /// Conversion selected by the specifier's final character.
    conv: Conv,
    /// Byte index of the character *after* this specifier in the template.
    end: usize,
}
166
/// Maximum width value accepted from a format specifier.
///
/// Caps the field width to prevent crafted magic rules with enormous widths
/// (e.g., `%999999999d`) from driving unbounded padding allocations in the
/// padding helpers. Larger widths are clamped to this value by `parse_spec`
/// rather than rejected. 4096 is generous for any real magic-corpus usage.
const MAX_FORMAT_WIDTH: usize = 4096;
173
174/// Parse a format specifier starting at `start` (the first byte after the
175/// leading `%`). Returns `None` if the sequence does not end in a
176/// recognized conversion character.
177fn parse_spec(bytes: &[u8], start: usize) -> Option<Spec> {
178    let mut i = start;
179    let mut zero_pad = false;
180    let mut left_align = false;
181    let mut alt_form = false;
182
183    // Flags (subset: 0, -, #). Other flags (+, space) are parsed but ignored.
184    while i < bytes.len() {
185        match bytes[i] {
186            b'0' => {
187                zero_pad = true;
188                i += 1;
189            }
190            b'-' => {
191                left_align = true;
192                i += 1;
193            }
194            b'#' => {
195                alt_form = true;
196                i += 1;
197            }
198            b'+' | b' ' => {
199                // Accepted for syntactic completeness, no rendering effect
200                // in the current subset.
201                i += 1;
202            }
203            _ => break,
204        }
205    }
206
207    // Width (decimal digits). Capped at MAX_FORMAT_WIDTH to prevent
208    // unbounded allocations from crafted format strings.
209    let mut width: usize = 0;
210    while i < bytes.len() && bytes[i].is_ascii_digit() {
211        let digit = (bytes[i] - b'0') as usize;
212        width = width
213            .saturating_mul(10)
214            .saturating_add(digit)
215            .min(MAX_FORMAT_WIDTH);
216        i += 1;
217    }
218
219    // Precision (`.<digits>`): parsed and skipped -- no current consumer
220    // requires precision handling, and numeric rendering is whole-value.
221    if i < bytes.len() && bytes[i] == b'.' {
222        i += 1;
223        while i < bytes.len() && bytes[i].is_ascii_digit() {
224            i += 1;
225        }
226    }
227
228    // Length modifier (`h`, `hh`, `l`, `ll`, `j`, `z`, `t`). We consume
229    // these for syntactic completeness but never rely on them -- all
230    // numeric rendering uses full u64/i64 width.
231    while i < bytes.len() {
232        match bytes[i] {
233            b'l' | b'h' | b'j' | b'z' | b't' => i += 1,
234            _ => break,
235        }
236    }
237
238    if i >= bytes.len() {
239        return None;
240    }
241
242    let conv = match bytes[i] {
243        b'd' | b'i' => Conv::Signed,
244        b'u' => Conv::Unsigned,
245        b'x' => Conv::HexLower,
246        b'X' => Conv::HexUpper,
247        b'o' => Conv::Octal,
248        b's' => Conv::Str,
249        b'c' => Conv::Char,
250        b'%' => Conv::Percent,
251        _ => return None,
252    };
253    i += 1;
254
255    Some(Spec {
256        zero_pad,
257        left_align,
258        alt_form,
259        width,
260        conv,
261        end: i,
262    })
263}
264
265/// Render the specifier against `value`, or return `None` if the value
266/// is type-incompatible with the conversion.
267fn render(spec: &Spec, value: &Value, type_kind: &TypeKind) -> Option<String> {
268    match spec.conv {
269        Conv::Percent => Some("%".to_string()),
270        Conv::Str => Some(render_string(value)),
271        Conv::Signed => {
272            let n = coerce_to_i64(value)?;
273            Some(pad_numeric(&n.to_string(), spec))
274        }
275        Conv::Unsigned => {
276            let n = coerce_to_u64(value)?;
277            Some(pad_numeric(&n.to_string(), spec))
278        }
279        Conv::HexLower => {
280            let n = coerce_to_u64_masked(value, type_kind)?;
281            // C printf suppresses the `0x`/`0X` alt-form prefix when the
282            // value is zero: `printf("%#x", 0)` emits `"0"`, not `"0x0"`.
283            let prefix = if spec.alt_form && n != 0 { "0x" } else { "" };
284            Some(render_prefixed_int(&format!("{n:x}"), prefix, spec))
285        }
286        Conv::HexUpper => {
287            let n = coerce_to_u64_masked(value, type_kind)?;
288            let prefix = if spec.alt_form && n != 0 { "0X" } else { "" };
289            Some(render_prefixed_int(&format!("{n:X}"), prefix, spec))
290        }
291        Conv::Octal => {
292            let n = coerce_to_u64_masked(value, type_kind)?;
293            // C printf uses a single "0" prefix for %#o (not Rust's "0o"),
294            // and suppresses the prefix when the value itself is zero --
295            // the resulting digit `0` already satisfies the "starts with
296            // 0" invariant that the alt-form is meant to guarantee.
297            let prefix = if spec.alt_form && n != 0 { "0" } else { "" };
298            Some(render_prefixed_int(&format!("{n:o}"), prefix, spec))
299        }
300        Conv::Char => {
301            let n = coerce_to_u64(value)?;
302            let byte = u8::try_from(n).ok()?;
303            // GNU `file` / C printf `%c` converts the int argument to
304            // unsigned char and emits it directly for all byte values
305            // 0x00-0xff. Rust's `String` must be valid UTF-8, so we
306            // embed bytes >= 0x80 as their Latin-1 code points (U+0080
307            // through U+00FF) via `char::from(u8)` which is infallible
308            // and lossless. Consumers with UTF-8 terminals see the
309            // 2-byte UTF-8 encoding of that code point; consumers
310            // iterating the returned bytes directly can recover the
311            // original byte by re-encoding the code point as Latin-1.
312            //
313            // POSIX: the `0` flag is ignored for `%c` -- zero-padding only
314            // applies to numeric/float conversions. Always use space-padding
315            // for `%c`, matching C printf behavior.
316            Some(pad_non_numeric(&char::from(byte).to_string(), spec))
317        }
318    }
319}
320
321/// Render a [`Value`] for `%s`. Strings pass through; byte sequences are
322/// converted via lossy UTF-8; numbers render as decimal (GNU `file` does
323/// the same for mixed-type `%s` substitutions).
324fn render_string(value: &Value) -> String {
325    match value {
326        Value::String(s) => s.clone(),
327        Value::Bytes(b) => String::from_utf8_lossy(b).into_owned(),
328        Value::Uint(n) => n.to_string(),
329        Value::Int(n) => n.to_string(),
330        Value::Float(f) => f.to_string(),
331    }
332}
333
334/// Coerce a numeric-ish [`Value`] to `i64`. Float values are truncated
335/// toward zero (documented intent -- matches C's `(long long)float`
336/// semantics that libmagic's `printf` path relies on). String/Bytes
337/// values have no sensible mapping and return `None`.
338#[allow(
339    clippy::cast_possible_truncation,
340    clippy::cast_sign_loss,
341    clippy::cast_possible_wrap
342)]
343fn coerce_to_i64(value: &Value) -> Option<i64> {
344    match value {
345        Value::Int(n) => Some(*n),
346        // u64 -> i64 bit-pattern reinterpret: matches C's implicit
347        // cast in `printf("%lld", (unsigned long long)...)`.
348        Value::Uint(n) => Some(*n as i64),
349        // f64 -> i64 truncation toward zero, matching C behavior for
350        // `printf("%d", (double)...)`.
351        Value::Float(f) => Some(*f as i64),
352        Value::String(_) | Value::Bytes(_) => None,
353    }
354}
355
356/// Coerce a numeric-ish [`Value`] to `u64`. Mirrors [`coerce_to_i64`]
357/// but preserves the unsigned bit pattern when the source is signed.
358#[allow(
359    clippy::cast_possible_truncation,
360    clippy::cast_sign_loss,
361    clippy::cast_precision_loss
362)]
363fn coerce_to_u64(value: &Value) -> Option<u64> {
364    match value {
365        Value::Uint(n) => Some(*n),
366        // i64 -> u64 bit-pattern reinterpret for rendering; parallels
367        // the `coerce_to_i64` case.
368        Value::Int(n) => Some(*n as u64),
369        Value::Float(f) => Some(*f as u64),
370        Value::String(_) | Value::Bytes(_) => None,
371    }
372}
373
374/// Coerce a numeric-ish [`Value`] to `u64`, masked to the natural bit
375/// width of `type_kind`. Used by hex/octal specifiers to avoid
376/// surprising sign-extended renderings like `byte = -1` rendering as
377/// `ffffffffffffffff` when the user expected `ff`.
378fn coerce_to_u64_masked(value: &Value, type_kind: &TypeKind) -> Option<u64> {
379    let raw = coerce_to_u64(value)?;
380    let mask = match type_kind.bit_width() {
381        Some(8) => 0xff_u64,
382        Some(16) => 0xffff_u64,
383        Some(32) => 0xffff_ffff_u64,
384        // 64-bit, unknown width, or any other case: no mask needed.
385        _ => return Some(raw),
386    };
387    Some(raw & mask)
388}
389
390/// Render a numeric body with an alt-form prefix (`0x` / `0o` / empty),
391/// applying width and padding correctly.
392///
393/// For zero-padded widths (`%#0Nx`), C printf inserts zeros *between*
394/// the prefix and the digits: `%#06x` + `0xab` -> `0x00ab`, not
395/// `  0xab`. For space-padded widths (`%#Nx`), the spaces go *before*
396/// the prefix: `%#6x` + `0xab` -> `  0xab`. For left-aligned widths
397/// (`%-#6x`), trailing spaces follow the digits: `0xab  `.
398fn render_prefixed_int(digits: &str, prefix: &str, spec: &Spec) -> String {
399    // The effective body length for width comparison is prefix + digits.
400    let body_len = prefix.len() + digits.len();
401    if body_len >= spec.width {
402        return format!("{prefix}{digits}");
403    }
404    let pad = spec.width - body_len;
405    if spec.zero_pad && !spec.left_align {
406        // Zeros insert between the prefix and the digits.
407        let zeros: String = std::iter::repeat_n('0', pad).collect();
408        format!("{prefix}{zeros}{digits}")
409    } else if spec.left_align {
410        let spaces: String = std::iter::repeat_n(' ', pad).collect();
411        format!("{prefix}{digits}{spaces}")
412    } else {
413        let spaces: String = std::iter::repeat_n(' ', pad).collect();
414        format!("{spaces}{prefix}{digits}")
415    }
416}
417
418/// Apply width and alignment to a non-numeric rendered body using space-only padding.
419///
420/// Used for `%c` (and any other non-numeric conversion where the POSIX `0` flag
421/// must be ignored). Zero-padding is not applied regardless of `spec.zero_pad`.
422fn pad_non_numeric(body: &str, spec: &Spec) -> String {
423    if body.len() >= spec.width {
424        return body.to_string();
425    }
426    let pad = spec.width - body.len();
427    let padding: String = std::iter::repeat_n(' ', pad).collect();
428    if spec.left_align {
429        format!("{body}{padding}")
430    } else {
431        format!("{padding}{body}")
432    }
433}
434
435/// Apply width and padding to an already-rendered numeric body.
436///
437/// For zero-padded right-aligned formatting, a leading `-` sign is kept at
438/// the front while zeros are inserted between the sign and the magnitude
439/// digits -- matching C printf semantics (e.g., `%05d` with `-7` → `-0007`,
440/// not `000-7`).
441fn pad_numeric(body: &str, spec: &Spec) -> String {
442    if body.len() >= spec.width {
443        return body.to_string();
444    }
445    // C printf sign-aware zero-padding: sign goes before the zeros.
446    if spec.zero_pad
447        && !spec.left_align
448        && let Some(digits) = body.strip_prefix('-')
449    {
450        let needed = spec.width.saturating_sub(1 + digits.len());
451        if needed == 0 {
452            return body.to_string();
453        }
454        let zeros: String = std::iter::repeat_n('0', needed).collect();
455        return format!("-{zeros}{digits}");
456    }
457    let pad = spec.width - body.len();
458    let pad_char = if spec.zero_pad && !spec.left_align {
459        '0'
460    } else {
461        ' '
462    };
463    let padding: String = std::iter::repeat_n(pad_char, pad).collect();
464    if spec.left_align {
465        format!("{body}{padding}")
466    } else {
467        format!("{padding}{body}")
468    }
469}
470
#[cfg(test)]
mod tests {
    use super::*;

    /// Unsigned single-byte type used by most cases.
    fn byte_t() -> TypeKind {
        TypeKind::Byte { signed: false }
    }

    /// Signed little-endian 32-bit type for cases that need a wider read.
    fn long_t() -> TypeKind {
        TypeKind::Long {
            endian: crate::parser::ast::Endianness::Little,
            signed: true,
        }
    }

    /// Wraps an ASCII byte as the unsigned value a byte read would carry.
    fn ascii(byte: u8) -> Value {
        Value::Uint(u64::from(byte))
    }

    /// Renders `tmpl` against `val`/`ty` and asserts the exact output.
    #[track_caller]
    fn check(tmpl: &str, val: &Value, ty: &TypeKind, want: &str) {
        assert_eq!(
            format_magic_message(tmpl, val, ty),
            want,
            "template {tmpl:?} with value {val:?}",
        );
    }

    // ---- happy path --------------------------------------------------

    #[test]
    fn test_signed_decimal_substitution() {
        // %d, %i, %ld, %lld: length modifiers are accepted and ignored.
        let table = [
            ("v=%d", Value::Int(-7), "v=-7"),
            ("v=%i", Value::Int(42), "v=42"),
            ("v=%ld", Value::Int(10), "v=10"),
            ("at_offset %lld", Value::Uint(11), "at_offset 11"),
        ];
        for (tmpl, val, want) in &table {
            check(tmpl, val, &byte_t(), want);
        }
    }

    #[test]
    fn test_unsigned_decimal_substitution() {
        check("n=%u", &Value::Uint(200), &byte_t(), "n=200");
        // i64::MIN viewed as unsigned is 2^63.
        check(
            "n=%llu",
            &Value::Int(i64::MIN),
            &long_t(),
            "n=9223372036854775808",
        );
    }

    #[test]
    fn test_hex_substitution_with_byte_width_masking() {
        let byte = byte_t();

        // The canonical searchbug.result case: ubyte `%02x`.
        check("0x%02x", &Value::Uint(0x31), &byte, "0x31");
        // Int(-1) on a byte type must render as "ff", not as sixteen f's.
        check("0x%02x", &Value::Int(-1), &byte, "0xff");
        // %X is uppercase.
        check("%X", &Value::Uint(0xdead_beef), &long_t(), "DEADBEEF");

        let ab = Value::Uint(0xab);
        // Alt form adds the "0x" prefix.
        check("%#x", &ab, &byte, "0xab");
        // Zero padding sits between prefix and digits (C printf
        // semantics). Regression guard for correctness review COR-002.
        check("%#06x", &ab, &byte, "0x00ab");
        // Space padding sits in front of the prefix.
        check("%#6x", &ab, &byte, "  0xab");
        // Left alignment puts the spaces after the digits.
        check("%-#6x|", &ab, &byte, "0xab  |");
        // Octal alt form uses C's single "0" prefix (not Rust's "0o"),
        // with zero padding between prefix and digits.
        check("%#08o", &Value::Uint(8), &byte, "00000010");
        // Uppercase alt form uses "0X" to match the specifier's case.
        check("%#X", &ab, &byte, "0XAB");
    }

    #[test]
    fn test_string_substitution() {
        let string_t = TypeKind::String { max_length: None };
        check(
            "hello %s",
            &Value::String("world".to_string()),
            &string_t,
            "hello world",
        );
        // Bytes go through lossy UTF-8.
        check(
            "data=%s",
            &Value::Bytes(b"abc".to_vec()),
            &string_t,
            "data=abc",
        );
    }

    #[test]
    fn test_alt_form_prefix_suppressed_on_zero_value() {
        // C printf drops the alt-form prefix for `%#o`, `%#x`, `%#X`
        // when the value is 0, because the rendered digit already
        // starts with `0`. Regression guard: an earlier implementation
        // emitted `"00"` / `"0x0"` / `"0X0"` here.
        let zero = Value::Uint(0);
        let byte = byte_t();

        assert_eq!(
            format_magic_message("%#o", &zero, &byte),
            "0",
            "%#o with 0 must emit single '0', not '00'"
        );
        assert_eq!(
            format_magic_message("%#x", &zero, &byte),
            "0",
            "%#x with 0 must emit single '0', not '0x0'"
        );
        assert_eq!(
            format_magic_message("%#X", &zero, &byte),
            "0",
            "%#X with 0 must emit single '0', not '0X0'"
        );

        // Non-zero values still get the prefix.
        check("%#x", &Value::Uint(1), &byte, "0x1");
    }

    #[test]
    fn test_octal_substitution() {
        let eight = Value::Uint(8);
        check("%o", &eight, &byte_t(), "10");
        // C printf %#o uses a single "0" prefix, not Rust's "0o".
        check("%#o", &eight, &byte_t(), "010");
    }

    #[test]
    fn test_char_substitution() {
        let byte = byte_t();
        check("[%c]", &ascii(b'A'), &byte, "[A]");
        // Bytes >= 0x80 come out as Latin-1 code points
        // (0xa9 -> U+00A9 COPYRIGHT SIGN).
        check("%c", &Value::Uint(0xa9), &byte, "\u{00a9}");
        // Width with space padding, right-aligned.
        check("%3c", &ascii(b'A'), &byte, "  A");
        // Left-aligned width.
        check("%-3c|", &ascii(b'A'), &byte, "A  |");
    }

    #[test]
    fn test_char_zero_flag_ignored() {
        // POSIX ignores the `0` flag for `%c`: `%03c` must be padded
        // with spaces ("  A"), never zeros ("00A"). Regression guard:
        // an earlier revision routed `Conv::Char` through `pad_numeric`.
        assert_eq!(
            format_magic_message("%03c", &ascii(b'A'), &byte_t()),
            "  A",
            "%03c must use space-padding, not zero-padding"
        );

        // With both `-` and `0`: `0` was never active for %c, and `-`
        // still left-aligns.
        assert_eq!(
            format_magic_message("%-03c|", &ascii(b'A'), &byte_t()),
            "A  |",
            "%-03c must left-align with spaces"
        );
    }

    #[test]
    fn test_percent_escape() {
        check("100%% sure", &Value::Uint(0), &byte_t(), "100% sure");
    }

    #[test]
    fn test_non_ascii_template_preserved() {
        // Regression guard: earlier revisions pushed each template byte
        // as `b as char`, which turned UTF-8 continuation bytes into
        // Latin-1 code points and mangled the text ("café" -> "cafÃ©").
        // Plain runs must be copied as slices of the original template.
        check("café %d", &Value::Int(42), &long_t(), "café 42");
        // Non-ASCII on both sides of a specifier.
        check("→ %s ←", &Value::String("ok".into()), &byte_t(), "→ ok ←");
        // Non-ASCII only, no specifiers.
        check("über", &Value::Uint(0), &byte_t(), "über");
    }

    #[test]
    fn test_multiple_specifiers_in_one_template() {
        // Every specifier is bound to the one `value`, so repeated
        // specifiers all render the same number. This matches
        // libmagic's single-argument model -- a magic rule exposes one
        // read value.
        check("a=%d b=%d", &Value::Int(5), &long_t(), "a=5 b=5");
    }

    #[test]
    fn test_width_padding() {
        let long = long_t();
        // (template, value, expected)
        let table = [
            // Zero padding with a negative value: sign precedes zeros.
            // Regression guard for sign-aware zero-padding.
            ("%05d", -7_i64, "-0007"),
            ("%06d", -42, "-00042"),
            // Zero-padded width.
            ("%05d", 42, "00042"),
            // Space-padded width.
            ("%5d", 42, "   42"),
            // Negative with space padding: sign stays with the digits.
            ("%5d", -7, "   -7"),
            // Left-aligned (zero flag ignored when `-` is set).
            ("%-5d|", 42, "42   |"),
            // Left-aligned negative: spaces trail the body.
            ("%-6d|", -7, "-7    |"),
        ];
        for (tmpl, n, want) in table {
            check(tmpl, &Value::Int(n), &long, want);
        }
    }

    #[test]
    fn test_width_cap_prevents_large_allocation() {
        // A width above MAX_FORMAT_WIDTH is clamped silently: the value
        // is still rendered, and the padding stays bounded.
        let tmpl = format!("%{}d", usize::MAX);
        let rendered = format_magic_message(&tmpl, &Value::Int(1), &long_t());
        // After clamping, the output is at most MAX_FORMAT_WIDTH+1 chars.
        assert!(
            rendered.len() <= MAX_FORMAT_WIDTH + 1,
            "output too long: {}",
            rendered.len()
        );
        assert!(
            rendered.ends_with('1'),
            "rendered value must appear: {rendered:?}"
        );
    }

    // ---- edge cases --------------------------------------------------

    #[test]
    fn test_empty_template() {
        assert_eq!(
            format_magic_message("", &Value::Uint(0), &byte_t()),
            String::new()
        );
    }

    #[test]
    fn test_literal_with_no_specifiers() {
        check("hello world", &Value::Uint(0), &byte_t(), "hello world");
    }

    #[test]
    fn test_trailing_percent_with_no_spec() {
        // A stray `%` at end-of-string passes through literally.
        check("done %", &Value::Uint(0), &byte_t(), "done %");
    }

    #[test]
    fn test_unknown_specifier_pass_through() {
        // `%q` is not in our subset.
        check("bad %q end", &Value::Uint(0), &byte_t(), "bad %q end");
    }

    #[test]
    fn test_type_mismatch_string_conv_on_uint_still_renders() {
        // `%s` against an integer: GNU `file` prints the number in
        // decimal, and `render_string` does the same.
        check("v=%s", &Value::Uint(42), &byte_t(), "v=42");
    }

    #[test]
    fn test_type_mismatch_numeric_conv_on_string_passes_through() {
        // `%d` against a string has no sensible coercion -> literal.
        check(
            "v=%d",
            &Value::String("hi".to_string()),
            &TypeKind::String { max_length: None },
            "v=%d",
        );
    }

    #[test]
    fn test_char_specifier_accepts_full_byte_range() {
        let byte = byte_t();
        // `%c` emits every byte value 0x00..=0xff, matching GNU `file` /
        // C printf. 0xff becomes U+00FF ('ÿ'), UTF-8 0xc3 0xbf.
        check("[%c]", &Value::Uint(0xff), &byte, "[\u{00ff}]");
        // ASCII is unchanged.
        check("[%c]", &ascii(b'A'), &byte, "[A]");
        // A value that does not fit in a u8 passes through literally.
        check("[%c]", &Value::Uint(0x1_0000), &byte, "[%c]");
    }

    #[test]
    fn test_byte_width_masking_on_negative_signed_byte() {
        // Regression guard: Int(-1) on a byte type must not render with
        // all 64 bits set.
        check("%x", &Value::Int(-1), &byte_t(), "ff");
    }

    #[test]
    fn test_hex_width_masking_respects_16bit() {
        let short_t = TypeKind::Short {
            endian: crate::parser::ast::Endianness::Little,
            signed: true,
        };
        check("%x", &Value::Int(-1), &short_t, "ffff");
    }
}
libmagic_rs/output/format.rs

libmagic_rs/output/
format.rs