libmagic_rs/output/format.rs
1// Copyright (c) 2025-2026 the libmagic-rs contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Printf-style format specifier substitution for magic rule messages.
5//!
6//! Magic file messages frequently contain C-style format specifiers such as
7//! `%lld`, `%02x`, or `%s` that reference the rule's read value. GNU `file`
8//! renders the message with the value substituted at the specifier's
9//! position; without this pass libmagic-rs would emit the literal
10//! specifier tokens (e.g., `at_offset %lld`) and diverge visibly from
11//! `file(1)` output.
12//!
13//! The substitution is intentionally narrow: it supports the subset of
14//! C's `printf` syntax that appears in shipping magic corpora (notably
15//! `third_party/tests/searchbug.magic` and the GNU `file` `Magdir`
16//! collection). Unrecognized specifiers pass through literally with a
17//! `debug!` log rather than erroring -- matching the evaluator's
18//! graceful-skip discipline.
19//!
//! Width masking for hex and octal specifiers uses [`crate::parser::ast::TypeKind::bit_width`]
//! so that e.g. a signed byte holding -1 rendered with `%02x` produces the unsigned
//! 8-bit interpretation (`ff`, not `ffffffffffffffff`).
23//!
24//! See the project plan at
25//! `docs/plans/2026-04-22-001-feat-meta-type-offset-and-format-substitution-plan.md`
26//! for scope, and GOTCHAS.md S14.2 for historical context.
27
28use log::debug;
29
30use crate::parser::ast::{TypeKind, Value};
31
32/// Substitute printf-style format specifiers in a magic rule message.
33///
34/// Walks `template` left to right. Plain text is copied verbatim; on
35/// each `%`, the full specifier (`%[flags][width][.precision][length]<conv>`)
36/// is parsed and substituted from `value`. `%%` emits a single `%`.
37/// Unrecognized or malformed specifiers are passed through literally
38/// with a `debug!` log.
39///
40/// `type_kind` is consulted only for hex specifiers, which need the
41/// natural bit width of the underlying read to mask sign-extended
42/// values correctly. For non-hex specifiers `type_kind` is ignored.
43///
44/// # Examples
45///
46/// ```
47/// use libmagic_rs::output::format::format_magic_message;
48/// use libmagic_rs::parser::ast::{TypeKind, Value};
49///
50/// let out = format_magic_message(
51/// "at_offset %lld",
52/// &Value::Uint(11),
53/// &TypeKind::Byte { signed: false },
54/// );
55/// assert_eq!(out, "at_offset 11");
56///
57/// let out = format_magic_message(
58/// "followed_by 0x%02x",
59/// &Value::Uint(0x31),
60/// &TypeKind::Byte { signed: false },
61/// );
62/// assert_eq!(out, "followed_by 0x31");
63///
64/// // Unknown specifier falls through literally.
65/// let out = format_magic_message("%q", &Value::Uint(0), &TypeKind::Byte { signed: false });
66/// assert_eq!(out, "%q");
67///
68/// // `%%` is an escaped literal percent.
69/// let out = format_magic_message("100%% sure", &Value::Uint(0), &TypeKind::Byte { signed: false });
70/// assert_eq!(out, "100% sure");
71/// ```
72#[must_use]
73pub fn format_magic_message(template: &str, value: &Value, type_kind: &TypeKind) -> String {
74 let mut out = String::with_capacity(template.len());
75 let bytes = template.as_bytes();
76 let mut i = 0;
77 // Start of the most recent run of non-`%` bytes. We copy the run
78 // as a string slice rather than byte-by-byte so non-ASCII UTF-8
79 // code points survive intact. Scanning still happens at the byte
80 // level (safe because `%` is ASCII 0x25 and cannot appear as a
81 // UTF-8 continuation byte, which is always >= 0x80).
82 let mut plain_start = 0;
83
84 while i < bytes.len() {
85 if bytes[i] != b'%' {
86 i += 1;
87 continue;
88 }
89
90 // Flush any pending plain-text run as a single UTF-8 slice.
91 if plain_start < i {
92 out.push_str(&template[plain_start..i]);
93 }
94
95 // Start of a format specifier at position i.
96 let spec_start = i;
97 let Some(parsed_spec) = parse_spec(bytes, i + 1) else {
98 // Malformed specifier (e.g., trailing `%` with nothing after,
99 // or a sequence that doesn't end in a valid conversion char).
100 // Pass through the remaining literal and stop scanning.
101 debug!(
102 "format_magic_message: malformed specifier at byte {i} in template {template:?}; passing through remainder literally",
103 );
104 out.push_str(&template[i..]);
105 // Skip the trailing flush -- we have already emitted the
106 // remainder above.
107 plain_start = bytes.len();
108 break;
109 };
110 let next_i = parsed_spec.end;
111 if let Some(rendered) = render(&parsed_spec, value, type_kind) {
112 out.push_str(&rendered);
113 } else {
114 // Type mismatch or unsupported conversion; pass through the
115 // literal specifier and log.
116 let literal = &template[spec_start..next_i];
117 debug!(
118 "format_magic_message: unsupported specifier {literal:?} for value {value:?}; passing through literally",
119 );
120 out.push_str(literal);
121 }
122 i = next_i;
123 plain_start = i;
124 }
125
126 // Flush any trailing plain-text run.
127 if plain_start < bytes.len() {
128 out.push_str(&template[plain_start..]);
129 }
130
131 out
132}
133
/// Kinds of conversion characters we recognize.
///
/// `parse_spec` maps the final byte of a specifier to one of these
/// variants; `render` dispatches on the variant to pick the coercion and
/// padding rules. Length modifiers (`l`, `ll`, `h`, ...) are consumed by
/// the parser and do not influence which variant is chosen.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Conv {
    /// `%d`, `%i`, `%ld`, `%lld` -- signed decimal.
    Signed,
    /// `%u`, `%lu`, `%llu` -- unsigned decimal.
    Unsigned,
    /// `%x` -- lowercase hex (masked to the read's bit width when rendered).
    HexLower,
    /// `%X` -- uppercase hex (masked to the read's bit width when rendered).
    HexUpper,
    /// `%o` -- octal (masked to the read's bit width when rendered).
    Octal,
    /// `%s` -- string.
    Str,
    /// `%c` -- single character (full 0x00-0xff byte range via Latin-1 code points).
    Char,
    /// `%%` -- literal percent.
    Percent,
}
154
/// Parsed format specifier.
///
/// Built by `parse_spec`. Precision and length modifiers are consumed
/// during parsing but not recorded here because rendering never uses them.
#[derive(Debug, Clone)]
struct Spec {
    /// `0` flag was present: pad numeric conversions with zeros.
    zero_pad: bool,
    /// `-` flag was present: pad on the right instead of the left.
    left_align: bool,
    /// `#` flag was present: request the alternate form (`0x` / `0X` / `0` prefix).
    alt_form: bool,
    /// Minimum field width; `0` when the specifier has no width. Never
    /// exceeds `MAX_FORMAT_WIDTH`.
    width: usize,
    /// Conversion selected by the specifier's final character.
    conv: Conv,
    /// Byte index of the character *after* this specifier in the template.
    end: usize,
}
166
/// Maximum width value accepted from a format specifier.
///
/// Caps the field width to prevent crafted magic rules with enormous widths
/// (e.g., `%999999999d`) from driving unbounded `repeat_n` allocations in the
/// padding helpers. 4096 is generous for any real magic-corpus usage.
///
/// The cap is applied by `parse_spec` while it accumulates the width digits,
/// so larger widths are clamped to this value rather than rejected.
const MAX_FORMAT_WIDTH: usize = 4096;
173
174/// Parse a format specifier starting at `start` (the first byte after the
175/// leading `%`). Returns `None` if the sequence does not end in a
176/// recognized conversion character.
177fn parse_spec(bytes: &[u8], start: usize) -> Option<Spec> {
178 let mut i = start;
179 let mut zero_pad = false;
180 let mut left_align = false;
181 let mut alt_form = false;
182
183 // Flags (subset: 0, -, #). Other flags (+, space) are parsed but ignored.
184 while i < bytes.len() {
185 match bytes[i] {
186 b'0' => {
187 zero_pad = true;
188 i += 1;
189 }
190 b'-' => {
191 left_align = true;
192 i += 1;
193 }
194 b'#' => {
195 alt_form = true;
196 i += 1;
197 }
198 b'+' | b' ' => {
199 // Accepted for syntactic completeness, no rendering effect
200 // in the current subset.
201 i += 1;
202 }
203 _ => break,
204 }
205 }
206
207 // Width (decimal digits). Capped at MAX_FORMAT_WIDTH to prevent
208 // unbounded allocations from crafted format strings.
209 let mut width: usize = 0;
210 while i < bytes.len() && bytes[i].is_ascii_digit() {
211 let digit = (bytes[i] - b'0') as usize;
212 width = width
213 .saturating_mul(10)
214 .saturating_add(digit)
215 .min(MAX_FORMAT_WIDTH);
216 i += 1;
217 }
218
219 // Precision (`.<digits>`): parsed and skipped -- no current consumer
220 // requires precision handling, and numeric rendering is whole-value.
221 if i < bytes.len() && bytes[i] == b'.' {
222 i += 1;
223 while i < bytes.len() && bytes[i].is_ascii_digit() {
224 i += 1;
225 }
226 }
227
228 // Length modifier (`h`, `hh`, `l`, `ll`, `j`, `z`, `t`). We consume
229 // these for syntactic completeness but never rely on them -- all
230 // numeric rendering uses full u64/i64 width.
231 while i < bytes.len() {
232 match bytes[i] {
233 b'l' | b'h' | b'j' | b'z' | b't' => i += 1,
234 _ => break,
235 }
236 }
237
238 if i >= bytes.len() {
239 return None;
240 }
241
242 let conv = match bytes[i] {
243 b'd' | b'i' => Conv::Signed,
244 b'u' => Conv::Unsigned,
245 b'x' => Conv::HexLower,
246 b'X' => Conv::HexUpper,
247 b'o' => Conv::Octal,
248 b's' => Conv::Str,
249 b'c' => Conv::Char,
250 b'%' => Conv::Percent,
251 _ => return None,
252 };
253 i += 1;
254
255 Some(Spec {
256 zero_pad,
257 left_align,
258 alt_form,
259 width,
260 conv,
261 end: i,
262 })
263}
264
265/// Render the specifier against `value`, or return `None` if the value
266/// is type-incompatible with the conversion.
267fn render(spec: &Spec, value: &Value, type_kind: &TypeKind) -> Option<String> {
268 match spec.conv {
269 Conv::Percent => Some("%".to_string()),
270 Conv::Str => Some(render_string(value)),
271 Conv::Signed => {
272 let n = coerce_to_i64(value)?;
273 Some(pad_numeric(&n.to_string(), spec))
274 }
275 Conv::Unsigned => {
276 let n = coerce_to_u64(value)?;
277 Some(pad_numeric(&n.to_string(), spec))
278 }
279 Conv::HexLower => {
280 let n = coerce_to_u64_masked(value, type_kind)?;
281 // C printf suppresses the `0x`/`0X` alt-form prefix when the
282 // value is zero: `printf("%#x", 0)` emits `"0"`, not `"0x0"`.
283 let prefix = if spec.alt_form && n != 0 { "0x" } else { "" };
284 Some(render_prefixed_int(&format!("{n:x}"), prefix, spec))
285 }
286 Conv::HexUpper => {
287 let n = coerce_to_u64_masked(value, type_kind)?;
288 let prefix = if spec.alt_form && n != 0 { "0X" } else { "" };
289 Some(render_prefixed_int(&format!("{n:X}"), prefix, spec))
290 }
291 Conv::Octal => {
292 let n = coerce_to_u64_masked(value, type_kind)?;
293 // C printf uses a single "0" prefix for %#o (not Rust's "0o"),
294 // and suppresses the prefix when the value itself is zero --
295 // the resulting digit `0` already satisfies the "starts with
296 // 0" invariant that the alt-form is meant to guarantee.
297 let prefix = if spec.alt_form && n != 0 { "0" } else { "" };
298 Some(render_prefixed_int(&format!("{n:o}"), prefix, spec))
299 }
300 Conv::Char => {
301 let n = coerce_to_u64(value)?;
302 let byte = u8::try_from(n).ok()?;
303 // GNU `file` / C printf `%c` converts the int argument to
304 // unsigned char and emits it directly for all byte values
305 // 0x00-0xff. Rust's `String` must be valid UTF-8, so we
306 // embed bytes >= 0x80 as their Latin-1 code points (U+0080
307 // through U+00FF) via `char::from(u8)` which is infallible
308 // and lossless. Consumers with UTF-8 terminals see the
309 // 2-byte UTF-8 encoding of that code point; consumers
310 // iterating the returned bytes directly can recover the
311 // original byte by re-encoding the code point as Latin-1.
312 //
313 // POSIX: the `0` flag is ignored for `%c` -- zero-padding only
314 // applies to numeric/float conversions. Always use space-padding
315 // for `%c`, matching C printf behavior.
316 Some(pad_non_numeric(&char::from(byte).to_string(), spec))
317 }
318 }
319}
320
321/// Render a [`Value`] for `%s`. Strings pass through; byte sequences are
322/// converted via lossy UTF-8; numbers render as decimal (GNU `file` does
323/// the same for mixed-type `%s` substitutions).
324fn render_string(value: &Value) -> String {
325 match value {
326 Value::String(s) => s.clone(),
327 Value::Bytes(b) => String::from_utf8_lossy(b).into_owned(),
328 Value::Uint(n) => n.to_string(),
329 Value::Int(n) => n.to_string(),
330 Value::Float(f) => f.to_string(),
331 }
332}
333
334/// Coerce a numeric-ish [`Value`] to `i64`. Float values are truncated
335/// toward zero (documented intent -- matches C's `(long long)float`
336/// semantics that libmagic's `printf` path relies on). String/Bytes
337/// values have no sensible mapping and return `None`.
338#[allow(
339 clippy::cast_possible_truncation,
340 clippy::cast_sign_loss,
341 clippy::cast_possible_wrap
342)]
343fn coerce_to_i64(value: &Value) -> Option<i64> {
344 match value {
345 Value::Int(n) => Some(*n),
346 // u64 -> i64 bit-pattern reinterpret: matches C's implicit
347 // cast in `printf("%lld", (unsigned long long)...)`.
348 Value::Uint(n) => Some(*n as i64),
349 // f64 -> i64 truncation toward zero, matching C behavior for
350 // `printf("%d", (double)...)`.
351 Value::Float(f) => Some(*f as i64),
352 Value::String(_) | Value::Bytes(_) => None,
353 }
354}
355
356/// Coerce a numeric-ish [`Value`] to `u64`. Mirrors [`coerce_to_i64`]
357/// but preserves the unsigned bit pattern when the source is signed.
358#[allow(
359 clippy::cast_possible_truncation,
360 clippy::cast_sign_loss,
361 clippy::cast_precision_loss
362)]
363fn coerce_to_u64(value: &Value) -> Option<u64> {
364 match value {
365 Value::Uint(n) => Some(*n),
366 // i64 -> u64 bit-pattern reinterpret for rendering; parallels
367 // the `coerce_to_i64` case.
368 Value::Int(n) => Some(*n as u64),
369 Value::Float(f) => Some(*f as u64),
370 Value::String(_) | Value::Bytes(_) => None,
371 }
372}
373
374/// Coerce a numeric-ish [`Value`] to `u64`, masked to the natural bit
375/// width of `type_kind`. Used by hex/octal specifiers to avoid
376/// surprising sign-extended renderings like `byte = -1` rendering as
377/// `ffffffffffffffff` when the user expected `ff`.
378fn coerce_to_u64_masked(value: &Value, type_kind: &TypeKind) -> Option<u64> {
379 let raw = coerce_to_u64(value)?;
380 let mask = match type_kind.bit_width() {
381 Some(8) => 0xff_u64,
382 Some(16) => 0xffff_u64,
383 Some(32) => 0xffff_ffff_u64,
384 // 64-bit, unknown width, or any other case: no mask needed.
385 _ => return Some(raw),
386 };
387 Some(raw & mask)
388}
389
390/// Render a numeric body with an alt-form prefix (`0x` / `0o` / empty),
391/// applying width and padding correctly.
392///
393/// For zero-padded widths (`%#0Nx`), C printf inserts zeros *between*
394/// the prefix and the digits: `%#06x` + `0xab` -> `0x00ab`, not
395/// ` 0xab`. For space-padded widths (`%#Nx`), the spaces go *before*
396/// the prefix: `%#6x` + `0xab` -> ` 0xab`. For left-aligned widths
397/// (`%-#6x`), trailing spaces follow the digits: `0xab `.
398fn render_prefixed_int(digits: &str, prefix: &str, spec: &Spec) -> String {
399 // The effective body length for width comparison is prefix + digits.
400 let body_len = prefix.len() + digits.len();
401 if body_len >= spec.width {
402 return format!("{prefix}{digits}");
403 }
404 let pad = spec.width - body_len;
405 if spec.zero_pad && !spec.left_align {
406 // Zeros insert between the prefix and the digits.
407 let zeros: String = std::iter::repeat_n('0', pad).collect();
408 format!("{prefix}{zeros}{digits}")
409 } else if spec.left_align {
410 let spaces: String = std::iter::repeat_n(' ', pad).collect();
411 format!("{prefix}{digits}{spaces}")
412 } else {
413 let spaces: String = std::iter::repeat_n(' ', pad).collect();
414 format!("{spaces}{prefix}{digits}")
415 }
416}
417
418/// Apply width and alignment to a non-numeric rendered body using space-only padding.
419///
420/// Used for `%c` (and any other non-numeric conversion where the POSIX `0` flag
421/// must be ignored). Zero-padding is not applied regardless of `spec.zero_pad`.
422fn pad_non_numeric(body: &str, spec: &Spec) -> String {
423 if body.len() >= spec.width {
424 return body.to_string();
425 }
426 let pad = spec.width - body.len();
427 let padding: String = std::iter::repeat_n(' ', pad).collect();
428 if spec.left_align {
429 format!("{body}{padding}")
430 } else {
431 format!("{padding}{body}")
432 }
433}
434
435/// Apply width and padding to an already-rendered numeric body.
436///
437/// For zero-padded right-aligned formatting, a leading `-` sign is kept at
438/// the front while zeros are inserted between the sign and the magnitude
439/// digits -- matching C printf semantics (e.g., `%05d` with `-7` → `-0007`,
440/// not `000-7`).
441fn pad_numeric(body: &str, spec: &Spec) -> String {
442 if body.len() >= spec.width {
443 return body.to_string();
444 }
445 // C printf sign-aware zero-padding: sign goes before the zeros.
446 if spec.zero_pad
447 && !spec.left_align
448 && let Some(digits) = body.strip_prefix('-')
449 {
450 let needed = spec.width.saturating_sub(1 + digits.len());
451 if needed == 0 {
452 return body.to_string();
453 }
454 let zeros: String = std::iter::repeat_n('0', needed).collect();
455 return format!("-{zeros}{digits}");
456 }
457 let pad = spec.width - body.len();
458 let pad_char = if spec.zero_pad && !spec.left_align {
459 '0'
460 } else {
461 ' '
462 };
463 let padding: String = std::iter::repeat_n(pad_char, pad).collect();
464 if spec.left_align {
465 format!("{body}{padding}")
466 } else {
467 format!("{padding}{body}")
468 }
469}
470
#[cfg(test)]
mod tests {
    //! Unit tests for printf-style substitution in magic messages.
    //!
    //! Expected strings that contain padding are annotated with the number
    //! of spaces they hold, because runs of spaces inside a literal are easy
    //! to damage without anyone noticing.

    use super::*;

    /// Unsigned 8-bit read; hex/octal output is masked to 8 bits.
    fn byte_t() -> TypeKind {
        TypeKind::Byte { signed: false }
    }

    /// Signed little-endian `Long` read, used where byte masking would get in the way.
    fn long_t() -> TypeKind {
        TypeKind::Long {
            endian: crate::parser::ast::Endianness::Little,
            signed: true,
        }
    }

    // ---- happy path --------------------------------------------------

    #[test]
    fn test_signed_decimal_substitution() {
        // Covers %d, %i, %ld, %lld (length modifiers are accepted and ignored).
        let cases = [
            ("v=%d", Value::Int(-7), "v=-7"),
            ("v=%i", Value::Int(42), "v=42"),
            ("v=%ld", Value::Int(10), "v=10"),
            ("at_offset %lld", Value::Uint(11), "at_offset 11"),
        ];
        for (tmpl, val, expected) in cases {
            assert_eq!(
                format_magic_message(tmpl, &val, &byte_t()),
                expected,
                "template {tmpl:?} with value {val:?}",
            );
        }
    }

    #[test]
    fn test_unsigned_decimal_substitution() {
        let out = format_magic_message("n=%u", &Value::Uint(200), &byte_t());
        assert_eq!(out, "n=200");

        // i64::MIN as unsigned should come through as 2^63.
        let out = format_magic_message("n=%llu", &Value::Int(i64::MIN), &long_t());
        assert_eq!(out, "n=9223372036854775808");
    }

    #[test]
    fn test_hex_substitution_with_byte_width_masking() {
        // The canonical searchbug.result case: ubyte `%02x`.
        let out = format_magic_message("0x%02x", &Value::Uint(0x31), &byte_t());
        assert_eq!(out, "0x31");

        // Byte -1 (sign-extended to u64::MAX in Value::Int) must render as "ff",
        // not "ffffffffffffffff", when the underlying type is a byte.
        let out = format_magic_message("0x%02x", &Value::Int(-1), &byte_t());
        assert_eq!(out, "0xff");

        // %X is uppercase.
        let out = format_magic_message("%X", &Value::Uint(0xdead_beef), &long_t());
        assert_eq!(out, "DEADBEEF");

        // %#x emits the "0x" prefix via alt form.
        let out = format_magic_message("%#x", &Value::Uint(0xab), &byte_t());
        assert_eq!(out, "0xab");

        // %#06x: zero-pad inserts between prefix and digits (C printf semantics),
        // not before the prefix. Regression guard for correctness review COR-002.
        let out = format_magic_message("%#06x", &Value::Uint(0xab), &byte_t());
        assert_eq!(out, "0x00ab");

        // Space-padded width with alt-form prefix: spaces go before prefix.
        // Width 6 minus the 4-char body leaves 2 leading spaces.
        let out = format_magic_message("%#6x", &Value::Uint(0xab), &byte_t());
        assert_eq!(out, "  0xab");

        // Left-aligned with alt-form prefix: spaces trail the digits.
        // 2 trailing spaces before the `|` sentinel.
        let out = format_magic_message("%-#6x|", &Value::Uint(0xab), &byte_t());
        assert_eq!(out, "0xab  |");

        // %#08o: zero-pad inserts between C-style "0" prefix and digits.
        // C printf uses a single "0" prefix for %#o (not Rust's "0o").
        let out = format_magic_message("%#08o", &Value::Uint(8), &byte_t());
        assert_eq!(out, "00000010");

        // %#X: uppercase alt-form uses "0X" prefix to match the specifier case.
        let out = format_magic_message("%#X", &Value::Uint(0xab), &byte_t());
        assert_eq!(out, "0XAB");
    }

    #[test]
    fn test_string_substitution() {
        let out = format_magic_message(
            "hello %s",
            &Value::String("world".to_string()),
            &TypeKind::String { max_length: None },
        );
        assert_eq!(out, "hello world");

        // Bytes go through lossy UTF-8.
        let out = format_magic_message(
            "data=%s",
            &Value::Bytes(b"abc".to_vec()),
            &TypeKind::String { max_length: None },
        );
        assert_eq!(out, "data=abc");
    }

    #[test]
    fn test_alt_form_prefix_suppressed_on_zero_value() {
        // C printf special-cases `%#o`, `%#x`, `%#X` with value 0: the
        // alt-form prefix is suppressed because the rendered digit
        // already begins with `0`. Regression guard after pr-review
        // caught that our implementation emitted `"00"` / `"0x0"` /
        // `"0X0"` for zero values.
        let out = format_magic_message("%#o", &Value::Uint(0), &byte_t());
        assert_eq!(out, "0", "%#o with 0 must emit single '0', not '00'");

        let out = format_magic_message("%#x", &Value::Uint(0), &byte_t());
        assert_eq!(out, "0", "%#x with 0 must emit single '0', not '0x0'");

        let out = format_magic_message("%#X", &Value::Uint(0), &byte_t());
        assert_eq!(out, "0", "%#X with 0 must emit single '0', not '0X0'");

        // Non-zero values still get the prefix.
        let out = format_magic_message("%#x", &Value::Uint(1), &byte_t());
        assert_eq!(out, "0x1");
    }

    #[test]
    fn test_octal_substitution() {
        let out = format_magic_message("%o", &Value::Uint(8), &byte_t());
        assert_eq!(out, "10");
        // C printf %#o uses a single "0" prefix, not Rust's "0o".
        let out = format_magic_message("%#o", &Value::Uint(8), &byte_t());
        assert_eq!(out, "010");
    }

    #[test]
    fn test_char_substitution() {
        let out = format_magic_message("[%c]", &Value::Uint(u64::from(b'A')), &byte_t());
        assert_eq!(out, "[A]");

        // Full 0x00-0xff range: bytes >= 0x80 are embedded as Latin-1 code points.
        let out = format_magic_message("%c", &Value::Uint(0xa9), &byte_t());
        assert_eq!(out, "\u{00a9}"); // U+00A9 COPYRIGHT SIGN

        // Width with space-padding (right-aligned): 2 leading spaces.
        let out = format_magic_message("%3c", &Value::Uint(u64::from(b'A')), &byte_t());
        assert_eq!(out, "  A");

        // Left-aligned width: 2 trailing spaces before the `|` sentinel.
        let out = format_magic_message("%-3c|", &Value::Uint(u64::from(b'A')), &byte_t());
        assert_eq!(out, "A  |");
    }

    #[test]
    fn test_char_zero_flag_ignored() {
        // POSIX: the `0` flag is ignored for `%c` -- zero-padding applies only to
        // numeric conversions. `%03c` must produce space-padded "  A" (2 leading
        // spaces), not "00A".
        // Regression guard: an earlier revision called `pad_numeric` for `Conv::Char`,
        // which applied zero-padding and diverged from C printf semantics.
        let out = format_magic_message("%03c", &Value::Uint(u64::from(b'A')), &byte_t());
        assert_eq!(out, "  A", "%03c must use space-padding, not zero-padding");

        // Combined zero and left-align: `-` overrides `0` for numerics; for %c
        // `0` was never active, but `-` still triggers left-alignment
        // (2 trailing spaces before the `|` sentinel).
        let out = format_magic_message("%-03c|", &Value::Uint(u64::from(b'A')), &byte_t());
        assert_eq!(out, "A  |", "%-03c must left-align with spaces");
    }

    #[test]
    fn test_percent_escape() {
        let out = format_magic_message("100%% sure", &Value::Uint(0), &byte_t());
        assert_eq!(out, "100% sure");
    }

    #[test]
    fn test_non_ascii_template_preserved() {
        // Regression guard: earlier revisions iterated by byte and
        // pushed each `b as char`, which re-encoded non-ASCII UTF-8
        // continuation bytes as Latin-1 code points and mangled the
        // output (e.g., "café" -> "cafÃ©"). The plain-run flush path
        // must copy slices of the original template to preserve the
        // original UTF-8 byte sequences.
        let out = format_magic_message("café %d", &Value::Int(42), &long_t());
        assert_eq!(out, "café 42");

        // Non-ASCII around a specifier on both sides.
        let out = format_magic_message("→ %s ←", &Value::String("ok".into()), &byte_t());
        assert_eq!(out, "→ ok ←");

        // Non-ASCII only, no specifiers.
        let out = format_magic_message("über", &Value::Uint(0), &byte_t());
        assert_eq!(out, "über");
    }

    #[test]
    fn test_multiple_specifiers_in_one_template() {
        // Note: current implementation binds every specifier to the single
        // `value`; multiple specifiers are rendered against the same value.
        // This matches libmagic's single-argument model -- magic rules only
        // expose one read value per rule.
        let out = format_magic_message("a=%d b=%d", &Value::Int(5), &long_t());
        assert_eq!(out, "a=5 b=5");
    }

    #[test]
    fn test_width_padding() {
        // Zero-padded width with negative value: sign must precede zeros.
        // Regression guard for sign-aware zero-padding (C printf semantics).
        let out = format_magic_message("%05d", &Value::Int(-7), &long_t());
        assert_eq!(out, "-0007");
        let out = format_magic_message("%06d", &Value::Int(-42), &long_t());
        assert_eq!(out, "-00042");
        // Zero-padded width.
        let out = format_magic_message("%05d", &Value::Int(42), &long_t());
        assert_eq!(out, "00042");
        // Space-padded width: 3 leading spaces.
        let out = format_magic_message("%5d", &Value::Int(42), &long_t());
        assert_eq!(out, "   42");
        // Negative with space-padding: sign stays in the body, 3 spaces lead.
        let out = format_magic_message("%5d", &Value::Int(-7), &long_t());
        assert_eq!(out, "   -7");
        // Left-aligned (zero flag ignored when `-` is set): 3 trailing spaces.
        let out = format_magic_message("%-5d|", &Value::Int(42), &long_t());
        assert_eq!(out, "42   |");
        // Left-aligned negative: body left-aligned, 4 spaces trail.
        let out = format_magic_message("%-6d|", &Value::Int(-7), &long_t());
        assert_eq!(out, "-7    |");
    }

    #[test]
    fn test_width_cap_prevents_large_allocation() {
        // A width larger than MAX_FORMAT_WIDTH must be silently clamped.
        // The output should be valid (the value rendered, possibly padded)
        // rather than triggering a huge allocation.
        let huge_width = format!("%{}d", usize::MAX);
        let out = format_magic_message(&huge_width, &Value::Int(1), &long_t());
        // After clamping, the output is at most MAX_FORMAT_WIDTH+1 chars.
        assert!(
            out.len() <= MAX_FORMAT_WIDTH + 1,
            "output too long: {}",
            out.len()
        );
        assert!(out.ends_with('1'), "rendered value must appear: {out:?}");
    }

    // ---- edge cases --------------------------------------------------

    #[test]
    fn test_empty_template() {
        assert_eq!(
            format_magic_message("", &Value::Uint(0), &byte_t()),
            String::new()
        );
    }

    #[test]
    fn test_literal_with_no_specifiers() {
        assert_eq!(
            format_magic_message("hello world", &Value::Uint(0), &byte_t()),
            "hello world"
        );
    }

    #[test]
    fn test_trailing_percent_with_no_spec() {
        // A stray `%` at end-of-string: pass through literally.
        let out = format_magic_message("done %", &Value::Uint(0), &byte_t());
        assert_eq!(out, "done %");
    }

    #[test]
    fn test_unknown_specifier_pass_through() {
        // `%q` is not in our subset.
        let out = format_magic_message("bad %q end", &Value::Uint(0), &byte_t());
        assert_eq!(out, "bad %q end");
    }

    #[test]
    fn test_type_mismatch_string_conv_on_uint_still_renders() {
        // `%s` against an integer value -- GNU `file` renders the number
        // as decimal; libmagic-rs matches that behavior via `render_string`.
        let out = format_magic_message("v=%s", &Value::Uint(42), &byte_t());
        assert_eq!(out, "v=42");
    }

    #[test]
    fn test_type_mismatch_numeric_conv_on_string_passes_through() {
        // `%d` against a string has no sensible coercion -> literal.
        let out = format_magic_message(
            "v=%d",
            &Value::String("hi".to_string()),
            &TypeKind::String { max_length: None },
        );
        assert_eq!(out, "v=%d");
    }

    #[test]
    fn test_char_specifier_accepts_full_byte_range() {
        // `%c` emits every byte value 0x00..=0xff directly, matching
        // GNU `file` / C printf semantics. Bytes 0x80-0xff are embedded
        // as their Latin-1 code points via `char::from(u8)`.
        // 0xff maps to U+00FF ('ÿ'); UTF-8 encoding is 0xc3 0xbf.
        let out = format_magic_message("[%c]", &Value::Uint(0xff), &byte_t());
        assert_eq!(out, "[\u{00ff}]");

        // ASCII boundary stays unchanged.
        let out = format_magic_message("[%c]", &Value::Uint(u64::from(b'A')), &byte_t());
        assert_eq!(out, "[A]");

        // Out-of-range (doesn't fit u8) passes through literally.
        let out = format_magic_message("[%c]", &Value::Uint(0x1_0000), &byte_t());
        assert_eq!(out, "[%c]");
    }

    #[test]
    fn test_byte_width_masking_on_negative_signed_byte() {
        // Regression guard: a signed byte carrying -1 (the representation
        // on the Value side is Int(-1)) must NOT render as a 64-bit mask.
        let out = format_magic_message("%x", &Value::Int(-1), &byte_t());
        assert_eq!(out, "ff");
    }

    #[test]
    fn test_hex_width_masking_respects_16bit() {
        let short_t = TypeKind::Short {
            endian: crate::parser::ast::Endianness::Little,
            signed: true,
        };
        let out = format_magic_message("%x", &Value::Int(-1), &short_t);
        assert_eq!(out, "ffff");
    }
}