Skip to main content

module_info/
metadata.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4use std::env;
5
6use serde::{Deserialize, Serialize};
7
8use crate::{
9    utils::{bytes_to_linker_directives, get_cargo_toml_content, get_distro_info},
10    ModuleInfoError, ModuleInfoField, ModuleInfoResult, NOTE_ALIGN,
11};
12
13/// Package metadata for embedding in the ELF `.note.package` section.
14///
15/// `PackageMetadata` holds the raw (unsanitized) metadata values that will be
16/// serialized to JSON and byte-encoded into the linker script by
17/// [`embed_package_metadata`](crate::embed_package_metadata). Callers may
18/// either populate this struct manually in `build.rs` (e.g. to supply values
19/// from an outer build system without touching `Cargo.toml`) or use
20/// [`PackageMetadata::from_cargo_toml`] to read the current crate's metadata.
21///
22/// # Non-exhaustive + Default
23///
24/// This struct is marked `#[non_exhaustive]` and implements [`Default`] so new
25/// fields can be added in future minor releases without breaking downstream
26/// code. From outside the crate, `#[non_exhaustive]` forbids struct-literal
27/// construction; start from [`Default::default()`] and assign the fields you
28/// need:
29///
30/// ```rust,no_run
31/// # use module_info::PackageMetadata;
32/// let mut md = PackageMetadata::default();
33/// md.maintainer = "team@contoso.com".into();
34/// md.module_type = "agent".into();
35/// md.version = "1.2.3".into();
36/// md.module_version = "1.2.3.4".into();
37/// ```
38///
39/// # Disabling fields
40///
41/// Seven keys are *required* in the embedded JSON:
42/// `binary`, `version`, `moduleVersion`, `name`, `maintainer`, `os`, and
43/// `osVersion`. The remaining fields (`type`, `repo`, `branch`, `hash`,
44/// `copyright`) are optional. Leave them as the empty string and the
45/// corresponding JSON value is emitted as `""`, which downstream tooling
46/// can skip. `from_cargo_toml()` populates the `os`/`osVersion` fields
47/// from `/etc/os-release`, so most builders get them for free; override
48/// only when the detected values don't match the target platform.
49///
50/// The JSON shape stays stable (every key is always present) because the
51/// `.note.package` payload is a fixed-layout byte array built from the
52/// linker script; the empty-string-as-disabled convention keeps the layout
53/// constant while letting consumers opt out of leaking fields they don't
54/// want in the binary.
55///
56/// ```rust,no_run
57/// use module_info::PackageMetadata;
58///
59/// fn main() -> Result<(), Box<dyn std::error::Error>> {
60///     // Library crate that doesn't want to embed git or repo info:
61///     let mut md = PackageMetadata::from_cargo_toml()?;
62///     md.repo.clear();
63///     md.branch.clear();
64///     md.hash.clear();
65///     // `md` still carries binary/version/moduleVersion/name/maintainer
66///     // plus os/osVersion (auto-populated from /etc/os-release).
67///     Ok(())
68/// }
69/// ```
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct PackageMetadata {
    /// Binary name (executable or library).
    ///
    /// `from_cargo_toml` fills this from the `CARGO_PKG_NAME` environment
    /// variable, falling back to the empty string when it is unset.
    pub binary: String,

    /// Full module version (may include build number).
    ///
    /// Serialized under the JSON key `moduleVersion`. `from_cargo_toml`
    /// normalizes it to four dot-separated parts; the raw value comes from
    /// the env var named by `package.metadata.module_info.module_version_env_var_name`
    /// and falls back to the raw `version` input.
    #[serde(rename = "moduleVersion")]
    pub module_version: String,

    /// Crate version from Cargo.toml.
    ///
    /// `from_cargo_toml` normalizes it to three dot-separated parts; the raw
    /// value comes from the env var named by
    /// `package.metadata.module_info.version_env_var_name` and falls back to
    /// `CARGO_PKG_VERSION`.
    pub version: String,

    /// Maintainer contact information.
    ///
    /// `from_cargo_toml` reads `package.metadata.module_info.maintainer` and
    /// uses `"Unknown"` when the key is absent or not a string.
    pub maintainer: String,

    /// Package name.
    ///
    /// `from_cargo_toml` fills this from `CARGO_PKG_NAME`, i.e. the same
    /// value as `binary`.
    pub name: String,

    /// Module type (agent, library, executable, etc.).
    ///
    /// `from_cargo_toml` reads `package.metadata.module_info.type` and uses
    /// `"Unknown"` when the key is absent or not a string.
    #[serde(rename = "type")] // Ensure JSON uses "type" instead of "module_type"
    pub module_type: String,

    /// Git repository name (as reported by `crate::utils::get_git_info`
    /// when built through `from_cargo_toml`).
    pub repo: String,

    /// Git branch name (as reported by `crate::utils::get_git_info` when
    /// built through `from_cargo_toml`).
    pub branch: String,

    /// Git commit hash (as reported by `crate::utils::get_git_info` when
    /// built through `from_cargo_toml`).
    pub hash: String,

    /// Copyright information.
    ///
    /// `from_cargo_toml` reads `package.metadata.module_info.copyright` and
    /// uses `"Unknown"` when the key is absent or not a string.
    pub copyright: String,

    /// Operating system name (first element returned by `get_distro_info`
    /// when built through `from_cargo_toml`).
    pub os: String,

    /// Operating system version (second element returned by
    /// `get_distro_info` when built through `from_cargo_toml`).
    ///
    /// Serialized under the JSON key `osVersion`.
    #[serde(rename = "osVersion")]
    pub os_version: String,
}
112
113impl PackageMetadata {
114    /// Build a [`PackageMetadata`] by reading the current crate's `Cargo.toml`,
115    /// environment-variable overrides, git working copy, and OS release info.
116    ///
117    /// This is the zero-configuration entry point: the build script for a
118    /// normal Cargo crate can just call
119    /// [`generate_project_metadata_and_linker_script`](crate::generate_project_metadata_and_linker_script),
120    /// which uses this method under the hood. Call `from_cargo_toml` directly
121    /// only when you need to inspect or mutate the collected metadata before
122    /// passing it to [`embed_package_metadata`](crate::embed_package_metadata).
123    ///
124    /// The returned values are *unsanitized*. `embed_package_metadata` runs
125    /// the sanitize step internally so the linker-script bytes and the JSON
126    /// string agree byte-for-byte (the invariant that keeps the `.note.package`
127    /// section 4-byte aligned).
128    ///
129    /// # Errors
130    /// Returns a [`ModuleInfoError`] if `Cargo.toml` is unreadable or malformed,
131    /// if git invocation fails, or if the OS release info cannot be read.
132    pub fn from_cargo_toml() -> ModuleInfoResult<Self> {
133        collect_package_metadata()
134    }
135
136    /// Return the string value associated with a given [`ModuleInfoField`].
137    ///
138    /// This is the single source of truth mapping `ModuleInfoField` variants
139    /// to `PackageMetadata` fields. Both the linker-script emitter and the
140    /// build-time JSON dump iterate [`ModuleInfoField::ALL`] and call this.
141    #[must_use]
142    pub fn field_value(&self, field: ModuleInfoField) -> &str {
143        match field {
144            ModuleInfoField::Binary => &self.binary,
145            ModuleInfoField::Version => &self.version,
146            ModuleInfoField::ModuleVersion => &self.module_version,
147            ModuleInfoField::Maintainer => &self.maintainer,
148            ModuleInfoField::Name => &self.name,
149            ModuleInfoField::Type => &self.module_type,
150            ModuleInfoField::Repo => &self.repo,
151            ModuleInfoField::Branch => &self.branch,
152            ModuleInfoField::Hash => &self.hash,
153            ModuleInfoField::Copyright => &self.copyright,
154            ModuleInfoField::Os => &self.os,
155            ModuleInfoField::OsVersion => &self.os_version,
156        }
157    }
158}
159
160/// Look up `package.metadata.module_info.<key>` as a string slice.
161fn module_info_str<'a>(package: &'a toml::Value, key: &str) -> Option<&'a str> {
162    package
163        .get("metadata")
164        .and_then(|m| m.get("module_info"))
165        .and_then(|mi| mi.get(key))
166        .and_then(|v| v.as_str())
167}
168
169/// Normalize a dotted version string to exactly `parts` numeric components,
170/// padding missing trailing components with `0` and truncating extras.
171///
172/// SemVer-style pre-release / build-metadata suffixes (everything from the
173/// first `-` or `+`) are stripped before splitting so that Azure Pipelines
174/// build numbers like `"5.2.100.0-PullRequest-123456"` normalize cleanly
175/// to `"5.2.100.0"` (or `"5.2.100"` when `parts == 3`) instead of leaving a
176/// non-numeric tail that would fail the u16 check in
177/// `validate_module_version`.
178fn format_version_parts(version_str: &str, parts: usize) -> String {
179    // Strip pre-release / build-metadata suffix before splitting. Find the
180    // first `-` or `+` (whichever comes first) and cut there.
181    let cut = match (version_str.find('-'), version_str.find('+')) {
182        (Some(a), Some(b)) => Some(a.min(b)),
183        (Some(a), None) => Some(a),
184        (None, Some(b)) => Some(b),
185        (None, None) => None,
186    };
187    let core = match cut {
188        Some(end) => version_str.get(..end).unwrap_or(version_str),
189        None => version_str,
190    };
191    if core.len() != version_str.len() {
192        warn!(
193            "version string {:?} carries pre-release/build-metadata suffix; using numeric core {:?}",
194            version_str, core
195        );
196    }
197    // Treat an empty core as "no dot-separated numeric input" rather than
198    // "one empty component": `"".split('.')` returns `[""]`, which would
199    // otherwise make the first emitted part the empty string and produce
200    // a malformed result like ".0.0" (for parts=3) or ".0.0.0" (for parts=4).
201    // The downstream u16 validator in `validate_module_version` would then
202    // reject with a confusing "part 0 is empty" error instead of the
203    // expected "0.0.0" / "0.0.0.0" fallback.
204    let fields: Vec<&str> = if core.is_empty() {
205        Vec::new()
206    } else {
207        core.split('.').collect()
208    };
209    if fields.len() > parts {
210        // Truncation is warn-not-error for backwards compat with pipelines
211        // whose build numbers incidentally carry extra dots (e.g. a
212        // `BUILD_BUILDNUMBER` of `"1.2.3.4.5"` trimmed to `"1.2.3.4"`).
213        warn!(
214            "version string {:?} has {} dot-separated parts; truncating to {} (dropped: {:?})",
215            core,
216            fields.len(),
217            parts,
218            fields.get(parts..).map(|s| s.join(".")).unwrap_or_default()
219        );
220    }
221    // Warn early when any part overflows u16: the hard check in
222    // `validate_module_version` will still reject the value later, but the
223    // error then surfaces several call-frames deep in `embed_package_metadata`
224    // as a generic "moduleVersion part N must fit in 16 bits". Warning here
225    // points at the actual offending env var / Cargo.toml value in the CI log.
226    for (i, f) in fields.iter().take(parts).enumerate() {
227        if !f.is_empty() && f.parse::<u16>().is_err() {
228            warn!(
229                "version part {} ({:?}) in {:?} does not fit u16; downstream validate_module_version will reject this build",
230                i, f, core
231            );
232        }
233    }
234    (0..parts)
235        .map(|i| fields.get(i).copied().unwrap_or("0"))
236        .collect::<Vec<_>>()
237        .join(".")
238}
239
/// Resolve a value from an optional environment variable.
///
/// Returns the trimmed contents of `$env_var_name` when a name is given and
/// the variable holds at least one non-whitespace character. In every other
/// case (no name supplied, variable unset, value not valid UTF-8, value empty
/// or whitespace-only) `fallback` is returned unchanged.
///
/// The trim exists because CI-provided values such as `BUILD_BUILDNUMBER`
/// sometimes carry stray surrounding whitespace, which would otherwise end up
/// inside the first dot-separated version component and fail the u16 range
/// check in `validate_module_version`.
fn env_or_default(env_var_name: Option<&str>, fallback: &str) -> String {
    let name = match env_var_name {
        Some(name) => name,
        None => return fallback.to_owned(),
    };

    let lookup = env::var(name);
    if let Err(env::VarError::NotUnicode(_)) = &lookup {
        // A non-UTF8 value is treated like an unset variable instead of being
        // lossily decoded into the embedded JSON; the cargo warning keeps the
        // cause visible in the build log.
        println!(
            "cargo:warning=module_info: env var {name} contains non-UTF8 bytes; using fallback"
        );
    }

    match lookup.as_deref().map(str::trim) {
        Ok(trimmed) if !trimmed.is_empty() => trimmed.to_owned(),
        _ => fallback.to_owned(),
    }
}
271
272/// Read Cargo.toml, env vars, git, and OS release info and populate a raw
273/// (unsanitized) `PackageMetadata`.
274fn collect_package_metadata() -> ModuleInfoResult<PackageMetadata> {
275    let cargo_toml = get_cargo_toml_content()?;
276    let package = cargo_toml
277        .get("package")
278        .ok_or_else(|| ModuleInfoError::MalformedJson("No package section found".to_string()))?;
279
280    let binary_name = env::var("CARGO_PKG_NAME").unwrap_or_default();
281    let default_version = env::var("CARGO_PKG_VERSION").unwrap_or_default();
282
283    let version_env_var_name = module_info_str(package, "version_env_var_name").map(str::to_string);
284    let module_version_env_var_name =
285        module_info_str(package, "module_version_env_var_name").map(str::to_string);
286
287    // Caller-named env vars: emit rerun-if-env-changed here since
288    // `embed_package_metadata`'s fixed rerun set can't know them.
289    if let Some(name) = version_env_var_name.as_deref() {
290        println!("cargo:rerun-if-env-changed={name}");
291    }
292    if let Some(name) = module_version_env_var_name.as_deref() {
293        println!("cargo:rerun-if-env-changed={name}");
294    }
295
296    let raw_version = env_or_default(version_env_var_name.as_deref(), &default_version);
297    let version = format_version_parts(&raw_version, 3);
298    let raw_module_version = env_or_default(module_version_env_var_name.as_deref(), &raw_version);
299    let module_version = format_version_parts(&raw_module_version, 4);
300
301    let (branch, hash, repo) = crate::utils::get_git_info()?;
302
303    let maintainer = module_info_str(package, "maintainer")
304        .unwrap_or("Unknown")
305        .to_string();
306    let module_type = module_info_str(package, "type")
307        .unwrap_or("Unknown")
308        .to_string();
309    let copyright = module_info_str(package, "copyright")
310        .unwrap_or("Unknown")
311        .to_string();
312
313    let (os, os_version) = get_distro_info()?;
314
315    // Return unsanitized values; `render_note_payloads` sanitizes at emit time
316    // so both manual and `from_cargo_toml` construction paths agree byte-for-byte.
317    Ok(PackageMetadata {
318        binary: binary_name.clone(),
319        module_version,
320        version,
321        maintainer,
322        name: binary_name,
323        module_type,
324        repo,
325        branch,
326        hash,
327        copyright,
328        os,
329        os_version,
330    })
331}
332
333/// Render a [`PackageMetadata`] into the two byte-identical payloads embedded
334/// in the ELF note section: the compact JSON string (`.0`) and the
335/// linker-script body that reproduces the same bytes (`.1`).
336///
337/// Sanitization happens here. The returned JSON string and the byte-encoded
338/// linker-script body are guaranteed to agree byte-for-byte, which is what
339/// keeps the `.note.package` section 4-byte aligned.
340pub(crate) fn render_note_payloads(md: &PackageMetadata) -> ModuleInfoResult<(String, String)> {
341    // Sanitize before serialization so JSON bytes and linker bytes agree.
342    // Otherwise characters that expand/strip (`©` → `(c)`, non-ASCII) would
343    // drift padding and break 4-byte alignment of the note section.
344    let metadata = PackageMetadata {
345        binary: sanitize_for_linker_script(&md.binary),
346        module_version: sanitize_for_linker_script(&md.module_version),
347        version: sanitize_for_linker_script(&md.version),
348        maintainer: sanitize_for_linker_script(&md.maintainer),
349        name: sanitize_for_linker_script(&md.name),
350        module_type: sanitize_for_linker_script(&md.module_type),
351        repo: sanitize_for_linker_script(&md.repo),
352        branch: sanitize_for_linker_script(&md.branch),
353        hash: sanitize_for_linker_script(&md.hash),
354        copyright: sanitize_for_linker_script(&md.copyright),
355        os: sanitize_for_linker_script(&md.os),
356        os_version: sanitize_for_linker_script(&md.os_version),
357    };
358
359    // Emit JSON and linker directives in lock-step so byte counts agree.
360    // Manually emit newlines (not `serde_json::to_string`, which emits one line)
361    // so `strings`/`readelf -n` show one key:value pair per line.
362    let mut linker_script_body = String::new();
363    let mut compact_json = String::new();
364
365    // Iterate `ModuleInfoField::ALL` so the emitter stays in lock-step with the
366    // enum (exhaustive iteration surfaces missing/extra keys at compile time).
367    let entries: Vec<(&str, &str, &str)> = ModuleInfoField::ALL
368        .iter()
369        .map(|f| (f.to_key(), f.to_symbol_name(), metadata.field_value(*f)))
370        .collect();
371
372    // Derive padding from this running count, not `compact_json.len()`.
373    // Sanitized bytes *should* match `compact_json.len()` (the hard check
374    // below enforces that), but using the emitted byte count is the invariant
375    // that actually keeps `.note.package` 4-byte aligned. If sanitization
376    // ever drifts string length, the emitted count is still authoritative.
377    let mut note_payload_bytes: usize = 0;
378
379    linker_script_body.push('\n');
380    linker_script_body.push_str(&bytes_to_linker_script_format("{\n")); // '{', '\n'
381    compact_json.push_str("{\n");
382    note_payload_bytes += 2;
383    for (i, (key, symbol_name, value)) in entries.iter().enumerate() {
384        let key_json = format!("\"{key}\":");
385        let bytes_key_str = bytes_to_linker_script_format(&key_json);
386        linker_script_body.push_str(&format!("\n\n    /* Key: {key} */"));
387        linker_script_body.push_str(&format!("\n{bytes_key_str}"));
388        compact_json.push_str(&key_json);
389        note_payload_bytes += key_json.len();
390
391        // Symbol marks the value's start address; runtime extraction reads from
392        // here without parsing the JSON.
393        linker_script_body.push_str(&format!("\n    {symbol_name} = .;"));
394
395        // No `/* Value: ... */` comment: sanitized values could contain `*/`
396        // and interpolating user bytes into a C-style comment is fragile.
397        let value_json = format!("\"{value}\"");
398        let bytes_value_str = bytes_to_linker_script_format(&value_json);
399        linker_script_body.push_str(&format!("\n{bytes_value_str}"));
400        compact_json.push_str(&value_json);
401        note_payload_bytes += value_json.len();
402
403        if i < entries.len() - 1 {
404            linker_script_body.push('\n');
405            linker_script_body.push_str(&bytes_to_linker_script_format(",\n"));
406            compact_json.push_str(",\n");
407            note_payload_bytes += 2;
408        }
409    }
410
411    linker_script_body.push('\n');
412    linker_script_body.push_str(&bytes_to_linker_script_format("\n}")); // '\n', '}'
413    compact_json.push_str("\n}");
414    note_payload_bytes += 2;
415    debug!(" Compact JSON Len: {}", compact_json.len());
416
417    // Always emit 1–4 NUL bytes of padding (never 0). The lower bound is
418    // load-bearing: `extract_module_info` scans forward until `\0`, capped at
419    // `MAX_NOTE_VALUE_LEN`. Without a terminating NUL the scan runs to the cap
420    // and reads bytes past the section: harmless mid-segment, SIGSEGV at a
421    // segment tail. When `note_payload_bytes % NOTE_ALIGN == 0`, the formula
422    // below emits a full `NOTE_ALIGN` (4-byte) NUL pad, *not* 0; the section
423    // stays 4-aligned either way (+0 mod 4 vs. +4 mod 4), and the scan still
424    // terminates on the first NUL.
425    let padding_needed = NOTE_ALIGN - (note_payload_bytes % NOTE_ALIGN);
426    // Hard check (not debug_assert): mismatch here means the linker script and
427    // JSON disagree byte-for-byte, which corrupts alignment and breaks runtime
428    // extraction. Return an error (not panic) so build.rs sees a clean exit.
429    if note_payload_bytes != compact_json.len() {
430        return Err(crate::ModuleInfoError::Other(
431            format!(
432                "linker script payload size ({note_payload_bytes}) disagrees with compact_json ({}); \
433                 sanitizer and emitter drifted out of sync",
434                compact_json.len()
435            )
436            .into(),
437        ));
438    }
439
440    // Always runs; `padding_needed` is in [1, 4] by construction above.
441    linker_script_body.push_str("\n    /* Padding (always >=1 NUL so runtime scan terminates) */");
442    for _ in 0..padding_needed {
443        linker_script_body.push('\n');
444        linker_script_body.push_str(&bytes_to_linker_script_format("\0"));
445    }
446
447    debug!("Linker script body:\n{}", linker_script_body);
448    debug!("Compact JSON:\n{}", compact_json);
449    debug!("Linker script body size: {}", linker_script_body.len());
450    debug!("Compact JSON size: {}", compact_json.len());
451    debug!("Padding needed: {}", padding_needed);
452    debug!(
453        "Linker script body size after padding: {}",
454        linker_script_body.len()
455    );
456    debug!("Compact JSON size after padding: {}", compact_json.len());
457
458    Ok((compact_json, linker_script_body))
459}
460
/// Test-only convenience: collect metadata with
/// [`PackageMetadata::from_cargo_toml`] and immediately render it with
/// [`render_note_payloads`].
///
/// Kept so the `test_project_metadata` regression test can drive both stages
/// with a single call; production code goes through
/// [`crate::embed_package_metadata`] instead.
#[cfg(test)]
pub(crate) fn project_metadata() -> ModuleInfoResult<(String, String)> {
    PackageMetadata::from_cargo_toml().and_then(|md| render_note_payloads(&md))
}
470
471/// Sanitize a string so that it can be embedded verbatim in both the emitted
472/// linker script (as raw bytes) and the compact JSON metadata (via serde_json)
473/// without the two representations disagreeing in length.
474///
475/// The contract is: after sanitization, `serde_json::to_string` of the value
476/// produces the same bytes the linker will emit. That invariant keeps the
477/// `.note.package` section 4-byte aligned (padding is computed from the
478/// emitted-byte count) and keeps the JSON parseable at runtime.
479///
480/// To achieve that we strip every character that `serde_json` would escape:
481/// - `"` (would become `\"` in JSON: 2 bytes vs 1 raw byte)
482/// - `\` (would become `\\`)
483/// - any control character, including `\n`, `\r`, `\t` (would become `\n`/`\t`/`\uNNNN`)
484/// - any non-ASCII character (would be UTF-8 multi-byte; we keep the section ASCII-only)
485///
486/// We also map a few common trademark/copyright glyphs to ASCII equivalents
487/// before stripping, because those legitimately show up in `copyright` fields.
488/// These replacements *expand* the byte count (`©` = 2 UTF-8 bytes → `(c)` =
489/// 3 ASCII bytes), so a pathological copyright string full of glyphs can push
490/// the serialized JSON over [`crate::constants::MAX_JSON_SIZE`] (1 KiB) and
491/// fail validation in [`crate::embed_package_metadata`]. That's the intended
492/// outcome (the size cap is the forcing function), but if a build fails
493/// with `MetadataTooLarge`, glyph expansion is the likely cause.
494///
495/// The glyph map is intentionally minimal (©, ®, ™). Other non-ASCII glyphs
496/// that might appear in author or copyright fields (U+2014 em-dash (`—`), curly
497/// quotes (`“ ”`), section (`§`), accented Latin letters (`André`), or any
498/// CJK text) are dropped rather than transliterated. Expand the map if a
499/// legitimate field is losing characters, and update the
500/// `sanitize_strips_...` tests to match. Don't add per-language
501/// transliteration (`é` → `e`): that loses meaning silently. Prefer an ASCII
502/// spelling at the source.
503///
504/// Additionally, `*` and `/` are preserved (they appear in paths and versions),
505/// but the caller must not interpolate the sanitized string into a C-style
506/// `/* ... */` comment without escaping `*/` first.
507///
508/// # Example
509/// `"Contoso©"` → `"Contoso(c)"`; `"a\"b\nc"` → `"abc"`.
pub fn sanitize_for_linker_script(input: &str) -> String {
    // Single pass over the input. The glyph replacements emit plain ASCII
    // that contains none of the mapped glyphs, so handling them per character
    // gives the same result as replacing them one glyph at a time up front.
    let mut sanitized = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            // Common copyright/trademark glyphs get an ASCII spelling.
            '©' => sanitized.push_str("(c)"),
            '®' => sanitized.push_str("(r)"),
            '™' => sanitized.push_str("(tm)"),
            // Would be escaped inside a JSON string literal: drop.
            '"' | '\\' => {}
            // Printable ASCII (space through tilde): letters, digits, space,
            // and the remaining punctuation are kept verbatim.
            ' '..='~' => sanitized.push(ch),
            // Control characters (including DEL) and all other non-ASCII.
            _ => {}
        }
    }
    sanitized
}
535
536/// `&str` convenience wrapper over [`bytes_to_linker_directives`].
537fn bytes_to_linker_script_format(s: &str) -> String {
538    bytes_to_linker_directives(s.as_bytes())
539}
540
#[cfg(test)]
mod tests {
    use super::*;

    /// Core invariant check: the sanitized form of `raw`, wrapped in double
    /// quotes exactly as the linker-script emitter writes it, must equal what
    /// `serde_json::to_string` produces for the same sanitized value. The
    /// padding of the `.note.package` section is computed from the raw byte
    /// count, so any divergence would break the note's 4-byte alignment.
    fn assert_sanitize_json_agreement(raw: &str) {
        let sanitized = sanitize_for_linker_script(raw);
        let linker_bytes = ["\"", sanitized.as_str(), "\""].concat();
        // Serializing a plain `String` is not expected to fail; if it ever
        // does, fail loudly instead of comparing against a default value.
        let json_bytes = serde_json::to_string(&sanitized)
            .unwrap_or_else(|e| panic!("serde_json::to_string on a plain String failed: {e}"));
        assert_eq!(
            linker_bytes, json_bytes,
            "sanitized input {raw:?} produced diverging linker vs. JSON bytes"
        );
        assert_eq!(
            linker_bytes.len(),
            json_bytes.len(),
            "sanitized input {raw:?} produced diverging byte lengths"
        );
    }

    /// Shared driver for the sanitize tests: pins the exact sanitized output
    /// for `raw` and then checks the JSON/linker agreement for the same input.
    fn assert_sanitizes_to(raw: &str, expected: &str) {
        assert_eq!(sanitize_for_linker_script(raw), expected);
        assert_sanitize_json_agreement(raw);
    }

    #[test]
    fn sanitize_strips_quote_and_backslash() {
        // Both characters need an escape inside a JSON string literal, which
        // would make the JSON longer than the raw bytes; they are removed.
        assert_sanitizes_to("a\"b\\c", "abc");
    }

    #[test]
    fn sanitize_strips_control_chars() {
        // LF, CR, TAB and NUL would all be written as escape sequences in
        // JSON, so none of them may survive sanitization.
        assert_sanitizes_to("line1\nline2\r\nx\ty\0z", "line1line2xyz");
    }

    #[test]
    fn sanitize_maps_common_glyphs_to_ascii() {
        // The glyphs that realistically appear in a `copyright` field are
        // spelled out in ASCII rather than dropped.
        assert_sanitizes_to(
            "Contoso© Fabrikam® / Widgets™",
            "Contoso(c) Fabrikam(r) / Widgets(tm)",
        );
    }

    #[test]
    fn sanitize_strips_generic_non_ascii() {
        // The sanitizer's documented contract is an ASCII-only note section,
        // so non-ASCII characters without a glyph mapping (accented letters,
        // CJK) are dropped rather than transliterated.
        assert_sanitizes_to("André naïve 日本", "Andr nave ");
    }

    #[test]
    fn sanitize_preserves_star_slash_for_paths_and_versions() {
        // `*` and `/` occur in paths and version strings and are kept. The
        // emitter never places values inside `/* ... */` comments, so a `*/`
        // in a value cannot terminate a comment.
        let raw = "path/to/*.rs v1.2.3+build";
        assert_sanitizes_to(raw, raw);
    }

    #[test]
    fn sanitize_keeps_star_slash_sequence_literally() {
        // Regression guard: the literal `*/` sequence passes through intact.
        // Safety comes from the emitter not interpolating values into
        // comments, not from the sanitizer removing these characters.
        let raw = "hello*/world";
        assert_sanitizes_to(raw, raw);
    }

    #[test]
    fn sanitize_handles_empty_string() {
        assert_sanitizes_to("", "");
    }

    #[test]
    fn sanitize_handles_only_stripped_chars() {
        // Input made solely of removable characters collapses to the empty
        // string, which both outputs render as `""` (2 bytes).
        assert_sanitizes_to("\"\\\n\t日", "");
    }

    /// Azure Pipelines' `BUILD_BUILDNUMBER` may look like
    /// `"5.2.100.0-PullRequest-123456"`. The `-<suffix>` has to be removed
    /// before splitting on `.`; otherwise the last component of the 4-part
    /// `moduleVersion` fails the u16 check.
    #[test]
    fn format_version_parts_strips_semver_suffix() {
        // (input, parts, expected)
        let cases: [(&str, usize, &str); 8] = [
            // Pipeline-style suffix, 4-part and 3-part targets.
            ("5.2.100.0-PullRequest-123456", 4, "5.2.100.0"),
            ("5.2.100.0-PullRequest-123456", 3, "5.2.100"),
            // SemVer pre-release label (`-beta.N`).
            ("2.10.0-beta.3", 4, "2.10.0.0"),
            // SemVer build metadata (`+build`).
            ("3.1.4+ci.42", 3, "3.1.4"),
            // Plain numeric input passes through untouched.
            ("1.2.3.4", 4, "1.2.3.4"),
            // Short input is padded with zeros.
            ("1.2", 4, "1.2.0.0"),
            // Empty input yields the all-zero fallback, not a leading-dot
            // string such as ".0.0" that would fail downstream validation.
            ("", 3, "0.0.0"),
            ("", 4, "0.0.0.0"),
        ];
        for (input, parts, expected) in cases {
            assert_eq!(format_version_parts(input, parts), expected);
        }
    }

    #[test]
    fn sanitize_is_idempotent() {
        // Re-sanitizing already sanitized text must be a no-op. Passes 2
        // through 4 are each compared with the first pass, which would expose
        // a rewrite whose glyph expansion produces text that triggers another
        // expansion.
        //
        // Inputs cover raw glyphs that get expanded and text that already
        // looks like the expansion output (`(c)`, `(r)`, `(tm)`); the latter
        // is where a pattern-based rewrite could go wrong, whereas the
        // current literal replacement is unaffected.
        let inputs = [
            "Contoso© Fabrikam® Widgets™ / path*/here",
            "Copyright (c) Contoso (2024), (r) (tm)",
            "(c)(r)(tm) only",
            "",
        ];
        for raw in inputs {
            let once = sanitize_for_linker_script(raw);
            let _final_pass = (2..=4).fold(once.clone(), |current, pass| {
                let next = sanitize_for_linker_script(&current);
                assert_eq!(
                    next, once,
                    "sanitize pass {pass} for input {raw:?} diverged from pass 1 output {once:?}; got {next:?}"
                );
                next
            });
        }
    }
}