module_info/metadata.rs
1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4use std::env;
5
6use serde::{Deserialize, Serialize};
7
8use crate::{
9 utils::{bytes_to_linker_directives, get_cargo_toml_content, get_distro_info},
10 ModuleInfoError, ModuleInfoField, ModuleInfoResult, NOTE_ALIGN,
11};
12
/// Package metadata for embedding in the ELF `.note.package` section.
///
/// `PackageMetadata` holds the raw (unsanitized) metadata values that will be
/// serialized to JSON and byte-encoded into the linker script by
/// [`embed_package_metadata`](crate::embed_package_metadata). Callers may
/// either populate this struct manually in `build.rs` (e.g. to supply values
/// from an outer build system without touching `Cargo.toml`) or use
/// [`PackageMetadata::from_cargo_toml`] to read the current crate's metadata.
///
/// # Non-exhaustive + Default
///
/// This struct is marked `#[non_exhaustive]` and implements [`Default`] so new
/// fields can be added in future minor releases without breaking downstream
/// code. From outside the crate, `#[non_exhaustive]` forbids struct-literal
/// construction; start from [`Default::default()`] and assign the fields you
/// need:
///
/// ```rust,no_run
/// # use module_info::PackageMetadata;
/// let mut md = PackageMetadata::default();
/// md.maintainer = "team@contoso.com".into();
/// md.module_type = "agent".into();
/// md.version = "1.2.3".into();
/// md.module_version = "1.2.3.4".into();
/// ```
///
/// # Disabling fields
///
/// Seven keys are *required* in the embedded JSON:
/// `binary`, `version`, `moduleVersion`, `name`, `maintainer`, `os`, and
/// `osVersion`. The remaining fields (`type`, `repo`, `branch`, `hash`,
/// `copyright`) are optional. Leave them as the empty string and the
/// corresponding JSON value is emitted as `""`, which downstream tooling
/// can skip. `from_cargo_toml()` populates the `os`/`osVersion` fields
/// from `/etc/os-release`, so most builders get them for free; override
/// only when the detected values don't match the target platform.
///
/// The JSON shape stays stable (every key is always present) because the
/// `.note.package` payload is a fixed-layout byte array built from the
/// linker script; the empty-string-as-disabled convention keeps the layout
/// constant while letting consumers opt out of leaking fields they don't
/// want in the binary.
///
/// ```rust,no_run
/// use module_info::PackageMetadata;
///
/// fn main() -> Result<(), Box<dyn std::error::Error>> {
///     // Library crate that doesn't want to embed git or repo info:
///     let mut md = PackageMetadata::from_cargo_toml()?;
///     md.repo.clear();
///     md.branch.clear();
///     md.hash.clear();
///     // `md` still carries binary/version/moduleVersion/name/maintainer
///     // plus os/osVersion (auto-populated from /etc/os-release).
///     Ok(())
/// }
/// ```
///
/// # Field order
///
/// The declaration order below is also the order serde uses when this struct
/// is serialized directly, so do not reorder fields casually.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct PackageMetadata {
    /// Binary name (executable or library).
    ///
    /// `from_cargo_toml` fills this from the `CARGO_PKG_NAME` environment
    /// variable (empty string when the variable is unavailable).
    pub binary: String,

    /// Full module version (may include build number).
    ///
    /// Serialized as `moduleVersion`. `from_cargo_toml` normalizes it to four
    /// dot-separated parts.
    #[serde(rename = "moduleVersion")]
    pub module_version: String,

    /// Crate version from Cargo.toml.
    ///
    /// `from_cargo_toml` normalizes it to three dot-separated parts, taking
    /// the value from a caller-named env var when one is configured and from
    /// `CARGO_PKG_VERSION` otherwise.
    pub version: String,

    /// Maintainer contact information.
    ///
    /// `from_cargo_toml` reads `package.metadata.module_info.maintainer` and
    /// falls back to `"Unknown"` when the key is absent.
    pub maintainer: String,

    /// Package name.
    ///
    /// `from_cargo_toml` sets this to the same `CARGO_PKG_NAME` value as
    /// [`binary`](Self::binary).
    pub name: String,

    /// Module type (agent, library, executable, etc.).
    ///
    /// `from_cargo_toml` reads `package.metadata.module_info.type` and falls
    /// back to `"Unknown"` when the key is absent.
    #[serde(rename = "type")] // JSON key is "type"; `type` is a Rust keyword, hence `module_type`
    pub module_type: String,

    /// Git repository name (optional; empty string disables it).
    pub repo: String,

    /// Git branch name (optional; empty string disables it).
    pub branch: String,

    /// Git commit hash (optional; empty string disables it).
    pub hash: String,

    /// Copyright information.
    ///
    /// `from_cargo_toml` reads `package.metadata.module_info.copyright` and
    /// falls back to `"Unknown"` when the key is absent.
    pub copyright: String,

    /// Operating system name.
    pub os: String,

    /// Operating system version. Serialized as `osVersion`.
    #[serde(rename = "osVersion")]
    pub os_version: String,
}
112
113impl PackageMetadata {
114 /// Build a [`PackageMetadata`] by reading the current crate's `Cargo.toml`,
115 /// environment-variable overrides, git working copy, and OS release info.
116 ///
117 /// This is the zero-configuration entry point: the build script for a
118 /// normal Cargo crate can just call
119 /// [`generate_project_metadata_and_linker_script`](crate::generate_project_metadata_and_linker_script),
120 /// which uses this method under the hood. Call `from_cargo_toml` directly
121 /// only when you need to inspect or mutate the collected metadata before
122 /// passing it to [`embed_package_metadata`](crate::embed_package_metadata).
123 ///
124 /// The returned values are *unsanitized*. `embed_package_metadata` runs
125 /// the sanitize step internally so the linker-script bytes and the JSON
126 /// string agree byte-for-byte (the invariant that keeps the `.note.package`
127 /// section 4-byte aligned).
128 ///
129 /// # Errors
130 /// Returns a [`ModuleInfoError`] if `Cargo.toml` is unreadable or malformed,
131 /// if git invocation fails, or if the OS release info cannot be read.
132 pub fn from_cargo_toml() -> ModuleInfoResult<Self> {
133 collect_package_metadata()
134 }
135
136 /// Return the string value associated with a given [`ModuleInfoField`].
137 ///
138 /// This is the single source of truth mapping `ModuleInfoField` variants
139 /// to `PackageMetadata` fields. Both the linker-script emitter and the
140 /// build-time JSON dump iterate [`ModuleInfoField::ALL`] and call this.
141 #[must_use]
142 pub fn field_value(&self, field: ModuleInfoField) -> &str {
143 match field {
144 ModuleInfoField::Binary => &self.binary,
145 ModuleInfoField::Version => &self.version,
146 ModuleInfoField::ModuleVersion => &self.module_version,
147 ModuleInfoField::Maintainer => &self.maintainer,
148 ModuleInfoField::Name => &self.name,
149 ModuleInfoField::Type => &self.module_type,
150 ModuleInfoField::Repo => &self.repo,
151 ModuleInfoField::Branch => &self.branch,
152 ModuleInfoField::Hash => &self.hash,
153 ModuleInfoField::Copyright => &self.copyright,
154 ModuleInfoField::Os => &self.os,
155 ModuleInfoField::OsVersion => &self.os_version,
156 }
157 }
158}
159
160/// Look up `package.metadata.module_info.<key>` as a string slice.
161fn module_info_str<'a>(package: &'a toml::Value, key: &str) -> Option<&'a str> {
162 package
163 .get("metadata")
164 .and_then(|m| m.get("module_info"))
165 .and_then(|mi| mi.get(key))
166 .and_then(|v| v.as_str())
167}
168
169/// Normalize a dotted version string to exactly `parts` numeric components,
170/// padding missing trailing components with `0` and truncating extras.
171///
172/// SemVer-style pre-release / build-metadata suffixes (everything from the
173/// first `-` or `+`) are stripped before splitting so that Azure Pipelines
174/// build numbers like `"5.2.100.0-PullRequest-123456"` normalize cleanly
175/// to `"5.2.100.0"` (or `"5.2.100"` when `parts == 3`) instead of leaving a
176/// non-numeric tail that would fail the u16 check in
177/// `validate_module_version`.
178fn format_version_parts(version_str: &str, parts: usize) -> String {
179 // Strip pre-release / build-metadata suffix before splitting. Find the
180 // first `-` or `+` (whichever comes first) and cut there.
181 let cut = match (version_str.find('-'), version_str.find('+')) {
182 (Some(a), Some(b)) => Some(a.min(b)),
183 (Some(a), None) => Some(a),
184 (None, Some(b)) => Some(b),
185 (None, None) => None,
186 };
187 let core = match cut {
188 Some(end) => version_str.get(..end).unwrap_or(version_str),
189 None => version_str,
190 };
191 if core.len() != version_str.len() {
192 warn!(
193 "version string {:?} carries pre-release/build-metadata suffix; using numeric core {:?}",
194 version_str, core
195 );
196 }
197 // Treat an empty core as "no dot-separated numeric input" rather than
198 // "one empty component": `"".split('.')` returns `[""]`, which would
199 // otherwise make the first emitted part the empty string and produce
200 // a malformed result like ".0.0" (for parts=3) or ".0.0.0" (for parts=4).
201 // The downstream u16 validator in `validate_module_version` would then
202 // reject with a confusing "part 0 is empty" error instead of the
203 // expected "0.0.0" / "0.0.0.0" fallback.
204 let fields: Vec<&str> = if core.is_empty() {
205 Vec::new()
206 } else {
207 core.split('.').collect()
208 };
209 if fields.len() > parts {
210 // Truncation is warn-not-error for backwards compat with pipelines
211 // whose build numbers incidentally carry extra dots (e.g. a
212 // `BUILD_BUILDNUMBER` of `"1.2.3.4.5"` trimmed to `"1.2.3.4"`).
213 warn!(
214 "version string {:?} has {} dot-separated parts; truncating to {} (dropped: {:?})",
215 core,
216 fields.len(),
217 parts,
218 fields.get(parts..).map(|s| s.join(".")).unwrap_or_default()
219 );
220 }
221 // Warn early when any part overflows u16: the hard check in
222 // `validate_module_version` will still reject the value later, but the
223 // error then surfaces several call-frames deep in `embed_package_metadata`
224 // as a generic "moduleVersion part N must fit in 16 bits". Warning here
225 // points at the actual offending env var / Cargo.toml value in the CI log.
226 for (i, f) in fields.iter().take(parts).enumerate() {
227 if !f.is_empty() && f.parse::<u16>().is_err() {
228 warn!(
229 "version part {} ({:?}) in {:?} does not fit u16; downstream validate_module_version will reject this build",
230 i, f, core
231 );
232 }
233 }
234 (0..parts)
235 .map(|i| fields.get(i).copied().unwrap_or("0"))
236 .collect::<Vec<_>>()
237 .join(".")
238}
239
/// Resolve a value from the environment variable named by `env_var_name`,
/// falling back to `fallback`.
///
/// The fallback is used when no variable name is given, when the variable is
/// unset, when its value is not valid Unicode (a `cargo:warning` line is
/// printed in that case), or when the value is empty after trimming.
///
/// The value is trimmed because CI-supplied values (e.g. `BUILD_BUILDNUMBER`)
/// sometimes carry stray leading/trailing whitespace, which would end up in
/// the first `.`-separated version component and fail the u16 range check in
/// `validate_module_version`.
fn env_or_default(env_var_name: Option<&str>, fallback: &str) -> String {
    let raw = match env_var_name.map(|name| (name, env::var(name))) {
        Some((_, Ok(found))) => found,
        None | Some((_, Err(env::VarError::NotPresent))) => return fallback.to_string(),
        Some((name, Err(env::VarError::NotUnicode(_)))) => {
            // Prefer the fallback over embedding replacement characters in the
            // JSON; the cargo warning keeps the cause visible in the build log.
            println!(
                "cargo:warning=module_info: env var {name} contains non-UTF8 bytes; using fallback"
            );
            return fallback.to_string();
        }
    };

    match raw.trim() {
        "" => fallback.to_string(),
        kept => kept.to_string(),
    }
}
271
272/// Read Cargo.toml, env vars, git, and OS release info and populate a raw
273/// (unsanitized) `PackageMetadata`.
274fn collect_package_metadata() -> ModuleInfoResult<PackageMetadata> {
275 let cargo_toml = get_cargo_toml_content()?;
276 let package = cargo_toml
277 .get("package")
278 .ok_or_else(|| ModuleInfoError::MalformedJson("No package section found".to_string()))?;
279
280 let binary_name = env::var("CARGO_PKG_NAME").unwrap_or_default();
281 let default_version = env::var("CARGO_PKG_VERSION").unwrap_or_default();
282
283 let version_env_var_name = module_info_str(package, "version_env_var_name").map(str::to_string);
284 let module_version_env_var_name =
285 module_info_str(package, "module_version_env_var_name").map(str::to_string);
286
287 // Caller-named env vars: emit rerun-if-env-changed here since
288 // `embed_package_metadata`'s fixed rerun set can't know them.
289 if let Some(name) = version_env_var_name.as_deref() {
290 println!("cargo:rerun-if-env-changed={name}");
291 }
292 if let Some(name) = module_version_env_var_name.as_deref() {
293 println!("cargo:rerun-if-env-changed={name}");
294 }
295
296 let raw_version = env_or_default(version_env_var_name.as_deref(), &default_version);
297 let version = format_version_parts(&raw_version, 3);
298 let raw_module_version = env_or_default(module_version_env_var_name.as_deref(), &raw_version);
299 let module_version = format_version_parts(&raw_module_version, 4);
300
301 let (branch, hash, repo) = crate::utils::get_git_info()?;
302
303 let maintainer = module_info_str(package, "maintainer")
304 .unwrap_or("Unknown")
305 .to_string();
306 let module_type = module_info_str(package, "type")
307 .unwrap_or("Unknown")
308 .to_string();
309 let copyright = module_info_str(package, "copyright")
310 .unwrap_or("Unknown")
311 .to_string();
312
313 let (os, os_version) = get_distro_info()?;
314
315 // Return unsanitized values; `render_note_payloads` sanitizes at emit time
316 // so both manual and `from_cargo_toml` construction paths agree byte-for-byte.
317 Ok(PackageMetadata {
318 binary: binary_name.clone(),
319 module_version,
320 version,
321 maintainer,
322 name: binary_name,
323 module_type,
324 repo,
325 branch,
326 hash,
327 copyright,
328 os,
329 os_version,
330 })
331}
332
333/// Render a [`PackageMetadata`] into the two byte-identical payloads embedded
334/// in the ELF note section: the compact JSON string (`.0`) and the
335/// linker-script body that reproduces the same bytes (`.1`).
336///
337/// Sanitization happens here. The returned JSON string and the byte-encoded
338/// linker-script body are guaranteed to agree byte-for-byte, which is what
339/// keeps the `.note.package` section 4-byte aligned.
340pub(crate) fn render_note_payloads(md: &PackageMetadata) -> ModuleInfoResult<(String, String)> {
341 // Sanitize before serialization so JSON bytes and linker bytes agree.
342 // Otherwise characters that expand/strip (`©` → `(c)`, non-ASCII) would
343 // drift padding and break 4-byte alignment of the note section.
344 let metadata = PackageMetadata {
345 binary: sanitize_for_linker_script(&md.binary),
346 module_version: sanitize_for_linker_script(&md.module_version),
347 version: sanitize_for_linker_script(&md.version),
348 maintainer: sanitize_for_linker_script(&md.maintainer),
349 name: sanitize_for_linker_script(&md.name),
350 module_type: sanitize_for_linker_script(&md.module_type),
351 repo: sanitize_for_linker_script(&md.repo),
352 branch: sanitize_for_linker_script(&md.branch),
353 hash: sanitize_for_linker_script(&md.hash),
354 copyright: sanitize_for_linker_script(&md.copyright),
355 os: sanitize_for_linker_script(&md.os),
356 os_version: sanitize_for_linker_script(&md.os_version),
357 };
358
359 // Emit JSON and linker directives in lock-step so byte counts agree.
360 // Manually emit newlines (not `serde_json::to_string`, which emits one line)
361 // so `strings`/`readelf -n` show one key:value pair per line.
362 let mut linker_script_body = String::new();
363 let mut compact_json = String::new();
364
365 // Iterate `ModuleInfoField::ALL` so the emitter stays in lock-step with the
366 // enum (exhaustive iteration surfaces missing/extra keys at compile time).
367 let entries: Vec<(&str, &str, &str)> = ModuleInfoField::ALL
368 .iter()
369 .map(|f| (f.to_key(), f.to_symbol_name(), metadata.field_value(*f)))
370 .collect();
371
372 // Derive padding from this running count, not `compact_json.len()`.
373 // Sanitized bytes *should* match `compact_json.len()` (the hard check
374 // below enforces that), but using the emitted byte count is the invariant
375 // that actually keeps `.note.package` 4-byte aligned. If sanitization
376 // ever drifts string length, the emitted count is still authoritative.
377 let mut note_payload_bytes: usize = 0;
378
379 linker_script_body.push('\n');
380 linker_script_body.push_str(&bytes_to_linker_script_format("{\n")); // '{', '\n'
381 compact_json.push_str("{\n");
382 note_payload_bytes += 2;
383 for (i, (key, symbol_name, value)) in entries.iter().enumerate() {
384 let key_json = format!("\"{key}\":");
385 let bytes_key_str = bytes_to_linker_script_format(&key_json);
386 linker_script_body.push_str(&format!("\n\n /* Key: {key} */"));
387 linker_script_body.push_str(&format!("\n{bytes_key_str}"));
388 compact_json.push_str(&key_json);
389 note_payload_bytes += key_json.len();
390
391 // Symbol marks the value's start address; runtime extraction reads from
392 // here without parsing the JSON.
393 linker_script_body.push_str(&format!("\n {symbol_name} = .;"));
394
395 // No `/* Value: ... */` comment: sanitized values could contain `*/`
396 // and interpolating user bytes into a C-style comment is fragile.
397 let value_json = format!("\"{value}\"");
398 let bytes_value_str = bytes_to_linker_script_format(&value_json);
399 linker_script_body.push_str(&format!("\n{bytes_value_str}"));
400 compact_json.push_str(&value_json);
401 note_payload_bytes += value_json.len();
402
403 if i < entries.len() - 1 {
404 linker_script_body.push('\n');
405 linker_script_body.push_str(&bytes_to_linker_script_format(",\n"));
406 compact_json.push_str(",\n");
407 note_payload_bytes += 2;
408 }
409 }
410
411 linker_script_body.push('\n');
412 linker_script_body.push_str(&bytes_to_linker_script_format("\n}")); // '\n', '}'
413 compact_json.push_str("\n}");
414 note_payload_bytes += 2;
415 debug!(" Compact JSON Len: {}", compact_json.len());
416
417 // Always emit 1–4 NUL bytes of padding (never 0). The lower bound is
418 // load-bearing: `extract_module_info` scans forward until `\0`, capped at
419 // `MAX_NOTE_VALUE_LEN`. Without a terminating NUL the scan runs to the cap
420 // and reads bytes past the section: harmless mid-segment, SIGSEGV at a
421 // segment tail. When `note_payload_bytes % NOTE_ALIGN == 0`, the formula
422 // below emits a full `NOTE_ALIGN` (4-byte) NUL pad, *not* 0; the section
423 // stays 4-aligned either way (+0 mod 4 vs. +4 mod 4), and the scan still
424 // terminates on the first NUL.
425 let padding_needed = NOTE_ALIGN - (note_payload_bytes % NOTE_ALIGN);
426 // Hard check (not debug_assert): mismatch here means the linker script and
427 // JSON disagree byte-for-byte, which corrupts alignment and breaks runtime
428 // extraction. Return an error (not panic) so build.rs sees a clean exit.
429 if note_payload_bytes != compact_json.len() {
430 return Err(crate::ModuleInfoError::Other(
431 format!(
432 "linker script payload size ({note_payload_bytes}) disagrees with compact_json ({}); \
433 sanitizer and emitter drifted out of sync",
434 compact_json.len()
435 )
436 .into(),
437 ));
438 }
439
440 // Always runs; `padding_needed` is in [1, 4] by construction above.
441 linker_script_body.push_str("\n /* Padding (always >=1 NUL so runtime scan terminates) */");
442 for _ in 0..padding_needed {
443 linker_script_body.push('\n');
444 linker_script_body.push_str(&bytes_to_linker_script_format("\0"));
445 }
446
447 debug!("Linker script body:\n{}", linker_script_body);
448 debug!("Compact JSON:\n{}", compact_json);
449 debug!("Linker script body size: {}", linker_script_body.len());
450 debug!("Compact JSON size: {}", compact_json.len());
451 debug!("Padding needed: {}", padding_needed);
452 debug!(
453 "Linker script body size after padding: {}",
454 linker_script_body.len()
455 );
456 debug!("Compact JSON size after padding: {}", compact_json.len());
457
458 Ok((compact_json, linker_script_body))
459}
460
/// Test-only convenience: collect metadata with
/// [`PackageMetadata::from_cargo_toml`] and immediately render it with
/// [`render_note_payloads`].
///
/// Kept so the `test_project_metadata` regression test can drive both stages
/// through a single call; production code goes through
/// [`crate::embed_package_metadata`] instead.
#[cfg(test)]
pub(crate) fn project_metadata() -> ModuleInfoResult<(String, String)> {
    PackageMetadata::from_cargo_toml().and_then(|collected| render_note_payloads(&collected))
}
470
/// Reduce `input` to a form that can be written verbatim both as raw bytes in
/// the linker script and between double quotes in the JSON metadata, with the
/// two representations having the same length.
///
/// The result contains only printable ASCII (space through `~`), minus the
/// two characters that cannot sit unescaped inside a JSON string:
///
/// - `"` and `\` are removed (JSON would escape them to two bytes each);
/// - control characters, including `\n`, `\r`, `\t`, NUL and DEL, are removed;
/// - `©`, `®` and `™` are rewritten to `(c)`, `(r)` and `(tm)`;
/// - every other non-ASCII character is removed, keeping the section
///   ASCII-only so one character is always one byte.
///
/// # Size implications
///
/// The glyph rewrites *grow* the string (`©` is 2 UTF-8 bytes, `(c)` is 3),
/// so a copyright line packed with glyphs can push the serialized JSON past
/// [`crate::constants::MAX_JSON_SIZE`] and fail validation in
/// [`crate::embed_package_metadata`]. If a build fails with
/// `MetadataTooLarge`, glyph expansion is a likely cause.
///
/// # Extending the glyph map
///
/// The map is deliberately tiny. Em-dashes, curly quotes, `§`, accented Latin
/// letters and CJK text are dropped, not transliterated. Add an entry only
/// when a legitimate field is losing characters, and update the
/// `sanitize_strips_...` tests alongside. Avoid per-language transliteration
/// (`é` → `e`), which changes meaning silently; prefer an ASCII spelling at
/// the source.
///
/// # Comment safety
///
/// `*` and `/` are kept because they occur in paths and versions. Callers must
/// therefore not place the sanitized text inside a C-style `/* ... */` comment
/// without first dealing with `*/`.
///
/// # Example
/// `"Contoso©"` → `"Contoso(c)"`; `"a\"b\nc"` → `"abc"`.
pub fn sanitize_for_linker_script(input: &str) -> String {
    let mut cleaned = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            // Glyphs with an agreed ASCII spelling.
            '©' => cleaned.push_str("(c)"),
            '®' => cleaned.push_str("(r)"),
            '™' => cleaned.push_str("(tm)"),
            // Would need escaping inside a JSON string literal.
            '"' | '\\' => {}
            // Printable ASCII passes through unchanged.
            ' ' => cleaned.push(' '),
            visible if visible.is_ascii_graphic() => cleaned.push(visible),
            // Control characters and all remaining non-ASCII are dropped.
            _ => {}
        }
    }
    cleaned
}
535
536/// `&str` convenience wrapper over [`bytes_to_linker_directives`].
537fn bytes_to_linker_script_format(s: &str) -> String {
538 bytes_to_linker_directives(s.as_bytes())
539}
540
/// Unit tests for the sanitizer and the version normalizer.
#[cfg(test)]
mod tests {
    use super::*;

    /// After sanitizing, the bytes `serde_json::to_string` emits for the value
    /// must equal the raw bytes the linker script writes (the value wrapped in
    /// `"..."`). This is the invariant that keeps the `.note.package` section
    /// 4-byte aligned: the linker-script padding is computed from the raw
    /// byte count, so any divergence would corrupt the ELF note alignment.
    fn assert_sanitize_json_agreement(raw: &str) {
        let sanitized = sanitize_for_linker_script(raw);
        let linker_bytes = format!("\"{sanitized}\"");
        // `serde_json::to_string` on a `String` cannot fail; match explicitly
        // rather than using an unreachable `.unwrap_or_default()`.
        let json_bytes = match serde_json::to_string(&sanitized) {
            Ok(s) => s,
            Err(e) => panic!("serde_json::to_string on a plain String failed: {e}"),
        };
        assert_eq!(
            linker_bytes, json_bytes,
            "sanitized input {raw:?} produced diverging linker vs. JSON bytes"
        );
        assert_eq!(
            linker_bytes.len(),
            json_bytes.len(),
            "sanitized input {raw:?} produced diverging byte lengths"
        );
    }

    #[test]
    fn sanitize_strips_quote_and_backslash() {
        // `"` and `\` would both be escaped by serde_json, doubling their byte
        // count; they must be stripped so the sanitized value serializes verbatim.
        let s = sanitize_for_linker_script("a\"b\\c");
        assert_eq!(s, "abc");
        assert_sanitize_json_agreement("a\"b\\c");
    }

    #[test]
    fn sanitize_strips_control_chars() {
        // Newlines, carriage returns, tabs, and the NUL byte would all be escaped
        // by serde_json (\n, \r, \t, \u0000); strip them to keep byte counts aligned.
        let s = sanitize_for_linker_script("line1\nline2\r\nx\ty\0z");
        assert_eq!(s, "line1line2xyz");
        assert_sanitize_json_agreement("line1\nline2\r\nx\ty\0z");
    }

    #[test]
    fn sanitize_maps_common_glyphs_to_ascii() {
        // Copyright/trademark glyphs are the realistic case in a `copyright`
        // field; they must round-trip to ASCII before the non-ASCII filter runs.
        let s = sanitize_for_linker_script("Contoso© Fabrikam® / Widgets™");
        assert_eq!(s, "Contoso(c) Fabrikam(r) / Widgets(tm)");
        assert_sanitize_json_agreement("Contoso© Fabrikam® / Widgets™");
    }

    #[test]
    fn sanitize_strips_generic_non_ascii() {
        // Generic non-ASCII (e.g. accented names in an author field) is not
        // escaped by serde_json; it would be written as multi-byte UTF-8.
        // The note payload is kept ASCII-only so that every emitted character
        // is exactly one byte, so non-ASCII must be dropped.
        let s = sanitize_for_linker_script("André naïve 日本");
        assert_eq!(s, "Andr nave ");
        assert_sanitize_json_agreement("André naïve 日本");
    }

    #[test]
    fn sanitize_preserves_star_slash_for_paths_and_versions() {
        // `*` and `/` are intentionally kept; they appear in paths and in
        // version strings. (The linker-script emitter deliberately does NOT
        // interpolate sanitized values into `/* ... */` comments, so `*/`
        // in a value cannot close a comment.)
        let s = sanitize_for_linker_script("path/to/*.rs v1.2.3+build");
        assert_eq!(s, "path/to/*.rs v1.2.3+build");
        assert_sanitize_json_agreement("path/to/*.rs v1.2.3+build");
    }

    #[test]
    fn sanitize_keeps_star_slash_sequence_literally() {
        // Regression guard: even if a value contains `*/`, sanitize keeps both
        // characters (they're not safety-critical on their own). The safety
        // comes from the linker-script emitter never interpolating the value
        // into a C-style comment. The JSON and linker byte counts still match.
        let raw = "hello*/world";
        let s = sanitize_for_linker_script(raw);
        assert_eq!(s, "hello*/world");
        assert_sanitize_json_agreement(raw);
    }

    #[test]
    fn sanitize_handles_empty_string() {
        let s = sanitize_for_linker_script("");
        assert_eq!(s, "");
        assert_sanitize_json_agreement("");
    }

    #[test]
    fn sanitize_handles_only_stripped_chars() {
        // All-bad input should collapse to empty, and empty must still agree
        // across JSON and linker output (both emit `""`, 2 bytes).
        let s = sanitize_for_linker_script("\"\\\n\t日");
        assert_eq!(s, "");
        assert_sanitize_json_agreement("\"\\\n\t日");
    }

    /// Azure Pipelines' `BUILD_BUILDNUMBER` can arrive shaped like
    /// `"5.2.100.0-PullRequest-123456"`. The SemVer-style `-<suffix>` must
    /// be stripped before splitting on `.`, otherwise the 4-part
    /// `moduleVersion` fails the u16 check on the last component.
    #[test]
    fn format_version_parts_strips_semver_suffix() {
        assert_eq!(
            format_version_parts("5.2.100.0-PullRequest-123456", 4),
            "5.2.100.0"
        );
        assert_eq!(
            format_version_parts("5.2.100.0-PullRequest-123456", 3),
            "5.2.100"
        );
        // SemVer pre-release label (`-beta.N`) is also stripped.
        assert_eq!(format_version_parts("2.10.0-beta.3", 4), "2.10.0.0");
        // `+build` (SemVer build-metadata) is also stripped.
        assert_eq!(format_version_parts("3.1.4+ci.42", 3), "3.1.4");
        // Plain numeric input is unchanged.
        assert_eq!(format_version_parts("1.2.3.4", 4), "1.2.3.4");
        // Padding behavior is preserved for short inputs.
        assert_eq!(format_version_parts("1.2", 4), "1.2.0.0");
        // Empty input must yield the all-zero fallback (not ".0.0" /
        // ".0.0.0" from `"".split('.')` returning `[""]`). A malformed
        // leading-dot result would fail downstream u16 validation with
        // a confusing "part 0 is empty" error.
        assert_eq!(format_version_parts("", 3), "0.0.0");
        assert_eq!(format_version_parts("", 4), "0.0.0.0");
    }

    #[test]
    fn sanitize_is_idempotent() {
        // Applying sanitize repeatedly must produce the same result as applying
        // it once. The invariant described in the doc comment (JSON and
        // linker-script bytes agree) could drift through code paths that
        // re-sanitize defensively, so we iterate four passes rather than the
        // original two; enough to catch a rewrite where a glyph expansion
        // introduces another glyph-trigger pattern.
        //
        // The input deliberately includes:
        //   - Raw glyphs that get expanded (`©`→`(c)`, `®`→`(r)`, `™`→`(tm)`)
        //   - Text that matches the *output* of those expansions
        //     (`"Copyright (c) Contoso (2024)"`), which is the scenario where
        //     a regex-based rewrite could go wrong by re-triggering expansion
        //     on the `(c)` literal. The current implementation uses literal
        //     replacement so it's immune; the test pins that contract.
        let inputs = [
            "Contoso© Fabrikam® Widgets™ / path*/here",
            "Copyright (c) Contoso (2024), (r) (tm)",
            "(c)(r)(tm) only",
            "",
        ];
        for raw in inputs {
            let once = sanitize_for_linker_script(raw);
            let mut current = once.clone();
            for pass in 2..=4 {
                // Feed the previous pass's output back in.
                let next = sanitize_for_linker_script(&current);
                assert_eq!(
                    next, once,
                    "sanitize pass {pass} for input {raw:?} diverged from pass 1 output {once:?}; got {next:?}"
                );
                current = next;
            }
        }
    }
}