haz_cache/key/components.rs
1//! Canonical serialisation of cache-key components
2//! (`CACHE-005..009`).
3//!
4//! Each function here writes a single component into the running
5//! [`Hasher`] in the canonical byte sequence the spec mandates.
6//! Multi-item components (input files, predecessors, env entries)
7//! enforce their ordering rule inside the function: callers MAY
8//! pass items in any order; the function sorts them before
9//! contribution.
10
11use std::collections::BTreeMap;
12
13use haz_domain::action::{ShellType, TaskAction};
14use haz_domain::env::EnvVarName;
15use haz_domain::name::{ProjectName, TaskName};
16
17use crate::hasher::Hasher;
18
/// `CACHE-005` discriminator byte emitted first for a `command`
/// action, so it can never be confused with a `shell` action.
const TAG_COMMAND: u8 = 0x01;
/// `CACHE-005` discriminator byte emitted first for a `shell`
/// action, so it can never be confused with a `command` action.
const TAG_SHELL: u8 = 0x02;

/// `CACHE-008` marker byte emitted in place of a value when an
/// allow-listed `from_host` variable is absent on the host.
const ENV_ABSENT_MARKER: u8 = 0x00;
27
/// One file matched by an `inputs` pattern, as it contributes to
/// the cache key (`CACHE-006`).
///
/// `workspace_absolute_path` is the path's canonical workspace-
/// absolute form (`PATH-*`); `content_hash` is the digest of the
/// file's contents under the cache's configured hash function.
/// This layer does NOT compute the content hash itself: callers
/// resolve globs to files, hash each file's contents through the
/// same [`Hasher`] algorithm, and hand the resulting digests here.
///
/// The type is plain borrowed data, so it is `Copy` and supports
/// `Debug` and equality for logging and for assertions in tests.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct InputFile<'a> {
    /// Canonical workspace-absolute path of the file.
    pub workspace_absolute_path: &'a str,
    /// Content hash of the file's bytes under the configured hash
    /// function.
    pub content_hash: [u8; 32],
}
44
45/// Contribution of one hard-edge predecessor's captured streams
46/// (`CACHE-007`).
47///
48/// `stdout_hash` and `stderr_hash` are hashes of the predecessor's
49/// stdout and stderr byte streams under the cache's configured
50/// hash function. The two are kept distinct: a stream switch
51/// (stdout -> stderr) changes the predecessor's contribution.
52pub struct PredecessorStreams<'a> {
53 /// Project component of the predecessor's identity.
54 pub project: &'a ProjectName,
55 /// Task component of the predecessor's identity.
56 pub task: &'a TaskName,
57 /// Hash of the predecessor's stdout bytes.
58 pub stdout_hash: [u8; 32],
59 /// Hash of the predecessor's stderr bytes.
60 pub stderr_hash: [u8; 32],
61}
62
63/// Resolved environment contribution (`CACHE-008`).
64///
65/// `from_host` carries each allow-listed variable name together
66/// with the value it took in the host process at key-derivation
67/// time, or `None` when the host did not set that name. The cache
68/// does NOT consult `std::env` itself: callers resolve the values
69/// (typically `std::env::var(name).ok()`) and hand the map here.
70///
71/// `overrides` carries the task-level `env.override` map. On a
72/// name appearing in both maps, the `overrides` entry wins: the
73/// `from_host` entry for that name does NOT contribute to the key.
74/// Enforcement happens inside [`contribute_env`], so the caller MAY
75/// leave the `from_host` map as-passed.
76pub struct EnvContribution<'a> {
77 /// Allow-listed host variable names paired with their values
78 /// at key-derivation time (`None` for absent).
79 pub from_host: &'a BTreeMap<EnvVarName, Option<String>>,
80 /// Task-level hardcoded name/value overrides.
81 pub overrides: &'a BTreeMap<EnvVarName, String>,
82}
83
84/// Write `action` into `hasher` per `CACHE-005`.
85///
86/// # Panics
87///
88/// Panics if the argv length or any argument's byte length exceeds
89/// [`u32::MAX`]; both bounds are structurally impossible for real
90/// workspaces (a 4 GiB single argument or 4 billion arguments).
91pub fn contribute_action(hasher: &mut Hasher, action: &TaskAction) {
92 match action {
93 TaskAction::Command(argv) => {
94 hasher.update(&[TAG_COMMAND]);
95 let count =
96 u32::try_from(argv.len()).expect("argv length within u32::MAX is structural");
97 hasher.update(&count.to_be_bytes());
98 for arg in argv {
99 write_length_prefixed(hasher, arg.as_bytes());
100 }
101 }
102 TaskAction::Shell { script, shell } => {
103 hasher.update(&[TAG_SHELL]);
104 write_length_prefixed(hasher, shell_identifier(shell).as_bytes());
105 write_length_prefixed(hasher, script.as_bytes());
106 }
107 }
108}
109
110/// Write `files` into `hasher` per `CACHE-006` and the file-
111/// ordering clause of `CACHE-009`.
112///
113/// # Panics
114///
115/// Panics if the file count or any path's byte length exceeds
116/// [`u32::MAX`]; both bounds are structurally impossible for real
117/// workspaces.
118pub fn contribute_input_files(hasher: &mut Hasher, files: &[InputFile<'_>]) {
119 let mut sorted: Vec<&InputFile<'_>> = files.iter().collect();
120 sorted.sort_by(|a, b| {
121 a.workspace_absolute_path
122 .as_bytes()
123 .cmp(b.workspace_absolute_path.as_bytes())
124 });
125
126 let count =
127 u32::try_from(sorted.len()).expect("input-file count within u32::MAX is structural");
128 hasher.update(&count.to_be_bytes());
129 for f in sorted {
130 hasher.update(&f.content_hash);
131 write_length_prefixed(hasher, f.workspace_absolute_path.as_bytes());
132 }
133}
134
135/// Write `predecessors` into `hasher` per `CACHE-007` and the
136/// predecessor-ordering clause of `CACHE-009`.
137///
138/// # Panics
139///
140/// Panics if the predecessor count or any name's byte length
141/// exceeds [`u32::MAX`]; structurally impossible for real
142/// workspaces.
143pub fn contribute_predecessors(hasher: &mut Hasher, predecessors: &[PredecessorStreams<'_>]) {
144 let mut sorted: Vec<&PredecessorStreams<'_>> = predecessors.iter().collect();
145 sorted.sort_by(|a, b| {
146 let lhs = (
147 AsRef::<str>::as_ref(a.project.as_ref()).as_bytes(),
148 AsRef::<str>::as_ref(a.task.as_ref()).as_bytes(),
149 );
150 let rhs = (
151 AsRef::<str>::as_ref(b.project.as_ref()).as_bytes(),
152 AsRef::<str>::as_ref(b.task.as_ref()).as_bytes(),
153 );
154 lhs.cmp(&rhs)
155 });
156
157 let count =
158 u32::try_from(sorted.len()).expect("predecessor count within u32::MAX is structural");
159 hasher.update(&count.to_be_bytes());
160 for p in sorted {
161 write_length_prefixed(hasher, AsRef::<str>::as_ref(p.project.as_ref()).as_bytes());
162 write_length_prefixed(hasher, AsRef::<str>::as_ref(p.task.as_ref()).as_bytes());
163 hasher.update(&p.stdout_hash);
164 hasher.update(&p.stderr_hash);
165 }
166}
167
168/// Write the environment contribution into `hasher` per
169/// `CACHE-008` and the env-ordering clause of `CACHE-009`.
170///
171/// Order of the two sub-components: `from_host` first, then
172/// `overrides`. Names appearing in both are dropped from the
173/// `from_host` contribution.
174///
175/// # Panics
176///
177/// Panics if either map's entry count, or any name or value byte
178/// length, exceeds [`u32::MAX`]; structurally impossible for real
179/// workspaces.
180pub fn contribute_env(hasher: &mut Hasher, env: &EnvContribution<'_>) {
181 // `from_host`, minus names shadowed by `overrides`. BTreeMap
182 // iteration is already lexicographic, satisfying CACHE-009.
183 let from_host_effective: Vec<(&EnvVarName, &Option<String>)> = env
184 .from_host
185 .iter()
186 .filter(|(name, _)| !env.overrides.contains_key(*name))
187 .collect();
188
189 let from_host_count = u32::try_from(from_host_effective.len())
190 .expect("env from_host count within u32::MAX is structural");
191 hasher.update(&from_host_count.to_be_bytes());
192 for (name, value) in &from_host_effective {
193 write_length_prefixed(hasher, AsRef::<str>::as_ref(name.as_ref()).as_bytes());
194 match value {
195 Some(v) => write_length_prefixed(hasher, v.as_bytes()),
196 None => hasher.update(&[ENV_ABSENT_MARKER]),
197 }
198 }
199
200 let override_count = u32::try_from(env.overrides.len())
201 .expect("env override count within u32::MAX is structural");
202 hasher.update(&override_count.to_be_bytes());
203 for (name, value) in env.overrides {
204 write_length_prefixed(hasher, AsRef::<str>::as_ref(name.as_ref()).as_bytes());
205 write_length_prefixed(hasher, value.as_bytes());
206 }
207}
208
209/// Write `bytes` length-prefixed (4-byte big-endian unsigned
210/// integer) followed by the bytes themselves. The canonical
211/// per-item encoding shared by every component above.
212fn write_length_prefixed(hasher: &mut Hasher, bytes: &[u8]) {
213 let len = u32::try_from(bytes.len()).expect("item length within u32::MAX is structural");
214 hasher.update(&len.to_be_bytes());
215 hasher.update(bytes);
216}
217
218/// The canonical identifier of `shell` as it enters the cache key
219/// per `CACHE-005`. First-class variants emit their literal name
220/// (`sh`, `bash`); custom shells emit the validated
221/// [`haz_domain::action::NonEmptyAsciiName`] verbatim.
222fn shell_identifier(shell: &ShellType) -> &str {
223 match shell {
224 ShellType::Sh => "sh",
225 ShellType::Bash => "bash",
226 ShellType::Other(name) => AsRef::<str>::as_ref(name.as_ref()),
227 }
228}