s4_server/service.rs
1//! `s3s::S3` 実装 — `s3s_aws::Proxy` への delegation を default にしつつ、
2//! `put_object` / `get_object` 経路で `s4_codec::CodecRegistry` を呼ぶ。
3//!
4//! ## カバー範囲 (Phase 1 月 2)
5//!
6//! - 圧縮 hook あり: `put_object`, `get_object`
7//! - 純 delegation (圧縮なし): `head_bucket`, `list_buckets`, `create_bucket`, `delete_bucket`,
8//! `head_object`, `delete_object`, `delete_objects`, `copy_object`, `list_objects`,
9//! `list_objects_v2`, `create_multipart_upload`, `upload_part`,
10//! `complete_multipart_upload`, `abort_multipart_upload`, `list_multipart_uploads`,
11//! `list_parts`
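//! - Not in the lists above (NotImplemented by default as of this Phase 1 note): the
//!   remaining 80+ ops; Tagging / ACL / Lifecycle etc. were slated for Phase 2.
//!   NOTE: this bullet is older than the optional Tagging / Lifecycle / CORS / Inventory /
//!   Notification / Replication managers documented on `S4Service` below, which handle
//!   their operations in-process once attached.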
13//!
14//! ## アーキテクチャ
15//!
16//! - `S4Service<B>` は backend (B: S3) と `Arc<CodecRegistry>` と `Arc<dyn CodecDispatcher>`
17//! を保持する。`CodecRegistry` 経由で複数 codec を抱えられるので、ひとつの S4 インスタンスが
18//! 複数 codec で書かれた object を透過的に GET できる
19//! - PUT: dispatcher が body の先頭 sample から codec を選び、registry で compress、
20//! manifest を S3 metadata に書いて backend に forward
21//! - GET: backend から取得 → metadata から manifest を復元 → registry.decompress で
22//! manifest 指定の codec で解凍 → 元の bytes を return
23//!
24//! ## 既知の制限事項
25//!
26//! - **Multipart Upload は per-part 圧縮が未実装**: 現状は upload_part を素通し。
27//! Phase 1 月 2 後半で per-part compress + complete_multipart_upload で manifest 集約。
28//! - **PUT body は memory に collect**: max_body_bytes 上限あり (default 5 GiB = S3 単発 PUT 上限)。
29//! Streaming-aware 圧縮は Phase 2。
30
31use std::sync::Arc;
32
33use base64::Engine as _;
34use bytes::BytesMut;
35use s3s::dto::*;
36use s3s::{S3, S3Error, S3ErrorCode, S3Request, S3Response, S3Result};
37use s4_codec::index::{FrameIndex, build_index_from_body, decode_index, encode_index, sidecar_key};
38use s4_codec::multipart::{
39 FRAME_HEADER_BYTES, FrameHeader, FrameIter, S3_MULTIPART_MIN_PART_BYTES, pad_to_minimum,
40 write_frame,
41};
42use s4_codec::{ChunkManifest, CodecDispatcher, CodecKind, CodecRegistry, CompressTelemetry};
43use std::time::Instant;
44use tracing::{debug, info};
45
46use crate::blob::{
47 bytes_to_blob, chain_sample_with_rest, collect_blob, collect_with_sample, peek_sample,
48};
49use crate::streaming::{
50 Crc32cVerifyingReader, async_read_to_blob, blob_to_async_read, cpu_zstd_decompress_stream,
51 pick_chunk_size, streaming_compress_to_frames, supports_streaming_compress,
52 supports_streaming_decompress,
53};
54
55/// PUT body の先頭 sampling で渡す最大 byte 数。
56const SAMPLE_BYTES: usize = 4096;
57
58/// v0.8 #55: stamp the GPU pipeline metrics (`s4_gpu_compress_seconds`,
59/// `s4_gpu_throughput_bytes_per_sec`, `s4_gpu_oom_total`) from a
60/// `CompressTelemetry` returned by `CodecRegistry::compress_with_telemetry`.
61/// CPU codecs (`gpu_seconds = None`) are no-ops here — they're already
62/// covered by the existing `s4_request_latency_seconds` / `s4_bytes_*`
63/// counters in the request-level `record_put` / `record_get` calls.
64#[inline]
65fn stamp_gpu_compress_telemetry(tel: &CompressTelemetry) {
66 if let Some(secs) = tel.gpu_seconds {
67 crate::metrics::record_gpu_compress(tel.codec, secs, tel.bytes_in, tel.bytes_out);
68 }
69 if tel.oom {
70 crate::metrics::record_gpu_oom(tel.codec);
71 }
72}
73
74/// v0.7 #49: percent-encoding set covering everything that is **not** an
75/// `unreserved` character per RFC 3986 §2.3, **plus** we additionally
76/// encode the path-reserved sub-delims that `http::Uri` rejects in a
77/// path segment (`?`, `#`, `%`, control bytes, space, etc.). We
78/// deliberately keep `/` un-encoded because S3 keys legally use `/` as
79/// a logical separator and the rest of the synthetic URI relies on the
80/// path layout `/{bucket}/{key}` round-tripping byte-for-byte.
81const URI_KEY_ENCODE_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS
82 .add(b' ')
83 .add(b'"')
84 .add(b'#')
85 .add(b'<')
86 .add(b'>')
87 .add(b'?')
88 .add(b'`')
89 .add(b'{')
90 .add(b'}')
91 .add(b'|')
92 .add(b'\\')
93 .add(b'^')
94 .add(b'[')
95 .add(b']')
96 .add(b'%');
97
98/// v0.7 #49: build the synthetic `/{bucket}/{key}` request URI used by
99/// the sidecar / replication helpers when they re-enter the backend
100/// trait without going through the HTTP layer. S3 object keys can
101/// contain spaces, control bytes, and arbitrary Unicode that would
102/// make `format!(...).parse::<http::Uri>()` panic; we percent-encode
103/// the key bytes (RFC 3986 path segment) and the bucket name (defensive
104/// — bucket names are normally DNS-safe, but the helper is the single
105/// choke-point) before splicing them in. If the encoded form *still*
106/// fails to parse (extremely unlikely once everything outside the
107/// unreserved set is escaped) we surface a typed `400 InvalidObjectName`
108/// instead of crashing the worker.
109pub(crate) fn safe_object_uri(bucket: &str, key: &str) -> S3Result<http::Uri> {
110 use percent_encoding::utf8_percent_encode;
111 let bucket_enc = utf8_percent_encode(bucket, URI_KEY_ENCODE_SET);
112 let key_enc = utf8_percent_encode(key, URI_KEY_ENCODE_SET);
113 let raw = format!("/{bucket_enc}/{key_enc}");
114 raw.parse::<http::Uri>().map_err(|e| {
115 // S3 spec uses `InvalidObjectName` (HTTP 400) for keys that
116 // can't be represented in a request URI. The generated
117 // `S3ErrorCode` enum doesn't expose a typed variant for it,
118 // so we round-trip through `from_bytes` which preserves the
119 // canonical wire string while falling back to InvalidArgument
120 // if even that lookup fails (cannot happen at runtime — kept
121 // as a belt-and-suspenders branch so this helper never
122 // panics).
123 let code =
124 S3ErrorCode::from_bytes(b"InvalidObjectName").unwrap_or(S3ErrorCode::InvalidArgument);
125 S3Error::with_message(
126 code,
127 format!("object key cannot be encoded as a request URI: {e}"),
128 )
129 })
130}
131
132/// v0.8.12 HIGH-12 fix: verify a client-supplied integrity checksum
133/// against the received body BEFORE we strip the header on the way
134/// to the backend. Returns `Err(BadDigest)` on mismatch (matches
135/// AWS S3 wire behaviour); `Ok(())` when the supplied digest matches
136/// OR when the supplied algorithm is one we don't yet implement
137/// (the latter is logged so operators see the gap — fail-open on
138/// unsupported algorithms is the documented trade in the v0.8.11
139/// CHANGELOG, with full coverage tracked as a follow-up issue).
140///
141/// Algorithms covered: `Content-MD5` (base64 MD5),
142/// `x-amz-checksum-crc32c` (base64 big-endian u32),
143/// `x-amz-checksum-sha256` (base64 SHA-256). The remaining S3
144/// checksum algorithms (CRC32 non-Castagnoli, SHA-1, CRC64-NVME)
145/// are accepted and silently passed; verifying them needs new
146/// dependencies and was held back to keep the v0.8.12 surface
147/// bounded.
148#[allow(clippy::too_many_arguments)]
149fn verify_client_body_checksums(
150 body: &[u8],
151 content_md5_b64: Option<&str>,
152 checksum_crc32_b64: Option<&str>,
153 checksum_crc32c_b64: Option<&str>,
154 checksum_sha1_b64: Option<&str>,
155 checksum_sha256_b64: Option<&str>,
156 checksum_crc64nvme_b64: Option<&str>,
157) -> S3Result<()> {
158 use base64::Engine as _;
159 use md5::Md5;
160 use sha2::Sha256;
161 // `Digest` from md-5 / sha2 brings the `new`, `update`, `finalize`
162 // trait methods into scope. Bind anonymously so this `use` is
163 // never flagged as unused while still serving its real purpose.
164 use md5::Digest as _;
165 let b64 = base64::engine::general_purpose::STANDARD;
166 let bad = |what: &str| {
167 let code = S3ErrorCode::from_bytes(b"BadDigest").unwrap_or(S3ErrorCode::InvalidArgument);
168 S3Error::with_message(
169 code,
170 format!("client-supplied {what} did not match the received body"),
171 )
172 };
173 if let Some(claimed) = content_md5_b64 {
174 let want = b64.decode(claimed).map_err(|_| {
175 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed Content-MD5")
176 })?;
177 if want.len() != 16 {
178 return Err(S3Error::with_message(
179 S3ErrorCode::InvalidDigest,
180 "Content-MD5 must decode to 16 bytes",
181 ));
182 }
183 let mut h = Md5::new();
184 h.update(body);
185 let got = h.finalize();
186 // `subtle::ConstantTimeEq` would be ideal but the existing
187 // `constant_time_eq` helper in sse.rs is private; use a
188 // straightforward byte compare. The attacker doesn't get to
189 // choose the body retroactively, so a timing oracle here
190 // doesn't help them. `&got[..]` derefs the GenericArray
191 // into a `&[u8]` (the deprecated `.as_slice()` is gone in
192 // generic-array 1.x; CI runs `-D warnings`).
193 if got[..] != *want.as_slice() {
194 return Err(bad("Content-MD5"));
195 }
196 }
197 if let Some(claimed) = checksum_crc32c_b64 {
198 let want = b64.decode(claimed).map_err(|_| {
199 S3Error::with_message(
200 S3ErrorCode::InvalidDigest,
201 "malformed x-amz-checksum-crc32c",
202 )
203 })?;
204 if want.len() != 4 {
205 return Err(S3Error::with_message(
206 S3ErrorCode::InvalidDigest,
207 "x-amz-checksum-crc32c must decode to 4 bytes (big-endian u32)",
208 ));
209 }
210 let got = crc32c::crc32c(body).to_be_bytes();
211 if got != want.as_slice() {
212 return Err(bad("x-amz-checksum-crc32c"));
213 }
214 }
215 if let Some(claimed) = checksum_sha256_b64 {
216 let want = b64.decode(claimed).map_err(|_| {
217 S3Error::with_message(
218 S3ErrorCode::InvalidDigest,
219 "malformed x-amz-checksum-sha256",
220 )
221 })?;
222 if want.len() != 32 {
223 return Err(S3Error::with_message(
224 S3ErrorCode::InvalidDigest,
225 "x-amz-checksum-sha256 must decode to 32 bytes",
226 ));
227 }
228 let mut h = Sha256::new();
229 h.update(body);
230 let got = h.finalize();
231 if got[..] != *want.as_slice() {
232 return Err(bad("x-amz-checksum-sha256"));
233 }
234 }
235 // v0.8.12 #128 (MED-C): CRC32 (IEEE 802.3 — the non-Castagnoli
236 // variant AWS uses for `x-amz-checksum-crc32`). 4-byte
237 // big-endian value, base64-encoded.
238 if let Some(claimed) = checksum_crc32_b64 {
239 let want = b64.decode(claimed).map_err(|_| {
240 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-crc32")
241 })?;
242 if want.len() != 4 {
243 return Err(S3Error::with_message(
244 S3ErrorCode::InvalidDigest,
245 "x-amz-checksum-crc32 must decode to 4 bytes (big-endian u32)",
246 ));
247 }
248 let mut h = crc32fast::Hasher::new();
249 h.update(body);
250 let got = h.finalize().to_be_bytes();
251 if got != want.as_slice() {
252 return Err(bad("x-amz-checksum-crc32"));
253 }
254 }
255 // v0.8.12 #128 (MED-C): SHA-1. 20-byte digest, base64-encoded.
256 if let Some(claimed) = checksum_sha1_b64 {
257 use sha1::Sha1;
258 let want = b64.decode(claimed).map_err(|_| {
259 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-sha1")
260 })?;
261 if want.len() != 20 {
262 return Err(S3Error::with_message(
263 S3ErrorCode::InvalidDigest,
264 "x-amz-checksum-sha1 must decode to 20 bytes",
265 ));
266 }
267 let mut h = Sha1::new();
268 h.update(body);
269 let got = h.finalize();
270 if got[..] != *want.as_slice() {
271 return Err(bad("x-amz-checksum-sha1"));
272 }
273 }
274 // v0.8.12 #128 (MED-C): CRC64-NVME — AWS's newest checksum
275 // algorithm. NVMe spec: poly 0xad93d23594c93659, init / xorout
276 // 0xffffffffffffffff, refin / refout true. The reflected
277 // polynomial + 256-entry lookup table are computed lazily on
278 // first call (small enough to inline rather than pull in a
279 // dedicated crc64 crate).
280 if let Some(claimed) = checksum_crc64nvme_b64 {
281 let want = b64.decode(claimed).map_err(|_| {
282 S3Error::with_message(
283 S3ErrorCode::InvalidDigest,
284 "malformed x-amz-checksum-crc64nvme",
285 )
286 })?;
287 if want.len() != 8 {
288 return Err(S3Error::with_message(
289 S3ErrorCode::InvalidDigest,
290 "x-amz-checksum-crc64nvme must decode to 8 bytes (big-endian u64)",
291 ));
292 }
293 let got = crc64_nvme(body).to_be_bytes();
294 if got != want.as_slice() {
295 return Err(bad("x-amz-checksum-crc64nvme"));
296 }
297 }
298 Ok(())
299}
300
/// v0.8.12 #128 (MED-C): CRC-64/NVME, the algorithm behind AWS S3's
/// `x-amz-checksum-crc64nvme`.
///
/// Parameters (NVMe spec): polynomial `0xad93d23594c93659`, initial value
/// and final XOR `0xffffffffffffffff`, input and output reflected. The
/// 256-entry table for the reflected polynomial is built on the first call
/// and cached in a [`std::sync::OnceLock`]; later calls reuse it.
///
/// Returns `0` for an empty slice.
fn crc64_nvme(bytes: &[u8]) -> u64 {
    use std::sync::OnceLock;

    /// Bit-reversed form of the NVMe polynomial `0xad93d23594c93659`.
    const REFLECTED_POLY: u64 = 0x9a6c_9329_ac4b_c9b5;
    static LOOKUP: OnceLock<[u64; 256]> = OnceLock::new();

    let lookup = LOOKUP.get_or_init(|| {
        let mut table = [0u64; 256];
        for (byte, slot) in table.iter_mut().enumerate() {
            // Eight shift-and-conditionally-XOR steps, one per input bit.
            *slot = (0..8).fold(byte as u64, |reg, _| {
                if reg & 1 == 0 {
                    reg >> 1
                } else {
                    (reg >> 1) ^ REFLECTED_POLY
                }
            });
        }
        table
    });

    // Start from all ones, consume one byte per step, invert at the end.
    let folded = bytes.iter().fold(u64::MAX, |reg, &b| {
        let slot = usize::from((reg as u8) ^ b);
        (reg >> 8) ^ lookup[slot]
    });
    !folded
}
337
/// v0.4 #20: captured at the start of a handler, before the request is
/// consumed by the backend call, so the matching `record_access` at
/// end-of-request can fill in the structured access log entry.
///
/// All fields are owned values so the preamble does not borrow from the
/// request it was taken from.
struct AccessLogPreamble {
    /// Client address, when known. NOTE(review): presumably the peer or
    /// forwarded IP as text — confirm against the code that fills it in.
    remote_ip: Option<String>,
    /// Identity of the caller, when known. NOTE(review): presumably the
    /// authenticated access key / principal — confirm against the producer.
    requester: Option<String>,
    /// Request URI as a string; always present.
    request_uri: String,
    /// NOTE(review): presumably the `User-Agent` header value when the
    /// client sent one — confirm against the producer.
    user_agent: Option<String>,
}
347
/// S3 service wrapper around a backend `B`.
///
/// Holds the backend, the codec registry and dispatcher used on the
/// compression paths, and a set of optional feature managers. Every
/// `Option<...>` field defaults to `None` in [`S4Service::new`]; the
/// `with_*` builders visible in this file (for example `with_tagging`,
/// `with_inventory`, `with_lifecycle`) set the matching field to `Some(...)`.
pub struct S4Service<B: S3> {
    /// Wrapped in `Arc` so the v0.6 #40 cross-bucket replication
    /// dispatcher can clone it into a detached `tokio::spawn` task
    /// (Arc::clone is cheap; backend trait methods take `&self` so no
    /// other handler is affected by the indirection).
    backend: Arc<B>,
    /// Shared codec registry; per the module docs it is used to compress
    /// on PUT and to decompress with the manifest-specified codec on GET.
    registry: Arc<CodecRegistry>,
    /// Codec selector; per the module docs it picks a codec from a sample
    /// of the leading body bytes on PUT.
    dispatcher: Arc<dyn CodecDispatcher>,
    /// Upper bound on a PUT body collected in memory. Initialised to
    /// `DEFAULT_MAX_BODY_BYTES` by `new`.
    max_body_bytes: usize,
    /// Optional shared policy. NOTE(review): presumably the IAM-style
    /// policy consulted by the policy evaluator mentioned on
    /// `secure_transport` / `trust_x_forwarded_for` — confirm at use sites.
    policy: Option<crate::policy::SharedPolicy>,
    /// v0.3 #13: surfaced as the `aws:SecureTransport` Condition key. Set
    /// to `true` when the listener is wrapped in TLS (or ACME), so policies
    /// gating "deny if not over TLS" can do their job. Defaults to `false`
    /// (HTTP); set via [`S4Service::with_secure_transport`] at boot.
    secure_transport: bool,
    /// v0.4 #19: optional per-(principal, bucket) token-bucket limiter.
    rate_limits: Option<crate::rate_limit::SharedRateLimits>,
    /// v0.4 #20: optional S3-style access log emitter.
    access_log: Option<crate::access_log::SharedAccessLog>,
    /// v0.4 #21 / v0.5 #29: optional server-side encryption keyring
    /// (AES-256-GCM). When set, every PUT body gets wrapped in S4E2
    /// (with the keyring's active key id) after the compress + framing
    /// steps; every GET that sniffs as S4E1/S4E2 is decrypted before
    /// frame parsing. A `with_sse_key(...)` call wraps the supplied
    /// key in a 1-slot keyring so single-key (v0.4) operators get the
    /// same behaviour they had before, just on the v2 frame.
    sse_keyring: Option<crate::sse::SharedSseKeyring>,
    /// v0.5 #34: optional first-class versioning state machine. When
    /// `Some(...)`, S4-server itself owns the per-bucket versioning
    /// state + per-(bucket, key) version chain; PUT / GET / DELETE /
    /// list_object_versions / get_bucket_versioning /
    /// put_bucket_versioning handlers consult the manager instead of
    /// passing through. When `None` (default), the legacy
    /// backend-passthrough behaviour applies so existing v0.4
    /// deployments are unaffected until they explicitly call
    /// `with_versioning(...)`.
    versioning: Option<Arc<crate::versioning::VersioningManager>>,
    /// v0.5 #28: optional SSE-KMS envelope-encryption backend. When
    /// `Some(...)`, PUTs carrying `x-amz-server-side-encryption: aws:kms`
    /// generate a fresh DEK via the backend, encrypt the body with it
    /// (S4E4 frame), and persist only the wrapped DEK. GETs sniffing as
    /// S4E4 unwrap the DEK through the same backend before decrypt.
    /// `kms_default_key_id` is used when the request omits an explicit
    /// `x-amz-server-side-encryption-aws-kms-key-id` (mirrors AWS S3
    /// bucket-default behaviour).
    kms: Option<Arc<dyn crate::kms::KmsBackend>>,
    /// Fallback KMS key id for SSE-KMS requests that do not name one; see
    /// the `kms` field above.
    kms_default_key_id: Option<String>,
    /// v0.5 #30: optional Object Lock (WORM) enforcement layer. When
    /// `Some(...)`, `delete_object` and overwrite-style `put_object`
    /// consult the manager and refuse the operation with HTTP 403
    /// `AccessDenied` while the object is locked (Compliance until
    /// expiry, Governance unless the bypass header is set, or any time
    /// a legal hold is on). PUT also auto-applies the bucket-default
    /// retention to brand-new objects when configured. When `None`
    /// (default), the legacy backend-passthrough behaviour applies, so
    /// existing v0.4 deployments are unaffected until they explicitly
    /// call `with_object_lock(...)`.
    object_lock: Option<Arc<crate::object_lock::ObjectLockManager>>,
    /// v0.6 #38: optional first-class CORS bucket configuration manager.
    /// When `Some(...)`, S4-server itself owns per-bucket CORS rules and
    /// `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
    /// consult the manager instead of passing through to the backend.
    /// `handle_preflight` (public method on `S4Service`) routes OPTIONS-
    /// style preflight matching through the same store; the actual HTTP
    /// OPTIONS routing wire-up at the listener level is a follow-up
    /// (s3s framework does not surface OPTIONS as a typed handler).
    cors: Option<Arc<crate::cors::CorsManager>>,
    /// v0.6 #36: optional first-class S3 Inventory manager. When
    /// `Some(...)`, S4-server itself owns per-(bucket, id) inventory
    /// configurations and `put_bucket_inventory_configuration` /
    /// `get_bucket_inventory_configuration` /
    /// `list_bucket_inventory_configurations` /
    /// `delete_bucket_inventory_configuration` consult the manager
    /// instead of passing through to the backend. The actual periodic
    /// CSV emission is driven by a tokio task in `main.rs` that calls
    /// `InventoryManager::run_once_for_test` on a fixed cadence; the
    /// service handlers below only deal with config-level CRUD.
    inventory: Option<Arc<crate::inventory::InventoryManager>>,
    /// v0.6 #35: optional first-class S3 bucket-notification manager.
    /// When `Some(...)`, S4-server itself owns per-bucket notification
    /// configurations and `put_bucket_notification_configuration` /
    /// `get_bucket_notification_configuration` consult the manager
    /// instead of passing through to the backend. Successful PUT /
    /// DELETE handlers fire matching destinations on a detached tokio
    /// task (best-effort; see `crate::notifications::dispatch_event`).
    notifications: Option<Arc<crate::notifications::NotificationManager>>,
    /// v0.6 #37: optional first-class S3 Lifecycle configuration
    /// manager. When `Some(...)`, S4-server itself owns per-bucket
    /// lifecycle rules and `put_bucket_lifecycle_configuration` /
    /// `get_bucket_lifecycle_configuration` /
    /// `delete_bucket_lifecycle` consult the manager instead of
    /// passing through to the backend. The actual background scanner
    /// (list_objects_v2 -> evaluate -> delete / metadata-rewrite per
    /// rule) is a v0.7+ follow-up; the test path
    /// `S4Service::run_lifecycle_once_for_test` exercises the
    /// evaluator end-to-end so this v0.6 #37 wiring is enough to ship
    /// the configuration-management half without putting a
    /// half-wired bucket-walk in front of users.
    lifecycle: Option<Arc<crate::lifecycle::LifecycleManager>>,
    /// v0.6 #39: optional first-class object + bucket Tagging manager.
    /// When `Some(...)`, S4-server itself owns per-(bucket, key) and
    /// per-bucket tag state — `PutObjectTagging` /
    /// `GetObjectTagging` / `DeleteObjectTagging` /
    /// `PutBucketTagging` / `GetBucketTagging` /
    /// `DeleteBucketTagging` route through the manager (replacing the
    /// previous backend-passthrough behaviour). `put_object` also
    /// pre-parses the `x-amz-tagging` header / `Tagging` input field
    /// so the IAM policy evaluator can gate on
    /// `s3:RequestObjectTag/<key>` and `s3:ExistingObjectTag/<key>`.
    /// On a successful PUT the parsed tags are persisted; on a
    /// successful DELETE the matching tag entry is dropped.
    tagging: Option<Arc<crate::tagging::TagManager>>,
    /// v0.6 #40: optional first-class cross-bucket replication manager.
    /// When `Some(...)`, S4-server itself owns per-bucket replication
    /// rules; `PutBucketReplication` / `GetBucketReplication` /
    /// `DeleteBucketReplication` route through the manager (replacing
    /// the previous backend-passthrough behaviour). On every successful
    /// `put_object` the manager's rule list is consulted; the
    /// highest-priority matching enabled rule wins, the per-key status
    /// is recorded as `Pending`, and the source body and metadata are
    /// handed to a detached tokio task that PUTs to the destination
    /// bucket through the same backend. The replica is stamped with
    /// `x-amz-replication-status: REPLICA` in its metadata; the
    /// source-side status is updated to `Completed` on success or
    /// `Failed` after the 3-attempt retry budget is exhausted (drop
    /// counter bumps in either-side case so dashboards see the loss).
    /// `head_object` / `get_object` echo the recorded status back as
    /// `x-amz-replication-status` so consumers can poll progress.
    /// Limited to single-instance (same `S4Service`) replication; true
    /// cross-region (multi-instance) is a v0.7+ follow-up.
    replication: Option<Arc<crate::replication::ReplicationManager>>,
    /// v0.6 #42: optional MFA-Delete enforcement layer. When `Some(...)`,
    /// every DELETE / DELETE-version / delete-marker / `PutBucketVersioning`
    /// request against a bucket whose MFA-Delete state is `Enabled`
    /// must carry `x-amz-mfa: <serial> <code>` (RFC 6238 6-digit TOTP);
    /// missing or invalid tokens return HTTP 403 `AccessDenied`. When
    /// `None` (default), the gate is a no-op so existing v0.4 / v0.5
    /// deployments are unaffected until they explicitly call
    /// `with_mfa_delete(...)`.
    mfa_delete: Option<Arc<crate::mfa::MfaDeleteManager>>,
    /// v0.5 #32: when `true`, every PUT must carry an SSE indicator
    /// (`x-amz-server-side-encryption`, the SSE-C customer-key headers,
    /// or be matched against a configured server-managed keyring/KMS).
    /// Set by `--compliance-mode strict` after the boot-time
    /// prerequisite check passes.
    compliance_strict: bool,
    /// v0.7 #47: optional SigV4a (asymmetric ECDSA-P256-SHA256) verify
    /// gate. When `Some(...)`, the listener-side middleware (see
    /// [`crate::routing::try_sigv4a_verify`]) inspects every incoming
    /// request and short-circuits SigV4a-signed ones — verifying the
    /// signature against the credential store and returning 403
    /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` on failure. Plain
    /// SigV4 (HMAC-SHA256) requests pass through to s3s untouched. When
    /// `None`, the middleware is a no-op so the existing SigV4 path is
    /// unaffected (operators opt in via `--sigv4a-credentials <DIR>`).
    sigv4a_gate: Option<Arc<SigV4aGate>>,
    /// v0.8 #54 BUG-5..10: per-`upload_id` side-table that ferries the
    /// SSE / Tagging / Object-Lock context captured at
    /// `CreateMultipartUpload` time through to `UploadPart` /
    /// `CompleteMultipartUpload`. Always-on (no `with_*` flag) — the
    /// store is gateway-internal and idle when no multipart is in
    /// flight. See [`crate::multipart_state`] for rationale.
    multipart_state: Arc<crate::multipart_state::MultipartStateStore>,
    /// v0.8 #52: plaintext bytes per S4E5 chunk on the SSE-S4 PUT
    /// path. `0` (default) → use the legacy buffered S4E2 path
    /// (whole-body AES-GCM tag, GET buffers + verifies before
    /// emitting). Non-zero → use the chunked S4E5 frame so GET can
    /// stream-decrypt chunk-by-chunk. Wired by `--sse-chunk-size`
    /// in `main.rs`. SSE-C and SSE-KMS are intentionally unaffected
    /// (chunked variants tracked in a follow-up issue).
    sse_chunk_size: usize,
    /// v0.8.5 #86 (audit M-2): bounded permit pool gating the detached
    /// replication dispatcher in [`Self::spawn_replication_if_matched`].
    /// Without this cap, a high-volume PUT workload (1k req/s × N enabled
    /// rules × slow destination = O(10k) in-flight tokio tasks) could
    /// exhaust process memory before the destination drains. Each
    /// dispatcher spawn `acquire_owned`s one permit and holds it for the
    /// lifetime of the destination PUT + status stamp; once the cap is
    /// reached the dispatcher async-blocks on `acquire_owned()` so the
    /// listener path itself never stalls — only the in-flight replica
    /// queue depth is bounded. Default 1024 (operator-tunable via
    /// `--replication-max-concurrent`).
    replication_semaphore: Arc<tokio::sync::Semaphore>,
    /// v0.8.11 CRIT-4 fix: trust the `X-Forwarded-For` header for the
    /// `aws:SourceIp` Condition key only when the operator has
    /// explicitly opted in via `--trust-x-forwarded-for`. Default
    /// (`false`) makes the policy evaluator see `source_ip = None`
    /// for incoming requests, so a public-internet client can no
    /// longer spoof an internal CIDR by setting `X-Forwarded-For`
    /// themselves. Operators behind a trusted reverse proxy that
    /// scrubs / sets `X-Forwarded-For` enable the flag; gateways
    /// listening directly on the public internet leave it off and
    /// gain a clear fail-closed default. A future release plumbs
    /// the TCP peer address through the s3s service trait so we can
    /// validate the forwarded header against a `--trusted-proxies`
    /// CIDR list; until then the boolean opt-in closes the immediate
    /// auth-bypass surface.
    trust_x_forwarded_for: bool,
}
547
548impl<B: S3> S4Service<B> {
549 /// AWS S3 単発 PUT の API 上限 (5 GiB)
550 pub const DEFAULT_MAX_BODY_BYTES: usize = 5 * 1024 * 1024 * 1024;
551
552 /// v0.8.5 #86 (audit M-2): default cap on simultaneously-in-flight
553 /// replication dispatcher tasks. See the `replication_semaphore`
554 /// field doc for the rationale + override path.
555 pub const DEFAULT_REPLICATION_MAX_CONCURRENT: usize = 1024;
556
557 pub fn new(
558 backend: B,
559 registry: Arc<CodecRegistry>,
560 dispatcher: Arc<dyn CodecDispatcher>,
561 ) -> Self {
562 Self {
563 backend: Arc::new(backend),
564 registry,
565 dispatcher,
566 max_body_bytes: Self::DEFAULT_MAX_BODY_BYTES,
567 policy: None,
568 secure_transport: false,
569 rate_limits: None,
570 access_log: None,
571 sse_keyring: None,
572 versioning: None,
573 kms: None,
574 kms_default_key_id: None,
575 object_lock: None,
576 cors: None,
577 inventory: None,
578 notifications: None,
579 lifecycle: None,
580 tagging: None,
581 replication: None,
582 mfa_delete: None,
583 compliance_strict: false,
584 sigv4a_gate: None,
585 multipart_state: Arc::new(crate::multipart_state::MultipartStateStore::new()),
586 // v0.8 #52: chunked SSE-S4 disabled by default — opt
587 // in via `S4Service::with_sse_chunk_size(...)` /
588 // `--sse-chunk-size <BYTES>`. Default keeps the legacy
589 // S4E2 buffered path so existing deployments are
590 // bit-for-bit unchanged.
591 sse_chunk_size: 0,
592 // v0.8.5 #86 (audit M-2): default cap of 1024 in-flight
593 // replication tasks. Picked to be (a) ample headroom over a
594 // typical steady-state replication rate (the v0.8.3 #66
595 // status-sweep doc cites 1k keys/hour as a "steady" rate, so
596 // even a 100x burst lands well under 1024), (b) small enough
597 // that the worst-case memory pinned by stalled dispatchers
598 // — body bytes + metadata — stays bounded (1024 × 5 MiB
599 // typical S3 PUT ≈ 5 GiB, recoverable). Operators with
600 // wider cross-region fan-out can override via
601 // `--replication-max-concurrent`.
602 replication_semaphore: Arc::new(tokio::sync::Semaphore::new(
603 Self::DEFAULT_REPLICATION_MAX_CONCURRENT,
604 )),
605 // v0.8.11 CRIT-4: default fail-closed — ignore client-
606 // supplied `X-Forwarded-For` until the operator opts in
607 // through `with_trust_x_forwarded_for(true)`.
608 trust_x_forwarded_for: false,
609 }
610 }
611
612 /// v0.8.11 CRIT-4 fix: opt in to consuming the leftmost token of
613 /// the `X-Forwarded-For` header as `aws:SourceIp`. Only enable
614 /// when the gateway sits behind a trusted reverse proxy that
615 /// strips (or rewrites) any client-supplied value. When left
616 /// off (default), the policy evaluator sees `source_ip = None`
617 /// regardless of what the client sends — closing the
618 /// public-internet `X-Forwarded-For: 10.0.0.1` IAM-allowlist
619 /// bypass.
620 #[must_use]
621 pub fn with_trust_x_forwarded_for(mut self, on: bool) -> Self {
622 self.trust_x_forwarded_for = on;
623 self
624 }
625
626 /// v0.7 #47: attach the SigV4a verify gate. Once set, the
627 /// listener-side middleware (`crate::routing::try_sigv4a_verify`)
628 /// short-circuits any incoming `AWS4-ECDSA-P256-SHA256` request,
629 /// verifying it against the supplied credential store and
630 /// returning 403 on failure. Plain SigV4 (HMAC-SHA256) requests
631 /// are unaffected. When the gate is unset (default), the
632 /// middleware skips entirely so existing SigV4 deployments keep
633 /// working.
634 #[must_use]
635 pub fn with_sigv4a_gate(mut self, gate: Arc<SigV4aGate>) -> Self {
636 self.sigv4a_gate = Some(gate);
637 self
638 }
639
640 /// v0.7 #47: borrow the attached SigV4a gate. Used by `main.rs`
641 /// to snapshot the gate `Arc` before the s3s `ServiceBuilder`
642 /// consumes the `S4Service` (the listener-side middleware needs
643 /// the same `Arc` because s3s' SigV4 verifier rejects SigV4a
644 /// algorithm tokens with "unknown algorithm" — match has to
645 /// happen at the hyper layer instead).
646 #[must_use]
647 pub fn sigv4a_gate(&self) -> Option<&Arc<SigV4aGate>> {
648 self.sigv4a_gate.as_ref()
649 }
650
651 /// v0.8.2 #62: borrow the multipart state store so `main.rs` can
652 /// snapshot the `Arc` before the s3s `ServiceBuilder` consumes
653 /// the `S4Service`. The background `sweep_stale` task in `main.rs`
654 /// holds this `Arc` and ticks once an hour to drop abandoned
655 /// upload contexts (and their `Zeroizing<[u8; 32]>` SSE-C keys).
656 #[must_use]
657 pub fn multipart_state(&self) -> &Arc<crate::multipart_state::MultipartStateStore> {
658 &self.multipart_state
659 }
660
661 /// v0.6 #39: attach the in-memory object + bucket Tagging manager.
662 /// Once set, `Put/Get/Delete` `Object/Bucket Tagging` route
663 /// through the manager (instead of forwarding to the backend),
664 /// and `put_object`'s `x-amz-tagging` parse path becomes the
665 /// source of `s3:RequestObjectTag/<key>` for the IAM policy
666 /// evaluator. The manager itself is shared via `Arc`.
667 #[must_use]
668 pub fn with_tagging(mut self, mgr: Arc<crate::tagging::TagManager>) -> Self {
669 self.tagging = Some(mgr);
670 self
671 }
672
673 /// v0.6 #39: borrow the attached tagging manager (test /
674 /// introspection — the snapshotter in `main.rs`, when wired,
675 /// will keep its own `Arc` clone).
676 #[must_use]
677 pub fn tag_manager(&self) -> Option<&Arc<crate::tagging::TagManager>> {
678 self.tagging.as_ref()
679 }
680
681 /// v0.6 #36: attach the in-memory S3 Inventory manager. Once set,
682 /// `put_bucket_inventory_configuration` /
683 /// `get_bucket_inventory_configuration` /
684 /// `list_bucket_inventory_configurations` /
685 /// `delete_bucket_inventory_configuration` route through the
686 /// manager. The actual periodic CSV / manifest emission is
687 /// orchestrated by a tokio task started in `main.rs`; the manager
688 /// itself is shared between the handler and the scheduler via
689 /// `Arc`.
690 #[must_use]
691 pub fn with_inventory(mut self, mgr: Arc<crate::inventory::InventoryManager>) -> Self {
692 self.inventory = Some(mgr);
693 self
694 }
695
696 /// v0.6 #36: borrow the attached inventory manager (test /
697 /// introspection — the background scheduler in `main.rs` keeps its
698 /// own `Arc` clone, so this accessor is for the test path that
699 /// invokes `run_once_for_test` directly).
700 #[must_use]
701 pub fn inventory_manager(&self) -> Option<&Arc<crate::inventory::InventoryManager>> {
702 self.inventory.as_ref()
703 }
704
705 /// v0.6 #37: attach the in-memory S3 Lifecycle configuration
706 /// manager. Once set, `put_bucket_lifecycle_configuration` /
707 /// `get_bucket_lifecycle_configuration` / `delete_bucket_lifecycle`
708 /// route through the manager (replacing the previous backend-
709 /// passthrough behaviour). The actual periodic scanner that walks
710 /// the source bucket and invokes Expiration / Transition /
711 /// NoncurrentExpiration actions is a v0.7+ follow-up — see
712 /// [`Self::run_lifecycle_once_for_test`] for the in-memory test
713 /// path that exercises the evaluator end-to-end.
714 #[must_use]
715 pub fn with_lifecycle(mut self, mgr: Arc<crate::lifecycle::LifecycleManager>) -> Self {
716 self.lifecycle = Some(mgr);
717 self
718 }
719
720 /// v0.6 #37: borrow the attached lifecycle manager (test /
721 /// introspection — the background scheduler in `main.rs` keeps its
722 /// own `Arc` clone, so this accessor is for the test path that
723 /// invokes the evaluator directly).
724 #[must_use]
725 pub fn lifecycle_manager(&self) -> Option<&Arc<crate::lifecycle::LifecycleManager>> {
726 self.lifecycle.as_ref()
727 }
728
729 /// v0.6 #37: synchronous test entry that runs the lifecycle evaluator
730 /// against a caller-provided list of `(key, age, size, tags)` tuples
731 /// and returns the `(key, action)` pairs that should fire. The actual
732 /// backend invocation (S3.delete_object / metadata rewrite) is left
733 /// to the caller — the unit + E2E tests use this to verify the
734 /// evaluator without spawning the (deferred) background scanner.
735 /// Returns an empty `Vec` when no lifecycle manager is attached or
736 /// no rule matches.
737 #[must_use]
738 pub fn run_lifecycle_once_for_test(
739 &self,
740 bucket: &str,
741 objects: &[crate::lifecycle::EvaluateBatchEntry],
742 ) -> Vec<(String, crate::lifecycle::LifecycleAction)> {
743 let Some(mgr) = self.lifecycle.as_ref() else {
744 return Vec::new();
745 };
746 crate::lifecycle::evaluate_batch(mgr, bucket, objects)
747 }
748
749 /// v0.6 #35: attach the in-memory bucket-notification manager. Once
750 /// set, `put_bucket_notification_configuration` /
751 /// `get_bucket_notification_configuration` route through the manager
752 /// (replacing the previous backend-passthrough behaviour); successful
753 /// `put_object` / `delete_object` calls fire matching destinations
754 /// on a detached tokio task via
755 /// `crate::notifications::dispatch_event` (best-effort, fire-and-
756 /// forget — failures bump the manager's `dropped_total` counter and
757 /// log at warn but do NOT fail the originating S3 request).
758 #[must_use]
759 pub fn with_notifications(
760 mut self,
761 mgr: Arc<crate::notifications::NotificationManager>,
762 ) -> Self {
763 self.notifications = Some(mgr);
764 self
765 }
766
767 /// v0.6 #35: borrow the attached notifications manager (test /
768 /// introspection — used by the metrics layer to read
769 /// `dropped_total`).
770 #[must_use]
771 pub fn notifications_manager(&self) -> Option<&Arc<crate::notifications::NotificationManager>> {
772 self.notifications.as_ref()
773 }
774
775 /// v0.6 #35: internal helper used by the DELETE handlers to fire a
776 /// matching notification on a detached tokio task. No-op when no
777 /// manager is attached or no rule on the bucket matches the given
778 /// (event, key) tuple.
779 fn fire_delete_notification(
780 &self,
781 bucket: &str,
782 key: &str,
783 event: crate::notifications::EventType,
784 version_id: Option<String>,
785 ) {
786 let Some(mgr) = self.notifications.as_ref() else {
787 return;
788 };
789 let dests = mgr.match_destinations(bucket, &event, key);
790 if dests.is_empty() {
791 return;
792 }
793 tokio::spawn(crate::notifications::dispatch_event(
794 Arc::clone(mgr),
795 bucket.to_owned(),
796 key.to_owned(),
797 event,
798 None,
799 None,
800 version_id,
801 format!("S4-{}", uuid::Uuid::new_v4()),
802 ));
803 }
804
805 /// v0.6 #40: attach the in-memory cross-bucket replication manager.
806 /// Once set, `put_bucket_replication` / `get_bucket_replication` /
807 /// `delete_bucket_replication` route through the manager (replacing
808 /// the previous backend-passthrough behaviour); a successful
809 /// `put_object` whose key matches an enabled rule fires a detached
810 /// tokio task that PUTs the same body + metadata to the rule's
811 /// destination bucket, stamping the replica with
812 /// `x-amz-replication-status: REPLICA`. Failures after the retry
813 /// budget bump the manager's `dropped_total` counter and are
814 /// surfaced in the `s4_replication_dropped_total` Prometheus
815 /// counter; successes bump `s4_replication_replicated_total`.
816 #[must_use]
817 pub fn with_replication(mut self, mgr: Arc<crate::replication::ReplicationManager>) -> Self {
818 self.replication = Some(mgr);
819 self
820 }
821
822 /// v0.6 #40: borrow the attached replication manager (test /
823 /// introspection — used by the metrics layer to read
824 /// `dropped_total`).
825 #[must_use]
826 pub fn replication_manager(&self) -> Option<&Arc<crate::replication::ReplicationManager>> {
827 self.replication.as_ref()
828 }
829
/// v0.6 #40: internal helper used by the PUT handlers to fire a
/// detached cross-bucket replication task. No-op when no manager
/// is attached, the source backend PUT failed, or no rule on the
/// source bucket matches the (key, tags) tuple. The `body` is the
/// post-compression / post-encryption `Bytes` that was sent to
/// the source backend (refcount-cloned), and `metadata` is the
/// metadata map that already includes the manifest /
/// `s4-encrypted` markers — the replica decodes through the same
/// path. The destination PUT runs through `Arc<B>::put_object`.
///
/// ## Arguments
///
/// - `source_bucket` / `source_key`: the object that was just written;
///   used for rule matching, status stamping and the lock-state lookup.
/// - `request_tags`: tags carried by this PUT, flattened into `(k, v)`
///   pairs for `match_rule` (`None` is treated as "no tags").
/// - `body` / `metadata`: cloned and handed to the spawned task.
/// - `backend_ok`: gate — when `false` the function returns immediately.
/// - `pending_version`: source-side version outcome; only consulted to
///   derive the destination key override (see below).
///
/// ## v0.8.2 #61: generation token + shadow-key destination
///
/// `pending_version` is the source-side `PutOutcome` minted by the
/// caller's versioning branch (or `None` for unversioned /
/// suspended buckets). When `pending_version.versioned_response`
/// is `true`, the dispatcher writes the destination under the same
/// shadow path the source uses (`<key>.__s4ver__/<vid>`) so the
/// destination's version chain receives the new version the same
/// way `?versionId=` GET resolves it. Closes audit C-1.
///
/// The dispatcher also mints a fresh `generation` token before
/// spawning, threaded through to [`crate::replication::
/// replicate_object`]. Closes audit C-3 — a stale retry of an
/// older PUT can no longer overwrite the destination's newer bytes
/// because the CAS guard sees the higher stored generation and
/// drops its destination write.
///
/// ## Asymmetric versioning policy (out of scope)
///
/// We assume source + destination buckets share the same
/// versioning policy (both Enabled or both Suspended /
/// Unversioned). Cross-bucket policy queries would require a
/// backend round-trip per replication, which is not worth it for
/// the single-instance scope. Operators who configure asymmetric
/// versioning will see destination-side `?versionId=` lookups
/// miss — documented as out-of-scope until a future per-rule
/// `destination_versioning_policy` knob lands.
// 8 args (counting `&self`) is the post-#61 shape: replication needs
// the source bucket+key, the canonical tag set for rule-matching,
// the post-codec body+metadata for the destination PUT, the
// backend-success gate, and the pending version-id for the
// shadow-key destination override. A shape struct would just
// split the (single) call site so opt for the inline form.
#[allow(clippy::too_many_arguments)]
fn spawn_replication_if_matched(
    &self,
    source_bucket: &str,
    source_key: &str,
    request_tags: &Option<crate::tagging::TagSet>,
    body: &bytes::Bytes,
    metadata: &Option<std::collections::HashMap<String, String>>,
    backend_ok: bool,
    pending_version: Option<&crate::versioning::PutOutcome>,
) where
    B: Send + Sync + 'static,
{
    // Nothing to replicate when the source write itself failed.
    if !backend_ok {
        return;
    }
    let Some(mgr) = self.replication.as_ref() else {
        return;
    };
    // Pull the request's tags into the (k, v) shape the matcher
    // expects. The tagging manager would have the canonical
    // post-PUT view but at this point in the pipeline it's
    // already been written above; for the rule-match decision
    // the request's tags are sufficient (= the tags this PUT
    // applies, S3 PutObject is full-replace on tags).
    let object_tags: Vec<(String, String)> = request_tags
        .as_ref()
        .map(|ts| ts.iter().cloned().collect())
        .unwrap_or_default();
    let Some(rule) = mgr.match_rule(source_bucket, source_key, &object_tags) else {
        return;
    };
    // v0.8.2 #61: mint the per-PUT generation BEFORE the eager
    // Pending stamp so the stamp itself carries the right
    // generation (the CAS in `record_status_if_newer` would
    // otherwise see a `generation=0` Pending and accept any
    // stale retry).
    let generation = mgr.next_generation();
    // Eagerly mark the source key as Pending so a HEAD between
    // the source PUT returning and the spawned task completing
    // surfaces the in-flight state. CAS-guarded so a slower
    // older PUT can't downgrade a newer Completed back to Pending.
    // The return value is intentionally discarded here.
    let _ = mgr.record_status_if_newer(
        source_bucket,
        source_key,
        generation,
        crate::replication::ReplicationStatus::Pending,
    );
    // v0.8.2 #61: derive the destination storage key. For a
    // versioning-Enabled source the destination receives the
    // same shadow-key path so a `?versionId=<vid>` GET on the
    // destination resolves through the same lookup the source
    // uses. Suspended / Unversioned sources keep the logical
    // key (= `None` override = dispatcher uses `source_key`).
    let destination_key_override = pending_version
        .filter(|pv| pv.versioned_response)
        .map(|pv| versioned_shadow_key(source_key, &pv.version_id));
    // v0.8.3 #68 (audit M-1): capture the source object's Object
    // Lock state so the dispatcher can decorate the destination
    // PUT with the matching AWS-wire lock headers. Without this,
    // a Compliance / Governance / legal-hold protected source
    // would replicate to a destination where DELETE succeeds
    // (the WORM posture would only hold on the source).
    let source_lock_state = self
        .object_lock
        .as_ref()
        .and_then(|mgr| mgr.get(source_bucket, source_key));
    // v0.8.3 #68: hand the destination-side ObjectLockManager to
    // the dispatcher closure so we can persist the propagated
    // lock state on successful destination PUT (the destination
    // PUT below bypasses S4Service::put_object — we drive the
    // backend directly — so the explicit_lock_mode commit block
    // in put_object never fires for replicas. We replay it here
    // against the destination key.)
    let dest_lock_mgr = self.object_lock.as_ref().map(Arc::clone);
    // Owned copies of everything the `'static` spawned task needs;
    // borrowed arguments cannot cross the `tokio::spawn` boundary.
    let mgr_cl = Arc::clone(mgr);
    let backend = Arc::clone(&self.backend);
    let body_cl = body.clone();
    let metadata_cl = metadata.clone();
    let source_bucket_cl = source_bucket.to_owned();
    let source_key_cl = source_key.to_owned();
    // Two consumers of the lock state: the `do_put` closure gets this
    // clone, `replicate_object` receives the original further down.
    let source_lock_state_for_closure = source_lock_state.clone();
    let source_bucket_for_warn = source_bucket.to_owned();
    // v0.8.5 #86 (audit M-2): bound the in-flight replication queue
    // depth. Acquire happens INSIDE the spawned task (not on the
    // listener path) so a saturated semaphore back-pressures the
    // dispatcher pool without stalling the source PUT response —
    // the source has already returned 200 to the client by the time
    // the spawn body runs. A failed `acquire_owned` only happens
    // when the semaphore is closed (we never close it, so the
    // logged-and-skipped fallback is unreachable in practice).
    let semaphore = Arc::clone(&self.replication_semaphore);
    tokio::spawn(async move {
        // Bound to `_permit` (not `_`) so the permit stays held until
        // the end of the task body instead of being dropped at once.
        let _permit = match semaphore.acquire_owned().await {
            Ok(p) => p,
            Err(e) => {
                tracing::warn!(
                    bucket = %source_bucket_cl,
                    key = %source_key_cl,
                    "S4 replication dispatcher could not acquire semaphore permit (closed? {e}); skipping replica"
                );
                // NOTE(review): this early return does not touch the
                // Pending status stamped before the spawn — confirm
                // whether anything else resolves it on this path.
                return;
            }
        };
        // Destination writer handed to `replicate_object`. Each call
        // clones the captured handles so the closure itself stays
        // reusable across invocations.
        let do_put = move |dest_bucket: String,
                           dest_key: String,
                           dest_body: bytes::Bytes,
                           dest_meta: Option<std::collections::HashMap<String, String>>| {
            let backend = Arc::clone(&backend);
            let dest_lock_mgr = dest_lock_mgr.clone();
            let lock_state = source_lock_state_for_closure.clone();
            let warn_src = source_bucket_for_warn.clone();
            async move {
                // Synthetic request built by hand: no credentials,
                // region or headers are attached.
                let req = S3Request {
                    input: PutObjectInput {
                        bucket: dest_bucket.clone(),
                        key: dest_key.clone(),
                        body: Some(bytes_to_blob(dest_body)),
                        metadata: dest_meta,
                        ..Default::default()
                    },
                    method: http::Method::PUT,
                    // "/" is a constant, well-formed URI, so this
                    // parse + unwrap cannot fail at runtime.
                    uri: "/".parse().unwrap(),
                    headers: http::HeaderMap::new(),
                    extensions: http::Extensions::new(),
                    credentials: None,
                    region: None,
                    service: None,
                    trailing_headers: None,
                };
                // Collapse the backend response to `Result<(), String>`;
                // only success / failure (with a message) matters here.
                let put_result = backend
                    .put_object(req)
                    .await
                    .map(|_| ())
                    .map_err(|e| format!("destination put_object: {e}"));
                // v0.8.3 #68: on successful destination PUT,
                // persist the propagated lock state into the
                // destination's ObjectLockManager so a subsequent
                // DELETE on the destination is refused. Three cases:
                //   - PUT failed              → skip (no replica to protect)
                //   - lock_state None         → nothing to propagate
                //   - dest manager None (operator misconfig)
                //                             → log warn-once + bump skip metric
                if put_result.is_ok()
                    && let Some(state) = lock_state
                {
                    match dest_lock_mgr {
                        Some(ref mgr) => {
                            mgr.set(&dest_bucket, &dest_key, state);
                        }
                        None => {
                            crate::replication::warn_lock_propagation_skipped(
                                &warn_src,
                                &dest_bucket,
                            );
                        }
                    }
                }
                put_result
            }
        };
        // v0.8.5 #81 (audit H-7): wrap the dispatcher body in
        // `futures::FutureExt::catch_unwind` so a panic inside
        // `replicate_object` (or any of the user-supplied closures
        // it drives — `do_put`, the destination backend, the lock
        // manager) does NOT bubble out of the detached task as a
        // `JoinError` that no operator dashboard scrapes. Caught
        // panics bump `s4_dispatcher_panics_total{kind="replication"}`
        // + log at ERROR with the panic payload, so silent feature
        // degradation (= every replication PUT panicking and
        // dropping the replica without any visible signal) becomes
        // a first-class metric the operator can alert on.
        //
        // `AssertUnwindSafe` is required because the inner future
        // captures `Arc<...>` clones + a `do_put` closure that are
        // not `UnwindSafe` by default; the safety contract here is
        // "we don't continue using any of those captures after the
        // panic" which trivially holds (we drop them and return).
        use futures::FutureExt as _;
        let dispatcher_kind = "replication";
        let fut = crate::replication::replicate_object(
            rule,
            source_bucket_cl,
            source_key_cl,
            body_cl,
            metadata_cl,
            do_put,
            mgr_cl,
            generation,
            destination_key_override,
            source_lock_state,
        );
        if let Err(panic) = std::panic::AssertUnwindSafe(fut).catch_unwind().await {
            // Panic payloads are `Box<dyn Any>`: try `&'static str`
            // first, then `String`, else fall back to a placeholder.
            let panic_msg = panic
                .downcast_ref::<&'static str>()
                .copied()
                .map(str::to_owned)
                .or_else(|| panic.downcast_ref::<String>().cloned())
                .unwrap_or_else(|| "(non-string panic payload)".to_owned());
            tracing::error!(
                kind = dispatcher_kind,
                panic_payload = %panic_msg,
                "S4 dispatcher task panicked (caught by catch_unwind, runtime not poisoned)"
            );
            crate::metrics::record_dispatcher_panic(dispatcher_kind);
        }
    });
}
1081
1082 /// v0.6 #42: attach the in-memory MFA-Delete enforcement manager.
1083 /// Once set, every DELETE / DELETE-version / delete-marker /
1084 /// `PutBucketVersioning` request against a bucket whose MFA-Delete
1085 /// state is `Enabled` requires a valid `x-amz-mfa: <serial> <code>`
1086 /// header (RFC 6238 6-digit TOTP); the gate is a no-op for buckets
1087 /// where MFA-Delete is `Disabled` (S3 default).
1088 #[must_use]
1089 pub fn with_mfa_delete(mut self, mgr: Arc<crate::mfa::MfaDeleteManager>) -> Self {
1090 self.mfa_delete = Some(mgr);
1091 self
1092 }
1093
1094 /// v0.6 #42: borrow the attached MFA-Delete manager (test /
1095 /// introspection — used by the snapshot path in `main.rs` to call
1096 /// `to_json` for restart-recoverable state).
1097 #[must_use]
1098 pub fn mfa_delete_manager(&self) -> Option<&Arc<crate::mfa::MfaDeleteManager>> {
1099 self.mfa_delete.as_ref()
1100 }
1101
1102 /// v0.6 #38: attach the in-memory CORS configuration manager. Once
1103 /// set, `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
1104 /// route through the manager instead of forwarding to the backend,
1105 /// and [`Self::handle_preflight`] becomes useful for the (future)
1106 /// listener-side OPTIONS interceptor.
1107 #[must_use]
1108 pub fn with_cors(mut self, mgr: Arc<crate::cors::CorsManager>) -> Self {
1109 self.cors = Some(mgr);
1110 self
1111 }
1112
1113 /// v0.6 #38: Borrow the attached CORS manager (test / introspection).
1114 #[must_use]
1115 pub fn cors_manager(&self) -> Option<&Arc<crate::cors::CorsManager>> {
1116 self.cors.as_ref()
1117 }
1118
1119 /// v0.6 #38: evaluate a CORS preflight request against the bucket's
1120 /// configured rules and, if a rule matches, return the headers that
1121 /// the (future) listener-side OPTIONS interceptor must put on the
1122 /// 200 response: `Access-Control-Allow-Origin`, `Access-Control-
1123 /// Allow-Methods`, `Access-Control-Allow-Headers`, optionally
1124 /// `Access-Control-Max-Age` and `Access-Control-Expose-Headers`.
1125 ///
1126 /// Returns `None` when no manager is attached, no config is
1127 /// registered for the bucket, or no rule matches the (origin,
1128 /// method, headers) triple. The caller is responsible for turning
1129 /// `None` into the appropriate 403 response.
1130 ///
1131 /// **Note:** the OPTIONS routing itself (i.e. wiring this method
1132 /// into the hyper-util listener path) is a follow-up — s3s does not
1133 /// surface OPTIONS as a typed S3 handler, so this method is
1134 /// currently call-able only from inside other handlers and tests.
1135 #[must_use]
1136 pub fn handle_preflight(
1137 &self,
1138 bucket: &str,
1139 origin: &str,
1140 method: &str,
1141 request_headers: &[String],
1142 ) -> Option<std::collections::HashMap<String, String>> {
1143 let mgr = self.cors.as_ref()?;
1144 let rule = mgr.match_preflight(bucket, origin, method, request_headers)?;
1145 let mut h = std::collections::HashMap::new();
1146 // Echo the matched origin back. If the rule used "*" we still
1147 // echo "*" (S3 spec — the spec does not require us to echo the
1148 // *requesting* origin when the wildcard matched).
1149 let allow_origin = if rule.allowed_origins.iter().any(|o| o == "*") {
1150 "*".to_string()
1151 } else {
1152 origin.to_string()
1153 };
1154 h.insert("Access-Control-Allow-Origin".to_string(), allow_origin);
1155 h.insert(
1156 "Access-Control-Allow-Methods".to_string(),
1157 rule.allowed_methods.join(", "),
1158 );
1159 if !rule.allowed_headers.is_empty() {
1160 // For the Allow-Headers response, echo back the rule's
1161 // pattern list verbatim (S3 echoes the configured list,
1162 // including "*" if present). Browsers honour exact-match
1163 // rules.
1164 h.insert(
1165 "Access-Control-Allow-Headers".to_string(),
1166 rule.allowed_headers.join(", "),
1167 );
1168 }
1169 if let Some(secs) = rule.max_age_seconds {
1170 h.insert("Access-Control-Max-Age".to_string(), secs.to_string());
1171 }
1172 if !rule.expose_headers.is_empty() {
1173 h.insert(
1174 "Access-Control-Expose-Headers".to_string(),
1175 rule.expose_headers.join(", "),
1176 );
1177 }
1178 Some(h)
1179 }
1180
1181 /// v0.5 #32: enable strict compliance mode. Every PUT must carry an
1182 /// SSE indicator (server-side encryption header or SSE-C customer
1183 /// key); requests without one are rejected with 400 InvalidRequest.
1184 /// Boot-time prerequisite checking lives in the binary
1185 /// (`validate_compliance_mode`) so this flag is purely the runtime
1186 /// switch.
1187 #[must_use]
1188 pub fn with_compliance_strict(mut self, on: bool) -> Self {
1189 self.compliance_strict = on;
1190 self
1191 }
1192
1193 /// v0.5 #30: attach the in-memory Object Lock (WORM) enforcement
1194 /// manager. Once set, `delete_object` and overwrite-path
1195 /// `put_object` refuse operations on locked keys with HTTP 403
1196 /// `AccessDenied`; new PUTs to a bucket with a default retention
1197 /// policy auto-create per-object lock state.
1198 #[must_use]
1199 pub fn with_object_lock(mut self, mgr: Arc<crate::object_lock::ObjectLockManager>) -> Self {
1200 self.object_lock = Some(mgr);
1201 self
1202 }
1203
1204 /// v0.7 #45: borrow the attached Object Lock manager (read-only —
1205 /// the lifecycle scanner uses this to skip currently-locked objects
1206 /// before issuing `delete_object`, since an Object Lock always wins
1207 /// over Lifecycle Expiration in AWS S3 semantics). Mirrors the
1208 /// shape of [`Self::lifecycle_manager`] /
1209 /// [`Self::tag_manager`] — purely additive accessor, no handler
1210 /// behaviour change.
1211 #[must_use]
1212 pub fn object_lock_manager(&self) -> Option<&Arc<crate::object_lock::ObjectLockManager>> {
1213 self.object_lock.as_ref()
1214 }
1215
1216 /// v0.5 #28: attach an SSE-KMS backend. `default_key_id` is used
1217 /// when a PUT requests SSE-KMS without naming a specific KMS key
1218 /// (operators set this to mirror AWS S3's bucket-default key).
1219 #[must_use]
1220 pub fn with_kms_backend(
1221 mut self,
1222 kms: Arc<dyn crate::kms::KmsBackend>,
1223 default_key_id: Option<String>,
1224 ) -> Self {
1225 self.kms = Some(kms);
1226 self.kms_default_key_id = default_key_id;
1227 self
1228 }
1229
1230 /// v0.5 #34: attach the first-class versioning state machine. Once
1231 /// set, this `S4Service` owns the per-bucket versioning state +
1232 /// per-(bucket, key) version chain; `put_object` / `get_object` /
1233 /// `delete_object` / `list_object_versions` /
1234 /// `get_bucket_versioning` / `put_bucket_versioning` consult the
1235 /// manager instead of passing through to the backend. The backend
1236 /// is still used as the byte store: Suspended / Unversioned buckets
1237 /// keep using `<key>` directly (legacy), Enabled buckets redirect
1238 /// each version's bytes to a shadow key
1239 /// (`<key>.__s4ver__/<version-id>`) so older versions survive newer
1240 /// PUTs to the same logical key.
1241 #[must_use]
1242 pub fn with_versioning(mut self, mgr: Arc<crate::versioning::VersioningManager>) -> Self {
1243 self.versioning = Some(mgr);
1244 self
1245 }
1246
1247 /// v0.8.5 #86 (audit M-3): borrow the attached versioning manager so
1248 /// the SIGUSR1 snapshot dump-back hook in `main.rs` can re-emit the
1249 /// in-memory state to the operator's `--versioning-state-file`
1250 /// without restarting the gateway. Mirrors the shape of
1251 /// [`Self::object_lock_manager`] / [`Self::lifecycle_manager`] —
1252 /// purely additive accessor, no handler behaviour change.
1253 #[must_use]
1254 pub fn versioning_manager(&self) -> Option<&Arc<crate::versioning::VersioningManager>> {
1255 self.versioning.as_ref()
1256 }
1257
1258 /// v0.8.5 #86 (audit M-2): override the default replication-dispatch
1259 /// concurrency cap (1024). Wired by the `--replication-max-concurrent`
1260 /// CLI flag in `main.rs`. Operators running heavy cross-region
1261 /// fan-out may need to raise this; operators on memory-constrained
1262 /// hosts may need to lower it. The new value replaces the existing
1263 /// `Semaphore` (so calling this after dispatchers are already in
1264 /// flight is fine — the in-flight tasks hold permits from the old
1265 /// semaphore which is dropped when its last permit is released).
1266 /// A `max` of 0 would deadlock all replicas; the value is silently
1267 /// clamped to 1 instead.
1268 #[must_use]
1269 pub fn with_replication_max_concurrent(mut self, max: usize) -> Self {
1270 let max = max.max(1);
1271 self.replication_semaphore = Arc::new(tokio::sync::Semaphore::new(max));
1272 self
1273 }
1274
1275 /// v0.8.5 #86 (audit M-2): borrow the in-flight replication
1276 /// concurrency permit pool. Tests inspect `available_permits()`
1277 /// after invoking `spawn_replication_if_matched` to verify the
1278 /// dispatcher actually `acquire_owned`s before kicking off the
1279 /// destination PUT.
1280 #[must_use]
1281 pub fn replication_semaphore(&self) -> &Arc<tokio::sync::Semaphore> {
1282 &self.replication_semaphore
1283 }
1284
1285 /// v0.4 #21 (kept for back-compat): attach a single SSE-S4 key.
1286 /// Internally wraps it in a 1-slot keyring with id=1 active, so
1287 /// new objects ride the v0.5 S4E2 frame while previously-written
1288 /// S4E1 bytes (this same key) still decrypt via the keyring's S4E1
1289 /// fallback path. Operators wanting true rotation should call
1290 /// [`Self::with_sse_keyring`] instead.
1291 #[must_use]
1292 pub fn with_sse_key(mut self, key: crate::sse::SharedSseKey) -> Self {
1293 let keyring = crate::sse::SseKeyring::new(1, key);
1294 self.sse_keyring = Some(std::sync::Arc::new(keyring));
1295 self
1296 }
1297
1298 /// v0.5 #29: attach a multi-key SSE-S4 keyring. PUT encrypts under
1299 /// the active key (S4E2 frame stamped with that key's id); GET
1300 /// dispatches on the body's magic — S4E1 falls back to trying every
1301 /// key in the ring (active first) so v0.4 objects survive a
1302 /// migration; S4E2 looks up the explicit key_id from the header.
1303 #[must_use]
1304 pub fn with_sse_keyring(mut self, keyring: crate::sse::SharedSseKeyring) -> Self {
1305 self.sse_keyring = Some(keyring);
1306 self
1307 }
1308
1309 /// v0.8 #52: opt the SSE-S4 PUT path into the chunked S4E5 frame
1310 /// (so the matching GET can stream-decrypt chunk-by-chunk
1311 /// instead of buffering the entire body before tag verify).
1312 /// `bytes` is the plaintext slice size — typically 1 MiB; 0
1313 /// disables the path and reverts to the legacy S4E2 buffered
1314 /// frame.
1315 ///
1316 /// SSE-C (S4E3) and SSE-KMS (S4E4) are intentionally untouched:
1317 /// the chunked envelopes for those flows are a follow-up issue
1318 /// (the customer-key wire surface needs separate version
1319 /// negotiation).
1320 ///
1321 /// Has no effect when `with_sse_keyring` / `with_sse_key` is
1322 /// not also set — the chunked path runs only on the SSE-S4
1323 /// branch of `put_object`.
1324 #[must_use]
1325 pub fn with_sse_chunk_size(mut self, bytes: usize) -> Self {
1326 self.sse_chunk_size = bytes;
1327 self
1328 }
1329
1330 /// v0.4 #20: attach an S3-style access-log emitter. Each completed
1331 /// PUT / GET / DELETE / List handler emits one entry into the
1332 /// emitter's buffer; a background flusher (started separately, see
1333 /// [`crate::access_log::AccessLog::spawn_flusher`]) writes hourly
1334 /// rotated `.log` files into the configured directory.
1335 #[must_use]
1336 pub fn with_access_log(mut self, log: crate::access_log::SharedAccessLog) -> Self {
1337 self.access_log = Some(log);
1338 self
1339 }
1340
1341 /// Capture the per-request access-log preamble before the request is
1342 /// consumed by the backend call. Returns `None` if no access logger
1343 /// is configured (cheap early-out so the handler doesn't pay the
1344 /// header-clone cost when access logging is off).
1345 fn access_log_preamble<I>(&self, req: &S3Request<I>) -> Option<AccessLogPreamble> {
1346 self.access_log.as_ref()?;
1347 Some(AccessLogPreamble {
1348 // v0.8.11 CRIT-4 fix: same trust gate as `request_context`.
1349 // Recording a client-controllable header in the access log
1350 // would poison forensic queries; leave it `None` until the
1351 // operator declares X-Forwarded-For is set by a trusted
1352 // proxy.
1353 remote_ip: if self.trust_x_forwarded_for {
1354 req.headers
1355 .get("x-forwarded-for")
1356 .and_then(|v| v.to_str().ok())
1357 .and_then(|raw| raw.split(',').next())
1358 .map(|s| s.trim().to_owned())
1359 } else {
1360 None
1361 },
1362 requester: Self::principal_of(req).map(str::to_owned),
1363 request_uri: format!("{} {}", req.method, req.uri.path()),
1364 user_agent: req
1365 .headers
1366 .get("user-agent")
1367 .and_then(|v| v.to_str().ok())
1368 .map(str::to_owned),
1369 })
1370 }
1371
1372 /// Internal — called by handlers at end-of-request with a captured
1373 /// preamble. Best-effort: swallows the await fast (clones Arc +
1374 /// pushes), no error propagation back to the request path.
1375 #[allow(clippy::too_many_arguments)]
1376 async fn record_access(
1377 &self,
1378 preamble: Option<AccessLogPreamble>,
1379 operation: &'static str,
1380 bucket: &str,
1381 key: Option<&str>,
1382 http_status: u16,
1383 bytes_sent: u64,
1384 object_size: u64,
1385 total_time_ms: u64,
1386 error_code: Option<&str>,
1387 ) {
1388 let (Some(log), Some(p)) = (self.access_log.as_ref(), preamble) else {
1389 return;
1390 };
1391 log.record(crate::access_log::AccessLogEntry {
1392 time: std::time::SystemTime::now(),
1393 bucket: bucket.to_owned(),
1394 remote_ip: p.remote_ip,
1395 requester: p.requester,
1396 operation,
1397 key: key.map(str::to_owned),
1398 request_uri: p.request_uri,
1399 http_status,
1400 error_code: error_code.map(str::to_owned),
1401 bytes_sent,
1402 object_size,
1403 total_time_ms,
1404 user_agent: p.user_agent,
1405 })
1406 .await;
1407 }
1408
1409 /// v0.4 #19: attach a per-(principal, bucket) token-bucket rate limiter.
1410 /// When set, every PUT / GET / DELETE / List / Copy / multipart op is
1411 /// throttle-checked before the policy gate; throttled requests return
1412 /// `S3ErrorCode::SlowDown` (HTTP 503) and bump
1413 /// `s4_rate_limit_throttled_total{principal,bucket}`.
1414 #[must_use]
1415 pub fn with_rate_limits(mut self, rl: crate::rate_limit::SharedRateLimits) -> Self {
1416 self.rate_limits = Some(rl);
1417 self
1418 }
1419
1420 /// Helper used by request handlers to apply the rate limit. Returns
1421 /// `Ok(())` when allowed (or no rate limiter is configured), or a
1422 /// `SlowDown` S3Error otherwise.
1423 fn enforce_rate_limit<I>(&self, req: &S3Request<I>, bucket: &str) -> S3Result<()> {
1424 let Some(rl) = self.rate_limits.as_ref() else {
1425 return Ok(());
1426 };
1427 let principal_id = Self::principal_of(req);
1428 if !rl.check(principal_id, bucket) {
1429 crate::metrics::record_rate_limit_throttle(principal_id.unwrap_or("-"), bucket);
1430 return Err(S3Error::with_message(
1431 S3ErrorCode::SlowDown,
1432 format!("rate-limited: bucket={bucket}"),
1433 ));
1434 }
1435 Ok(())
1436 }
1437
1438 /// Tell the policy evaluator that the listener is reached over TLS
1439 /// (or ACME). When `true`, the `aws:SecureTransport` Condition key
1440 /// resolves to `true`. Defaults to `false`.
1441 #[must_use]
1442 pub fn with_secure_transport(mut self, on: bool) -> Self {
1443 self.secure_transport = on;
1444 self
1445 }
1446
1447 #[must_use]
1448 pub fn with_max_body_bytes(mut self, n: usize) -> Self {
1449 self.max_body_bytes = n;
1450 self
1451 }
1452
1453 /// Attach an optional bucket policy (v0.2 #7). When `Some(...)`, every
1454 /// PUT / GET / DELETE / List handler runs `policy.evaluate(...)` before
1455 /// delegating to the backend; failures return `S3ErrorCode::AccessDenied`.
1456 /// When `None` (the default), no policy enforcement happens.
1457 #[must_use]
1458 pub fn with_policy(mut self, policy: crate::policy::SharedPolicy) -> Self {
1459 self.policy = Some(policy);
1460 self
1461 }
1462
1463 /// Pull the SigV4 access key id off the request's credentials, if any.
1464 /// Used as the `principal_id` for policy evaluation.
1465 fn principal_of<I>(req: &S3Request<I>) -> Option<&str> {
1466 req.credentials.as_ref().map(|c| c.access_key.as_str())
1467 }
1468
1469 /// v0.3 #13: build the per-request policy context from the incoming
1470 /// `S3Request`. Pulls `aws:UserAgent` from the User-Agent header,
1471 /// `aws:SourceIp` from the standard `X-Forwarded-For` header (most
1472 /// production deployments are behind an LB / reverse proxy that sets
1473 /// this), `aws:CurrentTime` from the system clock, and
1474 /// `aws:SecureTransport` from the per-listener TLS flag.
1475 fn request_context<I>(&self, req: &S3Request<I>) -> crate::policy::RequestContext {
1476 let user_agent = req
1477 .headers
1478 .get("user-agent")
1479 .and_then(|v| v.to_str().ok())
1480 .map(str::to_owned);
1481 // v0.8.11 CRIT-4 fix: `X-Forwarded-For` is a client-controllable
1482 // header. Trusting it unconditionally lets any public-internet
1483 // request claim it came from a trusted CIDR (e.g.
1484 // `curl -H 'X-Forwarded-For: 10.0.0.1'` to satisfy a
1485 // `Condition: NotIpAddress aws:SourceIp [10.0.0.0/8]` Deny).
1486 // We now only consume the header when the operator has
1487 // declared "this gateway sits behind a trusted reverse proxy
1488 // that scrubs client-supplied values" via
1489 // `with_trust_x_forwarded_for(true)` /
1490 // `--trust-x-forwarded-for`. Default leaves `source_ip` as
1491 // `None`, which fails closed for IP-allowlist Allow rules
1492 // and fails open for IP-blocklist Deny rules — operators
1493 // who need either case behind a public listener must opt in
1494 // or move the gate to the reverse proxy. The leftmost
1495 // comma-separated token is the originator per the
1496 // `X-Forwarded-For: client, proxy1, proxy2` convention.
1497 let source_ip = if self.trust_x_forwarded_for {
1498 req.headers
1499 .get("x-forwarded-for")
1500 .and_then(|v| v.to_str().ok())
1501 .and_then(|raw| raw.split(',').next())
1502 .and_then(|s| s.trim().parse().ok())
1503 } else {
1504 None
1505 };
1506 crate::policy::RequestContext {
1507 source_ip,
1508 user_agent,
1509 request_time: Some(std::time::SystemTime::now()),
1510 secure_transport: self.secure_transport,
1511 existing_object_tags: None,
1512 request_object_tags: None,
1513 extra: Default::default(),
1514 }
1515 }
1516
1517 /// Helper used by request handlers to enforce the optional policy.
1518 /// Returns `Ok(())` when allowed (or no policy is configured), or an
1519 /// `AccessDenied` S3Error otherwise. Bumps the policy denial Prometheus
1520 /// counter on deny.
1521 fn enforce_policy<I>(
1522 &self,
1523 req: &S3Request<I>,
1524 action: &'static str,
1525 bucket: &str,
1526 key: Option<&str>,
1527 ) -> S3Result<()> {
1528 self.enforce_policy_with_extra(req, action, bucket, key, None, None)
1529 }
1530
1531 /// v0.6 #39: variant of [`Self::enforce_policy`] that lets the
1532 /// caller plumb tag context (existing-on-object + on-request) into
1533 /// the policy evaluator. Both arguments default to `None`, in
1534 /// which case the resulting `RequestContext` is identical to
1535 /// [`Self::enforce_policy`]'s — so for handlers that don't deal
1536 /// with tags this is a transparent no-op.
1537 fn enforce_policy_with_extra<I>(
1538 &self,
1539 req: &S3Request<I>,
1540 action: &'static str,
1541 bucket: &str,
1542 key: Option<&str>,
1543 request_tags: Option<&crate::tagging::TagSet>,
1544 existing_tags: Option<&crate::tagging::TagSet>,
1545 ) -> S3Result<()> {
1546 let Some(policy) = self.policy.as_ref() else {
1547 return Ok(());
1548 };
1549 let principal_id = Self::principal_of(req);
1550 let mut ctx = self.request_context(req);
1551 if let Some(t) = request_tags {
1552 ctx.request_object_tags = Some(t.clone());
1553 }
1554 if let Some(t) = existing_tags {
1555 ctx.existing_object_tags = Some(t.clone());
1556 }
1557 let decision = policy.evaluate_with(action, bucket, key, principal_id, &ctx);
1558 if decision.allow {
1559 Ok(())
1560 } else {
1561 crate::metrics::record_policy_denial(action, bucket);
1562 tracing::info!(
1563 action,
1564 bucket,
1565 key = ?key,
1566 principal = ?principal_id,
1567 source_ip = ?ctx.source_ip,
1568 user_agent = ?ctx.user_agent,
1569 secure_transport = ctx.secure_transport,
1570 matched_sid = ?decision.matched_sid,
1571 effect = ?decision.matched_effect,
1572 "S4 policy denied request"
1573 );
1574 Err(S3Error::with_message(
1575 S3ErrorCode::AccessDenied,
1576 format!("denied by S4 policy: {action} on bucket={bucket}"),
1577 ))
1578 }
1579 }
1580
1581 /// テスト用: backend を取り戻す (test helper、production では使わない).
1582 /// v0.6 #40 で `backend` が `Arc<B>` 化したので `Arc::try_unwrap` で
1583 /// 1-clone の場合のみ返す。共有されている (= replication dispatcher が
1584 /// 同じ Arc を持っていて未完了) 場合は `Err` を返さず panic させる
1585 /// (test 用途専用 helper の caller 契約を維持)。
1586 pub fn into_backend(self) -> B {
1587 Arc::try_unwrap(self.backend).unwrap_or_else(|_| {
1588 panic!("into_backend: backend Arc still shared (replication dispatcher in flight?)")
1589 })
1590 }
1591
1592 /// 必要 frame だけを backend に Range GET し、frame parse + decompress + slice
1593 /// した結果を返す sidecar fast path。Range request の **帯域節約版**。
1594 async fn partial_range_get(
1595 &self,
1596 req: &S3Request<GetObjectInput>,
1597 plan: s4_codec::index::RangePlan,
1598 client_start: u64,
1599 client_end_exclusive: u64,
1600 total_original: u64,
1601 get_start: Instant,
1602 ) -> S3Result<S3Response<GetObjectOutput>> {
1603 // 必要 byte 範囲だけを backend に partial GET
1604 let backend_range = s3s::dto::Range::Int {
1605 first: plan.byte_start,
1606 last: Some(plan.byte_end_exclusive - 1),
1607 };
1608 let backend_input = GetObjectInput {
1609 bucket: req.input.bucket.clone(),
1610 key: req.input.key.clone(),
1611 range: Some(backend_range),
1612 ..Default::default()
1613 };
1614 let backend_req = S3Request {
1615 input: backend_input,
1616 method: req.method.clone(),
1617 uri: req.uri.clone(),
1618 headers: req.headers.clone(),
1619 extensions: http::Extensions::new(),
1620 credentials: req.credentials.clone(),
1621 region: req.region.clone(),
1622 service: req.service.clone(),
1623 trailing_headers: None,
1624 };
1625 let mut backend_resp = self.backend.get_object(backend_req).await?;
1626 let blob = backend_resp.output.body.take().ok_or_else(|| {
1627 S3Error::with_message(
1628 S3ErrorCode::InternalError,
1629 "backend partial GET returned empty body",
1630 )
1631 })?;
1632 let bytes = collect_blob(blob, self.max_body_bytes)
1633 .await
1634 .map_err(internal("collect partial body"))?;
1635
1636 // frame parse + decompress
1637 let mut combined = BytesMut::new();
1638 for frame in FrameIter::new(bytes) {
1639 let (header, payload) = frame.map_err(|e| {
1640 S3Error::with_message(
1641 S3ErrorCode::InternalError,
1642 format!("partial-range frame parse: {e}"),
1643 )
1644 })?;
1645 let chunk_manifest = ChunkManifest {
1646 codec: header.codec,
1647 original_size: header.original_size,
1648 compressed_size: header.compressed_size,
1649 crc32c: header.crc32c,
1650 };
1651 let decompressed = self
1652 .registry
1653 .decompress(payload, &chunk_manifest)
1654 .await
1655 .map_err(internal("partial-range decompress"))?;
1656 combined.extend_from_slice(&decompressed);
1657 }
1658 let combined = combined.freeze();
1659 let sliced = combined
1660 .slice(plan.slice_start_in_combined as usize..plan.slice_end_in_combined as usize);
1661
1662 // response 組立て
1663 let returned_size = sliced.len() as u64;
1664 backend_resp.output.content_length = Some(returned_size as i64);
1665 backend_resp.output.content_range = Some(format!(
1666 "bytes {client_start}-{}/{total_original}",
1667 client_end_exclusive - 1
1668 ));
1669 backend_resp.output.checksum_crc32 = None;
1670 backend_resp.output.checksum_crc32c = None;
1671 backend_resp.output.checksum_crc64nvme = None;
1672 backend_resp.output.checksum_sha1 = None;
1673 backend_resp.output.checksum_sha256 = None;
1674 backend_resp.output.e_tag = None;
1675 backend_resp.output.body = Some(bytes_to_blob(sliced));
1676 backend_resp.status = Some(http::StatusCode::PARTIAL_CONTENT);
1677
1678 let elapsed = get_start.elapsed();
1679 crate::metrics::record_get(
1680 "partial",
1681 plan.byte_end_exclusive - plan.byte_start,
1682 returned_size,
1683 elapsed.as_secs_f64(),
1684 true,
1685 );
1686 info!(
1687 op = "get_object",
1688 bucket = %req.input.bucket,
1689 key = %req.input.key,
1690 bytes_in = plan.byte_end_exclusive - plan.byte_start,
1691 bytes_out = returned_size,
1692 total_object_size = total_original,
1693 range = true,
1694 path = "sidecar-partial",
1695 latency_ms = elapsed.as_millis() as u64,
1696 "S4 partial Range GET via sidecar index"
1697 );
1698 Ok(backend_resp)
1699 }
1700
1701 /// `<key>.s4index` sidecar object を backend に書く。失敗しても本体 PUT は
1702 /// 成功扱いにしたいので、err は warn ログのみ (Range GET の partial path が
1703 /// 使えなくなるが、full read fallback で意味的には正しい結果を返す)。
1704 async fn write_sidecar(&self, bucket: &str, key: &str, index: &FrameIndex) {
1705 let bytes = encode_index(index);
1706 let len = bytes.len() as i64;
1707 let sidecar = sidecar_key(key);
1708 // v0.7 #49: synthetic re-entry URI must be percent-encoded; if
1709 // the (already legally-arbitrary) S3 key produces something we
1710 // cannot encode at all, drop the sidecar PUT (the GET path
1711 // falls back to a full read on a missing sidecar) instead of
1712 // panicking on `parse().unwrap()`.
1713 let uri = match safe_object_uri(bucket, &sidecar) {
1714 Ok(u) => u,
1715 Err(e) => {
1716 tracing::warn!(
1717 bucket,
1718 key,
1719 "S4 write_sidecar skipped (key not URI-encodable): {e}"
1720 );
1721 return;
1722 }
1723 };
1724 let put_input = PutObjectInput {
1725 bucket: bucket.into(),
1726 key: sidecar,
1727 body: Some(bytes_to_blob(bytes)),
1728 content_length: Some(len),
1729 content_type: Some("application/x-s4-index".into()),
1730 ..Default::default()
1731 };
1732 let put_req = S3Request {
1733 input: put_input,
1734 method: http::Method::PUT,
1735 uri,
1736 headers: http::HeaderMap::new(),
1737 extensions: http::Extensions::new(),
1738 credentials: None,
1739 region: None,
1740 service: None,
1741 trailing_headers: None,
1742 };
1743 if let Err(e) = self.backend.put_object(put_req).await {
1744 tracing::warn!(
1745 bucket,
1746 key,
1747 "S4 write_sidecar failed (Range GET will fall back to full read): {e}"
1748 );
1749 }
1750 }
1751
1752 /// v0.8.4 #73 H-2: confirm that the sidecar we just decoded still
1753 /// describes the current backend object before we trust its frame
1754 /// offsets for a partial Range GET. The sidecar carries the source
1755 /// `etag` and `compressed_size` that were observed at PUT time; we
1756 /// HEAD the backend object and compare.
1757 ///
1758 /// Decision matrix:
1759 /// - sidecar `source_etag = None` (legacy v1 / build_index_from_body
1760 /// that wasn't stamped) → return `true` (best-effort, preserves
1761 /// pre-v0.8.4 behaviour for existing on-disk sidecars).
1762 /// - HEAD fails → return `false` (we can't tell either way; full GET
1763 /// path will surface the real backend error to the client).
1764 /// - HEAD ETag matches → `true`.
1765 /// - HEAD ETag differs OR HEAD size differs from
1766 /// `source_compressed_size` → `false` (sidecar stale or attacker-
1767 /// written; fall back to full GET).
1768 async fn sidecar_version_binding_ok(
1769 &self,
1770 bucket: &str,
1771 key: &str,
1772 index: &FrameIndex,
1773 ) -> bool {
1774 let Some(ref expected_etag) = index.source_etag else {
1775 // Legacy sidecar without the v0.8.4 #73 H-2 binding —
1776 // back-compat: trust it (the partial fetch is the same
1777 // best-effort path that v0.8.3 and earlier shipped).
1778 return true;
1779 };
1780 let head_input = HeadObjectInput {
1781 bucket: bucket.into(),
1782 key: key.into(),
1783 ..Default::default()
1784 };
1785 let uri = match safe_object_uri(bucket, key) {
1786 Ok(u) => u,
1787 Err(_) => return false,
1788 };
1789 let head_req = S3Request {
1790 input: head_input,
1791 method: http::Method::HEAD,
1792 uri,
1793 headers: http::HeaderMap::new(),
1794 extensions: http::Extensions::new(),
1795 credentials: None,
1796 region: None,
1797 service: None,
1798 trailing_headers: None,
1799 };
1800 let head = match self.backend.head_object(head_req).await {
1801 Ok(r) => r.output,
1802 Err(e) => {
1803 tracing::debug!(
1804 bucket,
1805 key,
1806 "S4 sidecar version-binding HEAD failed, falling back to full GET: {e}"
1807 );
1808 return false;
1809 }
1810 };
1811 // ETag is a strong-vs-weak enum; we compare on the unwrapped string
1812 // form (matches what the PUT path stamped — see below).
1813 let live_etag = head.e_tag.as_ref().map(|t| t.value());
1814 if live_etag != Some(expected_etag.as_str()) {
1815 tracing::debug!(
1816 bucket,
1817 key,
1818 "sidecar stale (ETag mismatch), falling back to full GET (sidecar={:?}, live={:?})",
1819 expected_etag,
1820 live_etag,
1821 );
1822 return false;
1823 }
1824 if let Some(expected_size) = index.source_compressed_size
1825 && let Some(live_size) = head.content_length
1826 && live_size as u64 != expected_size
1827 {
1828 tracing::debug!(
1829 bucket,
1830 key,
1831 "sidecar stale (size mismatch), falling back to full GET (sidecar={}, live={})",
1832 expected_size,
1833 live_size,
1834 );
1835 return false;
1836 }
1837 true
1838 }
1839
1840 /// `<key>.s4index` sidecar を backend から読み出す。なければ None。
1841 async fn read_sidecar(&self, bucket: &str, key: &str) -> Option<FrameIndex> {
1842 let sidecar = sidecar_key(key);
1843 // v0.7 #49: same encode-or-bail treatment as write_sidecar.
1844 let uri = safe_object_uri(bucket, &sidecar).ok()?;
1845 let get_input = GetObjectInput {
1846 bucket: bucket.into(),
1847 key: sidecar,
1848 ..Default::default()
1849 };
1850 let get_req = S3Request {
1851 input: get_input,
1852 method: http::Method::GET,
1853 uri,
1854 headers: http::HeaderMap::new(),
1855 extensions: http::Extensions::new(),
1856 credentials: None,
1857 region: None,
1858 service: None,
1859 trailing_headers: None,
1860 };
1861 let resp = self.backend.get_object(get_req).await.ok()?;
1862 let blob = resp.output.body?;
1863 let bytes = collect_blob(blob, 64 * 1024 * 1024).await.ok()?;
1864 decode_index(bytes).ok()
1865 }
1866
1867 /// Multipart object (frame 列) を解凍 → 元 bytes を再構築。
1868 ///
1869 /// **per-frame codec dispatch**: 各 frame header に codec_id が入っているので、
1870 /// frame ごとに registry が違う codec を呼ぶことができる。同一 object 内で
1871 /// 異なる codec が混在していても透過的に解凍可能 (parquet 風 mixed columns 等)。
1872 async fn decompress_multipart(&self, bytes: bytes::Bytes) -> S3Result<bytes::Bytes> {
1873 let mut out = BytesMut::new();
1874 for frame in FrameIter::new(bytes) {
1875 let (header, payload) = frame.map_err(|e| {
1876 S3Error::with_message(
1877 S3ErrorCode::InternalError,
1878 format!("multipart frame parse: {e}"),
1879 )
1880 })?;
1881 let chunk_manifest = ChunkManifest {
1882 codec: header.codec,
1883 original_size: header.original_size,
1884 compressed_size: header.compressed_size,
1885 crc32c: header.crc32c,
1886 };
1887 let decompressed = self
1888 .registry
1889 .decompress(payload, &chunk_manifest)
1890 .await
1891 .map_err(internal("multipart frame decompress"))?;
1892 out.extend_from_slice(&decompressed);
1893 }
1894 Ok(out.freeze())
1895 }
1896}
1897
1898/// Parse a CopySourceRange header value (`bytes=N-M`, `bytes=N-`, `bytes=-N`)
1899/// into the s3s::dto::Range used by the GetObject path. The S3 spec only
1900/// allows `bytes=N-M` for upload_part_copy (no suffix or open-ended), so
1901/// reject the other variants for parity with AWS.
1902fn parse_copy_source_range(s: &str) -> Result<s3s::dto::Range, String> {
1903 let rest = s
1904 .strip_prefix("bytes=")
1905 .ok_or_else(|| format!("CopySourceRange must start with 'bytes=', got {s:?}"))?;
1906 let (a, b) = rest
1907 .split_once('-')
1908 .ok_or_else(|| format!("CopySourceRange must be 'bytes=N-M', got {s:?}"))?;
1909 let first: u64 = a
1910 .parse()
1911 .map_err(|_| format!("CopySourceRange first byte not a number: {a:?}"))?;
1912 let last: u64 = b
1913 .parse()
1914 .map_err(|_| format!("CopySourceRange last byte not a number: {b:?}"))?;
1915 if last < first {
1916 return Err(format!("CopySourceRange last < first: {s:?}"));
1917 }
1918 Ok(s3s::dto::Range::Int {
1919 first,
1920 last: Some(last),
1921 })
1922}
1923
/// v0.5 #34: build the backend storage key for a (logical key, version-id)
/// pair on an Enabled-versioning bucket.
///
/// The result is `<key>.__s4ver__/<version_id>`. The `.__s4ver__/` marker
/// was chosen because:
/// - it does not occur inside natural `.s4index` / `.s4ver` keys, so
///   listing filters do not hit false positives
/// - the directory-style separator keeps the S3 console "browse by prefix"
///   UX intact (versions roll up under one virtual folder per object)
/// - it stays human-readable in debug logs and `aws s3 ls`
///
/// `list_objects` / `list_objects_v2` / `list_object_versions` MUST filter
/// keys containing `.__s4ver__/` from results so customers don't see
/// internal shadow objects.
pub fn versioned_shadow_key(key: &str, version_id: &str) -> String {
    [key, ".__s4ver__/", version_id].concat()
}
1940
/// Report whether `key` contains the `.__s4ver__/` marker that
/// [`versioned_shadow_key`] inserts.
///
/// A plain substring scan; both the list_objects filter and the GET
/// passthrough check rely on it.
fn is_versioning_shadow_key(key: &str) -> bool {
    key.split_once(".__s4ver__/").is_some()
}
1946
/// v0.6 #42: wall-clock seconds since the UNIX epoch.
///
/// Fed to `mfa::check_mfa` so the TOTP verifier shares the client
/// authenticator app's view of "now". If the clock reads earlier than
/// 1970 (not expected in practice) this yields `0`, so the verifier
/// rejects the code instead of the process panicking.
fn current_unix_secs() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}
1958
1959/// v0.6 #42: translate an `MfaError` into the matching S3 wire error.
1960///
1961/// - `Missing` / `SerialMismatch` / `InvalidCode` → `403 AccessDenied`
1962/// (S3 spec for MFA Delete: every gating failure surfaces as
1963/// `AccessDenied`, not a separate `MFA*` code).
1964/// - `Malformed` → `400 InvalidRequest` (the request itself is
1965/// syntactically broken, not a permission issue).
1966fn mfa_error_to_s3(e: crate::mfa::MfaError) -> S3Error {
1967 match e {
1968 crate::mfa::MfaError::Missing => S3Error::with_message(
1969 S3ErrorCode::AccessDenied,
1970 "MFA token required for this operation",
1971 ),
1972 crate::mfa::MfaError::Malformed => {
1973 S3Error::with_message(S3ErrorCode::InvalidRequest, "malformed x-amz-mfa header")
1974 }
1975 crate::mfa::MfaError::SerialMismatch => S3Error::with_message(
1976 S3ErrorCode::AccessDenied,
1977 "MFA serial does not match configured device",
1978 ),
1979 crate::mfa::MfaError::InvalidCode => {
1980 S3Error::with_message(S3ErrorCode::AccessDenied, "invalid MFA code")
1981 }
1982 }
1983}
1984
1985fn is_multipart_object(metadata: &Option<Metadata>) -> bool {
1986 metadata
1987 .as_ref()
1988 .and_then(|m| m.get(META_MULTIPART))
1989 .map(|v| v == "true")
1990 .unwrap_or(false)
1991}
1992
/// Metadata key holding the manifest's codec name (written by
/// `write_manifest`, parsed back by `extract_manifest`).
const META_CODEC: &str = "s4-codec";
/// Metadata key holding the manifest's `original_size`.
const META_ORIGINAL_SIZE: &str = "s4-original-size";
/// Metadata key holding the manifest's `compressed_size`.
const META_COMPRESSED_SIZE: &str = "s4-compressed-size";
/// Metadata key holding the manifest's `crc32c` value.
const META_CRC32C: &str = "s4-crc32c";
/// Marks an object that was written through multipart upload using the
/// per-part frame format. GET checks this flag to decide whether to run
/// the frame parser.
const META_MULTIPART: &str = "s4-multipart";
/// v0.2 #4: marks a single-PUT object that was written in the S4F2 framed
/// format. Older v0.1 single-PUT objects hold raw compressed bytes and do
/// not carry this flag. GET checks it to route the object through the
/// framed path (the same `FrameIter` parse used for multipart).
const META_FRAMED: &str = "s4-framed";
2004
2005fn is_framed_v2_object(metadata: &Option<Metadata>) -> bool {
2006 metadata
2007 .as_ref()
2008 .and_then(|m| m.get(META_FRAMED))
2009 .map(|v| v == "true")
2010 .unwrap_or(false)
2011}
2012
2013/// v0.4 #21: detect SSE-S4 by the metadata flag we set on PUT.
2014fn is_sse_encrypted(metadata: &Option<Metadata>) -> bool {
2015 metadata
2016 .as_ref()
2017 .and_then(|m| m.get("s4-encrypted"))
2018 .map(|v| v == "aes-256-gcm")
2019 .unwrap_or(false)
2020}
2021
2022/// v0.5 #27: pull the three SSE-C headers off an input struct. The S3
2023/// contract is "all three or none" — partial sets are a 400.
2024///
2025/// Returns `Ok(None)` when no SSE-C headers were sent (server-managed or
2026/// no encryption), `Ok(Some(material))` on validated client key, and
2027/// `Err` for malformed or partial inputs.
2028fn extract_sse_c_material(
2029 algorithm: &Option<String>,
2030 key: &Option<String>,
2031 md5: &Option<String>,
2032) -> S3Result<Option<crate::sse::CustomerKeyMaterial>> {
2033 match (algorithm, key, md5) {
2034 (None, None, None) => Ok(None),
2035 (Some(a), Some(k), Some(m)) => crate::sse::parse_customer_key_headers(a, k, m)
2036 .map(Some)
2037 .map_err(sse_c_error_to_s3),
2038 _ => Err(S3Error::with_message(
2039 S3ErrorCode::InvalidRequest,
2040 "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
2041 )),
2042 }
2043}
2044
2045/// v0.5 #28: detect SSE-KMS request — `x-amz-server-side-encryption: aws:kms`.
2046/// Returns the key-id to wrap under, falling back to the gateway default.
2047fn extract_kms_key_id(
2048 sse: &Option<ServerSideEncryption>,
2049 sse_kms_key_id: &Option<String>,
2050 gateway_default: Option<&str>,
2051) -> Option<String> {
2052 let asks_for_kms = sse
2053 .as_ref()
2054 .map(|s| s.as_str() == ServerSideEncryption::AWS_KMS)
2055 .unwrap_or(false);
2056 if !asks_for_kms {
2057 return None;
2058 }
2059 sse_kms_key_id
2060 .clone()
2061 .or_else(|| gateway_default.map(str::to_owned))
2062}
2063
2064/// v0.5 #28: map kms module errors to AWS-shaped S3 error codes.
2065/// `KeyNotFound` is operator misconfig (400); `BackendUnavailable` is a
2066/// transient KMS outage (503). Other variants are 500 InternalError.
2067fn kms_error_to_s3(e: crate::kms::KmsError) -> S3Error {
2068 use crate::kms::KmsError as K;
2069 match e {
2070 K::KeyNotFound { key_id } => S3Error::with_message(
2071 S3ErrorCode::InvalidArgument,
2072 format!("KMS key not found: {key_id}"),
2073 ),
2074 K::BackendUnavailable { message } => S3Error::with_message(
2075 S3ErrorCode::ServiceUnavailable,
2076 format!("KMS backend unavailable: {message}"),
2077 ),
2078 other => S3Error::with_message(S3ErrorCode::InternalError, format!("KMS error: {other}")),
2079 }
2080}
2081
2082/// v0.5 #27: map sse module errors to AWS-shaped S3 error codes.
2083/// `WrongCustomerKey` → 403 AccessDenied (matches AWS behaviour);
2084/// `InvalidCustomerKey` / algorithm / required / unexpected → 400.
2085fn sse_c_error_to_s3(e: crate::sse::SseError) -> S3Error {
2086 use crate::sse::SseError as E;
2087 match e {
2088 E::WrongCustomerKey => S3Error::with_message(
2089 S3ErrorCode::AccessDenied,
2090 "SSE-C key does not match the key used at PUT time",
2091 ),
2092 E::InvalidCustomerKey { reason } => {
2093 S3Error::with_message(S3ErrorCode::InvalidArgument, format!("SSE-C: {reason}"))
2094 }
2095 E::CustomerKeyAlgorithmUnsupported { algo } => S3Error::with_message(
2096 S3ErrorCode::InvalidArgument,
2097 format!("SSE-C unsupported algorithm: {algo:?} (only AES256 is allowed)"),
2098 ),
2099 E::CustomerKeyRequired => S3Error::with_message(
2100 S3ErrorCode::InvalidRequest,
2101 "object is SSE-C encrypted; supply x-amz-server-side-encryption-customer-* headers",
2102 ),
2103 E::CustomerKeyUnexpected => S3Error::with_message(
2104 S3ErrorCode::InvalidRequest,
2105 "object is not SSE-C encrypted; do not send x-amz-server-side-encryption-customer-* headers",
2106 ),
2107 other => S3Error::with_message(S3ErrorCode::InternalError, format!("SSE error: {other}")),
2108 }
2109}
2110
2111fn extract_manifest(metadata: &Option<Metadata>) -> Option<ChunkManifest> {
2112 let m = metadata.as_ref()?;
2113 let codec = m
2114 .get(META_CODEC)
2115 .and_then(|s| s.parse::<CodecKind>().ok())?;
2116 let original_size = m.get(META_ORIGINAL_SIZE)?.parse().ok()?;
2117 let compressed_size = m.get(META_COMPRESSED_SIZE)?.parse().ok()?;
2118 let crc32c = m.get(META_CRC32C)?.parse().ok()?;
2119 Some(ChunkManifest {
2120 codec,
2121 original_size,
2122 compressed_size,
2123 crc32c,
2124 })
2125}
2126
2127fn write_manifest(metadata: &mut Option<Metadata>, manifest: &ChunkManifest) {
2128 let meta = metadata.get_or_insert_with(Default::default);
2129 meta.insert(META_CODEC.into(), manifest.codec.as_str().into());
2130 meta.insert(
2131 META_ORIGINAL_SIZE.into(),
2132 manifest.original_size.to_string(),
2133 );
2134 meta.insert(
2135 META_COMPRESSED_SIZE.into(),
2136 manifest.compressed_size.to_string(),
2137 );
2138 meta.insert(META_CRC32C.into(), manifest.crc32c.to_string());
2139}
2140
2141fn internal<E: std::fmt::Display>(prefix: &'static str) -> impl FnOnce(E) -> S3Error {
2142 move |e| S3Error::with_message(S3ErrorCode::InternalError, format!("{prefix}: {e}"))
2143}
2144
2145/// v0.6 #41: map a `select::SelectError` to the S3 error surface. AWS
2146/// uses a domain-specific `InvalidSqlExpression` code for parse / unsupported
2147/// errors, but s3s 0.13 doesn't expose that as a typed variant — we
2148/// fall back to the well-known `InvalidRequest` 400 with a descriptive
2149/// message that includes the original error context.
2150fn select_error_to_s3(e: crate::select::SelectError, fmt: &str) -> S3Error {
2151 use crate::select::SelectError;
2152 match e {
2153 SelectError::Parse(msg) => S3Error::with_message(
2154 S3ErrorCode::InvalidRequest,
2155 format!("SQL parse error: {msg}"),
2156 ),
2157 SelectError::UnsupportedFeature(msg) => S3Error::with_message(
2158 S3ErrorCode::InvalidRequest,
2159 format!("unsupported SQL feature: {msg}"),
2160 ),
2161 SelectError::RowEval(msg) => S3Error::with_message(
2162 S3ErrorCode::InvalidRequest,
2163 format!("SQL row evaluation error: {msg}"),
2164 ),
2165 SelectError::InputFormat(msg) => S3Error::with_message(
2166 S3ErrorCode::InvalidRequest,
2167 format!("{fmt} input format error: {msg}"),
2168 ),
2169 }
2170}
2171
2172/// v0.5 #30: parse the `x-amz-bypass-governance-retention` header into a
2173/// boolean flag. AWS S3 accepts `true` (case-insensitive); any other value
2174/// (including missing) is treated as `false`.
2175fn parse_bypass_governance_header(headers: &http::HeaderMap) -> bool {
2176 headers
2177 .get("x-amz-bypass-governance-retention")
2178 .and_then(|v| v.to_str().ok())
2179 .map(|s| s.eq_ignore_ascii_case("true"))
2180 .unwrap_or(false)
2181}
2182
2183/// Convert s3s `Timestamp` into a `chrono::DateTime<Utc>` by formatting it
2184/// as an RFC3339 string and re-parsing through `chrono`. The string format
2185/// avoids pulling the `time` crate (transitive dep of s3s, not declared by
2186/// s4-server) into our direct deps. Returns `None` if the format/parse fails
2187/// or the value is outside `chrono`'s supported range.
2188fn timestamp_to_chrono_utc(ts: &Timestamp) -> Option<chrono::DateTime<chrono::Utc>> {
2189 let mut buf = Vec::new();
2190 ts.format(s3s::dto::TimestampFormat::DateTime, &mut buf)
2191 .ok()?;
2192 let s = std::str::from_utf8(&buf).ok()?;
2193 chrono::DateTime::parse_from_rfc3339(s)
2194 .ok()
2195 .map(|dt| dt.with_timezone(&chrono::Utc))
2196}
2197
2198/// Inverse of [`timestamp_to_chrono_utc`] — emit RFC3339 (the s3s
2199/// `DateTime` wire format) and re-parse via `Timestamp::parse`.
2200fn chrono_utc_to_timestamp(dt: chrono::DateTime<chrono::Utc>) -> Timestamp {
2201 // chrono's RFC3339 output format matches s3s' parser ("...Z" with
2202 // optional sub-second precision). Fall back to UNIX_EPOCH if anything
2203 // unexpected happens — we never produce malformed strings, so this
2204 // branch is unreachable in practice.
2205 let s = dt.to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
2206 Timestamp::parse(s3s::dto::TimestampFormat::DateTime, &s).unwrap_or_default()
2207}
2208
2209/// v0.6 #39: convert our internal [`crate::tagging::TagSet`] into the
2210/// s3s `Vec<Tag>` wire shape used on `GetObject/BucketTaggingOutput`.
2211/// Both halves of every pair land in the `Some(_)` slot — AWS marks
2212/// the field optional but always populates it on response.
2213fn tagset_to_aws(set: &crate::tagging::TagSet) -> Vec<Tag> {
2214 set.iter()
2215 .map(|(k, v)| Tag {
2216 key: Some(k.clone()),
2217 value: Some(v.clone()),
2218 })
2219 .collect()
2220}
2221
2222/// v0.6 #39: inverse of [`tagset_to_aws`] for input handlers. Missing
2223/// keys / values become empty strings (mirrors AWS, which rejects
2224/// `<Key/>` with InvalidTag at the parser layer; downstream
2225/// `TagSet::validate` then enforces our size limits).
2226fn aws_to_tagset(tags: &[Tag]) -> Result<crate::tagging::TagSet, crate::tagging::TagError> {
2227 let pairs = tags
2228 .iter()
2229 .map(|t| {
2230 (
2231 t.key.clone().unwrap_or_default(),
2232 t.value.clone().unwrap_or_default(),
2233 )
2234 })
2235 .collect();
2236 crate::tagging::TagSet::from_pairs(pairs)
2237}
2238
2239/// `Range` request を decompressed object サイズ `total` に適用して `(start, end_exclusive)`
2240/// を返す。`Range::Int { first, last }` は `bytes=first-last` (last は inclusive)、
2241/// `Range::Suffix { length }` は末尾 `length` byte。S3 仕様に準拠。
2242pub fn resolve_range(range: &s3s::dto::Range, total: u64) -> Result<(u64, u64), String> {
2243 if total == 0 {
2244 return Err("cannot range-get zero-length object".into());
2245 }
2246 match range {
2247 s3s::dto::Range::Int { first, last } => {
2248 let start = *first;
2249 let end_inclusive = match last {
2250 Some(l) => (*l).min(total - 1),
2251 None => total - 1,
2252 };
2253 if start > end_inclusive || start >= total {
2254 return Err(format!(
2255 "range bytes={start}-{:?} out of object size {total}",
2256 last
2257 ));
2258 }
2259 Ok((start, end_inclusive + 1))
2260 }
2261 s3s::dto::Range::Suffix { length } => {
2262 let len = (*length).min(total);
2263 Ok((total - len, total))
2264 }
2265 }
2266}
2267
2268#[async_trait::async_trait]
2269impl<B: S3> S3 for S4Service<B> {
2270 // === 圧縮を挟む path (PUT) ===
2271 #[tracing::instrument(
2272 name = "s4.put_object",
2273 skip(self, req),
2274 fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_in, bytes_out, latency_ms)
2275 )]
2276 async fn put_object(
2277 &self,
2278 mut req: S3Request<PutObjectInput>,
2279 ) -> S3Result<S3Response<PutObjectOutput>> {
2280 let put_start = Instant::now();
2281 let put_bucket = req.input.bucket.clone();
2282 let put_key = req.input.key.clone();
2283 let access_preamble = self.access_log_preamble(&req);
2284 self.enforce_rate_limit(&req, &put_bucket)?;
2285 // v0.6 #39: parse `x-amz-tagging` (URL-encoded query string) so
2286 // the IAM policy gate sees the request's tags via
2287 // `s3:RequestObjectTag/<key>`. `existing_object_tags` is also
2288 // resolved from the Tagging manager (when wired) so
2289 // `s3:ExistingObjectTag/<key>` works on overwrite.
2290 let request_tags: Option<crate::tagging::TagSet> = req
2291 .input
2292 .tagging
2293 .as_deref()
2294 .map(crate::tagging::parse_tagging_header)
2295 .transpose()
2296 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
2297 let existing_tags: Option<crate::tagging::TagSet> = self
2298 .tagging
2299 .as_ref()
2300 .and_then(|m| m.get_object_tags(&put_bucket, &put_key));
2301 self.enforce_policy_with_extra(
2302 &req,
2303 "s3:PutObject",
2304 &put_bucket,
2305 Some(&put_key),
2306 request_tags.as_ref(),
2307 existing_tags.as_ref(),
2308 )?;
2309 // v0.5 #30: an Object Lock-protected key cannot be overwritten by
2310 // a non-versioned PUT (Suspended / Unversioned bucket). Enabled
2311 // bucket PUTs are exempt because they materialise a fresh
2312 // version under a shadow key (`<key>.__s4ver__/<vid>`) — the
2313 // locked version's bytes are untouched. The check mirrors the
2314 // delete path (Compliance never bypassable, Governance via the
2315 // bypass header, legal hold never).
2316 if let Some(mgr) = self.object_lock.as_ref()
2317 && let Some(state) = mgr.get(&put_bucket, &put_key)
2318 {
2319 let bucket_versioned_enabled = self
2320 .versioning
2321 .as_ref()
2322 .map(|v| v.state(&put_bucket) == crate::versioning::VersioningState::Enabled)
2323 .unwrap_or(false);
2324 if !bucket_versioned_enabled {
2325 let bypass = parse_bypass_governance_header(&req.headers);
2326 let now = chrono::Utc::now();
2327 if !state.can_delete(now, bypass) {
2328 crate::metrics::record_policy_denial("s3:PutObject", &put_bucket);
2329 return Err(S3Error::with_message(
2330 S3ErrorCode::AccessDenied,
2331 "Access Denied because object protected by object lock",
2332 ));
2333 }
2334 }
2335 }
2336 // v0.5 #30: per-PUT explicit retention / legal hold (S3
2337 // `x-amz-object-lock-mode`, `x-amz-object-lock-retain-until-date`,
2338 // `x-amz-object-lock-legal-hold`). Captured before the body
2339 // moves into the backend; persisted into the manager only on
2340 // backend success below.
2341 let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
2342 .input
2343 .object_lock_mode
2344 .as_ref()
2345 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
2346 let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
2347 .input
2348 .object_lock_retain_until_date
2349 .as_ref()
2350 .and_then(timestamp_to_chrono_utc);
2351 let explicit_legal_hold_on: Option<bool> = req
2352 .input
2353 .object_lock_legal_hold_status
2354 .as_ref()
2355 .map(|s| s.as_str().eq_ignore_ascii_case("ON"));
2356 if let Some(blob) = req.input.body.take() {
2357 // Sample 4 KiB から codec を決定。streaming-aware codec なら streaming
2358 // compress fast path、そうでなければ従来の collect-then-compress。
2359 let (sample, rest_stream) = peek_sample(blob, SAMPLE_BYTES)
2360 .await
2361 .map_err(internal("peek put sample"))?;
2362 let sample_len = sample.len().min(SAMPLE_BYTES);
2363 // v0.8 #56: pass the request's Content-Length (when present) so
2364 // the sampling dispatcher can promote large objects to a GPU
2365 // codec. Chunked transfers (no Content-Length) keep CPU.
2366 let total_size_hint = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2367 let kind = self
2368 .dispatcher
2369 .pick_with_size_hint(&sample[..sample_len], total_size_hint)
2370 .await;
2371
2372 // Passthrough buys nothing from S4F2 wrapping (no compression =
2373 // no per-chunk frame to skip past) and the +28-byte header
2374 // overhead breaks size-sensitive callers that expect a true
2375 // pass-through. So passthrough always uses the legacy raw-blob
2376 // path; only compressing codecs go through the framed path.
2377 //
2378 // v0.8.14 follow-up to #127 MED-B: the previous attempt
2379 // forced the buffered path whenever the client supplied
2380 // any whole-body checksum so `verify_client_body_checksums`
2381 // could run. Modern AWS SDKs auto-add an
2382 // `x-amz-checksum-crc32` trailer by default, which made
2383 // every SDK PUT lose the streaming-framed path and
2384 // therefore lose its sidecar — silent data path
2385 // regression caught by
2386 // `range_get_falls_back_to_full_when_sidecar_etag_stale`
2387 // and `upload_part_copy_propagates_source_version_id`
2388 // on the MinIO E2E job. The streaming PUT path now
2389 // passes through unchanged; client-supplied checksums on
2390 // streaming PUTs are NOT verified (same fail-open as
2391 // pre-v0.8.12). The buffered PUT branch and UploadPart
2392 // do verify, which covers the buffered upload case the
2393 // HIGH-12 audit was scoped to. True streaming verify
2394 // (tee-into-hasher on the chained input) remains the
2395 // tracked follow-up.
2396 let use_framed = supports_streaming_compress(kind) && kind != CodecKind::Passthrough;
2397 let (compressed, manifest, is_framed) = if use_framed {
2398 // streaming fast path: input は memory に collect しない
2399 let chained = chain_sample_with_rest(sample, rest_stream);
2400 debug!(
2401 bucket = ?req.input.bucket,
2402 key = ?req.input.key,
2403 codec = kind.as_str(),
2404 path = "streaming-framed",
2405 "S4 put_object: compressing (streaming, S4F2 multi-frame)"
2406 );
2407 // v0.4 #16: pick the chunk size based on the request's
2408 // Content-Length when known, falling back to the 4 MiB
2409 // default for chunked transfers.
2410 let chunk_size = pick_chunk_size(req.input.content_length.map(|n| n as u64));
2411 // v0.8.4 #73 M2: pass the request's Content-Length so
2412 // streaming_compress_to_frames can fail-fast on a mid-PUT
2413 // truncation (client disconnect after sending half the
2414 // body). `None` is the chunked-Transfer-Encoding case
2415 // where the upstream genuinely doesn't know the size and
2416 // the backend's framing layer is the only truncation
2417 // signal we have.
2418 let expected_input_size =
2419 req.input.content_length.and_then(|n| u64::try_from(n).ok());
2420 let (body, manifest) = streaming_compress_to_frames(
2421 chained,
2422 Arc::clone(&self.registry),
2423 kind,
2424 chunk_size,
2425 expected_input_size,
2426 )
2427 .await
2428 .map_err(|e| match e {
2429 s4_codec::CodecError::TruncatedStream { expected, got } => {
2430 // 400 IncompleteBody: client advertised N bytes
2431 // but disconnected after `got`. Mirrors AWS S3's
2432 // canonical error code for the same shape so SDK
2433 // retries kick in instead of treating the PUT as
2434 // a successful upload of a half-body.
2435 S3Error::with_message(
2436 S3ErrorCode::IncompleteBody,
2437 format!("PUT body truncated: expected {expected} bytes, got {got}"),
2438 )
2439 }
2440 other => internal("streaming framed compress")(other),
2441 })?;
2442 (body, manifest, true)
2443 } else {
2444 // GPU codec 等で streaming-aware でないものは bytes-buffered path
2445 // (raw 圧縮 bytes、framed なし — back-compat 互換 path)
2446 let bytes = collect_with_sample(sample, rest_stream, self.max_body_bytes)
2447 .await
2448 .map_err(internal("collect put body (buffered path)"))?;
2449 // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
2450 // checksum algorithms against the received body on
2451 // the buffered path. The streaming-framed branch
2452 // above redirects here when ANY checksum header is
2453 // present (#127 MED-B), so this is the single
2454 // checkpoint for client-supplied integrity.
2455 verify_client_body_checksums(
2456 &bytes,
2457 req.input.content_md5.as_deref(),
2458 req.input.checksum_crc32.as_deref(),
2459 req.input.checksum_crc32c.as_deref(),
2460 req.input.checksum_sha1.as_deref(),
2461 req.input.checksum_sha256.as_deref(),
2462 req.input.checksum_crc64nvme.as_deref(),
2463 )?;
2464 debug!(
2465 bucket = ?req.input.bucket,
2466 key = ?req.input.key,
2467 bytes = bytes.len(),
2468 codec = kind.as_str(),
2469 path = "buffered",
2470 "S4 put_object: compressing (buffered, raw blob)"
2471 );
2472 // v0.8 #55: telemetry-returning compress so we can stamp
2473 // GPU-pipeline Prometheus metrics (`s4_gpu_compress_seconds`,
2474 // throughput gauge, OOM counter) for nvcomp / dietgpu codecs.
2475 // CPU codecs come back with `gpu_seconds = None` and the
2476 // stamp helper short-circuits — no extra cost on CPU path.
2477 let (compress_res, tel) = self.registry.compress_with_telemetry(bytes, kind).await;
2478 stamp_gpu_compress_telemetry(&tel);
2479 let (body, m) = compress_res.map_err(internal("registry compress"))?;
2480 (body, m, false)
2481 };
2482
2483 write_manifest(&mut req.input.metadata, &manifest);
2484 if is_framed {
2485 // v0.2 #4: framed body であることを GET 側に伝える meta flag。
2486 req.input
2487 .metadata
2488 .get_or_insert_with(Default::default)
2489 .insert(META_FRAMED.into(), "true".into());
2490 }
2491 // 重要: content_length を圧縮後サイズで更新する。
2492 // これを忘れると下流 (aws-sdk-s3 → S3) が宣言サイズ分の bytes を
2493 // 待ち続けて RequestTimeout で失敗する (S3 仕様)。
2494 req.input.content_length = Some(compressed.len() as i64);
2495 // body を書き換えたので、客側が送ってきた original body 用の
2496 // checksum / MD5 ヘッダは無効化する (そのまま転送すると下流 S3 が
2497 // XAmzContentChecksumMismatch を返す)。S4 自身の整合性は
2498 // ChunkManifest.crc32c で担保している。
2499 req.input.checksum_algorithm = None;
2500 req.input.checksum_crc32 = None;
2501 req.input.checksum_crc32c = None;
2502 req.input.checksum_crc64nvme = None;
2503 req.input.checksum_sha1 = None;
2504 req.input.checksum_sha256 = None;
2505 req.input.content_md5 = None;
2506 let original_size = manifest.original_size;
2507 let compressed_size = manifest.compressed_size;
2508 let codec_label = manifest.codec.as_str();
2509 // (sidecar_index is built below, after the SSE-mode
2510 // extraction, so v0.8.12 HIGH-10 can short-circuit the
2511 // build when the on-disk bytes are about to be encrypted.)
2512 // v0.4 #21 / v0.5 #29 / v0.5 #27: encrypt-after-compress.
2513 // Precedence:
2514 // - SSE-C headers present → per-request customer key (S4E3)
2515 // - server-managed keyring configured → active key (S4E2)
2516 // - neither → no encryption (raw compressed body)
2517 // The `s4-encrypted: aes-256-gcm` metadata flag is set in
2518 // both encrypted modes; the on-disk frame magic distinguishes
2519 // S4E1 / S4E2 / S4E3 so GET picks the right decrypt path.
2520 // v0.7 #48 BUG-2/3 fix: take() the SSE fields off req.input
2521 // so the encryption headers are NOT forwarded to the
2522 // backend. S4 owns the encrypt-then-store contract; if we
2523 // leave the headers in place, real S3-compat backends
2524 // (MinIO / AWS) try to apply their own SSE on top and
2525 // either reject (MinIO requires HTTPS for SSE-C) or fail
2526 // (MinIO has no KMS configured). MemoryBackend ignored
2527 // these so mock tests passed.
2528 let sse_c_alg = req.input.sse_customer_algorithm.take();
2529 let sse_c_key = req.input.sse_customer_key.take();
2530 let sse_c_md5 = req.input.sse_customer_key_md5.take();
2531 let sse_header = req.input.server_side_encryption.take();
2532 let sse_kms_key = req.input.ssekms_key_id.take();
2533 let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
2534 // v0.5 #28: SSE-KMS request? Resolves to None unless the
2535 // request asks for `aws:kms` AND a key id is available
2536 // (explicit header or gateway default). When set, we'll
2537 // generate a per-object DEK below.
2538 let kms_key_id = extract_kms_key_id(
2539 &sse_header,
2540 &sse_kms_key,
2541 self.kms_default_key_id.as_deref(),
2542 );
2543 // v0.8.12 HIGH-10 fix: the sidecar offsets describe the
2544 // pre-encrypt `compressed` body, but the bytes the
2545 // backend stores when any SSE mode is active are
2546 // *post-encrypt* (different length, different layout).
2547 // A Range GET on an SSE-encrypted object would slice the
2548 // ciphertext at the stale offsets, hand the wrong bytes
2549 // to the frame parser, and 500. Suppress the sidecar
2550 // entirely when SSE is going to be applied below;
2551 // encrypted-object Range GET falls back to the buffered
2552 // path (decrypt full body → frame parse → slice), trading
2553 // partial-fetch performance for correctness. An
2554 // encryption-aware sidecar format is a follow-up issue.
2555 let will_encrypt =
2556 sse_c_material.is_some() || kms_key_id.is_some() || self.sse_keyring.is_some();
2557 let sidecar_index = if is_framed && !will_encrypt {
2558 s4_codec::index::build_index_from_body(&compressed).ok()
2559 } else {
2560 None
2561 };
2562 // v0.5 #32: in compliance-strict mode, every PUT must
2563 // declare SSE — either client-supplied (SSE-C), KMS, or by
2564 // virtue of a server-side keyring being configured (which
2565 // applies SSE-S4 to every PUT automatically). Requests that
2566 // would otherwise land as plain compressed bytes are
2567 // rejected with 400 InvalidRequest.
2568 if self.compliance_strict
2569 && sse_c_material.is_none()
2570 && kms_key_id.is_none()
2571 && self.sse_keyring.is_none()
2572 && sse_header.as_ref().map(|s| s.as_str()) != Some(ServerSideEncryption::AES256)
2573 {
2574 return Err(S3Error::with_message(
2575 S3ErrorCode::InvalidRequest,
2576 "compliance-mode strict: PUT must include x-amz-server-side-encryption \
2577 (AES256 or aws:kms) or x-amz-server-side-encryption-customer-* headers",
2578 ));
2579 }
2580 // SSE-C and SSE-KMS are mutually exclusive on a single PUT
2581 // (AWS S3 returns 400 InvalidArgument). SSE-C wins by spec.
2582 if sse_c_material.is_some() && kms_key_id.is_some() {
2583 return Err(S3Error::with_message(
2584 S3ErrorCode::InvalidArgument,
2585 "SSE-C and SSE-KMS cannot be used together on the same PUT",
2586 ));
2587 }
2588 // KMS path needs to call generate_dek().await before the
2589 // body_to_send branch; capture the result here.
2590 //
2591 // v0.8.1 #58: the plaintext DEK lives in three places
2592 // during one PUT:
2593 //
2594 // 1. The `Zeroizing<Vec<u8>>` returned by `generate_dek`
2595 // — wiped when the binding `dek` falls out of scope at
2596 // the end of this `if`-arm.
2597 // 2. The stack `[u8; 32]` we copy into for `SseSource::Kms`
2598 // — wrapped in `Zeroizing<[u8; 32]>` so it's wiped when
2599 // the outer `kms_wrap` `Option` is dropped at the end
2600 // of `put_object`.
2601 // 3. AES-GCM internal key state inside the `aes-gcm`
2602 // crate during `encrypt_with_source` — out of scope
2603 // for this fix; tracked separately in v0.8.2.
2604 let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
2605 if let Some(ref key_id) = kms_key_id {
2606 let kms = self.kms.as_ref().ok_or_else(|| {
2607 S3Error::with_message(
2608 S3ErrorCode::InvalidRequest,
2609 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
2610 )
2611 })?;
2612 // `dek` is `Zeroizing<Vec<u8>>`; deref + slice access
2613 // works unchanged via `Deref<Target=Vec<u8>>`.
2614 let (dek, wrapped) = kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
2615 if dek.len() != 32 {
2616 return Err(S3Error::with_message(
2617 S3ErrorCode::InternalError,
2618 format!(
2619 "KMS backend returned a DEK of {} bytes (expected 32)",
2620 dek.len()
2621 ),
2622 ));
2623 }
2624 let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
2625 zeroize::Zeroizing::new([0u8; 32]);
2626 dek_arr.copy_from_slice(&dek);
2627 // `dek` (the `Zeroizing<Vec<u8>>`) is dropped at the
2628 // end of this scope, wiping the heap allocation.
2629 Some((dek_arr, wrapped))
2630 } else {
2631 None
2632 };
2633 // v0.7 #48 BUG-4 fix: stamp the SSE *type* into metadata
2634 // alongside `s4-encrypted` so HEAD (which doesn't fetch the
2635 // body) can echo the correct `x-amz-server-side-encryption`
2636 // value. Without this, HEAD on an SSE-KMS object would not
2637 // echo `aws:kms` because the frame magic is only available
2638 // on the body (which HEAD doesn't read).
2639 let body_to_send = if let Some(ref m) = sse_c_material {
2640 let meta = req.input.metadata.get_or_insert_with(Default::default);
2641 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2642 meta.insert("s4-sse-type".into(), "AES256".into());
2643 meta.insert(
2644 "s4-sse-c-key-md5".into(),
2645 base64::engine::general_purpose::STANDARD.encode(m.key_md5),
2646 );
2647 crate::sse::encrypt_with_source(
2648 &compressed,
2649 crate::sse::SseSource::CustomerKey {
2650 key: &m.key,
2651 key_md5: &m.key_md5,
2652 },
2653 )
2654 } else if let Some((ref dek, ref wrapped)) = kms_wrap {
2655 let meta = req.input.metadata.get_or_insert_with(Default::default);
2656 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2657 meta.insert("s4-sse-type".into(), "aws:kms".into());
2658 meta.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
2659 // v0.8.1 #58: `dek` is `&Zeroizing<[u8; 32]>`; `SseSource::Kms`
2660 // wants `&[u8; 32]`. Rust auto-derefs `&Zeroizing<T>` to
2661 // `&T` here via `Deref<Target=T>`, so the binding picks
2662 // up the inner array reference without copying. The array
2663 // stays in the `Zeroizing` wrapper that owns it and gets
2664 // wiped when `kms_wrap` drops at the end of `put_object`.
2665 let dek_ref: &[u8; 32] = dek;
2666 crate::sse::encrypt_with_source(
2667 &compressed,
2668 crate::sse::SseSource::Kms {
2669 dek: dek_ref,
2670 wrapped,
2671 },
2672 )
2673 } else if let Some(keyring) = self.sse_keyring.as_ref() {
2674 // SSE-S4 is server-driven transparent encryption; the
2675 // client didn't ask for SSE. We stamp `s4-encrypted`
2676 // (internal flag the GET path needs) but deliberately
2677 // do NOT stamp `s4-sse-type` — that lights up the HEAD
2678 // echo of `x-amz-server-side-encryption: AES256`,
2679 // which would falsely advertise AWS-style SSE-S3
2680 // semantics the operator didn't request.
2681 let meta = req.input.metadata.get_or_insert_with(Default::default);
2682 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2683 // v0.8 #52: when `--sse-chunk-size > 0` is configured,
2684 // emit the chunked S4E5 frame so the matching GET can
2685 // stream-decrypt instead of buffering 5 GiB before
2686 // emitting a byte. Falls back to the buffered S4E2
2687 // frame at chunk_size=0 (default) so existing
2688 // deployments are bit-for-bit unchanged.
2689 if self.sse_chunk_size > 0 {
2690 crate::sse::encrypt_v2_chunked(&compressed, keyring, self.sse_chunk_size)
2691 .map_err(|e| {
2692 S3Error::with_message(
2693 S3ErrorCode::InternalError,
2694 format!("SSE-S4 chunked encrypt failed: {e}"),
2695 )
2696 })?
2697 } else {
2698 crate::sse::encrypt_v2(&compressed, keyring)
2699 }
2700 } else {
2701 compressed.clone()
2702 };
2703 // v0.6 #40: capture the about-to-be-sent body + metadata so
2704 // the replication dispatcher (run after the source PUT
2705 // succeeds) can hand the same backend bytes to the
2706 // destination bucket. `Bytes` clone is cheap (refcounted).
2707 let replication_body = body_to_send.clone();
2708 let replication_metadata = req.input.metadata.clone();
2709 // v0.7 #48 BUG-1 fix: SSE encryption (S4E1/E2/E3/E4 frames)
2710 // makes the body longer than the post-compression bytes
2711 // (header + nonce + tag overhead). The earlier
2712 // content_length stamp at compressed.len() is now stale, so
2713 // re-stamp from the actual bytes about to be sent or the
2714 // backend (real S3 / MinIO) rejects with
2715 // `StreamLengthMismatch`. MemoryBackend never validated
2716 // this, which is why mock-only tests passed.
2717 req.input.content_length = Some(body_to_send.len() as i64);
2718 req.input.body = Some(bytes_to_blob(body_to_send));
2719 // v0.5 #34: pre-allocate a version-id when the bucket is
2720 // Enabled, then redirect the backend storage key to the
2721 // shadow path so older versions survive newer PUTs.
2722 // Suspended / Unversioned buckets keep using the plain
2723 // `<key>` (S3 spec: Suspended overwrites the same backend
2724 // object). Pre-allocation (instead of recording after PUT)
2725 // ensures the shadow key + the response's
2726 // `x-amz-version-id` use the same vid.
2727 let pending_version: Option<crate::versioning::PutOutcome> = self
2728 .versioning
2729 .as_ref()
2730 .map(|mgr| mgr.state(&put_bucket))
2731 .map(|state| match state {
2732 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
2733 version_id: crate::versioning::VersioningManager::new_version_id(),
2734 versioned_response: true,
2735 },
2736 crate::versioning::VersioningState::Suspended
2737 | crate::versioning::VersioningState::Unversioned => {
2738 crate::versioning::PutOutcome {
2739 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
2740 versioned_response: false,
2741 }
2742 }
2743 });
2744 if let Some(ref pv) = pending_version
2745 && pv.versioned_response
2746 {
2747 req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
2748 }
2749 // v0.8.4 #73 H-2: capture the to-be-stored body length BEFORE
2750 // the move into `req.input` is consumed by the backend call.
2751 // The sidecar's `source_compressed_size` is checked against
2752 // the live HEAD `Content-Length` on Range GET to detect a
2753 // backend-side mutation.
2754 let backend_object_size = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2755 let mut backend_resp = self.backend.put_object(req).await;
2756 if let Some(mut idx) = sidecar_index
2757 && let Ok(ref resp) = backend_resp
2758 && idx.entries.len() > 1
2759 {
2760 // 1 chunk しかない (small object) なら sidecar は意味がない (=
2761 // partial fetch しても full body と同じ範囲) ので省略。
2762 // Sidecar は user-visible key で書く (latest version の
2763 // partial fetch path 用)。Old versions の Range GET は今 task
2764 // の scope 外 (full read fallback でも意味的には正しい)。
2765 //
2766 // v0.8.4 #73 H-2: stamp the version-binding fields the
2767 // GET path needs to detect a stale / attacker-written
2768 // sidecar. ETag comes from the backend's PUT response —
2769 // when missing (some backends don't return an ETag) we
2770 // synthesize a CRC-derived stable identifier so the
2771 // sidecar still binds to *something*; the GET HEAD will
2772 // see the same backend ETag (None vs None) and treat the
2773 // pair as consistent.
2774 let source_etag = resp.output.e_tag.as_ref().map(|t| t.value().to_string());
2775 idx.source_etag = source_etag;
2776 idx.source_compressed_size = backend_object_size;
2777 self.write_sidecar(&put_bucket, &put_key, &idx).await;
2778 }
2779 // v0.5 #34: commit the new version into the manager only on
2780 // backend success. Use the pre-allocated vid so the response
2781 // header and the chain entry agree.
2782 if let (Some(mgr), Some(pv), Ok(resp)) = (
2783 self.versioning.as_ref(),
2784 pending_version.as_ref(),
2785 backend_resp.as_mut(),
2786 ) {
2787 let etag = resp
2788 .output
2789 .e_tag
2790 .clone()
2791 .map(ETag::into_value)
2792 .unwrap_or_else(|| format!("\"crc32c-{}\"", manifest.crc32c));
2793 let now = chrono::Utc::now();
2794 mgr.commit_put_with_version(
2795 &put_bucket,
2796 &put_key,
2797 crate::versioning::VersionEntry {
2798 version_id: pv.version_id.clone(),
2799 etag,
2800 size: original_size,
2801 is_delete_marker: false,
2802 created_at: now,
2803 },
2804 );
2805 if pv.versioned_response {
2806 resp.output.version_id = Some(pv.version_id.clone());
2807 }
2808 }
2809 // v0.5 #27: AWS S3 echoes the SSE-C headers back on success
2810 // so the client knows the server actually applied the
2811 // requested algorithm and which key fingerprint matched.
2812 if let (Some(m), Ok(resp)) = (sse_c_material.as_ref(), backend_resp.as_mut()) {
2813 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
2814 resp.output.sse_customer_key_md5 =
2815 Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
2816 }
2817 // v0.5 #28: SSE-KMS echo — `aws:kms` + the canonical key id
2818 // the backend returned (AWS KMS returns the ARN even when
2819 // the request used an alias).
2820 if let (Some((_, wrapped)), Ok(resp)) = (kms_wrap.as_ref(), backend_resp.as_mut()) {
2821 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
2822 ServerSideEncryption::AWS_KMS,
2823 ));
2824 resp.output.ssekms_key_id = Some(wrapped.key_id.clone());
2825 }
2826 // v0.5 #30: persist any per-PUT explicit retention / legal
2827 // hold the client supplied, then auto-apply the bucket
2828 // default (no-op when state is already populated). The
2829 // explicit fields take precedence — the bucket-default
2830 // helper bails out as soon as it sees any retention.
2831 if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
2832 if explicit_lock_mode.is_some()
2833 || explicit_retain_until.is_some()
2834 || explicit_legal_hold_on.is_some()
2835 {
2836 let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
2837 if let Some(m) = explicit_lock_mode {
2838 state.mode = Some(m);
2839 }
2840 if let Some(u) = explicit_retain_until {
2841 state.retain_until = Some(u);
2842 }
2843 if let Some(lh) = explicit_legal_hold_on {
2844 state.legal_hold_on = lh;
2845 }
2846 mgr.set(&put_bucket, &put_key, state);
2847 }
2848 mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
2849 }
2850 let _ = (original_size, compressed_size); // mute unused warnings
2851 let elapsed = put_start.elapsed();
2852 crate::metrics::record_put(
2853 codec_label,
2854 original_size,
2855 compressed_size,
2856 elapsed.as_secs_f64(),
2857 backend_resp.is_ok(),
2858 );
2859 // v0.4 #20: structured access-log entry (best-effort).
2860 self.record_access(
2861 access_preamble,
2862 "REST.PUT.OBJECT",
2863 &put_bucket,
2864 Some(&put_key),
2865 if backend_resp.is_ok() { 200 } else { 500 },
2866 compressed_size,
2867 original_size,
2868 elapsed.as_millis() as u64,
2869 backend_resp.as_ref().err().map(|e| e.code().as_str()),
2870 )
2871 .await;
2872 info!(
2873 op = "put_object",
2874 bucket = %put_bucket,
2875 key = %put_key,
2876 codec = codec_label,
2877 bytes_in = original_size,
2878 bytes_out = compressed_size,
2879 ratio = format!(
2880 "{:.3}",
2881 if original_size == 0 { 1.0 } else { compressed_size as f64 / original_size as f64 }
2882 ),
2883 latency_ms = elapsed.as_millis() as u64,
2884 ok = backend_resp.is_ok(),
2885 "S4 put completed"
2886 );
2887 // v0.6 #35: fire bucket-notification destinations (best-effort,
2888 // detached). Skipped when no manager is attached or when the
2889 // bucket has no rule matching `s3:ObjectCreated:Put` for this
2890 // key.
2891 if backend_resp.is_ok()
2892 && let Some(mgr) = self.notifications.as_ref()
2893 {
2894 let dests = mgr.match_destinations(
2895 &put_bucket,
2896 &crate::notifications::EventType::ObjectCreatedPut,
2897 &put_key,
2898 );
2899 if !dests.is_empty() {
2900 let etag = backend_resp
2901 .as_ref()
2902 .ok()
2903 .and_then(|r| r.output.e_tag.clone())
2904 .map(ETag::into_value);
2905 let version_id = pending_version
2906 .as_ref()
2907 .filter(|pv| pv.versioned_response)
2908 .map(|pv| pv.version_id.clone());
2909 tokio::spawn(crate::notifications::dispatch_event(
2910 Arc::clone(mgr),
2911 put_bucket.clone(),
2912 put_key.clone(),
2913 crate::notifications::EventType::ObjectCreatedPut,
2914 Some(original_size),
2915 etag,
2916 version_id,
2917 format!("S4-{}", uuid::Uuid::new_v4()),
2918 ));
2919 }
2920 }
2921 // v0.6 #39: persist parsed `x-amz-tagging` tags into the
2922 // tagging manager on a successful PUT. AWS PutObject's
2923 // tagging is a full-replace operation (not a merge), so
2924 // any pre-existing entry for `(bucket, key)` is overwritten.
2925 if backend_resp.is_ok()
2926 && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
2927 {
2928 mgr.put_object_tags(&put_bucket, &put_key, tags);
2929 }
2930 // v0.6 #40: cross-bucket replication fire-point. On
2931 // successful source PUT, consult the replication manager;
2932 // when an enabled rule matches, mark the source key
2933 // `Pending` and spawn a detached task that PUTs the same
2934 // backend bytes + metadata to the rule's destination
2935 // bucket. The dispatcher itself records `Completed` /
2936 // `Failed` and bumps the drop counter on retry-budget
2937 // exhaustion.
2938 self.spawn_replication_if_matched(
2939 &put_bucket,
2940 &put_key,
2941 &request_tags,
2942 &replication_body,
2943 &replication_metadata,
2944 backend_resp.is_ok(),
2945 pending_version.as_ref(),
2946 );
2947 return backend_resp;
2948 }
2949 // Body-less PUT (rare: zero-length object). Mirror the body-full
2950 // versioning hooks so list_object_versions / GET-by-version still see
2951 // empty-body objects in the chain.
2952 let pending_version: Option<crate::versioning::PutOutcome> = self
2953 .versioning
2954 .as_ref()
2955 .map(|mgr| mgr.state(&put_bucket))
2956 .map(|state| match state {
2957 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
2958 version_id: crate::versioning::VersioningManager::new_version_id(),
2959 versioned_response: true,
2960 },
2961 _ => crate::versioning::PutOutcome {
2962 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
2963 versioned_response: false,
2964 },
2965 });
2966 if let Some(ref pv) = pending_version
2967 && pv.versioned_response
2968 {
2969 req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
2970 }
2971 let mut backend_resp = self.backend.put_object(req).await;
2972 if let (Some(mgr), Some(pv), Ok(resp)) = (
2973 self.versioning.as_ref(),
2974 pending_version.as_ref(),
2975 backend_resp.as_mut(),
2976 ) {
2977 let etag = resp
2978 .output
2979 .e_tag
2980 .clone()
2981 .map(ETag::into_value)
2982 .unwrap_or_default();
2983 let now = chrono::Utc::now();
2984 mgr.commit_put_with_version(
2985 &put_bucket,
2986 &put_key,
2987 crate::versioning::VersionEntry {
2988 version_id: pv.version_id.clone(),
2989 etag,
2990 size: 0,
2991 is_delete_marker: false,
2992 created_at: now,
2993 },
2994 );
2995 if pv.versioned_response {
2996 resp.output.version_id = Some(pv.version_id.clone());
2997 }
2998 }
2999 // v0.5 #30: same explicit-then-default lock-state commit as the
3000 // body-bearing branch above, so a zero-length PUT also picks up
3001 // bucket-default retention.
3002 if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
3003 if explicit_lock_mode.is_some()
3004 || explicit_retain_until.is_some()
3005 || explicit_legal_hold_on.is_some()
3006 {
3007 let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
3008 if let Some(m) = explicit_lock_mode {
3009 state.mode = Some(m);
3010 }
3011 if let Some(u) = explicit_retain_until {
3012 state.retain_until = Some(u);
3013 }
3014 if let Some(lh) = explicit_legal_hold_on {
3015 state.legal_hold_on = lh;
3016 }
3017 mgr.set(&put_bucket, &put_key, state);
3018 }
3019 mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
3020 }
3021 // v0.6 #35: same notification fire-point as the body-bearing PUT
3022 // branch above (zero-length objects still match `ObjectCreated:Put`
3023 // rules per the AWS event taxonomy).
3024 if backend_resp.is_ok()
3025 && let Some(mgr) = self.notifications.as_ref()
3026 {
3027 let dests = mgr.match_destinations(
3028 &put_bucket,
3029 &crate::notifications::EventType::ObjectCreatedPut,
3030 &put_key,
3031 );
3032 if !dests.is_empty() {
3033 let etag = backend_resp
3034 .as_ref()
3035 .ok()
3036 .and_then(|r| r.output.e_tag.clone())
3037 .map(ETag::into_value);
3038 let version_id = pending_version
3039 .as_ref()
3040 .filter(|pv| pv.versioned_response)
3041 .map(|pv| pv.version_id.clone());
3042 tokio::spawn(crate::notifications::dispatch_event(
3043 Arc::clone(mgr),
3044 put_bucket.clone(),
3045 put_key.clone(),
3046 crate::notifications::EventType::ObjectCreatedPut,
3047 Some(0),
3048 etag,
3049 version_id,
3050 format!("S4-{}", uuid::Uuid::new_v4()),
3051 ));
3052 }
3053 }
3054 // v0.6 #39: persist parsed `x-amz-tagging` for the body-less
3055 // (zero-length) PUT branch too — same shape as the body-bearing
3056 // branch above.
3057 if backend_resp.is_ok()
3058 && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3059 {
3060 mgr.put_object_tags(&put_bucket, &put_key, tags);
3061 }
3062 // v0.6 #40: cross-bucket replication for the zero-length PUT
3063 // branch — same shape as the body-bearing branch above.
3064 // v0.8.2 #61: pass `pending_version` so a versioned source's
3065 // destination receives the same shadow-key path.
3066 self.spawn_replication_if_matched(
3067 &put_bucket,
3068 &put_key,
3069 &request_tags,
3070 &bytes::Bytes::new(),
3071 &None,
3072 backend_resp.is_ok(),
3073 pending_version.as_ref(),
3074 );
3075 backend_resp
3076 }
3077
3078 // === 圧縮を解く path (GET) ===
3079 #[tracing::instrument(
3080 name = "s4.get_object",
3081 skip(self, req),
3082 fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_out, range, path)
3083 )]
3084 async fn get_object(
3085 &self,
3086 mut req: S3Request<GetObjectInput>,
3087 ) -> S3Result<S3Response<GetObjectOutput>> {
3088 let get_start = Instant::now();
3089 let get_bucket = req.input.bucket.clone();
3090 let get_key = req.input.key.clone();
3091 self.enforce_rate_limit(&req, &get_bucket)?;
3092 self.enforce_policy(&req, "s3:GetObject", &get_bucket, Some(&get_key))?;
3093 // Range request の事前検出 (decompress 後 slice する path に使う)。
3094 let range_request = req.input.range.take();
3095 // v0.5 #27: pull SSE-C material from the input headers before
3096 // the request is moved into the backend. A header parse error
3097 // fails fast (no body fetch). The material is consumed below
3098 // when decrypting an S4E3-framed body; the SSE-C headers on
3099 // `req.input` are cleared so the backend doesn't see them.
3100 let sse_c_alg = req.input.sse_customer_algorithm.take();
3101 let sse_c_key = req.input.sse_customer_key.take();
3102 let sse_c_md5 = req.input.sse_customer_key_md5.take();
3103 let get_sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
3104
3105 // v0.5 #34: route the GET through the VersioningManager when
3106 // attached AND the bucket is in a versioning-aware state.
3107 // Resolves which version to fetch (explicit `?versionId=` query
3108 // param vs. chain latest), translates a delete-marker into 404
3109 // NoSuchKey, and rewrites the backend storage key to the shadow
3110 // path (`<key>.__s4ver__/<vid>`) for non-null Enabled-bucket
3111 // versions. `resolved_version_id` is stamped onto the response
3112 // so clients see a coherent `x-amz-version-id` header.
3113 //
3114 // When the bucket is Unversioned (or no manager attached), the
3115 // chain-resolution step is skipped and the request flows
3116 // through the existing single-key path unchanged.
3117 let resolved_version_id: Option<String> = match self.versioning.as_ref() {
3118 Some(mgr)
3119 if mgr.state(&get_bucket) != crate::versioning::VersioningState::Unversioned =>
3120 {
3121 let req_vid = req.input.version_id.take();
3122 let entry = match req_vid.as_deref() {
3123 Some(vid) => {
3124 mgr.lookup_version(&get_bucket, &get_key, vid)
3125 .ok_or_else(|| {
3126 S3Error::with_message(
3127 S3ErrorCode::NoSuchVersion,
3128 format!("no such version: {vid}"),
3129 )
3130 })?
3131 }
3132 None => mgr.lookup_latest(&get_bucket, &get_key).ok_or_else(|| {
3133 S3Error::with_message(
3134 S3ErrorCode::NoSuchKey,
3135 format!("no such key: {get_key}"),
3136 )
3137 })?,
3138 };
3139 if entry.is_delete_marker {
3140 // S3 spec: GET without versionId on a
3141 // delete-marker latest → 404 NoSuchKey + the
3142 // response carries `x-amz-delete-marker: true`.
3143 // GET with explicit versionId pointing at a delete
3144 // marker → 405 MethodNotAllowed; we surface
3145 // NoSuchKey here for both since s3s collapses them
3146 // into the same not-found error path.
3147 return Err(S3Error::with_message(
3148 S3ErrorCode::NoSuchKey,
3149 format!("delete marker is the current version of {get_key}"),
3150 ));
3151 }
3152 if entry.version_id != crate::versioning::NULL_VERSION_ID {
3153 req.input.key = versioned_shadow_key(&get_key, &entry.version_id);
3154 }
3155 Some(entry.version_id)
3156 }
3157 _ => None,
3158 };
3159
3160 // ====== Range GET の partial-fetch fast path (sidecar index 利用) ======
3161 // sidecar `<key>.s4index` が存在し、multipart-framed object であれば
3162 // 必要 frame だけを backend に Range GET し帯域節約する。
3163 //
3164 // v0.8.4 #73 H-2: BEFORE trusting the sidecar's frame offsets,
3165 // verify the source object hasn't been overwritten / mutated since
3166 // the sidecar was stamped. The sidecar carries the backend ETag
3167 // captured at PUT time (`source_etag`); a HEAD against the current
3168 // backend object tells us the live ETag. If they disagree we treat
3169 // the sidecar as stale and fall through to the full-GET path —
3170 // returning the wrong frames for a Range request would surface as
3171 // a CRC mismatch deeper in the stack but would also potentially
3172 // disclose unrelated frames if a hostile operator wrote the
3173 // sidecar themselves. Fail-open to "full read" is the safe default.
3174 //
3175 // Legacy v1 sidecars (no `source_etag` populated) keep the old
3176 // best-effort behaviour so existing on-disk indexes don't suddenly
3177 // start missing the partial-fetch path.
3178 if let Some(ref r) = range_request
3179 && let Some(index) = self.read_sidecar(&req.input.bucket, &req.input.key).await
3180 && self
3181 .sidecar_version_binding_ok(&req.input.bucket, &req.input.key, &index)
3182 .await
3183 {
3184 let total = index.total_original_size();
3185 let (start, end_exclusive) = match resolve_range(r, total) {
3186 Ok(v) => v,
3187 Err(e) => {
3188 return Err(S3Error::with_message(S3ErrorCode::InvalidRange, e));
3189 }
3190 };
3191 if let Some(plan) = index.lookup_range(start, end_exclusive) {
3192 return self
3193 .partial_range_get(&req, plan, start, end_exclusive, total, get_start)
3194 .await;
3195 }
3196 }
3197 let mut resp = self.backend.get_object(req).await?;
3198 // v0.5 #34: stamp the resolved version-id so the client sees a
3199 // coherent `x-amz-version-id` header (only for chains owned by
3200 // the manager — Unversioned buckets / no-manager paths never
3201 // set this).
3202 if let Some(ref vid) = resolved_version_id {
3203 resp.output.version_id = Some(vid.clone());
3204 }
3205 let is_multipart = is_multipart_object(&resp.output.metadata);
3206 let is_framed_v2 = is_framed_v2_object(&resp.output.metadata);
3207 // v0.2 #4: framed-v2 single-PUT は多 frame parse が必要なので
3208 // multipart と同じ path に流す。
3209 let needs_frame_parse = is_multipart || is_framed_v2;
3210 let manifest_opt = extract_manifest(&resp.output.metadata);
3211
3212 if !needs_frame_parse && manifest_opt.is_none() {
3213 // S4 が書いていないオブジェクトは透過 (raw bucket pre-existing object 等)
3214 debug!("S4 get_object: object lacks s4-codec metadata, returning as-is");
3215 return Ok(resp);
3216 }
3217
3218 if let Some(blob) = resp.output.body.take() {
3219 // v0.4 #21 / v0.5 #27: if the object was stored under SSE
3220 // (metadata flag `s4-encrypted: aes-256-gcm`), decrypt
3221 // before any frame parse / streaming decompress. Encrypted
3222 // bodies are opaque to the codec; this also forces the
3223 // buffered path because AES-GCM needs the full body for tag
3224 // verify. SSE-C uses the per-request customer key, SSE-S4
3225 // falls back to the configured keyring.
3226 let blob = if is_sse_encrypted(&resp.output.metadata) {
3227 let body = collect_blob(blob, self.max_body_bytes)
3228 .await
3229 .map_err(internal("collect SSE-encrypted body"))?;
3230 // v0.5 #28: peek the frame magic to route the right
3231 // decrypt path. S4E4 means SSE-KMS — unwrap the DEK
3232 // through the KMS backend (async). S4E1/E2/E3 take
3233 // the sync path (keyring or customer key).
3234 //
3235 // v0.8 #52 (S4E5) / v0.8.1 #57 (S4E6): the chunked
3236 // SSE-S4 frames take the *streaming* path — we hand
3237 // the response body a per-chunk verify-and-emit
3238 // Stream so the client sees chunk 0 plaintext after
3239 // one chunk-worth of AES-GCM verify (vs. waiting
3240 // for the whole body's tag), and the gateway no
3241 // longer needs to materialize the full plaintext
3242 // in memory before responding. SSE-C is out of
3243 // scope for the chunked path (chunked S4E3 is a
3244 // follow-up), so this branch requires the SSE-S4
3245 // keyring to be wired and `get_sse_c_material` to
3246 // be absent — otherwise we surface a clear
3247 // misconfiguration error instead of silently
3248 // falling through to the buffered chunked path.
3249 // v0.8.11 CRIT-1 fix: the chunked stream early-return is
3250 // only correct when the decrypted body IS the user's
3251 // plaintext as-stored. If the object went through the
3252 // codec (compressed) or carries S4F2 frames, returning
3253 // the decrypt stream directly hands the client
3254 // compressed / framed bytes. Restrict the early-return
3255 // to codec=Passthrough + non-framed objects; everything
3256 // else falls through to the buffered path, which
3257 // decrypt-buffers S4E5/S4E6 via
3258 // `decrypt_chunked_buffered_default` and then runs the
3259 // existing decompress pipeline.
3260 let chunked_streaming_safe = !needs_frame_parse
3261 && manifest_opt
3262 .as_ref()
3263 .map(|m| m.codec == CodecKind::Passthrough)
3264 .unwrap_or(false);
3265 if matches!(crate::sse::peek_magic(&body), Some("S4E5") | Some("S4E6"))
3266 && get_sse_c_material.is_none()
3267 && chunked_streaming_safe
3268 {
3269 let keyring_arc = self.sse_keyring.clone().ok_or_else(|| {
3270 S3Error::with_message(
3271 S3ErrorCode::InvalidRequest,
3272 "object is SSE-S4 encrypted (S4E5/S4E6) but no --sse-s4-key is configured on this gateway",
3273 )
3274 })?;
3275 let body_len = body.len() as u64;
3276 let stream = crate::sse::decrypt_chunked_stream(body, keyring_arc.as_ref());
3277 // Stream is `'static` (the keyring borrow is
3278 // consumed up front; the cipher lives inside
3279 // the stream state — see decrypt_chunked_stream
3280 // doc), so we can move it straight into a
3281 // StreamingBlob without lifetime gymnastics.
3282 use futures::StreamExt;
3283 let mapped = stream.map(|r| {
3284 r.map_err(|e| std::io::Error::other(format!("SSE-S4 chunked decrypt: {e}")))
3285 });
3286 use s3s::dto::StreamingBlob;
3287 resp.output.body = Some(StreamingBlob::wrap(mapped));
3288 // Plaintext content_length is unknown until all
3289 // chunks have been verified; null it out so the
3290 // ByteStream wrapper reports `unknown` to the
3291 // HTTP layer (which then emits chunked transfer-
3292 // encoding) rather than lying about the size.
3293 resp.output.content_length = None;
3294 // The backend's checksums + ETag describe the
3295 // encrypted body (S4E5/S4E6 wire format), not
3296 // the plaintext we're about to stream — clear them
3297 // so the AWS SDK doesn't fail the GET with a
3298 // ChecksumMismatch on a successful round-trip.
3299 // Mirrors the streaming-zstd path at L1180-1185.
3300 resp.output.checksum_crc32 = None;
3301 resp.output.checksum_crc32c = None;
3302 resp.output.checksum_crc64nvme = None;
3303 resp.output.checksum_sha1 = None;
3304 resp.output.checksum_sha256 = None;
3305 resp.output.e_tag = None;
3306 let elapsed = get_start.elapsed();
3307 crate::metrics::record_get(
3308 "sse-s4-chunked",
3309 body_len,
3310 body_len,
3311 elapsed.as_secs_f64(),
3312 true,
3313 );
3314 return Ok(resp);
3315 }
3316 let plain = match crate::sse::peek_magic(&body) {
3317 Some("S4E4") => {
3318 let kms = self.kms.as_ref().ok_or_else(|| {
3319 S3Error::with_message(
3320 S3ErrorCode::InvalidRequest,
3321 "object is SSE-KMS encrypted but no --kms-local-dir / --kms-aws-region is configured on this gateway",
3322 )
3323 })?;
3324 let kms_ref: &dyn crate::kms::KmsBackend = kms.as_ref();
3325 crate::sse::decrypt_with_kms(&body, kms_ref)
3326 .await
3327 .map_err(|e| match e {
3328 crate::sse::SseError::KmsBackend(k) => kms_error_to_s3(k),
3329 other => S3Error::with_message(
3330 S3ErrorCode::InternalError,
3331 format!("SSE-KMS decrypt failed: {other}"),
3332 ),
3333 })?
3334 }
3335 _ => {
3336 if let Some(ref m) = get_sse_c_material {
3337 crate::sse::decrypt(
3338 &body,
3339 crate::sse::SseSource::CustomerKey {
3340 key: &m.key,
3341 key_md5: &m.key_md5,
3342 },
3343 )
3344 .map_err(sse_c_error_to_s3)?
3345 } else {
3346 let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
3347 S3Error::with_message(
3348 S3ErrorCode::InvalidRequest,
3349 "object is SSE-S4 encrypted but no --sse-s4-key is configured on this gateway",
3350 )
3351 })?;
3352 crate::sse::decrypt(&body, keyring).map_err(|e| {
3353 S3Error::with_message(
3354 S3ErrorCode::InternalError,
3355 format!("SSE-S4 decrypt failed: {e}"),
3356 )
3357 })?
3358 }
3359 }
3360 };
3361 // v0.5 #28: parse out the on-disk wrapped DEK's key id
3362 // so the GET response can echo `x-amz-server-side-encryption-aws-kms-key-id`.
3363 if matches!(crate::sse::peek_magic(&body), Some("S4E4"))
3364 && let Ok(hdr) = crate::sse::parse_s4e4_header(&body)
3365 {
3366 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
3367 ServerSideEncryption::AWS_KMS,
3368 ));
3369 resp.output.ssekms_key_id = Some(hdr.key_id.to_string());
3370 }
3371 bytes_to_blob(plain)
3372 } else if let Some(ref m) = get_sse_c_material {
3373 // Client sent SSE-C headers for an unencrypted object —
3374 // mirror AWS S3's 400 InvalidRequest.
3375 let _ = m;
3376 return Err(sse_c_error_to_s3(
3377 crate::sse::SseError::CustomerKeyUnexpected,
3378 ));
3379 } else {
3380 blob
3381 };
3382 // v0.5 #27: SSE-C echo on success — algorithm + key MD5
3383 // tell the client that the supplied key was the one used.
3384 if let Some(ref m) = get_sse_c_material {
3385 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
3386 resp.output.sse_customer_key_md5 =
3387 Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
3388 }
3389 // ====== Streaming fast path (CpuZstd, non-multipart, codec supports it) ======
3390 // 大規模 object (e.g. 5 GB) を memory に collect すると OOM するので、
3391 // codec が streaming-aware なら body を chunk-by-chunk で decompress して
3392 // 即座に client に流す。
3393 //
3394 // ただし Range request 時は streaming できない (slice するため total bytes
3395 // が必要) → buffered path に fall through。
3396 if range_request.is_none()
3397 && !needs_frame_parse
3398 && let Some(ref m) = manifest_opt
3399 && supports_streaming_decompress(m.codec)
3400 && m.codec == CodecKind::CpuZstd
3401 {
3402 // v0.8.4 #73 H-1: wrap the decompressor output in a
3403 // rolling-CRC32C verifier so a tampered ciphertext (or a
3404 // backend-side corruption that the zstd decoder happens
3405 // to "successfully" decode into wrong bytes) surfaces as
3406 // a streaming error tail at EOF instead of silently
3407 // delivering corrupt plaintext to the client. The wrap
3408 // is a pure pass-through during the body — no extra
3409 // buffering, TTFB unaffected — and the integrity
3410 // decision lands at the last chunk.
3411 let decompressed_blob = cpu_zstd_decompress_stream(blob);
3412 let verified_reader = Crc32cVerifyingReader::new(
3413 blob_to_async_read(decompressed_blob),
3414 m.crc32c,
3415 m.original_size,
3416 );
3417 let verified_blob = async_read_to_blob(verified_reader);
3418 resp.output.content_length = Some(m.original_size as i64);
3419 resp.output.checksum_crc32 = None;
3420 resp.output.checksum_crc32c = None;
3421 resp.output.checksum_crc64nvme = None;
3422 resp.output.checksum_sha1 = None;
3423 resp.output.checksum_sha256 = None;
3424 resp.output.e_tag = None;
3425 resp.output.body = Some(verified_blob);
3426 let elapsed = get_start.elapsed();
3427 crate::metrics::record_get(
3428 m.codec.as_str(),
3429 m.compressed_size,
3430 m.original_size,
3431 elapsed.as_secs_f64(),
3432 true,
3433 );
3434 info!(
3435 op = "get_object",
3436 bucket = %get_bucket,
3437 key = %get_key,
3438 codec = m.codec.as_str(),
3439 bytes_in = m.compressed_size,
3440 bytes_out = m.original_size,
3441 path = "streaming",
3442 setup_latency_ms = elapsed.as_millis() as u64,
3443 "S4 get started (streaming)"
3444 );
3445 return Ok(resp);
3446 }
3447 // Passthrough: そのまま流す (Range なしの場合のみ streaming)
3448 if range_request.is_none()
3449 && !needs_frame_parse
3450 && let Some(ref m) = manifest_opt
3451 && m.codec == CodecKind::Passthrough
3452 {
3453 resp.output.content_length = Some(m.original_size as i64);
3454 resp.output.checksum_crc32 = None;
3455 resp.output.checksum_crc32c = None;
3456 resp.output.checksum_crc64nvme = None;
3457 resp.output.checksum_sha1 = None;
3458 resp.output.checksum_sha256 = None;
3459 resp.output.e_tag = None;
3460 resp.output.body = Some(blob);
3461 debug!("S4 get_object: passthrough streaming");
3462 return Ok(resp);
3463 }
3464
3465 // ====== Buffered slow path (multipart frame parser, GPU codecs) ======
3466 let bytes = collect_blob(blob, self.max_body_bytes)
3467 .await
3468 .map_err(internal("collect get body"))?;
3469
3470 let decompressed = if needs_frame_parse {
3471 // multipart objects と framed-v2 single-PUT objects は同じ
3472 // S4F2 frame 列なので decompress_multipart で統一処理
3473 self.decompress_multipart(bytes).await?
3474 } else {
3475 let manifest = manifest_opt.as_ref().expect("non-multipart guarded above");
3476 self.registry
3477 .decompress(bytes, manifest)
3478 .await
3479 .map_err(internal("registry decompress"))?
3480 };
3481
3482 // Range request があれば slice。なければ full body を返す。
3483 let total_size = decompressed.len() as u64;
3484 let (final_bytes, status_override) = if let Some(r) = range_request.as_ref() {
3485 let (start, end) = resolve_range(r, total_size)
3486 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
3487 let sliced = decompressed.slice(start as usize..end as usize);
3488 resp.output.content_range = Some(format!(
3489 "bytes {start}-{}/{total_size}",
3490 end.saturating_sub(1)
3491 ));
3492 (sliced, Some(http::StatusCode::PARTIAL_CONTENT))
3493 } else {
3494 (decompressed, None)
3495 };
3496 // 解凍後の真のサイズを返す (S3 client は content_length を信頼するので
3497 // 圧縮 size のままだと downstream が body を途中で切ってしまう)
3498 resp.output.content_length = Some(final_bytes.len() as i64);
3499 // 圧縮済 bytes の checksum を返すと AWS SDK 側で StreamingError
3500 // (ChecksumMismatch) になる。ETag も backend が返した「圧縮済 bytes の
3501 // MD5/checksum」なので意味的にズレる — クリアして S4 自身の crc32c
3502 // (manifest 内 / frame 内) で integrity を保証する設計にする。
3503 resp.output.checksum_crc32 = None;
3504 resp.output.checksum_crc32c = None;
3505 resp.output.checksum_crc64nvme = None;
3506 resp.output.checksum_sha1 = None;
3507 resp.output.checksum_sha256 = None;
3508 resp.output.e_tag = None;
3509 let returned_size = final_bytes.len() as u64;
3510 let codec_label = manifest_opt
3511 .as_ref()
3512 .map(|m| m.codec.as_str())
3513 .unwrap_or("multipart");
3514 resp.output.body = Some(bytes_to_blob(final_bytes));
3515 if let Some(status) = status_override {
3516 resp.status = Some(status);
3517 }
3518 let elapsed = get_start.elapsed();
3519 crate::metrics::record_get(codec_label, 0, returned_size, elapsed.as_secs_f64(), true);
3520 info!(
3521 op = "get_object",
3522 bucket = %get_bucket,
3523 key = %get_key,
3524 codec = codec_label,
3525 bytes_out = returned_size,
3526 total_object_size = total_size,
3527 range = range_request.is_some(),
3528 path = "buffered",
3529 latency_ms = elapsed.as_millis() as u64,
3530 "S4 get completed (buffered)"
3531 );
3532 }
3533 // v0.6 #40: echo the recorded `x-amz-replication-status` so
3534 // consumers can poll progress (PENDING / COMPLETED / FAILED).
3535 if let Some(mgr) = self.replication.as_ref()
3536 && let Some(status) = mgr.lookup_status(&get_bucket, &get_key)
3537 {
3538 resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
3539 status.as_aws_str().to_owned(),
3540 ));
3541 }
3542 Ok(resp)
3543 }
3544
3545 // === passthrough delegations ===
3546 async fn head_bucket(
3547 &self,
3548 req: S3Request<HeadBucketInput>,
3549 ) -> S3Result<S3Response<HeadBucketOutput>> {
3550 self.backend.head_bucket(req).await
3551 }
3552 async fn list_buckets(
3553 &self,
3554 req: S3Request<ListBucketsInput>,
3555 ) -> S3Result<S3Response<ListBucketsOutput>> {
3556 self.backend.list_buckets(req).await
3557 }
3558 async fn create_bucket(
3559 &self,
3560 req: S3Request<CreateBucketInput>,
3561 ) -> S3Result<S3Response<CreateBucketOutput>> {
3562 self.backend.create_bucket(req).await
3563 }
3564 async fn delete_bucket(
3565 &self,
3566 req: S3Request<DeleteBucketInput>,
3567 ) -> S3Result<S3Response<DeleteBucketOutput>> {
3568 self.backend.delete_bucket(req).await
3569 }
3570 async fn head_object(
3571 &self,
3572 req: S3Request<HeadObjectInput>,
3573 ) -> S3Result<S3Response<HeadObjectOutput>> {
3574 // v0.6 #40: capture bucket/key before req is consumed so the
3575 // replication-status echo can look the entry up.
3576 let head_bucket = req.input.bucket.clone();
3577 let head_key = req.input.key.clone();
3578 let mut resp = self.backend.head_object(req).await?;
3579 if let Some(manifest) = extract_manifest(&resp.output.metadata) {
3580 // 客側には decompress 後の意味のある content_length / checksum を返す。
3581 // backend が返す圧縮済 bytes の checksum / e_tag は意味が違うため除去
3582 // (S4 は manifest 内の crc32c で integrity を担保する)。
3583 resp.output.content_length = Some(manifest.original_size as i64);
3584 resp.output.checksum_crc32 = None;
3585 resp.output.checksum_crc32c = None;
3586 resp.output.checksum_crc64nvme = None;
3587 resp.output.checksum_sha1 = None;
3588 resp.output.checksum_sha256 = None;
3589 resp.output.e_tag = None;
3590 }
3591 // v0.6 #40: echo `x-amz-replication-status` (PENDING / COMPLETED
3592 // / FAILED) so consumers can poll progress without a GET.
3593 if let Some(mgr) = self.replication.as_ref()
3594 && let Some(status) = mgr.lookup_status(&head_bucket, &head_key)
3595 {
3596 resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
3597 status.as_aws_str().to_owned(),
3598 ));
3599 }
3600 // v0.7 #48 BUG-4 fix: HEAD must echo SSE indicators so SDKs
3601 // and pipelines see the same posture they got on PUT. The PUT
3602 // path stamps `s4-sse-type` metadata for exactly this — HEAD
3603 // doesn't fetch the body, so it can't peek frame magic.
3604 if let Some(meta) = resp.output.metadata.as_ref()
3605 && let Some(sse_type) = meta.get("s4-sse-type")
3606 {
3607 {
3608 match sse_type.as_str() {
3609 "aws:kms" => {
3610 resp.output.server_side_encryption = Some(
3611 ServerSideEncryption::from_static(ServerSideEncryption::AWS_KMS),
3612 );
3613 if let Some(key_id) = meta.get("s4-sse-kms-key-id") {
3614 resp.output.ssekms_key_id = Some(key_id.clone());
3615 }
3616 }
3617 _ => {
3618 resp.output.server_side_encryption = Some(
3619 ServerSideEncryption::from_static(ServerSideEncryption::AES256),
3620 );
3621 if let Some(md5) = meta.get("s4-sse-c-key-md5") {
3622 resp.output.sse_customer_algorithm =
3623 Some(crate::sse::SSE_C_ALGORITHM.into());
3624 resp.output.sse_customer_key_md5 = Some(md5.clone());
3625 }
3626 }
3627 }
3628 }
3629 }
3630 Ok(resp)
3631 }
3632 async fn delete_object(
3633 &self,
3634 mut req: S3Request<DeleteObjectInput>,
3635 ) -> S3Result<S3Response<DeleteObjectOutput>> {
3636 let bucket = req.input.bucket.clone();
3637 let key = req.input.key.clone();
3638 self.enforce_rate_limit(&req, &bucket)?;
3639 self.enforce_policy(&req, "s3:DeleteObject", &bucket, Some(&key))?;
3640 // v0.6 #42: MFA Delete enforcement. When the bucket has
3641 // MFA-Delete = Enabled, every DELETE / DELETE-version /
3642 // delete-marker form needs `x-amz-mfa: <serial> <code>` (RFC 6238
3643 // 6-digit TOTP). Runs *before* the WORM / versioning routers so
3644 // a missing token is denied for free regardless of which delete
3645 // path the request would otherwise take.
3646 if let Some(mgr) = self.mfa_delete.as_ref()
3647 && mgr.is_enabled(&bucket)
3648 {
3649 let header = req.input.mfa.as_deref();
3650 if let Err(e) = crate::mfa::check_mfa(&bucket, header, mgr, current_unix_secs()) {
3651 crate::metrics::record_mfa_delete_denial(&bucket);
3652 return Err(mfa_error_to_s3(e));
3653 }
3654 }
3655 // v0.5 #30: refuse the delete while a WORM lock is in effect.
3656 // Compliance can never be bypassed; Governance can be overridden
3657 // via `x-amz-bypass-governance-retention: true`; legal hold
3658 // never. The check happens before the versioning router so a
3659 // locked object can't be soft-deleted (delete-marker push) on an
3660 // Enabled bucket either — S3 spec says lock applies to all
3661 // delete forms.
3662 if let Some(mgr) = self.object_lock.as_ref()
3663 && let Some(state) = mgr.get(&bucket, &key)
3664 {
3665 let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
3666 // v0.8.12 HIGH-7 fix: the bypass header alone used to be
3667 // enough to override Governance retention. AWS spec
3668 // requires the caller hold `s3:BypassGovernanceRetention`
3669 // for the target ARN; without that, the header is
3670 // silently ignored (not an error — it lines up with how
3671 // AWS' canonical behaviour treats unprivileged callers).
3672 let bypass_allowed = if bypass_header {
3673 self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
3674 .is_ok()
3675 } else {
3676 false
3677 };
3678 let now = chrono::Utc::now();
3679 if !state.can_delete(now, bypass_allowed) {
3680 crate::metrics::record_policy_denial("s3:DeleteObject", &bucket);
3681 return Err(S3Error::with_message(
3682 S3ErrorCode::AccessDenied,
3683 "Access Denied because object protected by object lock",
3684 ));
3685 }
3686 }
3687 // v0.5 #34: route DELETE through the VersioningManager when the
3688 // bucket is in a versioning-aware state.
3689 //
3690 // - Enabled bucket, no version_id → push a delete marker into
3691 // the chain. NO backend object is touched (older versions
3692 // stay reachable via specific-version GET).
3693 // - Enabled / Suspended bucket, with version_id → physical
3694 // delete. Backend bytes at the shadow key (or `<key>` for
3695 // `null`) are removed; chain entry is dropped. If the deleted
3696 // entry was a delete marker, no backend bytes exist for it
3697 // (record-only).
3698 // - Suspended bucket, no version_id → push a "null" delete
3699 // marker (S3 spec); backend bytes at `<key>` are physically
3700 // removed (same as legacy).
3701 // - Unversioned bucket → fall through to legacy passthrough.
3702 if let Some(mgr) = self.versioning.as_ref() {
3703 let state = mgr.state(&bucket);
3704 if state != crate::versioning::VersioningState::Unversioned {
3705 let req_vid = req.input.version_id.take();
3706 if let Some(vid) = req_vid {
3707 // Specific-version DELETE: touch backend bytes only
3708 // when the entry was a real version (not a delete
3709 // marker, which has no backend bytes).
3710 let outcome = mgr.record_delete_specific(&bucket, &key, &vid);
3711 let backend_target = if vid == crate::versioning::NULL_VERSION_ID {
3712 key.clone()
3713 } else {
3714 versioned_shadow_key(&key, &vid)
3715 };
3716 let was_real_version = outcome
3717 .as_ref()
3718 .map(|o| !o.is_delete_marker)
3719 .unwrap_or(false);
3720 if was_real_version {
3721 // Best-effort backend cleanup; missing bytes
3722 // are not an error (e.g. shadow key already
3723 // GC'd).
3724 let backend_input = DeleteObjectInput {
3725 bucket: bucket.clone(),
3726 key: backend_target,
3727 ..Default::default()
3728 };
3729 let backend_req = S3Request {
3730 input: backend_input,
3731 method: http::Method::DELETE,
3732 uri: req.uri.clone(),
3733 headers: req.headers.clone(),
3734 extensions: http::Extensions::new(),
3735 credentials: req.credentials.clone(),
3736 region: req.region.clone(),
3737 service: req.service.clone(),
3738 trailing_headers: None,
3739 };
3740 let _ = self.backend.delete_object(backend_req).await;
3741 }
3742 let mut output = DeleteObjectOutput {
3743 version_id: Some(vid.clone()),
3744 ..Default::default()
3745 };
3746 if let Some(o) = outcome.as_ref()
3747 && o.is_delete_marker
3748 {
3749 output.delete_marker = Some(true);
3750 }
3751 // v0.6 #35: specific-version DELETE always counts as
3752 // a hard `ObjectRemoved:Delete` event (the chain
3753 // entry, marker or not, is gone after this call).
3754 self.fire_delete_notification(
3755 &bucket,
3756 &key,
3757 crate::notifications::EventType::ObjectRemovedDelete,
3758 Some(vid.clone()),
3759 );
3760 return Ok(S3Response::new(output));
3761 }
3762 // No version_id: record a delete marker (state-aware).
3763 let outcome = mgr.record_delete(&bucket, &key);
3764 if state == crate::versioning::VersioningState::Suspended {
3765 // Suspended buckets also evict the prior `<key>`
3766 // bytes (the previous null version is gone too).
3767 let backend_input = DeleteObjectInput {
3768 bucket: bucket.clone(),
3769 key: key.clone(),
3770 ..Default::default()
3771 };
3772 let backend_req = S3Request {
3773 input: backend_input,
3774 method: http::Method::DELETE,
3775 uri: req.uri.clone(),
3776 headers: req.headers.clone(),
3777 extensions: http::Extensions::new(),
3778 credentials: req.credentials.clone(),
3779 region: req.region.clone(),
3780 service: req.service.clone(),
3781 trailing_headers: None,
3782 };
3783 let _ = self.backend.delete_object(backend_req).await;
3784 }
3785 let output = DeleteObjectOutput {
3786 delete_marker: Some(true),
3787 version_id: outcome.version_id.clone(),
3788 ..Default::default()
3789 };
3790 // v0.6 #35: versioned bucket DELETE without a version-id
3791 // creates a delete marker — the dedicated AWS event
3792 // taxonomy entry. Suspended-state buckets also push a
3793 // (null) marker, so the same event fires there.
3794 self.fire_delete_notification(
3795 &bucket,
3796 &key,
3797 crate::notifications::EventType::ObjectRemovedDeleteMarker,
3798 outcome.version_id,
3799 );
3800 return Ok(S3Response::new(output));
3801 }
3802 }
3803 // Legacy / Unversioned path: physical delete on the backend +
3804 // best-effort sidecar cleanup (mirrors v0.4 behaviour).
3805 let resp = self.backend.delete_object(req).await?;
3806 // v0.5 #30: drop any per-object lock state once the delete has
3807 // succeeded so the freed key can be re-armed by a future PUT
3808 // under the bucket default. Reaching here implies the lock had
3809 // already passed `can_delete` above, so this is purely cleanup.
3810 if let Some(mgr) = self.object_lock.as_ref() {
3811 mgr.clear(&bucket, &key);
3812 }
3813 // v0.6 #39: drop any object-level tag set on physical delete —
3814 // the freed key starts a fresh tag history if a future PUT
3815 // re-creates it. (Versioned-delete branches above return early
3816 // and do NOT touch tags, mirroring AWS where tag state is
3817 // attached to the logical key, not the version chain.)
3818 if let Some(mgr) = self.tagging.as_ref() {
3819 mgr.delete_object_tags(&bucket, &key);
3820 }
3821 let sidecar = sidecar_key(&key);
3822 // v0.7 #49: skip the sidecar DELETE if the key + sidecar suffix
3823 // can't be encoded into a request URI — the primary delete
3824 // already succeeded and a stale sidecar is harmless (Range GET
3825 // re-validates the underlying object on next read).
3826 if let Ok(uri) = safe_object_uri(&bucket, &sidecar) {
3827 let sidecar_input = DeleteObjectInput {
3828 bucket: bucket.clone(),
3829 key: sidecar,
3830 ..Default::default()
3831 };
3832 let sidecar_req = S3Request {
3833 input: sidecar_input,
3834 method: http::Method::DELETE,
3835 uri,
3836 headers: http::HeaderMap::new(),
3837 extensions: http::Extensions::new(),
3838 credentials: None,
3839 region: None,
3840 service: None,
3841 trailing_headers: None,
3842 };
3843 let _ = self.backend.delete_object(sidecar_req).await;
3844 }
3845 // v0.6 #35: legacy unversioned-bucket hard delete fires the
3846 // canonical `ObjectRemoved:Delete` event.
3847 self.fire_delete_notification(
3848 &bucket,
3849 &key,
3850 crate::notifications::EventType::ObjectRemovedDelete,
3851 None,
3852 );
3853 Ok(resp)
3854 }
3855 async fn delete_objects(
3856 &self,
3857 req: S3Request<DeleteObjectsInput>,
3858 ) -> S3Result<S3Response<DeleteObjectsOutput>> {
3859 // v0.6 #42: MFA Delete applies once to the whole batch (S3 spec:
3860 // when MFA-Delete is on the bucket, a missing / invalid token
3861 // fails the entire DeleteObjects request, not per-object).
3862 if let Some(mgr) = self.mfa_delete.as_ref()
3863 && mgr.is_enabled(&req.input.bucket)
3864 {
3865 let header = req.input.mfa.as_deref();
3866 if let Err(e) =
3867 crate::mfa::check_mfa(&req.input.bucket, header, mgr, current_unix_secs())
3868 {
3869 crate::metrics::record_mfa_delete_denial(&req.input.bucket);
3870 return Err(mfa_error_to_s3(e));
3871 }
3872 }
3873 // v0.8.11 CRIT-3 fix: route every entry through the gated
3874 // per-object `delete_object` path so Object Lock, IAM policy,
3875 // versioning, tagging, sidecar cleanup and notification fan-
3876 // out all fire for batch DELETE. The previous
3877 // `self.backend.delete_objects(req).await` straight-through
3878 // bypassed every gate, so a `legal_hold=on` key listed inside
3879 // a DeleteObjects XML was happily removed.
3880 //
3881 // S3 spec note: DeleteObjects is "best-effort per object" —
3882 // a failure on one key surfaces as an `Errors` entry without
3883 // aborting the rest of the batch. Quiet-mode suppresses the
3884 // `Deleted` list (errors are still reported). We honour both.
3885 let bucket = req.input.bucket.clone();
3886 let bypass_governance = req.input.bypass_governance_retention.unwrap_or(false);
3887 let mfa_header = req.input.mfa.clone();
3888 let quiet = req.input.delete.quiet.unwrap_or(false);
3889 let mut deleted: Vec<DeletedObject> = Vec::new();
3890 let mut errors: Vec<s3s::dto::Error> = Vec::new();
3891 for ident in req.input.delete.objects.iter() {
3892 let key = ident.key.clone();
3893 let version_id = ident.version_id.clone();
3894 let per_input = DeleteObjectInput {
3895 bucket: bucket.clone(),
3896 key: key.clone(),
3897 version_id: version_id.clone(),
3898 bypass_governance_retention: Some(bypass_governance),
3899 mfa: mfa_header.clone(),
3900 ..Default::default()
3901 };
3902 let per_uri = match safe_object_uri(&bucket, &key) {
3903 Ok(u) => u,
3904 Err(_) => {
3905 errors.push(s3s::dto::Error {
3906 code: Some("InvalidArgument".to_owned()),
3907 key: Some(key),
3908 message: Some("object key is not URI-encodable".to_owned()),
3909 version_id,
3910 });
3911 continue;
3912 }
3913 };
3914 let per_req = S3Request {
3915 input: per_input,
3916 method: http::Method::DELETE,
3917 uri: per_uri,
3918 headers: req.headers.clone(),
3919 extensions: http::Extensions::new(),
3920 credentials: req.credentials.clone(),
3921 region: req.region.clone(),
3922 service: req.service.clone(),
3923 trailing_headers: None,
3924 };
3925 match self.delete_object(per_req).await {
3926 Ok(resp) => {
3927 let out = resp.output;
3928 // DeleteObjectOutput doesn't surface a separate
3929 // `delete_marker_version_id`; the marker's version
3930 // id is whatever `version_id` carries (when the
3931 // versioning manager pushed a delete-marker, that
3932 // field already holds the marker's vid).
3933 let vid = out.version_id.clone().or(version_id);
3934 deleted.push(DeletedObject {
3935 key: Some(key),
3936 version_id: vid.clone(),
3937 delete_marker: out.delete_marker,
3938 delete_marker_version_id: vid,
3939 });
3940 }
3941 Err(e) => {
3942 let code_str = e.code().as_str().to_owned();
3943 let msg = e.message().unwrap_or(code_str.as_str()).to_owned();
3944 errors.push(s3s::dto::Error {
3945 code: Some(code_str),
3946 key: Some(key),
3947 message: Some(msg),
3948 version_id,
3949 });
3950 }
3951 }
3952 }
3953 let output = DeleteObjectsOutput {
3954 deleted: if quiet || deleted.is_empty() {
3955 None
3956 } else {
3957 Some(deleted)
3958 },
3959 errors: if errors.is_empty() {
3960 None
3961 } else {
3962 Some(errors)
3963 },
3964 ..Default::default()
3965 };
3966 Ok(S3Response::new(output))
3967 }
3968 async fn copy_object(
3969 &self,
3970 mut req: S3Request<CopyObjectInput>,
3971 ) -> S3Result<S3Response<CopyObjectOutput>> {
3972 // copy is conceptually "GetObject src + PutObject dst" — enforce both.
3973 let dst_bucket = req.input.bucket.clone();
3974 let dst_key = req.input.key.clone();
3975 self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
3976 if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
3977 self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
3978 }
3979 // S4-aware copy: source object に s4-* metadata がある場合、それを
3980 // destination に確実に preserve する。
3981 //
3982 // - MetadataDirective::COPY (default): backend が source metadata を
3983 // そのまま copy するので S4 metadata も自動で渡る。介入不要
3984 // - MetadataDirective::REPLACE: 客が指定した metadata で source を
3985 // 上書き → s4-* metadata が消えると destination は decompress 不能に
3986 // なる (silent corruption)。S4 が source metadata を HEAD で取得し、
3987 // s4-* fields を input.metadata に強制 merge する
3988 let needs_merge = req
3989 .input
3990 .metadata_directive
3991 .as_ref()
3992 .map(|d| d.as_str() == MetadataDirective::REPLACE)
3993 .unwrap_or(false);
3994 if needs_merge && let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
3995 let head_input = HeadObjectInput {
3996 bucket: bucket.to_string(),
3997 key: key.to_string(),
3998 ..Default::default()
3999 };
4000 let head_req = S3Request {
4001 input: head_input,
4002 method: req.method.clone(),
4003 uri: req.uri.clone(),
4004 headers: req.headers.clone(),
4005 extensions: http::Extensions::new(),
4006 credentials: req.credentials.clone(),
4007 region: req.region.clone(),
4008 service: req.service.clone(),
4009 trailing_headers: None,
4010 };
4011 if let Ok(head) = self.backend.head_object(head_req).await
4012 && let Some(src_meta) = head.output.metadata.as_ref()
4013 {
4014 let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4015 for key in [
4016 META_CODEC,
4017 META_ORIGINAL_SIZE,
4018 META_COMPRESSED_SIZE,
4019 META_CRC32C,
4020 META_MULTIPART,
4021 META_FRAMED,
4022 ] {
4023 if let Some(v) = src_meta.get(key) {
4024 // 客が同じ key を指定していたら preserve しない (= 上書き許可)
4025 // していたら何もしない。指定していなければ insert
4026 dest_meta
4027 .entry(key.to_string())
4028 .or_insert_with(|| v.clone());
4029 }
4030 }
4031 debug!(
4032 src_bucket = %bucket,
4033 src_key = %key,
4034 "S4 copy_object: preserved s4-* metadata across REPLACE directive"
4035 );
4036 }
4037 }
4038 self.backend.copy_object(req).await
4039 }
4040 async fn list_objects(
4041 &self,
4042 req: S3Request<ListObjectsInput>,
4043 ) -> S3Result<S3Response<ListObjectsOutput>> {
4044 self.enforce_rate_limit(&req, &req.input.bucket)?;
4045 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4046 let mut resp = self.backend.list_objects(req).await?;
4047 // S4 内部 object (`*.s4index` sidecar、`.__s4ver__/` shadow versions
4048 // — v0.5 #34) を顧客から隠す。
4049 if let Some(contents) = resp.output.contents.as_mut() {
4050 contents.retain(|o| {
4051 o.key
4052 .as_ref()
4053 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4054 .unwrap_or(true)
4055 });
4056 }
4057 Ok(resp)
4058 }
4059 async fn list_objects_v2(
4060 &self,
4061 req: S3Request<ListObjectsV2Input>,
4062 ) -> S3Result<S3Response<ListObjectsV2Output>> {
4063 self.enforce_rate_limit(&req, &req.input.bucket)?;
4064 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4065 let mut resp = self.backend.list_objects_v2(req).await?;
4066 if let Some(contents) = resp.output.contents.as_mut() {
4067 let before = contents.len();
4068 contents.retain(|o| {
4069 o.key
4070 .as_ref()
4071 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4072 .unwrap_or(true)
4073 });
4074 // key_count も補正 (S3 spec compliance)
4075 if let Some(kc) = resp.output.key_count.as_mut() {
4076 *kc -= (before - contents.len()) as i32;
4077 }
4078 }
4079 Ok(resp)
4080 }
4081 /// v0.4 #17: filter S4-internal sidecars from versioned listings.
4082 /// v0.5 #34: when a [`crate::versioning::VersioningManager`] is
4083 /// attached AND the bucket is in a versioning-aware state, build
4084 /// the `Versions` / `DeleteMarkers` arrays directly from the
4085 /// in-memory chain (paginated + ordered the S3 way: key asc,
4086 /// version newest-first inside each key). Otherwise fall back to
4087 /// passthrough + sidecar-filter (legacy v0.4 behaviour).
4088 async fn list_object_versions(
4089 &self,
4090 req: S3Request<ListObjectVersionsInput>,
4091 ) -> S3Result<S3Response<ListObjectVersionsOutput>> {
4092 self.enforce_rate_limit(&req, &req.input.bucket)?;
4093 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4094 // v0.5 #34: VersioningManager-owned path.
4095 if let Some(mgr) = self.versioning.as_ref()
4096 && mgr.state(&req.input.bucket) != crate::versioning::VersioningState::Unversioned
4097 {
4098 let max_keys = req.input.max_keys.unwrap_or(1000) as usize;
4099 let page = mgr.list_versions(
4100 &req.input.bucket,
4101 req.input.prefix.as_deref(),
4102 req.input.key_marker.as_deref(),
4103 req.input.version_id_marker.as_deref(),
4104 max_keys,
4105 );
4106 let versions: Vec<ObjectVersion> = page
4107 .versions
4108 .into_iter()
4109 .map(|e| ObjectVersion {
4110 key: Some(e.key),
4111 version_id: Some(e.version_id),
4112 is_latest: Some(e.is_latest),
4113 e_tag: Some(ETag::Strong(e.etag)),
4114 size: Some(e.size as i64),
4115 last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4116 ..Default::default()
4117 })
4118 .collect();
4119 let delete_markers: Vec<DeleteMarkerEntry> = page
4120 .delete_markers
4121 .into_iter()
4122 .map(|e| DeleteMarkerEntry {
4123 key: Some(e.key),
4124 version_id: Some(e.version_id),
4125 is_latest: Some(e.is_latest),
4126 last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4127 ..Default::default()
4128 })
4129 .collect();
4130 let output = ListObjectVersionsOutput {
4131 name: Some(req.input.bucket.clone()),
4132 prefix: req.input.prefix.clone(),
4133 key_marker: req.input.key_marker.clone(),
4134 version_id_marker: req.input.version_id_marker.clone(),
4135 max_keys: req.input.max_keys,
4136 versions: if versions.is_empty() {
4137 None
4138 } else {
4139 Some(versions)
4140 },
4141 delete_markers: if delete_markers.is_empty() {
4142 None
4143 } else {
4144 Some(delete_markers)
4145 },
4146 is_truncated: Some(page.is_truncated),
4147 next_key_marker: page.next_key_marker,
4148 next_version_id_marker: page.next_version_id_marker,
4149 ..Default::default()
4150 };
4151 return Ok(S3Response::new(output));
4152 }
4153 // Legacy passthrough path (v0.4 #17 sidecar filter retained).
4154 let mut resp = self.backend.list_object_versions(req).await?;
4155 if let Some(versions) = resp.output.versions.as_mut() {
4156 versions.retain(|v| {
4157 v.key
4158 .as_ref()
4159 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4160 .unwrap_or(true)
4161 });
4162 }
4163 if let Some(markers) = resp.output.delete_markers.as_mut() {
4164 markers.retain(|m| {
4165 m.key
4166 .as_ref()
4167 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4168 .unwrap_or(true)
4169 });
4170 }
4171 Ok(resp)
4172 }
4173
4174 async fn create_multipart_upload(
4175 &self,
4176 mut req: S3Request<CreateMultipartUploadInput>,
4177 ) -> S3Result<S3Response<CreateMultipartUploadOutput>> {
4178 // v0.8.12 HIGH-9 fix: gate multipart Create on `s3:PutObject` —
4179 // the destination is conceptually about to host a new object,
4180 // matching what `put_object` enforces L2078. Without this, a
4181 // bucket policy denying `s3:PutObject` was bypassable simply
4182 // by switching the client to the multipart wire path.
4183 let mp_bucket = req.input.bucket.clone();
4184 let mp_key = req.input.key.clone();
4185 self.enforce_policy(&req, "s3:PutObject", &mp_bucket, Some(&mp_key))?;
4186 self.enforce_rate_limit(&req, &mp_bucket)?;
4187 // Multipart object は per-part 圧縮 + frame 形式で書く。GET 時に
4188 // frame parse を起動するため、object metadata に flag を立てる。
4189 // codec は dispatcher の default kind を採用 (per-part 別 codec は Phase 2)。
4190 let codec_kind = self.registry.default_kind();
4191 let meta = req.input.metadata.get_or_insert_with(Default::default);
4192 meta.insert(META_MULTIPART.into(), "true".into());
4193 meta.insert(META_CODEC.into(), codec_kind.as_str().into());
4194 // v0.8 #54 BUG-10 fix: take() the SSE request fields off
4195 // `req.input` so they are NOT forwarded to the backend on
4196 // CreateMultipartUpload. Same root cause as v0.7 #48 BUG-2/3 on
4197 // single-PUT — MinIO rejects SSE-C with "HTTPS required" and
4198 // SSE-KMS with "KMS not configured" when the headers reach it.
4199 // S4 owns the encrypt-then-store contract; we capture the
4200 // recipe in `multipart_state` here and apply it on Complete.
4201 let sse_c_alg = req.input.sse_customer_algorithm.take();
4202 let sse_c_key = req.input.sse_customer_key.take();
4203 let sse_c_md5 = req.input.sse_customer_key_md5.take();
4204 let sse_header = req.input.server_side_encryption.take();
4205 let sse_kms_key = req.input.ssekms_key_id.take();
4206 // Strip the encryption-context too — leaving it would make
4207 // MinIO try to validate it against a non-existent KMS key.
4208 let _ = req.input.ssekms_encryption_context.take();
4209 let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
4210 let kms_key_id = extract_kms_key_id(
4211 &sse_header,
4212 &sse_kms_key,
4213 self.kms_default_key_id.as_deref(),
4214 );
4215 // SSE-C / SSE-KMS exclusivity (mirrors put_object L1870).
4216 if sse_c_material.is_some() && kms_key_id.is_some() {
4217 return Err(S3Error::with_message(
4218 S3ErrorCode::InvalidArgument,
4219 "SSE-C and SSE-KMS cannot be used together on the same multipart upload",
4220 ));
4221 }
4222 let sse_mode = if let Some(ref m) = sse_c_material {
4223 // v0.8.2 #62 (H-6 audit fix): wrap the customer-supplied
4224 // 32-byte key in `Zeroizing` so abandoned uploads (or
4225 // normal Complete/Abort) wipe the key bytes on drop. The
4226 // `key_md5` is the public fingerprint and stays as a
4227 // bare `[u8; 16]`.
4228 crate::multipart_state::MultipartSseMode::SseC {
4229 key: zeroize::Zeroizing::new(m.key),
4230 key_md5: m.key_md5,
4231 }
4232 } else if let Some(ref kid) = kms_key_id {
4233 // KMS pre-flight: fail at Create rather than at Complete if
4234 // the gateway has no KMS backend wired (mirrors the
4235 // put_object L1879 check).
4236 if self.kms.is_none() {
4237 return Err(S3Error::with_message(
4238 S3ErrorCode::InvalidRequest,
4239 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
4240 ));
4241 }
4242 crate::multipart_state::MultipartSseMode::SseKms {
4243 key_id: kid.clone(),
4244 }
4245 } else if self.sse_keyring.is_some() {
4246 // SSE-S4: server-driven transparent encryption. Activates
4247 // whenever the gateway has a keyring configured AND the
4248 // client didn't pick a different SSE mode.
4249 crate::multipart_state::MultipartSseMode::SseS4
4250 } else {
4251 crate::multipart_state::MultipartSseMode::None
4252 };
4253 // v0.8 #54 BUG-9 fix: parse the Tagging header on Create. The
4254 // single-PUT path does this on PutObject; the multipart path
4255 // captures it now and commits via TagManager on Complete.
4256 let request_tags: Option<crate::tagging::TagSet> = req
4257 .input
4258 .tagging
4259 .as_deref()
4260 .map(crate::tagging::parse_tagging_header)
4261 .transpose()
4262 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
4263 // Strip the `Tagging` field off the input so the backend
4264 // doesn't try to apply it (no-op on MinIO but keeps the wire
4265 // clean).
4266 let _ = req.input.tagging.take();
4267 // Object Lock recipe (BUG-7 — captured here, applied on Complete).
4268 let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
4269 .input
4270 .object_lock_mode
4271 .as_ref()
4272 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
4273 let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
4274 .input
4275 .object_lock_retain_until_date
4276 .as_ref()
4277 .and_then(timestamp_to_chrono_utc);
4278 let explicit_legal_hold_on: bool = req
4279 .input
4280 .object_lock_legal_hold_status
4281 .as_ref()
4282 .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
4283 .unwrap_or(false);
4284 let bucket = req.input.bucket.clone();
4285 let key = req.input.key.clone();
4286 debug!(
4287 bucket = %bucket,
4288 key = %key,
4289 codec = codec_kind.as_str(),
4290 sse = ?sse_mode,
4291 "S4 create_multipart_upload: marking object for per-part compression"
4292 );
4293 let mut resp = self.backend.create_multipart_upload(req).await?;
4294 // Stash the per-upload context only after the backend handed
4295 // us an upload_id (failed Creates leave nothing in the store).
4296 if let Some(upload_id) = resp.output.upload_id.as_ref() {
4297 self.multipart_state.put(
4298 upload_id,
4299 crate::multipart_state::MultipartUploadContext {
4300 bucket,
4301 key,
4302 sse: sse_mode.clone(),
4303 tags: request_tags,
4304 object_lock_mode: explicit_lock_mode,
4305 object_lock_retain_until: explicit_retain_until,
4306 object_lock_legal_hold: explicit_legal_hold_on,
4307 },
4308 );
4309 }
4310 // SSE-C / SSE-KMS response echo (mirrors put_object L2036-L2050).
4311 match &sse_mode {
4312 crate::multipart_state::MultipartSseMode::SseC { key_md5, .. } => {
4313 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
4314 resp.output.sse_customer_key_md5 =
4315 Some(base64::engine::general_purpose::STANDARD.encode(key_md5));
4316 }
4317 crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
4318 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
4319 ServerSideEncryption::AWS_KMS,
4320 ));
4321 resp.output.ssekms_key_id = Some(key_id.clone());
4322 }
4323 _ => {}
4324 }
4325 Ok(resp)
4326 }
4327
4328 async fn upload_part(
4329 &self,
4330 mut req: S3Request<UploadPartInput>,
4331 ) -> S3Result<S3Response<UploadPartOutput>> {
4332 // v0.8.12 HIGH-9 fix: same `s3:PutObject` gate as
4333 // `put_object` / `create_multipart_upload`. Even though
4334 // Create already passed the gate, a bucket policy that
4335 // *revokes* `s3:PutObject` mid-flight should stop further
4336 // parts (e.g. legal hold drops, retention shortened).
4337 let part_bucket = req.input.bucket.clone();
4338 let part_key = req.input.key.clone();
4339 self.enforce_policy(&req, "s3:PutObject", &part_bucket, Some(&part_key))?;
4340 self.enforce_rate_limit(&req, &part_bucket)?;
4341 // 各 part を圧縮して frame header 付きで forward。GET 時に
4342 // `decompress_multipart` が frame iter で順に解凍する。
4343 // **per-part codec dispatch**: dispatcher が body 先頭 sample から
4344 // codec を選ぶので、parquet 風の mixed-content multipart で part ごとに
4345 // 最適 codec を使える (整数列 part → Bitcomp、text 列 part → zstd 等)。
4346 //
4347 // v0.8 #54 BUG-5/BUG-10 fix: lookup the per-upload SSE
4348 // context captured by `create_multipart_upload` and (a) strip
4349 // any SSE-C request headers off `req.input` so the backend
4350 // doesn't see them — same root cause as v0.7 #48 BUG-2/3 on
4351 // single-PUT; MinIO refuses SSE-C parts over HTTP — and (b)
4352 // observe that an upload context exists for `upload_id`. The
4353 // actual encrypt happens once at `complete_multipart_upload`
4354 // time on the assembled body (the per-part-encrypt approach
4355 // would require a matching multi-segment decrypt path on GET;
4356 // encrypting the whole assembled body keeps the GET path's
4357 // `is_sse_encrypted` branch in get_object L2429 working
4358 // unchanged).
4359 let sse_ctx = self.multipart_state.get(req.input.upload_id.as_str());
4360 // v0.8.2 #62 (H-1 audit fix): SSE-C key consistency check.
4361 // The AWS S3 spec requires the same SSE-C key headers on
4362 // every UploadPart and rejects mismatches with 400. Prior to
4363 // #62 we silently stripped the headers (BUG-10 fix) without
4364 // validating them, allowing a client to send part 1 under
4365 // key-A and part 2 under key-B; both got stored, then
4366 // re-encrypted with key-A on Complete — the client thinks
4367 // part 2 is under key-B but a GET with key-B would in fact
4368 // hit the part-1 ciphertext that was actually encrypted with
4369 // key-A. That would either decrypt successfully (silent
4370 // corruption: client lost track of which key encrypts what)
4371 // or fail in a confusing way. Validate the per-part headers
4372 // now and reject with 400 InvalidArgument on mismatch /
4373 // omission / partial supply, matching real-S3 behaviour.
4374 if let Some(ref ctx) = sse_ctx {
4375 if let crate::multipart_state::MultipartSseMode::SseC {
4376 key_md5: ctx_md5, ..
4377 } = &ctx.sse
4378 {
4379 let alg = req.input.sse_customer_algorithm.take();
4380 let key_b64 = req.input.sse_customer_key.take();
4381 let md5_b64 = req.input.sse_customer_key_md5.take();
4382 match (alg, key_b64, md5_b64) {
4383 (Some(a), Some(k), Some(m)) => {
4384 // Parse + validate; if the per-part headers
4385 // are themselves malformed (algorithm not
4386 // AES256, MD5 mismatch, key not 32 bytes)
4387 // surface the same 400 the single-PUT path
4388 // would. Then compare the parsed MD5 to the
4389 // upload-context's MD5; mismatch is a
4390 // different-key UploadPart and must reject.
4391 let part_material = crate::sse::parse_customer_key_headers(&a, &k, &m)
4392 .map_err(sse_c_error_to_s3)?;
4393 if part_material.key_md5 != *ctx_md5 {
4394 return Err(S3Error::with_message(
4395 S3ErrorCode::InvalidArgument,
4396 "SSE-C key on UploadPart does not match the key supplied on CreateMultipartUpload",
4397 ));
4398 }
4399 // OK — same key as Create. Headers are
4400 // already taken off `req.input` so the
4401 // backend never sees them.
4402 }
4403 (None, None, None) => {
4404 // AWS S3 spec: SSE-C headers MUST be replayed
4405 // on every UploadPart of an SSE-C multipart.
4406 // Real-S3 returns 400 InvalidRequest in this
4407 // case; mirror that.
4408 return Err(S3Error::with_message(
4409 S3ErrorCode::InvalidRequest,
4410 "SSE-C requires customer-key headers on every UploadPart (CreateMultipartUpload was SSE-C)",
4411 ));
4412 }
4413 _ => {
4414 // Partial header set (e.g. algorithm + key
4415 // but no MD5) — same handling as the
4416 // single-PUT `extract_sse_c_material` helper.
4417 return Err(S3Error::with_message(
4418 S3ErrorCode::InvalidRequest,
4419 "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
4420 ));
4421 }
4422 }
4423 } else {
4424 // CreateMultipartUpload was non-SSE-C (None / SseS4 /
4425 // SseKms). A part that arrives carrying SSE-C headers
4426 // is either a confused client or an attempt to
4427 // smuggle SSE-C around the gateway-internal SSE
4428 // recipe. Reject with 400 InvalidRequest rather than
4429 // silently strip — the strip would let the client
4430 // believe the part was encrypted under their key
4431 // when in fact the upload's encryption recipe is
4432 // whatever the Create captured.
4433 if req.input.sse_customer_algorithm.is_some()
4434 || req.input.sse_customer_key.is_some()
4435 || req.input.sse_customer_key_md5.is_some()
4436 {
4437 return Err(S3Error::with_message(
4438 S3ErrorCode::InvalidRequest,
4439 "UploadPart sent SSE-C headers but CreateMultipartUpload was not SSE-C",
4440 ));
4441 }
4442 }
4443 } else {
4444 // No upload context registered (gateway crashed between
4445 // Create and Part, or pre-#62 abandoned-upload restore).
4446 // We can't check key consistency in this case — strip
4447 // the headers and let the request through unchanged so
4448 // the backend's `NoSuchUpload` reply (or whatever it
4449 // chooses to do) flows back to the client.
4450 let _ = req.input.sse_customer_algorithm.take();
4451 let _ = req.input.sse_customer_key.take();
4452 let _ = req.input.sse_customer_key_md5.take();
4453 }
4454 let _sse_ctx = sse_ctx;
4455 if let Some(blob) = req.input.body.take() {
4456 let bytes = collect_blob(blob, self.max_body_bytes)
4457 .await
4458 .map_err(internal("collect upload_part body"))?;
4459 // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
4460 // checksum algorithms against the received part body.
4461 verify_client_body_checksums(
4462 &bytes,
4463 req.input.content_md5.as_deref(),
4464 req.input.checksum_crc32.as_deref(),
4465 req.input.checksum_crc32c.as_deref(),
4466 req.input.checksum_sha1.as_deref(),
4467 req.input.checksum_sha256.as_deref(),
4468 req.input.checksum_crc64nvme.as_deref(),
4469 )?;
4470 let sample_len = bytes.len().min(SAMPLE_BYTES);
4471 // v0.8 #56: full part body is already in memory here; use its
4472 // length as the size hint so the dispatcher can promote to GPU
4473 // if it's big enough.
4474 let codec_kind = self
4475 .dispatcher
4476 .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
4477 .await;
4478 let original_size = bytes.len() as u64;
4479 // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
4480 let (compress_res, tel) = self
4481 .registry
4482 .compress_with_telemetry(bytes, codec_kind)
4483 .await;
4484 stamp_gpu_compress_telemetry(&tel);
4485 let (compressed, manifest) =
4486 compress_res.map_err(internal("registry compress part"))?;
4487 let header = FrameHeader {
4488 codec: codec_kind,
4489 original_size,
4490 compressed_size: compressed.len() as u64,
4491 crc32c: manifest.crc32c,
4492 };
4493 let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
4494 write_frame(&mut framed, header, &compressed);
4495 // v0.2 #5: heuristic-based padding skip for likely-final parts.
4496 //
4497 // AWS SDK / aws-cli / boto3 always send the final (and only the
4498 // final) part below the configured part_size. So if the raw user
4499 // part is already smaller than S3's 5 MiB multipart minimum, this
4500 // is overwhelmingly likely to be the final part — and the final
4501 // part is exempt from S3's size constraint. Skipping padding here
4502 // saves up to ~5 MiB per object on highly compressible workloads.
4503 //
4504 // If a misbehaving client sends a tiny **non-final** part, S3
4505 // itself rejects with EntityTooSmall at CompleteMultipartUpload —
4506 // identical outcome to a vanilla S3 PUT, just earlier than
4507 // padding-then-complete would catch it.
4508 let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
4509 if !likely_final {
4510 pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
4511 }
4512 let framed_bytes = framed.freeze();
4513 let new_len = framed_bytes.len() as i64;
4514 // 同じ wire 互換問題が multipart にもある (content-length / checksum)
4515 req.input.content_length = Some(new_len);
4516 req.input.checksum_algorithm = None;
4517 req.input.checksum_crc32 = None;
4518 req.input.checksum_crc32c = None;
4519 req.input.checksum_crc64nvme = None;
4520 req.input.checksum_sha1 = None;
4521 req.input.checksum_sha256 = None;
4522 req.input.content_md5 = None;
4523 req.input.body = Some(bytes_to_blob(framed_bytes));
4524 debug!(
4525 part_number = ?req.input.part_number,
4526 upload_id = ?req.input.upload_id,
4527 original_size,
4528 framed_size = new_len,
4529 "S4 upload_part: framed compressed payload"
4530 );
4531 }
4532 self.backend.upload_part(req).await
4533 }
4534 async fn complete_multipart_upload(
4535 &self,
4536 mut req: S3Request<CompleteMultipartUploadInput>,
4537 ) -> S3Result<S3Response<CompleteMultipartUploadOutput>> {
4538 let bucket = req.input.bucket.clone();
4539 let key = req.input.key.clone();
4540 let upload_id = req.input.upload_id.clone();
4541 // v0.8.12 HIGH-9 fix: gate Complete on `s3:PutObject` (the
4542 // commit point for the multipart-assembled object).
4543 self.enforce_policy(&req, "s3:PutObject", &bucket, Some(&key))?;
4544 self.enforce_rate_limit(&req, &bucket)?;
4545 // v0.8.12 HIGH-6 fix: re-verify Object Lock on the target key
4546 // at Complete time. Without this an attacker with PutObject
4547 // permission could `CreateMultipartUpload` against a key
4548 // that's currently under retention / legal hold and silently
4549 // overwrite it on Complete (the single-PUT path runs the
4550 // same check at L2007). Compliance retention is never
4551 // bypassable; Governance only with explicit IAM permission
4552 // (HIGH-7 gate below).
4553 if let Some(mgr) = self.object_lock.as_ref()
4554 && let Some(state) = mgr.get(&bucket, &key)
4555 {
4556 // CompleteMultipartUpload doesn't carry the bypass header
4557 // (the s3s DTO matches AWS' wire schema). A locked key
4558 // therefore cannot be overwritten by Complete regardless
4559 // of caller permission — operators who need to break a
4560 // Governance lock do it via PutObjectRetention before
4561 // calling Complete.
4562 let now = chrono::Utc::now();
4563 if !state.can_delete(now, false) {
4564 crate::metrics::record_policy_denial("s3:PutObject", &bucket);
4565 return Err(S3Error::with_message(
4566 S3ErrorCode::AccessDenied,
4567 "Access Denied because target key is protected by object lock",
4568 ));
4569 }
4570 }
4571 // v0.8.1 #59: serialise concurrent Complete invocations on the
4572 // same `(bucket, key)`. The race window the lock closes is the
4573 // GET-assembled-body → encrypt → PUT-encrypted-body triple
4574 // below (BUG-5 fix); without serialisation, two Completes for
4575 // different `upload_id` but the same logical key could each
4576 // read the other's plaintext assembled body and overwrite the
4577 // peer's encrypted result. The guard is held to function exit
4578 // (drop on `Ok` / `Err`), covering version-id mint, object-
4579 // lock apply, tagging persist, and replication enqueue too.
4580 let completion_lock = self.multipart_state.completion_lock(&bucket, &key);
4581 let _completion_guard = completion_lock.lock().await;
4582 // v0.8 #54 — fetch the per-upload context captured on Create.
4583 // `None` means an abandoned / unknown upload_id (gateway
4584 // crashed between Create and Complete, or pre-v0.8 state
4585 // restore); we still let the backend do its thing for
4586 // transparency, but we can't apply any SSE / version / lock /
4587 // tag / replication post-processing because we never captured
4588 // the recipe.
4589 let ctx = self.multipart_state.get(upload_id.as_str());
4590 // v0.8 #54 BUG-10 fix: same SSE-C header strip as upload_part
4591 // — some clients (boto3 / aws-sdk-cpp older versions) replay
4592 // the SSE-C triple on Complete too, and MinIO will choke if
4593 // they reach the backend.
4594 let _ = req.input.sse_customer_algorithm.take();
4595 let _ = req.input.sse_customer_key.take();
4596 let _ = req.input.sse_customer_key_md5.take();
4597 let mut resp = self.backend.complete_multipart_upload(req).await?;
4598 // CompleteMultipartUpload 成功 → 完成した object を full fetch して frame
4599 // index を build、`<key>.s4index` sidecar として保存。これで Range GET の
4600 // partial fetch path が利用可能になる (Range request の帯域節約)。
4601 // 注: 巨大 object の場合この pass は重いが、Range query は一度 sidecar が
4602 // できれば爆速になるので 1 回の cost は payback される
4603 //
4604 // v0.8 #54 BUG-5..9: this same fetch is the choke-point for
4605 // the SSE encrypt re-PUT + versioning shadow-key rewrite +
4606 // replication source-bytes capture, so we GET once and reuse
4607 // the bytes for every post-processing step.
4608 let assembled_body: Option<bytes::Bytes> = if let Ok(uri) = safe_object_uri(&bucket, &key) {
4609 let get_input = GetObjectInput {
4610 bucket: bucket.clone(),
4611 key: key.clone(),
4612 ..Default::default()
4613 };
4614 let get_req = S3Request {
4615 input: get_input,
4616 method: http::Method::GET,
4617 uri,
4618 headers: http::HeaderMap::new(),
4619 extensions: http::Extensions::new(),
4620 credentials: None,
4621 region: None,
4622 service: None,
4623 trailing_headers: None,
4624 };
4625 match self.backend.get_object(get_req).await {
4626 Ok(get_resp) => match get_resp.output.body {
4627 Some(blob) => collect_blob(blob, self.max_body_bytes).await.ok(),
4628 None => None,
4629 },
4630 Err(e) => {
4631 // v0.8.4 #71 (C-1 audit fix): a silent
4632 // `Err(_) => None` here is a SSE plaintext
4633 // leak. The post-processing block below only
4634 // runs the SSE re-encrypt branch when
4635 // `assembled_body.is_some()`, so swallowing a
4636 // backend error skipped the encrypt step and
4637 // left the multipart object on disk as
4638 // plaintext, even on SSE-S4 / SSE-C / SSE-KMS
4639 // configured buckets. Same root-cause family
4640 // as v0.8 BUG-5; this branch closes the
4641 // remaining read-side window.
4642 //
4643 // We distinguish two cases:
4644 // - `NoSuchKey`: the object is genuinely
4645 // missing post-Complete. This is rare and
4646 // typically races with a concurrent
4647 // DeleteObject; there is nothing to re-
4648 // encrypt and no SSE markers to honour, so
4649 // falling through to the legacy
4650 // `assembled_body = None` path is safe.
4651 // - everything else (5xx, network, auth,
4652 // etc.): we must FAIL the Complete so the
4653 // client can retry. Returning Ok with
4654 // `assembled_body = None` would silently
4655 // skip the SSE re-encrypt and leave the
4656 // backend bytes plaintext.
4657 if matches!(e.code(), &S3ErrorCode::NoSuchKey) {
4658 tracing::warn!(
4659 bucket = %bucket,
4660 key = %key,
4661 "multipart Complete: backend GET returned NoSuchKey; \
4662 skipping post-processing (object likely raced with DeleteObject)"
4663 );
4664 None
4665 } else {
4666 tracing::error!(
4667 bucket = %bucket,
4668 key = %key,
4669 error = %e,
4670 "multipart Complete: backend GET failed; failing the Complete \
4671 so the client retries (silent fall-through would skip SSE \
4672 re-encrypt and store plaintext)"
4673 );
4674 return Err(internal("multipart Complete: backend body fetch failed")(e));
4675 }
4676 }
4677 }
4678 } else {
4679 None
4680 };
4681 // Sidecar build (existing behaviour, gated on assembled body).
4682 //
4683 // v0.8.12 HIGH-10 fix: skip the sidecar when the Complete is
4684 // going to SSE-encrypt the assembled body before re-PUT (the
4685 // single-PUT path applies the same suppression at L2271).
4686 // Stale offsets into the pre-encrypt body would break Range
4687 // GET on the encrypted on-disk bytes. `ctx.sse != None`
4688 // covers all three SSE modes captured at Create time.
4689 let mp_will_encrypt = ctx
4690 .as_ref()
4691 .map(|c| !matches!(c.sse, crate::multipart_state::MultipartSseMode::None))
4692 .unwrap_or(false);
4693 if let Some(ref body) = assembled_body
4694 && !mp_will_encrypt
4695 && let Ok(index) = build_index_from_body(body)
4696 {
4697 self.write_sidecar(&bucket, &key, &index).await;
4698 }
4699 // From here on, post-processing depends on the context —
4700 // short-circuit when the upload had no captured recipe
4701 // (legacy / crashed-Create / pre-v0.8 state restore).
4702 if let Some(ctx) = ctx {
4703 // v0.8 #54 BUG-6 fix: mint a version-id when the bucket
4704 // is versioning-Enabled. The single-PUT path does this in
4705 // `put_object` ~L1968; multipart was the missing branch.
4706 // We mint here (post-Complete, before any re-PUT) so the
4707 // same vid threads into both the shadow-key rewrite and
4708 // the VersionEntry the manager records.
4709 let pending_version: Option<crate::versioning::PutOutcome> = self
4710 .versioning
4711 .as_ref()
4712 .map(|mgr| mgr.state(&bucket))
4713 .map(|state| match state {
4714 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
4715 version_id: crate::versioning::VersioningManager::new_version_id(),
4716 versioned_response: true,
4717 },
4718 crate::versioning::VersioningState::Suspended
4719 | crate::versioning::VersioningState::Unversioned => {
4720 crate::versioning::PutOutcome {
4721 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
4722 versioned_response: false,
4723 }
4724 }
4725 });
4726 // v0.8 #54 BUG-5 fix: encrypt the assembled framed body
4727 // and re-PUT it to the backend so the on-disk bytes are
4728 // SSE-encrypted. The single-PUT path does this body-by-
4729 // body inside `put_object` (L1907-L1942); for multipart,
4730 // encrypt-per-part would require a multi-segment decrypt
4731 // path on GET — we instead do a single encrypt over the
4732 // assembled framed body so the existing GET decrypt
4733 // branch (`is_sse_encrypted` → `decrypt(body, source)` →
4734 // FrameIter) handles it unchanged.
4735 //
4736 // The cost is one extra round-trip per Complete for SSE-
4737 // enabled multipart (already-paid for the sidecar build).
4738 // For single-instance gateways pointing at a co-located
4739 // backend this is negligible; cross-region operators
4740 // would benefit from per-part encrypt + multi-segment
4741 // decrypt as a follow-up.
4742 let needs_re_put = matches!(
4743 ctx.sse,
4744 crate::multipart_state::MultipartSseMode::SseS4
4745 | crate::multipart_state::MultipartSseMode::SseC { .. }
4746 | crate::multipart_state::MultipartSseMode::SseKms { .. }
4747 ) || pending_version
4748 .as_ref()
4749 .map(|pv| pv.versioned_response)
4750 .unwrap_or(false);
4751 // v0.8.11 CRIT-2 fix: seed the replication body with the
4752 // pre-encrypt assembled bytes, but overwrite it with the
4753 // post-encrypt `new_body` once the re-PUT branch lands.
4754 // The previous "snapshot in advance" pattern shipped the
4755 // *plaintext* framed body to the destination bucket even
4756 // when SSE-S4 / SSE-C / SSE-KMS was active — the GET on
4757 // the destination would then fail to decrypt (or, worse,
4758 // succeed in handing out plaintext that the source had
4759 // promised was encrypted at rest). When `needs_re_put`
4760 // is false (no SSE, no versioning), the backend still
4761 // holds the original plaintext-framed bytes, and the
4762 // seed value is what the destination should receive.
4763 let mut replication_body = assembled_body.clone();
4764 let mut applied_metadata: Option<std::collections::HashMap<String, String>> = None;
4765 if needs_re_put && let Some(body) = assembled_body {
4766 // v0.8.1 #58: same Zeroizing pattern as put_object's
4767 // single-PUT KMS branch — DEK plaintext lives in
4768 // `Zeroizing<[u8; 32]>` for the lifetime of this
4769 // Complete handler, then is wiped on drop.
4770 let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
4771 if let crate::multipart_state::MultipartSseMode::SseKms { ref key_id } = ctx.sse
4772 {
4773 let kms = self.kms.as_ref().ok_or_else(|| {
4774 S3Error::with_message(
4775 S3ErrorCode::InvalidRequest,
4776 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
4777 )
4778 })?;
4779 let (dek, wrapped) =
4780 kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
4781 if dek.len() != 32 {
4782 return Err(S3Error::with_message(
4783 S3ErrorCode::InternalError,
4784 format!(
4785 "KMS backend returned a DEK of {} bytes (expected 32)",
4786 dek.len()
4787 ),
4788 ));
4789 }
4790 let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
4791 zeroize::Zeroizing::new([0u8; 32]);
4792 dek_arr.copy_from_slice(&dek);
4793 // `dek` (Zeroizing<Vec<u8>>) is dropped at scope end.
4794 Some((dek_arr, wrapped))
4795 } else {
4796 None
4797 };
4798 // Build the new metadata map: re-fetch via HEAD so
4799 // the multipart / codec markers the backend stamped
4800 // on Create flow through unchanged, then layer the
4801 // SSE markers on top.
4802 let head_req = S3Request {
4803 input: HeadObjectInput {
4804 bucket: bucket.clone(),
4805 key: key.clone(),
4806 ..Default::default()
4807 },
4808 method: http::Method::HEAD,
4809 uri: safe_object_uri(&bucket, &key)?,
4810 headers: http::HeaderMap::new(),
4811 extensions: http::Extensions::new(),
4812 credentials: None,
4813 region: None,
4814 service: None,
4815 trailing_headers: None,
4816 };
4817 let mut new_metadata: std::collections::HashMap<String, String> =
4818 match self.backend.head_object(head_req).await {
4819 Ok(h) => h.output.metadata.unwrap_or_default(),
4820 Err(_) => std::collections::HashMap::new(),
4821 };
4822 let new_body = match &ctx.sse {
4823 crate::multipart_state::MultipartSseMode::SseC { key, key_md5 } => {
4824 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
4825 new_metadata.insert("s4-sse-type".into(), "AES256".into());
4826 new_metadata.insert(
4827 "s4-sse-c-key-md5".into(),
4828 base64::engine::general_purpose::STANDARD.encode(key_md5),
4829 );
4830 // v0.8.2 #62: `key` is `&Zeroizing<[u8; 32]>`;
4831 // auto-deref through one explicit binding so
4832 // `SseSource::CustomerKey` gets the `&[u8; 32]`
4833 // it expects (mirrors the SSE-KMS DEK shape
4834 // a few lines down).
4835 let key_ref: &[u8; 32] = key;
4836 crate::sse::encrypt_with_source(
4837 &body,
4838 crate::sse::SseSource::CustomerKey {
4839 key: key_ref,
4840 key_md5,
4841 },
4842 )
4843 }
4844 crate::multipart_state::MultipartSseMode::SseKms { .. } => {
4845 let (dek, wrapped) = kms_wrap
4846 .as_ref()
4847 .expect("SseKms branch implies kms_wrap is Some");
4848 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
4849 new_metadata.insert("s4-sse-type".into(), "aws:kms".into());
4850 new_metadata.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
4851 // v0.8.1 #58: auto-deref from `&Zeroizing<[u8; 32]>`
4852 // to `&[u8; 32]` (same shape as the put_object
4853 // single-PUT branch).
4854 let dek_ref: &[u8; 32] = dek;
4855 crate::sse::encrypt_with_source(
4856 &body,
4857 crate::sse::SseSource::Kms {
4858 dek: dek_ref,
4859 wrapped,
4860 },
4861 )
4862 }
4863 crate::multipart_state::MultipartSseMode::SseS4 => {
4864 let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
4865 S3Error::with_message(
4866 S3ErrorCode::InternalError,
4867 "SSE-S4 captured at Create but keyring missing at Complete",
4868 )
4869 })?;
4870 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
4871 // SSE-S4 deliberately omits `s4-sse-type` so
4872 // HEAD doesn't falsely advertise AWS-style
4873 // SSE-S3 (matches the put_object L1929-L1939
4874 // comment).
4875 // v0.8 #52: same chunk_size dispatch as the
4876 // single-PUT branch — multipart Complete
4877 // re-encrypts the assembled body, so honoring
4878 // the chunked path here is required to keep
4879 // GET streaming on multipart-uploaded objects.
4880 if self.sse_chunk_size > 0 {
4881 crate::sse::encrypt_v2_chunked(&body, keyring, self.sse_chunk_size)
4882 .map_err(|e| {
4883 S3Error::with_message(
4884 S3ErrorCode::InternalError,
4885 format!("SSE-S4 chunked encrypt failed at Complete: {e}"),
4886 )
4887 })?
4888 } else {
4889 crate::sse::encrypt_v2(&body, keyring)
4890 }
4891 }
4892 crate::multipart_state::MultipartSseMode::None => body.clone(),
4893 };
4894 // v0.8 #54 BUG-6 fix: write the re-PUT under the
4895 // shadow key so the version chain doesn't overwrite
4896 // the previous version on a versioned bucket. The
4897 // original (unshadowed) key was assembled by the
4898 // backend on Complete; we delete it after the shadow
4899 // PUT lands.
4900 let put_target_key = if let Some(pv) = pending_version.as_ref() {
4901 if pv.versioned_response {
4902 versioned_shadow_key(&key, &pv.version_id)
4903 } else {
4904 key.clone()
4905 }
4906 } else {
4907 key.clone()
4908 };
4909 let new_body_len = new_body.len() as i64;
4910 let put_req = S3Request {
4911 input: PutObjectInput {
4912 bucket: bucket.clone(),
4913 key: put_target_key.clone(),
4914 body: Some(bytes_to_blob(new_body.clone())),
4915 metadata: Some(new_metadata.clone()),
4916 content_length: Some(new_body_len),
4917 ..Default::default()
4918 },
4919 method: http::Method::PUT,
4920 uri: safe_object_uri(&bucket, &put_target_key)?,
4921 headers: http::HeaderMap::new(),
4922 extensions: http::Extensions::new(),
4923 credentials: None,
4924 region: None,
4925 service: None,
4926 trailing_headers: None,
4927 };
4928 self.backend.put_object(put_req).await?;
4929 // v0.8.11 CRIT-2 fix: refresh the replication snapshot
4930 // with the bytes that were actually persisted to the
4931 // backend (post-SSE-encrypt for SSE modes; identical to
4932 // `body` for `MultipartSseMode::None` + versioning-only
4933 // re-PUT). The destination then sees the same on-disk
4934 // shape the source does, and a destination GET decrypts
4935 // correctly when SSE is on.
4936 replication_body = Some(new_body.clone());
4937 // If we rewrote the storage key (versioning shadow),
4938 // we must drop the original (unshadowed) Complete-
4939 // assembled bytes so subsequent listings don't see a
4940 // duplicate.
4941 if put_target_key != key {
4942 let del_req = S3Request {
4943 input: DeleteObjectInput {
4944 bucket: bucket.clone(),
4945 key: key.clone(),
4946 ..Default::default()
4947 },
4948 method: http::Method::DELETE,
4949 uri: safe_object_uri(&bucket, &key)?,
4950 headers: http::HeaderMap::new(),
4951 extensions: http::Extensions::new(),
4952 credentials: None,
4953 region: None,
4954 service: None,
4955 trailing_headers: None,
4956 };
4957 let _ = self.backend.delete_object(del_req).await;
4958 }
4959 applied_metadata = Some(new_metadata);
4960 }
4961 // v0.8 #54 BUG-6 commit: register the new version with
4962 // the VersioningManager so list_object_versions /
4963 // GET ?versionId= see it.
4964 if let (Some(mgr), Some(pv)) = (self.versioning.as_ref(), pending_version.as_ref()) {
4965 let etag = resp
4966 .output
4967 .e_tag
4968 .clone()
4969 .map(ETag::into_value)
4970 .unwrap_or_default();
4971 let now = chrono::Utc::now();
4972 mgr.commit_put_with_version(
4973 &bucket,
4974 &key,
4975 crate::versioning::VersionEntry {
4976 version_id: pv.version_id.clone(),
4977 etag,
4978 size: replication_body
4979 .as_ref()
4980 .map(|b| b.len() as u64)
4981 .unwrap_or(0),
4982 is_delete_marker: false,
4983 created_at: now,
4984 },
4985 );
4986 if pv.versioned_response {
4987 resp.output.version_id = Some(pv.version_id.clone());
4988 }
4989 }
4990 // v0.8 #54 BUG-7 fix: persist any per-upload Object Lock
4991 // recipe + auto-apply the bucket default. Mirrors the
4992 // put_object L2057-L2074 block.
4993 if let Some(mgr) = self.object_lock.as_ref() {
4994 if ctx.object_lock_mode.is_some()
4995 || ctx.object_lock_retain_until.is_some()
4996 || ctx.object_lock_legal_hold
4997 {
4998 let mut state = mgr.get(&bucket, &key).unwrap_or_default();
4999 if let Some(m) = ctx.object_lock_mode {
5000 state.mode = Some(m);
5001 }
5002 if let Some(u) = ctx.object_lock_retain_until {
5003 state.retain_until = Some(u);
5004 }
5005 if ctx.object_lock_legal_hold {
5006 state.legal_hold_on = true;
5007 }
5008 mgr.set(&bucket, &key, state);
5009 }
5010 mgr.apply_default_on_put(&bucket, &key, chrono::Utc::now());
5011 }
5012 // v0.8 #54 BUG-9 fix: persist the captured tags via the
5013 // TagManager so GetObjectTagging returns them.
5014 if let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), ctx.tags.as_ref()) {
5015 mgr.put_object_tags(&bucket, &key, tags.clone());
5016 }
5017 // SSE-C / SSE-KMS response echo. The
5018 // CompleteMultipartUploadOutput only exposes
5019 // `server_side_encryption` + `ssekms_key_id` (no
5020 // sse_customer_* — those round-tripped on Create / parts).
5021 match &ctx.sse {
5022 crate::multipart_state::MultipartSseMode::SseC { .. } => {
5023 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5024 ServerSideEncryption::AES256,
5025 ));
5026 }
5027 crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
5028 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5029 ServerSideEncryption::AWS_KMS,
5030 ));
5031 resp.output.ssekms_key_id = Some(key_id.clone());
5032 }
5033 _ => {}
5034 }
5035 // v0.8 #54 BUG-8 fix: fire cross-bucket replication just
5036 // like put_object L2165 does. We hand the dispatcher the
5037 // assembled body bytes (post-encrypt where applicable, so
5038 // the destination ends up byte-identical to the source's
5039 // on-disk shape) plus the metadata that was actually
5040 // committed.
5041 let replication_body_bytes = replication_body.unwrap_or_default();
5042 // v0.8.2 #61: thread the multipart-Complete `pending_version`
5043 // through so a versioning-Enabled source's destination
5044 // receives the same shadow-key path (mirror of the
5045 // single-PUT branch above).
5046 self.spawn_replication_if_matched(
5047 &bucket,
5048 &key,
5049 &ctx.tags,
5050 &replication_body_bytes,
5051 &applied_metadata,
5052 true,
5053 pending_version.as_ref(),
5054 );
5055 self.multipart_state.remove(upload_id.as_str());
5056 }
5057 // v0.8.1 #59 janitor: best-effort sweep of stale completion
5058 // locks while we are still on the critical path of a single
5059 // Complete (so steady-state workloads of unique keys don't
5060 // accumulate `DashMap` entries). The sweep only retires
5061 // entries whose `Arc::strong_count == 1`, so any other in-
5062 // flight Complete on a different key keeps its lock alive.
5063 // Our own `_completion_guard` keeps `bucket`/`key`'s entry
5064 // alive across this call; it's reaped on the next Complete or
5065 // the next caller-driven prune.
5066 self.multipart_state.prune_completion_locks();
5067 Ok(resp)
5068 }
5069 async fn abort_multipart_upload(
5070 &self,
5071 req: S3Request<AbortMultipartUploadInput>,
5072 ) -> S3Result<S3Response<AbortMultipartUploadOutput>> {
5073 // v0.8.12 HIGH-9 fix: gate Abort on `s3:AbortMultipartUpload`
5074 // — the AWS-spec action verb for this operation. Without the
5075 // gate, anyone who could guess an upload_id could throw away
5076 // someone else's in-flight multipart upload.
5077 let abort_bucket = req.input.bucket.clone();
5078 let abort_key = req.input.key.clone();
5079 self.enforce_policy(
5080 &req,
5081 "s3:AbortMultipartUpload",
5082 &abort_bucket,
5083 Some(&abort_key),
5084 )?;
5085 // v0.8 #54: drop the per-upload state (SSE-C key bytes / tag
5086 // set) promptly so an aborted upload doesn't leak the
5087 // customer's key into a long-running gateway's RSS.
5088 //
5089 // v0.8.4 #71 (H-7 audit fix): backend.abort_multipart_upload
5090 // FIRST, then drop in-process state ONLY on success. The
5091 // previous order ("remove → call backend") meant a transient
5092 // backend abort failure (5xx, network) wiped the SSE-C key
5093 // bytes locally while leaving the parts on the backend, so a
5094 // client retry would have to re-validate the SSE-C key against
5095 // a context the gateway no longer has — and the retried abort
5096 // would still hit the unaborted backend parts. Calling the
5097 // backend first lets the failure propagate to the client with
5098 // state intact for a clean retry; only on success do we wipe
5099 // the local state.
5100 let upload_id = req.input.upload_id.as_str().to_owned();
5101 let resp = self.backend.abort_multipart_upload(req).await?;
5102 self.multipart_state.remove(&upload_id);
5103 Ok(resp)
5104 }
5105 async fn list_multipart_uploads(
5106 &self,
5107 req: S3Request<ListMultipartUploadsInput>,
5108 ) -> S3Result<S3Response<ListMultipartUploadsOutput>> {
5109 self.backend.list_multipart_uploads(req).await
5110 }
5111 async fn list_parts(
5112 &self,
5113 req: S3Request<ListPartsInput>,
5114 ) -> S3Result<S3Response<ListPartsOutput>> {
5115 self.backend.list_parts(req).await
5116 }
5117
// =========================================================================
// Phase 2 — pure passthrough delegations。S4 はこれらに対して圧縮 hook を
// 持たないので、backend (= AWS S3) の動作と完全に同一。
//
// 既知の制限事項:
// - copy_object / upload_part_copy: source object が S4-compressed の場合、
//   backend が bytes を copy するだけなので metadata (s4-codec etc) も一緒に
//   copied される (AWS S3 default = MetadataDirective COPY)。GET は manifest
//   経由で正しく decompress できる。MetadataDirective REPLACE で上書き
//   されると圧縮 metadata が消えて壊れる — 顧客側の運用で注意
// - list_object_versions: versioning enabled bucket では各 version も S4
//   metadata を維持する。古い version も S4 経由で正しく GET できる。
// =========================================================================
5131
5132 // ---- Object ACL / tagging / attributes ----
5133 async fn get_object_acl(
5134 &self,
5135 req: S3Request<GetObjectAclInput>,
5136 ) -> S3Result<S3Response<GetObjectAclOutput>> {
5137 self.backend.get_object_acl(req).await
5138 }
5139 async fn put_object_acl(
5140 &self,
5141 req: S3Request<PutObjectAclInput>,
5142 ) -> S3Result<S3Response<PutObjectAclOutput>> {
5143 self.backend.put_object_acl(req).await
5144 }
5145 // v0.6 #39: object tagging — when a `TagManager` is attached the
5146 // configuration / per-(bucket, key) state lives in the manager and
5147 // these handlers serve directly from it; when no manager is
5148 // attached they fall back to the backend (legacy passthrough so
5149 // v0.5 deployments are unaffected).
5150 async fn get_object_tagging(
5151 &self,
5152 req: S3Request<GetObjectTaggingInput>,
5153 ) -> S3Result<S3Response<GetObjectTaggingOutput>> {
5154 let Some(mgr) = self.tagging.as_ref() else {
5155 return self.backend.get_object_tagging(req).await;
5156 };
5157 let tags = mgr
5158 .get_object_tags(&req.input.bucket, &req.input.key)
5159 .unwrap_or_default();
5160 Ok(S3Response::new(GetObjectTaggingOutput {
5161 tag_set: tagset_to_aws(&tags),
5162 ..Default::default()
5163 }))
5164 }
5165 async fn put_object_tagging(
5166 &self,
5167 req: S3Request<PutObjectTaggingInput>,
5168 ) -> S3Result<S3Response<PutObjectTaggingOutput>> {
5169 let Some(mgr) = self.tagging.as_ref() else {
5170 return self.backend.put_object_tagging(req).await;
5171 };
5172 let bucket = req.input.bucket.clone();
5173 let key = req.input.key.clone();
5174 let parsed = aws_to_tagset(&req.input.tagging.tag_set)
5175 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
5176 // v0.6 #39: gate via IAM policy with both the request tags
5177 // (`s3:RequestObjectTag/<key>`) and any existing tags on the
5178 // target object (`s3:ExistingObjectTag/<key>`).
5179 let existing = mgr.get_object_tags(&bucket, &key);
5180 self.enforce_policy_with_extra(
5181 &req,
5182 "s3:PutObjectTagging",
5183 &bucket,
5184 Some(&key),
5185 Some(&parsed),
5186 existing.as_ref(),
5187 )?;
5188 mgr.put_object_tags(&bucket, &key, parsed);
5189 Ok(S3Response::new(PutObjectTaggingOutput::default()))
5190 }
5191 async fn delete_object_tagging(
5192 &self,
5193 req: S3Request<DeleteObjectTaggingInput>,
5194 ) -> S3Result<S3Response<DeleteObjectTaggingOutput>> {
5195 let Some(mgr) = self.tagging.as_ref() else {
5196 return self.backend.delete_object_tagging(req).await;
5197 };
5198 let bucket = req.input.bucket.clone();
5199 let key = req.input.key.clone();
5200 let existing = mgr.get_object_tags(&bucket, &key);
5201 self.enforce_policy_with_extra(
5202 &req,
5203 "s3:DeleteObjectTagging",
5204 &bucket,
5205 Some(&key),
5206 None,
5207 existing.as_ref(),
5208 )?;
5209 mgr.delete_object_tags(&bucket, &key);
5210 Ok(S3Response::new(DeleteObjectTaggingOutput::default()))
5211 }
5212 async fn get_object_attributes(
5213 &self,
5214 req: S3Request<GetObjectAttributesInput>,
5215 ) -> S3Result<S3Response<GetObjectAttributesOutput>> {
5216 self.backend.get_object_attributes(req).await
5217 }
5218 async fn restore_object(
5219 &self,
5220 req: S3Request<RestoreObjectInput>,
5221 ) -> S3Result<S3Response<RestoreObjectOutput>> {
5222 self.backend.restore_object(req).await
5223 }
5224 async fn upload_part_copy(
5225 &self,
5226 req: S3Request<UploadPartCopyInput>,
5227 ) -> S3Result<S3Response<UploadPartCopyOutput>> {
5228 // v0.8.12 HIGH-9 fix: same per-action gates as `copy_object` —
5229 // destination PUT + source GET.
5230 let dst_bucket = req.input.bucket.clone();
5231 let dst_key = req.input.key.clone();
5232 self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
5233 if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
5234 self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
5235 }
5236 self.enforce_rate_limit(&req, &dst_bucket)?;
5237 // v0.2 #6: byte-range aware copy when the source is S4-framed.
5238 //
5239 // For a framed source (multipart upload OR single-PUT framed-v2),
5240 // a naive byte-range passthrough would copy compressed bytes that
5241 // don't align with S4 frame boundaries — silently corrupting the
5242 // result. Instead we GET the source through S4 (which handles
5243 // decompression + Range), re-compress + re-frame as a new part,
5244 // and forward as upload_part. For non-framed sources (S4-untouched
5245 // raw objects), passthrough is correct and we keep the original
5246 // (cheaper) code path.
5247 // v0.8.4 #74: propagate the optional `?versionId=<vid>` from the
5248 // copy-source header. Without this, a versioned source bucket
5249 // copy that pins a specific old version would silently fall
5250 // back to "latest", assembling wrong bytes into the destination
5251 // multipart object (silent data corruption).
5252 let CopySource::Bucket {
5253 bucket: src_bucket,
5254 key: src_key,
5255 version_id: src_version_id,
5256 } = &req.input.copy_source
5257 else {
5258 return self.backend.upload_part_copy(req).await;
5259 };
5260 let src_bucket = src_bucket.to_string();
5261 let src_key = src_key.to_string();
5262 let src_version_id: Option<String> = src_version_id.as_deref().map(str::to_owned);
5263
5264 // Probe metadata to decide whether the source needs S4-aware copy.
5265 let head_input = HeadObjectInput {
5266 bucket: src_bucket.clone(),
5267 key: src_key.clone(),
5268 version_id: src_version_id.clone(),
5269 ..Default::default()
5270 };
5271 let head_req = S3Request {
5272 input: head_input,
5273 method: http::Method::HEAD,
5274 uri: req.uri.clone(),
5275 headers: req.headers.clone(),
5276 extensions: http::Extensions::new(),
5277 credentials: req.credentials.clone(),
5278 region: req.region.clone(),
5279 service: req.service.clone(),
5280 trailing_headers: None,
5281 };
5282 let needs_s4_copy = match self.backend.head_object(head_req).await {
5283 Ok(h) => {
5284 is_multipart_object(&h.output.metadata) || is_framed_v2_object(&h.output.metadata)
5285 }
5286 Err(_) => false,
5287 };
5288 if !needs_s4_copy {
5289 return self.backend.upload_part_copy(req).await;
5290 }
5291
5292 // Resolve the optional source byte range to pass to GET.
5293 let source_range = req
5294 .input
5295 .copy_source_range
5296 .as_ref()
5297 .map(|r| parse_copy_source_range(r))
5298 .transpose()
5299 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
5300
5301 // GET source via S4 (handles decompression + sidecar partial fetch
5302 // when range is present). The result is the requested user-visible
5303 // byte range, fully decompressed. version_id is propagated so
5304 // pinned-version copies fetch the exact version requested.
5305 let mut get_input = GetObjectInput {
5306 bucket: src_bucket.clone(),
5307 key: src_key.clone(),
5308 version_id: src_version_id.clone(),
5309 ..Default::default()
5310 };
5311 get_input.range = source_range;
5312 let get_req = S3Request {
5313 input: get_input,
5314 method: http::Method::GET,
5315 uri: req.uri.clone(),
5316 headers: req.headers.clone(),
5317 extensions: http::Extensions::new(),
5318 credentials: req.credentials.clone(),
5319 region: req.region.clone(),
5320 service: req.service.clone(),
5321 trailing_headers: None,
5322 };
5323 let get_resp = self.get_object(get_req).await?;
5324 let blob = get_resp.output.body.ok_or_else(|| {
5325 S3Error::with_message(
5326 S3ErrorCode::InternalError,
5327 "upload_part_copy: empty body from source GET",
5328 )
5329 })?;
5330 let bytes = collect_blob(blob, self.max_body_bytes)
5331 .await
5332 .map_err(internal("collect upload_part_copy source body"))?;
5333
5334 // Compress + frame as a fresh part (mirrors upload_part path).
5335 let sample_len = bytes.len().min(SAMPLE_BYTES);
5336 // v0.8 #56: same size-hint promotion as the upload_part path.
5337 let codec_kind = self
5338 .dispatcher
5339 .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
5340 .await;
5341 let original_size = bytes.len() as u64;
5342 // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
5343 let (compress_res, tel) = self
5344 .registry
5345 .compress_with_telemetry(bytes, codec_kind)
5346 .await;
5347 stamp_gpu_compress_telemetry(&tel);
5348 let (compressed, manifest) =
5349 compress_res.map_err(internal("registry compress upload_part_copy"))?;
5350 let header = FrameHeader {
5351 codec: codec_kind,
5352 original_size,
5353 compressed_size: compressed.len() as u64,
5354 crc32c: manifest.crc32c,
5355 };
5356 let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
5357 write_frame(&mut framed, header, &compressed);
5358 let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
5359 if !likely_final {
5360 pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
5361 }
5362 let framed_bytes = framed.freeze();
5363 let framed_len = framed_bytes.len() as i64;
5364
5365 // Forward as upload_part to the destination multipart upload.
5366 let part_input = UploadPartInput {
5367 bucket: req.input.bucket.clone(),
5368 key: req.input.key.clone(),
5369 part_number: req.input.part_number,
5370 upload_id: req.input.upload_id.clone(),
5371 body: Some(bytes_to_blob(framed_bytes)),
5372 content_length: Some(framed_len),
5373 ..Default::default()
5374 };
5375 let part_req = S3Request {
5376 input: part_input,
5377 method: http::Method::PUT,
5378 uri: req.uri.clone(),
5379 headers: req.headers.clone(),
5380 extensions: http::Extensions::new(),
5381 credentials: req.credentials.clone(),
5382 region: req.region.clone(),
5383 service: req.service.clone(),
5384 trailing_headers: None,
5385 };
5386 let upload_resp = self.backend.upload_part(part_req).await?;
5387
5388 let copy_output = UploadPartCopyOutput {
5389 copy_part_result: Some(CopyPartResult {
5390 e_tag: upload_resp.output.e_tag.clone(),
5391 ..Default::default()
5392 }),
5393 ..Default::default()
5394 };
5395 Ok(S3Response::new(copy_output))
5396 }
5397
5398 // ---- Object lock / retention / legal hold (v0.5 #30) ----
5399 //
5400 // When an `ObjectLockManager` is attached the configuration / per-object
5401 // state lives in the manager and these handlers serve directly from it;
5402 // when no manager is attached they fall back to the backend (legacy
5403 // passthrough so v0.4 deployments are unaffected).
5404 async fn get_object_lock_configuration(
5405 &self,
5406 req: S3Request<GetObjectLockConfigurationInput>,
5407 ) -> S3Result<S3Response<GetObjectLockConfigurationOutput>> {
5408 self.enforce_policy(
5409 &req,
5410 "s3:GetBucketObjectLockConfiguration",
5411 &req.input.bucket,
5412 None,
5413 )?;
5414 if let Some(mgr) = self.object_lock.as_ref() {
5415 let cfg = mgr
5416 .bucket_default(&req.input.bucket)
5417 .map(|d| ObjectLockConfiguration {
5418 object_lock_enabled: Some(ObjectLockEnabled::from_static(
5419 ObjectLockEnabled::ENABLED,
5420 )),
5421 rule: Some(ObjectLockRule {
5422 default_retention: Some(DefaultRetention {
5423 days: Some(d.retention_days as i32),
5424 mode: Some(ObjectLockRetentionMode::from_static(match d.mode {
5425 crate::object_lock::LockMode::Governance => {
5426 ObjectLockRetentionMode::GOVERNANCE
5427 }
5428 crate::object_lock::LockMode::Compliance => {
5429 ObjectLockRetentionMode::COMPLIANCE
5430 }
5431 })),
5432 years: None,
5433 }),
5434 }),
5435 });
5436 let output = GetObjectLockConfigurationOutput {
5437 object_lock_configuration: cfg,
5438 };
5439 return Ok(S3Response::new(output));
5440 }
5441 self.backend.get_object_lock_configuration(req).await
5442 }
5443 async fn put_object_lock_configuration(
5444 &self,
5445 req: S3Request<PutObjectLockConfigurationInput>,
5446 ) -> S3Result<S3Response<PutObjectLockConfigurationOutput>> {
5447 self.enforce_policy(
5448 &req,
5449 "s3:PutBucketObjectLockConfiguration",
5450 &req.input.bucket,
5451 None,
5452 )?;
5453 if let Some(mgr) = self.object_lock.as_ref() {
5454 let bucket = req.input.bucket.clone();
5455 if let Some(cfg) = req.input.object_lock_configuration.as_ref()
5456 && let Some(rule) = cfg.rule.as_ref()
5457 && let Some(d) = rule.default_retention.as_ref()
5458 {
5459 let mode = d
5460 .mode
5461 .as_ref()
5462 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()))
5463 .ok_or_else(|| {
5464 S3Error::with_message(
5465 S3ErrorCode::InvalidRequest,
5466 "Object Lock default retention requires a valid Mode (GOVERNANCE | COMPLIANCE)",
5467 )
5468 })?;
5469 // S3 spec: exactly one of Days / Years (we accept Days
5470 // outright and convert Years → Days for storage; Years
5471 // is just a UX shorthand on the wire).
5472 let days: u32 = match (d.days, d.years) {
5473 (Some(d), None) if d > 0 => d as u32,
5474 (None, Some(y)) if y > 0 => (y as u32).saturating_mul(365),
5475 _ => {
5476 return Err(S3Error::with_message(
5477 S3ErrorCode::InvalidRequest,
5478 "Object Lock default retention requires exactly one of Days or Years (positive integer)",
5479 ));
5480 }
5481 };
5482 mgr.set_bucket_default(
5483 &bucket,
5484 crate::object_lock::BucketObjectLockDefault {
5485 mode,
5486 retention_days: days,
5487 },
5488 );
5489 }
5490 return Ok(S3Response::new(PutObjectLockConfigurationOutput::default()));
5491 }
5492 self.backend.put_object_lock_configuration(req).await
5493 }
5494 async fn get_object_legal_hold(
5495 &self,
5496 req: S3Request<GetObjectLegalHoldInput>,
5497 ) -> S3Result<S3Response<GetObjectLegalHoldOutput>> {
5498 let key = req.input.key.clone();
5499 self.enforce_policy(&req, "s3:GetObjectLegalHold", &req.input.bucket, Some(&key))?;
5500 if let Some(mgr) = self.object_lock.as_ref() {
5501 let on = mgr
5502 .get(&req.input.bucket, &req.input.key)
5503 .map(|s| s.legal_hold_on)
5504 .unwrap_or(false);
5505 let status = ObjectLockLegalHoldStatus::from_static(if on {
5506 ObjectLockLegalHoldStatus::ON
5507 } else {
5508 ObjectLockLegalHoldStatus::OFF
5509 });
5510 let output = GetObjectLegalHoldOutput {
5511 legal_hold: Some(ObjectLockLegalHold {
5512 status: Some(status),
5513 }),
5514 };
5515 return Ok(S3Response::new(output));
5516 }
5517 self.backend.get_object_legal_hold(req).await
5518 }
5519 async fn put_object_legal_hold(
5520 &self,
5521 req: S3Request<PutObjectLegalHoldInput>,
5522 ) -> S3Result<S3Response<PutObjectLegalHoldOutput>> {
5523 let key = req.input.key.clone();
5524 self.enforce_policy(&req, "s3:PutObjectLegalHold", &req.input.bucket, Some(&key))?;
5525 if let Some(mgr) = self.object_lock.as_ref() {
5526 let on = req
5527 .input
5528 .legal_hold
5529 .as_ref()
5530 .and_then(|h| h.status.as_ref())
5531 .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
5532 .unwrap_or(false);
5533 mgr.set_legal_hold(&req.input.bucket, &req.input.key, on);
5534 return Ok(S3Response::new(PutObjectLegalHoldOutput::default()));
5535 }
5536 self.backend.put_object_legal_hold(req).await
5537 }
5538 async fn get_object_retention(
5539 &self,
5540 req: S3Request<GetObjectRetentionInput>,
5541 ) -> S3Result<S3Response<GetObjectRetentionOutput>> {
5542 let key = req.input.key.clone();
5543 self.enforce_policy(&req, "s3:GetObjectRetention", &req.input.bucket, Some(&key))?;
5544 if let Some(mgr) = self.object_lock.as_ref() {
5545 let retention = mgr
5546 .get(&req.input.bucket, &req.input.key)
5547 .filter(|s| s.mode.is_some() || s.retain_until.is_some())
5548 .map(|s| {
5549 let mode = s.mode.map(|m| {
5550 ObjectLockRetentionMode::from_static(match m {
5551 crate::object_lock::LockMode::Governance => {
5552 ObjectLockRetentionMode::GOVERNANCE
5553 }
5554 crate::object_lock::LockMode::Compliance => {
5555 ObjectLockRetentionMode::COMPLIANCE
5556 }
5557 })
5558 });
5559 let until = s.retain_until.map(chrono_utc_to_timestamp);
5560 ObjectLockRetention {
5561 mode,
5562 retain_until_date: until,
5563 }
5564 });
5565 let output = GetObjectRetentionOutput { retention };
5566 return Ok(S3Response::new(output));
5567 }
5568 self.backend.get_object_retention(req).await
5569 }
5570 async fn put_object_retention(
5571 &self,
5572 req: S3Request<PutObjectRetentionInput>,
5573 ) -> S3Result<S3Response<PutObjectRetentionOutput>> {
5574 let key = req.input.key.clone();
5575 self.enforce_policy(&req, "s3:PutObjectRetention", &req.input.bucket, Some(&key))?;
5576 if let Some(mgr) = self.object_lock.as_ref() {
5577 let bucket = req.input.bucket.clone();
5578 let key = req.input.key.clone();
5579 // v0.8.12 HIGH-7 fix: the bypass header gates Governance
5580 // shortening only when the caller has the matching IAM
5581 // action explicitly allowed; otherwise it's silently
5582 // dropped to `false` and the "shortening Governance
5583 // requires bypass" branch below rejects.
5584 let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
5585 let bypass = if bypass_header {
5586 self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
5587 .is_ok()
5588 } else {
5589 false
5590 };
5591 let retention = req.input.retention.as_ref().ok_or_else(|| {
5592 S3Error::with_message(
5593 S3ErrorCode::InvalidRequest,
5594 "PutObjectRetention requires a Retention element",
5595 )
5596 })?;
5597 let new_mode = retention
5598 .mode
5599 .as_ref()
5600 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
5601 let new_until = retention
5602 .retain_until_date
5603 .as_ref()
5604 .map(timestamp_to_chrono_utc)
5605 .unwrap_or(None);
5606 let now = chrono::Utc::now();
5607 let existing = mgr.get(&bucket, &key).unwrap_or_default();
5608 // S3 immutability rules:
5609 // - Compliance is one-way: once set, mode cannot move to
5610 // Governance, and retain-until cannot be shortened.
5611 // - Governance can be lengthened freely; shortened only
5612 // with bypass=true.
5613 if let Some(existing_mode) = existing.mode
5614 && existing_mode == crate::object_lock::LockMode::Compliance
5615 && existing.is_locked(now)
5616 {
5617 if matches!(new_mode, Some(crate::object_lock::LockMode::Governance)) {
5618 return Err(S3Error::with_message(
5619 S3ErrorCode::AccessDenied,
5620 "Cannot downgrade Compliance retention to Governance while lock is active",
5621 ));
5622 }
5623 if let (Some(prev), Some(next)) = (existing.retain_until, new_until)
5624 && next < prev
5625 {
5626 return Err(S3Error::with_message(
5627 S3ErrorCode::AccessDenied,
5628 "Cannot shorten Compliance retention while lock is active",
5629 ));
5630 }
5631 }
5632 if let Some(existing_mode) = existing.mode
5633 && existing_mode == crate::object_lock::LockMode::Governance
5634 && existing.is_locked(now)
5635 && !bypass
5636 && let (Some(prev), Some(next)) = (existing.retain_until, new_until)
5637 && next < prev
5638 {
5639 return Err(S3Error::with_message(
5640 S3ErrorCode::AccessDenied,
5641 "Shortening Governance retention requires x-amz-bypass-governance-retention: true",
5642 ));
5643 }
5644 let mut state = existing;
5645 if new_mode.is_some() {
5646 state.mode = new_mode;
5647 }
5648 if new_until.is_some() {
5649 state.retain_until = new_until;
5650 }
5651 mgr.set(&bucket, &key, state);
5652 return Ok(S3Response::new(PutObjectRetentionOutput::default()));
5653 }
5654 self.backend.put_object_retention(req).await
5655 }
5656
5657 // ---- Versioning ----
5658 // list_object_versions is implemented above in the compression-hook
5659 // section so it filters S4-internal sidecars (v0.4 #17) AND, when a
5660 // VersioningManager is attached (v0.5 #34), serves chains directly
5661 // from the in-memory index.
5662 async fn get_bucket_versioning(
5663 &self,
5664 req: S3Request<GetBucketVersioningInput>,
5665 ) -> S3Result<S3Response<GetBucketVersioningOutput>> {
5666 // v0.5 #34: when a VersioningManager is attached, the bucket's
5667 // versioning state lives in the manager (= S4-server's
5668 // authoritative source). Pass-through hits the backend only
5669 // when no manager is configured (legacy v0.4 behaviour).
5670 if let Some(mgr) = self.versioning.as_ref() {
5671 let output = match mgr.state(&req.input.bucket).as_aws_status() {
5672 Some(s) => GetBucketVersioningOutput {
5673 status: Some(BucketVersioningStatus::from(s.to_owned())),
5674 ..Default::default()
5675 },
5676 None => GetBucketVersioningOutput::default(),
5677 };
5678 return Ok(S3Response::new(output));
5679 }
5680 self.backend.get_bucket_versioning(req).await
5681 }
/// Handles `PutBucketVersioning`.
///
/// Two independent, optional managers are consulted in order:
/// 1. MFA-delete manager: when the request body carries an `MfaDelete`
///    element, the `x-amz-mfa` header must verify against the bucket's
///    stored secret before the new MFA-delete flag is recorded.
/// 2. Versioning manager: records the requested versioning state and
///    answers the request locally.
///
/// Only when no versioning manager is attached is the request forwarded to
/// the backend.
async fn put_bucket_versioning(
    &self,
    req: S3Request<PutBucketVersioningInput>,
) -> S3Result<S3Response<PutBucketVersioningOutput>> {
    // v0.6 #42: MFA gating on the `PutBucketVersioning` request itself.
    // When the body carries an `MfaDelete` element (either `Enabled` or
    // `Disabled`), a valid `x-amz-mfa` token is required, both for the
    // first enable and for any later change, so a leaked credential alone
    // cannot switch MFA Delete off. Requests that omit the element (they
    // flip only `Status`) skip this gate.
    if let Some(mgr) = self.mfa_delete.as_ref()
        && let Some(target_enabled) = req
            .input
            .versioning_configuration
            .mfa_delete
            .as_ref()
            .map(|m| m.as_str().eq_ignore_ascii_case("Enabled"))
    {
        let bucket = req.input.bucket.clone();
        let header = req.input.mfa.as_deref();
        let secret = mgr.lookup_secret(&bucket);
        // Verification needs both a header and a stored secret; the serial
        // must match and the TOTP code must validate for the current time.
        let verified = match (header, secret.as_ref()) {
            (Some(h), Some(s)) => match crate::mfa::parse_mfa_header(h) {
                Ok((serial, code)) => {
                    serial == s.serial
                        && crate::mfa::verify_totp(&s.secret_base32, &code, current_unix_secs())
                }
                Err(_) => false,
            },
            _ => false,
        };
        if !verified {
            crate::metrics::record_mfa_delete_denial(&bucket);
            // A missing header is reported as `Missing`; every other
            // failure (bad format, wrong serial, wrong code, no stored
            // secret) is reported as `InvalidCode`.
            let err = if header.is_none() {
                crate::mfa::MfaError::Missing
            } else {
                crate::mfa::MfaError::InvalidCode
            };
            return Err(mfa_error_to_s3(err));
        }
        mgr.set_bucket_state(&bucket, target_enabled);
    }
    // v0.5 #34: with a versioning manager attached, the new state is
    // stored in the manager and the request is answered here; the code
    // below does NOT forward to the backend in that case.
    // NOTE(review): an earlier comment described forwarding to the backend
    // with soft-fail semantics; confirm which behaviour is intended.
    if let Some(mgr) = self.versioning.as_ref() {
        // A missing or unrecognised `Status` maps to `Unversioned`.
        // NOTE(review): a request that only changes `MfaDelete` (no
        // `Status`) therefore resets the state to `Unversioned`; confirm
        // this is intended.
        let new_state = match req
            .input
            .versioning_configuration
            .status
            .as_ref()
            .map(|s| s.as_str())
        {
            Some(s) if s.eq_ignore_ascii_case("Enabled") => {
                crate::versioning::VersioningState::Enabled
            }
            Some(s) if s.eq_ignore_ascii_case("Suspended") => {
                crate::versioning::VersioningState::Suspended
            }
            _ => crate::versioning::VersioningState::Unversioned,
        };
        mgr.set_state(&req.input.bucket, new_state);
        return Ok(S3Response::new(PutBucketVersioningOutput::default()));
    }
    self.backend.put_bucket_versioning(req).await
}
5754
5755 // ---- Bucket location ----
/// Forwards `GetBucketLocation` to the backend unchanged and returns its
/// result.
async fn get_bucket_location(
    &self,
    req: S3Request<GetBucketLocationInput>,
) -> S3Result<S3Response<GetBucketLocationOutput>> {
    self.backend.get_bucket_location(req).await
}
5762
5763 // ---- Bucket policy ----
/// Forwards `GetBucketPolicy` to the backend unchanged and returns its
/// result.
async fn get_bucket_policy(
    &self,
    req: S3Request<GetBucketPolicyInput>,
) -> S3Result<S3Response<GetBucketPolicyOutput>> {
    self.backend.get_bucket_policy(req).await
}
/// Forwards `PutBucketPolicy` to the backend unchanged and returns its
/// result.
async fn put_bucket_policy(
    &self,
    req: S3Request<PutBucketPolicyInput>,
) -> S3Result<S3Response<PutBucketPolicyOutput>> {
    self.backend.put_bucket_policy(req).await
}
/// Forwards `DeleteBucketPolicy` to the backend unchanged and returns its
/// result.
async fn delete_bucket_policy(
    &self,
    req: S3Request<DeleteBucketPolicyInput>,
) -> S3Result<S3Response<DeleteBucketPolicyOutput>> {
    self.backend.delete_bucket_policy(req).await
}
/// Forwards `GetBucketPolicyStatus` to the backend unchanged and returns
/// its result.
async fn get_bucket_policy_status(
    &self,
    req: S3Request<GetBucketPolicyStatusInput>,
) -> S3Result<S3Response<GetBucketPolicyStatusOutput>> {
    self.backend.get_bucket_policy_status(req).await
}
5788
5789 // ---- Bucket ACL ----
/// Forwards `GetBucketAcl` to the backend unchanged and returns its result.
async fn get_bucket_acl(
    &self,
    req: S3Request<GetBucketAclInput>,
) -> S3Result<S3Response<GetBucketAclOutput>> {
    self.backend.get_bucket_acl(req).await
}
/// Forwards `PutBucketAcl` to the backend unchanged and returns its result.
async fn put_bucket_acl(
    &self,
    req: S3Request<PutBucketAclInput>,
) -> S3Result<S3Response<PutBucketAclOutput>> {
    self.backend.put_bucket_acl(req).await
}
5802
5803 // ---- Bucket CORS (v0.6 #38) ----
5804 async fn get_bucket_cors(
5805 &self,
5806 req: S3Request<GetBucketCorsInput>,
5807 ) -> S3Result<S3Response<GetBucketCorsOutput>> {
5808 if let Some(mgr) = self.cors.as_ref() {
5809 let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
5810 S3Error::with_message(
5811 S3ErrorCode::NoSuchCORSConfiguration,
5812 "The CORS configuration does not exist".to_string(),
5813 )
5814 })?;
5815 let rules: Vec<CORSRule> = cfg
5816 .rules
5817 .into_iter()
5818 .map(|r| CORSRule {
5819 allowed_headers: if r.allowed_headers.is_empty() {
5820 None
5821 } else {
5822 Some(r.allowed_headers)
5823 },
5824 allowed_methods: r.allowed_methods,
5825 allowed_origins: r.allowed_origins,
5826 expose_headers: if r.expose_headers.is_empty() {
5827 None
5828 } else {
5829 Some(r.expose_headers)
5830 },
5831 id: r.id,
5832 max_age_seconds: r.max_age_seconds.map(|s| s as i32),
5833 })
5834 .collect();
5835 return Ok(S3Response::new(GetBucketCorsOutput {
5836 cors_rules: Some(rules),
5837 }));
5838 }
5839 self.backend.get_bucket_cors(req).await
5840 }
5841 async fn put_bucket_cors(
5842 &self,
5843 req: S3Request<PutBucketCorsInput>,
5844 ) -> S3Result<S3Response<PutBucketCorsOutput>> {
5845 if let Some(mgr) = self.cors.as_ref() {
5846 let cfg = crate::cors::CorsConfig {
5847 rules: req
5848 .input
5849 .cors_configuration
5850 .cors_rules
5851 .into_iter()
5852 .map(|r| crate::cors::CorsRule {
5853 allowed_origins: r.allowed_origins,
5854 allowed_methods: r.allowed_methods,
5855 allowed_headers: r.allowed_headers.unwrap_or_default(),
5856 expose_headers: r.expose_headers.unwrap_or_default(),
5857 max_age_seconds: r
5858 .max_age_seconds
5859 .and_then(|s| if s < 0 { None } else { Some(s as u32) }),
5860 id: r.id,
5861 })
5862 .collect(),
5863 };
5864 mgr.put(&req.input.bucket, cfg);
5865 return Ok(S3Response::new(PutBucketCorsOutput::default()));
5866 }
5867 self.backend.put_bucket_cors(req).await
5868 }
5869 async fn delete_bucket_cors(
5870 &self,
5871 req: S3Request<DeleteBucketCorsInput>,
5872 ) -> S3Result<S3Response<DeleteBucketCorsOutput>> {
5873 if let Some(mgr) = self.cors.as_ref() {
5874 mgr.delete(&req.input.bucket);
5875 return Ok(S3Response::new(DeleteBucketCorsOutput::default()));
5876 }
5877 self.backend.delete_bucket_cors(req).await
5878 }
5879
5880 // ---- Bucket lifecycle (v0.6 #37) ----
5881 async fn get_bucket_lifecycle_configuration(
5882 &self,
5883 req: S3Request<GetBucketLifecycleConfigurationInput>,
5884 ) -> S3Result<S3Response<GetBucketLifecycleConfigurationOutput>> {
5885 if let Some(mgr) = self.lifecycle.as_ref() {
5886 let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
5887 S3Error::with_message(
5888 S3ErrorCode::NoSuchLifecycleConfiguration,
5889 "The lifecycle configuration does not exist".to_string(),
5890 )
5891 })?;
5892 let rules: Vec<LifecycleRule> = cfg.rules.iter().map(internal_rule_to_dto).collect();
5893 return Ok(S3Response::new(GetBucketLifecycleConfigurationOutput {
5894 rules: Some(rules),
5895 transition_default_minimum_object_size: None,
5896 }));
5897 }
5898 self.backend.get_bucket_lifecycle_configuration(req).await
5899 }
5900 async fn put_bucket_lifecycle_configuration(
5901 &self,
5902 req: S3Request<PutBucketLifecycleConfigurationInput>,
5903 ) -> S3Result<S3Response<PutBucketLifecycleConfigurationOutput>> {
5904 if let Some(mgr) = self.lifecycle.as_ref() {
5905 let bucket = req.input.bucket.clone();
5906 let dto_cfg = req.input.lifecycle_configuration.unwrap_or_default();
5907 let cfg = dto_lifecycle_to_internal(&dto_cfg);
5908 mgr.put(&bucket, cfg);
5909 return Ok(S3Response::new(
5910 PutBucketLifecycleConfigurationOutput::default(),
5911 ));
5912 }
5913 self.backend.put_bucket_lifecycle_configuration(req).await
5914 }
5915 async fn delete_bucket_lifecycle(
5916 &self,
5917 req: S3Request<DeleteBucketLifecycleInput>,
5918 ) -> S3Result<S3Response<DeleteBucketLifecycleOutput>> {
5919 if let Some(mgr) = self.lifecycle.as_ref() {
5920 mgr.delete(&req.input.bucket);
5921 return Ok(S3Response::new(DeleteBucketLifecycleOutput::default()));
5922 }
5923 self.backend.delete_bucket_lifecycle(req).await
5924 }
5925
5926 // ---- Bucket tagging (v0.6 #39) ----
5927 async fn get_bucket_tagging(
5928 &self,
5929 req: S3Request<GetBucketTaggingInput>,
5930 ) -> S3Result<S3Response<GetBucketTaggingOutput>> {
5931 let Some(mgr) = self.tagging.as_ref() else {
5932 return self.backend.get_bucket_tagging(req).await;
5933 };
5934 let tags = mgr.get_bucket_tags(&req.input.bucket).unwrap_or_default();
5935 Ok(S3Response::new(GetBucketTaggingOutput {
5936 tag_set: tagset_to_aws(&tags),
5937 }))
5938 }
5939 async fn put_bucket_tagging(
5940 &self,
5941 req: S3Request<PutBucketTaggingInput>,
5942 ) -> S3Result<S3Response<PutBucketTaggingOutput>> {
5943 let Some(mgr) = self.tagging.as_ref() else {
5944 return self.backend.put_bucket_tagging(req).await;
5945 };
5946 let bucket = req.input.bucket.clone();
5947 let parsed = aws_to_tagset(&req.input.tagging.tag_set)
5948 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
5949 self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
5950 mgr.put_bucket_tags(&bucket, parsed);
5951 Ok(S3Response::new(PutBucketTaggingOutput::default()))
5952 }
5953 async fn delete_bucket_tagging(
5954 &self,
5955 req: S3Request<DeleteBucketTaggingInput>,
5956 ) -> S3Result<S3Response<DeleteBucketTaggingOutput>> {
5957 let Some(mgr) = self.tagging.as_ref() else {
5958 return self.backend.delete_bucket_tagging(req).await;
5959 };
5960 let bucket = req.input.bucket.clone();
5961 self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
5962 mgr.delete_bucket_tags(&bucket);
5963 Ok(S3Response::new(DeleteBucketTaggingOutput::default()))
5964 }
5965
5966 // ---- Bucket encryption ----
/// Forwards `GetBucketEncryption` to the backend unchanged and returns its
/// result.
async fn get_bucket_encryption(
    &self,
    req: S3Request<GetBucketEncryptionInput>,
) -> S3Result<S3Response<GetBucketEncryptionOutput>> {
    self.backend.get_bucket_encryption(req).await
}
/// Forwards `PutBucketEncryption` to the backend unchanged and returns its
/// result.
async fn put_bucket_encryption(
    &self,
    req: S3Request<PutBucketEncryptionInput>,
) -> S3Result<S3Response<PutBucketEncryptionOutput>> {
    self.backend.put_bucket_encryption(req).await
}
/// Forwards `DeleteBucketEncryption` to the backend unchanged and returns
/// its result.
async fn delete_bucket_encryption(
    &self,
    req: S3Request<DeleteBucketEncryptionInput>,
) -> S3Result<S3Response<DeleteBucketEncryptionOutput>> {
    self.backend.delete_bucket_encryption(req).await
}
5985
5986 // ---- Bucket logging ----
/// Forwards `GetBucketLogging` to the backend unchanged and returns its
/// result.
async fn get_bucket_logging(
    &self,
    req: S3Request<GetBucketLoggingInput>,
) -> S3Result<S3Response<GetBucketLoggingOutput>> {
    self.backend.get_bucket_logging(req).await
}
/// Forwards `PutBucketLogging` to the backend unchanged and returns its
/// result.
async fn put_bucket_logging(
    &self,
    req: S3Request<PutBucketLoggingInput>,
) -> S3Result<S3Response<PutBucketLoggingOutput>> {
    self.backend.put_bucket_logging(req).await
}
5999
6000 // ---- Bucket notification (v0.6 #35) ----
6001 //
6002 // When a `NotificationManager` is attached, S4 itself owns per-bucket
6003 // notification configurations and the PUT / GET handlers route through
6004 // the manager. The wire DTO's queue / topic configurations map onto
6005 // S4's `Destination::Sqs` / `Destination::Sns`; LambdaFunction and
6006 // EventBridge configurations are accepted on PUT but silently dropped
6007 // (out of scope for v0.6 #35). When no manager is attached the legacy
6008 // backend-passthrough behaviour applies.
6009 async fn get_bucket_notification_configuration(
6010 &self,
6011 req: S3Request<GetBucketNotificationConfigurationInput>,
6012 ) -> S3Result<S3Response<GetBucketNotificationConfigurationOutput>> {
6013 if let Some(mgr) = self.notifications.as_ref() {
6014 let cfg = mgr.get(&req.input.bucket).unwrap_or_default();
6015 let dto = notif_to_dto(&cfg);
6016 return Ok(S3Response::new(GetBucketNotificationConfigurationOutput {
6017 event_bridge_configuration: dto.event_bridge_configuration,
6018 lambda_function_configurations: dto.lambda_function_configurations,
6019 queue_configurations: dto.queue_configurations,
6020 topic_configurations: dto.topic_configurations,
6021 }));
6022 }
6023 self.backend
6024 .get_bucket_notification_configuration(req)
6025 .await
6026 }
6027 async fn put_bucket_notification_configuration(
6028 &self,
6029 req: S3Request<PutBucketNotificationConfigurationInput>,
6030 ) -> S3Result<S3Response<PutBucketNotificationConfigurationOutput>> {
6031 if let Some(mgr) = self.notifications.as_ref() {
6032 let cfg = notif_from_dto(&req.input.notification_configuration);
6033 mgr.put(&req.input.bucket, cfg);
6034 return Ok(S3Response::new(
6035 PutBucketNotificationConfigurationOutput::default(),
6036 ));
6037 }
6038 self.backend
6039 .put_bucket_notification_configuration(req)
6040 .await
6041 }
6042
6043 // ---- Bucket request payment ----
/// Forwards `GetBucketRequestPayment` to the backend unchanged and returns
/// its result.
async fn get_bucket_request_payment(
    &self,
    req: S3Request<GetBucketRequestPaymentInput>,
) -> S3Result<S3Response<GetBucketRequestPaymentOutput>> {
    self.backend.get_bucket_request_payment(req).await
}
/// Forwards `PutBucketRequestPayment` to the backend unchanged and returns
/// its result.
async fn put_bucket_request_payment(
    &self,
    req: S3Request<PutBucketRequestPaymentInput>,
) -> S3Result<S3Response<PutBucketRequestPaymentOutput>> {
    self.backend.put_bucket_request_payment(req).await
}
6056
6057 // ---- Bucket website ----
/// Forwards `GetBucketWebsite` to the backend unchanged and returns its
/// result.
async fn get_bucket_website(
    &self,
    req: S3Request<GetBucketWebsiteInput>,
) -> S3Result<S3Response<GetBucketWebsiteOutput>> {
    self.backend.get_bucket_website(req).await
}
/// Forwards `PutBucketWebsite` to the backend unchanged and returns its
/// result.
async fn put_bucket_website(
    &self,
    req: S3Request<PutBucketWebsiteInput>,
) -> S3Result<S3Response<PutBucketWebsiteOutput>> {
    self.backend.put_bucket_website(req).await
}
/// Forwards `DeleteBucketWebsite` to the backend unchanged and returns its
/// result.
async fn delete_bucket_website(
    &self,
    req: S3Request<DeleteBucketWebsiteInput>,
) -> S3Result<S3Response<DeleteBucketWebsiteOutput>> {
    self.backend.delete_bucket_website(req).await
}
6076
6077 // ---- Bucket replication (v0.6 #40) ----
6078 async fn get_bucket_replication(
6079 &self,
6080 req: S3Request<GetBucketReplicationInput>,
6081 ) -> S3Result<S3Response<GetBucketReplicationOutput>> {
6082 if let Some(mgr) = self.replication.as_ref() {
6083 return match mgr.get(&req.input.bucket) {
6084 Some(cfg) => Ok(S3Response::new(GetBucketReplicationOutput {
6085 replication_configuration: Some(replication_to_dto(&cfg)),
6086 })),
6087 None => Err(S3Error::with_message(
6088 S3ErrorCode::Custom("ReplicationConfigurationNotFoundError".into()),
6089 format!(
6090 "no replication configuration on bucket {}",
6091 req.input.bucket
6092 ),
6093 )),
6094 };
6095 }
6096 self.backend.get_bucket_replication(req).await
6097 }
6098 async fn put_bucket_replication(
6099 &self,
6100 req: S3Request<PutBucketReplicationInput>,
6101 ) -> S3Result<S3Response<PutBucketReplicationOutput>> {
6102 if let Some(mgr) = self.replication.as_ref() {
6103 let cfg = replication_from_dto(&req.input.replication_configuration);
6104 mgr.put(&req.input.bucket, cfg);
6105 return Ok(S3Response::new(PutBucketReplicationOutput::default()));
6106 }
6107 self.backend.put_bucket_replication(req).await
6108 }
6109 async fn delete_bucket_replication(
6110 &self,
6111 req: S3Request<DeleteBucketReplicationInput>,
6112 ) -> S3Result<S3Response<DeleteBucketReplicationOutput>> {
6113 if let Some(mgr) = self.replication.as_ref() {
6114 mgr.delete(&req.input.bucket);
6115 return Ok(S3Response::new(DeleteBucketReplicationOutput::default()));
6116 }
6117 self.backend.delete_bucket_replication(req).await
6118 }
6119
6120 // ---- Bucket accelerate ----
/// Forwards `GetBucketAccelerateConfiguration` to the backend unchanged
/// and returns its result.
async fn get_bucket_accelerate_configuration(
    &self,
    req: S3Request<GetBucketAccelerateConfigurationInput>,
) -> S3Result<S3Response<GetBucketAccelerateConfigurationOutput>> {
    self.backend.get_bucket_accelerate_configuration(req).await
}
/// Forwards `PutBucketAccelerateConfiguration` to the backend unchanged
/// and returns its result.
async fn put_bucket_accelerate_configuration(
    &self,
    req: S3Request<PutBucketAccelerateConfigurationInput>,
) -> S3Result<S3Response<PutBucketAccelerateConfigurationOutput>> {
    self.backend.put_bucket_accelerate_configuration(req).await
}
6133
6134 // ---- Bucket ownership controls ----
/// Forwards `GetBucketOwnershipControls` to the backend unchanged and
/// returns its result.
async fn get_bucket_ownership_controls(
    &self,
    req: S3Request<GetBucketOwnershipControlsInput>,
) -> S3Result<S3Response<GetBucketOwnershipControlsOutput>> {
    self.backend.get_bucket_ownership_controls(req).await
}
/// Forwards `PutBucketOwnershipControls` to the backend unchanged and
/// returns its result.
async fn put_bucket_ownership_controls(
    &self,
    req: S3Request<PutBucketOwnershipControlsInput>,
) -> S3Result<S3Response<PutBucketOwnershipControlsOutput>> {
    self.backend.put_bucket_ownership_controls(req).await
}
/// Forwards `DeleteBucketOwnershipControls` to the backend unchanged and
/// returns its result.
async fn delete_bucket_ownership_controls(
    &self,
    req: S3Request<DeleteBucketOwnershipControlsInput>,
) -> S3Result<S3Response<DeleteBucketOwnershipControlsOutput>> {
    self.backend.delete_bucket_ownership_controls(req).await
}
6153
6154 // ---- Public access block ----
/// Forwards `GetPublicAccessBlock` to the backend unchanged and returns
/// its result.
async fn get_public_access_block(
    &self,
    req: S3Request<GetPublicAccessBlockInput>,
) -> S3Result<S3Response<GetPublicAccessBlockOutput>> {
    self.backend.get_public_access_block(req).await
}
/// Forwards `PutPublicAccessBlock` to the backend unchanged and returns
/// its result.
async fn put_public_access_block(
    &self,
    req: S3Request<PutPublicAccessBlockInput>,
) -> S3Result<S3Response<PutPublicAccessBlockOutput>> {
    self.backend.put_public_access_block(req).await
}
/// Forwards `DeletePublicAccessBlock` to the backend unchanged and returns
/// its result.
async fn delete_public_access_block(
    &self,
    req: S3Request<DeletePublicAccessBlockInput>,
) -> S3Result<S3Response<DeletePublicAccessBlockOutput>> {
    self.backend.delete_public_access_block(req).await
}
6173
6174 // ====================================================================
6175 // v0.6 #41: S3 Select — server-side SQL filter on object body.
6176 //
6177 // Fetch the object via the regular `get_object` path (so SSE-C /
6178 // SSE-S4 / SSE-KMS / S4 codec all decompress + decrypt transparently),
6179 // run a small SQL subset (CSV + JSON Lines, equality / inequality /
6180 // LIKE / AND / OR / NOT) over the in-memory body, and stream the
6181 // matched rows back as AWS event-stream `Records` + `Stats` + `End`
6182 // frames.
6183 //
6184 // Limitations (deliberate, documented):
6185 // - Parquet input is rejected with NotImplemented.
6186 // - Aggregates / GROUP BY / JOIN / ORDER BY / LIMIT are rejected at
6187 // parse time as InvalidRequest (s3s 0.13 doesn't expose AWS's
6188 // domain-specific `InvalidSqlExpression` code).
6189 // - The body is fully buffered before SQL evaluation (S3 Select
6190 // streaming-during-evaluation is v0.7 scope).
6191 // - GPU-accelerated WHERE evaluation is stubbed out (always None).
6192 async fn select_object_content(
6193 &self,
6194 req: S3Request<SelectObjectContentInput>,
6195 ) -> S3Result<S3Response<SelectObjectContentOutput>> {
6196 use crate::select::{
6197 EventStreamWriter, SelectInputFormat, SelectOutputFormat, run_select_csv,
6198 run_select_jsonlines,
6199 };
6200
6201 let select_bucket = req.input.bucket.clone();
6202 let select_key = req.input.key.clone();
6203 self.enforce_rate_limit(&req, &select_bucket)?;
6204 self.enforce_policy(&req, "s3:GetObject", &select_bucket, Some(&select_key))?;
6205
6206 let request = req.input.request;
6207 let sql = request.expression.clone();
6208 if request.expression_type.as_str() != "SQL" {
6209 return Err(S3Error::with_message(
6210 S3ErrorCode::InvalidExpressionType,
6211 format!(
6212 "ExpressionType must be SQL, got: {}",
6213 request.expression_type.as_str()
6214 ),
6215 ));
6216 }
6217
6218 let input_format = if let Some(_json) = request.input_serialization.json.as_ref() {
6219 SelectInputFormat::JsonLines
6220 } else if let Some(csv) = request.input_serialization.csv.as_ref() {
6221 let has_header = csv
6222 .file_header_info
6223 .as_ref()
6224 .map(|h| {
6225 let s = h.as_str();
6226 s.eq_ignore_ascii_case("USE") || s.eq_ignore_ascii_case("IGNORE")
6227 })
6228 .unwrap_or(false);
6229 let delim = csv
6230 .field_delimiter
6231 .as_deref()
6232 .and_then(|s| s.chars().next())
6233 .unwrap_or(',');
6234 SelectInputFormat::Csv {
6235 has_header,
6236 delimiter: delim,
6237 }
6238 } else if request.input_serialization.parquet.is_some() {
6239 return Err(S3Error::with_message(
6240 S3ErrorCode::NotImplemented,
6241 "Parquet input is not supported by this S3 Select implementation (v0.6: CSV / JSON Lines only)",
6242 ));
6243 } else {
6244 return Err(S3Error::with_message(
6245 S3ErrorCode::InvalidRequest,
6246 "InputSerialization requires exactly one of CSV / JSON / Parquet",
6247 ));
6248 };
6249 if let Some(ct) = request.input_serialization.compression_type.as_ref()
6250 && !ct.as_str().eq_ignore_ascii_case("NONE")
6251 {
6252 return Err(S3Error::with_message(
6253 S3ErrorCode::NotImplemented,
6254 format!(
6255 "InputSerialization CompressionType={} is not supported (v0.6: NONE only)",
6256 ct.as_str()
6257 ),
6258 ));
6259 }
6260
6261 let output_format = if request.output_serialization.json.is_some() {
6262 SelectOutputFormat::Json
6263 } else if request.output_serialization.csv.is_some() {
6264 SelectOutputFormat::Csv
6265 } else {
6266 return Err(S3Error::with_message(
6267 S3ErrorCode::InvalidRequest,
6268 "OutputSerialization requires exactly one of CSV / JSON",
6269 ));
6270 };
6271
6272 let get_input = GetObjectInput {
6273 bucket: select_bucket.clone(),
6274 key: select_key.clone(),
6275 sse_customer_algorithm: req.input.sse_customer_algorithm.clone(),
6276 sse_customer_key: req.input.sse_customer_key.clone(),
6277 sse_customer_key_md5: req.input.sse_customer_key_md5.clone(),
6278 ..Default::default()
6279 };
6280 let get_req = S3Request {
6281 input: get_input,
6282 method: http::Method::GET,
6283 uri: format!("/{}/{}", select_bucket, select_key)
6284 .parse()
6285 .map_err(|e| {
6286 S3Error::with_message(
6287 S3ErrorCode::InternalError,
6288 format!("constructing inner GET URI: {e}"),
6289 )
6290 })?,
6291 headers: http::HeaderMap::new(),
6292 extensions: http::Extensions::new(),
6293 credentials: req.credentials.clone(),
6294 region: req.region.clone(),
6295 service: req.service.clone(),
6296 trailing_headers: None,
6297 };
6298 let mut get_resp = self.get_object(get_req).await?;
6299 let blob = get_resp.output.body.take().ok_or_else(|| {
6300 S3Error::with_message(
6301 S3ErrorCode::InternalError,
6302 "Select: object body was empty after GET",
6303 )
6304 })?;
6305 let body_bytes = crate::blob::collect_blob(blob, self.max_body_bytes)
6306 .await
6307 .map_err(internal("collect Select body"))?;
6308 let scanned = body_bytes.len() as u64;
6309
6310 let matched_payload = match input_format {
6311 SelectInputFormat::JsonLines => run_select_jsonlines(&sql, &body_bytes, output_format)
6312 .map_err(|e| select_error_to_s3(e, "JSON Lines"))?,
6313 SelectInputFormat::Csv { .. } => {
6314 run_select_csv(&sql, &body_bytes, input_format, output_format)
6315 .map_err(|e| select_error_to_s3(e, "CSV"))?
6316 }
6317 };
6318
6319 let returned = matched_payload.len() as u64;
6320 let processed = scanned;
6321 let mut events: Vec<S3Result<SelectObjectContentEvent>> = Vec::with_capacity(3);
6322 if !matched_payload.is_empty() {
6323 events.push(Ok(SelectObjectContentEvent::Records(RecordsEvent {
6324 payload: Some(bytes::Bytes::from(matched_payload)),
6325 })));
6326 }
6327 events.push(Ok(SelectObjectContentEvent::Stats(StatsEvent {
6328 details: Some(Stats {
6329 bytes_scanned: Some(scanned as i64),
6330 bytes_processed: Some(processed as i64),
6331 bytes_returned: Some(returned as i64),
6332 }),
6333 })));
6334 events.push(Ok(SelectObjectContentEvent::End(EndEvent {})));
6335 // Touch EventStreamWriter so the public API stays linked into the
6336 // build (the actual wire framing is delegated to s3s).
6337 let _writer = EventStreamWriter::new();
6338
6339 let stream = SelectObjectContentEventStream::new(futures::stream::iter(events));
6340 let output = SelectObjectContentOutput {
6341 payload: Some(stream),
6342 };
6343 Ok(S3Response::new(output))
6344 }
6345
6346 // ---- Bucket Inventory configuration (v0.6 #36) ----
6347 //
6348 // When an `InventoryManager` is attached, S4-server owns the
6349 // configuration store and these handlers no longer pass through to
6350 // the backend. The mapping between the s3s-typed
6351 // `InventoryConfiguration` and the inventory module's internal
6352 // `InventoryConfig` is intentionally lossy: only the fields S4
6353 // actually uses for periodic CSV emission survive the round trip
6354 // (id, source bucket, destination bucket / prefix, format, included
6355 // versions, schedule frequency). Optional fields, encryption, and
6356 // filter prefixes are accepted on PUT and re-surfaced on GET via
6357 // a best-effort default-shape `InventoryConfiguration` so the
6358 // client sees a roundtrip-clean response.
6359 async fn put_bucket_inventory_configuration(
6360 &self,
6361 req: S3Request<PutBucketInventoryConfigurationInput>,
6362 ) -> S3Result<S3Response<PutBucketInventoryConfigurationOutput>> {
6363 if let Some(mgr) = self.inventory.as_ref() {
6364 let cfg = inv_from_dto(
6365 &req.input.bucket,
6366 &req.input.id,
6367 &req.input.inventory_configuration,
6368 );
6369 mgr.put(cfg);
6370 return Ok(S3Response::new(
6371 PutBucketInventoryConfigurationOutput::default(),
6372 ));
6373 }
6374 self.backend.put_bucket_inventory_configuration(req).await
6375 }
6376
6377 async fn get_bucket_inventory_configuration(
6378 &self,
6379 req: S3Request<GetBucketInventoryConfigurationInput>,
6380 ) -> S3Result<S3Response<GetBucketInventoryConfigurationOutput>> {
6381 if let Some(mgr) = self.inventory.as_ref() {
6382 let cfg = mgr.get(&req.input.bucket, &req.input.id);
6383 if let Some(cfg) = cfg {
6384 let out = GetBucketInventoryConfigurationOutput {
6385 inventory_configuration: Some(inv_to_dto(&cfg)),
6386 };
6387 return Ok(S3Response::new(out));
6388 }
6389 // AWS returns `NoSuchConfiguration` (404) when the id has no
6390 // matching inventory configuration on the bucket. The
6391 // generated `S3ErrorCode` enum doesn't expose a typed variant
6392 // for this code, so we round-trip through `from_bytes` which
6393 // wraps unknown codes as `Custom(...)` (= the AWS-canonical
6394 // error-code string survives into the XML response envelope).
6395 let code =
6396 S3ErrorCode::from_bytes(b"NoSuchConfiguration").unwrap_or(S3ErrorCode::NoSuchKey);
6397 return Err(S3Error::with_message(
6398 code,
6399 format!(
6400 "no inventory configuration with id={} on bucket={}",
6401 req.input.id, req.input.bucket
6402 ),
6403 ));
6404 }
6405 self.backend.get_bucket_inventory_configuration(req).await
6406 }
6407
6408 async fn list_bucket_inventory_configurations(
6409 &self,
6410 req: S3Request<ListBucketInventoryConfigurationsInput>,
6411 ) -> S3Result<S3Response<ListBucketInventoryConfigurationsOutput>> {
6412 if let Some(mgr) = self.inventory.as_ref() {
6413 let list = mgr.list_for_bucket(&req.input.bucket);
6414 let dto_list: Vec<InventoryConfiguration> = list.iter().map(inv_to_dto).collect();
6415 let out = ListBucketInventoryConfigurationsOutput {
6416 continuation_token: req.input.continuation_token.clone(),
6417 inventory_configuration_list: if dto_list.is_empty() {
6418 None
6419 } else {
6420 Some(dto_list)
6421 },
6422 is_truncated: Some(false),
6423 next_continuation_token: None,
6424 };
6425 return Ok(S3Response::new(out));
6426 }
6427 self.backend.list_bucket_inventory_configurations(req).await
6428 }
6429
6430 async fn delete_bucket_inventory_configuration(
6431 &self,
6432 req: S3Request<DeleteBucketInventoryConfigurationInput>,
6433 ) -> S3Result<S3Response<DeleteBucketInventoryConfigurationOutput>> {
6434 if let Some(mgr) = self.inventory.as_ref() {
6435 mgr.delete(&req.input.bucket, &req.input.id);
6436 return Ok(S3Response::new(
6437 DeleteBucketInventoryConfigurationOutput::default(),
6438 ));
6439 }
6440 self.backend
6441 .delete_bucket_inventory_configuration(req)
6442 .await
6443 }
6444}
6445
6446// ---------------------------------------------------------------------------
6447// v0.6 #36: Convert between the s3s-typed `InventoryConfiguration` (the wire
6448// surface) and our internal `crate::inventory::InventoryConfig`. Only the
6449// fields S4 actually uses for CSV emission survive the round trip; the
6450// missing fields (filter prefix, optional fields, encryption) are dropped on
6451// PUT and re-rendered as the AWS-default shape on GET so the client sees a
6452// well-formed `InventoryConfiguration`.
6453// ---------------------------------------------------------------------------
6454
6455fn inv_from_dto(
6456 bucket: &str,
6457 id: &str,
6458 dto: &InventoryConfiguration,
6459) -> crate::inventory::InventoryConfig {
6460 let frequency_hours = match dto.schedule.frequency.as_str() {
6461 "Weekly" => 24 * 7,
6462 // Daily is the default; anything S4 doesn't recognise (incl.
6463 // empty, which is the s3s-default) maps to Daily so the
6464 // operator's PUT doesn't silently turn into a no-op cadence.
6465 _ => 24,
6466 };
6467 // Parquet/ORC are not supported (issue #36 scope); we still accept
6468 // the PUT so callers don't fail-loud, but we record CSV and rely on
6469 // the operator catching the discrepancy on GET.
6470 let format = crate::inventory::InventoryFormat::Csv;
6471 crate::inventory::InventoryConfig {
6472 id: id.to_owned(),
6473 bucket: bucket.to_owned(),
6474 destination_bucket: dto.destination.s3_bucket_destination.bucket.clone(),
6475 destination_prefix: dto
6476 .destination
6477 .s3_bucket_destination
6478 .prefix
6479 .clone()
6480 .unwrap_or_default(),
6481 frequency_hours,
6482 format,
6483 included_object_versions: crate::inventory::IncludedVersions::from_aws_str(
6484 dto.included_object_versions.as_str(),
6485 ),
6486 }
6487}
6488
6489fn inv_to_dto(cfg: &crate::inventory::InventoryConfig) -> InventoryConfiguration {
6490 InventoryConfiguration {
6491 id: cfg.id.clone(),
6492 is_enabled: true,
6493 included_object_versions: InventoryIncludedObjectVersions::from(
6494 cfg.included_object_versions.as_aws_str().to_owned(),
6495 ),
6496 destination: InventoryDestination {
6497 s3_bucket_destination: InventoryS3BucketDestination {
6498 account_id: None,
6499 bucket: cfg.destination_bucket.clone(),
6500 encryption: None,
6501 format: InventoryFormat::from(cfg.format.as_aws_str().to_owned()),
6502 prefix: if cfg.destination_prefix.is_empty() {
6503 None
6504 } else {
6505 Some(cfg.destination_prefix.clone())
6506 },
6507 },
6508 },
6509 schedule: InventorySchedule {
6510 // `frequency_hours == 168` -> Weekly; everything else maps to
6511 // Daily for the wire response (the manager keeps the precise
6512 // hour count internally for due-checking).
6513 frequency: InventoryFrequency::from(
6514 if cfg.frequency_hours == 24 * 7 {
6515 "Weekly"
6516 } else {
6517 "Daily"
6518 }
6519 .to_owned(),
6520 ),
6521 },
6522 filter: None,
6523 optional_fields: None,
6524 }
6525}
6526
6527// ---------------------------------------------------------------------------
6528// v0.6 #35: Convert between the s3s-typed `NotificationConfiguration` (the
6529// wire surface) and our internal `crate::notifications::NotificationConfig`.
6530//
6531// We support TopicConfiguration (-> Destination::Sns) and QueueConfiguration
6532// (-> Destination::Sqs). LambdaFunction and EventBridge configurations are
6533// silently dropped on PUT (out of scope for v0.6 #35); the GET response only
6534// surfaces topic / queue rules.
6535//
6536// The webhook destination has no AWS-native wire form: operators configure
6537// webhooks via the JSON snapshot file (`--notifications-state-file`) or by
6538// poking `NotificationManager::put` directly from a custom binary. This
6539// keeps the wire surface AWS-compatible while still letting the always-
6540// available `Webhook` destination be reachable.
6541// ---------------------------------------------------------------------------
6542
6543fn notif_from_dto(dto: &NotificationConfiguration) -> crate::notifications::NotificationConfig {
6544 let mut rules: Vec<crate::notifications::NotificationRule> = Vec::new();
6545 if let Some(topics) = dto.topic_configurations.as_ref() {
6546 for (idx, t) in topics.iter().enumerate() {
6547 let events = events_from_dto(&t.events);
6548 let (prefix, suffix) = filter_from_dto(t.filter.as_ref());
6549 rules.push(crate::notifications::NotificationRule {
6550 id: t.id.clone().unwrap_or_else(|| format!("topic-{idx}")),
6551 events,
6552 destination: crate::notifications::Destination::Sns {
6553 topic_arn: t.topic_arn.clone(),
6554 },
6555 filter_prefix: prefix,
6556 filter_suffix: suffix,
6557 });
6558 }
6559 }
6560 if let Some(queues) = dto.queue_configurations.as_ref() {
6561 for (idx, q) in queues.iter().enumerate() {
6562 let events = events_from_dto(&q.events);
6563 let (prefix, suffix) = filter_from_dto(q.filter.as_ref());
6564 rules.push(crate::notifications::NotificationRule {
6565 id: q.id.clone().unwrap_or_else(|| format!("queue-{idx}")),
6566 events,
6567 destination: crate::notifications::Destination::Sqs {
6568 queue_arn: q.queue_arn.clone(),
6569 },
6570 filter_prefix: prefix,
6571 filter_suffix: suffix,
6572 });
6573 }
6574 }
6575 crate::notifications::NotificationConfig { rules }
6576}
6577
6578fn notif_to_dto(cfg: &crate::notifications::NotificationConfig) -> NotificationConfiguration {
6579 let mut topics: Vec<TopicConfiguration> = Vec::new();
6580 let mut queues: Vec<QueueConfiguration> = Vec::new();
6581 for rule in &cfg.rules {
6582 let events: Vec<Event> = rule
6583 .events
6584 .iter()
6585 .map(|e| Event::from(e.as_aws_str().to_owned()))
6586 .collect();
6587 let filter = filter_to_dto(rule.filter_prefix.as_deref(), rule.filter_suffix.as_deref());
6588 match &rule.destination {
6589 crate::notifications::Destination::Sns { topic_arn } => {
6590 topics.push(TopicConfiguration {
6591 events,
6592 filter,
6593 id: Some(rule.id.clone()),
6594 topic_arn: topic_arn.clone(),
6595 });
6596 }
6597 crate::notifications::Destination::Sqs { queue_arn } => {
6598 queues.push(QueueConfiguration {
6599 events,
6600 filter,
6601 id: Some(rule.id.clone()),
6602 queue_arn: queue_arn.clone(),
6603 });
6604 }
6605 // Webhook destinations have no AWS wire equivalent — they
6606 // round-trip through the JSON snapshot only. Skip them on the
6607 // GET surface (an SDK consumer wouldn't know what to do with
6608 // them anyway).
6609 crate::notifications::Destination::Webhook { .. } => {}
6610 }
6611 }
6612 NotificationConfiguration {
6613 event_bridge_configuration: None,
6614 lambda_function_configurations: None,
6615 queue_configurations: if queues.is_empty() {
6616 None
6617 } else {
6618 Some(queues)
6619 },
6620 topic_configurations: if topics.is_empty() {
6621 None
6622 } else {
6623 Some(topics)
6624 },
6625 }
6626}
6627
6628fn events_from_dto(events: &[Event]) -> Vec<crate::notifications::EventType> {
6629 events
6630 .iter()
6631 .filter_map(|e| crate::notifications::EventType::from_aws_str(e.as_ref()))
6632 .collect()
6633}
6634
6635fn filter_from_dto(
6636 f: Option<&NotificationConfigurationFilter>,
6637) -> (Option<String>, Option<String>) {
6638 let Some(f) = f else {
6639 return (None, None);
6640 };
6641 let Some(key) = f.key.as_ref() else {
6642 return (None, None);
6643 };
6644 let Some(rules) = key.filter_rules.as_ref() else {
6645 return (None, None);
6646 };
6647 let mut prefix = None;
6648 let mut suffix = None;
6649 for r in rules {
6650 let name = r.name.as_ref().map(|n| n.as_str().to_ascii_lowercase());
6651 let value = r.value.clone();
6652 match name.as_deref() {
6653 Some("prefix") => prefix = value,
6654 Some("suffix") => suffix = value,
6655 _ => {}
6656 }
6657 }
6658 (prefix, suffix)
6659}
6660
6661fn filter_to_dto(
6662 prefix: Option<&str>,
6663 suffix: Option<&str>,
6664) -> Option<NotificationConfigurationFilter> {
6665 if prefix.is_none() && suffix.is_none() {
6666 return None;
6667 }
6668 let mut rules: Vec<FilterRule> = Vec::new();
6669 if let Some(p) = prefix {
6670 rules.push(FilterRule {
6671 name: Some(FilterRuleName::from("prefix".to_owned())),
6672 value: Some(p.to_owned()),
6673 });
6674 }
6675 if let Some(s) = suffix {
6676 rules.push(FilterRule {
6677 name: Some(FilterRuleName::from("suffix".to_owned())),
6678 value: Some(s.to_owned()),
6679 });
6680 }
6681 Some(NotificationConfigurationFilter {
6682 key: Some(S3KeyFilter {
6683 filter_rules: Some(rules),
6684 }),
6685 })
6686}
6687
6688// ---------------------------------------------------------------------------
6689// v0.6 #40: Convert between the s3s-typed `ReplicationConfiguration` (the
6690// wire surface) and our internal `crate::replication::ReplicationConfig`.
6691// AWS's `ReplicationRuleFilter` is a sum type — `Prefix | Tag | And { Prefix,
6692// Tags }`; we flatten it into the single `(prefix, tag-vec)` representation
6693// the matcher needs. Sub-blocks v0.6 #40 does not implement
6694// (DeleteMarkerReplication / SourceSelectionCriteria / ReplicationTime /
6695// Metrics / EncryptionConfiguration) round-trip as `None` on GET — operators
6696// who set them on PUT see them silently dropped, mirroring "feature not
6697// supported in this release" semantics.
6698// ---------------------------------------------------------------------------
6699
6700fn replication_from_dto(dto: &ReplicationConfiguration) -> crate::replication::ReplicationConfig {
6701 let rules = dto
6702 .rules
6703 .iter()
6704 .enumerate()
6705 .map(|(idx, r)| {
6706 let id =
6707 r.id.as_ref()
6708 .map(|s| s.as_str().to_owned())
6709 .unwrap_or_else(|| format!("rule-{idx}"));
6710 let priority = r.priority.unwrap_or(0).max(0) as u32;
6711 let status_enabled = r.status.as_str() == ReplicationRuleStatus::ENABLED;
6712 let filter = replication_filter_from_dto(r.filter.as_ref(), r.prefix.as_deref());
6713 let destination_bucket = r.destination.bucket.clone();
6714 let destination_storage_class = r
6715 .destination
6716 .storage_class
6717 .as_ref()
6718 .map(|s| s.as_str().to_owned());
6719 crate::replication::ReplicationRule {
6720 id,
6721 priority,
6722 status_enabled,
6723 filter,
6724 destination_bucket,
6725 destination_storage_class,
6726 }
6727 })
6728 .collect();
6729 crate::replication::ReplicationConfig {
6730 role: dto.role.clone(),
6731 rules,
6732 }
6733}
6734
6735fn replication_to_dto(cfg: &crate::replication::ReplicationConfig) -> ReplicationConfiguration {
6736 let rules = cfg
6737 .rules
6738 .iter()
6739 .map(|r| {
6740 let status = if r.status_enabled {
6741 ReplicationRuleStatus::from_static(ReplicationRuleStatus::ENABLED)
6742 } else {
6743 ReplicationRuleStatus::from_static(ReplicationRuleStatus::DISABLED)
6744 };
6745 let destination = Destination {
6746 access_control_translation: None,
6747 account: None,
6748 bucket: r.destination_bucket.clone(),
6749 encryption_configuration: None,
6750 metrics: None,
6751 replication_time: None,
6752 storage_class: r
6753 .destination_storage_class
6754 .as_ref()
6755 .map(|s| StorageClass::from(s.clone())),
6756 };
6757 let filter = Some(replication_filter_to_dto(&r.filter));
6758 ReplicationRule {
6759 delete_marker_replication: None,
6760 destination,
6761 existing_object_replication: None,
6762 filter,
6763 id: Some(r.id.clone()),
6764 prefix: None,
6765 priority: Some(r.priority as i32),
6766 source_selection_criteria: None,
6767 status,
6768 }
6769 })
6770 .collect();
6771 ReplicationConfiguration {
6772 role: cfg.role.clone(),
6773 rules,
6774 }
6775}
6776
6777fn replication_filter_from_dto(
6778 f: Option<&ReplicationRuleFilter>,
6779 rule_level_prefix: Option<&str>,
6780) -> crate::replication::ReplicationFilter {
6781 let mut prefix: Option<String> = rule_level_prefix.map(str::to_owned);
6782 let mut tags: Vec<(String, String)> = Vec::new();
6783 if let Some(f) = f {
6784 if let Some(p) = f.prefix.as_ref()
6785 && prefix.is_none()
6786 {
6787 prefix = Some(p.clone());
6788 }
6789 if let Some(t) = f.tag.as_ref()
6790 && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
6791 {
6792 tags.push((k.clone(), v.clone()));
6793 }
6794 if let Some(and) = f.and.as_ref() {
6795 if let Some(p) = and.prefix.as_ref()
6796 && prefix.is_none()
6797 {
6798 prefix = Some(p.clone());
6799 }
6800 if let Some(ts) = and.tags.as_ref() {
6801 for t in ts {
6802 if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
6803 tags.push((k.clone(), v.clone()));
6804 }
6805 }
6806 }
6807 }
6808 }
6809 crate::replication::ReplicationFilter { prefix, tags }
6810}
6811
6812fn replication_filter_to_dto(f: &crate::replication::ReplicationFilter) -> ReplicationRuleFilter {
6813 if f.tags.is_empty() {
6814 ReplicationRuleFilter {
6815 and: None,
6816 prefix: f.prefix.clone(),
6817 tag: None,
6818 }
6819 } else if f.tags.len() == 1 && f.prefix.is_none() {
6820 let (k, v) = &f.tags[0];
6821 ReplicationRuleFilter {
6822 and: None,
6823 prefix: None,
6824 tag: Some(Tag {
6825 key: Some(k.clone()),
6826 value: Some(v.clone()),
6827 }),
6828 }
6829 } else {
6830 let tags: Vec<Tag> = f
6831 .tags
6832 .iter()
6833 .map(|(k, v)| Tag {
6834 key: Some(k.clone()),
6835 value: Some(v.clone()),
6836 })
6837 .collect();
6838 ReplicationRuleFilter {
6839 and: Some(ReplicationRuleAndOperator {
6840 prefix: f.prefix.clone(),
6841 tags: Some(tags),
6842 }),
6843 prefix: None,
6844 tag: None,
6845 }
6846 }
6847}
6848
6849// ---------------------------------------------------------------------------
6850// v0.6 #37: Convert between the s3s-typed `BucketLifecycleConfiguration`
6851// (the wire surface) and our internal `crate::lifecycle::LifecycleConfig`.
6852// The internal representation flattens AWS's "Filter | And" disjunction
6853// into a single `LifecycleFilter` struct of optional fields plus a tag
6854// vector. Fields S4's evaluator does not consume
6855// (`expired_object_delete_marker`, `noncurrent_version_transitions`,
6856// `transition_default_minimum_object_size`, the storage class on the
6857// noncurrent expiration) are dropped on PUT and re-rendered as their
6858// AWS-default shape on GET so the client always sees a well-formed
6859// configuration.
6860// ---------------------------------------------------------------------------
6861
6862fn dto_lifecycle_to_internal(
6863 dto: &BucketLifecycleConfiguration,
6864) -> crate::lifecycle::LifecycleConfig {
6865 crate::lifecycle::LifecycleConfig {
6866 rules: dto.rules.iter().map(dto_rule_to_internal).collect(),
6867 }
6868}
6869
6870fn dto_rule_to_internal(rule: &LifecycleRule) -> crate::lifecycle::LifecycleRule {
6871 let status = crate::lifecycle::LifecycleStatus::from_aws_str(rule.status.as_str());
6872 let filter = rule
6873 .filter
6874 .as_ref()
6875 .map(dto_filter_to_internal)
6876 .unwrap_or_default();
6877 let expiration_days = rule
6878 .expiration
6879 .as_ref()
6880 .and_then(|e| e.days)
6881 .and_then(|d| u32::try_from(d).ok());
6882 let expiration_date = rule
6883 .expiration
6884 .as_ref()
6885 .and_then(|e| e.date.as_ref())
6886 .and_then(timestamp_to_chrono_utc);
6887 let transitions: Vec<crate::lifecycle::TransitionRule> = rule
6888 .transitions
6889 .as_ref()
6890 .map(|ts| {
6891 ts.iter()
6892 .filter_map(|t| {
6893 let days = u32::try_from(t.days?).ok()?;
6894 let storage_class = t.storage_class.as_ref()?.as_str().to_owned();
6895 Some(crate::lifecycle::TransitionRule {
6896 days,
6897 storage_class,
6898 })
6899 })
6900 .collect()
6901 })
6902 .unwrap_or_default();
6903 let noncurrent_version_expiration_days = rule
6904 .noncurrent_version_expiration
6905 .as_ref()
6906 .and_then(|n| n.noncurrent_days)
6907 .and_then(|d| u32::try_from(d).ok());
6908 let abort_incomplete_multipart_upload_days = rule
6909 .abort_incomplete_multipart_upload
6910 .as_ref()
6911 .and_then(|a| a.days_after_initiation)
6912 .and_then(|d| u32::try_from(d).ok());
6913 crate::lifecycle::LifecycleRule {
6914 id: rule.id.clone().unwrap_or_default(),
6915 status,
6916 filter,
6917 expiration_days,
6918 expiration_date,
6919 transitions,
6920 noncurrent_version_expiration_days,
6921 abort_incomplete_multipart_upload_days,
6922 }
6923}
6924
6925fn dto_filter_to_internal(filter: &LifecycleRuleFilter) -> crate::lifecycle::LifecycleFilter {
6926 let mut prefix = filter.prefix.clone();
6927 let mut tags: Vec<(String, String)> = Vec::new();
6928 let mut size_gt: Option<u64> = filter
6929 .object_size_greater_than
6930 .and_then(|n| u64::try_from(n).ok());
6931 let mut size_lt: Option<u64> = filter
6932 .object_size_less_than
6933 .and_then(|n| u64::try_from(n).ok());
6934 if let Some(t) = &filter.tag
6935 && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
6936 {
6937 tags.push((k.clone(), v.clone()));
6938 }
6939 if let Some(and) = &filter.and {
6940 if prefix.is_none() {
6941 prefix = and.prefix.clone();
6942 }
6943 if size_gt.is_none() {
6944 size_gt = and
6945 .object_size_greater_than
6946 .and_then(|n| u64::try_from(n).ok());
6947 }
6948 if size_lt.is_none() {
6949 size_lt = and
6950 .object_size_less_than
6951 .and_then(|n| u64::try_from(n).ok());
6952 }
6953 if let Some(ts) = &and.tags {
6954 for t in ts {
6955 if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
6956 tags.push((k.clone(), v.clone()));
6957 }
6958 }
6959 }
6960 }
6961 crate::lifecycle::LifecycleFilter {
6962 prefix,
6963 tags,
6964 object_size_greater_than: size_gt,
6965 object_size_less_than: size_lt,
6966 }
6967}
6968
6969fn internal_rule_to_dto(rule: &crate::lifecycle::LifecycleRule) -> LifecycleRule {
6970 let expiration = if rule.expiration_days.is_some() || rule.expiration_date.is_some() {
6971 Some(LifecycleExpiration {
6972 date: rule.expiration_date.map(chrono_utc_to_timestamp),
6973 days: rule.expiration_days.map(|d| d as i32),
6974 expired_object_delete_marker: None,
6975 })
6976 } else {
6977 None
6978 };
6979 let transitions: Option<TransitionList> = if rule.transitions.is_empty() {
6980 None
6981 } else {
6982 Some(
6983 rule.transitions
6984 .iter()
6985 .map(|t| Transition {
6986 date: None,
6987 days: Some(t.days as i32),
6988 storage_class: Some(TransitionStorageClass::from(t.storage_class.clone())),
6989 })
6990 .collect(),
6991 )
6992 };
6993 let noncurrent_version_expiration =
6994 rule.noncurrent_version_expiration_days
6995 .map(|d| NoncurrentVersionExpiration {
6996 newer_noncurrent_versions: None,
6997 noncurrent_days: Some(d as i32),
6998 });
6999 let abort_incomplete_multipart_upload =
7000 rule.abort_incomplete_multipart_upload_days
7001 .map(|d| AbortIncompleteMultipartUpload {
7002 days_after_initiation: Some(d as i32),
7003 });
7004 let filter = if rule.filter.tags.is_empty()
7005 && rule.filter.object_size_greater_than.is_none()
7006 && rule.filter.object_size_less_than.is_none()
7007 {
7008 rule.filter.prefix.as_ref().map(|p| LifecycleRuleFilter {
7009 and: None,
7010 object_size_greater_than: None,
7011 object_size_less_than: None,
7012 prefix: Some(p.clone()),
7013 tag: None,
7014 })
7015 } else if rule.filter.tags.len() == 1
7016 && rule.filter.prefix.is_none()
7017 && rule.filter.object_size_greater_than.is_none()
7018 && rule.filter.object_size_less_than.is_none()
7019 {
7020 let (k, v) = rule.filter.tags[0].clone();
7021 Some(LifecycleRuleFilter {
7022 and: None,
7023 object_size_greater_than: None,
7024 object_size_less_than: None,
7025 prefix: None,
7026 tag: Some(Tag {
7027 key: Some(k),
7028 value: Some(v),
7029 }),
7030 })
7031 } else {
7032 let tags = if rule.filter.tags.is_empty() {
7033 None
7034 } else {
7035 Some(
7036 rule.filter
7037 .tags
7038 .iter()
7039 .map(|(k, v)| Tag {
7040 key: Some(k.clone()),
7041 value: Some(v.clone()),
7042 })
7043 .collect(),
7044 )
7045 };
7046 Some(LifecycleRuleFilter {
7047 and: Some(LifecycleRuleAndOperator {
7048 object_size_greater_than: rule
7049 .filter
7050 .object_size_greater_than
7051 .and_then(|n| i64::try_from(n).ok()),
7052 object_size_less_than: rule
7053 .filter
7054 .object_size_less_than
7055 .and_then(|n| i64::try_from(n).ok()),
7056 prefix: rule.filter.prefix.clone(),
7057 tags,
7058 }),
7059 object_size_greater_than: None,
7060 object_size_less_than: None,
7061 prefix: None,
7062 tag: None,
7063 })
7064 };
7065 LifecycleRule {
7066 abort_incomplete_multipart_upload,
7067 expiration,
7068 filter,
7069 id: if rule.id.is_empty() {
7070 None
7071 } else {
7072 Some(rule.id.clone())
7073 },
7074 noncurrent_version_expiration,
7075 noncurrent_version_transitions: None,
7076 prefix: None,
7077 status: ExpirationStatus::from(rule.status.as_aws_str().to_owned()),
7078 transitions,
7079 }
7080}
7081
7082// (timestamp <-> chrono helpers `timestamp_to_chrono_utc` /
7083// `chrono_utc_to_timestamp` are defined earlier in this file for the
7084// tagging/notifications work; the lifecycle DTO converters reuse them.)
7085
7086// ---------------------------------------------------------------------------
7087// v0.5 #33: SigV4a (asymmetric ECDSA-P256) integration hook.
7088//
7089// Kept as a self-contained block at the bottom of the file so it doesn't
7090// touch the existing `S4Service` struct, `new()`, or any of the per-op
7091// handlers above. The hook is wired in by the binary at server-build time
7092// as a hyper middleware layer (see `main.rs`), NOT inside `S4Service`.
7093//
7094// Lifecycle:
7095// 1. `SigV4aGate::new(store)` is constructed once at boot from the
7096// operator-supplied credential directory.
7097// 2. For each incoming request, `SigV4aGate::pre_route(&req,
7098// &requested_region, &canonical_request_bytes)` is invoked BEFORE
7099// the request hits the S3 framework. If the request claims SigV4a
// and verifies, control returns to the framework. Otherwise the gate
// returns a `SigV4aGateError` (e.g. `SignatureDoesNotMatch`,
// `InvalidAccessKeyId`, `RequestTimeTooSkewed`) and the request is
// rejected.
7102// 3. Plain SigV4 (HMAC-SHA256) requests pass through untouched.
7103// ---------------------------------------------------------------------------
7104
7105/// Gate that fronts the S3 service path with SigV4a verification (v0.5 #33).
7106///
7107/// Wraps a [`crate::sigv4a::SigV4aCredentialStore`] and exposes a single
7108/// `pre_route` entry point that returns `Ok(())` for both
7109/// "request is plain SigV4 — pass through" and "request is SigV4a and
/// verified", and an `Err(...)` carrying a 403- or 400-class diagnostic
/// otherwise. Cheap to clone (the inner store is `Arc`-backed).
7112///
7113/// v0.8.4 #76 (audit H-6): the gate now enforces an `x-amz-date`
7114/// freshness window (default 15 min, AWS-spec) and a strict credential
7115/// scope shape (`<key>/<YYYYMMDD>/s3/aws4_request`), shutting the
7116/// captured-request replay vector — previously a stolen valid SigV4a
7117/// signature could be replayed indefinitely (including DELETE).
#[derive(Debug, Clone)]
pub struct SigV4aGate {
    /// Credential store the gate queries with the access-key-id parsed from
    /// the request's `Authorization` header.
    store: crate::sigv4a::SharedSigV4aCredentialStore,
    /// v0.8.4 #76: how far the request's `x-amz-date` may drift from
    /// the server's clock before being rejected with 403
    /// `RequestTimeTooSkewed`. Matches the AWS S3 spec default of
    /// 15 min when constructed via [`SigV4aGate::new`]; the operator
    /// can override via [`SigV4aGate::with_skew_tolerance`] (CLI flag
    /// `--sigv4a-skew-tolerance-seconds`).
    skew_tolerance: chrono::Duration,
}
7129
7130impl SigV4aGate {
7131 /// Default `x-amz-date` skew tolerance — 15 min, matching AWS S3.
7132 pub const DEFAULT_SKEW_TOLERANCE_SECS: i64 = 900;
7133
7134 #[must_use]
7135 pub fn new(store: crate::sigv4a::SharedSigV4aCredentialStore) -> Self {
7136 Self {
7137 store,
7138 skew_tolerance: chrono::Duration::seconds(Self::DEFAULT_SKEW_TOLERANCE_SECS),
7139 }
7140 }
7141
7142 /// v0.8.4 #76: override the `x-amz-date` skew tolerance (default
7143 /// 15 min). Operators can widen this for high-clock-drift
7144 /// environments or tighten it for compliance regimes that demand
7145 /// stricter freshness.
7146 #[must_use]
7147 pub fn with_skew_tolerance(mut self, skew: chrono::Duration) -> Self {
7148 self.skew_tolerance = skew;
7149 self
7150 }
7151
7152 /// Read the configured skew tolerance — exposed mostly for test +
7153 /// observability use.
7154 #[must_use]
7155 pub fn skew_tolerance(&self) -> chrono::Duration {
7156 self.skew_tolerance
7157 }
7158
7159 /// Inspect an incoming HTTP request. Behaviour:
7160 ///
7161 /// - Not SigV4a (no `X-Amz-Region-Set` and no SigV4a `Authorization`
7162 /// prefix) → returns `Ok(())`; the framework's existing SigV4
7163 /// path handles the request.
7164 /// - SigV4a + valid signature + region match + fresh x-amz-date
7165 /// → `Ok(())`.
7166 /// - SigV4a + unknown access-key-id → `Err` with `InvalidAccessKeyId`.
7167 /// - SigV4a + bad signature / region mismatch → `Err` with
7168 /// `SignatureDoesNotMatch`.
7169 /// - SigV4a + missing or skewed `x-amz-date` → `Err` with one of
7170 /// the v0.8.4 #76 freshness variants (`RequestTimeTooSkewed`
7171 /// et al.).
7172 ///
7173 /// `canonical_request_bytes` is the SigV4a string-to-sign (or
7174 /// canonical-request bytes; the caller decides) that the framework
7175 /// has already produced for this request. Keeping it as a parameter
7176 /// instead of rebuilding it inside the hook avoids duplicating the
7177 /// canonicalisation logic.
7178 pub fn pre_route<B>(
7179 &self,
7180 req: &http::Request<B>,
7181 requested_region: &str,
7182 canonical_request_bytes: &[u8],
7183 ) -> Result<(), SigV4aGateError> {
7184 self.pre_route_at(
7185 req,
7186 requested_region,
7187 canonical_request_bytes,
7188 chrono::Utc::now(),
7189 )
7190 }
7191
7192 /// Like [`SigV4aGate::pre_route`] but takes an explicit `now` for
7193 /// tests that need to pin the freshness clock. Production callers
7194 /// use `pre_route` (which calls `chrono::Utc::now()`).
7195 pub fn pre_route_at<B>(
7196 &self,
7197 req: &http::Request<B>,
7198 requested_region: &str,
7199 canonical_request_bytes: &[u8],
7200 now: chrono::DateTime<chrono::Utc>,
7201 ) -> Result<(), SigV4aGateError> {
7202 if !crate::sigv4a::detect(req) {
7203 return Ok(());
7204 }
7205 let auth_hdr = req
7206 .headers()
7207 .get(http::header::AUTHORIZATION)
7208 .and_then(|v| v.to_str().ok())
7209 .ok_or(SigV4aGateError::MissingAuthorization)?;
7210 let parsed = crate::sigv4a::parse_authorization_header(auth_hdr)
7211 .map_err(|_| SigV4aGateError::MalformedAuthorization)?;
7212 let region_set = req
7213 .headers()
7214 .get(crate::sigv4a::REGION_SET_HEADER)
7215 .and_then(|v| v.to_str().ok())
7216 .unwrap_or("*");
7217 let key = self
7218 .store
7219 .get(&parsed.access_key_id)
7220 .ok_or_else(|| SigV4aGateError::UnknownAccessKey(parsed.access_key_id.clone()))?;
7221 // v0.8.4 #76: snapshot the request headers into a
7222 // lowercase-keyed flat map so `verify_request` can do the
7223 // x-amz-date freshness checks without taking a generic
7224 // `HeaderMap` dep. Cheap because the headers list is tiny.
7225 //
7226 // v0.8.5 #84 (audit H-4): detect duplicate header names while
7227 // we flatten — `HashMap::insert` would silently overwrite the
7228 // first value with the second, mirroring the auth-confusion
7229 // vector the canonical-request builder also defends against.
7230 // Reject upfront so the rest of the gate (freshness check,
7231 // ECDSA verify) never sees a half-truncated header set. We
7232 // detect by checking `contains_key` *before* insertion rather
7233 // than by counting via `headers().get_all`, because the
7234 // upstream `HeaderMap` iteration yields each duplicate entry
7235 // as its own (name, value) pair — the second-seen entry is
7236 // exactly what `contains_key` traps.
7237 let mut header_map: std::collections::HashMap<String, String> =
7238 std::collections::HashMap::with_capacity(req.headers().len());
7239 for (name, value) in req.headers() {
7240 if let Ok(v) = value.to_str() {
7241 let lower = name.as_str().to_ascii_lowercase();
7242 if header_map.contains_key(&lower) {
7243 return Err(SigV4aGateError::Verify(
7244 crate::sigv4a::SigV4aError::DuplicateSignedHeader { header: lower },
7245 ));
7246 }
7247 header_map.insert(lower, v.to_string());
7248 }
7249 }
7250 crate::sigv4a::verify_request(
7251 &parsed,
7252 &header_map,
7253 canonical_request_bytes,
7254 key,
7255 region_set,
7256 requested_region,
7257 now,
7258 self.skew_tolerance,
7259 )
7260 .map_err(SigV4aGateError::Verify)?;
7261 Ok(())
7262 }
7263}
7264
/// Failure modes from [`SigV4aGate::pre_route`]. Each variant maps to an
/// AWS-standard error code (`InvalidAccessKeyId`, `SignatureDoesNotMatch`,
/// `RequestTimeTooSkewed`, or `InvalidRequest`) and to HTTP 403 or 400
/// — see [`SigV4aGateError::s3_error_code`] and
/// [`SigV4aGateError::http_status`].
7269#[derive(Debug, thiserror::Error)]
7270pub enum SigV4aGateError {
7271 #[error("missing Authorization header")]
7272 MissingAuthorization,
7273 #[error("malformed SigV4a Authorization header")]
7274 MalformedAuthorization,
7275 #[error("unknown SigV4a access-key-id: {0}")]
7276 UnknownAccessKey(String),
7277 #[error("SigV4a verification failed: {0}")]
7278 Verify(#[source] crate::sigv4a::SigV4aError),
7279}
7280
7281impl SigV4aGateError {
7282 /// AWS S3 error code that should accompany the response.
7283 ///
7284 /// v0.8.4 #76 (audit H-6): the freshness check surfaces
7285 /// `RequestTimeTooSkewed` (matches AWS spec); date / scope shape
7286 /// failures surface as `InvalidRequest` (400); other failures stay
7287 /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` (403) so the wire
7288 /// surface stays AWS-compatible.
7289 #[must_use]
7290 pub fn s3_error_code(&self) -> &'static str {
7291 match self {
7292 Self::UnknownAccessKey(_) => "InvalidAccessKeyId",
7293 Self::Verify(crate::sigv4a::SigV4aError::RequestTimeTooSkewed { .. }) => {
7294 "RequestTimeTooSkewed"
7295 }
7296 Self::Verify(
7297 crate::sigv4a::SigV4aError::MissingXAmzDate
7298 | crate::sigv4a::SigV4aError::InvalidDateFormat
7299 | crate::sigv4a::SigV4aError::DateScopeMismatch
7300 | crate::sigv4a::SigV4aError::XAmzDateNotSigned
7301 | crate::sigv4a::SigV4aError::InvalidTerminator
7302 | crate::sigv4a::SigV4aError::WrongService { .. }
7303 | crate::sigv4a::SigV4aError::InvalidCredentialScope,
7304 ) => "InvalidRequest",
7305 _ => "SignatureDoesNotMatch",
7306 }
7307 }
7308
7309 /// HTTP status code to accompany the response. v0.8.4 #76: format
7310 /// errors that are clearly client mistakes (missing / malformed
7311 /// `x-amz-date`, malformed credential scope, wrong service) are
7312 /// surfaced as 400 InvalidRequest; the rest stay 403.
7313 #[must_use]
7314 pub fn http_status(&self) -> http::StatusCode {
7315 match self {
7316 Self::Verify(
7317 crate::sigv4a::SigV4aError::MissingXAmzDate
7318 | crate::sigv4a::SigV4aError::InvalidDateFormat
7319 | crate::sigv4a::SigV4aError::DateScopeMismatch
7320 | crate::sigv4a::SigV4aError::XAmzDateNotSigned
7321 | crate::sigv4a::SigV4aError::InvalidTerminator
7322 | crate::sigv4a::SigV4aError::WrongService { .. }
7323 | crate::sigv4a::SigV4aError::InvalidCredentialScope,
7324 ) => http::StatusCode::BAD_REQUEST,
7325 _ => http::StatusCode::FORBIDDEN,
7326 }
7327 }
7328}
7329
7330#[cfg(test)]
7331mod tests {
7332 use super::*;
7333
7334 #[test]
7335 fn manifest_roundtrip_via_metadata() {
7336 let original = ChunkManifest {
7337 codec: CodecKind::CpuZstd,
7338 original_size: 1234,
7339 compressed_size: 567,
7340 crc32c: 0xdead_beef,
7341 };
7342 let mut meta: Option<Metadata> = None;
7343 write_manifest(&mut meta, &original);
7344 let extracted = extract_manifest(&meta).expect("manifest must round-trip");
7345 assert_eq!(extracted.codec, original.codec);
7346 assert_eq!(extracted.original_size, original.original_size);
7347 assert_eq!(extracted.compressed_size, original.compressed_size);
7348 assert_eq!(extracted.crc32c, original.crc32c);
7349 }
7350
7351 #[test]
7352 fn missing_metadata_yields_none() {
7353 let meta: Option<Metadata> = None;
7354 assert!(extract_manifest(&meta).is_none());
7355 }
7356
7357 #[test]
7358 fn partial_metadata_yields_none() {
7359 let mut meta = Metadata::new();
7360 meta.insert(META_CODEC.into(), "cpu-zstd".into());
7361 let opt = Some(meta);
7362 assert!(extract_manifest(&opt).is_none());
7363 }
7364
7365 #[test]
7366 fn parse_copy_source_range_basic() {
7367 let r = parse_copy_source_range("bytes=10-20").unwrap();
7368 match r {
7369 s3s::dto::Range::Int { first, last } => {
7370 assert_eq!(first, 10);
7371 assert_eq!(last, Some(20));
7372 }
7373 _ => panic!("expected Int range"),
7374 }
7375 }
7376
7377 #[test]
7378 fn parse_copy_source_range_rejects_inverted() {
7379 let err = parse_copy_source_range("bytes=20-10").unwrap_err();
7380 assert!(err.contains("last < first"));
7381 }
7382
7383 #[test]
7384 fn parse_copy_source_range_rejects_missing_prefix() {
7385 let err = parse_copy_source_range("10-20").unwrap_err();
7386 assert!(err.contains("must start with 'bytes='"));
7387 }
7388
7389 #[test]
7390 fn parse_copy_source_range_rejects_open_ended() {
7391 // S3 upload_part_copy spec requires N-M (closed); suffix and
7392 // open-ended forms are not allowed for this header.
7393 assert!(parse_copy_source_range("bytes=10-").is_err());
7394 assert!(parse_copy_source_range("bytes=-10").is_err());
7395 }
7396
7397 // v0.7 #49: safe_object_uri must round-trip every legal S3 key
7398 // (which includes spaces, slashes, control chars, raw UTF-8) into
7399 // a parseable `http::Uri` instead of panicking like the previous
7400 // `format!(...).parse().unwrap()` call sites did.
7401
7402 #[test]
7403 fn safe_object_uri_basic_ascii() {
7404 let uri = safe_object_uri("bucket", "key").expect("ascii must be safe");
7405 assert_eq!(uri.path(), "/bucket/key");
7406 }
7407
7408 #[test]
7409 fn safe_object_uri_encodes_spaces() {
7410 let uri = safe_object_uri("bucket", "key with spaces").expect("must encode spaces");
7411 // RFC 3986 path-segment encoding turns ' ' into %20.
7412 assert!(
7413 uri.path().contains("%20"),
7414 "expected percent-encoded space, got {}",
7415 uri.path()
7416 );
7417 assert!(uri.path().starts_with("/bucket/"));
7418 }
7419
7420 #[test]
7421 fn safe_object_uri_preserves_slashes() {
7422 // S3 keys legally contain '/' as a logical path separator —
7423 // the helper must NOT escape it (otherwise the synthetic URI
7424 // changes the perceived hierarchy).
7425 let uri = safe_object_uri("bucket", "key/with/slashes").expect("slashes must round-trip");
7426 assert_eq!(uri.path(), "/bucket/key/with/slashes");
7427 }
7428
7429 #[test]
7430 fn safe_object_uri_handles_newline_without_panic() {
7431 // Newlines are control chars in URIs; whether the result is
7432 // Ok (encoded as %0A) or Err (parse rejects), the helper
7433 // MUST NOT panic. Either outcome is acceptable.
7434 let _ = safe_object_uri("bucket", "key\n");
7435 }
7436
7437 #[test]
7438 fn safe_object_uri_handles_null_byte_without_panic() {
7439 let _ = safe_object_uri("bucket", "key\0bad");
7440 }
7441
7442 #[test]
7443 fn safe_object_uri_handles_unicode_without_panic() {
7444 // RTL override, BOM, plain Japanese — none should panic.
7445 let _ = safe_object_uri("bucket", "rtl\u{202E}override");
7446 let _ = safe_object_uri("bucket", "\u{FEFF}bom-key");
7447 let _ = safe_object_uri("bucket", "日本語キー");
7448 }
7449
7450 #[test]
7451 fn safe_object_uri_no_panic_for_every_byte() {
7452 // Exhaustive byte coverage: 0x00..=0xFF as a 1-byte key.
7453 // None of these may panic. (0x80..=0xFF are not valid UTF-8
7454 // by themselves; we go through `String::from_utf8_lossy` so
7455 // the helper sees a real `&str` regardless of the raw byte.)
7456 for b in 0u8..=255 {
7457 let s = String::from_utf8_lossy(&[b]).into_owned();
7458 let _ = safe_object_uri("bucket", &s);
7459 }
7460 }
7461
    /// v0.8.1 #58: smoke test for the DEK-handling shape used by the
    /// SSE-KMS branches of `put_object` and `complete_multipart_upload`.
    /// Mirrors the call pattern (generate_dek → length check → copy
    /// into stack `[u8; 32]` → reborrow as `&[u8; 32]` for `SseSource`)
    /// without spinning up a full `S4Service`.
    ///
    /// The real assertion this guards against is a regression where
    /// the `Zeroizing` wrapper is accidentally dropped before the
    /// stack copy lands (e.g. someone refactors to use
    /// `let dek = kms.generate_dek(...).await?.0; drop(dek); ...`)
    /// or where `&**dek` is rewritten in a way that doesn't compile.
    ///
    /// NOTE(review): the statements below are intentionally kept in the
    /// same shape as the production call site; do not "simplify" them.
    #[tokio::test]
    async fn kms_dek_lifetime_within_function_scope() {
        use crate::kms::{KmsBackend, LocalKms};
        use std::collections::HashMap;
        use std::path::PathBuf;
        use zeroize::Zeroizing;

        // One in-memory KEK registered under the id "scope"; the key
        // bytes are an arbitrary fixed pattern.
        let mut keks = HashMap::new();
        keks.insert("scope".to_string(), [33u8; 32]);
        let kms = LocalKms::from_keks(PathBuf::from("/tmp/kms-scope-test"), keks);

        // Mirror the put_object KMS branch shape exactly.
        let (dek, wrapped) = kms.generate_dek("scope").await.unwrap();
        // `copy_from_slice` below panics on a length mismatch, so pin
        // the expected DEK length first for a clearer failure.
        assert_eq!(dek.len(), 32);
        let mut dek_arr: Zeroizing<[u8; 32]> = Zeroizing::new([0u8; 32]);
        dek_arr.copy_from_slice(&dek);

        // The reborrow used at the SseSource construction site —
        // mirrors the call-site pattern where `let dek_ref: &[u8; 32]`
        // auto-derefs from a `Zeroizing<[u8; 32]>` reference.
        let dek_ref: &[u8; 32] = &dek_arr;
        // Sanity: the reborrow points at the same bytes.
        assert_eq!(dek_ref, &*dek_arr);
        // Wrapped key id flows through unchanged.
        assert_eq!(wrapped.key_id, "scope");

        // At end of scope, both `dek` (Zeroizing<Vec<u8>>) and
        // `dek_arr` (Zeroizing<[u8; 32]>) are dropped, wiping the
        // backing memory. Cannot directly assert the wipe (would be
        // UB to read freed memory), so this test instead enforces
        // that the call shape compiles and executes; the wipe itself
        // is exercised by the `zeroize` crate's own test suite.
    }
7506
7507 /// v0.8.5 #86 (audit M-2): the replication dispatcher must
7508 /// `acquire_owned()` a permit from `replication_semaphore` before
7509 /// kicking off the destination PUT, so a saturated semaphore
7510 /// back-pressures the in-flight queue depth instead of letting it
7511 /// grow without bound. We exercise the field directly (initial
7512 /// permit count, override via `with_replication_max_concurrent`,
7513 /// permit drop on `Drop`) — the full `spawn_replication_if_matched`
7514 /// integration is exercised by the existing replication tests in
7515 /// `tests/feature_e2e.rs` once a `ReplicationManager` is attached.
7516 #[tokio::test]
7517 async fn replication_semaphore_caps_concurrent_dispatchers() {
7518 // Build a minimal `S4Service` directly — no handler path is
7519 // exercised, only the constructor + setter + accessor shape.
7520 let registry = Arc::new(
7521 CodecRegistry::new(CodecKind::Passthrough)
7522 .with(Arc::new(s4_codec::passthrough::Passthrough)),
7523 );
7524 let dispatcher = Arc::new(s4_codec::dispatcher::AlwaysDispatcher(
7525 CodecKind::Passthrough,
7526 ));
7527 let s4 = S4Service::new(NoopBackend, registry, dispatcher);
7528
7529 // Default cap matches the documented constant.
7530 assert_eq!(
7531 s4.replication_semaphore().available_permits(),
7532 S4Service::<NoopBackend>::DEFAULT_REPLICATION_MAX_CONCURRENT,
7533 "fresh S4Service must expose DEFAULT_REPLICATION_MAX_CONCURRENT permits"
7534 );
7535
7536 // Override via the builder — replaces the underlying `Semaphore`.
7537 let s4 = s4.with_replication_max_concurrent(2);
7538 assert_eq!(
7539 s4.replication_semaphore().available_permits(),
7540 2,
7541 "with_replication_max_concurrent(2) must expose exactly 2 permits"
7542 );
7543
7544 // Acquiring permits must reduce `available_permits()` and
7545 // dropping them must restore the count — this is the contract
7546 // `spawn_replication_if_matched` relies on for back-pressure.
7547 let sem = Arc::clone(s4.replication_semaphore());
7548 let p1 = sem.clone().acquire_owned().await.expect("permit 1");
7549 let p2 = sem.clone().acquire_owned().await.expect("permit 2");
7550 assert_eq!(
7551 sem.available_permits(),
7552 0,
7553 "two acquired permits must zero `available_permits()`"
7554 );
7555 // A third `try_acquire_owned` must fail — the cap is enforced
7556 // synchronously, no extra spawn slips through.
7557 assert!(
7558 sem.clone().try_acquire_owned().is_err(),
7559 "third acquire must back-pressure: cap was 2"
7560 );
7561 drop(p1);
7562 drop(p2);
7563 assert_eq!(
7564 sem.available_permits(),
7565 2,
7566 "dropping permits must restore cap"
7567 );
7568
7569 // Lower-bound clamp: a 0 cap would deadlock all dispatchers,
7570 // so the setter clamps it to 1 instead of accepting it
7571 // (callers are warned in the CLI doc).
7572 let s4 = s4.with_replication_max_concurrent(0);
7573 assert_eq!(
7574 s4.replication_semaphore().available_permits(),
7575 1,
7576 "cap=0 must be clamped to 1 to avoid total deadlock"
7577 );
7578 }
7579
7580 /// v0.8.5 #86 (audit M-1): the access-log flusher must return a
7581 /// `JoinHandle<()>` that the caller can `abort()` on shutdown
7582 /// without leaving a dangling task. The pre-#86 call site dropped
7583 /// the handle at end-of-block (silently detaching it); the fix is
7584 /// hoisting it into a process-lived `Vec` so the graceful-shutdown
7585 /// branch in `main.rs` can wait for clean exit. This test exercises
7586 /// the `JoinHandle.abort()` shape directly so a future refactor that
7587 /// stops returning the handle (or returns a non-abortable wrapper)
7588 /// trips this regression guard.
7589 #[tokio::test]
7590 async fn flusher_handle_can_be_aborted_cleanly() {
7591 // Stand up a minimal `AccessLog` pointing at a tmp dir so the
7592 // flusher's `create_dir_all` succeeds. The dir is cleaned up
7593 // by the OS / test harness; we don't assert on the contents.
7594 let tmp = std::env::temp_dir().join(format!(
7595 "s4-86-flusher-{}-{}",
7596 std::process::id(),
7597 std::time::SystemTime::now()
7598 .duration_since(std::time::UNIX_EPOCH)
7599 .map(|d| d.as_nanos())
7600 .unwrap_or(0)
7601 ));
7602 let dest = crate::access_log::AccessLogDest { dir: tmp.clone() };
7603 let log = crate::access_log::AccessLog::new(dest);
7604 let handle = log.spawn_flusher(None);
7605 assert!(
7606 !handle.is_finished(),
7607 "freshly-spawned flusher must not yet be finished"
7608 );
7609 handle.abort();
7610 // `await`-ing an aborted handle returns `Err(JoinError)` whose
7611 // `is_cancelled()` is true.
7612 let join_result = handle.await;
7613 assert!(
7614 join_result.is_err(),
7615 "aborted flusher must surface JoinError, got Ok"
7616 );
7617 assert!(
7618 join_result.unwrap_err().is_cancelled(),
7619 "JoinError must report .is_cancelled() = true after abort()"
7620 );
7621 let _ = std::fs::remove_dir_all(&tmp);
7622 }
7623
    /// Stub backend for unit tests that need *some* `B: S3` to build an
    /// `S4Service` but only exercise builder / accessor shape and never
    /// issue a handler call (currently the v0.8.5 #86 replication
    /// semaphore test in this module).
    struct NoopBackend;

    // Intentionally empty: no `S3` method is overridden, so every
    // operation uses the trait's default implementation (which `s3s`
    // provides automatically and which reports `NotImplemented`).
    #[async_trait::async_trait]
    impl S3 for NoopBackend {}
7633
7634 /// v0.8.5 #81 (audit H-7): the panic-catch wrapper at the
7635 /// dispatcher spawn site must intercept a panicking inner future,
7636 /// log at ERROR, and bump the per-kind counter — instead of letting
7637 /// the panic propagate as a `JoinError` that no operator dashboard
7638 /// scrapes. We exercise the wrapper directly (rather than driving a
7639 /// full `spawn_replication_if_matched` end-to-end, which would
7640 /// require a full `S4Service` + backend) because the wrapper shape
7641 /// is the load-bearing piece — any inner-future swap would still
7642 /// route through the same `AssertUnwindSafe(...).catch_unwind()`
7643 /// closure we want to lock in here.
7644 #[tokio::test]
7645 async fn dispatcher_panic_caught_and_metric_bumped() {
7646 use futures::FutureExt as _;
7647
7648 let handle = crate::metrics::test_metrics_handle();
7649 let kind = "replication";
7650
7651 // Mirror the production wrapper shape verbatim — if the
7652 // production code ever stops using `AssertUnwindSafe.catch_unwind`
7653 // this test shouldn't keep passing on a hand-rolled copy that
7654 // diverged.
7655 let panicking = async {
7656 panic!("simulated dispatcher panic");
7657 };
7658 let result = std::panic::AssertUnwindSafe(panicking).catch_unwind().await;
7659 assert!(
7660 result.is_err(),
7661 "catch_unwind must surface the panic instead of swallowing it"
7662 );
7663 // Bump the production counter via the same helper the wrapper
7664 // calls so the rendered output gates on the production code
7665 // path, not a parallel bookkeeping copy.
7666 crate::metrics::record_dispatcher_panic(kind);
7667
7668 let rendered = handle.render();
7669 assert!(
7670 rendered.contains("s4_dispatcher_panics_total"),
7671 "expected s4_dispatcher_panics_total in metrics output, got: {rendered}"
7672 );
7673 assert!(
7674 rendered.contains("kind=\"replication\""),
7675 "expected kind=\"replication\" label in metrics output, got: {rendered}"
7676 );
7677 }
7678}